From 2ccdc1979ac620b5e26e84ed426965f5d3a72e9c Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 1 Feb 2006 00:39:56 +0000 Subject: [PATCH] Branch b_release_1_4_6 LLNL has updated the BG/L clients to 2.6.9 and have stopped using 2.4.19. --- lustre/kernel_patches/patches/2.4.19-ext3.patch | 7892 -------------------- lustre/kernel_patches/patches/2.4.19-jbd.patch | 6524 ---------------- .../patches/add_page_private-2.4.19-bgl.patch | 15 - .../patches/export-show_task-2.4-bgl.patch | 32 - .../patches/export-truncate-bgl.patch | 37 - .../patches/exports_2.4.19-bgl.patch | 42 - .../patches/ext-2.4-patch-1-2.4.19-suse.patch | 2560 ------- .../patches/ext3-delete_thread-2.4.19-suse.patch | 481 -- .../patches/ext3-delete_thread-2.4.20.patch | 541 -- .../patches/ext3-htree-2.4.19-bgl.patch | 2584 ------- .../patches/ext3-nlinks-2.6.12.patch | 161 - .../patches/ext3-no-write-super.patch | 22 - .../patches/ext3-orphan_lock-2.4.19-suse.patch | 85 - .../kernel_patches/patches/ext3-unmount_sync.patch | 21 - .../patches/ext3-use-after-free-2.4.19-pre1.patch | 53 - .../patches/ext3-use-after-free-suse.patch | 53 - .../patches/extN-wantedi-2.4.19-suse.patch | 226 - .../patches/invalidate_show-2.4.19-bgl.patch | 121 - .../patches/iod-stock-24-exports-2.4.19-bgl.patch | 52 - .../patches/iod-stock-24-exports-2.4.19-suse.patch | 52 - .../kernel_patches/patches/iopen-2.4.19-bgl.patch | 497 -- .../kernel_patches/patches/iopen-2.4.19-suse.patch | 497 -- .../kernel_patches/patches/jbd-2.4.18-jcberr.patch | 274 - .../patches/jbd-2.4.19-pre1-jcberr.patch | 274 - .../patches/jbd-flushtime-2.4.19-suse.patch | 35 - .../kernel_patches/patches/kallsyms-2.4-bgl.patch | 685 -- .../kernel_patches/patches/kksymoops-2.4-bgl.patch | 678 -- .../patches/linux-2.4.18-netdump.patch | 1842 ----- .../patches/linux-2.4.19-bgl-xattr-0.8.54.patch | 5242 ------------- .../linux-2.4.19-suse-xattr-0.8.54-hp.patch | 346 - .../patches/linux-2.4.19-xattr-0.8.54-suse.patch | 47 - .../patches/listman-2.4.19-bgl.patch | 
72 - lustre/kernel_patches/patches/mcore-2.4.20-8.patch | 2738 ------- .../patches/mkdep-revert-rh-2.4.patch | 50 - .../patches/nfs_export_kernel-2.4.19-bgl.patch | 742 -- .../patches/removepage-2.4.19-suse.patch | 30 - .../patches/resched-2.4.19-pre1.patch | 16 - .../patches/socket-exports-2.4.19-bgl.patch | 46 - .../patches/tcp-zero-copy-2.4.19-pre1.patch | 461 -- .../patches/vfs_intent-2.4.19-bgl.patch | 1849 ----- .../patches/vfs_intent-2.4.19-suse.patch | 1858 ----- .../patches/vmalloc_to_page-2.4.19-bgl.patch | 12 - lustre/kernel_patches/series/bgl-2.4.19 | 47 - 43 files changed, 39892 deletions(-) delete mode 100644 lustre/kernel_patches/patches/2.4.19-ext3.patch delete mode 100644 lustre/kernel_patches/patches/2.4.19-jbd.patch delete mode 100644 lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/export-truncate-bgl.patch delete mode 100644 lustre/kernel_patches/patches/exports_2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch delete mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/ext3-no-write-super.patch delete mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/ext3-unmount_sync.patch delete mode 100644 lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch delete mode 100644 lustre/kernel_patches/patches/ext3-use-after-free-suse.patch delete mode 100644 lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch delete mode 100644 
lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/iopen-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch delete mode 100644 lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch delete mode 100644 lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.18-netdump.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch delete mode 100644 lustre/kernel_patches/patches/listman-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/mcore-2.4.20-8.patch delete mode 100644 lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch delete mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/removepage-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/resched-2.4.19-pre1.patch delete mode 100644 lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch delete mode 100644 
lustre/kernel_patches/series/bgl-2.4.19 diff --git a/lustre/kernel_patches/patches/2.4.19-ext3.patch b/lustre/kernel_patches/patches/2.4.19-ext3.patch deleted file mode 100644 index a167c6a..0000000 --- a/lustre/kernel_patches/patches/2.4.19-ext3.patch +++ /dev/null @@ -1,7892 +0,0 @@ -diff -rup --new-file linux.mcp2/fs/ext3/Makefile linux_tmp/fs/ext3/Makefile ---- linux.mcp2/fs/ext3/Makefile 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/Makefile 2001-12-21 09:41:55.000000000 -0800 -@@ -0,0 +1,16 @@ -+# -+# Makefile for the linux ext2-filesystem routines. -+# -+# Note! Dependencies are done automagically by 'make dep', which also -+# removes any old dependencies. DON'T put your own dependencies here -+# unless it's something special (ie not a .c file). -+# -+# Note 2! The CFLAGS definitions are now in the main makefile... -+ -+O_TARGET := ext3.o -+ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ ioctl.o namei.o super.o symlink.o -+obj-m := $(O_TARGET) -+ -+include $(TOPDIR)/Rules.make -diff -rup --new-file linux.mcp2/fs/ext3/balloc.c linux_tmp/fs/ext3/balloc.c ---- linux.mcp2/fs/ext3/balloc.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/balloc.c 2002-08-02 17:39:45.000000000 -0700 -@@ -0,0 +1,999 @@ -+/* -+ * linux/fs/ext3/balloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * balloc.c contains the blocks allocation and deallocation routines -+ */ -+ -+/* -+ * The free blocks are managed by bitmaps. A file system contains several -+ * blocks groups. 
Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. -+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. The descriptors are loaded in memory -+ * when a file system is mounted (see ext3_read_super). -+ */ -+ -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -+ unsigned int block_group, -+ struct buffer_head ** bh) -+{ -+ unsigned long group_desc; -+ unsigned long desc; -+ struct ext3_group_desc * gdp; -+ -+ if (block_group >= sb->u.ext3_sb.s_groups_count) { -+ ext3_error (sb, "ext3_get_group_desc", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sb->u.ext3_sb.s_groups_count); -+ -+ return NULL; -+ } -+ -+ group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); -+ desc = block_group % EXT3_DESC_PER_BLOCK(sb); -+ if (!sb->u.ext3_sb.s_group_desc[group_desc]) { -+ ext3_error (sb, "ext3_get_group_desc", -+ "Group descriptor not loaded - " -+ "block_group = %d, group_desc = %lu, desc = %lu", -+ block_group, group_desc, desc); -+ return NULL; -+ } -+ -+ gdp = (struct ext3_group_desc *) -+ sb->u.ext3_sb.s_group_desc[group_desc]->b_data; -+ if (bh) -+ *bh = sb->u.ext3_sb.s_group_desc[group_desc]; -+ return gdp + desc; -+} -+ -+/* -+ * Read the bitmap for a given block_group, reading into the specified -+ * slot in the superblock's bitmap cache. -+ * -+ * Return >=0 on success or a -ve error code. 
-+ */ -+ -+static int read_block_bitmap (struct super_block * sb, -+ unsigned int block_group, -+ unsigned long bitmap_nr) -+{ -+ struct ext3_group_desc * gdp; -+ struct buffer_head * bh = NULL; -+ int retval = -EIO; -+ -+ gdp = ext3_get_group_desc (sb, block_group, NULL); -+ if (!gdp) -+ goto error_out; -+ retval = 0; -+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_block_bitmap)); -+ if (!bh) { -+ ext3_error (sb, "read_block_bitmap", -+ "Cannot read block bitmap - " -+ "block_group = %d, block_bitmap = %lu", -+ block_group, (unsigned long) gdp->bg_block_bitmap); -+ retval = -EIO; -+ } -+ /* -+ * On IO error, just leave a zero in the superblock's block pointer for -+ * this group. The IO will be retried next time. -+ */ -+error_out: -+ sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; -+ sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; -+ return retval; -+} -+ -+/* -+ * load_block_bitmap loads the block bitmap for a blocks group -+ * -+ * It maintains a cache for the last bitmaps loaded. This cache is managed -+ * with a LRU algorithm. -+ * -+ * Notes: -+ * 1/ There is one cache per mounted file system. -+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, -+ * this function reads the bitmap without maintaining a LRU cache. -+ * -+ * Return the slot used to store the bitmap, or a -ve error code. 
-+ */ -+static int __load_block_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ int i, j, retval = 0; -+ unsigned long block_bitmap_number; -+ struct buffer_head * block_bitmap; -+ -+ if (block_group >= sb->u.ext3_sb.s_groups_count) -+ ext3_panic (sb, "load_block_bitmap", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sb->u.ext3_sb.s_groups_count); -+ -+ if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { -+ if (sb->u.ext3_sb.s_block_bitmap[block_group]) { -+ if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == -+ block_group) -+ return block_group; -+ ext3_error (sb, "__load_block_bitmap", -+ "block_group != block_bitmap_number"); -+ } -+ retval = read_block_bitmap (sb, block_group, block_group); -+ if (retval < 0) -+ return retval; -+ return block_group; -+ } -+ -+ for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && -+ sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) -+ ; -+ if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && -+ sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { -+ block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; -+ block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; -+ for (j = i; j > 0; j--) { -+ sb->u.ext3_sb.s_block_bitmap_number[j] = -+ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; -+ sb->u.ext3_sb.s_block_bitmap[j] = -+ sb->u.ext3_sb.s_block_bitmap[j - 1]; -+ } -+ sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; -+ sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; -+ -+ /* -+ * There's still one special case here --- if block_bitmap == 0 -+ * then our last attempt to read the bitmap failed and we have -+ * just ended up caching that failure. Try again to read it. 
-+ */ -+ if (!block_bitmap) -+ retval = read_block_bitmap (sb, block_group, 0); -+ } else { -+ if (sb->u.ext3_sb.s_loaded_block_bitmapsu.ext3_sb.s_loaded_block_bitmaps++; -+ else -+ brelse (sb->u.ext3_sb.s_block_bitmap -+ [EXT3_MAX_GROUP_LOADED - 1]); -+ for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; -+ j > 0; j--) { -+ sb->u.ext3_sb.s_block_bitmap_number[j] = -+ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; -+ sb->u.ext3_sb.s_block_bitmap[j] = -+ sb->u.ext3_sb.s_block_bitmap[j - 1]; -+ } -+ retval = read_block_bitmap (sb, block_group, 0); -+ } -+ return retval; -+} -+ -+/* -+ * Load the block bitmap for a given block group. First of all do a couple -+ * of fast lookups for common cases and then pass the request onto the guts -+ * of the bitmap loader. -+ * -+ * Return the slot number of the group in the superblock bitmap cache's on -+ * success, or a -ve error code. -+ * -+ * There is still one inconsistency here --- if the number of groups in this -+ * filesystems is <= EXT3_MAX_GROUP_LOADED, then we have no way of -+ * differentiating between a group for which we have never performed a bitmap -+ * IO request, and a group for which the last bitmap read request failed. -+ */ -+static inline int load_block_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ int slot; -+ -+ /* -+ * Do the lookup for the slot. First of all, check if we're asking -+ * for the same slot as last time, and did we succeed that last time? -+ */ -+ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && -+ sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && -+ sb->u.ext3_sb.s_block_bitmap[0]) { -+ return 0; -+ } -+ /* -+ * Or can we do a fast lookup based on a loaded group on a filesystem -+ * small enough to be mapped directly into the superblock? 
-+ */ -+ else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && -+ sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group -+ && sb->u.ext3_sb.s_block_bitmap[block_group]) { -+ slot = block_group; -+ } -+ /* -+ * If not, then do a full lookup for this block group. -+ */ -+ else { -+ slot = __load_block_bitmap (sb, block_group); -+ } -+ -+ /* -+ * <0 means we just got an error -+ */ -+ if (slot < 0) -+ return slot; -+ -+ /* -+ * If it's a valid slot, we may still have cached a previous IO error, -+ * in which case the bh in the superblock cache will be zero. -+ */ -+ if (!sb->u.ext3_sb.s_block_bitmap[slot]) -+ return -EIO; -+ -+ /* -+ * Must have been read in OK to get this far. -+ */ -+ return slot; -+} -+ -+/* Free given blocks, update quota and i_blocks field */ -+void ext3_free_blocks (handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count) -+{ -+ struct buffer_head *bitmap_bh; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ unsigned long bit; -+ unsigned long i; -+ int bitmap_nr; -+ unsigned long overflow; -+ struct super_block * sb; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+ int err = 0, ret; -+ int dquot_freed_blocks = 0; -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ (block + count) > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug ("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ bitmap_nr = load_block_bitmap (sb, block_group); -+ if (bitmap_nr < 0) -+ goto error_return; -+ -+ bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ /* -+ * We are about to start releasing blocks in the bitmap, -+ * so we need undo access. -+ */ -+ /* @@@ check errors */ -+ BUFFER_TRACE(bitmap_bh, "getting undo access"); -+ err = ext3_journal_get_undo_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto error_return; -+ -+ for (i = 0; i < count; i++) { -+ /* -+ * An HJ special. This is expensive... 
-+ */ -+#ifdef CONFIG_JBD_DEBUG -+ { -+ struct buffer_head *debug_bh; -+ debug_bh = sb_get_hash_table(sb, block + i); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "Deleted!"); -+ if (!bh2jh(bitmap_bh)->b_committed_data) -+ BUFFER_TRACE(debug_bh, -+ "No commited data in bitmap"); -+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); -+ __brelse(debug_bh); -+ } -+ } -+#endif -+ BUFFER_TRACE(bitmap_bh, "clear bit"); -+ if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { -+ ext3_error (sb, __FUNCTION__, -+ "bit already cleared for block %lu", -+ block + i); -+ BUFFER_TRACE(bitmap_bh, "bit already cleared"); -+ } else { -+ dquot_freed_blocks++; -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); -+ } -+ /* @@@ This prevents newly-allocated data from being -+ * freed and then reallocated within the same -+ * transaction. -+ * -+ * Ideally we would want to allow that to happen, but to -+ * do so requires making journal_forget() capable of -+ * revoking the queued write of a data block, which -+ * implies blocking on the journal lock. *forget() -+ * cannot block due to truncate races. -+ * -+ * Eventually we can fix this by making journal_forget() -+ * return a status indicating whether or not it was able -+ * to revoke the buffer. On successful revoke, it is -+ * safe not to set the allocation bit in the committed -+ * bitmap, because we know that there is no outstanding -+ * activity on the buffer any more and so it is safe to -+ * reallocate it. 
-+ */ -+ BUFFER_TRACE(bitmap_bh, "clear in b_committed_data"); -+ J_ASSERT_BH(bitmap_bh, -+ bh2jh(bitmap_bh)->b_committed_data != NULL); -+ ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data); -+ } -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ /* And the superblock */ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); -+ ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, err); -+ unlock_super(sb); -+ if (dquot_freed_blocks) -+ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -+ return; -+} -+ -+/* For ext3 allocations, we must not reuse any blocks which are -+ * allocated in the bitmap buffer's "last committed data" copy. This -+ * prevents deletes from freeing up the page for reuse until we have -+ * committed the delete transaction. -+ * -+ * If we didn't do this, then deleting something and reallocating it as -+ * data would allow the old block to be overwritten before the -+ * transaction committed (because we force data to disk before commit). -+ * This would lead to corruption if we crashed between overwriting the -+ * data and committing the delete. -+ * -+ * @@@ We may want to make this allocation behaviour conditional on -+ * data-writes at some point, and disable it for metadata allocations or -+ * sync-data inodes. 
-+ */ -+static int ext3_test_allocatable(int nr, struct buffer_head *bh) -+{ -+ if (ext3_test_bit(nr, bh->b_data)) -+ return 0; -+ if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data) -+ return 1; -+ return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data); -+} -+ -+/* -+ * Find an allocatable block in a bitmap. We honour both the bitmap and -+ * its last-committed copy (if that exists), and perform the "most -+ * appropriate allocation" algorithm of looking for a free block near -+ * the initial goal; then for a free byte somewhere in the bitmap; then -+ * for any free bit in the bitmap. -+ */ -+static int find_next_usable_block(int start, -+ struct buffer_head *bh, int maxblocks) -+{ -+ int here, next; -+ char *p, *r; -+ -+ if (start > 0) { -+ /* -+ * The goal was occupied; search forward for a free -+ * block within the next XX blocks. -+ * -+ * end_goal is more or less random, but it has to be -+ * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the -+ * next 64-bit boundary is simple.. -+ */ -+ int end_goal = (start + 63) & ~63; -+ here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); -+ if (here < end_goal && ext3_test_allocatable(here, bh)) -+ return here; -+ -+ ext3_debug ("Bit not found near goal\n"); -+ -+ } -+ -+ here = start; -+ if (here < 0) -+ here = 0; -+ -+ /* -+ * There has been no free block found in the near vicinity of -+ * the goal: do a search forward through the block groups, -+ * searching in each group first for an entire free byte in the -+ * bitmap and then for any free bit. -+ * -+ * Search first in the remainder of the current group -+ */ -+ p = ((char *) bh->b_data) + (here >> 3); -+ r = memscan(p, 0, (maxblocks - here + 7) >> 3); -+ next = (r - ((char *) bh->b_data)) << 3; -+ -+ if (next < maxblocks && ext3_test_allocatable(next, bh)) -+ return next; -+ -+ /* The bitmap search --- search forward alternately -+ * through the actual bitmap and the last-committed copy -+ * until we find a bit free in both. 
*/ -+ -+ while (here < maxblocks) { -+ next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, -+ maxblocks, here); -+ if (next >= maxblocks) -+ return -1; -+ if (ext3_test_allocatable(next, bh)) -+ return next; -+ -+ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data); -+ here = ext3_find_next_zero_bit -+ ((unsigned long *) bh2jh(bh)->b_committed_data, -+ maxblocks, next); -+ } -+ return -1; -+} -+ -+/* -+ * ext3_new_block uses a goal block to assist allocation. If the goal is -+ * free, or there is a free block within 32 blocks of the goal, that block -+ * is allocated. Otherwise a forward search is made for a free block; within -+ * each block group the search first looks for an entire free byte in the block -+ * bitmap, and then for any free bit if that fails. -+ * This function also updates quota and i_blocks field. -+ */ -+int ext3_new_block (handle_t *handle, struct inode * inode, -+ unsigned long goal, u32 * prealloc_count, -+ u32 * prealloc_block, int * errp) -+{ -+ struct buffer_head * bh, *bhtmp; -+ struct buffer_head * bh2; -+#if 0 -+ char * p, * r; -+#endif -+ int i, j, k, tmp, alloctmp; -+ int bitmap_nr; -+ int fatal = 0, err; -+ int performed_allocation = 0; -+ struct super_block * sb; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+#ifdef EXT3FS_DEBUG -+ static int goal_hits = 0, goal_attempts = 0; -+#endif -+ *errp = -ENOSPC; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_new_block: nonexistent device"); -+ return 0; -+ } -+ -+ /* -+ * Check quota for allocation of this block. 
-+ */ -+ if (DQUOT_ALLOC_BLOCK(inode, 1)) { -+ *errp = -EDQUOT; -+ return 0; -+ } -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (le32_to_cpu(es->s_free_blocks_count) <= -+ le32_to_cpu(es->s_r_blocks_count) && -+ ((sb->u.ext3_sb.s_resuid != current->fsuid) && -+ (sb->u.ext3_sb.s_resgid == 0 || -+ !in_group_p (sb->u.ext3_sb.s_resgid)) && -+ !capable(CAP_SYS_RESOURCE))) -+ goto out; -+ -+ ext3_debug ("goal=%lu.\n", goal); -+ -+ /* -+ * First, test whether the goal block is free. -+ */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ i = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ gdp = ext3_get_group_desc (sb, i, &bh2); -+ if (!gdp) -+ goto io_error; -+ -+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { -+ j = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+#ifdef EXT3FS_DEBUG -+ if (j) -+ goal_attempts++; -+#endif -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto io_error; -+ -+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ -+ ext3_debug ("goal is at %d:%d.\n", i, j); -+ -+ if (ext3_test_allocatable(j, bh)) { -+#ifdef EXT3FS_DEBUG -+ goal_hits++; -+ ext3_debug ("goal bit allocated.\n"); -+#endif -+ goto got_block; -+ } -+ -+ j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb)); -+ if (j >= 0) -+ goto search_back; -+ } -+ -+ ext3_debug ("Bit not found in block group %d.\n", i); -+ -+ /* -+ * Now search the rest of the groups. We assume that -+ * i and gdp correctly point to the last group visited. 
-+ */ -+ for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { -+ i++; -+ if (i >= sb->u.ext3_sb.s_groups_count) -+ i = 0; -+ gdp = ext3_get_group_desc (sb, i, &bh2); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out; -+ } -+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto io_error; -+ -+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ j = find_next_usable_block(-1, bh, -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ if (j >= 0) -+ goto search_back; -+ } -+ } -+ -+ /* No space left on the device */ -+ goto out; -+ -+search_back: -+ /* -+ * We have succeeded in finding a free byte in the block -+ * bitmap. Now search backwards up to 7 bits to find the -+ * start of this group of free blocks. -+ */ -+ for ( k = 0; -+ k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh); -+ k++, j--) -+ ; -+ -+got_block: -+ -+ ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count); -+ -+ /* Make sure we use undo access for the bitmap, because it is -+ critical that we do the frozen_data COW on bitmap buffers in -+ all cases even if the buffer is in BJ_Forget state in the -+ committing transaction. 
*/ -+ BUFFER_TRACE(bh, "get undo access for marking new block"); -+ fatal = ext3_journal_get_undo_access(handle, bh); -+ if (fatal) goto out; -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh2); -+ if (fatal) goto out; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (fatal) goto out; -+ -+ tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || -+ tmp == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range (tmp, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group)) -+ ext3_error (sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", tmp); -+ -+ /* The superblock lock should guard against anybody else beating -+ * us to this point! */ -+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); -+ BUFFER_TRACE(bh, "setting bitmap bit"); -+ ext3_set_bit(j, bh->b_data); -+ performed_allocation = 1; -+ -+#ifdef CONFIG_JBD_DEBUG -+ { -+ struct buffer_head *debug_bh; -+ -+ /* Record bitmap buffer state in the newly allocated block */ -+ debug_bh = sb_get_hash_table(sb, tmp); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "state when allocated"); -+ BUFFER_TRACE2(debug_bh, bh, "bitmap state"); -+ brelse(debug_bh); -+ } -+ } -+#endif -+ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data) -+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data)); -+ bhtmp = bh; -+ alloctmp = j; -+ -+ ext3_debug ("found bit %d\n", j); -+ -+ /* -+ * Do block preallocation now if required. -+ */ -+#ifdef EXT3_PREALLOCATE -+ /* -+ * akpm: this is not enabled for ext3. Need to use -+ * ext3_test_allocatable() -+ */ -+ /* Writer: ->i_prealloc* */ -+ if (prealloc_count && !*prealloc_count) { -+ int prealloc_goal; -+ unsigned long next_block = tmp + 1; -+ -+ prealloc_goal = es->s_prealloc_blocks ? 
-+ es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS; -+ -+ *prealloc_block = next_block; -+ /* Writer: end */ -+ for (k = 1; -+ k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb); -+ k++, next_block++) { -+ if (DQUOT_PREALLOC_BLOCK(inode, 1)) -+ break; -+ /* Writer: ->i_prealloc* */ -+ if (*prealloc_block + *prealloc_count != next_block || -+ ext3_set_bit (j + k, bh->b_data)) { -+ /* Writer: end */ -+ DQUOT_FREE_BLOCK(inode, 1); -+ break; -+ } -+ (*prealloc_count)++; -+ /* Writer: end */ -+ } -+ /* -+ * As soon as we go for per-group spinlocks we'll need these -+ * done inside the loop above. -+ */ -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ (k - 1)); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - -+ (k - 1)); -+ ext3_debug ("Preallocated a further %lu bits.\n", -+ (k - 1)); -+ } -+#endif -+ -+ j = tmp; -+ -+ BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) fatal = err; -+ -+ if (j >= le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_new_block", -+ "block(%d) >= blocks count(%d) - " -+ "block_group = %d, es == %p ",j, -+ le32_to_cpu(es->s_blocks_count), i, es); -+ goto out; -+ } -+ -+ /* -+ * It is up to the caller to add the new buffer to a journal -+ * list of some description. We don't know in advance whether -+ * the caller wants to use it as metadata or data. -+ */ -+ -+ ext3_debug ("allocating block %d. 
" -+ "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); -+ -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); -+ -+ BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (!fatal) fatal = err; -+ -+ BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!fatal) fatal = err; -+ -+ sb->s_dirt = 1; -+ if (fatal) -+ goto out; -+ -+ unlock_super (sb); -+ *errp = 0; -+ return j; -+ -+io_error: -+ *errp = -EIO; -+out: -+ if (fatal) { -+ *errp = fatal; -+ ext3_std_error(sb, fatal); -+ } -+ unlock_super (sb); -+ /* -+ * Undo the block allocation -+ */ -+ if (!performed_allocation) -+ DQUOT_FREE_BLOCK(inode, 1); -+ return 0; -+ -+} -+ -+unsigned long ext3_count_free_blocks (struct super_block * sb) -+{ -+#ifdef EXT3FS_DEBUG -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], -+ sb->s_blocksize); -+ printk ("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_blocks_count), x); -+ bitmap_count += x; -+ } -+ printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n", -+ le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); -+ unlock_super (sb); -+ return bitmap_count; -+#else -+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); -+#endif -+} 
-+ -+static inline int block_in_use (unsigned long block, -+ struct super_block * sb, -+ unsigned char * map) -+{ -+ return ext3_test_bit ((block - -+ le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb), map); -+} -+ -+static inline int test_root(int a, int b) -+{ -+ if (a == 0) -+ return 1; -+ while (1) { -+ if (a == 1) -+ return 1; -+ if (a % b) -+ return 0; -+ a = a / b; -+ } -+} -+ -+int ext3_group_sparse(int group) -+{ -+ return (test_root(group, 3) || test_root(group, 5) || -+ test_root(group, 7)); -+} -+ -+/** -+ * ext3_bg_has_super - number of blocks used by the superblock in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the superblock (primary or backup) -+ * in this group. Currently this will be only 0 or 1. -+ */ -+int ext3_bg_has_super(struct super_block *sb, int group) -+{ -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& -+ !ext3_group_sparse(group)) -+ return 0; -+ return 1; -+} -+ -+/** -+ * ext3_bg_num_gdb - number of blocks used by the group table in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the group descriptor table -+ * (primary or backup) in this group. In the future there may be a -+ * different number of descriptor blocks in each group. 
-+ */ -+unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) -+{ -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& -+ !ext3_group_sparse(group)) -+ return 0; -+ return EXT3_SB(sb)->s_gdb_count; -+} -+ -+#ifdef CONFIG_EXT3_CHECK -+/* Called at mount-time, super-block is locked */ -+void ext3_check_blocks_bitmap (struct super_block * sb) -+{ -+ struct buffer_head * bh; -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x, j; -+ unsigned long desc_blocks; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; -+ -+ if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data)) -+ ext3_error(sb, __FUNCTION__, -+ "Superblock in group %d is marked free", i); -+ -+ desc_blocks = ext3_bg_num_gdb(sb, i); -+ for (j = 0; j < desc_blocks; j++) -+ if (!ext3_test_bit(j + 1, bh->b_data)) -+ ext3_error(sb, __FUNCTION__, -+ "Descriptor block #%ld in group " -+ "%d is marked free", j, i); -+ -+ if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Block bitmap for group %d is marked free", -+ i); -+ -+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Inode bitmap for group %d is marked free", -+ i); -+ -+ for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) -+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Block #%d of the inode table in " -+ "group %d is marked free", j, i); -+ -+ x = ext3_count_free 
(bh, sb->s_blocksize); -+ if (le16_to_cpu(gdp->bg_free_blocks_count) != x) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Wrong free blocks count for group %d, " -+ "stored = %d, counted = %lu", i, -+ le16_to_cpu(gdp->bg_free_blocks_count), x); -+ bitmap_count += x; -+ } -+ if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Wrong free blocks count in super block, " -+ "stored = %lu, counted = %lu", -+ (unsigned long)le32_to_cpu(es->s_free_blocks_count), -+ bitmap_count); -+} -+#endif -diff -rup --new-file linux.mcp2/fs/ext3/bitmap.c linux_tmp/fs/ext3/bitmap.c ---- linux.mcp2/fs/ext3/bitmap.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/bitmap.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,26 @@ -+/* -+ * linux/fs/ext3/bitmap.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+ -+ -+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -+ -+unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) -+{ -+ unsigned int i; -+ unsigned long sum = 0; -+ -+ if (!map) -+ return (0); -+ for (i = 0; i < numchars; i++) -+ sum += nibblemap[map->b_data[i] & 0xf] + -+ nibblemap[(map->b_data[i] >> 4) & 0xf]; -+ return (sum); -+} -diff -rup --new-file linux.mcp2/fs/ext3/dir.c linux_tmp/fs/ext3/dir.c ---- linux.mcp2/fs/ext3/dir.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,190 @@ -+/* -+ * linux/fs/ext3/dir.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/dir.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3 directory handling functions -+ * -+ * Big-endian to 
little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+ -+static unsigned char ext3_filetype_table[] = { -+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -+}; -+ -+static int ext3_readdir(struct file *, void *, filldir_t); -+ -+struct file_operations ext3_dir_operations = { -+ read: generic_read_dir, -+ readdir: ext3_readdir, /* BKL held */ -+ ioctl: ext3_ioctl, /* BKL held */ -+ fsync: ext3_sync_file, /* BKL held */ -+}; -+ -+int ext3_check_dir_entry (const char * function, struct inode * dir, -+ struct ext3_dir_entry_2 * de, -+ struct buffer_head * bh, -+ unsigned long offset) -+{ -+ const char * error_msg = NULL; -+ const int rlen = le16_to_cpu(de->rec_len); -+ -+ if (rlen < EXT3_DIR_REC_LEN(1)) -+ error_msg = "rec_len is smaller than minimal"; -+ else if (rlen % 4 != 0) -+ error_msg = "rec_len % 4 != 0"; -+ else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) -+ error_msg = "rec_len is too small for name_len"; -+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) -+ error_msg = "directory entry across blocks"; -+ else if (le32_to_cpu(de->inode) > -+ le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) -+ error_msg = "inode out of bounds"; -+ -+ if (error_msg != NULL) -+ ext3_error (dir->i_sb, function, -+ "bad entry in directory #%lu: %s - " -+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", -+ dir->i_ino, error_msg, offset, -+ (unsigned long) le32_to_cpu(de->inode), -+ rlen, de->name_len); -+ return error_msg == NULL ? 
1 : 0; -+} -+ -+static int ext3_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ int error = 0; -+ unsigned long offset, blk; -+ int i, num, stored; -+ struct buffer_head * bh, * tmp, * bha[16]; -+ struct ext3_dir_entry_2 * de; -+ struct super_block * sb; -+ int err; -+ struct inode *inode = filp->f_dentry->d_inode; -+ -+ sb = inode->i_sb; -+ -+ stored = 0; -+ bh = NULL; -+ offset = filp->f_pos & (sb->s_blocksize - 1); -+ -+ while (!error && !stored && filp->f_pos < inode->i_size) { -+ blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); -+ bh = ext3_bread (0, inode, blk, 0, &err); -+ if (!bh) { -+ ext3_error (sb, "ext3_readdir", -+ "directory #%lu contains a hole at offset %lu", -+ inode->i_ino, (unsigned long)filp->f_pos); -+ filp->f_pos += sb->s_blocksize - offset; -+ continue; -+ } -+ -+ /* -+ * Do the readahead -+ */ -+ if (!offset) { -+ for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; -+ i > 0; i--) { -+ tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); -+ if (tmp && !buffer_uptodate(tmp) && -+ !buffer_locked(tmp)) -+ bha[num++] = tmp; -+ else -+ brelse (tmp); -+ } -+ if (num) { -+ ll_rw_block (READA, num, bha); -+ for (i = 0; i < num; i++) -+ brelse (bha[i]); -+ } -+ } -+ -+revalidate: -+ /* If the dir block has changed since the last call to -+ * readdir(2), then we might be pointing to an invalid -+ * dirent right now. Scan from the start of the block -+ * to make sure. */ -+ if (filp->f_version != inode->i_version) { -+ for (i = 0; i < sb->s_blocksize && i < offset; ) { -+ de = (struct ext3_dir_entry_2 *) -+ (bh->b_data + i); -+ /* It's too expensive to do a full -+ * dirent test each time round this -+ * loop, but we do have to test at -+ * least that it is non-zero. A -+ * failure will be detected in the -+ * dirent test below. 
*/ -+ if (le16_to_cpu(de->rec_len) < -+ EXT3_DIR_REC_LEN(1)) -+ break; -+ i += le16_to_cpu(de->rec_len); -+ } -+ offset = i; -+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) -+ | offset; -+ filp->f_version = inode->i_version; -+ } -+ -+ while (!error && filp->f_pos < inode->i_size -+ && offset < sb->s_blocksize) { -+ de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); -+ if (!ext3_check_dir_entry ("ext3_readdir", inode, de, -+ bh, offset)) { -+ /* On error, skip the f_pos to the -+ next block. */ -+ filp->f_pos = (filp->f_pos | -+ (sb->s_blocksize - 1)) + 1; -+ brelse (bh); -+ return stored; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ if (le32_to_cpu(de->inode)) { -+ /* We might block in the next section -+ * if the data destination is -+ * currently swapped out. So, use a -+ * version stamp to detect whether or -+ * not the directory has been modified -+ * during the copy operation. -+ */ -+ unsigned long version = filp->f_version; -+ unsigned char d_type = DT_UNKNOWN; -+ -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, -+ EXT3_FEATURE_INCOMPAT_FILETYPE) -+ && de->file_type < EXT3_FT_MAX) -+ d_type = -+ ext3_filetype_table[de->file_type]; -+ error = filldir(dirent, de->name, -+ de->name_len, -+ filp->f_pos, -+ le32_to_cpu(de->inode), -+ d_type); -+ if (error) -+ break; -+ if (version != filp->f_version) -+ goto revalidate; -+ stored ++; -+ } -+ filp->f_pos += le16_to_cpu(de->rec_len); -+ } -+ offset = 0; -+ brelse (bh); -+ } -+ UPDATE_ATIME(inode); -+ return 0; -+} -diff -rup --new-file linux.mcp2/fs/ext3/file.c linux_tmp/fs/ext3/file.c ---- linux.mcp2/fs/ext3/file.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/file.c 2001-11-15 13:37:55.000000000 -0800 -@@ -0,0 +1,94 @@ -+/* -+ * linux/fs/ext3/file.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/file.c -+ * -+ * Copyright (C) 
1991, 1992 Linus Torvalds -+ * -+ * ext3 fs regular file handling primitives -+ * -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * Called when an inode is released. Note that this is different -+ * from ext3_file_open: open gets called at every open, but release -+ * gets called only when /all/ the files are closed. -+ */ -+static int ext3_release_file (struct inode * inode, struct file * filp) -+{ -+ if (filp->f_mode & FMODE_WRITE) -+ ext3_discard_prealloc (inode); -+ return 0; -+} -+ -+/* -+ * Called when an inode is about to be opened. -+ * We use this to disallow opening RW large files on 32bit systems if -+ * the caller didn't specify O_LARGEFILE. On 64bit systems we force -+ * on this flag in sys_open. -+ */ -+static int ext3_open_file (struct inode * inode, struct file * filp) -+{ -+ if (!(filp->f_flags & O_LARGEFILE) && -+ inode->i_size > 0x7FFFFFFFLL) -+ return -EFBIG; -+ return 0; -+} -+ -+/* -+ * ext3_file_write(). -+ * -+ * Most things are done in ext3_prepare_write() and ext3_commit_write(). -+ */ -+ -+static ssize_t -+ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ -+ /* -+ * Nasty: if the file is subject to synchronous writes then we need -+ * to force generic_osync_inode() to call ext3_write_inode(). -+ * We do that by marking the inode dirty. This adds much more -+ * computational expense than we need, but we're going to sync -+ * anyway. -+ */ -+ if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) -+ mark_inode_dirty(inode); -+ -+ return generic_file_write(file, buf, count, ppos); -+} -+ -+struct file_operations ext3_file_operations = { -+ llseek: generic_file_llseek, /* BKL held */ -+ read: generic_file_read, /* BKL not held. Don't need */ -+ write: ext3_file_write, /* BKL not held. 
Don't need */ -+ ioctl: ext3_ioctl, /* BKL held */ -+ mmap: generic_file_mmap, -+ open: ext3_open_file, /* BKL not held. Don't need */ -+ release: ext3_release_file, /* BKL not held. Don't need */ -+ fsync: ext3_sync_file, /* BKL held */ -+}; -+ -+struct inode_operations ext3_file_inode_operations = { -+ truncate: ext3_truncate, /* BKL held */ -+ setattr: ext3_setattr, /* BKL held */ -+}; -+ -diff -rup --new-file linux.mcp2/fs/ext3/fsync.c linux_tmp/fs/ext3/fsync.c ---- linux.mcp2/fs/ext3/fsync.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/fsync.c 2001-11-20 21:34:13.000000000 -0800 -@@ -0,0 +1,70 @@ -+/* -+ * linux/fs/ext3/fsync.c -+ * -+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) -+ * from -+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * from -+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3fs fsync primitive -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * -+ * Removed unnecessary code duplication for little endian machines -+ * and excessive __inline__s. -+ * Andi Kleen, 1997 -+ * -+ * Major simplications and cleanup - we only need to do the metadata, because -+ * we can depend on generic_block_fdatasync() to sync the data blocks. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * akpm: A new design for ext3_sync_file(). -+ * -+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). -+ * There cannot be a transaction open by this task. (AKPM: quotas?) -+ * Another task could have dirtied this inode. Its data can be in any -+ * state in the journalling system. -+ * -+ * What we do is just kick off a commit and wait on it. This will snapshot the -+ * inode to disk. 
-+ * -+ * Note that there is a serious optimisation we can make here: if the current -+ * inode is not part of j_running_transaction or j_committing_transaction -+ * then we have nothing to do. That would require implementation of t_ilist, -+ * which isn't too hard. -+ */ -+ -+int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) -+{ -+ struct inode *inode = dentry->d_inode; -+ int ret; -+ -+ J_ASSERT(ext3_journal_current_handle() == 0); -+ -+ /* -+ * fsync_inode_buffers() just walks i_dirty_buffers and waits -+ * on them. It's a no-op for full data journalling because -+ * i_dirty_buffers will be ampty. -+ * Really, we only need to start I/O on the dirty buffers - -+ * we'll end up waiting on them in commit. -+ */ -+ ret = fsync_inode_buffers(inode); -+ ret |= fsync_inode_data_buffers(inode); -+ -+ ext3_force_commit(inode->i_sb); -+ -+ return ret; -+} -diff -rup --new-file linux.mcp2/fs/ext3/ialloc.c linux_tmp/fs/ext3/ialloc.c ---- linux.mcp2/fs/ext3/ialloc.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/ialloc.c 2002-02-25 11:38:08.000000000 -0800 -@@ -0,0 +1,663 @@ -+/* -+ * linux/fs/ext3/ialloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * BSD ufs-inspired inode and directory allocation by -+ * Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+/* -+ * ialloc.c contains the inodes allocation and deallocation routines -+ */ -+ -+/* -+ * The free inodes are managed by bitmaps. A file system contains several -+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. 
-+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. The descriptors are loaded in memory -+ * when a file system is mounted (see ext3_read_super). -+ */ -+ -+ -+/* -+ * Read the inode allocation bitmap for a given block_group, reading -+ * into the specified slot in the superblock's bitmap cache. -+ * -+ * Return >=0 on success or a -ve error code. -+ */ -+static int read_inode_bitmap (struct super_block * sb, -+ unsigned long block_group, -+ unsigned int bitmap_nr) -+{ -+ struct ext3_group_desc * gdp; -+ struct buffer_head * bh = NULL; -+ int retval = 0; -+ -+ gdp = ext3_get_group_desc (sb, block_group, NULL); -+ if (!gdp) { -+ retval = -EIO; -+ goto error_out; -+ } -+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_inode_bitmap)); -+ if (!bh) { -+ ext3_error (sb, "read_inode_bitmap", -+ "Cannot read inode bitmap - " -+ "block_group = %lu, inode_bitmap = %lu", -+ block_group, (unsigned long) gdp->bg_inode_bitmap); -+ retval = -EIO; -+ } -+ /* -+ * On IO error, just leave a zero in the superblock's block pointer for -+ * this group. The IO will be retried next time. -+ */ -+error_out: -+ sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; -+ sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; -+ return retval; -+} -+ -+/* -+ * load_inode_bitmap loads the inode bitmap for a blocks group -+ * -+ * It maintains a cache for the last bitmaps loaded. This cache is managed -+ * with a LRU algorithm. -+ * -+ * Notes: -+ * 1/ There is one cache per mounted file system. -+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, -+ * this function reads the bitmap without maintaining a LRU cache. -+ * -+ * Return the slot used to store the bitmap, or a -ve error code. 
-+ */ -+static int load_inode_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long inode_bitmap_number; -+ struct buffer_head * inode_bitmap; -+ int i, j, retval = 0; -+ -+ if (block_group >= sbi->s_groups_count) -+ ext3_panic (sb, "load_inode_bitmap", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sbi->s_groups_count); -+ if (sbi->s_loaded_inode_bitmaps > 0 && -+ sbi->s_inode_bitmap_number[0] == block_group && -+ sbi->s_inode_bitmap[0] != NULL) -+ return 0; -+ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { -+ if (sbi->s_inode_bitmap[block_group]) { -+ if (sbi->s_inode_bitmap_number[block_group] != -+ block_group) -+ ext3_panic(sb, "load_inode_bitmap", -+ "block_group != inode_bitmap_number"); -+ return block_group; -+ } -+ retval = read_inode_bitmap(sb, block_group, block_group); -+ if (retval < 0) -+ return retval; -+ return block_group; -+ } -+ -+ for (i = 0; i < sbi->s_loaded_inode_bitmaps && -+ sbi->s_inode_bitmap_number[i] != block_group; i++) -+ /* do nothing */; -+ if (i < sbi->s_loaded_inode_bitmaps && -+ sbi->s_inode_bitmap_number[i] == block_group) { -+ inode_bitmap_number = sbi->s_inode_bitmap_number[i]; -+ inode_bitmap = sbi->s_inode_bitmap[i]; -+ for (j = i; j > 0; j--) { -+ sbi->s_inode_bitmap_number[j] = -+ sbi->s_inode_bitmap_number[j - 1]; -+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; -+ } -+ sbi->s_inode_bitmap_number[0] = inode_bitmap_number; -+ sbi->s_inode_bitmap[0] = inode_bitmap; -+ -+ /* -+ * There's still one special case here --- if inode_bitmap == 0 -+ * then our last attempt to read the bitmap failed and we have -+ * just ended up caching that failure. Try again to read it. 
-+ */ -+ if (!inode_bitmap) -+ retval = read_inode_bitmap (sb, block_group, 0); -+ } else { -+ if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) -+ sbi->s_loaded_inode_bitmaps++; -+ else -+ brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); -+ for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { -+ sbi->s_inode_bitmap_number[j] = -+ sbi->s_inode_bitmap_number[j - 1]; -+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; -+ } -+ retval = read_inode_bitmap (sb, block_group, 0); -+ } -+ return retval; -+} -+ -+/* -+ * NOTE! When we get the inode, we're the only people -+ * that have access to it, and as such there are no -+ * race conditions we have to worry about. The inode -+ * is not on the hash-lists, and it cannot be reached -+ * through the filesystem because the directory entry -+ * has been deleted earlier. -+ * -+ * HOWEVER: we must make sure that we get no aliases, -+ * which means that we have to call "clear_inode()" -+ * _before_ we mark the inode not in use in the inode -+ * bitmaps. Otherwise a newly created file might use -+ * the same inode number (not actually the same pointer -+ * though), and then we'd have two inodes sharing the -+ * same inode number and space on the harddisk. 
-+ */ -+void ext3_free_inode (handle_t *handle, struct inode * inode) -+{ -+ struct super_block * sb = inode->i_sb; -+ int is_directory; -+ unsigned long ino; -+ struct buffer_head * bh; -+ struct buffer_head * bh2; -+ unsigned long block_group; -+ unsigned long bit; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+ int fatal = 0, err; -+ -+ if (!inode->i_dev) { -+ printk ("ext3_free_inode: inode has no device\n"); -+ return; -+ } -+ if (atomic_read(&inode->i_count) > 1) { -+ printk ("ext3_free_inode: inode has count=%d\n", -+ atomic_read(&inode->i_count)); -+ return; -+ } -+ if (inode->i_nlink) { -+ printk ("ext3_free_inode: inode has nlink=%d\n", -+ inode->i_nlink); -+ return; -+ } -+ if (!sb) { -+ printk("ext3_free_inode: inode on nonexistent device\n"); -+ return; -+ } -+ -+ ino = inode->i_ino; -+ ext3_debug ("freeing inode %lu\n", ino); -+ -+ /* -+ * Note: we must free any quota before locking the superblock, -+ * as writing the quota to disk may need the lock as well. -+ */ -+ DQUOT_INIT(inode); -+ DQUOT_FREE_INODE(inode); -+ DQUOT_DROP(inode); -+ -+ is_directory = S_ISDIR(inode->i_mode); -+ -+ /* Do this BEFORE marking the inode not in use or returning an error */ -+ clear_inode (inode); -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { -+ ext3_error (sb, "ext3_free_inode", -+ "reserved or nonexistent inode %lu", ino); -+ goto error_return; -+ } -+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); -+ bitmap_nr = load_inode_bitmap (sb, block_group); -+ if (bitmap_nr < 0) -+ goto error_return; -+ -+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh); -+ if (fatal) -+ goto error_return; -+ -+ /* Ok, now we can actually update the inode bitmaps.. 
*/ -+ if (!ext3_clear_bit (bit, bh->b_data)) -+ ext3_error (sb, "ext3_free_inode", -+ "bit already cleared for inode %lu", ino); -+ else { -+ gdp = ext3_get_group_desc (sb, block_group, &bh2); -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh2); -+ if (fatal) goto error_return; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); -+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (fatal) goto error_return; -+ -+ if (gdp) { -+ gdp->bg_free_inodes_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_free_inodes_count) + 1); -+ if (is_directory) -+ gdp->bg_used_dirs_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_used_dirs_count) - 1); -+ } -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (!fatal) fatal = err; -+ es->s_free_inodes_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, -+ "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!fatal) fatal = err; -+ } -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) -+ fatal = err; -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, fatal); -+ unlock_super(sb); -+} -+ -+/* -+ * There are two policies for allocating an inode. If the new inode is -+ * a directory, then a forward search is made for a block group with both -+ * free space and a low directory-to-inode ratio; if that fails, then of -+ * the groups with above-average free space, that group with the fewest -+ * directories already is chosen. -+ * -+ * For other inodes, search forward from the parent directory's block -+ * group to find a free inode. 
-+ */ -+struct inode * ext3_new_inode (handle_t *handle, -+ const struct inode * dir, int mode) -+{ -+ struct super_block * sb; -+ struct buffer_head * bh; -+ struct buffer_head * bh2; -+ int i, j, avefreei; -+ struct inode * inode; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ struct ext3_group_desc * tmp; -+ struct ext3_super_block * es; -+ int err = 0; -+ -+ /* Cannot create files in a deleted directory */ -+ if (!dir || !dir->i_nlink) -+ return ERR_PTR(-EPERM); -+ -+ sb = dir->i_sb; -+ inode = new_inode(sb); -+ if (!inode) -+ return ERR_PTR(-ENOMEM); -+ init_rwsem(&inode->u.ext3_i.truncate_sem); -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+repeat: -+ gdp = NULL; -+ i = 0; -+ -+ if (S_ISDIR(mode)) { -+ avefreei = le32_to_cpu(es->s_free_inodes_count) / -+ sb->u.ext3_sb.s_groups_count; -+ if (!gdp) { -+ for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { -+ struct buffer_head *temp_buffer; -+ tmp = ext3_get_group_desc (sb, j, &temp_buffer); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count) && -+ le16_to_cpu(tmp->bg_free_inodes_count) >= -+ avefreei) { -+ if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) > -+ le16_to_cpu(gdp->bg_free_blocks_count))) { -+ i = j; -+ gdp = tmp; -+ bh2 = temp_buffer; -+ } -+ } -+ } -+ } -+ } else { -+ /* -+ * Try to place the inode in its parent directory -+ */ -+ i = dir->u.ext3_i.i_block_group; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) -+ gdp = tmp; -+ else -+ { -+ /* -+ * Use a quadratic hash to find a group with a -+ * free inode -+ */ -+ for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { -+ i += j; -+ if (i >= sb->u.ext3_sb.s_groups_count) -+ i -= sb->u.ext3_sb.s_groups_count; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count)) { -+ gdp = tmp; -+ break; -+ } -+ } -+ } -+ if (!gdp) { -+ /* -+ * That failed: try linear search for a free inode -+ */ -+ i = dir->u.ext3_i.i_block_group + 1; -+ for (j = 
2; j < sb->u.ext3_sb.s_groups_count; j++) { -+ if (++i >= sb->u.ext3_sb.s_groups_count) -+ i = 0; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count)) { -+ gdp = tmp; -+ break; -+ } -+ } -+ } -+ } -+ -+ err = -ENOSPC; -+ if (!gdp) -+ goto fail; -+ -+ err = -EIO; -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto fail; -+ -+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; -+ -+ if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, -+ EXT3_INODES_PER_GROUP(sb))) < -+ EXT3_INODES_PER_GROUP(sb)) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit (j, bh->b_data)) { -+ ext3_error (sb, "ext3_new_inode", -+ "bit already set for inode %d", j); -+ goto repeat; -+ } -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) goto fail; -+ } else { -+ if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { -+ ext3_error (sb, "ext3_new_inode", -+ "Free inodes count corrupted in group %d", -+ i); -+ /* Is it really ENOSPC? 
*/ -+ err = -ENOSPC; -+ if (sb->s_flags & MS_RDONLY) -+ goto fail; -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh2); -+ if (err) goto fail; -+ gdp->bg_free_inodes_count = 0; -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) goto fail; -+ } -+ goto repeat; -+ } -+ j += i * EXT3_INODES_PER_GROUP(sb) + 1; -+ if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { -+ ext3_error (sb, "ext3_new_inode", -+ "reserved inode or inode > inodes count - " -+ "block_group = %d,inode=%d", i, j); -+ err = -EIO; -+ goto fail; -+ } -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh2); -+ if (err) goto fail; -+ gdp->bg_free_inodes_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); -+ if (S_ISDIR(mode)) -+ gdp->bg_used_dirs_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) goto fail; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) goto fail; -+ es->s_free_inodes_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ sb->s_dirt = 1; -+ if (err) goto fail; -+ -+ inode->i_uid = current->fsuid; -+ if (test_opt (sb, GRPID)) -+ inode->i_gid = dir->i_gid; -+ else if (dir->i_mode & S_ISGID) { -+ inode->i_gid = dir->i_gid; -+ if (S_ISDIR(mode)) -+ mode |= S_ISGID; -+ } else -+ inode->i_gid = current->fsgid; -+ inode->i_mode = mode; -+ -+ inode->i_ino = j; -+ /* This is the optimal IO size (for stat), not the fs block size */ -+ inode->i_blksize = PAGE_SIZE; -+ inode->i_blocks = 0; -+ inode->i_mtime = inode->i_atime = 
inode->i_ctime = CURRENT_TIME; -+ inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; -+ if (S_ISLNK(mode)) -+ inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); -+#ifdef EXT3_FRAGMENTS -+ inode->u.ext3_i.i_faddr = 0; -+ inode->u.ext3_i.i_frag_no = 0; -+ inode->u.ext3_i.i_frag_size = 0; -+#endif -+ inode->u.ext3_i.i_file_acl = 0; -+ inode->u.ext3_i.i_dir_acl = 0; -+ inode->u.ext3_i.i_dtime = 0; -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+#ifdef EXT3_PREALLOCATE -+ inode->u.ext3_i.i_prealloc_count = 0; -+#endif -+ inode->u.ext3_i.i_block_group = i; -+ -+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) -+ inode->i_flags |= S_SYNC; -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ insert_inode_hash(inode); -+ inode->i_generation = sb->u.ext3_sb.s_next_generation++; -+ -+ inode->u.ext3_i.i_state = EXT3_STATE_NEW; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err) goto fail; -+ -+ unlock_super (sb); -+ if(DQUOT_ALLOC_INODE(inode)) { -+ DQUOT_DROP(inode); -+ inode->i_flags |= S_NOQUOTA; -+ inode->i_nlink = 0; -+ iput(inode); -+ return ERR_PTR(-EDQUOT); -+ } -+ ext3_debug ("allocating inode %lu\n", inode->i_ino); -+ return inode; -+ -+fail: -+ unlock_super(sb); -+ iput(inode); -+ ext3_std_error(sb, err); -+ return ERR_PTR(err); -+} -+ -+/* Verify that we are loading a valid orphan from disk */ -+struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) -+{ -+ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); -+ unsigned long block_group; -+ int bit; -+ int bitmap_nr; -+ struct buffer_head *bh; -+ struct inode *inode = NULL; -+ -+ /* Error cases - e2fsck has already cleaned up for us */ -+ if (ino > max_ino) { -+ ext3_warning(sb, __FUNCTION__, -+ "bad orphan ino %ld! 
e2fsck was run?\n", ino); -+ return NULL; -+ } -+ -+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); -+ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || -+ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { -+ ext3_warning(sb, __FUNCTION__, -+ "inode bitmap error for orphan %ld\n", ino); -+ return NULL; -+ } -+ -+ /* Having the inode bit set should be a 100% indicator that this -+ * is a valid orphan (no e2fsck run on fs). Orphans also include -+ * inodes that were being truncated, so we can't check i_nlink==0. -+ */ -+ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || -+ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { -+ ext3_warning(sb, __FUNCTION__, -+ "bad orphan inode %ld! e2fsck was run?\n", ino); -+ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", -+ bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); -+ printk(KERN_NOTICE "inode=%p\n", inode); -+ if (inode) { -+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", -+ is_bad_inode(inode)); -+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n", -+ NEXT_ORPHAN(inode)); -+ printk(KERN_NOTICE "max_ino=%ld\n", max_ino); -+ } -+ /* Avoid freeing blocks if we got a bad deleted inode */ -+ if (inode && inode->i_nlink == 0) -+ inode->i_blocks = 0; -+ iput(inode); -+ return NULL; -+ } -+ -+ return inode; -+} -+ -+unsigned long ext3_count_free_inodes (struct super_block * sb) -+{ -+#ifdef EXT3FS_DEBUG -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free 
(sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], -+ EXT3_INODES_PER_GROUP(sb) / 8); -+ printk ("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_inodes_count), x); -+ bitmap_count += x; -+ } -+ printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n", -+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); -+ unlock_super (sb); -+ return desc_count; -+#else -+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); -+#endif -+} -+ -+#ifdef CONFIG_EXT3_CHECK -+/* Called at mount-time, super-block is locked */ -+void ext3_check_inodes_bitmap (struct super_block * sb) -+{ -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], -+ EXT3_INODES_PER_GROUP(sb) / 8); -+ if (le16_to_cpu(gdp->bg_free_inodes_count) != x) -+ ext3_error (sb, "ext3_check_inodes_bitmap", -+ "Wrong free inodes count in group %d, " -+ "stored = %d, counted = %lu", i, -+ le16_to_cpu(gdp->bg_free_inodes_count), x); -+ bitmap_count += x; -+ } -+ if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) -+ ext3_error (sb, "ext3_check_inodes_bitmap", -+ "Wrong free inodes count in super block, " -+ "stored = %lu, counted = %lu", -+ (unsigned long)le32_to_cpu(es->s_free_inodes_count), -+ bitmap_count); -+} -+#endif -diff -rup --new-file linux.mcp2/fs/ext3/inode.c linux_tmp/fs/ext3/inode.c ---- linux.mcp2/fs/ext3/inode.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/inode.c 2002-08-02 17:39:45.000000000 -0700 -@@ -0,0 +1,2699 @@ -+/* -+ * linux/fs/ext3/inode.c 
-+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Goal-directed block allocation by Stephen Tweedie -+ * (sct@redhat.com), 1993, 1998 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ * -+ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * SEARCH_FROM_ZERO forces each block allocation to search from the start -+ * of the filesystem. This is to force rapid reallocation of recently-freed -+ * blocks. The file fragmentation is horrendous. -+ */ -+#undef SEARCH_FROM_ZERO -+ -+/* The ext3 forget function must perform a revoke if we are freeing data -+ * which has been journaled. Metadata (eg. indirect blocks) must be -+ * revoked in all cases. -+ * -+ * "bh" may be NULL: a metadata block may have been freed from memory -+ * but there may still be a record of it in the journal, and that record -+ * still needs to be revoked. -+ */ -+ -+static int ext3_forget(handle_t *handle, int is_metadata, -+ struct inode *inode, struct buffer_head *bh, -+ int blocknr) -+{ -+ int err; -+ -+ BUFFER_TRACE(bh, "enter"); -+ -+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " -+ "data mode %lx\n", -+ bh, is_metadata, inode->i_mode, -+ test_opt(inode->i_sb, DATA_FLAGS)); -+ -+ /* Never use the revoke function if we are doing full data -+ * journaling: there is no need to, and a V1 superblock won't -+ * support it. Otherwise, only skip the revoke on un-journaled -+ * data blocks. 
*/ -+ -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || -+ (!is_metadata && !ext3_should_journal_data(inode))) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call journal_forget"); -+ ext3_journal_forget(handle, bh); -+ } -+ return 0; -+ } -+ -+ /* -+ * data!=journal && (is_metadata || should_journal_data(inode)) -+ */ -+ BUFFER_TRACE(bh, "call ext3_journal_revoke"); -+ err = ext3_journal_revoke(handle, blocknr, bh); -+ if (err) -+ ext3_abort(inode->i_sb, __FUNCTION__, -+ "error %d when attempting revoke", err); -+ BUFFER_TRACE(bh, "exit"); -+ return err; -+} -+ -+/* -+ * Truncate transactions can be complex and absolutely huge. So we need to -+ * be able to restart the transaction at a conventient checkpoint to make -+ * sure we don't overflow the journal. -+ * -+ * start_transaction gets us a new handle for a truncate transaction, -+ * and extend_transaction tries to extend the existing one a bit. If -+ * extend fails, we need to propagate the failure up and restart the -+ * transaction in the top-level truncate loop. --sct -+ */ -+ -+static handle_t *start_transaction(struct inode *inode) -+{ -+ long needed; -+ handle_t *result; -+ -+ needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ -+ result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); -+ if (!IS_ERR(result)) -+ return result; -+ -+ ext3_std_error(inode->i_sb, PTR_ERR(result)); -+ return result; -+} -+ -+/* -+ * Try to extend this transaction for the purposes of truncation. -+ * -+ * Returns 0 if we managed to create more room. If we can't create more -+ * room, and the transaction must be restarted we return 1. 
-+ */ -+static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -+{ -+ long needed; -+ -+ if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) -+ return 0; -+ needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * Restart the transaction associated with *handle. This does a commit, -+ * so before we call here everything must be consistently dirtied against -+ * this transaction. -+ */ -+static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) -+{ -+ long needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ jbd_debug(2, "restarting handle %p\n", handle); -+ return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); -+} -+ -+/* -+ * Called at each iput() -+ */ -+void ext3_put_inode (struct inode * inode) -+{ -+ ext3_discard_prealloc (inode); -+} -+ -+/* -+ * Called at the last iput() if i_nlink is zero. -+ */ -+void ext3_delete_inode (struct inode * inode) -+{ -+ handle_t *handle; -+ -+ if (is_bad_inode(inode) || -+ inode->i_ino == EXT3_ACL_IDX_INO || -+ inode->i_ino == EXT3_ACL_DATA_INO) -+ goto no_delete; -+ -+ lock_kernel(); -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) { -+ /* If we're going to skip the normal cleanup, we still -+ * need to make sure that the in-core orphan linked list -+ * is properly cleaned up. */ -+ ext3_orphan_del(NULL, inode); -+ -+ ext3_std_error(inode->i_sb, PTR_ERR(handle)); -+ unlock_kernel(); -+ goto no_delete; -+ } -+ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ inode->i_size = 0; -+ if (inode->i_blocks) -+ ext3_truncate(inode); -+ /* -+ * Kill off the orphan record which ext3_truncate created. -+ * AKPM: I think this can be inside the above `if'. 
-+ * Note that ext3_orphan_del() has to be able to cope with the -+ * deletion of a non-existent orphan - this is because we don't -+ * know if ext3_truncate() actually created an orphan record. -+ * (Well, we could do this if we need to, but heck - it works) -+ */ -+ ext3_orphan_del(handle, inode); -+ inode->u.ext3_i.i_dtime = CURRENT_TIME; -+ -+ /* -+ * One subtle ordering requirement: if anything has gone wrong -+ * (transaction abort, IO errors, whatever), then we can still -+ * do these next steps (the fs will already have been marked as -+ * having errors), but we can't free the inode if the mark_dirty -+ * fails. -+ */ -+ if (ext3_mark_inode_dirty(handle, inode)) -+ /* If that failed, just do the required in-core inode clear. */ -+ clear_inode(inode); -+ else -+ ext3_free_inode(handle, inode); -+ ext3_journal_stop(handle, inode); -+ unlock_kernel(); -+ return; -+no_delete: -+ clear_inode(inode); /* We must guarantee clearing of inode... */ -+} -+ -+void ext3_discard_prealloc (struct inode * inode) -+{ -+#ifdef EXT3_PREALLOCATE -+ lock_kernel(); -+ /* Writer: ->i_prealloc* */ -+ if (inode->u.ext3_i.i_prealloc_count) { -+ unsigned short total = inode->u.ext3_i.i_prealloc_count; -+ unsigned long block = inode->u.ext3_i.i_prealloc_block; -+ inode->u.ext3_i.i_prealloc_count = 0; -+ inode->u.ext3_i.i_prealloc_block = 0; -+ /* Writer: end */ -+ ext3_free_blocks (inode, block, total); -+ } -+ unlock_kernel(); -+#endif -+} -+ -+static int ext3_alloc_block (handle_t *handle, -+ struct inode * inode, unsigned long goal, int *err) -+{ -+#ifdef EXT3FS_DEBUG -+ static unsigned long alloc_hits = 0, alloc_attempts = 0; -+#endif -+ unsigned long result; -+ -+#ifdef EXT3_PREALLOCATE -+ /* Writer: ->i_prealloc* */ -+ if (inode->u.ext3_i.i_prealloc_count && -+ (goal == inode->u.ext3_i.i_prealloc_block || -+ goal + 1 == inode->u.ext3_i.i_prealloc_block)) -+ { -+ result = inode->u.ext3_i.i_prealloc_block++; -+ inode->u.ext3_i.i_prealloc_count--; -+ /* Writer: end */ -+ ext3_debug 
("preallocation hit (%lu/%lu).\n", -+ ++alloc_hits, ++alloc_attempts); -+ } else { -+ ext3_discard_prealloc (inode); -+ ext3_debug ("preallocation miss (%lu/%lu).\n", -+ alloc_hits, ++alloc_attempts); -+ if (S_ISREG(inode->i_mode)) -+ result = ext3_new_block (inode, goal, -+ &inode->u.ext3_i.i_prealloc_count, -+ &inode->u.ext3_i.i_prealloc_block, err); -+ else -+ result = ext3_new_block (inode, goal, 0, 0, err); -+ /* -+ * AKPM: this is somewhat sticky. I'm not surprised it was -+ * disabled in 2.2's ext3. Need to integrate b_committed_data -+ * guarding with preallocation, if indeed preallocation is -+ * effective. -+ */ -+ } -+#else -+ result = ext3_new_block (handle, inode, goal, 0, 0, err); -+#endif -+ return result; -+} -+ -+ -+typedef struct { -+ u32 *p; -+ u32 key; -+ struct buffer_head *bh; -+} Indirect; -+ -+static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) -+{ -+ p->key = *(p->p = v); -+ p->bh = bh; -+} -+ -+static inline int verify_chain(Indirect *from, Indirect *to) -+{ -+ while (from <= to && from->key == *from->p) -+ from++; -+ return (from > to); -+} -+ -+/** -+ * ext3_block_to_path - parse the block number into array of offsets -+ * @inode: inode in question (we are only interested in its superblock) -+ * @i_block: block number to be parsed -+ * @offsets: array to store the offsets in -+ * -+ * To store the locations of file's data ext3 uses a data structure common -+ * for UNIX filesystems - tree of pointers anchored in the inode, with -+ * data blocks at leaves and indirect blocks in intermediate nodes. -+ * This function translates the block number into path in that tree - -+ * return value is the path length and @offsets[n] is the offset of -+ * pointer to (n+1)th node in the nth one. If @block is out of range -+ * (negative or too large) warning is printed and zero returned. -+ * -+ * Note: function doesn't find node addresses, so no IO is needed. 
All -+ * we need to know is the capacity of indirect blocks (taken from the -+ * inode->i_sb). -+ */ -+ -+/* -+ * Portability note: the last comparison (check that we fit into triple -+ * indirect block) is spelled differently, because otherwise on an -+ * architecture with 32-bit longs and 8Kb pages we might get into trouble -+ * if our filesystem had 8Kb blocks. We might use long long, but that would -+ * kill us on x86. Oh, well, at least the sign propagation does not matter - -+ * i_block would have to be negative in the very beginning, so we would not -+ * get there at all. -+ */ -+ -+static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4]) -+{ -+ int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); -+ const long direct_blocks = EXT3_NDIR_BLOCKS, -+ indirect_blocks = ptrs, -+ double_blocks = (1 << (ptrs_bits * 2)); -+ int n = 0; -+ -+ if (i_block < 0) { -+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); -+ } else if (i_block < direct_blocks) { -+ offsets[n++] = i_block; -+ } else if ( (i_block -= direct_blocks) < indirect_blocks) { -+ offsets[n++] = EXT3_IND_BLOCK; -+ offsets[n++] = i_block; -+ } else if ((i_block -= indirect_blocks) < double_blocks) { -+ offsets[n++] = EXT3_DIND_BLOCK; -+ offsets[n++] = i_block >> ptrs_bits; -+ offsets[n++] = i_block & (ptrs - 1); -+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { -+ offsets[n++] = EXT3_TIND_BLOCK; -+ offsets[n++] = i_block >> (ptrs_bits * 2); -+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); -+ offsets[n++] = i_block & (ptrs - 1); -+ } else { -+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); -+ } -+ return n; -+} -+ -+/** -+ * ext3_get_branch - read the chain of indirect blocks leading to data -+ * @inode: inode in question -+ * @depth: depth of the chain (1 - direct pointer, etc.) 
-+ * @offsets: offsets of pointers in inode/indirect blocks -+ * @chain: place to store the result -+ * @err: here we store the error value -+ * -+ * Function fills the array of triples and returns %NULL -+ * if everything went OK or the pointer to the last filled triple -+ * (incomplete one) otherwise. Upon the return chain[i].key contains -+ * the number of (i+1)-th block in the chain (as it is stored in memory, -+ * i.e. little-endian 32-bit), chain[i].p contains the address of that -+ * number (it points into struct inode for i==0 and into the bh->b_data -+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect -+ * block for i>0 and NULL for i==0. In other words, it holds the block -+ * numbers of the chain, addresses they were taken from (and where we can -+ * verify that chain did not change) and buffer_heads hosting these -+ * numbers. -+ * -+ * Function stops when it stumbles upon zero pointer (absent block) -+ * (pointer to last triple returned, *@err == 0) -+ * or when it gets an IO error reading an indirect block -+ * (ditto, *@err == -EIO) -+ * or when it notices that chain had been changed while it was reading -+ * (ditto, *@err == -EAGAIN) -+ * or when it reads all @depth-1 indirect blocks successfully and finds -+ * the whole chain, all way to the data (returns %NULL, *err == 0). 
-+ */ -+static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, -+ Indirect chain[4], int *err) -+{ -+ struct super_block *sb = inode->i_sb; -+ Indirect *p = chain; -+ struct buffer_head *bh; -+ -+ *err = 0; -+ /* i_data is not going away, no lock needed */ -+ add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets); -+ if (!p->key) -+ goto no_block; -+ while (--depth) { -+ bh = sb_bread(sb, le32_to_cpu(p->key)); -+ if (!bh) -+ goto failure; -+ /* Reader: pointers */ -+ if (!verify_chain(chain, p)) -+ goto changed; -+ add_chain(++p, bh, (u32*)bh->b_data + *++offsets); -+ /* Reader: end */ -+ if (!p->key) -+ goto no_block; -+ } -+ return NULL; -+ -+changed: -+ *err = -EAGAIN; -+ goto no_block; -+failure: -+ *err = -EIO; -+no_block: -+ return p; -+} -+ -+/** -+ * ext3_find_near - find a place for allocation with sufficient locality -+ * @inode: owner -+ * @ind: descriptor of indirect block. -+ * -+ * This function returns the prefered place for block allocation. -+ * It is used when heuristic for sequential allocation fails. -+ * Rules are: -+ * + if there is a block to the left of our position - allocate near it. -+ * + if pointer will live in indirect block - allocate near that block. -+ * + if pointer will live in inode - allocate in the same -+ * cylinder group. -+ * Caller must make sure that @ind is valid and will stay that way. -+ */ -+ -+static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) -+{ -+ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; -+ u32 *p; -+ -+ /* Try to find previous block */ -+ for (p = ind->p - 1; p >= start; p--) -+ if (*p) -+ return le32_to_cpu(*p); -+ -+ /* No such thing, so let's try location of indirect block */ -+ if (ind->bh) -+ return ind->bh->b_blocknr; -+ -+ /* -+ * It is going to be refered from inode itself? OK, just put it into -+ * the same cylinder group then. 
-+ */ -+ return (inode->u.ext3_i.i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); -+} -+ -+/** -+ * ext3_find_goal - find a prefered place for allocation. -+ * @inode: owner -+ * @block: block we want -+ * @chain: chain of indirect blocks -+ * @partial: pointer to the last triple within a chain -+ * @goal: place to store the result. -+ * -+ * Normally this function find the prefered place for block allocation, -+ * stores it in *@goal and returns zero. If the branch had been changed -+ * under us we return -EAGAIN. -+ */ -+ -+static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], -+ Indirect *partial, unsigned long *goal) -+{ -+ /* Writer: ->i_next_alloc* */ -+ if (block == inode->u.ext3_i.i_next_alloc_block + 1) { -+ inode->u.ext3_i.i_next_alloc_block++; -+ inode->u.ext3_i.i_next_alloc_goal++; -+ } -+#ifdef SEARCH_FROM_ZERO -+ inode->u.ext3_i.i_next_alloc_block = 0; -+ inode->u.ext3_i.i_next_alloc_goal = 0; -+#endif -+ /* Writer: end */ -+ /* Reader: pointers, ->i_next_alloc* */ -+ if (verify_chain(chain, partial)) { -+ /* -+ * try the heuristic for sequential allocation, -+ * failing that at least try to get decent locality. -+ */ -+ if (block == inode->u.ext3_i.i_next_alloc_block) -+ *goal = inode->u.ext3_i.i_next_alloc_goal; -+ if (!*goal) -+ *goal = ext3_find_near(inode, partial); -+#ifdef SEARCH_FROM_ZERO -+ *goal = 0; -+#endif -+ return 0; -+ } -+ /* Reader: end */ -+ return -EAGAIN; -+} -+ -+/** -+ * ext3_alloc_branch - allocate and set up a chain of blocks. -+ * @inode: owner -+ * @num: depth of the chain (number of blocks to allocate) -+ * @offsets: offsets (in the blocks) to store the pointers to next. -+ * @branch: place to store the chain in. -+ * -+ * This function allocates @num blocks, zeroes out all but the last one, -+ * links them into chain and (if we are synchronous) writes them to disk. 
-+ * In other words, it prepares a branch that can be spliced onto the -+ * inode. It stores the information about that chain in the branch[], in -+ * the same format as ext3_get_branch() would do. We are calling it after -+ * we had read the existing part of chain and partial points to the last -+ * triple of that (one with zero ->key). Upon the exit we have the same -+ * picture as after the successful ext3_get_block(), excpet that in one -+ * place chain is disconnected - *branch->p is still zero (we did not -+ * set the last link), but branch->key contains the number that should -+ * be placed into *branch->p to fill that gap. -+ * -+ * If allocation fails we free all blocks we've allocated (and forget -+ * their buffer_heads) and return the error value the from failed -+ * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain -+ * as described above and return 0. -+ */ -+ -+static int ext3_alloc_branch(handle_t *handle, struct inode *inode, -+ int num, -+ unsigned long goal, -+ int *offsets, -+ Indirect *branch) -+{ -+ int blocksize = inode->i_sb->s_blocksize; -+ int n = 0, keys = 0; -+ int err = 0; -+ int i; -+ int parent = ext3_alloc_block(handle, inode, goal, &err); -+ -+ branch[0].key = cpu_to_le32(parent); -+ if (parent) { -+ for (n = 1; n < num; n++) { -+ struct buffer_head *bh; -+ /* Allocate the next block */ -+ int nr = ext3_alloc_block(handle, inode, parent, &err); -+ if (!nr) -+ break; -+ branch[n].key = cpu_to_le32(nr); -+ keys = n+1; -+ -+ /* -+ * Get buffer_head for parent block, zero it out -+ * and set the pointer to new one, then send -+ * parent to disk. 
-+ */ -+ bh = sb_getblk(inode->i_sb, parent); -+ branch[n].bh = bh; -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ err = ext3_journal_get_create_access(handle, bh); -+ if (err) { -+ unlock_buffer(bh); -+ brelse(bh); -+ break; -+ } -+ -+ memset(bh->b_data, 0, blocksize); -+ branch[n].p = (u32*) bh->b_data + offsets[n]; -+ *branch[n].p = branch[n].key; -+ BUFFER_TRACE(bh, "marking uptodate"); -+ mark_buffer_uptodate(bh, 1); -+ unlock_buffer(bh); -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ break; -+ -+ parent = nr; -+ } -+ } -+ if (n == num) -+ return 0; -+ -+ /* Allocation failed, free what we already allocated */ -+ for (i = 1; i < keys; i++) { -+ BUFFER_TRACE(branch[i].bh, "call journal_forget"); -+ ext3_journal_forget(handle, branch[i].bh); -+ } -+ for (i = 0; i < keys; i++) -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ return err; -+} -+ -+/** -+ * ext3_splice_branch - splice the allocated branch onto inode. -+ * @inode: owner -+ * @block: (logical) number of block we are adding -+ * @chain: chain of indirect blocks (with a missing link - see -+ * ext3_alloc_branch) -+ * @where: location of missing link -+ * @num: number of blocks we are adding -+ * -+ * This function verifies that chain (up to the missing link) had not -+ * changed, fills the missing link and does all housekeeping needed in -+ * inode (->i_blocks, etc.). In case of success we end up with the full -+ * chain to new block and return 0. Otherwise (== chain had been changed) -+ * we free the new blocks (forgetting their buffer_heads, indeed) and -+ * return -EAGAIN. 
-+ */ -+ -+static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, -+ Indirect chain[4], Indirect *where, int num) -+{ -+ int i; -+ int err = 0; -+ -+ /* -+ * If we're splicing into a [td]indirect block (as opposed to the -+ * inode) then we need to get write access to the [td]indirect block -+ * before the splice. -+ */ -+ if (where->bh) { -+ BUFFER_TRACE(where->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, where->bh); -+ if (err) -+ goto err_out; -+ } -+ /* Verify that place we are splicing to is still there and vacant */ -+ -+ /* Writer: pointers, ->i_next_alloc* */ -+ if (!verify_chain(chain, where-1) || *where->p) -+ /* Writer: end */ -+ goto changed; -+ -+ /* That's it */ -+ -+ *where->p = where->key; -+ inode->u.ext3_i.i_next_alloc_block = block; -+ inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); -+#ifdef SEARCH_FROM_ZERO -+ inode->u.ext3_i.i_next_alloc_block = 0; -+ inode->u.ext3_i.i_next_alloc_goal = 0; -+#endif -+ /* Writer: end */ -+ -+ /* We are done with atomic stuff, now do the rest of housekeeping */ -+ -+ inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ /* had we spliced it onto indirect block? */ -+ if (where->bh) { -+ /* -+ * akpm: If we spliced it onto an indirect block, we haven't -+ * altered the inode. Note however that if it is being spliced -+ * onto an indirect block at the very end of the file (the -+ * file is growing) then we *will* alter the inode to reflect -+ * the new i_size. But that is not done here - it is done in -+ * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. -+ */ -+ jbd_debug(5, "splicing indirect only\n"); -+ BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, where->bh); -+ if (err) -+ goto err_out; -+ } else { -+ /* -+ * OK, we spliced it into the inode itself on a direct block. -+ * Inode was dirtied above. 
-+ */ -+ jbd_debug(5, "splicing direct\n"); -+ } -+ return err; -+ -+changed: -+ /* -+ * AKPM: if where[i].bh isn't part of the current updating -+ * transaction then we explode nastily. Test this code path. -+ */ -+ jbd_debug(1, "the chain changed: try again\n"); -+ err = -EAGAIN; -+ -+err_out: -+ for (i = 1; i < num; i++) { -+ BUFFER_TRACE(where[i].bh, "call journal_forget"); -+ ext3_journal_forget(handle, where[i].bh); -+ } -+ /* For the normal collision cleanup case, we free up the blocks. -+ * On genuine filesystem errors we don't even think about doing -+ * that. */ -+ if (err == -EAGAIN) -+ for (i = 0; i < num; i++) -+ ext3_free_blocks(handle, inode, -+ le32_to_cpu(where[i].key), 1); -+ return err; -+} -+ -+/* -+ * Allocation strategy is simple: if we have to allocate something, we will -+ * have to go the whole way to leaf. So let's do it before attaching anything -+ * to tree, set linkage between the newborn blocks, write them if sync is -+ * required, recheck the path, free and repeat if check fails, otherwise -+ * set the last missing link (that will protect us from any truncate-generated -+ * removals - all blocks on the path are immune now) and possibly force the -+ * write on the parent block. -+ * That has a nice additional property: no special recovery from the failed -+ * allocations is needed - we simply release blocks and do not touch anything -+ * reachable from inode. -+ * -+ * akpm: `handle' can be NULL if create == 0. -+ * -+ * The BKL may not be held on entry here. Be sure to take it early. 
-+ */ -+ -+static int ext3_get_block_handle(handle_t *handle, struct inode *inode, -+ long iblock, -+ struct buffer_head *bh_result, int create) -+{ -+ int err = -EIO; -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ unsigned long goal; -+ int left; -+ int depth = ext3_block_to_path(inode, iblock, offsets); -+ loff_t new_size; -+ -+ J_ASSERT(handle != NULL || create == 0); -+ -+ if (depth == 0) -+ goto out; -+ -+ lock_kernel(); -+reread: -+ partial = ext3_get_branch(inode, depth, offsets, chain, &err); -+ -+ /* Simplest case - block found, no allocation needed */ -+ if (!partial) { -+ bh_result->b_state &= ~(1UL << BH_New); -+got_it: -+ bh_result->b_dev = inode->i_dev; -+ bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); -+ bh_result->b_state |= (1UL << BH_Mapped); -+ /* Clean up and exit */ -+ partial = chain+depth-1; /* the whole chain */ -+ goto cleanup; -+ } -+ -+ /* Next simple case - plain lookup or failed read of indirect block */ -+ if (!create || err == -EIO) { -+cleanup: -+ while (partial > chain) { -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse(partial->bh); -+ partial--; -+ } -+ BUFFER_TRACE(bh_result, "returned"); -+ unlock_kernel(); -+out: -+ return err; -+ } -+ -+ /* -+ * Indirect block might be removed by truncate while we were -+ * reading it. Handling of that case (forget what we've got and -+ * reread) is taken out of the main path. 
-+ */ -+ if (err == -EAGAIN) -+ goto changed; -+ -+ if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) -+ goto changed; -+ -+ left = (chain + depth) - partial; -+ -+ /* -+ * Block out ext3_truncate while we alter the tree -+ */ -+ down_read(&inode->u.ext3_i.truncate_sem); -+ err = ext3_alloc_branch(handle, inode, left, goal, -+ offsets+(partial-chain), partial); -+ -+ /* The ext3_splice_branch call will free and forget any buffers -+ * on the new chain if there is a failure, but that risks using -+ * up transaction credits, especially for bitmaps where the -+ * credits cannot be returned. Can we handle this somehow? We -+ * may need to return -EAGAIN upwards in the worst case. --sct */ -+ if (!err) -+ err = ext3_splice_branch(handle, inode, iblock, chain, -+ partial, left); -+ up_read(&inode->u.ext3_i.truncate_sem); -+ if (err == -EAGAIN) -+ goto changed; -+ if (err) -+ goto cleanup; -+ -+ new_size = inode->i_size; -+ /* -+ * This is not racy against ext3_truncate's modification of i_disksize -+ * because VM/VFS ensures that the file cannot be extended while -+ * truncate is in progress. It is racy between multiple parallel -+ * instances of get_block, but we have the BKL. -+ */ -+ if (new_size > inode->u.ext3_i.i_disksize) -+ inode->u.ext3_i.i_disksize = new_size; -+ -+ bh_result->b_state |= (1UL << BH_New); -+ goto got_it; -+ -+changed: -+ while (partial > chain) { -+ jbd_debug(1, "buffer chain changed, retrying\n"); -+ BUFFER_TRACE(partial->bh, "brelsing"); -+ brelse(partial->bh); -+ partial--; -+ } -+ goto reread; -+} -+ -+/* -+ * The BKL is not held on entry here. 
-+ */ -+static int ext3_get_block(struct inode *inode, long iblock, -+ struct buffer_head *bh_result, int create) -+{ -+ handle_t *handle = 0; -+ int ret; -+ -+ if (create) { -+ handle = ext3_journal_current_handle(); -+ J_ASSERT(handle != 0); -+ } -+ ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); -+ return ret; -+} -+ -+/* -+ * `handle' can be NULL if create is zero -+ */ -+struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, -+ long block, int create, int * errp) -+{ -+ struct buffer_head dummy; -+ int fatal = 0, err; -+ -+ J_ASSERT(handle != NULL || create == 0); -+ -+ dummy.b_state = 0; -+ dummy.b_blocknr = -1000; -+ buffer_trace_init(&dummy.b_history); -+ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); -+ if (!*errp && buffer_mapped(&dummy)) { -+ struct buffer_head *bh; -+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -+ if (buffer_new(&dummy)) { -+ J_ASSERT(create != 0); -+ J_ASSERT(handle != 0); -+ -+ /* Now that we do not always journal data, we -+ should keep in mind whether this should -+ always journal the new buffer as metadata. -+ For now, regular file writes use -+ ext3_get_block instead, so it's not a -+ problem. 
*/ -+ lock_kernel(); -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ fatal = ext3_journal_get_create_access(handle, bh); -+ if (!fatal) { -+ memset(bh->b_data, 0, -+ inode->i_sb->s_blocksize); -+ mark_buffer_uptodate(bh, 1); -+ } -+ unlock_buffer(bh); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) fatal = err; -+ unlock_kernel(); -+ } else { -+ BUFFER_TRACE(bh, "not a new buffer"); -+ } -+ if (fatal) { -+ *errp = fatal; -+ brelse(bh); -+ bh = NULL; -+ } -+ return bh; -+ } -+ return NULL; -+} -+ -+struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, -+ int block, int create, int *err) -+{ -+ struct buffer_head * bh; -+ int prev_blocks; -+ -+ prev_blocks = inode->i_blocks; -+ -+ bh = ext3_getblk (handle, inode, block, create, err); -+ if (!bh) -+ return bh; -+#ifdef EXT3_PREALLOCATE -+ /* -+ * If the inode has grown, and this is a directory, then use a few -+ * more of the preallocated blocks to keep directory fragmentation -+ * down. The preallocated blocks are guaranteed to be contiguous. 
-+ */ -+ if (create && -+ S_ISDIR(inode->i_mode) && -+ inode->i_blocks > prev_blocks && -+ EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { -+ int i; -+ struct buffer_head *tmp_bh; -+ -+ for (i = 1; -+ inode->u.ext3_i.i_prealloc_count && -+ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; -+ i++) { -+ /* -+ * ext3_getblk will zero out the contents of the -+ * directory for us -+ */ -+ tmp_bh = ext3_getblk(handle, inode, -+ block+i, create, err); -+ if (!tmp_bh) { -+ brelse (bh); -+ return 0; -+ } -+ brelse (tmp_bh); -+ } -+ } -+#endif -+ if (buffer_uptodate(bh)) -+ return bh; -+ ll_rw_block (READ, 1, &bh); -+ wait_on_buffer (bh); -+ if (buffer_uptodate(bh)) -+ return bh; -+ brelse (bh); -+ *err = -EIO; -+ return NULL; -+} -+ -+static int walk_page_buffers( handle_t *handle, -+ struct buffer_head *head, -+ unsigned from, -+ unsigned to, -+ int *partial, -+ int (*fn)( handle_t *handle, -+ struct buffer_head *bh)) -+{ -+ struct buffer_head *bh; -+ unsigned block_start, block_end; -+ unsigned blocksize = head->b_size; -+ int err, ret = 0; -+ -+ for ( bh = head, block_start = 0; -+ ret == 0 && (bh != head || !block_start); -+ block_start = block_end, bh = bh->b_this_page) -+ { -+ block_end = block_start + blocksize; -+ if (block_end <= from || block_start >= to) { -+ if (partial && !buffer_uptodate(bh)) -+ *partial = 1; -+ continue; -+ } -+ err = (*fn)(handle, bh); -+ if (!ret) -+ ret = err; -+ } -+ return ret; -+} -+ -+/* -+ * To preserve ordering, it is essential that the hole instantiation and -+ * the data write be encapsulated in a single transaction. We cannot -+ * close off a transaction and start a new one between the ext3_get_block() -+ * and the commit_write(). So doing the journal_start at the start of -+ * prepare_write() is the right place. -+ * -+ * Also, this function can nest inside ext3_writepage() -> -+ * block_write_full_page(). 
In that case, we *know* that ext3_writepage() -+ * has generated enough buffer credits to do the whole page. So we won't -+ * block on the journal in that case, which is good, because the caller may -+ * be PF_MEMALLOC. -+ * -+ * By accident, ext3 can be reentered when a transaction is open via -+ * quota file writes. If we were to commit the transaction while thus -+ * reentered, there can be a deadlock - we would be holding a quota -+ * lock, and the commit would never complete if another thread had a -+ * transaction open and was blocking on the quota lock - a ranking -+ * violation. -+ * -+ * So what we do is to rely on the fact that journal_stop/journal_start -+ * will _not_ run commit under these circumstances because handle->h_ref -+ * is elevated. We'll still have enough credits for the tiny quotafile -+ * write. -+ */ -+ -+static int do_journal_get_write_access(handle_t *handle, -+ struct buffer_head *bh) -+{ -+ return ext3_journal_get_write_access(handle, bh); -+} -+ -+static int ext3_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ struct inode *inode = page->mapping->host; -+ int ret, needed_blocks = ext3_writepage_trans_blocks(inode); -+ handle_t *handle; -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out; -+ } -+ unlock_kernel(); -+ ret = block_prepare_write(page, from, to, ext3_get_block); -+ lock_kernel(); -+ if (ret != 0) -+ goto prepare_write_failed; -+ -+ if (ext3_should_journal_data(inode)) { -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, NULL, do_journal_get_write_access); -+ if (ret) { -+ /* -+ * We're going to fail this prepare_write(), -+ * so commit_write() will not be called. -+ * We need to undo block_prepare_write()'s kmap(). -+ * AKPM: Do we need to clear PageUptodate? I don't -+ * think so. 
-+ */ -+ kunmap(page); -+ } -+ } -+prepare_write_failed: -+ if (ret) -+ ext3_journal_stop(handle, inode); -+out: -+ unlock_kernel(); -+ return ret; -+} -+ -+static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) -+{ -+ return ext3_journal_dirty_data(handle, bh, 0); -+} -+ -+/* -+ * For ext3_writepage(). We also brelse() the buffer to account for -+ * the bget() which ext3_writepage() performs. -+ */ -+static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) -+{ -+ int ret = ext3_journal_dirty_data(handle, bh, 1); -+ __brelse(bh); -+ return ret; -+} -+ -+/* For commit_write() in data=journal mode */ -+static int commit_write_fn(handle_t *handle, struct buffer_head *bh) -+{ -+ set_bit(BH_Uptodate, &bh->b_state); -+ return ext3_journal_dirty_metadata(handle, bh); -+} -+ -+/* -+ * We need to pick up the new inode size which generic_commit_write gave us -+ * `file' can be NULL - eg, when called from block_symlink(). -+ * -+ * ext3 inode->i_dirty_buffers policy: If we're journalling data we -+ * definitely don't want them to appear on the inode at all - instead -+ * we need to manage them at the JBD layer and we need to intercept -+ * the relevant sync operations and translate them into journal operations. -+ * -+ * If we're not journalling data then we can just leave the buffers -+ * on ->i_dirty_buffers. If someone writes them out for us then thanks. -+ * Otherwise we'll do it in commit, if we're using ordered data. 
-+ */ -+ -+static int ext3_commit_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ handle_t *handle = ext3_journal_current_handle(); -+ struct inode *inode = page->mapping->host; -+ int ret = 0, ret2; -+ -+ lock_kernel(); -+ if (ext3_should_journal_data(inode)) { -+ /* -+ * Here we duplicate the generic_commit_write() functionality -+ */ -+ int partial = 0; -+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; -+ -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, &partial, commit_write_fn); -+ if (!partial) -+ SetPageUptodate(page); -+ kunmap(page); -+ if (pos > inode->i_size) -+ inode->i_size = pos; -+ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; -+ } else { -+ if (ext3_should_order_data(inode)) { -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, NULL, journal_dirty_sync_data); -+ } -+ /* Be careful here if generic_commit_write becomes a -+ * required invocation after block_prepare_write. */ -+ if (ret == 0) { -+ ret = generic_commit_write(file, page, from, to); -+ } else { -+ /* -+ * block_prepare_write() was called, but we're not -+ * going to call generic_commit_write(). So we -+ * need to perform generic_commit_write()'s kunmap -+ * by hand. -+ */ -+ kunmap(page); -+ } -+ } -+ if (inode->i_size > inode->u.ext3_i.i_disksize) { -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ ret2 = ext3_mark_inode_dirty(handle, inode); -+ if (!ret) -+ ret = ret2; -+ } -+ ret2 = ext3_journal_stop(handle, inode); -+ unlock_kernel(); -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+ -+/* -+ * bmap() is special. It gets used by applications such as lilo and by -+ * the swapper to find the on-disk block of a specific piece of data. -+ * -+ * Naturally, this is dangerous if the block concerned is still in the -+ * journal. 
If somebody makes a swapfile on an ext3 data-journaling -+ * filesystem and enables swap, then they may get a nasty shock when the -+ * data getting swapped to that swapfile suddenly gets overwritten by -+ * the original zero's written out previously to the journal and -+ * awaiting writeback in the kernel's buffer cache. -+ * -+ * So, if we see any bmap calls here on a modified, data-journaled file, -+ * take extra steps to flush any blocks which might be in the cache. -+ */ -+static int ext3_bmap(struct address_space *mapping, long block) -+{ -+ struct inode *inode = mapping->host; -+ journal_t *journal; -+ int err; -+ -+ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { -+ /* -+ * This is a REALLY heavyweight approach, but the use of -+ * bmap on dirty files is expected to be extremely rare: -+ * only if we run lilo or swapon on a freshly made file -+ * do we expect this to happen. -+ * -+ * (bmap requires CAP_SYS_RAWIO so this does not -+ * represent an unprivileged user DOS attack --- we'd be -+ * in trouble if mortal users could trigger this path at -+ * will.) -+ * -+ * NB. EXT3_STATE_JDATA is not set on files other than -+ * regular files. If somebody wants to bmap a directory -+ * or symlink and gets confused because the buffer -+ * hasn't yet been flushed to disk, they deserve -+ * everything they get. -+ */ -+ -+ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; -+ journal = EXT3_JOURNAL(inode); -+ journal_lock_updates(journal); -+ err = journal_flush(journal); -+ journal_unlock_updates(journal); -+ -+ if (err) -+ return 0; -+ } -+ -+ return generic_block_bmap(mapping,block,ext3_get_block); -+} -+ -+static int bget_one(handle_t *handle, struct buffer_head *bh) -+{ -+ atomic_inc(&bh->b_count); -+ return 0; -+} -+ -+/* -+ * Note that we always start a transaction even if we're not journalling -+ * data. 
This is to preserve ordering: any hole instantiation within -+ * __block_write_full_page -> ext3_get_block() should be journalled -+ * along with the data so we don't crash and then get metadata which -+ * refers to old data. -+ * -+ * In all journalling modes block_write_full_page() will start the I/O. -+ * -+ * Problem: -+ * -+ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> -+ * ext3_writepage() -+ * -+ * Similar for: -+ * -+ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... -+ * -+ * Same applies to ext3_get_block(). We will deadlock on various things like -+ * lock_journal and i_truncate_sem. -+ * -+ * Setting PF_MEMALLOC here doesn't work - too many internal memory -+ * allocations fail. -+ * -+ * 16May01: If we're reentered then journal_current_handle() will be -+ * non-zero. We simply *return*. -+ * -+ * 1 July 2001: @@@ FIXME: -+ * In journalled data mode, a data buffer may be metadata against the -+ * current transaction. But the same file is part of a shared mapping -+ * and someone does a writepage() on it. -+ * -+ * We will move the buffer onto the async_data list, but *after* it has -+ * been dirtied. So there's a small window where we have dirty data on -+ * BJ_Metadata. -+ * -+ * Note that this only applies to the last partial page in the file. The -+ * bit which block_write_full_page() uses prepare/commit for. (That's -+ * broken code anyway: it's wrong for msync()). -+ * -+ * It's a rare case: affects the final partial page, for journalled data -+ * where the file is subject to bith write() and writepage() in the same -+ * transction. To fix it we'll need a custom block_write_full_page(). -+ * We'll probably need that anyway for journalling writepage() output. -+ * -+ * We don't honour synchronous mounts for writepage(). That would be -+ * disastrous. Any write() or metadata operation will sync the fs for -+ * us. 
-+ */ -+static int ext3_writepage(struct page *page) -+{ -+ struct inode *inode = page->mapping->host; -+ struct buffer_head *page_buffers; -+ handle_t *handle = NULL; -+ int ret = 0, err; -+ int needed; -+ int order_data; -+ -+ J_ASSERT(PageLocked(page)); -+ -+ /* -+ * We give up here if we're reentered, because it might be -+ * for a different filesystem. One *could* look for a -+ * nested transaction opportunity. -+ */ -+ lock_kernel(); -+ if (ext3_journal_current_handle()) -+ goto out_fail; -+ -+ needed = ext3_writepage_trans_blocks(inode); -+ if (current->flags & PF_MEMALLOC) -+ handle = ext3_journal_try_start(inode, needed); -+ else -+ handle = ext3_journal_start(inode, needed); -+ -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out_fail; -+ } -+ -+ order_data = ext3_should_order_data(inode) || -+ ext3_should_journal_data(inode); -+ -+ unlock_kernel(); -+ -+ page_buffers = NULL; /* Purely to prevent compiler warning */ -+ -+ /* bget() all the buffers */ -+ if (order_data) { -+ if (!page->buffers) -+ create_empty_buffers(page, -+ inode->i_dev, inode->i_sb->s_blocksize); -+ page_buffers = page->buffers; -+ walk_page_buffers(handle, page_buffers, 0, -+ PAGE_CACHE_SIZE, NULL, bget_one); -+ } -+ -+ ret = block_write_full_page(page, ext3_get_block); -+ -+ /* -+ * The page can become unlocked at any point now, and -+ * truncate can then come in and change things. So we -+ * can't touch *page from now on. But *page_buffers is -+ * safe due to elevated refcount. 
-+ */ -+ -+ handle = ext3_journal_current_handle(); -+ lock_kernel(); -+ -+ /* And attach them to the current transaction */ -+ if (order_data) { -+ err = walk_page_buffers(handle, page_buffers, -+ 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); -+ if (!ret) -+ ret = err; -+ } -+ -+ err = ext3_journal_stop(handle, inode); -+ if (!ret) -+ ret = err; -+ unlock_kernel(); -+ return ret; -+ -+out_fail: -+ -+ unlock_kernel(); -+ SetPageDirty(page); -+ UnlockPage(page); -+ return ret; -+} -+ -+static int ext3_readpage(struct file *file, struct page *page) -+{ -+ return block_read_full_page(page,ext3_get_block); -+} -+ -+ -+static int ext3_flushpage(struct page *page, unsigned long offset) -+{ -+ journal_t *journal = EXT3_JOURNAL(page->mapping->host); -+ return journal_flushpage(journal, page, offset); -+} -+ -+static int ext3_releasepage(struct page *page, int wait) -+{ -+ journal_t *journal = EXT3_JOURNAL(page->mapping->host); -+ return journal_try_to_free_buffers(journal, page, wait); -+} -+ -+ -+struct address_space_operations ext3_aops = { -+ readpage: ext3_readpage, /* BKL not held. Don't need */ -+ writepage: ext3_writepage, /* BKL not held. We take it */ -+ sync_page: block_sync_page, -+ prepare_write: ext3_prepare_write, /* BKL not held. We take it */ -+ commit_write: ext3_commit_write, /* BKL not held. We take it */ -+ bmap: ext3_bmap, /* BKL held */ -+ flushpage: ext3_flushpage, /* BKL not held. Don't need */ -+ releasepage: ext3_releasepage, /* BKL not held. Don't need */ -+}; -+ -+/* -+ * ext3_block_truncate_page() zeroes out a mapping from file offset `from' -+ * up to the end of the block which corresponds to `from'. -+ * This required during truncate. We need to physically zero the tail end -+ * of that block so it doesn't yield old data if the file is later grown. 
-+ */ -+static int ext3_block_truncate_page(handle_t *handle, -+ struct address_space *mapping, loff_t from) -+{ -+ unsigned long index = from >> PAGE_CACHE_SHIFT; -+ unsigned offset = from & (PAGE_CACHE_SIZE-1); -+ unsigned blocksize, iblock, length, pos; -+ struct inode *inode = mapping->host; -+ struct page *page; -+ struct buffer_head *bh; -+ int err; -+ -+ blocksize = inode->i_sb->s_blocksize; -+ length = offset & (blocksize - 1); -+ -+ /* Block boundary? Nothing to do */ -+ if (!length) -+ return 0; -+ -+ length = blocksize - length; -+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -+ -+ page = grab_cache_page(mapping, index); -+ err = -ENOMEM; -+ if (!page) -+ goto out; -+ -+ if (!page->buffers) -+ create_empty_buffers(page, inode->i_dev, blocksize); -+ -+ /* Find the buffer that contains "offset" */ -+ bh = page->buffers; -+ pos = blocksize; -+ while (offset >= pos) { -+ bh = bh->b_this_page; -+ iblock++; -+ pos += blocksize; -+ } -+ -+ err = 0; -+ if (!buffer_mapped(bh)) { -+ /* Hole? Nothing to do */ -+ if (buffer_uptodate(bh)) -+ goto unlock; -+ ext3_get_block(inode, iblock, bh, 0); -+ /* Still unmapped? Nothing to do */ -+ if (!buffer_mapped(bh)) -+ goto unlock; -+ } -+ -+ /* Ok, it's mapped. Make sure it's up-to-date */ -+ if (Page_Uptodate(page)) -+ set_bit(BH_Uptodate, &bh->b_state); -+ -+ if (!buffer_uptodate(bh)) { -+ err = -EIO; -+ ll_rw_block(READ, 1, &bh); -+ wait_on_buffer(bh); -+ /* Uhhuh. Read error. Complain and punt. 
*/ -+ if (!buffer_uptodate(bh)) -+ goto unlock; -+ } -+ -+ if (ext3_should_journal_data(inode)) { -+ BUFFER_TRACE(bh, "get write access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto unlock; -+ } -+ -+ memset(kmap(page) + offset, 0, length); -+ flush_dcache_page(page); -+ kunmap(page); -+ -+ BUFFER_TRACE(bh, "zeroed end of block"); -+ -+ err = 0; -+ if (ext3_should_journal_data(inode)) { -+ err = ext3_journal_dirty_metadata(handle, bh); -+ } else { -+ if (ext3_should_order_data(inode)) -+ err = ext3_journal_dirty_data(handle, bh, 0); -+ __mark_buffer_dirty(bh); -+ } -+ -+unlock: -+ UnlockPage(page); -+ page_cache_release(page); -+out: -+ return err; -+} -+ -+/* -+ * Probably it should be a library function... search for first non-zero word -+ * or memcmp with zero_page, whatever is better for particular architecture. -+ * Linus? -+ */ -+static inline int all_zeroes(u32 *p, u32 *q) -+{ -+ while (p < q) -+ if (*p++) -+ return 0; -+ return 1; -+} -+ -+/** -+ * ext3_find_shared - find the indirect blocks for partial truncation. -+ * @inode: inode in question -+ * @depth: depth of the affected branch -+ * @offsets: offsets of pointers in that branch (see ext3_block_to_path) -+ * @chain: place to store the pointers to partial indirect blocks -+ * @top: place to the (detached) top of branch -+ * -+ * This is a helper function used by ext3_truncate(). -+ * -+ * When we do truncate() we may have to clean the ends of several -+ * indirect blocks but leave the blocks themselves alive. Block is -+ * partially truncated if some data below the new i_size is refered -+ * from it (and it is on the path to the first completely truncated -+ * data block, indeed). We have to free the top of that path along -+ * with everything to the right of the path. 
Since no allocation -+ * past the truncation point is possible until ext3_truncate() -+ * finishes, we may safely do the latter, but top of branch may -+ * require special attention - pageout below the truncation point -+ * might try to populate it. -+ * -+ * We atomically detach the top of branch from the tree, store the -+ * block number of its root in *@top, pointers to buffer_heads of -+ * partially truncated blocks - in @chain[].bh and pointers to -+ * their last elements that should not be removed - in -+ * @chain[].p. Return value is the pointer to last filled element -+ * of @chain. -+ * -+ * The work left to caller to do the actual freeing of subtrees: -+ * a) free the subtree starting from *@top -+ * b) free the subtrees whose roots are stored in -+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data) -+ * c) free the subtrees growing from the inode past the @chain[0]. -+ * (no partially truncated stuff there). */ -+ -+static Indirect *ext3_find_shared(struct inode *inode, -+ int depth, -+ int offsets[4], -+ Indirect chain[4], -+ u32 *top) -+{ -+ Indirect *partial, *p; -+ int k, err; -+ -+ *top = 0; -+ /* Make k index the deepest non-null offest + 1 */ -+ for (k = depth; k > 1 && !offsets[k-1]; k--) -+ ; -+ partial = ext3_get_branch(inode, k, offsets, chain, &err); -+ /* Writer: pointers */ -+ if (!partial) -+ partial = chain + k-1; -+ /* -+ * If the branch acquired continuation since we've looked at it - -+ * fine, it should all survive and (new) top doesn't belong to us. -+ */ -+ if (!partial->key && *partial->p) -+ /* Writer: end */ -+ goto no_top; -+ for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--) -+ ; -+ /* -+ * OK, we've found the last block that must survive. The rest of our -+ * branch should be detached before unlocking. However, if that rest -+ * of branch is all ours and does not grow immediately from the inode -+ * it's easier to cheat and just decrement partial->p. 
-+ */ -+ if (p == chain + k - 1 && p > chain) { -+ p->p--; -+ } else { -+ *top = *p->p; -+ /* Nope, don't do this in ext3. Must leave the tree intact */ -+#if 0 -+ *p->p = 0; -+#endif -+ } -+ /* Writer: end */ -+ -+ while(partial > p) -+ { -+ brelse(partial->bh); -+ partial--; -+ } -+no_top: -+ return partial; -+} -+ -+/* -+ * Zero a number of block pointers in either an inode or an indirect block. -+ * If we restart the transaction we must again get write access to the -+ * indirect block for further modification. -+ * -+ * We release `count' blocks on disk, but (last - first) may be greater -+ * than `count' because there can be holes in there. -+ */ -+static void -+ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, -+ unsigned long block_to_free, unsigned long count, -+ u32 *first, u32 *last) -+{ -+ u32 *p; -+ if (try_to_extend_transaction(handle, inode)) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ } -+ ext3_mark_inode_dirty(handle, inode); -+ ext3_journal_test_restart(handle, inode); -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ } -+ -+ /* -+ * Any buffers which are on the journal will be in memory. We find -+ * them on the hash table so journal_revoke() will run journal_forget() -+ * on them. We've already detached each block from the file, so -+ * bforget() in journal_forget() should be safe. -+ * -+ * AKPM: turn on bforget in journal_forget()!!! 
-+ */ -+ for (p = first; p < last; p++) { -+ u32 nr = le32_to_cpu(*p); -+ if (nr) { -+ struct buffer_head *bh; -+ -+ *p = 0; -+ bh = sb_get_hash_table(inode->i_sb, nr); -+ ext3_forget(handle, 0, inode, bh, nr); -+ } -+ } -+ -+ ext3_free_blocks(handle, inode, block_to_free, count); -+} -+ -+/** -+ * ext3_free_data - free a list of data blocks -+ * @handle: handle for this transaction -+ * @inode: inode we are dealing with -+ * @this_bh: indirect buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: points immediately past the end of array -+ * -+ * We are freeing all blocks refered from that array (numbers are stored as -+ * little-endian 32-bit) and updating @inode->i_blocks appropriately. -+ * -+ * We accumulate contiguous runs of blocks to free. Conveniently, if these -+ * blocks are contiguous then releasing them at one time will only affect one -+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't -+ * actually use a lot of journal space. -+ * -+ * @this_bh will be %NULL if @first and @last point into the inode's direct -+ * block pointers. -+ */ -+static void ext3_free_data(handle_t *handle, struct inode *inode, -+ struct buffer_head *this_bh, u32 *first, u32 *last) -+{ -+ unsigned long block_to_free = 0; /* Starting block # of a run */ -+ unsigned long count = 0; /* Number of blocks in the run */ -+ u32 *block_to_free_p = NULL; /* Pointer into inode/ind -+ corresponding to -+ block_to_free */ -+ unsigned long nr; /* Current block # */ -+ u32 *p; /* Pointer into inode/ind -+ for current block */ -+ int err; -+ -+ if (this_bh) { /* For indirect block */ -+ BUFFER_TRACE(this_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, this_bh); -+ /* Important: if we can't update the indirect pointers -+ * to the blocks, we can't free them. 
*/ -+ if (err) -+ return; -+ } -+ -+ for (p = first; p < last; p++) { -+ nr = le32_to_cpu(*p); -+ if (nr) { -+ /* accumulate blocks to free if they're contiguous */ -+ if (count == 0) { -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } else if (nr == block_to_free + count) { -+ count++; -+ } else { -+ ext3_clear_blocks(handle, inode, this_bh, -+ block_to_free, -+ count, block_to_free_p, p); -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } -+ } -+ } -+ -+ if (count > 0) -+ ext3_clear_blocks(handle, inode, this_bh, block_to_free, -+ count, block_to_free_p, p); -+ -+ if (this_bh) { -+ BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, this_bh); -+ } -+} -+ -+/** -+ * ext3_free_branches - free an array of branches -+ * @handle: JBD handle for this transaction -+ * @inode: inode we are dealing with -+ * @parent_bh: the buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: pointer immediately past the end of array -+ * @depth: depth of the branches to free -+ * -+ * We are freeing all blocks refered from these branches (numbers are -+ * stored as little-endian 32-bit) and updating @inode->i_blocks -+ * appropriately. -+ */ -+static void ext3_free_branches(handle_t *handle, struct inode *inode, -+ struct buffer_head *parent_bh, -+ u32 *first, u32 *last, int depth) -+{ -+ unsigned long nr; -+ u32 *p; -+ -+ if (is_handle_aborted(handle)) -+ return; -+ -+ if (depth--) { -+ struct buffer_head *bh; -+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ p = last; -+ while (--p >= first) { -+ nr = le32_to_cpu(*p); -+ if (!nr) -+ continue; /* A hole */ -+ -+ /* Go read the buffer for the next level down */ -+ bh = sb_bread(inode->i_sb, nr); -+ -+ /* -+ * A read failure? Report error and clear slot -+ * (should be rare). 
-+ */ -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_free_branches", -+ "Read failure, inode=%ld, block=%ld", -+ inode->i_ino, nr); -+ continue; -+ } -+ -+ /* This zaps the entire block. Bottom up. */ -+ BUFFER_TRACE(bh, "free child branches"); -+ ext3_free_branches(handle, inode, bh, (u32*)bh->b_data, -+ (u32*)bh->b_data + addr_per_block, -+ depth); -+ -+ /* -+ * We've probably journalled the indirect block several -+ * times during the truncate. But it's no longer -+ * needed and we now drop it from the transaction via -+ * journal_revoke(). -+ * -+ * That's easy if it's exclusively part of this -+ * transaction. But if it's part of the committing -+ * transaction then journal_forget() will simply -+ * brelse() it. That means that if the underlying -+ * block is reallocated in ext3_get_block(), -+ * unmap_underlying_metadata() will find this block -+ * and will try to get rid of it. damn, damn. -+ * -+ * If this block has already been committed to the -+ * journal, a revoke record will be written. And -+ * revoke records must be emitted *before* clearing -+ * this block's bit in the bitmaps. -+ */ -+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr); -+ -+ /* -+ * Everything below this this pointer has been -+ * released. Now let this top-of-subtree go. -+ * -+ * We want the freeing of this indirect block to be -+ * atomic in the journal with the updating of the -+ * bitmap block which owns it. So make some room in -+ * the journal. -+ * -+ * We zero the parent pointer *after* freeing its -+ * pointee in the bitmaps, so if extend_transaction() -+ * for some reason fails to put the bitmap changes and -+ * the release into the same transaction, recovery -+ * will merely complain about releasing a free block, -+ * rather than leaking blocks. 
-+ */ -+ if (is_handle_aborted(handle)) -+ return; -+ if (try_to_extend_transaction(handle, inode)) { -+ ext3_mark_inode_dirty(handle, inode); -+ ext3_journal_test_restart(handle, inode); -+ } -+ -+ ext3_free_blocks(handle, inode, nr, 1); -+ -+ if (parent_bh) { -+ /* -+ * The block which we have just freed is -+ * pointed to by an indirect block: journal it -+ */ -+ BUFFER_TRACE(parent_bh, "get_write_access"); -+ if (!ext3_journal_get_write_access(handle, -+ parent_bh)){ -+ *p = 0; -+ BUFFER_TRACE(parent_bh, -+ "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, -+ parent_bh); -+ } -+ } -+ } -+ } else { -+ /* We have reached the bottom of the tree. */ -+ BUFFER_TRACE(parent_bh, "free data blocks"); -+ ext3_free_data(handle, inode, parent_bh, first, last); -+ } -+} -+ -+/* -+ * ext3_truncate() -+ * -+ * We block out ext3_get_block() block instantiations across the entire -+ * transaction, and VFS/VM ensures that ext3_truncate() cannot run -+ * simultaneously on behalf of the same inode. -+ * -+ * As we work through the truncate and commmit bits of it to the journal there -+ * is one core, guiding principle: the file's tree must always be consistent on -+ * disk. We must be able to restart the truncate after a crash. -+ * -+ * The file's tree may be transiently inconsistent in memory (although it -+ * probably isn't), but whenever we close off and commit a journal transaction, -+ * the contents of (the filesystem + the journal) must be consistent and -+ * restartable. It's pretty simple, really: bottom up, right to left (although -+ * left-to-right works OK too). -+ * -+ * Note that at recovery time, journal replay occurs *before* the restart of -+ * truncate against the orphan inode list. -+ * -+ * The committed inode has the new, desired i_size (which is the same as -+ * i_disksize in this case). 
After a crash, ext3_orphan_cleanup() will see -+ * that this inode's truncate did not complete and it will again call -+ * ext3_truncate() to have another go. So there will be instantiated blocks -+ * to the right of the truncation point in a crashed ext3 filesystem. But -+ * that's fine - as long as they are linked from the inode, the post-crash -+ * ext3_truncate() run will find them and release them. -+ */ -+ -+void ext3_truncate(struct inode * inode) -+{ -+ handle_t *handle; -+ u32 *i_data = inode->u.ext3_i.i_data; -+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ int nr = 0; -+ int n; -+ long last_block; -+ unsigned blocksize; -+ -+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode))) -+ return; -+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) -+ return; -+ -+ ext3_discard_prealloc(inode); -+ -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) -+ return; /* AKPM: return what? */ -+ -+ blocksize = inode->i_sb->s_blocksize; -+ last_block = (inode->i_size + blocksize-1) -+ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); -+ -+ ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); -+ -+ -+ n = ext3_block_to_path(inode, last_block, offsets); -+ if (n == 0) -+ goto out_stop; /* error */ -+ -+ /* -+ * OK. This truncate is going to happen. We add the inode to the -+ * orphan list, so that if this truncate spans multiple transactions, -+ * and we crash, we will resume the truncate when the filesystem -+ * recovers. It also marks the inode dirty, to catch the new size. -+ * -+ * Implication: the file must always be in a sane, consistent -+ * truncatable state while each transaction commits. 
-+ */ -+ if (ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* -+ * The orphan list entry will now protect us from any crash which -+ * occurs before the truncate completes, so it is now safe to propagate -+ * the new, shorter inode size (held for now in i_size) into the -+ * on-disk inode. We do this via i_disksize, which is the value which -+ * ext3 *really* writes onto the disk inode. -+ */ -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ -+ /* -+ * From here we block out all ext3_get_block() callers who want to -+ * modify the block allocation tree. -+ */ -+ down_write(&inode->u.ext3_i.truncate_sem); -+ -+ if (n == 1) { /* direct blocks */ -+ ext3_free_data(handle, inode, NULL, i_data+offsets[0], -+ i_data + EXT3_NDIR_BLOCKS); -+ goto do_indirects; -+ } -+ -+ partial = ext3_find_shared(inode, n, offsets, chain, &nr); -+ /* Kill the top of shared branch (not detached) */ -+ if (nr) { -+ if (partial == chain) { -+ /* Shared branch grows from the inode */ -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, (chain+n-1) - partial); -+ *partial->p = 0; -+ /* -+ * We mark the inode dirty prior to restart, -+ * and prior to stop. No need for it here. 
-+ */ -+ } else { -+ /* Shared branch grows from an indirect block */ -+ BUFFER_TRACE(partial->bh, "get_write_access"); -+ ext3_free_branches(handle, inode, partial->bh, -+ partial->p, -+ partial->p+1, (chain+n-1) - partial); -+ } -+ } -+ /* Clear the ends of indirect blocks on the shared branch */ -+ while (partial > chain) { -+ ext3_free_branches(handle, inode, partial->bh, partial->p + 1, -+ (u32*)partial->bh->b_data + addr_per_block, -+ (chain+n-1) - partial); -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse (partial->bh); -+ partial--; -+ } -+do_indirects: -+ /* Kill the remaining (whole) subtrees */ -+ switch (offsets[0]) { -+ default: -+ nr = i_data[EXT3_IND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 1); -+ i_data[EXT3_IND_BLOCK] = 0; -+ } -+ case EXT3_IND_BLOCK: -+ nr = i_data[EXT3_DIND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 2); -+ i_data[EXT3_DIND_BLOCK] = 0; -+ } -+ case EXT3_DIND_BLOCK: -+ nr = i_data[EXT3_TIND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 3); -+ i_data[EXT3_TIND_BLOCK] = 0; -+ } -+ case EXT3_TIND_BLOCK: -+ ; -+ } -+ up_write(&inode->u.ext3_i.truncate_sem); -+ inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ ext3_journal_stop(handle, inode); -+} -+ -+/* -+ * ext3_get_inode_loc returns with an extra refcount against the -+ * inode's underlying buffer_head on success. 
-+ */ -+ -+int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) -+{ -+ struct buffer_head *bh = 0; -+ unsigned long block; -+ unsigned long block_group; -+ unsigned long group_desc; -+ unsigned long desc; -+ unsigned long offset; -+ struct ext3_group_desc * gdp; -+ -+ if ((inode->i_ino != EXT3_ROOT_INO && -+ inode->i_ino != EXT3_ACL_IDX_INO && -+ inode->i_ino != EXT3_ACL_DATA_INO && -+ inode->i_ino != EXT3_JOURNAL_INO && -+ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || -+ inode->i_ino > le32_to_cpu( -+ inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "bad inode number: %lu", inode->i_ino); -+ goto bad_inode; -+ } -+ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); -+ if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "group >= groups count"); -+ goto bad_inode; -+ } -+ group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); -+ desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); -+ bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; -+ if (!bh) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "Descriptor not loaded"); -+ goto bad_inode; -+ } -+ -+ gdp = (struct ext3_group_desc *) bh->b_data; -+ /* -+ * Figure out the offset within the block group inode table -+ */ -+ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * -+ EXT3_INODE_SIZE(inode->i_sb); -+ block = le32_to_cpu(gdp[desc].bg_inode_table) + -+ (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); -+ if (!(bh = sb_bread(inode->i_sb, block))) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "unable to read inode block - " -+ "inode=%lu, block=%lu", inode->i_ino, block); -+ goto bad_inode; -+ } -+ offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); -+ -+ iloc->bh = bh; -+ iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); -+ iloc->block_group = block_group; -+ -+ return 0; -+ -+ bad_inode: -+ return -EIO; -+} 
-+ -+void ext3_read_inode(struct inode * inode) -+{ -+ struct ext3_iloc iloc; -+ struct ext3_inode *raw_inode; -+ struct buffer_head *bh; -+ int block; -+ -+ if(ext3_get_inode_loc(inode, &iloc)) -+ goto bad_inode; -+ bh = iloc.bh; -+ raw_inode = iloc.raw_inode; -+ init_rwsem(&inode->u.ext3_i.truncate_sem); -+ inode->i_mode = le16_to_cpu(raw_inode->i_mode); -+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); -+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); -+ if(!(test_opt (inode->i_sb, NO_UID32))) { -+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; -+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; -+ } -+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); -+ inode->i_size = le32_to_cpu(raw_inode->i_size); -+ inode->i_atime = le32_to_cpu(raw_inode->i_atime); -+ inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); -+ inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); -+ inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); -+ /* We now have enough fields to check if the inode was active or not. -+ * This is needed because nfsd might try to access dead inodes -+ * the test is that same one that e2fsck uses -+ * NeilBrown 1999oct15 -+ */ -+ if (inode->i_nlink == 0) { -+ if (inode->i_mode == 0 || -+ !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { -+ /* this inode is deleted */ -+ brelse (bh); -+ goto bad_inode; -+ } -+ /* The only unlinked inodes we let through here have -+ * valid i_mode and are being read by the orphan -+ * recovery code: that's fine, we're about to complete -+ * the process of deleting those. 
*/ -+ } -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); -+ inode->i_version = ++event; -+ inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); -+#ifdef EXT3_FRAGMENTS -+ inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); -+ inode->u.ext3_i.i_frag_no = raw_inode->i_frag; -+ inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; -+#endif -+ inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); -+ } else { -+ inode->i_size |= -+ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; -+ } -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ inode->i_generation = le32_to_cpu(raw_inode->i_generation); -+#ifdef EXT3_PREALLOCATE -+ inode->u.ext3_i.i_prealloc_count = 0; -+#endif -+ inode->u.ext3_i.i_block_group = iloc.block_group; -+ -+ /* -+ * NOTE! The in-memory inode i_data array is in little-endian order -+ * even on big-endian machines: we do NOT byteswap the block numbers! 
-+ */ -+ for (block = 0; block < EXT3_N_BLOCKS; block++) -+ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ -+ brelse (iloc.bh); -+ -+ if (inode->i_ino == EXT3_ACL_IDX_INO || -+ inode->i_ino == EXT3_ACL_DATA_INO) -+ /* Nothing to do */ ; -+ else if (S_ISREG(inode->i_mode)) { -+ inode->i_op = &ext3_file_inode_operations; -+ inode->i_fop = &ext3_file_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ } else if (S_ISDIR(inode->i_mode)) { -+ inode->i_op = &ext3_dir_inode_operations; -+ inode->i_fop = &ext3_dir_operations; -+ } else if (S_ISLNK(inode->i_mode)) { -+ if (!inode->i_blocks) -+ inode->i_op = &ext3_fast_symlink_inode_operations; -+ else { -+ inode->i_op = &page_symlink_inode_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ } -+ } else -+ init_special_inode(inode, inode->i_mode, -+ le32_to_cpu(iloc.raw_inode->i_block[0])); -+ /* inode->i_attr_flags = 0; unused */ -+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ -+ inode->i_flags |= S_SYNC; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ -+ inode->i_flags |= S_APPEND; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ -+ inode->i_flags |= S_IMMUTABLE; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ -+ inode->i_flags |= S_NOATIME; -+ } -+ return; -+ -+bad_inode: -+ make_bad_inode(inode); -+ return; -+} -+ -+/* -+ * Post the struct inode info into an on-disk inode location in the -+ * buffer-cache. This gobbles the caller's reference to the -+ * buffer_head in the inode location struct. 
-+ */ -+ -+static int ext3_do_update_inode(handle_t *handle, -+ struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ struct ext3_inode *raw_inode = iloc->raw_inode; -+ struct buffer_head *bh = iloc->bh; -+ int err = 0, rc, block; -+ -+ if (handle) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out_brelse; -+ } -+ raw_inode->i_mode = cpu_to_le16(inode->i_mode); -+ if(!(test_opt(inode->i_sb, NO_UID32))) { -+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); -+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); -+/* -+ * Fix up interoperability with old kernels. Otherwise, old inodes get -+ * re-used with the upper 16 bits of the uid/gid intact -+ */ -+ if(!inode->u.ext3_i.i_dtime) { -+ raw_inode->i_uid_high = -+ cpu_to_le16(high_16_bits(inode->i_uid)); -+ raw_inode->i_gid_high = -+ cpu_to_le16(high_16_bits(inode->i_gid)); -+ } else { -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ } else { -+ raw_inode->i_uid_low = -+ cpu_to_le16(fs_high2lowuid(inode->i_uid)); -+ raw_inode->i_gid_low = -+ cpu_to_le16(fs_high2lowgid(inode->i_gid)); -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); -+ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); -+ raw_inode->i_atime = cpu_to_le32(inode->i_atime); -+ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); -+ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); -+ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); -+ raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); -+ raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); -+#ifdef EXT3_FRAGMENTS -+ raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); -+ raw_inode->i_frag = inode->u.ext3_i.i_frag_no; -+ raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; -+#else -+ /* If we are not tracking these fields in the in-memory inode, -+ * then preserve them on disk, but still 
initialise them to zero -+ * for new inodes. */ -+ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { -+ raw_inode->i_faddr = 0; -+ raw_inode->i_frag = 0; -+ raw_inode->i_fsize = 0; -+ } -+#endif -+ raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); -+ } else { -+ raw_inode->i_size_high = -+ cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); -+ if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { -+ struct super_block *sb = inode->i_sb; -+ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || -+ EXT3_SB(sb)->s_es->s_rev_level == -+ cpu_to_le32(EXT3_GOOD_OLD_REV)) { -+ /* If this is the first large file -+ * created, add a flag to the superblock. -+ */ -+ err = ext3_journal_get_write_access(handle, -+ sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto out_brelse; -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_RO_COMPAT_FEATURE(sb, -+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE); -+ sb->s_dirt = 1; -+ handle->h_sync = 1; -+ err = ext3_journal_dirty_metadata(handle, -+ sb->u.ext3_sb.s_sbh); -+ } -+ } -+ } -+ raw_inode->i_generation = le32_to_cpu(inode->i_generation); -+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) -+ raw_inode->i_block[0] = -+ cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); -+ else for (block = 0; block < EXT3_N_BLOCKS; block++) -+ raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ rc = ext3_journal_dirty_metadata(handle, bh); -+ if (!err) -+ err = rc; -+ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; -+ -+out_brelse: -+ brelse (bh); -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3_write_inode() -+ * -+ * We are called from a few places: -+ * -+ * - Within generic_file_write() for O_SYNC files. -+ * Here, there will be no transaction running. We wait for any running -+ * trasnaction to commit. -+ * -+ * - Within sys_sync(), kupdate and such. 
-+ * We wait on commit, if tol to. -+ * -+ * - Within prune_icache() (PF_MEMALLOC == true) -+ * Here we simply return. We can't afford to block kswapd on the -+ * journal commit. -+ * -+ * In all cases it is actually safe for us to return without doing anything, -+ * because the inode has been copied into a raw inode buffer in -+ * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for -+ * knfsd. -+ * -+ * Note that we are absolutely dependent upon all inode dirtiers doing the -+ * right thing: they *must* call mark_inode_dirty() after dirtying info in -+ * which we are interested. -+ * -+ * It would be a bug for them to not do this. The code: -+ * -+ * mark_inode_dirty(inode) -+ * stuff(); -+ * inode->i_size = expr; -+ * -+ * is in error because a kswapd-driven write_inode() could occur while -+ * `stuff()' is running, and the new i_size will be lost. Plus the inode -+ * will no longer be on the superblock's dirty inode list. -+ */ -+void ext3_write_inode(struct inode *inode, int wait) -+{ -+ if (current->flags & PF_MEMALLOC) -+ return; -+ -+ if (ext3_journal_current_handle()) { -+ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); -+ return; -+ } -+ -+ if (!wait) -+ return; -+ -+ ext3_force_commit(inode->i_sb); -+} -+ -+/* -+ * ext3_setattr() -+ * -+ * Called from notify_change. -+ * -+ * We want to trap VFS attempts to truncate the file as soon as -+ * possible. In particular, we want to make sure that when the VFS -+ * shrinks i_size, we put the inode on the orphan list and modify -+ * i_disksize immediately, so that during the subsequent flushing of -+ * dirty pages and freeing of disk blocks, we can guarantee that any -+ * commit will leave the blocks being flushed in an unused state on -+ * disk. (On recovery, the inode will get truncated and the blocks will -+ * be freed, so we have a strong guarantee that no future commit will -+ * leave these blocks visible to the user.) -+ * -+ * This is only needed for regular files. 
rmdir() has its own path, and -+ * we can never truncate a direcory except on final unlink (at which -+ * point i_nlink is zero so recovery is easy.) -+ * -+ * Called with the BKL. -+ */ -+ -+int ext3_setattr(struct dentry *dentry, struct iattr *attr) -+{ -+ struct inode *inode = dentry->d_inode; -+ int error, rc = 0; -+ const unsigned int ia_valid = attr->ia_valid; -+ -+ error = inode_change_ok(inode, attr); -+ if (error) -+ return error; -+ -+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || -+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { -+ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; -+ if (error) -+ return error; -+ } -+ -+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { -+ handle_t *handle; -+ -+ handle = ext3_journal_start(inode, 3); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ error = ext3_orphan_add(handle, inode); -+ inode->u.ext3_i.i_disksize = attr->ia_size; -+ rc = ext3_mark_inode_dirty(handle, inode); -+ if (!error) -+ error = rc; -+ ext3_journal_stop(handle, inode); -+ } -+ -+ rc = inode_setattr(inode, attr); -+ -+ /* If inode_setattr's call to ext3_truncate failed to get a -+ * transaction handle at all, we need to clean up the in-core -+ * orphan list manually. */ -+ if (inode->i_nlink) -+ ext3_orphan_del(NULL, inode); -+ -+err_out: -+ ext3_std_error(inode->i_sb, error); -+ if (!error) -+ error = rc; -+ return error; -+} -+ -+ -+/* -+ * akpm: how many blocks doth make a writepage()? -+ * -+ * With N blocks per page, it may be: -+ * N data blocks -+ * 2 indirect block -+ * 2 dindirect -+ * 1 tindirect -+ * N+5 bitmap blocks (from the above) -+ * N+5 group descriptor summary blocks -+ * 1 inode block -+ * 1 superblock. -+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files -+ * -+ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ * -+ * With ordered or writeback data it's the same, less the N data blocks. 
-+ * -+ * If the inode's direct blocks can hold an integral number of pages then a -+ * page cannot straddle two indirect blocks, and we can only touch one indirect -+ * and dindirect block, and the "5" above becomes "3". -+ * -+ * This still overestimates under most circumstances. If we were to pass the -+ * start and end offsets in here as well we could do block_to_path() on each -+ * block and work out the exact number of indirects which are touched. Pah. -+ */ -+ -+int ext3_writepage_trans_blocks(struct inode *inode) -+{ -+ int bpp = ext3_journal_blocks_per_page(inode); -+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; -+ int ret; -+ -+ if (ext3_should_journal_data(inode)) -+ ret = 3 * (bpp + indirects) + 2; -+ else -+ ret = 2 * (bpp + indirects) + 2; -+ -+#ifdef CONFIG_QUOTA -+ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return ret; -+} -+ -+int -+ext3_mark_iloc_dirty(handle_t *handle, -+ struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ int err = 0; -+ -+ if (handle) { -+ /* the do_update_inode consumes one bh->b_count */ -+ atomic_inc(&iloc->bh->b_count); -+ err = ext3_do_update_inode(handle, inode, iloc); -+ /* ext3_do_update_inode() does journal_dirty_metadata */ -+ brelse(iloc->bh); -+ } else { -+ printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n"); -+ } -+ return err; -+} -+ -+/* -+ * On success, We end up with an outstanding reference count against -+ * iloc->bh. This _must_ be cleaned up later. 
-+ */ -+ -+int -+ext3_reserve_inode_write(handle_t *handle, struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ int err = 0; -+ if (handle) { -+ err = ext3_get_inode_loc(inode, iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, iloc->bh); -+ if (err) { -+ brelse(iloc->bh); -+ iloc->bh = NULL; -+ } -+ } -+ } -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * akpm: What we do here is to mark the in-core inode as clean -+ * with respect to inode dirtiness (it may still be data-dirty). -+ * This means that the in-core inode may be reaped by prune_icache -+ * without having to perform any I/O. This is a very good thing, -+ * because *any* task may call prune_icache - even ones which -+ * have a transaction open against a different journal. -+ * -+ * Is this cheating? Not really. Sure, we haven't written the -+ * inode out, but prune_icache isn't a user-visible syncing function. -+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) -+ * we start and wait on commits. -+ * -+ * Is this efficient/effective? Well, we're being nice to the system -+ * by cleaning up our inodes proactively so they can be reaped -+ * without I/O. But we are potentially leaving up to five seconds' -+ * worth of inodes floating about which prune_icache wants us to -+ * write out. One way to fix that would be to get prune_icache() -+ * to do a write_super() to free up some memory. It has the desired -+ * effect. -+ */ -+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_iloc iloc; -+ int err; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (!err) -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ return err; -+} -+ -+/* -+ * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() -+ * -+ * We're really interested in the case where a file is being extended. 
-+ * i_size has been changed by generic_commit_write() and we thus need -+ * to include the updated inode in the current transaction. -+ * -+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks -+ * are allocated to the file. -+ * -+ * If the inode is marked synchronous, we don't honour that here - doing -+ * so would cause a commit on atime updates, which we don't bother doing. -+ * We handle synchronous inodes at the highest possible level. -+ */ -+void ext3_dirty_inode(struct inode *inode) -+{ -+ handle_t *current_handle = ext3_journal_current_handle(); -+ handle_t *handle; -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ goto out; -+ if (current_handle && -+ current_handle->h_transaction != handle->h_transaction) { -+ /* This task has a transaction open against a different fs */ -+ printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); -+ } else { -+ jbd_debug(5, "marking dirty. outer handle=%p\n", -+ current_handle); -+ ext3_mark_inode_dirty(handle, inode); -+ } -+ ext3_journal_stop(handle, inode); -+out: -+ unlock_kernel(); -+} -+ -+#ifdef AKPM -+/* -+ * Bind an inode's backing buffer_head into this transaction, to prevent -+ * it from being flushed to disk early. Unlike -+ * ext3_reserve_inode_write, this leaves behind no bh reference and -+ * returns no iloc structure, so the caller needs to repeat the iloc -+ * lookup to mark the inode dirty later. 
-+ */ -+static inline int -+ext3_pin_inode(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_iloc iloc; -+ -+ int err = 0; -+ if (handle) { -+ err = ext3_get_inode_loc(inode, &iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc.bh, "get_write_access"); -+ err = journal_get_write_access(handle, iloc.bh); -+ if (!err) -+ err = ext3_journal_dirty_metadata(handle, -+ iloc.bh); -+ brelse(iloc.bh); -+ } -+ } -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+#endif -+ -+int ext3_change_inode_journal_flag(struct inode *inode, int val) -+{ -+ journal_t *journal; -+ handle_t *handle; -+ int err; -+ -+ /* -+ * We have to be very careful here: changing a data block's -+ * journaling status dynamically is dangerous. If we write a -+ * data block to the journal, change the status and then delete -+ * that block, we risk forgetting to revoke the old log record -+ * from the journal and so a subsequent replay can corrupt data. -+ * So, first we make sure that the journal is empty and that -+ * nobody is changing anything. -+ */ -+ -+ journal = EXT3_JOURNAL(inode); -+ if (is_journal_aborted(journal) || IS_RDONLY(inode)) -+ return -EROFS; -+ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* -+ * OK, there are no updates running now, and all cached data is -+ * synced to disk. We are now in a completely consistent state -+ * which doesn't have anything in the journal, and we know that -+ * no filesystem updates are running, so it is safe to modify -+ * the inode's in-core data-journaling state flag now. -+ */ -+ -+ if (val) -+ inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; -+ else -+ inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; -+ -+ journal_unlock_updates(journal); -+ -+ /* Finally we can mark the inode as dirty. 
*/ -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ err = ext3_mark_inode_dirty(handle, inode); -+ handle->h_sync = 1; -+ ext3_journal_stop(handle, inode); -+ ext3_std_error(inode->i_sb, err); -+ -+ return err; -+} -+ -+ -+/* -+ * ext3_aops_journal_start(). -+ * -+ * -+ * -+ * We need to take the inode semaphore *outside* the -+ * journal_start/journal_stop. Otherwise, a different task could do a -+ * wait_for_commit() while holding ->i_sem, which deadlocks. The rule -+ * is: transaction open/closes are considered to be a locking operation -+ * and they nest *inside* ->i_sem. -+ * ---------------------------------------------------------------------------- -+ * Possible problem: -+ * ext3_file_write() -+ * -> generic_file_write() -+ * -> __alloc_pages() -+ * -> page_launder() -+ * -> ext3_writepage() -+ * -+ * And the writepage can be on a different fs while we have a -+ * transaction open against this one! Bad. -+ * -+ * I tried making the task PF_MEMALLOC here, but that simply results in -+ * 0-order allocation failures passed back to generic_file_write(). -+ * Instead, we rely on the reentrancy protection in ext3_writepage(). -+ * ---------------------------------------------------------------------------- -+ * When we do the journal_start() here we don't really need to reserve -+ * any blocks - we won't need any until we hit ext3_prepare_write(), -+ * which does all the needed journal extending. However! There is a -+ * problem with quotas: -+ * -+ * Thread 1: -+ * sys_sync -+ * ->sync_dquots -+ * ->commit_dquot -+ * ->lock_dquot -+ * ->write_dquot -+ * ->ext3_file_write -+ * ->journal_start -+ * ->ext3_prepare_write -+ * ->journal_extend -+ * ->journal_start -+ * Thread 2: -+ * ext3_create (for example) -+ * ->ext3_new_inode -+ * ->dquot_initialize -+ * ->lock_dquot -+ * -+ * Deadlock. Thread 1's journal_start blocks because thread 2 has a -+ * transaction open. 
Thread 2's transaction will never close because -+ * thread 2 is stuck waiting for the dquot lock. -+ * -+ * So. We must ensure that thread 1 *never* needs to extend the journal -+ * for quota writes. We do that by reserving enough journal blocks -+ * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we -+ * need to extend" test in ext3_prepare_write() succeeds. -+ */ -diff -rup --new-file linux.mcp2/fs/ext3/ioctl.c linux_tmp/fs/ext3/ioctl.c ---- linux.mcp2/fs/ext3/ioctl.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/ioctl.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,170 @@ -+/* -+ * linux/fs/ext3/ioctl.c -+ * -+ * Copyright (C) 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ unsigned int flags; -+ -+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); -+ -+ switch (cmd) { -+ case EXT3_IOC_GETFLAGS: -+ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; -+ return put_user(flags, (int *) arg); -+ case EXT3_IOC_SETFLAGS: { -+ handle_t *handle = NULL; -+ int err; -+ struct ext3_iloc iloc; -+ unsigned int oldflags; -+ unsigned int jflag; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ -+ if (get_user(flags, (int *) arg)) -+ return -EFAULT; -+ -+ oldflags = inode->u.ext3_i.i_flags; -+ -+ /* The JOURNAL_DATA flag is modifiable only by root */ -+ jflag = flags & EXT3_JOURNAL_DATA_FL; -+ -+ /* -+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by -+ * the relevant capability. -+ * -+ * This test looks nicer. 
Thanks to Pauline Middelink -+ */ -+ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { -+ if (!capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ } -+ -+ /* -+ * The JOURNAL_DATA flag can only be changed by -+ * the relevant capability. -+ */ -+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { -+ if (!capable(CAP_SYS_RESOURCE)) -+ return -EPERM; -+ } -+ -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto flags_err; -+ -+ flags = flags & EXT3_FL_USER_MODIFIABLE; -+ flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; -+ inode->u.ext3_i.i_flags = flags; -+ -+ if (flags & EXT3_SYNC_FL) -+ inode->i_flags |= S_SYNC; -+ else -+ inode->i_flags &= ~S_SYNC; -+ if (flags & EXT3_APPEND_FL) -+ inode->i_flags |= S_APPEND; -+ else -+ inode->i_flags &= ~S_APPEND; -+ if (flags & EXT3_IMMUTABLE_FL) -+ inode->i_flags |= S_IMMUTABLE; -+ else -+ inode->i_flags &= ~S_IMMUTABLE; -+ if (flags & EXT3_NOATIME_FL) -+ inode->i_flags |= S_NOATIME; -+ else -+ inode->i_flags &= ~S_NOATIME; -+ inode->i_ctime = CURRENT_TIME; -+ -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+flags_err: -+ ext3_journal_stop(handle, inode); -+ if (err) -+ return err; -+ -+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) -+ err = ext3_change_inode_journal_flag(inode, jflag); -+ return err; -+ } -+ case EXT3_IOC_GETVERSION: -+ case EXT3_IOC_GETVERSION_OLD: -+ return put_user(inode->i_generation, (int *) arg); -+ case EXT3_IOC_SETVERSION: -+ case EXT3_IOC_SETVERSION_OLD: { -+ handle_t *handle; -+ struct ext3_iloc iloc; -+ __u32 generation; -+ int err; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (get_user(generation, (int *) arg)) -+ return -EFAULT; -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ err 
= ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ return err; -+ -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_generation = generation; -+ -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ ext3_journal_stop(handle, inode); -+ return err; -+ } -+#ifdef CONFIG_JBD_DEBUG -+ case EXT3_IOC_WAIT_FOR_READONLY: -+ /* -+ * This is racy - by the time we're woken up and running, -+ * the superblock could be released. And the module could -+ * have been unloaded. So sue me. -+ * -+ * Returns 1 if it slept, else zero. -+ */ -+ { -+ struct super_block *sb = inode->i_sb; -+ DECLARE_WAITQUEUE(wait, current); -+ int ret = 0; -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); -+ if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { -+ schedule(); -+ ret = 1; -+ } -+ remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); -+ return ret; -+ } -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -diff -rup --new-file linux.mcp2/fs/ext3/namei.c linux_tmp/fs/ext3/namei.c ---- linux.mcp2/fs/ext3/namei.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/namei.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,1125 @@ -+/* -+ * linux/fs/ext3/namei.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/namei.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * Directory entry file type support and forward compatibility hooks -+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+/* -+ * define how far ahead to read directories while searching them. 
-+ */ -+#define NAMEI_RA_CHUNKS 2 -+#define NAMEI_RA_BLOCKS 4 -+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) -+ -+/* -+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. -+ * -+ * `len <= EXT3_NAME_LEN' is guaranteed by caller. -+ * `de != NULL' is guaranteed by caller. -+ */ -+static inline int ext3_match (int len, const char * const name, -+ struct ext3_dir_entry_2 * de) -+{ -+ if (len != de->name_len) -+ return 0; -+ if (!de->inode) -+ return 0; -+ return !memcmp(name, de->name, len); -+} -+ -+/* -+ * Returns 0 if not found, -1 on failure, and 1 on success -+ */ -+static int inline search_dirblock(struct buffer_head * bh, -+ struct inode *dir, -+ struct dentry *dentry, -+ unsigned long offset, -+ struct ext3_dir_entry_2 ** res_dir) -+{ -+ struct ext3_dir_entry_2 * de; -+ char * dlimit; -+ int de_len; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ dlimit = bh->b_data + dir->i_sb->s_blocksize; -+ while ((char *) de < dlimit) { -+ /* this code is executed quadratically often */ -+ /* do minimal checking `by hand' */ -+ -+ if ((char *) de + namelen <= dlimit && -+ ext3_match (namelen, name, de)) { -+ /* found a match - just to be sure, do a full check */ -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, offset)) -+ return -1; -+ *res_dir = de; -+ return 1; -+ } -+ /* prevent looping on a bad block */ -+ de_len = le16_to_cpu(de->rec_len); -+ if (de_len <= 0) -+ return -1; -+ offset += de_len; -+ de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); -+ } -+ return 0; -+} -+ -+/* -+ * ext3_find_entry() -+ * -+ * finds an entry in the specified directory with the wanted name. It -+ * returns the cache buffer in which the entry was found, and the entry -+ * itself (as a parameter - res_dir). 
It does NOT read the inode of the -+ * entry - you'll have to do that yourself if you want to. -+ * -+ * The returned buffer_head has ->b_count elevated. The caller is expected -+ * to brelse() it when appropriate. -+ */ -+static struct buffer_head * ext3_find_entry (struct dentry *dentry, -+ struct ext3_dir_entry_2 ** res_dir) -+{ -+ struct super_block * sb; -+ struct buffer_head * bh_use[NAMEI_RA_SIZE]; -+ struct buffer_head * bh, *ret = NULL; -+ unsigned long start, block, b; -+ int ra_max = 0; /* Number of bh's in the readahead -+ buffer, bh_use[] */ -+ int ra_ptr = 0; /* Current index into readahead -+ buffer */ -+ int num = 0; -+ int nblocks, i, err; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ *res_dir = NULL; -+ sb = dir->i_sb; -+ -+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -+ start = dir->u.ext3_i.i_dir_start_lookup; -+ if (start >= nblocks) -+ start = 0; -+ block = start; -+restart: -+ do { -+ /* -+ * We deal with the read-ahead logic here. -+ */ -+ if (ra_ptr >= ra_max) { -+ /* Refill the readahead buffer */ -+ ra_ptr = 0; -+ b = block; -+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { -+ /* -+ * Terminate if we reach the end of the -+ * directory and must wrap, or if our -+ * search has finished at this block. 
-+ */ -+ if (b >= nblocks || (num && block == start)) { -+ bh_use[ra_max] = NULL; -+ break; -+ } -+ num++; -+ bh = ext3_getblk(NULL, dir, b++, 0, &err); -+ bh_use[ra_max] = bh; -+ if (bh) -+ ll_rw_block(READ, 1, &bh); -+ } -+ } -+ if ((bh = bh_use[ra_ptr++]) == NULL) -+ goto next; -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ /* read error, skip block & hope for the best */ -+ brelse(bh); -+ goto next; -+ } -+ i = search_dirblock(bh, dir, dentry, -+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); -+ if (i == 1) { -+ dir->u.ext3_i.i_dir_start_lookup = block; -+ ret = bh; -+ goto cleanup_and_exit; -+ } else { -+ brelse(bh); -+ if (i < 0) -+ goto cleanup_and_exit; -+ } -+ next: -+ if (++block >= nblocks) -+ block = 0; -+ } while (block != start); -+ -+ /* -+ * If the directory has grown while we were searching, then -+ * search the last part of the directory before giving up. -+ */ -+ block = nblocks; -+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -+ if (block < nblocks) { -+ start = 0; -+ goto restart; -+ } -+ -+cleanup_and_exit: -+ /* Clean up the read-ahead blocks */ -+ for (; ra_ptr < ra_max; ra_ptr++) -+ brelse (bh_use[ra_ptr]); -+ return ret; -+} -+ -+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) -+{ -+ struct inode * inode; -+ struct ext3_dir_entry_2 * de; -+ struct buffer_head * bh; -+ -+ if (dentry->d_name.len > EXT3_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ bh = ext3_find_entry(dentry, &de); -+ inode = NULL; -+ if (bh) { -+ unsigned long ino = le32_to_cpu(de->inode); -+ brelse (bh); -+ inode = iget(dir->i_sb, ino); -+ -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ } -+ d_add(dentry, inode); -+ return NULL; -+} -+ -+#define S_SHIFT 12 -+static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { -+ [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, -+ [S_IFDIR >> S_SHIFT] EXT3_FT_DIR, -+ [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV, -+ [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV, -+ [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO, -+ [S_IFSOCK >> 
S_SHIFT] EXT3_FT_SOCK, -+ [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK, -+}; -+ -+static inline void ext3_set_de_type(struct super_block *sb, -+ struct ext3_dir_entry_2 *de, -+ umode_t mode) { -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) -+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -+} -+ -+/* -+ * ext3_add_entry() -+ * -+ * adds a file entry to the specified directory, using the same -+ * semantics as ext3_find_entry(). It returns NULL if it failed. -+ * -+ * NOTE!! The inode part of 'de' is left at 0 - which means you -+ * may not sleep between calling this and putting something into -+ * the entry, as someone else might have used it while you slept. -+ */ -+ -+/* -+ * AKPM: the journalling code here looks wrong on the error paths -+ */ -+static int ext3_add_entry (handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset; -+ unsigned short rec_len; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de, * de1; -+ struct super_block * sb; -+ int retval; -+ -+ sb = dir->i_sb; -+ -+ if (!namelen) -+ return -EINVAL; -+ bh = ext3_bread (handle, dir, 0, 0, &retval); -+ if (!bh) -+ return retval; -+ rec_len = EXT3_DIR_REC_LEN(namelen); -+ offset = 0; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ while (1) { -+ if ((char *)de >= sb->s_blocksize + bh->b_data) { -+ brelse (bh); -+ bh = NULL; -+ bh = ext3_bread (handle, dir, -+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -+ if (!bh) -+ return retval; -+ if (dir->i_size <= offset) { -+ if (dir->i_size == 0) { -+ brelse(bh); -+ return -ENOENT; -+ } -+ -+ ext3_debug ("creating next block\n"); -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ de->inode = 0; -+ de->rec_len = le16_to_cpu(sb->s_blocksize); -+ 
dir->u.ext3_i.i_disksize = -+ dir->i_size = offset + sb->s_blocksize; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ } else { -+ -+ ext3_debug ("skipping to next block\n"); -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ } -+ } -+ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -+ offset)) { -+ brelse (bh); -+ return -ENOENT; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ if ((le32_to_cpu(de->inode) == 0 && -+ le16_to_cpu(de->rec_len) >= rec_len) || -+ (le16_to_cpu(de->rec_len) >= -+ EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ /* By now the buffer is marked for journaling */ -+ offset += le16_to_cpu(de->rec_len); -+ if (le32_to_cpu(de->inode)) { -+ de1 = (struct ext3_dir_entry_2 *) ((char *) de + -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de1->rec_len = -+ cpu_to_le16(le16_to_cpu(de->rec_len) - -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de->rec_len = cpu_to_le16( -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. -+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. 
-+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ dir->i_version = ++event; -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ brelse(bh); -+ return 0; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ brelse (bh); -+ return -ENOSPC; -+} -+ -+/* -+ * ext3_delete_entry deletes a directory entry by merging it with the -+ * previous entry -+ */ -+static int ext3_delete_entry (handle_t *handle, -+ struct inode * dir, -+ struct ext3_dir_entry_2 * de_del, -+ struct buffer_head * bh) -+{ -+ struct ext3_dir_entry_2 * de, * pde; -+ int i; -+ -+ i = 0; -+ pde = NULL; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ while (i < bh->b_size) { -+ if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) -+ return -EIO; -+ if (de == de_del) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ if (pde) -+ pde->rec_len = -+ cpu_to_le16(le16_to_cpu(pde->rec_len) + -+ le16_to_cpu(de->rec_len)); -+ else -+ de->inode = 0; -+ dir->i_version = ++event; -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ return 0; -+ } -+ i += le16_to_cpu(de->rec_len); -+ pde = de; -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return -ENOENT; -+} -+ -+/* -+ * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we -+ * do not perform it in these functions. We perform it at the call site, -+ * if it is needed. 
-+ */ -+static inline void ext3_inc_count(handle_t *handle, struct inode *inode) -+{ -+ inode->i_nlink++; -+} -+ -+static inline void ext3_dec_count(handle_t *handle, struct inode *inode) -+{ -+ inode->i_nlink--; -+} -+ -+static int ext3_add_nondir(handle_t *handle, -+ struct dentry *dentry, struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ -+/* -+ * By the time this is called, we already have created -+ * the directory cache entry for the new file, but it -+ * is so far negative - it has no inode. -+ * -+ * If the create succeeds, we fill in the inode information -+ * with d_instantiate(). -+ */ -+static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int err; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ inode->i_op = &ext3_file_inode_operations; -+ inode->i_fop = &ext3_file_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+ } -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+static int ext3_mknod (struct inode * dir, struct dentry *dentry, -+ int mode, int rdev) -+{ -+ handle_t *handle; -+ struct inode *inode; -+ int err; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ init_special_inode(inode, mode, rdev); -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, 
inode); -+ } -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ struct buffer_head * dir_block; -+ struct ext3_dir_entry_2 * de; -+ int err; -+ -+ if (dir->i_nlink >= EXT3_LINK_MAX) -+ return -EMLINK; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, S_IFDIR); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ inode->i_op = &ext3_dir_inode_operations; -+ inode->i_fop = &ext3_dir_operations; -+ inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_blocks = 0; -+ dir_block = ext3_bread (handle, inode, 0, 1, &err); -+ if (!dir_block) { -+ inode->i_nlink--; /* is this nlink == 0? */ -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } -+ BUFFER_TRACE(dir_block, "get_write_access"); -+ ext3_journal_get_write_access(handle, dir_block); -+ de = (struct ext3_dir_entry_2 *) dir_block->b_data; -+ de->inode = cpu_to_le32(inode->i_ino); -+ de->name_len = 1; -+ de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); -+ strcpy (de->name, "."); -+ ext3_set_de_type(dir->i_sb, de, S_IFDIR); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ de->inode = cpu_to_le32(dir->i_ino); -+ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); -+ de->name_len = 2; -+ strcpy (de->name, ".."); -+ ext3_set_de_type(dir->i_sb, de, S_IFDIR); -+ inode->i_nlink = 2; -+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, dir_block); -+ brelse (dir_block); -+ inode->i_mode = S_IFDIR | mode; -+ if (dir->i_mode & S_ISGID) -+ inode->i_mode |= S_ISGID; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_entry (handle, dentry, inode); 
-+ if (err) -+ goto out_no_entry; -+ dir->i_nlink++; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ d_instantiate(dentry, inode); -+out_stop: -+ ext3_journal_stop(handle, dir); -+ return err; -+ -+out_no_entry: -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+} -+ -+/* -+ * routine to check that the specified directory is empty (for rmdir) -+ */ -+static int empty_dir (struct inode * inode) -+{ -+ unsigned long offset; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de, * de1; -+ struct super_block * sb; -+ int err; -+ -+ sb = inode->i_sb; -+ if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || -+ !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { -+ ext3_warning (inode->i_sb, "empty_dir", -+ "bad directory (dir #%lu) - no data block", -+ inode->i_ino); -+ return 1; -+ } -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ de1 = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ if (le32_to_cpu(de->inode) != inode->i_ino || -+ !le32_to_cpu(de1->inode) || -+ strcmp (".", de->name) || -+ strcmp ("..", de1->name)) { -+ ext3_warning (inode->i_sb, "empty_dir", -+ "bad directory (dir #%lu) - no `.' 
or `..'", -+ inode->i_ino); -+ brelse (bh); -+ return 1; -+ } -+ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de1 + le16_to_cpu(de1->rec_len)); -+ while (offset < inode->i_size ) { -+ if (!bh || -+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { -+ brelse (bh); -+ bh = ext3_bread (NULL, inode, -+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); -+ if (!bh) { -+#if 0 -+ ext3_error (sb, "empty_dir", -+ "directory #%lu contains a hole at offset %lu", -+ inode->i_ino, offset); -+#endif -+ offset += sb->s_blocksize; -+ continue; -+ } -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ } -+ if (!ext3_check_dir_entry ("empty_dir", inode, de, bh, -+ offset)) { -+ brelse (bh); -+ return 1; -+ } -+ if (le32_to_cpu(de->inode)) { -+ brelse (bh); -+ return 0; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ brelse (bh); -+ return 1; -+} -+ -+/* ext3_orphan_add() links an unlinked or truncated inode into a list of -+ * such inodes, starting at the superblock, in case we crash before the -+ * file is closed/deleted, or in case the inode truncate spans multiple -+ * transactions and the last transaction is not recovered after a crash. -+ * -+ * At filesystem recovery time, we walk this list deleting unlinked -+ * inodes and truncating linked inodes in ext3_orphan_cleanup(). -+ */ -+int ext3_orphan_add(handle_t *handle, struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct ext3_iloc iloc; -+ int err = 0, rc; -+ -+ lock_super(sb); -+ if (!list_empty(&inode->u.ext3_i.i_orphan)) -+ goto out_unlock; -+ -+ /* Orphan handling is only valid for files with data blocks -+ * being truncated, or files being unlinked. */ -+ -+ /* @@@ FIXME: Observation from aviro: -+ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block -+ * here (on lock_super()), so race with ext3_link() which might bump -+ * ->i_nlink. 
For, say it, character device. Not a regular file, -+ * not a directory, not a symlink and ->i_nlink > 0. -+ */ -+ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto out_unlock; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_unlock; -+ -+ /* Insert this inode at the head of the on-disk orphan list... */ -+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); -+ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ rc = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ if (!err) -+ err = rc; -+ -+ /* Only add to the head of the in-memory list if all the -+ * previous operations succeeded. If the orphan_add is going to -+ * fail (possibly taking the journal offline), we can't risk -+ * leaving the inode on the orphan list: stray orphan-list -+ * entries can cause panics at unmount time. -+ * -+ * This is safe: on error we're going to ignore the orphan list -+ * anyway on the next recovery. */ -+ if (!err) -+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ -+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); -+ jbd_debug(4, "orphan inode %ld will point to %d\n", -+ inode->i_ino, NEXT_ORPHAN(inode)); -+out_unlock: -+ unlock_super(sb); -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3_orphan_del() removes an unlinked or truncated inode from the list -+ * of such inodes stored on disk, because it is finally being cleaned up. 
-+ */ -+int ext3_orphan_del(handle_t *handle, struct inode *inode) -+{ -+ struct list_head *prev; -+ struct ext3_sb_info *sbi; -+ ino_t ino_next; -+ struct ext3_iloc iloc; -+ int err = 0; -+ -+ lock_super(inode->i_sb); -+ if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ unlock_super(inode->i_sb); -+ return 0; -+ } -+ -+ ino_next = NEXT_ORPHAN(inode); -+ prev = inode->u.ext3_i.i_orphan.prev; -+ sbi = EXT3_SB(inode->i_sb); -+ -+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); -+ -+ list_del(&inode->u.ext3_i.i_orphan); -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ -+ /* If we're on an error path, we may not have a valid -+ * transaction handle with which to update the orphan list on -+ * disk, but we still need to remove the inode from the linked -+ * list in memory. */ -+ if (!handle) -+ goto out; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_err; -+ -+ if (prev == &sbi->s_orphan) { -+ jbd_debug(4, "superblock will point to %ld\n", ino_next); -+ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sbi->s_sbh); -+ if (err) -+ goto out_brelse; -+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); -+ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); -+ } else { -+ struct ext3_iloc iloc2; -+ struct inode *i_prev = -+ list_entry(prev, struct inode, u.ext3_i.i_orphan); -+ -+ jbd_debug(4, "orphan inode %ld will point to %ld\n", -+ i_prev->i_ino, ino_next); -+ err = ext3_reserve_inode_write(handle, i_prev, &iloc2); -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(i_prev) = ino_next; -+ err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); -+ } -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(inode) = 0; -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ if (err) -+ goto out_brelse; -+ -+out_err: -+ ext3_std_error(inode->i_sb, err); -+out: -+ unlock_super(inode->i_sb); -+ return err; -+ -+out_brelse: -+ brelse(iloc.bh); -+ goto out_err; -+} -+ -+static int ext3_rmdir (struct 
inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de; -+ handle_t *handle; -+ -+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ retval = -ENOENT; -+ bh = ext3_find_entry (dentry, &de); -+ if (!bh) -+ goto end_rmdir; -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = dentry->d_inode; -+ DQUOT_INIT(inode); -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_rmdir; -+ -+ retval = -ENOTEMPTY; -+ if (!empty_dir (inode)) -+ goto end_rmdir; -+ -+ retval = ext3_delete_entry(handle, dir, de, bh); -+ if (retval) -+ goto end_rmdir; -+ if (inode->i_nlink != 2) -+ ext3_warning (inode->i_sb, "ext3_rmdir", -+ "empty directory has nlink!=2 (%d)", -+ inode->i_nlink); -+ inode->i_version = ++event; -+ inode->i_nlink = 0; -+ /* There's no need to set i_disksize: the fact that i_nlink is -+ * zero will ensure that the right thing happens during any -+ * recovery. 
*/ -+ inode->i_size = 0; -+ ext3_orphan_add(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ dir->i_nlink--; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ -+end_rmdir: -+ ext3_journal_stop(handle, dir); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3_unlink(struct inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de; -+ handle_t *handle; -+ -+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ retval = -ENOENT; -+ bh = ext3_find_entry (dentry, &de); -+ if (!bh) -+ goto end_unlink; -+ -+ inode = dentry->d_inode; -+ DQUOT_INIT(inode); -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_unlink; -+ -+ if (!inode->i_nlink) { -+ ext3_warning (inode->i_sb, "ext3_unlink", -+ "Deleting nonexistent file (%lu), %d", -+ inode->i_ino, inode->i_nlink); -+ inode->i_nlink = 1; -+ } -+ retval = ext3_delete_entry(handle, dir, de, bh); -+ if (retval) -+ goto end_unlink; -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ inode->i_nlink--; -+ if (!inode->i_nlink) -+ ext3_orphan_add(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ inode->i_ctime = dir->i_ctime; -+ retval = 0; -+ -+end_unlink: -+ ext3_journal_stop(handle, dir); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3_symlink (struct inode * dir, -+ struct dentry *dentry, const char * symname) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int l, err; -+ -+ l = strlen(symname)+1; -+ if (l > dir->i_sb->s_blocksize) -+ return -ENAMETOOLONG; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if 
(IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ if (l > sizeof (inode->u.ext3_i.i_data)) { -+ inode->i_op = &page_symlink_inode_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ /* -+ * block_symlink() calls back into ext3_prepare/commit_write. -+ * We have a transaction open. All is sweetness. It also sets -+ * i_size in generic_commit_write(). -+ */ -+ err = block_symlink(inode, symname, l); -+ if (err) -+ goto out_no_entry; -+ } else { -+ inode->i_op = &ext3_fast_symlink_inode_operations; -+ memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -+ inode->i_size = l-1; -+ } -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+out_stop: -+ ext3_journal_stop(handle, dir); -+ return err; -+ -+out_no_entry: -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+} -+ -+static int ext3_link (struct dentry * old_dentry, -+ struct inode * dir, struct dentry *dentry) -+{ -+ handle_t *handle; -+ struct inode *inode = old_dentry->d_inode; -+ int err; -+ -+ if (S_ISDIR(inode->i_mode)) -+ return -EPERM; -+ -+ if (inode->i_nlink >= EXT3_LINK_MAX) -+ return -EMLINK; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode->i_ctime = CURRENT_TIME; -+ ext3_inc_count(handle, inode); -+ atomic_inc(&inode->i_count); -+ -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+#define PARENT_INO(buffer) \ -+ ((struct ext3_dir_entry_2 *) ((char *) buffer + \ -+ le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode -+ -+/* -+ * Anybody can rename anything with this: the permission checks are left to the 
-+ * higher-level routines. -+ */ -+static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, -+ struct inode * new_dir,struct dentry *new_dentry) -+{ -+ handle_t *handle; -+ struct inode * old_inode, * new_inode; -+ struct buffer_head * old_bh, * new_bh, * dir_bh; -+ struct ext3_dir_entry_2 * old_de, * new_de; -+ int retval; -+ -+ old_bh = new_bh = dir_bh = NULL; -+ -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) -+ handle->h_sync = 1; -+ -+ old_bh = ext3_find_entry (old_dentry, &old_de); -+ /* -+ * Check for inode number is _not_ due to possible IO errors. -+ * We might rmdir the source, keep it as pwd of some process -+ * and merrily kill the link to whatever was created under the -+ * same name. Goodbye sticky bit ;-< -+ */ -+ old_inode = old_dentry->d_inode; -+ retval = -ENOENT; -+ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) -+ goto end_rename; -+ -+ new_inode = new_dentry->d_inode; -+ new_bh = ext3_find_entry (new_dentry, &new_de); -+ if (new_bh) { -+ if (!new_inode) { -+ brelse (new_bh); -+ new_bh = NULL; -+ } else { -+ DQUOT_INIT(new_inode); -+ } -+ } -+ if (S_ISDIR(old_inode->i_mode)) { -+ if (new_inode) { -+ retval = -ENOTEMPTY; -+ if (!empty_dir (new_inode)) -+ goto end_rename; -+ } -+ retval = -EIO; -+ dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); -+ if (!dir_bh) -+ goto end_rename; -+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) -+ goto end_rename; -+ retval = -EMLINK; -+ if (!new_inode && new_dir!=old_dir && -+ new_dir->i_nlink >= EXT3_LINK_MAX) -+ goto end_rename; -+ } -+ if (!new_bh) { -+ retval = ext3_add_entry (handle, new_dentry, old_inode); -+ if (retval) -+ goto end_rename; -+ } else { -+ BUFFER_TRACE(new_bh, "get write access"); -+ BUFFER_TRACE(new_bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, new_bh); -+ new_de->inode = 
le32_to_cpu(old_inode->i_ino); -+ if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, -+ EXT3_FEATURE_INCOMPAT_FILETYPE)) -+ new_de->file_type = old_de->file_type; -+ new_dir->i_version = ++event; -+ BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, new_bh); -+ brelse(new_bh); -+ new_bh = NULL; -+ } -+ -+ /* -+ * Like most other Unix systems, set the ctime for inodes on a -+ * rename. -+ */ -+ old_inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, old_inode); -+ -+ /* -+ * ok, that's it -+ */ -+ ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ -+ if (new_inode) { -+ new_inode->i_nlink--; -+ new_inode->i_ctime = CURRENT_TIME; -+ } -+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -+ old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ if (dir_bh) { -+ BUFFER_TRACE(dir_bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, dir_bh); -+ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); -+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, dir_bh); -+ old_dir->i_nlink--; -+ if (new_inode) { -+ new_inode->i_nlink--; -+ } else { -+ new_dir->i_nlink++; -+ new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, new_dir); -+ } -+ } -+ ext3_mark_inode_dirty(handle, old_dir); -+ if (new_inode) { -+ ext3_mark_inode_dirty(handle, new_inode); -+ if (!new_inode->i_nlink) -+ ext3_orphan_add(handle, new_inode); -+ } -+ retval = 0; -+ -+end_rename: -+ brelse (dir_bh); -+ brelse (old_bh); -+ brelse (new_bh); -+ ext3_journal_stop(handle, old_dir); -+ return retval; -+} -+ -+/* -+ * directories can handle most operations... 
-+ */ -+struct inode_operations ext3_dir_inode_operations = { -+ create: ext3_create, /* BKL held */ -+ lookup: ext3_lookup, /* BKL held */ -+ link: ext3_link, /* BKL held */ -+ unlink: ext3_unlink, /* BKL held */ -+ symlink: ext3_symlink, /* BKL held */ -+ mkdir: ext3_mkdir, /* BKL held */ -+ rmdir: ext3_rmdir, /* BKL held */ -+ mknod: ext3_mknod, /* BKL held */ -+ rename: ext3_rename, /* BKL held */ -+}; -diff -rup --new-file linux.mcp2/fs/ext3/super.c linux_tmp/fs/ext3/super.c ---- linux.mcp2/fs/ext3/super.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/super.c 2002-02-25 11:38:08.000000000 -0800 -@@ -0,0 +1,1753 @@ -+/* -+ * linux/fs/ext3/super.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_JBD_DEBUG -+static int ext3_ro_after; /* Make fs read-only after this many jiffies */ -+#endif -+ -+static int ext3_load_journal(struct super_block *, struct ext3_super_block *); -+static int ext3_create_journal(struct super_block *, struct ext3_super_block *, -+ int); -+static void ext3_commit_super (struct super_block * sb, -+ struct ext3_super_block * es, -+ int sync); -+static void ext3_mark_recovery_complete(struct super_block * sb, -+ struct ext3_super_block * es); -+static void ext3_clear_journal_err(struct super_block * sb, -+ struct ext3_super_block * es); -+ -+#ifdef CONFIG_JBD_DEBUG -+int journal_no_write[2]; -+ -+/* -+ * Debug code for turning filesystems "read-only" after a specified -+ * amount of time. 
This is for crash/recovery testing. -+ */ -+ -+static void make_rdonly(kdev_t dev, int *no_write) -+{ -+ if (dev) { -+ printk(KERN_WARNING "Turning device %s read-only\n", -+ bdevname(dev)); -+ *no_write = 0xdead0000 + dev; -+ } -+} -+ -+static void turn_fs_readonly(unsigned long arg) -+{ -+ struct super_block *sb = (struct super_block *)arg; -+ -+ make_rdonly(sb->s_dev, &journal_no_write[0]); -+ make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]); -+ wake_up(&EXT3_SB(sb)->ro_wait_queue); -+} -+ -+static void setup_ro_after(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ init_timer(&sbi->turn_ro_timer); -+ if (ext3_ro_after) { -+ printk(KERN_DEBUG "fs will go read-only in %d jiffies\n", -+ ext3_ro_after); -+ init_waitqueue_head(&sbi->ro_wait_queue); -+ journal_no_write[0] = 0; -+ journal_no_write[1] = 0; -+ sbi->turn_ro_timer.function = turn_fs_readonly; -+ sbi->turn_ro_timer.data = (unsigned long)sb; -+ sbi->turn_ro_timer.expires = jiffies + ext3_ro_after; -+ ext3_ro_after = 0; -+ add_timer(&sbi->turn_ro_timer); -+ } -+} -+ -+static void clear_ro_after(struct super_block *sb) -+{ -+ del_timer_sync(&EXT3_SB(sb)->turn_ro_timer); -+ journal_no_write[0] = 0; -+ journal_no_write[1] = 0; -+ ext3_ro_after = 0; -+} -+#else -+#define setup_ro_after(sb) do {} while (0) -+#define clear_ro_after(sb) do {} while (0) -+#endif -+ -+ -+static char error_buf[1024]; -+ -+/* Determine the appropriate response to ext3_error on a given filesystem */ -+ -+static int ext3_error_behaviour(struct super_block *sb) -+{ -+ /* First check for mount-time options */ -+ if (test_opt (sb, ERRORS_PANIC)) -+ return EXT3_ERRORS_PANIC; -+ if (test_opt (sb, ERRORS_RO)) -+ return EXT3_ERRORS_RO; -+ if (test_opt (sb, ERRORS_CONT)) -+ return EXT3_ERRORS_CONTINUE; -+ -+ /* If no overrides were specified on the mount, then fall back -+ * to the default behaviour set in the filesystem's superblock -+ * on disk. 
*/ -+ switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) { -+ case EXT3_ERRORS_PANIC: -+ return EXT3_ERRORS_PANIC; -+ case EXT3_ERRORS_RO: -+ return EXT3_ERRORS_RO; -+ default: -+ break; -+ } -+ return EXT3_ERRORS_CONTINUE; -+} -+ -+/* Deal with the reporting of failure conditions on a filesystem such as -+ * inconsistencies detected or read IO failures. -+ * -+ * On ext2, we can store the error state of the filesystem in the -+ * superblock. That is not possible on ext3, because we may have other -+ * write ordering constraints on the superblock which prevent us from -+ * writing it out straight away; and given that the journal is about to -+ * be aborted, we can't rely on the current, or future, transactions to -+ * write out the superblock safely. -+ * -+ * We'll just use the journal_abort() error code to record an error in -+ * the journal instead. On recovery, the journal will compain about -+ * that error until we've noted it down and cleared it. -+ */ -+ -+static void ext3_handle_error(struct super_block *sb) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ -+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le32(EXT3_ERROR_FS); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) { -+ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; -+ journal_abort(EXT3_SB(sb)->s_journal, -EIO); -+ } -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) -+ panic ("EXT3-fs (device %s): panic forced after error\n", -+ bdevname(sb->s_dev)); -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) { -+ printk (KERN_CRIT "Remounting filesystem read-only\n"); -+ sb->s_flags |= MS_RDONLY; -+ } -+ -+ ext3_commit_super(sb, es, 1); -+} -+ -+void ext3_error (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ ext3_handle_error(sb); -+} -+ -+const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]) -+{ -+ char *errstr = NULL; -+ -+ switch (errno) { -+ case -EIO: -+ errstr = "IO failure"; -+ break; -+ case -ENOMEM: -+ errstr = "Out of memory"; -+ break; -+ case -EROFS: -+ if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) -+ errstr = "Journal has aborted"; -+ else -+ errstr = "Readonly filesystem"; -+ break; -+ default: -+ /* If the caller passed in an extra buffer for unknown -+ * errors, textualise them now. Else we just return -+ * NULL. */ -+ if (nbuf) { -+ /* Check for truncated error codes... */ -+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) -+ errstr = nbuf; -+ } -+ -+ break; -+ } -+ -+ return errstr; -+} -+ -+/* __ext3_std_error decodes expected errors from journaling functions -+ * automatically and invokes the appropriate error response. */ -+ -+void __ext3_std_error (struct super_block * sb, const char * function, -+ int errno) -+{ -+ char nbuf[16]; -+ const char *errstr = ext3_decode_error(sb, errno, nbuf); -+ -+ printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", -+ bdevname(sb->s_dev), function, errstr); -+ -+ ext3_handle_error(sb); -+} -+ -+/* -+ * ext3_abort is a much stronger failure handler than ext3_error. The -+ * abort function may be used to deal with unrecoverable failures such -+ * as journal IO errors or ENOMEM at a critical moment in log management. -+ * -+ * We unconditionally force the filesystem into an ABORT|READONLY state, -+ * unless the error response on the fs has been set to panic in which -+ * case we take the easy way out and panic immediately. -+ */ -+ -+void ext3_abort (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ printk (KERN_CRIT "ext3_abort called.\n"); -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) -+ panic ("EXT3-fs panic (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ printk (KERN_CRIT "Remounting filesystem read-only\n"); -+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; -+ sb->s_flags |= MS_RDONLY; -+ sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; -+ journal_abort(EXT3_SB(sb)->s_journal, -EIO); -+} -+ -+/* Deal with the reporting of failure conditions while running, such as -+ * inconsistencies in operation or invalid system states. -+ * -+ * Use ext3_error() for cases of invalid filesystem states, as that will -+ * record an error on disk and force a filesystem check on the next boot. -+ */ -+NORET_TYPE void ext3_panic (struct super_block * sb, const char * function, -+ const char * fmt, ...) -+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ /* this is to prevent panic from syncing this filesystem */ -+ /* AKPM: is this sufficient? */ -+ sb->s_flags |= MS_RDONLY; -+ panic ("EXT3-fs panic (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+} -+ -+void ext3_warning (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+} -+ -+void ext3_update_dynamic_rev(struct super_block *sb) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) -+ return; -+ -+ ext3_warning(sb, __FUNCTION__, -+ "updating to rev %d because of new feature flag, " -+ "running e2fsck is recommended", -+ EXT3_DYNAMIC_REV); -+ -+ es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); -+ es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); -+ es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); -+ /* leave es->s_feature_*compat flags alone */ -+ /* es->s_uuid will be set by e2fsck if empty */ -+ -+ /* -+ * The rest of the superblock fields should be zero, and if not it -+ * means they are likely already in use, so leave them alone. We -+ * can leave it up to e2fsck to clean up any inconsistencies there. 
-+ */ -+} -+ -+/* -+ * Open the external journal device -+ */ -+static struct block_device *ext3_blkdev_get(kdev_t dev) -+{ -+ struct block_device *bdev; -+ int err = -ENODEV; -+ -+ bdev = bdget(kdev_t_to_nr(dev)); -+ if (bdev == NULL) -+ goto fail; -+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS); -+ if (err < 0) -+ goto fail; -+ return bdev; -+ -+fail: -+ printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n", -+ bdevname(dev), err); -+ return NULL; -+} -+ -+/* -+ * Release the journal device -+ */ -+static int ext3_blkdev_put(struct block_device *bdev) -+{ -+ return blkdev_put(bdev, BDEV_FS); -+} -+ -+static int ext3_blkdev_remove(struct ext3_sb_info *sbi) -+{ -+ struct block_device *bdev; -+ int ret = -ENODEV; -+ -+ bdev = sbi->journal_bdev; -+ if (bdev) { -+ ret = ext3_blkdev_put(bdev); -+ sbi->journal_bdev = 0; -+ } -+ return ret; -+} -+ -+#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) -+ -+static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) -+{ -+ struct list_head *l; -+ -+ printk(KERN_ERR "sb orphan head is %d\n", -+ le32_to_cpu(sbi->s_es->s_last_orphan)); -+ -+ printk(KERN_ERR "sb_info orphan list:\n"); -+ list_for_each(l, &sbi->s_orphan) { -+ struct inode *inode = orphan_list_entry(l); -+ printk(KERN_ERR " " -+ "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n", -+ inode->i_dev, inode->i_ino, inode, -+ inode->i_mode, inode->i_nlink, -+ le32_to_cpu(NEXT_ORPHAN(inode))); -+ } -+} -+ -+void ext3_put_super (struct super_block * sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_super_block *es = sbi->s_es; -+ kdev_t j_dev = sbi->s_journal->j_dev; -+ int i; -+ -+ journal_destroy(sbi->s_journal); -+ if (!(sb->s_flags & MS_RDONLY)) { -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ es->s_state = le16_to_cpu(sbi->s_mount_state); -+ BUFFER_TRACE(sbi->s_sbh, "marking dirty"); -+ mark_buffer_dirty(sbi->s_sbh); -+ ext3_commit_super(sb, es, 1); -+ } 
-+ -+ for (i = 0; i < sbi->s_gdb_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) -+ brelse(sbi->s_inode_bitmap[i]); -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) -+ brelse(sbi->s_block_bitmap[i]); -+ brelse(sbi->s_sbh); -+ -+ /* Debugging code just in case the in-memory inode orphan list -+ * isn't empty. The on-disk one can be non-empty if we've -+ * detected an error and taken the fs readonly, but the -+ * in-memory list had better be clean by this point. */ -+ if (!list_empty(&sbi->s_orphan)) -+ dump_orphan_list(sb, sbi); -+ J_ASSERT(list_empty(&sbi->s_orphan)); -+ -+ invalidate_buffers(sb->s_dev); -+ if (j_dev != sb->s_dev) { -+ /* -+ * Invalidate the journal device's buffers. We don't want them -+ * floating about in memory - the physical journal device may -+ * hotswapped, and it breaks the `ro-after' testing code. -+ */ -+ fsync_no_super(j_dev); -+ invalidate_buffers(j_dev); -+ ext3_blkdev_remove(sbi); -+ } -+ clear_ro_after(sb); -+ -+ return; -+} -+ -+static struct super_operations ext3_sops = { -+ read_inode: ext3_read_inode, /* BKL held */ -+ write_inode: ext3_write_inode, /* BKL not held. Don't need */ -+ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ -+ put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+ delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+ put_super: ext3_put_super, /* BKL held */ -+ write_super: ext3_write_super, /* BKL held */ -+ write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ -+ unlockfs: ext3_unlockfs, /* BKL not held. 
We take it */ -+ statfs: ext3_statfs, /* BKL held */ -+ remount_fs: ext3_remount, /* BKL held */ -+}; -+ -+static int want_value(char *value, char *option) -+{ -+ if (!value || !*value) { -+ printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n", -+ option); -+ return -1; -+ } -+ return 0; -+} -+ -+static int want_null_value(char *value, char *option) -+{ -+ if (*value) { -+ printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n", -+ option, value); -+ return -1; -+ } -+ return 0; -+} -+ -+static int want_numeric(char *value, char *option, unsigned long *number) -+{ -+ if (want_value(value, option)) -+ return -1; -+ *number = simple_strtoul(value, &value, 0); -+ if (want_null_value(value, option)) -+ return -1; -+ return 0; -+} -+ -+/* -+ * This function has been shamelessly adapted from the msdos fs -+ */ -+static int parse_options (char * options, unsigned long * sb_block, -+ struct ext3_sb_info *sbi, -+ unsigned long * inum, -+ int is_remount) -+{ -+ unsigned long *mount_options = &sbi->s_mount_opt; -+ uid_t *resuid = &sbi->s_resuid; -+ gid_t *resgid = &sbi->s_resgid; -+ char * this_char; -+ char * value; -+ -+ if (!options) -+ return 1; -+ for (this_char = strtok (options, ","); -+ this_char != NULL; -+ this_char = strtok (NULL, ",")) { -+ if ((value = strchr (this_char, '=')) != NULL) -+ *value++ = 0; -+ if (!strcmp (this_char, "bsddf")) -+ clear_opt (*mount_options, MINIX_DF); -+ else if (!strcmp (this_char, "nouid32")) { -+ set_opt (*mount_options, NO_UID32); -+ } -+ else if (!strcmp (this_char, "abort")) -+ set_opt (*mount_options, ABORT); -+ else if (!strcmp (this_char, "check")) { -+ if (!value || !*value || !strcmp (value, "none")) -+ clear_opt (*mount_options, CHECK); -+ else -+#ifdef CONFIG_EXT3_CHECK -+ set_opt (*mount_options, CHECK); -+#else -+ printk(KERN_ERR -+ "EXT3 Check option not supported\n"); -+#endif -+ } -+ else if (!strcmp (this_char, "debug")) -+ set_opt (*mount_options, DEBUG); -+ else if (!strcmp (this_char, "errors")) { -+ if 
(want_value(value, "errors")) -+ return 0; -+ if (!strcmp (value, "continue")) { -+ clear_opt (*mount_options, ERRORS_RO); -+ clear_opt (*mount_options, ERRORS_PANIC); -+ set_opt (*mount_options, ERRORS_CONT); -+ } -+ else if (!strcmp (value, "remount-ro")) { -+ clear_opt (*mount_options, ERRORS_CONT); -+ clear_opt (*mount_options, ERRORS_PANIC); -+ set_opt (*mount_options, ERRORS_RO); -+ } -+ else if (!strcmp (value, "panic")) { -+ clear_opt (*mount_options, ERRORS_CONT); -+ clear_opt (*mount_options, ERRORS_RO); -+ set_opt (*mount_options, ERRORS_PANIC); -+ } -+ else { -+ printk (KERN_ERR -+ "EXT3-fs: Invalid errors option: %s\n", -+ value); -+ return 0; -+ } -+ } -+ else if (!strcmp (this_char, "grpid") || -+ !strcmp (this_char, "bsdgroups")) -+ set_opt (*mount_options, GRPID); -+ else if (!strcmp (this_char, "minixdf")) -+ set_opt (*mount_options, MINIX_DF); -+ else if (!strcmp (this_char, "nocheck")) -+ clear_opt (*mount_options, CHECK); -+ else if (!strcmp (this_char, "nogrpid") || -+ !strcmp (this_char, "sysvgroups")) -+ clear_opt (*mount_options, GRPID); -+ else if (!strcmp (this_char, "resgid")) { -+ unsigned long v; -+ if (want_numeric(value, "resgid", &v)) -+ return 0; -+ *resgid = v; -+ } -+ else if (!strcmp (this_char, "resuid")) { -+ unsigned long v; -+ if (want_numeric(value, "resuid", &v)) -+ return 0; -+ *resuid = v; -+ } -+ else if (!strcmp (this_char, "sb")) { -+ if (want_numeric(value, "sb", sb_block)) -+ return 0; -+ } -+#ifdef CONFIG_JBD_DEBUG -+ else if (!strcmp (this_char, "ro-after")) { -+ unsigned long v; -+ if (want_numeric(value, "ro-after", &v)) -+ return 0; -+ ext3_ro_after = v; -+ } -+#endif -+ /* Silently ignore the quota options */ -+ else if (!strcmp (this_char, "grpquota") -+ || !strcmp (this_char, "noquota") -+ || !strcmp (this_char, "quota") -+ || !strcmp (this_char, "usrquota")) -+ /* Don't do anything ;-) */ ; -+ else if (!strcmp (this_char, "journal")) { -+ /* @@@ FIXME */ -+ /* Eventually we will want to be able to create -+ 
a journal file here. For now, only allow the -+ user to specify an existing inode to be the -+ journal file. */ -+ if (is_remount) { -+ printk(KERN_ERR "EXT3-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ -+ if (want_value(value, "journal")) -+ return 0; -+ if (!strcmp (value, "update")) -+ set_opt (*mount_options, UPDATE_JOURNAL); -+ else if (want_numeric(value, "journal", inum)) -+ return 0; -+ } -+ else if (!strcmp (this_char, "noload")) -+ set_opt (*mount_options, NOLOAD); -+ else if (!strcmp (this_char, "data")) { -+ int data_opt = 0; -+ -+ if (want_value(value, "data")) -+ return 0; -+ if (!strcmp (value, "journal")) -+ data_opt = EXT3_MOUNT_JOURNAL_DATA; -+ else if (!strcmp (value, "ordered")) -+ data_opt = EXT3_MOUNT_ORDERED_DATA; -+ else if (!strcmp (value, "writeback")) -+ data_opt = EXT3_MOUNT_WRITEBACK_DATA; -+ else { -+ printk (KERN_ERR -+ "EXT3-fs: Invalid data option: %s\n", -+ value); -+ return 0; -+ } -+ if (is_remount) { -+ if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) != -+ data_opt) { -+ printk(KERN_ERR -+ "EXT3-fs: cannot change data " -+ "mode on remount\n"); -+ return 0; -+ } -+ } else { -+ *mount_options &= ~EXT3_MOUNT_DATA_FLAGS; -+ *mount_options |= data_opt; -+ } -+ } else { -+ printk (KERN_ERR -+ "EXT3-fs: Unrecognized mount option %s\n", -+ this_char); -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, -+ int read_only) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int res = 0; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { -+ printk (KERN_ERR "EXT3-fs warning: revision level too high, " -+ "forcing read-only mode\n"); -+ res = MS_RDONLY; -+ } -+ if (read_only) -+ return res; -+ if (!(sbi->s_mount_state & EXT3_VALID_FS)) -+ printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " -+ "running e2fsck is recommended\n"); -+ else if ((sbi->s_mount_state & EXT3_ERROR_FS)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: 
mounting fs with errors, " -+ "running e2fsck is recommended\n"); -+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && -+ le16_to_cpu(es->s_mnt_count) >= -+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: maximal mount count reached, " -+ "running e2fsck is recommended\n"); -+ else if (le32_to_cpu(es->s_checkinterval) && -+ (le32_to_cpu(es->s_lastcheck) + -+ le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: checktime reached, " -+ "running e2fsck is recommended\n"); -+#if 0 -+ /* @@@ We _will_ want to clear the valid bit if we find -+ inconsistencies, to force a fsck at reboot. But for -+ a plain journaled filesystem we can keep it set as -+ valid forever! :) */ -+ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); -+#endif -+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) -+ es->s_max_mnt_count = -+ (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); -+ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); -+ es->s_mtime = cpu_to_le32(CURRENT_TIME); -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super (sb, es, 1); -+ if (test_opt (sb, DEBUG)) -+ printk (KERN_INFO -+ "[EXT3 FS %s, %s, bs=%lu, gc=%lu, " -+ "bpg=%lu, ipg=%lu, mo=%04lx]\n", -+ EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize, -+ sbi->s_groups_count, -+ EXT3_BLOCKS_PER_GROUP(sb), -+ EXT3_INODES_PER_GROUP(sb), -+ sbi->s_mount_opt); -+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", -+ bdevname(sb->s_dev)); -+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -+ printk("external journal on %s\n", -+ bdevname(EXT3_SB(sb)->s_journal->j_dev)); -+ } else { -+ printk("internal journal\n"); -+ } -+#ifdef CONFIG_EXT3_CHECK -+ if (test_opt (sb, CHECK)) { -+ ext3_check_blocks_bitmap (sb); -+ ext3_check_inodes_bitmap (sb); -+ } -+#endif -+ setup_ro_after(sb); -+ return res; -+} -+ -+static int 
ext3_check_descriptors (struct super_block * sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); -+ struct ext3_group_desc * gdp = NULL; -+ int desc_block = 0; -+ int i; -+ -+ ext3_debug ("Checking group descriptors"); -+ -+ for (i = 0; i < sbi->s_groups_count; i++) -+ { -+ if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) -+ gdp = (struct ext3_group_desc *) -+ sbi->s_group_desc[desc_block++]->b_data; -+ if (le32_to_cpu(gdp->bg_block_bitmap) < block || -+ le32_to_cpu(gdp->bg_block_bitmap) >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Block bitmap for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_block_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_bitmap) < block || -+ le32_to_cpu(gdp->bg_inode_bitmap) >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Inode bitmap for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_table) < block || -+ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Inode table for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_table)); -+ return 0; -+ } -+ block += EXT3_BLOCKS_PER_GROUP(sb); -+ gdp++; -+ } -+ return 1; -+} -+ -+ -+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at -+ * the superblock) which were deleted from all directories, but held open by -+ * a process at the time of a crash. We walk the list and try to delete these -+ * inodes at recovery time (only with a read-write filesystem). 
-+ * -+ * In order to keep the orphan inode chain consistent during traversal (in -+ * case of crash during recovery), we link each inode into the superblock -+ * orphan list_head and handle it the same way as an inode deletion during -+ * normal operation (which journals the operations for us). -+ * -+ * We only do an iget() and an iput() on each inode, which is very safe if we -+ * accidentally point at an in-use or already deleted inode. The worst that -+ * can happen in this case is that we get a "bit already cleared" message from -+ * ext3_free_inode(). The only reason we would point at a wrong inode is if -+ * e2fsck was run on this filesystem, and it must have already done the orphan -+ * inode cleanup for us, so we can safely abort without any further action. -+ */ -+static void ext3_orphan_cleanup (struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ unsigned int s_flags = sb->s_flags; -+ int nr_orphans = 0, nr_truncates = 0; -+ if (!es->s_last_orphan) { -+ jbd_debug(4, "no orphan inodes to clean up\n"); -+ return; -+ } -+ -+ if (s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", -+ bdevname(sb->s_dev)); -+ sb->s_flags &= ~MS_RDONLY; -+ } -+ -+ if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { -+ if (es->s_last_orphan) -+ jbd_debug(1, "Errors on filesystem, " -+ "clearing orphan list.\n"); -+ es->s_last_orphan = 0; -+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); -+ return; -+ } -+ -+ while (es->s_last_orphan) { -+ struct inode *inode; -+ -+ if (!(inode = -+ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { -+ es->s_last_orphan = 0; -+ break; -+ } -+ -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); -+ if (inode->i_nlink) { -+ printk(KERN_DEBUG __FUNCTION__ -+ ": truncating inode %ld to %Ld bytes\n", -+ inode->i_ino, inode->i_size); -+ jbd_debug(2, "truncating inode %ld to %Ld bytes\n", -+ inode->i_ino, inode->i_size); -+ ext3_truncate(inode); -+ nr_truncates++; -+ } 
else { -+ printk(KERN_DEBUG __FUNCTION__ -+ ": deleting unreferenced inode %ld\n", -+ inode->i_ino); -+ jbd_debug(2, "deleting unreferenced inode %ld\n", -+ inode->i_ino); -+ nr_orphans++; -+ } -+ iput(inode); /* The delete magic happens here! */ -+ } -+ -+#define PLURAL(x) (x), ((x)==1) ? "" : "s" -+ -+ if (nr_orphans) -+ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", -+ bdevname(sb->s_dev), PLURAL(nr_orphans)); -+ if (nr_truncates) -+ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", -+ bdevname(sb->s_dev), PLURAL(nr_truncates)); -+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */ -+} -+ -+#define log2(n) ffz(~(n)) -+ -+/* -+ * Maximal file size. There is a direct, and {,double-,triple-}indirect -+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. -+ * We need to be 1 filesystem block less than the 2^32 sector limit. -+ */ -+static loff_t ext3_max_size(int bits) -+{ -+ loff_t res = EXT3_NDIR_BLOCKS; -+ res += 1LL << (bits-2); -+ res += 1LL << (2*(bits-2)); -+ res += 1LL << (3*(bits-2)); -+ res <<= bits; -+ if (res > (512LL << 32) - (1 << bits)) -+ res = (512LL << 32) - (1 << bits); -+ return res; -+} -+ -+struct super_block * ext3_read_super (struct super_block * sb, void * data, -+ int silent) -+{ -+ struct buffer_head * bh; -+ struct ext3_super_block *es = 0; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long sb_block = 1; -+ unsigned long logic_sb_block = 1; -+ unsigned long offset = 0; -+ unsigned long journal_inum = 0; -+ kdev_t dev = sb->s_dev; -+ int blocksize; -+ int hblock; -+ int db_count; -+ int i; -+ int needs_recovery; -+ -+#ifdef CONFIG_JBD_DEBUG -+ ext3_ro_after = 0; -+#endif -+ /* -+ * See what the current blocksize for the device is, and -+ * use that as the blocksize. Otherwise (or if the blocksize -+ * is smaller than the default) use the default. -+ * This is important for devices that have a hardware -+ * sectorsize that is larger than the default. 
-+ */ -+ blocksize = EXT3_MIN_BLOCK_SIZE; -+ hblock = get_hardsect_size(dev); -+ if (blocksize < hblock) -+ blocksize = hblock; -+ -+ sbi->s_mount_opt = 0; -+ sbi->s_resuid = EXT3_DEF_RESUID; -+ sbi->s_resgid = EXT3_DEF_RESGID; -+ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { -+ sb->s_dev = 0; -+ goto out_fail; -+ } -+ -+ sb->s_blocksize = blocksize; -+ set_blocksize (dev, blocksize); -+ -+ /* -+ * The ext3 superblock will not be buffer aligned for other than 1kB -+ * block sizes. We need to calculate the offset from buffer start. -+ */ -+ if (blocksize != EXT3_MIN_BLOCK_SIZE) { -+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; -+ } -+ -+ if (!(bh = sb_bread(sb, logic_sb_block))) { -+ printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); -+ goto out_fail; -+ } -+ /* -+ * Note: s_es must be initialized as soon as possible because -+ * some ext3 macro-instructions depend on its value -+ */ -+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ sb->s_magic = le16_to_cpu(es->s_magic); -+ if (sb->s_magic != EXT3_SUPER_MAGIC) { -+ if (!silent) -+ printk(KERN_ERR -+ "VFS: Can't find ext3 filesystem on dev %s.\n", -+ bdevname(dev)); -+ goto failed_mount; -+ } -+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && -+ (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || -+ EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || -+ EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) -+ printk(KERN_WARNING -+ "EXT3-fs warning: feature flags set on rev 0 fs, " -+ "running e2fsck is recommended\n"); -+ /* -+ * Check feature flags regardless of the revision level, since we -+ * previously didn't change the revision level when setting the flags, -+ * so there is a chance incompat flags are set on a rev 0 filesystem. 
-+ */ -+ if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) { -+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " -+ "unsupported optional features (%x).\n", -+ bdevname(dev), i); -+ goto failed_mount; -+ } -+ if (!(sb->s_flags & MS_RDONLY) && -+ (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){ -+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " -+ "unsupported optional features (%x).\n", -+ bdevname(dev), i); -+ goto failed_mount; -+ } -+ sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10; -+ sb->s_blocksize = 1 << sb->s_blocksize_bits; -+ -+ if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE || -+ sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) { -+ printk(KERN_ERR -+ "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", -+ blocksize, bdevname(dev)); -+ goto failed_mount; -+ } -+ -+ sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); -+ -+ if (sb->s_blocksize != blocksize) { -+ blocksize = sb->s_blocksize; -+ -+ /* -+ * Make sure the blocksize for the filesystem is larger -+ * than the hardware sectorsize for the machine. 
-+ */ -+ if (sb->s_blocksize < hblock) { -+ printk(KERN_ERR "EXT3-fs: blocksize %d too small for " -+ "device blocksize %d.\n", blocksize, hblock); -+ goto failed_mount; -+ } -+ -+ brelse (bh); -+ set_blocksize (dev, sb->s_blocksize); -+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; -+ bh = sb_bread(sb, logic_sb_block); -+ if (!bh) { -+ printk(KERN_ERR -+ "EXT3-fs: Can't read superblock on 2nd try.\n"); -+ return NULL; -+ } -+ es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) { -+ printk (KERN_ERR -+ "EXT3-fs: Magic mismatch, very weird !\n"); -+ goto failed_mount; -+ } -+ } -+ -+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { -+ sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; -+ sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; -+ } else { -+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size); -+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino); -+ if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { -+ printk (KERN_ERR -+ "EXT3-fs: unsupported inode size: %d\n", -+ sbi->s_inode_size); -+ goto failed_mount; -+ } -+ } -+ sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << -+ le32_to_cpu(es->s_log_frag_size); -+ if (blocksize != sbi->s_frag_size) { -+ printk(KERN_ERR -+ "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", -+ sbi->s_frag_size, blocksize); -+ goto failed_mount; -+ } -+ sbi->s_frags_per_block = 1; -+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); -+ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); -+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); -+ sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); -+ sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block; -+ sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); -+ sbi->s_sbh = bh; -+ if (sbi->s_resuid == EXT3_DEF_RESUID) -+ sbi->s_resuid = 
le16_to_cpu(es->s_def_resuid); -+ if (sbi->s_resgid == EXT3_DEF_RESGID) -+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid); -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); -+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); -+ -+ if (sbi->s_blocks_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #blocks per group too big: %lu\n", -+ sbi->s_blocks_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_frags_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #fragments per group too big: %lu\n", -+ sbi->s_frags_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_inodes_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #inodes per group too big: %lu\n", -+ sbi->s_inodes_per_group); -+ goto failed_mount; -+ } -+ -+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - -+ le32_to_cpu(es->s_first_data_block) + -+ EXT3_BLOCKS_PER_GROUP(sb) - 1) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / -+ EXT3_DESC_PER_BLOCK(sb); -+ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), -+ GFP_KERNEL); -+ if (sbi->s_group_desc == NULL) { -+ printk (KERN_ERR "EXT3-fs: not enough memory\n"); -+ goto failed_mount; -+ } -+ for (i = 0; i < db_count; i++) { -+ sbi->s_group_desc[i] = sb_bread(sb, logic_sb_block + i + 1); -+ if (!sbi->s_group_desc[i]) { -+ printk (KERN_ERR "EXT3-fs: " -+ "can't read group descriptor %d\n", i); -+ db_count = i; -+ goto failed_mount2; -+ } -+ } -+ if (!ext3_check_descriptors (sb)) { -+ printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); -+ goto failed_mount2; -+ } -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) { -+ sbi->s_inode_bitmap_number[i] = 0; -+ sbi->s_inode_bitmap[i] = NULL; -+ sbi->s_block_bitmap_number[i] = 0; -+ sbi->s_block_bitmap[i] = NULL; -+ } -+ sbi->s_loaded_inode_bitmaps = 0; -+ sbi->s_loaded_block_bitmaps = 0; -+ sbi->s_gdb_count = db_count; -+ 
get_random_bytes(&sbi->s_next_generation, sizeof(u32)); -+ /* -+ * set up enough so that it can read an inode -+ */ -+ sb->s_op = &ext3_sops; -+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ -+ -+ sb->s_root = 0; -+ -+ needs_recovery = (es->s_last_orphan != 0 || -+ EXT3_HAS_INCOMPAT_FEATURE(sb, -+ EXT3_FEATURE_INCOMPAT_RECOVER)); -+ -+ /* -+ * The first inode we look at is the journal inode. Don't try -+ * root first: it may be modified in the journal! -+ */ -+ if (!test_opt(sb, NOLOAD) && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { -+ if (ext3_load_journal(sb, es)) -+ goto failed_mount2; -+ } else if (journal_inum) { -+ if (ext3_create_journal(sb, es, journal_inum)) -+ goto failed_mount2; -+ } else { -+ if (!silent) -+ printk (KERN_ERR -+ "ext3: No journal on filesystem on %s\n", -+ bdevname(dev)); -+ goto failed_mount2; -+ } -+ -+ /* We have now updated the journal if required, so we can -+ * validate the data journaling mode. */ -+ switch (test_opt(sb, DATA_FLAGS)) { -+ case 0: -+ /* No mode set, assume a default based on the journal -+ capabilities: ORDERED_DATA if the journal can -+ cope, else JOURNAL_DATA */ -+ if (journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) -+ set_opt(sbi->s_mount_opt, ORDERED_DATA); -+ else -+ set_opt(sbi->s_mount_opt, JOURNAL_DATA); -+ break; -+ -+ case EXT3_MOUNT_ORDERED_DATA: -+ case EXT3_MOUNT_WRITEBACK_DATA: -+ if (!journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { -+ printk(KERN_ERR "EXT3-fs: Journal does not support " -+ "requested data journaling mode\n"); -+ goto failed_mount3; -+ } -+ default: -+ break; -+ } -+ -+ /* -+ * The journal_load will have done any necessary log recovery, -+ * so we can safely mount the rest of the filesystem now. 
-+ */ -+ -+ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO)); -+ if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) || -+ !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) { -+ if (sb->s_root) { -+ dput(sb->s_root); -+ sb->s_root = NULL; -+ printk(KERN_ERR -+ "EXT3-fs: corrupt root inode, run e2fsck\n"); -+ } else -+ printk(KERN_ERR "EXT3-fs: get root inode failed\n"); -+ goto failed_mount3; -+ } -+ -+ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ /* -+ * akpm: core read_super() calls in here with the superblock locked. -+ * That deadlocks, because orphan cleanup needs to lock the superblock -+ * in numerous places. Here we just pop the lock - it's relatively -+ * harmless, because we are now ready to accept write_super() requests, -+ * and aviro says that's the only reason for hanging onto the -+ * superblock lock. -+ */ -+ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; -+ unlock_super(sb); /* akpm: sigh */ -+ ext3_orphan_cleanup(sb, es); -+ lock_super(sb); -+ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (needs_recovery) -+ printk (KERN_INFO "EXT3-fs: recovery complete.\n"); -+ ext3_mark_recovery_complete(sb, es); -+ printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", -+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": -+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": -+ "writeback"); -+ -+ return sb; -+ -+failed_mount3: -+ journal_destroy(sbi->s_journal); -+failed_mount2: -+ for (i = 0; i < db_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+failed_mount: -+ ext3_blkdev_remove(sbi); -+ brelse(bh); -+out_fail: -+ return NULL; -+} -+ -+static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) -+{ -+ struct inode *journal_inode; -+ journal_t *journal; -+ -+ /* First, test for the existence of a valid inode on disk. Bad -+ * things happen if we iget() an unused inode, as the subsequent -+ * iput() will try to delete it. 
*/ -+ -+ journal_inode = iget(sb, journal_inum); -+ if (!journal_inode) { -+ printk(KERN_ERR "EXT3-fs: no journal found.\n"); -+ return NULL; -+ } -+ if (!journal_inode->i_nlink) { -+ make_bad_inode(journal_inode); -+ iput(journal_inode); -+ printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); -+ return NULL; -+ } -+ -+ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", -+ journal_inode, journal_inode->i_size); -+ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { -+ printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); -+ iput(journal_inode); -+ return NULL; -+ } -+ -+ journal = journal_init_inode(journal_inode); -+ if (!journal) { -+ printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); -+ iput(journal_inode); -+ } -+ -+ return journal; -+} -+ -+static journal_t *ext3_get_dev_journal(struct super_block *sb, -+ int dev) -+{ -+ struct buffer_head * bh; -+ journal_t *journal; -+ int start; -+ int len; -+ int hblock, blocksize; -+ unsigned long sb_block; -+ unsigned long offset; -+ kdev_t journal_dev = to_kdev_t(dev); -+ struct ext3_super_block * es; -+ struct block_device *bdev; -+ -+ bdev = ext3_blkdev_get(journal_dev); -+ if (bdev == NULL) -+ return NULL; -+ -+ blocksize = sb->s_blocksize; -+ hblock = get_hardsect_size(journal_dev); -+ if (blocksize < hblock) { -+ printk(KERN_ERR -+ "EXT3-fs: blocksize too small for journal device.\n"); -+ goto out_bdev; -+ } -+ -+ sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; -+ offset = EXT3_MIN_BLOCK_SIZE % blocksize; -+ set_blocksize(dev, blocksize); -+ if (!(bh = bread(dev, sb_block, blocksize))) { -+ printk(KERN_ERR "EXT3-fs: couldn't read superblock of " -+ "external journal\n"); -+ goto out_bdev; -+ } -+ -+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); -+ if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || -+ !(le32_to_cpu(es->s_feature_incompat) & -+ EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { -+ printk(KERN_ERR "EXT3-fs: external journal has " -+ "bad superblock\n"); -+ 
brelse(bh); -+ goto out_bdev; -+ } -+ -+ if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { -+ printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); -+ brelse(bh); -+ goto out_bdev; -+ } -+ -+ len = le32_to_cpu(es->s_blocks_count); -+ start = sb_block + 1; -+ brelse(bh); /* we're done with the superblock */ -+ -+ journal = journal_init_dev(journal_dev, sb->s_dev, -+ start, len, blocksize); -+ if (!journal) { -+ printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); -+ goto out_bdev; -+ } -+ ll_rw_block(READ, 1, &journal->j_sb_buffer); -+ wait_on_buffer(journal->j_sb_buffer); -+ if (!buffer_uptodate(journal->j_sb_buffer)) { -+ printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); -+ goto out_journal; -+ } -+ if (ntohl(journal->j_superblock->s_nr_users) != 1) { -+ printk(KERN_ERR "EXT3-fs: External journal has more than one " -+ "user (unsupported) - %d\n", -+ ntohl(journal->j_superblock->s_nr_users)); -+ goto out_journal; -+ } -+ EXT3_SB(sb)->journal_bdev = bdev; -+ return journal; -+out_journal: -+ journal_destroy(journal); -+out_bdev: -+ ext3_blkdev_put(bdev); -+ return NULL; -+} -+ -+static int ext3_load_journal(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_t *journal; -+ int journal_inum = le32_to_cpu(es->s_journal_inum); -+ int journal_dev = le32_to_cpu(es->s_journal_dev); -+ int err = 0; -+ int really_read_only; -+ -+ really_read_only = is_read_only(sb->s_dev); -+ -+ /* -+ * Are we loading a blank journal or performing recovery after a -+ * crash? For recovery, we need to check in advance whether we -+ * can get read-write access to the device. 
-+ */ -+ -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3-fs: INFO: recovery " -+ "required on readonly filesystem.\n"); -+ if (really_read_only) { -+ printk(KERN_ERR "EXT3-fs: write access " -+ "unavailable, cannot proceed.\n"); -+ return -EROFS; -+ } -+ printk (KERN_INFO "EXT3-fs: write access will " -+ "be enabled during recovery.\n"); -+ } -+ } -+ -+ if (journal_inum && journal_dev) { -+ printk(KERN_ERR "EXT3-fs: filesystem has both journal " -+ "and inode journals!\n"); -+ return -EINVAL; -+ } -+ -+ if (journal_inum) { -+ if (!(journal = ext3_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ } else { -+ if (!(journal = ext3_get_dev_journal(sb, journal_dev))) -+ return -EINVAL; -+ } -+ -+ -+ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { -+ err = journal_update_format(journal); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: error updating journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ } -+ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) -+ err = journal_wipe(journal, !really_read_only); -+ if (!err) -+ err = journal_load(journal); -+ -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: error loading journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ -+ EXT3_SB(sb)->s_journal = journal; -+ ext3_clear_journal_err(sb, es); -+ return 0; -+} -+ -+static int ext3_create_journal(struct super_block * sb, -+ struct ext3_super_block * es, -+ int journal_inum) -+{ -+ journal_t *journal; -+ -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " -+ "create journal.\n"); -+ return -EROFS; -+ } -+ -+ if (!(journal = ext3_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ -+ printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", -+ journal_inum); -+ -+ if (journal_create(journal)) { -+ printk(KERN_ERR "EXT3-fs: error creating journal.\n"); -+ journal_destroy(journal); -+ return -EIO; 
-+ } -+ -+ EXT3_SB(sb)->s_journal = journal; -+ -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); -+ -+ es->s_journal_inum = cpu_to_le32(journal_inum); -+ sb->s_dirt = 1; -+ -+ /* Make sure we flush the recovery flag to disk. */ -+ ext3_commit_super(sb, es, 1); -+ -+ return 0; -+} -+ -+static void ext3_commit_super (struct super_block * sb, -+ struct ext3_super_block * es, -+ int sync) -+{ -+ es->s_wtime = cpu_to_le32(CURRENT_TIME); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); -+ mark_buffer_dirty(sb->u.ext3_sb.s_sbh); -+ if (sync) { -+ ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); -+ wait_on_buffer(sb->u.ext3_sb.s_sbh); -+ } -+} -+ -+ -+/* -+ * Have we just finished recovery? If so, and if we are mounting (or -+ * remounting) the filesystem readonly, then we will end up with a -+ * consistent fs on disk. Record that fact. -+ */ -+static void ext3_mark_recovery_complete(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_flush(EXT3_SB(sb)->s_journal); -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && -+ sb->s_flags & MS_RDONLY) { -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ sb->s_dirt = 0; -+ ext3_commit_super(sb, es, 1); -+ } -+} -+ -+/* -+ * If we are mounting (or read-write remounting) a filesystem whose journal -+ * has recorded an error from a previous lifetime, move that error to the -+ * main filesystem now. 
-+ */ -+static void ext3_clear_journal_err(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_t *journal; -+ int j_errno; -+ const char *errstr; -+ -+ journal = EXT3_SB(sb)->s_journal; -+ -+ /* -+ * Now check for any error status which may have been recorded in the -+ * journal by a prior ext3_error() or ext3_abort() -+ */ -+ -+ j_errno = journal_errno(journal); -+ if (j_errno) { -+ char nbuf[16]; -+ -+ errstr = ext3_decode_error(sb, j_errno, nbuf); -+ ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " -+ "from previous mount: %s", errstr); -+ ext3_warning(sb, __FUNCTION__, "Marking fs in need of " -+ "filesystem check."); -+ -+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); -+ ext3_commit_super (sb, es, 1); -+ -+ journal_clear_err(journal); -+ } -+} -+ -+/* -+ * Force the running and committing transactions to commit, -+ * and wait on the commit. -+ */ -+int ext3_force_commit(struct super_block *sb) -+{ -+ journal_t *journal; -+ int ret; -+ -+ if (sb->s_flags & MS_RDONLY) -+ return 0; -+ -+ journal = EXT3_SB(sb)->s_journal; -+ sb->s_dirt = 0; -+ lock_kernel(); /* important: lock down j_running_transaction */ -+ ret = ext3_journal_force_commit(journal); -+ unlock_kernel(); -+ return ret; -+} -+ -+/* -+ * Ext3 always journals updates to the superblock itself, so we don't -+ * have to propagate any other updates to the superblock on disk at this -+ * point. Just start an async writeback to get the buffers on their way -+ * to the disk. -+ * -+ * This implicitly triggers the writebehind on sync(). 
-+ */ -+ -+static int do_sync_supers = 0; -+MODULE_PARM(do_sync_supers, "i"); -+MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); -+ -+void ext3_write_super (struct super_block * sb) -+{ -+ tid_t target; -+ -+ if (down_trylock(&sb->s_lock) == 0) -+ BUG(); /* aviro detector */ -+ sb->s_dirt = 0; -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); -+ -+ if (do_sync_supers) { -+ unlock_super(sb); -+ log_wait_commit(EXT3_SB(sb)->s_journal, target); -+ lock_super(sb); -+ } -+} -+ -+/* -+ * LVM calls this function before a (read-only) snapshot is created. This -+ * gives us a chance to flush the journal completely and mark the fs clean. -+ */ -+void ext3_write_super_lockfs(struct super_block *sb) -+{ -+ sb->s_dirt = 0; -+ -+ lock_kernel(); /* 2.4.5 forgot to do this for us */ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ journal_t *journal = EXT3_SB(sb)->s_journal; -+ -+ /* Now we set up the journal barrier. */ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* Journal blocked and flushed, clear needs_recovery flag. */ -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Called by LVM after the snapshot is done. We need to reset the RECOVER -+ * flag here, even though the filesystem is not technically dirty yet. -+ */ -+void ext3_unlockfs(struct super_block *sb) -+{ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ lock_kernel(); -+ lock_super(sb); -+ /* Reser the needs_recovery flag before the fs is unlocked. 
*/ -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); -+ unlock_super(sb); -+ journal_unlock_updates(EXT3_SB(sb)->s_journal); -+ unlock_kernel(); -+ } -+} -+ -+int ext3_remount (struct super_block * sb, int * flags, char * data) -+{ -+ struct ext3_super_block * es; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long tmp; -+ -+ clear_ro_after(sb); -+ -+ /* -+ * Allow the "check" option to be passed as a remount option. -+ */ -+ if (!parse_options(data, &tmp, sbi, &tmp, 1)) -+ return -EINVAL; -+ -+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) -+ ext3_abort(sb, __FUNCTION__, "Abort forced by user"); -+ -+ es = sbi->s_es; -+ -+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { -+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) -+ return -EROFS; -+ -+ if (*flags & MS_RDONLY) { -+ /* -+ * First of all, the unconditional stuff we have to do -+ * to disable replay of the journal when we next remount -+ */ -+ sb->s_flags |= MS_RDONLY; -+ -+ /* -+ * OK, test if we are remounting a valid rw partition -+ * readonly, and if so set the rdonly flag and then -+ * mark the partition as valid again. -+ */ -+ if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && -+ (sbi->s_mount_state & EXT3_VALID_FS)) -+ es->s_state = cpu_to_le16(sbi->s_mount_state); -+ -+ ext3_mark_recovery_complete(sb, es); -+ } else { -+ int ret; -+ if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ ~EXT3_FEATURE_RO_COMPAT_SUPP))) { -+ printk(KERN_WARNING "EXT3-fs: %s: couldn't " -+ "remount RDWR because of unsupported " -+ "optional features (%x).\n", -+ bdevname(sb->s_dev), ret); -+ return -EROFS; -+ } -+ /* -+ * Mounting a RDONLY partition read-write, so reread -+ * and store the current valid flag. (It may have -+ * been changed by e2fsck since we originally mounted -+ * the partition.) 
-+ */ -+ ext3_clear_journal_err(sb, es); -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ if (!ext3_setup_super (sb, es, 0)) -+ sb->s_flags &= ~MS_RDONLY; -+ } -+ } -+ setup_ro_after(sb); -+ return 0; -+} -+ -+int ext3_statfs (struct super_block * sb, struct statfs * buf) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ unsigned long overhead; -+ int i; -+ -+ if (test_opt (sb, MINIX_DF)) -+ overhead = 0; -+ else { -+ /* -+ * Compute the overhead (FS structures) -+ */ -+ -+ /* -+ * All of the blocks before first_data_block are -+ * overhead -+ */ -+ overhead = le32_to_cpu(es->s_first_data_block); -+ -+ /* -+ * Add the overhead attributed to the superblock and -+ * block group descriptors. If the sparse superblocks -+ * feature is turned on, then not all groups have this. -+ */ -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ overhead += ext3_bg_has_super(sb, i) + -+ ext3_bg_num_gdb(sb, i); -+ -+ /* -+ * Every block group has an inode bitmap, a block -+ * bitmap, and an inode table. 
-+ */ -+ overhead += (EXT3_SB(sb)->s_groups_count * -+ (2 + EXT3_SB(sb)->s_itb_per_group)); -+ } -+ -+ buf->f_type = EXT3_SUPER_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; -+ buf->f_bfree = ext3_count_free_blocks (sb); -+ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); -+ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) -+ buf->f_bavail = 0; -+ buf->f_files = le32_to_cpu(es->s_inodes_count); -+ buf->f_ffree = ext3_count_free_inodes (sb); -+ buf->f_namelen = EXT3_NAME_LEN; -+ return 0; -+} -+ -+static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); -+ -+static int __init init_ext3_fs(void) -+{ -+ return register_filesystem(&ext3_fs_type); -+} -+ -+static void __exit exit_ext3_fs(void) -+{ -+ unregister_filesystem(&ext3_fs_type); -+} -+ -+EXPORT_NO_SYMBOLS; -+ -+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -+MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -+MODULE_LICENSE("GPL"); -+module_init(init_ext3_fs) -+module_exit(exit_ext3_fs) -diff -rup --new-file linux.mcp2/fs/ext3/symlink.c linux_tmp/fs/ext3/symlink.c ---- linux.mcp2/fs/ext3/symlink.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/symlink.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,39 @@ -+/* -+ * linux/fs/ext3/symlink.c -+ * -+ * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/symlink.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3 symlink handling code -+ */ -+ -+#include -+#include -+#include -+ -+static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) -+{ -+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; -+ return vfs_readlink(dentry, buffer, buflen, s); -+} -+ -+static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) -+{ -+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; -+ return vfs_follow_link(nd, s); -+} -+ -+struct inode_operations ext3_fast_symlink_inode_operations = { -+ readlink: ext3_readlink, /* BKL not held. Don't need */ -+ follow_link: ext3_follow_link, /* BKL not held. Don't need */ -+}; diff --git a/lustre/kernel_patches/patches/2.4.19-jbd.patch b/lustre/kernel_patches/patches/2.4.19-jbd.patch deleted file mode 100644 index 4f4b38e..0000000 --- a/lustre/kernel_patches/patches/2.4.19-jbd.patch +++ /dev/null @@ -1,6524 +0,0 @@ -diff -ruP linux.mcp2/fs/jbd/Makefile linuxppc_2.4.19_final/fs/jbd/Makefile ---- linux.mcp2/fs/jbd/Makefile 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/Makefile 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,15 @@ -+# -+# fs/jbd/Makefile -+# -+# Makefile for the linux journaling routines. 
-+# -+ -+export-objs := journal.o -+O_TARGET := jbd.o -+ -+obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o -+ -+obj-m := $(O_TARGET) -+ -+include $(TOPDIR)/Rules.make -+ -diff -ruP linux.mcp2/fs/jbd/checkpoint.c linuxppc_2.4.19_final/fs/jbd/checkpoint.c ---- linux.mcp2/fs/jbd/checkpoint.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/checkpoint.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,605 @@ -+/* -+ * linux/fs/checkpoint.c -+ * -+ * Written by Stephen C. Tweedie , 1999 -+ * -+ * Copyright 1999 Red Hat Software --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Checkpoint routines for the generic filesystem journaling code. -+ * Part of the ext2fs journaling system. -+ * -+ * Checkpointing is the process of ensuring that a section of the log is -+ * committed fully to disk, so that that portion of the log can be -+ * reused. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * Unlink a buffer from a transaction. -+ * -+ * Called with journal_datalist_lock held. -+ */ -+ -+static inline void __buffer_unlink(struct journal_head *jh) -+{ -+ transaction_t *transaction; -+ -+ transaction = jh->b_cp_transaction; -+ jh->b_cp_transaction = NULL; -+ -+ jh->b_cpnext->b_cpprev = jh->b_cpprev; -+ jh->b_cpprev->b_cpnext = jh->b_cpnext; -+ if (transaction->t_checkpoint_list == jh) -+ transaction->t_checkpoint_list = jh->b_cpnext; -+ if (transaction->t_checkpoint_list == jh) -+ transaction->t_checkpoint_list = NULL; -+} -+ -+/* -+ * Try to release a checkpointed buffer from its transaction. -+ * Returns 1 if we released it. 
-+ * Requires journal_datalist_lock -+ */ -+static int __try_to_free_cp_buf(struct journal_head *jh) -+{ -+ int ret = 0; -+ struct buffer_head *bh = jh2bh(jh); -+ -+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { -+ JBUFFER_TRACE(jh, "remove from checkpoint list"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ BUFFER_TRACE(bh, "release"); -+ /* BUF_LOCKED -> BUF_CLEAN (fwiw) */ -+ refile_buffer(bh); -+ __brelse(bh); -+ ret = 1; -+ } -+ return ret; -+} -+ -+/* -+ * log_wait_for_space: wait until there is space in the journal. -+ * -+ * Called with the journal already locked, but it will be unlocked if we have -+ * to wait for a checkpoint to free up some space in the log. -+ */ -+ -+void log_wait_for_space(journal_t *journal, int nblocks) -+{ -+ while (log_space_left(journal) < nblocks) { -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ unlock_journal(journal); -+ down(&journal->j_checkpoint_sem); -+ lock_journal(journal); -+ -+ /* Test again, another process may have checkpointed -+ * while we were waiting for the checkpoint lock */ -+ if (log_space_left(journal) < nblocks) { -+ log_do_checkpoint(journal, nblocks); -+ } -+ up(&journal->j_checkpoint_sem); -+ } -+} -+ -+/* -+ * Clean up a transaction's checkpoint list. -+ * -+ * We wait for any pending IO to complete and make sure any clean -+ * buffers are removed from the transaction. -+ * -+ * Return 1 if we performed any actions which might have destroyed the -+ * checkpoint. (journal_remove_checkpoint() deletes the transaction when -+ * the last checkpoint buffer is cleansed) -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. 
-+ */ -+static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) -+{ -+ struct journal_head *jh, *next_jh, *last_jh; -+ struct buffer_head *bh; -+ int ret = 0; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ jh = transaction->t_checkpoint_list; -+ if (!jh) -+ return 0; -+ -+ last_jh = jh->b_cpprev; -+ next_jh = jh; -+ do { -+ jh = next_jh; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ /* the journal_head may have gone by now */ -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ goto out_return_1; -+ } -+ -+ if (jh->b_transaction != NULL) { -+ transaction_t *transaction = jh->b_transaction; -+ tid_t tid = transaction->t_tid; -+ -+ spin_unlock(&journal_datalist_lock); -+ log_start_commit(journal, transaction); -+ unlock_journal(journal); -+ log_wait_commit(journal, tid); -+ goto out_return_1; -+ } -+ -+ /* -+ * We used to test for (jh->b_list != BUF_CLEAN) here. -+ * But unmap_underlying_metadata() can place buffer onto -+ * BUF_CLEAN. Since refile_buffer() no longer takes buffers -+ * off checkpoint lists, we cope with it here -+ */ -+ /* -+ * AKPM: I think the buffer_jdirty test is redundant - it -+ * shouldn't have NULL b_transaction? 
-+ */ -+ next_jh = jh->b_cpnext; -+ if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { -+ BUFFER_TRACE(bh, "remove from checkpoint"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ ret = 1; -+ } -+ -+ jh = next_jh; -+ } while (jh != last_jh); -+ -+ return ret; -+out_return_1: -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ return 1; -+} -+ -+#define NR_BATCH 64 -+ -+static void __flush_batch(struct buffer_head **bhs, int *batch_count) -+{ -+ int i; -+ -+ spin_unlock(&journal_datalist_lock); -+ ll_rw_block(WRITE, *batch_count, bhs); -+ run_task_queue(&tq_disk); -+ spin_lock(&journal_datalist_lock); -+ for (i = 0; i < *batch_count; i++) { -+ struct buffer_head *bh = bhs[i]; -+ clear_bit(BH_JWrite, &bh->b_state); -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ } -+ *batch_count = 0; -+} -+ -+/* -+ * Try to flush one buffer from the checkpoint list to disk. -+ * -+ * Return 1 if something happened which requires us to abort the current -+ * scan of the checkpoint list. -+ * -+ * Called with journal_datalist_lock held. -+ */ -+static int __flush_buffer(journal_t *journal, struct journal_head *jh, -+ struct buffer_head **bhs, int *batch_count, -+ int *drop_count) -+{ -+ struct buffer_head *bh = jh2bh(jh); -+ int ret = 0; -+ -+ if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { -+ J_ASSERT_JH(jh, jh->b_transaction == NULL); -+ -+ /* -+ * Important: we are about to write the buffer, and -+ * possibly block, while still holding the journal lock. -+ * We cannot afford to let the transaction logic start -+ * messing around with this buffer before we write it to -+ * disk, as that would break recoverability. 
-+ */ -+ BUFFER_TRACE(bh, "queue"); -+ atomic_inc(&bh->b_count); -+ J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state)); -+ set_bit(BH_JWrite, &bh->b_state); -+ bhs[*batch_count] = bh; -+ (*batch_count)++; -+ if (*batch_count == NR_BATCH) { -+ __flush_batch(bhs, batch_count); -+ ret = 1; -+ } -+ } else { -+ int last_buffer = 0; -+ if (jh->b_cpnext == jh) { -+ /* We may be about to drop the transaction. Tell the -+ * caller that the lists have changed. -+ */ -+ last_buffer = 1; -+ } -+ if (__try_to_free_cp_buf(jh)) { -+ (*drop_count)++; -+ ret = last_buffer; -+ } -+ } -+ return ret; -+} -+ -+ -+/* -+ * Perform an actual checkpoint. We don't write out only enough to -+ * satisfy the current blocked requests: rather we submit a reasonably -+ * sized chunk of the outstanding data to disk at once for -+ * efficiency. log_wait_for_space() will retry if we didn't free enough. -+ * -+ * However, we _do_ take into account the amount requested so that once -+ * the IO has been queued, we can return as soon as enough of it has -+ * completed to disk. -+ * -+ * The journal should be locked before calling this function. -+ */ -+ -+/* @@@ `nblocks' is unused. Should it be used? */ -+int log_do_checkpoint (journal_t *journal, int nblocks) -+{ -+ transaction_t *transaction, *last_transaction, *next_transaction; -+ int result; -+ int target; -+ int batch_count = 0; -+ struct buffer_head *bhs[NR_BATCH]; -+ -+ jbd_debug(1, "Start checkpoint\n"); -+ -+ /* -+ * First thing: if there are any transactions in the log which -+ * don't need checkpointing, just eliminate them from the -+ * journal straight away. -+ */ -+ result = cleanup_journal_tail(journal); -+ jbd_debug(1, "cleanup_journal_tail returned %d\n", result); -+ if (result <= 0) -+ return result; -+ -+ /* -+ * OK, we need to start writing disk blocks. Try to free up a -+ * quarter of the log in a single checkpoint if we can. -+ */ -+ /* -+ * AKPM: check this code. 
I had a feeling a while back that it -+ * degenerates into a busy loop at unmount time. -+ */ -+ target = (journal->j_last - journal->j_first) / 4; -+ -+ spin_lock(&journal_datalist_lock); -+repeat: -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction == NULL) -+ goto done; -+ last_transaction = transaction->t_cpprev; -+ next_transaction = transaction; -+ -+ do { -+ struct journal_head *jh, *last_jh, *next_jh; -+ int drop_count = 0; -+ int cleanup_ret, retry = 0; -+ -+ transaction = next_transaction; -+ next_transaction = transaction->t_cpnext; -+ jh = transaction->t_checkpoint_list; -+ last_jh = jh->b_cpprev; -+ next_jh = jh; -+ do { -+ jh = next_jh; -+ next_jh = jh->b_cpnext; -+ retry = __flush_buffer(journal, jh, bhs, &batch_count, -+ &drop_count); -+ } while (jh != last_jh && !retry); -+ if (batch_count) { -+ __flush_batch(bhs, &batch_count); -+ goto repeat; -+ } -+ if (retry) -+ goto repeat; -+ /* -+ * We have walked the whole transaction list without -+ * finding anything to write to disk. We had better be -+ * able to make some progress or we are in trouble. -+ */ -+ cleanup_ret = __cleanup_transaction(journal, transaction); -+ J_ASSERT(drop_count != 0 || cleanup_ret != 0); -+ goto repeat; /* __cleanup may have dropped lock */ -+ } while (transaction != last_transaction); -+ -+done: -+ spin_unlock(&journal_datalist_lock); -+ result = cleanup_journal_tail(journal); -+ if (result < 0) -+ return result; -+ -+ return 0; -+} -+ -+/* -+ * Check the list of checkpoint transactions for the journal to see if -+ * we have already got rid of any since the last update of the log tail -+ * in the journal superblock. If so, we can instantly roll the -+ * superblock forward to remove those transactions from the log. -+ * -+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up. -+ * -+ * Called with the journal lock held. -+ * -+ * This is the only part of the journaling code which really needs to be -+ * aware of transaction aborts. 
Checkpointing involves writing to the -+ * main filesystem area rather than to the journal, so it can proceed -+ * even in abort state, but we must not update the journal superblock if -+ * we have an abort error outstanding. -+ */ -+ -+int cleanup_journal_tail(journal_t *journal) -+{ -+ transaction_t * transaction; -+ tid_t first_tid; -+ unsigned long blocknr, freed; -+ -+ /* OK, work out the oldest transaction remaining in the log, and -+ * the log block it starts at. -+ * -+ * If the log is now empty, we need to work out which is the -+ * next transaction ID we will write, and where it will -+ * start. */ -+ -+ /* j_checkpoint_transactions needs locking */ -+ spin_lock(&journal_datalist_lock); -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction) { -+ first_tid = transaction->t_tid; -+ blocknr = transaction->t_log_start; -+ } else if ((transaction = journal->j_committing_transaction) != NULL) { -+ first_tid = transaction->t_tid; -+ blocknr = transaction->t_log_start; -+ } else if ((transaction = journal->j_running_transaction) != NULL) { -+ first_tid = transaction->t_tid; -+ blocknr = journal->j_head; -+ } else { -+ first_tid = journal->j_transaction_sequence; -+ blocknr = journal->j_head; -+ } -+ spin_unlock(&journal_datalist_lock); -+ J_ASSERT (blocknr != 0); -+ -+ /* If the oldest pinned transaction is at the tail of the log -+ already then there's not much we can do right now. */ -+ if (journal->j_tail_sequence == first_tid) -+ return 1; -+ -+ /* OK, update the superblock to recover the freed space. -+ * Physical blocks come first: have we wrapped beyond the end of -+ * the log? 
*/ -+ freed = blocknr - journal->j_tail; -+ if (blocknr < journal->j_tail) -+ freed = freed + journal->j_last - journal->j_first; -+ -+ jbd_debug(1, -+ "Cleaning journal tail from %d to %d (offset %lu), " -+ "freeing %lu\n", -+ journal->j_tail_sequence, first_tid, blocknr, freed); -+ -+ journal->j_free += freed; -+ journal->j_tail_sequence = first_tid; -+ journal->j_tail = blocknr; -+ if (!(journal->j_flags & JFS_ABORT)) -+ journal_update_superblock(journal, 1); -+ return 0; -+} -+ -+ -+/* Checkpoint list management */ -+ -+/* -+ * journal_clean_checkpoint_list -+ * -+ * Find all the written-back checkpoint buffers in the journal and release them. -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. -+ * Returns number of bufers reaped (for debug) -+ */ -+ -+int __journal_clean_checkpoint_list(journal_t *journal) -+{ -+ transaction_t *transaction, *last_transaction, *next_transaction; -+ int ret = 0; -+ -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction == 0) -+ goto out; -+ -+ last_transaction = transaction->t_cpprev; -+ next_transaction = transaction; -+ do { -+ struct journal_head *jh; -+ -+ transaction = next_transaction; -+ next_transaction = transaction->t_cpnext; -+ jh = transaction->t_checkpoint_list; -+ if (jh) { -+ struct journal_head *last_jh = jh->b_cpprev; -+ struct journal_head *next_jh = jh; -+ do { -+ jh = next_jh; -+ next_jh = jh->b_cpnext; -+ ret += __try_to_free_cp_buf(jh); -+ } while (jh != last_jh); -+ } -+ } while (transaction != last_transaction); -+out: -+ return ret; -+} -+ -+/* -+ * journal_remove_checkpoint: called after a buffer has been committed -+ * to disk (either by being write-back flushed to disk, or being -+ * committed to the log). -+ * -+ * We cannot safely clean a transaction out of the log until all of the -+ * buffer updates committed in that transaction have safely been stored -+ * elsewhere on disk. 
To achieve this, all of the buffers in a -+ * transaction need to be maintained on the transaction's checkpoint -+ * list until they have been rewritten, at which point this function is -+ * called to remove the buffer from the existing transaction's -+ * checkpoint list. -+ * -+ * This function is called with the journal locked. -+ * This function is called with journal_datalist_lock held. -+ */ -+ -+void __journal_remove_checkpoint(struct journal_head *jh) -+{ -+ transaction_t *transaction; -+ journal_t *journal; -+ -+ JBUFFER_TRACE(jh, "entry"); -+ -+ if ((transaction = jh->b_cp_transaction) == NULL) { -+ JBUFFER_TRACE(jh, "not on transaction"); -+ goto out; -+ } -+ -+ journal = transaction->t_journal; -+ -+ __buffer_unlink(jh); -+ -+ if (transaction->t_checkpoint_list != NULL) -+ goto out; -+ JBUFFER_TRACE(jh, "transaction has no more buffers"); -+ -+ /* There is one special case to worry about: if we have just -+ pulled the buffer off a committing transaction's forget list, -+ then even if the checkpoint list is empty, the transaction -+ obviously cannot be dropped! */ -+ -+ if (transaction == journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "belongs to committing transaction"); -+ goto out; -+ } -+ -+ /* OK, that was the last buffer for the transaction: we can now -+ safely remove this transaction from the log */ -+ -+ __journal_drop_transaction(journal, transaction); -+ -+ /* Just in case anybody was waiting for more transactions to be -+ checkpointed... */ -+ wake_up(&journal->j_wait_logspace); -+out: -+ JBUFFER_TRACE(jh, "exit"); -+} -+ -+void journal_remove_checkpoint(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_remove_checkpoint(jh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint -+ * list so that we know when it is safe to clean the transaction out of -+ * the log. -+ * -+ * Called with the journal locked. 
-+ * Called with journal_datalist_lock held. -+ */ -+void __journal_insert_checkpoint(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ JBUFFER_TRACE(jh, "entry"); -+ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh))); -+ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); -+ -+ assert_spin_locked(&journal_datalist_lock); -+ jh->b_cp_transaction = transaction; -+ -+ if (!transaction->t_checkpoint_list) { -+ jh->b_cpnext = jh->b_cpprev = jh; -+ } else { -+ jh->b_cpnext = transaction->t_checkpoint_list; -+ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; -+ jh->b_cpprev->b_cpnext = jh; -+ jh->b_cpnext->b_cpprev = jh; -+ } -+ transaction->t_checkpoint_list = jh; -+} -+ -+void journal_insert_checkpoint(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_insert_checkpoint(jh, transaction); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * We've finished with this transaction structure: adios... -+ * -+ * The transaction must have no links except for the checkpoint by this -+ * point. -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. 
-+ */ -+ -+void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) -+{ -+ assert_spin_locked(&journal_datalist_lock); -+ if (transaction->t_cpnext) { -+ transaction->t_cpnext->t_cpprev = transaction->t_cpprev; -+ transaction->t_cpprev->t_cpnext = transaction->t_cpnext; -+ if (journal->j_checkpoint_transactions == transaction) -+ journal->j_checkpoint_transactions = -+ transaction->t_cpnext; -+ if (journal->j_checkpoint_transactions == transaction) -+ journal->j_checkpoint_transactions = NULL; -+ } -+ -+ J_ASSERT (transaction->t_ilist == NULL); -+ J_ASSERT (transaction->t_buffers == NULL); -+ J_ASSERT (transaction->t_sync_datalist == NULL); -+ J_ASSERT (transaction->t_async_datalist == NULL); -+ J_ASSERT (transaction->t_forget == NULL); -+ J_ASSERT (transaction->t_iobuf_list == NULL); -+ J_ASSERT (transaction->t_shadow_list == NULL); -+ J_ASSERT (transaction->t_log_list == NULL); -+ J_ASSERT (transaction->t_checkpoint_list == NULL); -+ J_ASSERT (transaction->t_updates == 0); -+ -+ J_ASSERT (transaction->t_journal->j_committing_transaction != -+ transaction); -+ -+ jbd_debug (1, "Dropping transaction %d, all done\n", -+ transaction->t_tid); -+ kfree (transaction); -+} -+ -diff -ruP linux.mcp2/fs/jbd/commit.c linuxppc_2.4.19_final/fs/jbd/commit.c ---- linux.mcp2/fs/jbd/commit.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/commit.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,719 @@ -+/* -+ * linux/fs/commit.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal commit routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * Default IO end handler for temporary BJ_IO buffer_heads. -+ */ -+void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) -+{ -+ BUFFER_TRACE(bh, ""); -+ mark_buffer_uptodate(bh, uptodate); -+ unlock_buffer(bh); -+} -+ -+/* -+ * journal_commit_transaction -+ * -+ * The primary function for committing a transaction to the log. This -+ * function is called by the journal thread to begin a complete commit. -+ */ -+void journal_commit_transaction(journal_t *journal) -+{ -+ transaction_t *commit_transaction; -+ struct journal_head *jh, *new_jh, *descriptor; -+ struct journal_head *next_jh, *last_jh; -+ struct buffer_head *wbuf[64]; -+ int bufs; -+ int flags; -+ int err; -+ unsigned long blocknr; -+ char *tagp = NULL; -+ journal_header_t *header; -+ journal_block_tag_t *tag = NULL; -+ int space_left = 0; -+ int first_tag = 0; -+ int tag_flag; -+ int i; -+ -+ /* -+ * First job: lock down the current transaction and wait for -+ * all outstanding updates to complete. 
-+ */ -+ -+ lock_journal(journal); /* Protect journal->j_running_transaction */ -+ -+#ifdef COMMIT_STATS -+ spin_lock(&journal_datalist_lock); -+ summarise_journal_usage(journal); -+ spin_unlock(&journal_datalist_lock); -+#endif -+ -+ lock_kernel(); -+ -+ J_ASSERT (journal->j_running_transaction != NULL); -+ J_ASSERT (journal->j_committing_transaction == NULL); -+ -+ commit_transaction = journal->j_running_transaction; -+ J_ASSERT (commit_transaction->t_state == T_RUNNING); -+ -+ jbd_debug (1, "JBD: starting commit of transaction %d\n", -+ commit_transaction->t_tid); -+ -+ commit_transaction->t_state = T_LOCKED; -+ while (commit_transaction->t_updates != 0) { -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_updates); -+ lock_journal(journal); -+ } -+ -+ J_ASSERT (commit_transaction->t_outstanding_credits <= -+ journal->j_max_transaction_buffers); -+ -+ /* Do we need to erase the effects of a prior journal_flush? */ -+ if (journal->j_flags & JFS_FLUSHED) { -+ jbd_debug(3, "super block updated\n"); -+ journal_update_superblock(journal, 1); -+ } else { -+ jbd_debug(3, "superblock not updated\n"); -+ } -+ -+ /* -+ * First thing we are allowed to do is to discard any remaining -+ * BJ_Reserved buffers. Note, it is _not_ permissible to assume -+ * that there are no such buffers: if a large filesystem -+ * operation like a truncate needs to split itself over multiple -+ * transactions, then it may try to do a journal_restart() while -+ * there are still BJ_Reserved buffers outstanding. These must -+ * be released cleanly from the current transaction. -+ * -+ * In this case, the filesystem must still reserve write access -+ * again before modifying the buffer in the new transaction, but -+ * we do not require it to remember exactly which old buffers it -+ * has reserved. This is consistent with the existing behaviour -+ * that multiple journal_get_write_access() calls to the same -+ * buffer are perfectly permissable. 
-+ */ -+ -+ while (commit_transaction->t_reserved_list) { -+ jh = commit_transaction->t_reserved_list; -+ JBUFFER_TRACE(jh, "reserved, unused: refile"); -+ journal_refile_buffer(jh); -+ } -+ -+ /* -+ * Now try to drop any written-back buffers from the journal's -+ * checkpoint lists. We do this *before* commit because it potentially -+ * frees some memory -+ */ -+ spin_lock(&journal_datalist_lock); -+ __journal_clean_checkpoint_list(journal); -+ spin_unlock(&journal_datalist_lock); -+ -+ /* First part of the commit: force the revoke list out to disk. -+ * The revoke code generates its own metadata blocks on disk for this. -+ * -+ * It is important that we do this while the transaction is -+ * still locked. Generating the revoke records should not -+ * generate any IO stalls, so this should be quick; and doing -+ * the work while we have the transaction locked means that we -+ * only ever have to maintain the revoke list for one -+ * transaction at a time. -+ */ -+ -+ jbd_debug (3, "JBD: commit phase 1\n"); -+ -+ journal_write_revoke_records(journal, commit_transaction); -+ -+ /* -+ * Now that we have built the revoke records, we can start -+ * reusing the revoke list for a new running transaction. We -+ * can now safely start committing the old transaction: time to -+ * get a new running transaction for incoming filesystem updates -+ */ -+ -+ commit_transaction->t_state = T_FLUSH; -+ -+ wake_up(&journal->j_wait_transaction_locked); -+ -+ journal->j_committing_transaction = commit_transaction; -+ journal->j_running_transaction = NULL; -+ -+ commit_transaction->t_log_start = journal->j_head; -+ -+ unlock_kernel(); -+ -+ jbd_debug (3, "JBD: commit phase 2\n"); -+ -+ /* -+ * Now start flushing things to disk, in the order they appear -+ * on the transaction lists. Data blocks go first. 
-+ */ -+ -+ /* -+ * Whenever we unlock the journal and sleep, things can get added -+ * onto ->t_datalist, so we have to keep looping back to write_out_data -+ * until we *know* that the list is empty. -+ */ -+write_out_data: -+ -+ /* -+ * Cleanup any flushed data buffers from the data list. Even in -+ * abort mode, we want to flush this out as soon as possible. -+ * -+ * We take journal_datalist_lock to protect the lists from -+ * journal_try_to_free_buffers(). -+ */ -+ spin_lock(&journal_datalist_lock); -+ -+write_out_data_locked: -+ bufs = 0; -+ next_jh = commit_transaction->t_sync_datalist; -+ if (next_jh == NULL) -+ goto sync_datalist_empty; -+ last_jh = next_jh->b_tprev; -+ -+ do { -+ struct buffer_head *bh; -+ -+ jh = next_jh; -+ next_jh = jh->b_tnext; -+ bh = jh2bh(jh); -+ if (!buffer_locked(bh)) { -+ if (buffer_dirty(bh)) { -+ BUFFER_TRACE(bh, "start journal writeout"); -+ atomic_inc(&bh->b_count); -+ wbuf[bufs++] = bh; -+ } else { -+ BUFFER_TRACE(bh, "writeout complete: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ } -+ } -+ if (bufs == ARRAY_SIZE(wbuf)) { -+ /* -+ * Major speedup: start here on the next scan -+ */ -+ J_ASSERT(commit_transaction->t_sync_datalist != 0); -+ commit_transaction->t_sync_datalist = jh; -+ break; -+ } -+ } while (jh != last_jh); -+ -+ if (bufs || current->need_resched) { -+ jbd_debug(2, "submit %d writes\n", bufs); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ if (bufs) -+ ll_rw_block(WRITE, bufs, wbuf); -+ if (current->need_resched) -+ schedule(); -+ journal_brelse_array(wbuf, bufs); -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ if (bufs) -+ goto write_out_data_locked; -+ } -+ -+ /* -+ * Wait for all previously submitted IO on the data list to complete. 
-+ */ -+ jh = commit_transaction->t_sync_datalist; -+ if (jh == NULL) -+ goto sync_datalist_empty; -+ -+ do { -+ struct buffer_head *bh; -+ jh = jh->b_tprev; /* Wait on the last written */ -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ /* the journal_head may have been removed now */ -+ lock_journal(journal); -+ goto write_out_data; -+ } else if (buffer_dirty(bh)) { -+ goto write_out_data_locked; -+ } -+ } while (jh != commit_transaction->t_sync_datalist); -+ goto write_out_data_locked; -+ -+sync_datalist_empty: -+ /* -+ * Wait for all the async writepage data. As they become unlocked -+ * in end_buffer_io_async(), the only place where they can be -+ * reaped is in try_to_free_buffers(), and we're locked against -+ * that. -+ */ -+ while ((jh = commit_transaction->t_async_datalist)) { -+ struct buffer_head *bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ continue; /* List may have changed */ -+ } -+ if (jh->b_next_transaction) { -+ /* -+ * For writepage() buffers in journalled data mode: a -+ * later transaction may want the buffer for "metadata" -+ */ -+ __journal_refile_buffer(jh); -+ } else { -+ BUFFER_TRACE(bh, "finished async writeout: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ BUFFER_TRACE(bh, "finished async writeout: refile"); -+ /* It can sometimes be on BUF_LOCKED due to migration -+ * from syncdata to asyncdata */ -+ if (bh->b_list != BUF_CLEAN) -+ refile_buffer(bh); -+ __brelse(bh); -+ } -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ /* -+ * If we found any dirty or locked buffers, then we should have -+ * looped back up to the write_out_data label. 
If there weren't -+ * any then journal_clean_data_list should have wiped the list -+ * clean by now, so check that it is in fact empty. -+ */ -+ J_ASSERT (commit_transaction->t_sync_datalist == NULL); -+ J_ASSERT (commit_transaction->t_async_datalist == NULL); -+ -+ jbd_debug (3, "JBD: commit phase 3\n"); -+ -+ /* -+ * Way to go: we have now written out all of the data for a -+ * transaction! Now comes the tricky part: we need to write out -+ * metadata. Loop over the transaction's entire buffer list: -+ */ -+ commit_transaction->t_state = T_COMMIT; -+ -+ descriptor = 0; -+ bufs = 0; -+ while (commit_transaction->t_buffers) { -+ -+ /* Find the next buffer to be journaled... */ -+ -+ jh = commit_transaction->t_buffers; -+ -+ /* If we're in abort mode, we just un-journal the buffer and -+ release it for background writing. */ -+ -+ if (is_journal_aborted(journal)) { -+ JBUFFER_TRACE(jh, "journal is aborting: refile"); -+ journal_refile_buffer(jh); -+ /* If that was the last one, we need to clean up -+ * any descriptor buffers which may have been -+ * already allocated, even if we are now -+ * aborting. */ -+ if (!commit_transaction->t_buffers) -+ goto start_journal_io; -+ continue; -+ } -+ -+ /* Make sure we have a descriptor block in which to -+ record the metadata buffer. 
*/ -+ -+ if (!descriptor) { -+ struct buffer_head *bh; -+ -+ J_ASSERT (bufs == 0); -+ -+ jbd_debug(4, "JBD: get descriptor\n"); -+ -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) { -+ __journal_abort_hard(journal); -+ continue; -+ } -+ -+ bh = jh2bh(descriptor); -+ jbd_debug(4, "JBD: got buffer %ld (%p)\n", -+ bh->b_blocknr, bh->b_data); -+ header = (journal_header_t *)&bh->b_data[0]; -+ header->h_magic = htonl(JFS_MAGIC_NUMBER); -+ header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK); -+ header->h_sequence = htonl(commit_transaction->t_tid); -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ space_left = bh->b_size - sizeof(journal_header_t); -+ first_tag = 1; -+ set_bit(BH_JWrite, &bh->b_state); -+ wbuf[bufs++] = bh; -+ -+ /* Record it so that we can wait for IO -+ completion later */ -+ BUFFER_TRACE(bh, "ph3: file as descriptor"); -+ journal_file_buffer(descriptor, commit_transaction, -+ BJ_LogCtl); -+ } -+ -+ /* Where is the buffer to be written? */ -+ -+ err = journal_next_log_block(journal, &blocknr); -+ /* If the block mapping failed, just abandon the buffer -+ and repeat this loop: we'll fall into the -+ refile-on-abort condition above. */ -+ if (err) { -+ __journal_abort_hard(journal); -+ continue; -+ } -+ -+ /* Bump b_count to prevent truncate from stumbling over -+ the shadowed buffer! @@@ This can go if we ever get -+ rid of the BJ_IO/BJ_Shadow pairing of buffers. */ -+ atomic_inc(&jh2bh(jh)->b_count); -+ -+ /* Make a temporary IO buffer with which to write it out -+ (this will requeue both the metadata buffer and the -+ temporary IO buffer). new_bh goes on BJ_IO*/ -+ -+ set_bit(BH_JWrite, &jh2bh(jh)->b_state); -+ /* -+ * akpm: journal_write_metadata_buffer() sets -+ * new_bh->b_transaction to commit_transaction. 
-+ * We need to clean this up before we release new_bh -+ * (which is of type BJ_IO) -+ */ -+ JBUFFER_TRACE(jh, "ph3: write metadata"); -+ flags = journal_write_metadata_buffer(commit_transaction, -+ jh, &new_jh, blocknr); -+ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); -+ set_bit(BH_Lock, &jh2bh(new_jh)->b_state); -+ wbuf[bufs++] = jh2bh(new_jh); -+ -+ /* Record the new block's tag in the current descriptor -+ buffer */ -+ -+ tag_flag = 0; -+ if (flags & 1) -+ tag_flag |= JFS_FLAG_ESCAPE; -+ if (!first_tag) -+ tag_flag |= JFS_FLAG_SAME_UUID; -+ -+ tag = (journal_block_tag_t *) tagp; -+ tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr); -+ tag->t_flags = htonl(tag_flag); -+ tagp += sizeof(journal_block_tag_t); -+ space_left -= sizeof(journal_block_tag_t); -+ -+ if (first_tag) { -+ memcpy (tagp, journal->j_uuid, 16); -+ tagp += 16; -+ space_left -= 16; -+ first_tag = 0; -+ } -+ -+ /* If there's no more to do, or if the descriptor is full, -+ let the IO rip! */ -+ -+ if (bufs == ARRAY_SIZE(wbuf) || -+ commit_transaction->t_buffers == NULL || -+ space_left < sizeof(journal_block_tag_t) + 16) { -+ -+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs); -+ -+ /* Write an end-of-descriptor marker before -+ submitting the IOs. "tag" still points to -+ the last tag we set up. */ -+ -+ tag->t_flags |= htonl(JFS_FLAG_LAST_TAG); -+ -+start_journal_io: -+ unlock_journal(journal); -+ for (i=0; ib_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ } -+ if (current->need_resched) -+ schedule(); -+ lock_journal(journal); -+ -+ /* Force a new descriptor to be generated next -+ time round the loop. */ -+ descriptor = NULL; -+ bufs = 0; -+ } -+ } -+ -+ /* Lo and behold: we have just managed to send a transaction to -+ the log. Before we can commit it, wait for the IO so far to -+ complete. Control buffers being written are on the -+ transaction's t_log_list queue, and metadata buffers are on -+ the t_iobuf_list queue. -+ -+ Wait for the transactions in reverse order. 
That way we are -+ less likely to be woken up until all IOs have completed, and -+ so we incur less scheduling load. -+ */ -+ -+ jbd_debug(3, "JBD: commit phase 4\n"); -+ -+ /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */ -+ wait_for_iobuf: -+ while (commit_transaction->t_iobuf_list != NULL) { -+ struct buffer_head *bh; -+ jh = commit_transaction->t_iobuf_list->b_tprev; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ goto wait_for_iobuf; -+ } -+ -+ clear_bit(BH_JWrite, &jh2bh(jh)->b_state); -+ -+ JBUFFER_TRACE(jh, "ph4: unfile after journal write"); -+ journal_unfile_buffer(jh); -+ -+ /* -+ * akpm: don't put back a buffer_head with stale pointers -+ * dangling around. -+ */ -+ J_ASSERT_JH(jh, jh->b_transaction != NULL); -+ jh->b_transaction = NULL; -+ -+ /* -+ * ->t_iobuf_list should contain only dummy buffer_heads -+ * which were created by journal_write_metadata_buffer(). -+ */ -+ bh = jh2bh(jh); -+ BUFFER_TRACE(bh, "dumping temporary bh"); -+ journal_unlock_journal_head(jh); -+ __brelse(bh); -+ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); -+ put_unused_buffer_head(bh); -+ -+ /* We also have to unlock and free the corresponding -+ shadowed buffer */ -+ jh = commit_transaction->t_shadow_list->b_tprev; -+ bh = jh2bh(jh); -+ clear_bit(BH_JWrite, &bh->b_state); -+ J_ASSERT_BH(bh, buffer_jdirty(bh)); -+ -+ /* The metadata is now released for reuse, but we need -+ to remember it against this transaction so that when -+ we finally commit, we can do any checkpointing -+ required. 
*/ -+ JBUFFER_TRACE(jh, "file as BJ_Forget"); -+ journal_file_buffer(jh, commit_transaction, BJ_Forget); -+ /* Wake up any transactions which were waiting for this -+ IO to complete */ -+ wake_up(&bh->b_wait); -+ JBUFFER_TRACE(jh, "brelse shadowed buffer"); -+ __brelse(bh); -+ } -+ -+ J_ASSERT (commit_transaction->t_shadow_list == NULL); -+ -+ jbd_debug(3, "JBD: commit phase 5\n"); -+ -+ /* Here we wait for the revoke record and descriptor record buffers */ -+ wait_for_ctlbuf: -+ while (commit_transaction->t_log_list != NULL) { -+ struct buffer_head *bh; -+ -+ jh = commit_transaction->t_log_list->b_tprev; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ goto wait_for_ctlbuf; -+ } -+ -+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); -+ clear_bit(BH_JWrite, &bh->b_state); -+ journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ journal_unlock_journal_head(jh); -+ put_bh(bh); /* One for getblk */ -+ } -+ -+ jbd_debug(3, "JBD: commit phase 6\n"); -+ -+ if (is_journal_aborted(journal)) -+ goto skip_commit; -+ -+ /* Done it all: now write the commit record. We should have -+ * cleaned up our previous buffers by now, so if we are in abort -+ * mode we can now just skip the rest of the journal write -+ * entirely. */ -+ -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) { -+ __journal_abort_hard(journal); -+ goto skip_commit; -+ } -+ -+ /* AKPM: buglet - add `i' to tmp! 
*/ -+ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { -+ journal_header_t *tmp = -+ (journal_header_t*)jh2bh(descriptor)->b_data; -+ tmp->h_magic = htonl(JFS_MAGIC_NUMBER); -+ tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK); -+ tmp->h_sequence = htonl(commit_transaction->t_tid); -+ } -+ -+ unlock_journal(journal); -+ JBUFFER_TRACE(descriptor, "write commit block"); -+ { -+ struct buffer_head *bh = jh2bh(descriptor); -+ clear_bit(BH_Dirty, &bh->b_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ wait_on_buffer(bh); -+ put_bh(bh); /* One for getblk() */ -+ journal_unlock_journal_head(descriptor); -+ } -+ lock_journal(journal); -+ -+ /* End of a transaction! Finally, we can do checkpoint -+ processing: any buffers committed as a result of this -+ transaction can be removed from any checkpoint list it was on -+ before. */ -+ -+skip_commit: -+ -+ jbd_debug(3, "JBD: commit phase 7\n"); -+ -+ J_ASSERT(commit_transaction->t_sync_datalist == NULL); -+ J_ASSERT(commit_transaction->t_async_datalist == NULL); -+ J_ASSERT(commit_transaction->t_buffers == NULL); -+ J_ASSERT(commit_transaction->t_checkpoint_list == NULL); -+ J_ASSERT(commit_transaction->t_iobuf_list == NULL); -+ J_ASSERT(commit_transaction->t_shadow_list == NULL); -+ J_ASSERT(commit_transaction->t_log_list == NULL); -+ -+ while (commit_transaction->t_forget) { -+ transaction_t *cp_transaction; -+ struct buffer_head *bh; -+ -+ jh = commit_transaction->t_forget; -+ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || -+ jh->b_transaction == journal->j_running_transaction); -+ -+ /* -+ * If there is undo-protected committed data against -+ * this buffer, then we can remove it now. If it is a -+ * buffer needing such protection, the old frozen_data -+ * field now points to a committed version of the -+ * buffer, so rotate that field to the new committed -+ * data. -+ * -+ * Otherwise, we can just throw away the frozen data now. 
-+ */ -+ if (jh->b_committed_data) { -+ kfree(jh->b_committed_data); -+ jh->b_committed_data = NULL; -+ if (jh->b_frozen_data) { -+ jh->b_committed_data = jh->b_frozen_data; -+ jh->b_frozen_data = NULL; -+ } -+ } else if (jh->b_frozen_data) { -+ kfree(jh->b_frozen_data); -+ jh->b_frozen_data = NULL; -+ } -+ -+ spin_lock(&journal_datalist_lock); -+ cp_transaction = jh->b_cp_transaction; -+ if (cp_transaction) { -+ JBUFFER_TRACE(jh, "remove from old cp transaction"); -+ J_ASSERT_JH(jh, commit_transaction != cp_transaction); -+ __journal_remove_checkpoint(jh); -+ } -+ -+ /* Only re-checkpoint the buffer_head if it is marked -+ * dirty. If the buffer was added to the BJ_Forget list -+ * by journal_forget, it may no longer be dirty and -+ * there's no point in keeping a checkpoint record for -+ * it. */ -+ bh = jh2bh(jh); -+ if (buffer_jdirty(bh)) { -+ JBUFFER_TRACE(jh, "add to new checkpointing trans"); -+ __journal_insert_checkpoint(jh, commit_transaction); -+ JBUFFER_TRACE(jh, "refile for checkpoint writeback"); -+ __journal_refile_buffer(jh); -+ } else { -+ J_ASSERT_BH(bh, !buffer_dirty(bh)); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ } -+ -+ /* Done with this transaction! 
*/ -+ -+ jbd_debug(3, "JBD: commit phase 8\n"); -+ -+ J_ASSERT (commit_transaction->t_state == T_COMMIT); -+ commit_transaction->t_state = T_FINISHED; -+ -+ J_ASSERT (commit_transaction == journal->j_committing_transaction); -+ journal->j_commit_sequence = commit_transaction->t_tid; -+ journal->j_committing_transaction = NULL; -+ -+ spin_lock(&journal_datalist_lock); -+ if (commit_transaction->t_checkpoint_list == NULL) { -+ __journal_drop_transaction(journal, commit_transaction); -+ } else { -+ if (journal->j_checkpoint_transactions == NULL) { -+ journal->j_checkpoint_transactions = commit_transaction; -+ commit_transaction->t_cpnext = commit_transaction; -+ commit_transaction->t_cpprev = commit_transaction; -+ } else { -+ commit_transaction->t_cpnext = -+ journal->j_checkpoint_transactions; -+ commit_transaction->t_cpprev = -+ commit_transaction->t_cpnext->t_cpprev; -+ commit_transaction->t_cpnext->t_cpprev = -+ commit_transaction; -+ commit_transaction->t_cpprev->t_cpnext = -+ commit_transaction; -+ } -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ jbd_debug(1, "JBD: commit %d complete, head %d\n", -+ journal->j_commit_sequence, journal->j_tail_sequence); -+ -+ unlock_journal(journal); -+ wake_up(&journal->j_wait_done_commit); -+} -diff -ruP linux.mcp2/fs/jbd/journal.c linuxppc_2.4.19_final/fs/jbd/journal.c ---- linux.mcp2/fs/jbd/journal.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/journal.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,1877 @@ -+/* -+ * linux/fs/journal.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Generic filesystem journal-writing code; part of the ext2fs -+ * journaling system. 
-+ * -+ * This file manages journals: areas of disk reserved for logging -+ * transactional updates. This includes the kernel journaling thread -+ * which is responsible for scheduling updates to the log. -+ * -+ * We do not actually manage the physical storage of the journal in this -+ * file: that is left to a per-journal policy function, which allows us -+ * to store the journal within a filesystem-specified area for ext2 -+ * journaling (ext2 can use a reserved inode for storing the log). -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(journal_start); -+EXPORT_SYMBOL(journal_try_start); -+EXPORT_SYMBOL(journal_restart); -+EXPORT_SYMBOL(journal_extend); -+EXPORT_SYMBOL(journal_stop); -+EXPORT_SYMBOL(journal_lock_updates); -+EXPORT_SYMBOL(journal_unlock_updates); -+EXPORT_SYMBOL(journal_get_write_access); -+EXPORT_SYMBOL(journal_get_create_access); -+EXPORT_SYMBOL(journal_get_undo_access); -+EXPORT_SYMBOL(journal_dirty_data); -+EXPORT_SYMBOL(journal_dirty_metadata); -+#if 0 -+EXPORT_SYMBOL(journal_release_buffer); -+#endif -+EXPORT_SYMBOL(journal_forget); -+#if 0 -+EXPORT_SYMBOL(journal_sync_buffer); -+#endif -+EXPORT_SYMBOL(journal_flush); -+EXPORT_SYMBOL(journal_revoke); -+ -+EXPORT_SYMBOL(journal_init_dev); -+EXPORT_SYMBOL(journal_init_inode); -+EXPORT_SYMBOL(journal_update_format); -+EXPORT_SYMBOL(journal_check_used_features); -+EXPORT_SYMBOL(journal_check_available_features); -+EXPORT_SYMBOL(journal_set_features); -+EXPORT_SYMBOL(journal_create); -+EXPORT_SYMBOL(journal_load); -+EXPORT_SYMBOL(journal_destroy); -+EXPORT_SYMBOL(journal_recover); -+EXPORT_SYMBOL(journal_update_superblock); -+EXPORT_SYMBOL(journal_abort); -+EXPORT_SYMBOL(journal_errno); -+EXPORT_SYMBOL(journal_ack_err); -+EXPORT_SYMBOL(journal_clear_err); -+EXPORT_SYMBOL(log_wait_commit); -+EXPORT_SYMBOL(log_start_commit); -+EXPORT_SYMBOL(journal_wipe); 
-+EXPORT_SYMBOL(journal_blocks_per_page); -+EXPORT_SYMBOL(journal_flushpage); -+EXPORT_SYMBOL(journal_try_to_free_buffers); -+EXPORT_SYMBOL(journal_bmap); -+EXPORT_SYMBOL(journal_force_commit); -+ -+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); -+ -+/* -+ * journal_datalist_lock is used to protect data buffers: -+ * -+ * bh->b_transaction -+ * bh->b_tprev -+ * bh->b_tnext -+ * -+ * journal_free_buffer() is called from journal_try_to_free_buffer(), and is -+ * async wrt everything else. -+ * -+ * It is also used for checkpoint data, also to protect against -+ * journal_try_to_free_buffer(): -+ * -+ * bh->b_cp_transaction -+ * bh->b_cpnext -+ * bh->b_cpprev -+ * transaction->t_checkpoint_list -+ * transaction->t_cpnext -+ * transaction->t_cpprev -+ * journal->j_checkpoint_transactions -+ * -+ * It is global at this time rather than per-journal because it's -+ * impossible for __journal_free_buffer to go from a buffer_head -+ * back to a journal_t unracily (well, not true. Fix later) -+ * -+ * -+ * The `datalist' and `checkpoint list' functions are quite -+ * separate and we could use two spinlocks here. -+ * -+ * lru_list_lock nests inside journal_datalist_lock. -+ */ -+spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED; -+ -+/* -+ * jh_splice_lock needs explantion. -+ * -+ * In a number of places we want to do things like: -+ * -+ * if (buffer_jbd(bh) && bh2jh(bh)->foo) -+ * -+ * This is racy on SMP, because another CPU could remove the journal_head -+ * in the middle of this expression. We need locking. -+ * -+ * But we can greatly optimise the locking cost by testing BH_JBD -+ * outside the lock. So, effectively: -+ * -+ * ret = 0; -+ * if (buffer_jbd(bh)) { -+ * spin_lock(&jh_splice_lock); -+ * if (buffer_jbd(bh)) { (* Still there? *) -+ * ret = bh2jh(bh)->foo; -+ * } -+ * spin_unlock(&jh_splice_lock); -+ * } -+ * return ret; -+ * -+ * Now, that protects us from races where another CPU can remove the -+ * journal_head. 
But it doesn't defend us from the situation where another -+ * CPU can *add* a journal_head. This is a correctness issue. But it's not -+ * a problem because a) the calling code was *already* racy and b) it often -+ * can't happen at the call site and c) the places where we add journal_heads -+ * tend to be under external locking. -+ */ -+spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED; -+ -+/* -+ * List of all journals in the system. Protected by the BKL. -+ */ -+static LIST_HEAD(all_journals); -+ -+/* -+ * Helper function used to manage commit timeouts -+ */ -+ -+static void commit_timeout(unsigned long __data) -+{ -+ struct task_struct * p = (struct task_struct *) __data; -+ -+ wake_up_process(p); -+} -+ -+/* Static check for data structure consistency. There's no code -+ * invoked --- we'll just get a linker failure if things aren't right. -+ */ -+void __journal_internal_check(void) -+{ -+ extern void journal_bad_superblock_size(void); -+ if (sizeof(struct journal_superblock_s) != 1024) -+ journal_bad_superblock_size(); -+} -+ -+/* -+ * kjournald: The main thread function used to manage a logging device -+ * journal. -+ * -+ * This kernel thread is responsible for two things: -+ * -+ * 1) COMMIT: Every so often we need to commit the current state of the -+ * filesystem to disk. The journal thread is responsible for writing -+ * all of the metadata buffers to disk. -+ * -+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all -+ * of the data in that part of the log has been rewritten elsewhere on -+ * the disk. Flushing these old buffers to reclaim space in the log is -+ * known as checkpointing, and this thread is responsible for that job. 
-+ */ -+ -+journal_t *current_journal; // AKPM: debug -+ -+int kjournald(void *arg) -+{ -+ journal_t *journal = (journal_t *) arg; -+ transaction_t *transaction; -+ struct timer_list timer; -+ -+ current_journal = journal; -+ -+ lock_kernel(); -+ daemonize(); -+ reparent_to_init(); -+ spin_lock_irq(¤t->sigmask_lock); -+ sigfillset(¤t->blocked); -+ recalc_sigpending(current); -+ spin_unlock_irq(¤t->sigmask_lock); -+ -+ sprintf(current->comm, "kjournald"); -+ -+ /* Set up an interval timer which can be used to trigger a -+ commit wakeup after the commit interval expires */ -+ init_timer(&timer); -+ timer.data = (unsigned long) current; -+ timer.function = commit_timeout; -+ journal->j_commit_timer = &timer; -+ -+ /* Record that the journal thread is running */ -+ journal->j_task = current; -+ wake_up(&journal->j_wait_done_commit); -+ -+ printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", -+ journal->j_commit_interval / HZ); -+ list_add(&journal->j_all_journals, &all_journals); -+ -+ /* And now, wait forever for commit wakeup events. */ -+ while (1) { -+ if (journal->j_flags & JFS_UNMOUNT) -+ break; -+ -+ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", -+ journal->j_commit_sequence, journal->j_commit_request); -+ -+ if (journal->j_commit_sequence != journal->j_commit_request) { -+ jbd_debug(1, "OK, requests differ\n"); -+ if (journal->j_commit_timer_active) { -+ journal->j_commit_timer_active = 0; -+ del_timer(journal->j_commit_timer); -+ } -+ -+ journal_commit_transaction(journal); -+ continue; -+ } -+ -+ wake_up(&journal->j_wait_done_commit); -+ interruptible_sleep_on(&journal->j_wait_commit); -+ -+ jbd_debug(1, "kjournald wakes\n"); -+ -+ /* Were we woken up by a commit wakeup event? 
*/ -+ if ((transaction = journal->j_running_transaction) != NULL && -+ time_after_eq(jiffies, transaction->t_expires)) { -+ journal->j_commit_request = transaction->t_tid; -+ jbd_debug(1, "woke because of timeout\n"); -+ } -+ } -+ -+ if (journal->j_commit_timer_active) { -+ journal->j_commit_timer_active = 0; -+ del_timer_sync(journal->j_commit_timer); -+ } -+ -+ list_del(&journal->j_all_journals); -+ -+ journal->j_task = NULL; -+ wake_up(&journal->j_wait_done_commit); -+ unlock_kernel(); -+ jbd_debug(1, "Journal thread exiting.\n"); -+ return 0; -+} -+ -+static void journal_start_thread(journal_t *journal) -+{ -+ kernel_thread(kjournald, (void *) journal, -+ CLONE_VM | CLONE_FS | CLONE_FILES); -+ while (!journal->j_task) -+ sleep_on(&journal->j_wait_done_commit); -+} -+ -+static void journal_kill_thread(journal_t *journal) -+{ -+ journal->j_flags |= JFS_UNMOUNT; -+ -+ while (journal->j_task) { -+ wake_up(&journal->j_wait_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+} -+ -+#if 0 -+ -+This is no longer needed - we do it in commit quite efficiently. -+Note that if this function is resurrected, the loop needs to -+be reorganised into the next_jh/last_jh algorithm. -+ -+/* -+ * journal_clean_data_list: cleanup after data IO. -+ * -+ * Once the IO system has finished writing the buffers on the transaction's -+ * data list, we can remove those buffers from the list. This function -+ * scans the list for such buffers and removes them cleanly. -+ * -+ * We assume that the journal is already locked. -+ * We are called with journal_datalist_lock held. -+ * -+ * AKPM: This function looks inefficient. Approximately O(n^2) -+ * for potentially thousands of buffers. It no longer shows on profiles -+ * because these buffers are mainly dropped in journal_commit_transaction(). 
-+ */ -+ -+void __journal_clean_data_list(transaction_t *transaction) -+{ -+ struct journal_head *jh, *next; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+restart: -+ jh = transaction->t_sync_datalist; -+ if (!jh) -+ goto out; -+ do { -+ next = jh->b_tnext; -+ if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) { -+ struct buffer_head *bh = jh2bh(jh); -+ BUFFER_TRACE(bh, "data writeout complete: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ goto restart; -+ } -+ jh = next; -+ } while (transaction->t_sync_datalist && -+ jh != transaction->t_sync_datalist); -+out: -+ return; -+} -+#endif -+ -+/* -+ * journal_write_metadata_buffer: write a metadata buffer to the journal. -+ * -+ * Writes a metadata buffer to a given disk block. The actual IO is not -+ * performed but a new buffer_head is constructed which labels the data -+ * to be written with the correct destination disk block. -+ * -+ * Any magic-number escaping which needs to be done will cause a -+ * copy-out here. If the buffer happens to start with the -+ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the -+ * magic number is only written to the log for descripter blocks. In -+ * this case, we copy the data and replace the first word with 0, and we -+ * return a result code which indicates that this buffer needs to be -+ * marked as an escaped buffer in the corresponding log descriptor -+ * block. The missing word can then be restored when the block is read -+ * during recovery. -+ * -+ * If the source buffer has already been modified by a new transaction -+ * since we took the last commit snapshot, we use the frozen copy of -+ * that data for IO. If we end up using the existing buffer_head's data -+ * for the write, then we *have* to lock the buffer to prevent anyone -+ * else from using and possibly modifying it while the IO is in -+ * progress. 
-+ * -+ * The function returns a pointer to the buffer_heads to be used for IO. -+ * -+ * We assume that the journal has already been locked in this function. -+ * -+ * Return value: -+ * <0: Error -+ * >=0: Finished OK -+ * -+ * On success: -+ * Bit 0 set == escape performed on the data -+ * Bit 1 set == buffer copy-out performed (kfree the data after IO) -+ */ -+ -+static inline unsigned long virt_to_offset(void *p) -+{return ((unsigned long) p) & ~PAGE_MASK;} -+ -+int journal_write_metadata_buffer(transaction_t *transaction, -+ struct journal_head *jh_in, -+ struct journal_head **jh_out, -+ int blocknr) -+{ -+ int need_copy_out = 0; -+ int done_copy_out = 0; -+ int do_escape = 0; -+ char *mapped_data; -+ struct buffer_head *new_bh; -+ struct journal_head * new_jh; -+ struct page *new_page; -+ unsigned int new_offset; -+ -+ /* -+ * The buffer really shouldn't be locked: only the current committing -+ * transaction is allowed to write it, so nobody else is allowed -+ * to do any IO. -+ * -+ * akpm: except if we're journalling data, and write() output is -+ * also part of a shared mapping, and another thread has -+ * decided to launch a writepage() against this buffer. -+ */ -+ J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in))); -+ -+ /* -+ * If a new transaction has already done a buffer copy-out, then -+ * we use that version of the data for the commit. -+ */ -+ -+ if (jh_in->b_frozen_data) { -+ done_copy_out = 1; -+ new_page = virt_to_page(jh_in->b_frozen_data); -+ new_offset = virt_to_offset(jh_in->b_frozen_data); -+ } else { -+ new_page = jh2bh(jh_in)->b_page; -+ new_offset = virt_to_offset(jh2bh(jh_in)->b_data); -+ } -+ -+ mapped_data = ((char *) kmap(new_page)) + new_offset; -+ -+ /* -+ * Check for escaping -+ */ -+ if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) { -+ need_copy_out = 1; -+ do_escape = 1; -+ } -+ -+ /* -+ * Do we need to do a data copy? 
-+ */ -+ -+ if (need_copy_out && !done_copy_out) { -+ char *tmp; -+ tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS); -+ -+ jh_in->b_frozen_data = tmp; -+ memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size); -+ -+ /* If we get to this path, we'll always need the new -+ address kmapped so that we can clear the escaped -+ magic number below. */ -+ kunmap(new_page); -+ new_page = virt_to_page(tmp); -+ new_offset = virt_to_offset(tmp); -+ mapped_data = ((char *) kmap(new_page)) + new_offset; -+ -+ done_copy_out = 1; -+ } -+ -+ /* -+ * Right, time to make up the new buffer_head. -+ */ -+ do { -+ new_bh = get_unused_buffer_head(0); -+ if (!new_bh) { -+ printk (KERN_NOTICE __FUNCTION__ -+ ": ENOMEM at get_unused_buffer_head, " -+ "trying again.\n"); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } -+ } while (!new_bh); -+ /* keep subsequent assertions sane */ -+ new_bh->b_prev_free = 0; -+ new_bh->b_next_free = 0; -+ new_bh->b_state = 0; -+ init_buffer(new_bh, NULL, NULL); -+ atomic_set(&new_bh->b_count, 1); -+ new_jh = journal_add_journal_head(new_bh); -+ -+ set_bh_page(new_bh, new_page, new_offset); -+ -+ new_jh->b_transaction = NULL; -+ new_bh->b_size = jh2bh(jh_in)->b_size; -+ new_bh->b_dev = transaction->t_journal->j_dev; -+ new_bh->b_blocknr = blocknr; -+ new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty); -+ -+ *jh_out = new_jh; -+ -+ /* -+ * Did we need to do an escaping? Now we've done all the -+ * copying, we can finally do so. -+ */ -+ -+ if (do_escape) -+ * ((unsigned int *) mapped_data) = 0; -+ kunmap(new_page); -+ -+ /* -+ * The to-be-written buffer needs to get moved to the io queue, -+ * and the original buffer whose contents we are shadowing or -+ * copying is moved to the transaction's shadow queue. 
-+ */ -+ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); -+ journal_file_buffer(jh_in, transaction, BJ_Shadow); -+ JBUFFER_TRACE(new_jh, "file as BJ_IO"); -+ journal_file_buffer(new_jh, transaction, BJ_IO); -+ -+ return do_escape | (done_copy_out << 1); -+} -+ -+/* -+ * Allocation code for the journal file. Manage the space left in the -+ * journal, so that we can begin checkpointing when appropriate. -+ */ -+ -+/* -+ * log_space_left: Return the number of free blocks left in the journal. -+ * -+ * Called with the journal already locked. -+ */ -+ -+int log_space_left (journal_t *journal) -+{ -+ int left = journal->j_free; -+ -+ /* Be pessimistic here about the number of those free blocks -+ * which might be required for log descriptor control blocks. */ -+ -+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ -+ -+ left -= MIN_LOG_RESERVED_BLOCKS; -+ -+ if (left <= 0) -+ return 0; -+ left -= (left >> 3); -+ return left; -+} -+ -+/* -+ * This function must be non-allocating for PF_MEMALLOC tasks -+ */ -+tid_t log_start_commit (journal_t *journal, transaction_t *transaction) -+{ -+ tid_t target = journal->j_commit_request; -+ -+ lock_kernel(); /* Protect journal->j_running_transaction */ -+ -+ /* -+ * A NULL transaction asks us to commit the currently running -+ * transaction, if there is one. -+ */ -+ if (transaction) -+ target = transaction->t_tid; -+ else { -+ transaction = journal->j_running_transaction; -+ if (!transaction) -+ goto out; -+ target = transaction->t_tid; -+ } -+ -+ /* -+ * Are we already doing a recent enough commit? -+ */ -+ if (tid_geq(journal->j_commit_request, target)) -+ goto out; -+ -+ /* -+ * We want a new commit: OK, mark the request and wakup the -+ * commit thread. We do _not_ do the commit ourselves. 
-+ */ -+ -+ journal->j_commit_request = target; -+ jbd_debug(1, "JBD: requesting commit %d/%d\n", -+ journal->j_commit_request, -+ journal->j_commit_sequence); -+ wake_up(&journal->j_wait_commit); -+ -+out: -+ unlock_kernel(); -+ return target; -+} -+ -+/* -+ * Wait for a specified commit to complete. -+ * The caller may not hold the journal lock. -+ */ -+void log_wait_commit (journal_t *journal, tid_t tid) -+{ -+ lock_kernel(); -+#ifdef CONFIG_JBD_DEBUG -+ lock_journal(journal); -+ if (!tid_geq(journal->j_commit_request, tid)) { -+ printk(KERN_EMERG __FUNCTION__ -+ ": error: j_commit_request=%d, tid=%d\n", -+ journal->j_commit_request, tid); -+ } -+ unlock_journal(journal); -+#endif -+ while (tid_gt(tid, journal->j_commit_sequence)) { -+ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", -+ tid, journal->j_commit_sequence); -+ wake_up(&journal->j_wait_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Log buffer allocation routines: -+ */ -+ -+int journal_next_log_block(journal_t *journal, unsigned long *retp) -+{ -+ unsigned long blocknr; -+ -+ J_ASSERT(journal->j_free > 1); -+ -+ blocknr = journal->j_head; -+ journal->j_head++; -+ journal->j_free--; -+ if (journal->j_head == journal->j_last) -+ journal->j_head = journal->j_first; -+ return journal_bmap(journal, blocknr, retp); -+} -+ -+/* -+ * Conversion of logical to physical block numbers for the journal -+ * -+ * On external journals the journal blocks are identity-mapped, so -+ * this is a no-op. If needed, we can use j_blk_offset - everything is -+ * ready. 
-+ */ -+int journal_bmap(journal_t *journal, unsigned long blocknr, -+ unsigned long *retp) -+{ -+ int err = 0; -+ unsigned long ret; -+ -+ if (journal->j_inode) { -+ ret = bmap(journal->j_inode, blocknr); -+ if (ret) -+ *retp = ret; -+ else { -+ printk (KERN_ALERT __FUNCTION__ -+ ": journal block not found " -+ "at offset %lu on %s\n", -+ blocknr, bdevname(journal->j_dev)); -+ err = -EIO; -+ __journal_abort_soft(journal, err); -+ } -+ } else { -+ *retp = blocknr; /* +journal->j_blk_offset */ -+ } -+ return err; -+} -+ -+/* -+ * We play buffer_head aliasing tricks to write data/metadata blocks to -+ * the journal without copying their contents, but for journal -+ * descriptor blocks we do need to generate bona fide buffers. -+ * -+ * We return a jh whose bh is locked and ready to be populated. -+ */ -+ -+struct journal_head * journal_get_descriptor_buffer(journal_t *journal) -+{ -+ struct buffer_head *bh; -+ unsigned long blocknr; -+ int err; -+ -+ err = journal_next_log_block(journal, &blocknr); -+ -+ if (err) -+ return NULL; -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "return this buffer"); -+ return journal_add_journal_head(bh); -+} -+ -+/* -+ * Management for journal control blocks: functions to create and -+ * destroy journal_t structures, and to initialise and read existing -+ * journal blocks from disk. */ -+ -+/* First: create and setup a journal_t object in memory. We initialise -+ * very few fields yet: that has to wait until we have created the -+ * journal structures from from scratch, or loaded them from disk. 
*/ -+ -+static journal_t * journal_init_common (void) -+{ -+ journal_t *journal; -+ int err; -+ -+ MOD_INC_USE_COUNT; -+ -+ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); -+ if (!journal) -+ goto fail; -+ memset(journal, 0, sizeof(*journal)); -+ -+ init_waitqueue_head(&journal->j_wait_transaction_locked); -+ init_waitqueue_head(&journal->j_wait_logspace); -+ init_waitqueue_head(&journal->j_wait_done_commit); -+ init_waitqueue_head(&journal->j_wait_checkpoint); -+ init_waitqueue_head(&journal->j_wait_commit); -+ init_waitqueue_head(&journal->j_wait_updates); -+ init_MUTEX(&journal->j_barrier); -+ init_MUTEX(&journal->j_checkpoint_sem); -+ init_MUTEX(&journal->j_sem); -+ -+ journal->j_commit_interval = (HZ * 5); -+ -+ /* The journal is marked for error until we succeed with recovery! */ -+ journal->j_flags = JFS_ABORT; -+ -+ /* Set up a default-sized revoke table for the new mount. */ -+ err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); -+ if (err) { -+ kfree(journal); -+ goto fail; -+ } -+ return journal; -+fail: -+ MOD_DEC_USE_COUNT; -+ return NULL; -+} -+ -+/* journal_init_dev and journal_init_inode: -+ * -+ * Create a journal structure assigned some fixed set of disk blocks to -+ * the journal. We don't actually touch those disk blocks yet, but we -+ * need to set up all of the mapping information to tell the journaling -+ * system where the journal blocks are. -+ * -+ * journal_init_dev creates a journal which maps a fixed contiguous -+ * range of blocks on an arbitrary block device. -+ * -+ * journal_init_inode creates a journal which maps an on-disk inode as -+ * the journal. The inode must exist already, must support bmap() and -+ * must have all data blocks preallocated. 
-+ */ -+ -+journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev, -+ int start, int len, int blocksize) -+{ -+ journal_t *journal = journal_init_common(); -+ struct buffer_head *bh; -+ -+ if (!journal) -+ return NULL; -+ -+ journal->j_dev = dev; -+ journal->j_fs_dev = fs_dev; -+ journal->j_blk_offset = start; -+ journal->j_maxlen = len; -+ journal->j_blocksize = blocksize; -+ -+ bh = getblk(journal->j_dev, start, journal->j_blocksize); -+ J_ASSERT(bh != NULL); -+ journal->j_sb_buffer = bh; -+ journal->j_superblock = (journal_superblock_t *)bh->b_data; -+ -+ return journal; -+} -+ -+journal_t * journal_init_inode (struct inode *inode) -+{ -+ struct buffer_head *bh; -+ journal_t *journal = journal_init_common(); -+ int err; -+ unsigned long blocknr; -+ -+ if (!journal) -+ return NULL; -+ -+ journal->j_dev = inode->i_dev; -+ journal->j_fs_dev = inode->i_dev; -+ journal->j_inode = inode; -+ jbd_debug(1, -+ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", -+ journal, bdevname(inode->i_dev), inode->i_ino, -+ (long long) inode->i_size, -+ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); -+ -+ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ journal->j_blocksize = inode->i_sb->s_blocksize; -+ -+ err = journal_bmap(journal, 0, &blocknr); -+ /* If that failed, give up */ -+ if (err) { -+ printk(KERN_ERR __FUNCTION__ ": Cannnot locate journal " -+ "superblock\n"); -+ kfree(journal); -+ return NULL; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ J_ASSERT(bh != NULL); -+ journal->j_sb_buffer = bh; -+ journal->j_superblock = (journal_superblock_t *)bh->b_data; -+ -+ return journal; -+} -+ -+/* -+ * If the journal init or create aborts, we need to mark the journal -+ * superblock as being NULL to prevent the journal destroy from writing -+ * back a bogus superblock. 
-+ */ -+static void journal_fail_superblock (journal_t *journal) -+{ -+ struct buffer_head *bh = journal->j_sb_buffer; -+ brelse(bh); -+ journal->j_sb_buffer = NULL; -+} -+ -+/* -+ * Given a journal_t structure, initialise the various fields for -+ * startup of a new journaling session. We use this both when creating -+ * a journal, and after recovering an old journal to reset it for -+ * subsequent use. -+ */ -+ -+static int journal_reset (journal_t *journal) -+{ -+ journal_superblock_t *sb = journal->j_superblock; -+ unsigned int first, last; -+ -+ first = ntohl(sb->s_first); -+ last = ntohl(sb->s_maxlen); -+ -+ journal->j_first = first; -+ journal->j_last = last; -+ -+ journal->j_head = first; -+ journal->j_tail = first; -+ journal->j_free = last - first; -+ -+ journal->j_tail_sequence = journal->j_transaction_sequence; -+ journal->j_commit_sequence = journal->j_transaction_sequence - 1; -+ journal->j_commit_request = journal->j_commit_sequence; -+ -+ journal->j_max_transaction_buffers = journal->j_maxlen / 4; -+ -+ /* Add the dynamic fields and write it to disk. */ -+ journal_update_superblock(journal, 1); -+ -+ lock_journal(journal); -+ journal_start_thread(journal); -+ unlock_journal(journal); -+ -+ return 0; -+} -+ -+/* -+ * Given a journal_t structure which tells us which disk blocks we can -+ * use, create a new journal superblock and initialise all of the -+ * journal fields from scratch. */ -+ -+int journal_create (journal_t *journal) -+{ -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ journal_superblock_t *sb; -+ int i, err; -+ -+ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { -+ printk (KERN_ERR "Journal length (%d blocks) too short.\n", -+ journal->j_maxlen); -+ journal_fail_superblock(journal); -+ return -EINVAL; -+ } -+ -+ if (journal->j_inode == NULL) { -+ /* -+ * We don't know what block to start at! 
-+ */ -+ printk(KERN_EMERG __FUNCTION__ -+ ": creation of journal on external device!\n"); -+ BUG(); -+ } -+ -+ /* Zero out the entire journal on disk. We cannot afford to -+ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ -+ jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); -+ for (i = 0; i < journal->j_maxlen; i++) { -+ err = journal_bmap(journal, i, &blocknr); -+ if (err) -+ return err; -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ wait_on_buffer(bh); -+ memset (bh->b_data, 0, journal->j_blocksize); -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ BUFFER_TRACE(bh, "marking uptodate"); -+ mark_buffer_uptodate(bh, 1); -+ __brelse(bh); -+ } -+ -+ sync_dev(journal->j_dev); -+ jbd_debug(1, "JBD: journal cleared.\n"); -+ -+ /* OK, fill in the initial static fields in the new superblock */ -+ sb = journal->j_superblock; -+ -+ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER); -+ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2); -+ -+ sb->s_blocksize = htonl(journal->j_blocksize); -+ sb->s_maxlen = htonl(journal->j_maxlen); -+ sb->s_first = htonl(1); -+ -+ journal->j_transaction_sequence = 1; -+ -+ journal->j_flags &= ~JFS_ABORT; -+ journal->j_format_version = 2; -+ -+ return journal_reset(journal); -+} -+ -+/* -+ * Update a journal's dynamic superblock fields and write it to disk, -+ * optionally waiting for the IO to complete. 
-+*/ -+ -+void journal_update_superblock(journal_t *journal, int wait) -+{ -+ journal_superblock_t *sb = journal->j_superblock; -+ struct buffer_head *bh = journal->j_sb_buffer; -+ -+ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", -+ journal->j_tail, journal->j_tail_sequence, journal->j_errno); -+ -+ sb->s_sequence = htonl(journal->j_tail_sequence); -+ sb->s_start = htonl(journal->j_tail); -+ sb->s_errno = htonl(journal->j_errno); -+ -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ ll_rw_block(WRITE, 1, &bh); -+ if (wait) -+ wait_on_buffer(bh); -+ -+ /* If we have just flushed the log (by marking s_start==0), then -+ * any future commit will have to be careful to update the -+ * superblock again to re-record the true start of the log. */ -+ -+ if (sb->s_start) -+ journal->j_flags &= ~JFS_FLUSHED; -+ else -+ journal->j_flags |= JFS_FLUSHED; -+} -+ -+ -+/* -+ * Read the superblock for a given journal, performing initial -+ * validation of the format. 
-+ */ -+ -+static int journal_get_superblock(journal_t *journal) -+{ -+ struct buffer_head *bh; -+ journal_superblock_t *sb; -+ int err = -EIO; -+ -+ bh = journal->j_sb_buffer; -+ -+ J_ASSERT(bh != NULL); -+ if (!buffer_uptodate(bh)) { -+ ll_rw_block(READ, 1, &bh); -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ printk (KERN_ERR -+ "JBD: IO error reading journal superblock\n"); -+ goto out; -+ } -+ } -+ -+ sb = journal->j_superblock; -+ -+ err = -EINVAL; -+ -+ if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) || -+ sb->s_blocksize != htonl(journal->j_blocksize)) { -+ printk(KERN_WARNING "JBD: no valid journal superblock found\n"); -+ goto out; -+ } -+ -+ switch(ntohl(sb->s_header.h_blocktype)) { -+ case JFS_SUPERBLOCK_V1: -+ journal->j_format_version = 1; -+ break; -+ case JFS_SUPERBLOCK_V2: -+ journal->j_format_version = 2; -+ break; -+ default: -+ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); -+ goto out; -+ } -+ -+ if (ntohl(sb->s_maxlen) < journal->j_maxlen) -+ journal->j_maxlen = ntohl(sb->s_maxlen); -+ else if (ntohl(sb->s_maxlen) > journal->j_maxlen) { -+ printk (KERN_WARNING "JBD: journal file too short\n"); -+ goto out; -+ } -+ -+ return 0; -+ -+out: -+ journal_fail_superblock(journal); -+ return err; -+} -+ -+/* -+ * Load the on-disk journal superblock and read the key fields into the -+ * journal_t. -+ */ -+ -+static int load_superblock(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t *sb; -+ -+ err = journal_get_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ journal->j_tail_sequence = ntohl(sb->s_sequence); -+ journal->j_tail = ntohl(sb->s_start); -+ journal->j_first = ntohl(sb->s_first); -+ journal->j_last = ntohl(sb->s_maxlen); -+ journal->j_errno = ntohl(sb->s_errno); -+ -+ return 0; -+} -+ -+ -+/* -+ * Given a journal_t structure which tells us which disk blocks contain -+ * a journal, read the journal from disk to initialise the in-memory -+ * structures. 
-+ */ -+ -+int journal_load(journal_t *journal) -+{ -+ int err; -+ -+ err = load_superblock(journal); -+ if (err) -+ return err; -+ -+ /* If this is a V2 superblock, then we have to check the -+ * features flags on it. */ -+ -+ if (journal->j_format_version >= 2) { -+ journal_superblock_t *sb = journal->j_superblock; -+ -+ if ((sb->s_feature_ro_compat & -+ ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || -+ (sb->s_feature_incompat & -+ ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { -+ printk (KERN_WARNING -+ "JBD: Unrecognised features on journal\n"); -+ return -EINVAL; -+ } -+ } -+ -+ /* Let the recovery code check whether it needs to recover any -+ * data from the journal. */ -+ if (journal_recover(journal)) -+ goto recovery_error; -+ -+ /* OK, we've finished with the dynamic journal bits: -+ * reinitialise the dynamic contents of the superblock in memory -+ * and reset them on disk. */ -+ if (journal_reset(journal)) -+ goto recovery_error; -+ -+ journal->j_flags &= ~JFS_ABORT; -+ journal->j_flags |= JFS_LOADED; -+ return 0; -+ -+recovery_error: -+ printk (KERN_WARNING "JBD: recovery failed\n"); -+ return -EIO; -+} -+ -+/* -+ * Release a journal_t structure once it is no longer in use by the -+ * journaled object. -+ */ -+ -+void journal_destroy (journal_t *journal) -+{ -+ /* Wait for the commit thread to wake up and die. */ -+ journal_kill_thread(journal); -+ -+ /* Force a final log commit */ -+ if (journal->j_running_transaction) -+ journal_commit_transaction(journal); -+ -+ /* Force any old transactions to disk */ -+ lock_journal(journal); -+ while (journal->j_checkpoint_transactions != NULL) -+ log_do_checkpoint(journal, 1); -+ -+ J_ASSERT(journal->j_running_transaction == NULL); -+ J_ASSERT(journal->j_committing_transaction == NULL); -+ J_ASSERT(journal->j_checkpoint_transactions == NULL); -+ -+ /* We can now mark the journal as empty. 
*/ -+ journal->j_tail = 0; -+ journal->j_tail_sequence = ++journal->j_transaction_sequence; -+ if (journal->j_sb_buffer) { -+ journal_update_superblock(journal, 1); -+ brelse(journal->j_sb_buffer); -+ } -+ -+ if (journal->j_inode) -+ iput(journal->j_inode); -+ if (journal->j_revoke) -+ journal_destroy_revoke(journal); -+ -+ unlock_journal(journal); -+ kfree(journal); -+ MOD_DEC_USE_COUNT; -+} -+ -+ -+/* Published API: Check whether the journal uses all of a given set of -+ * features. Return true (non-zero) if it does. */ -+ -+int journal_check_used_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (!compat && !ro && !incompat) -+ return 1; -+ if (journal->j_format_version == 1) -+ return 0; -+ -+ sb = journal->j_superblock; -+ -+ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && -+ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && -+ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) -+ return 1; -+ -+ return 0; -+} -+ -+/* Published API: Check whether the journaling code supports the use of -+ * all of a given set of features on this journal. Return true -+ * (non-zero) if it can. */ -+ -+int journal_check_available_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (!compat && !ro && !incompat) -+ return 1; -+ -+ sb = journal->j_superblock; -+ -+ /* We can support any known requested features iff the -+ * superblock is in version 2. Otherwise we fail to support any -+ * extended sb features. */ -+ -+ if (journal->j_format_version != 2) -+ return 0; -+ -+ if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && -+ (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && -+ (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) -+ return 1; -+ -+ return 0; -+} -+ -+/* Published API: Mark a given journal feature as present on the -+ * superblock. 
Returns true if the requested features could be set. */ -+ -+int journal_set_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (journal_check_used_features(journal, compat, ro, incompat)) -+ return 1; -+ -+ if (!journal_check_available_features(journal, compat, ro, incompat)) -+ return 0; -+ -+ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", -+ compat, ro, incompat); -+ -+ sb = journal->j_superblock; -+ -+ sb->s_feature_compat |= cpu_to_be32(compat); -+ sb->s_feature_ro_compat |= cpu_to_be32(ro); -+ sb->s_feature_incompat |= cpu_to_be32(incompat); -+ -+ return 1; -+} -+ -+ -+/* -+ * Published API: -+ * Given an initialised but unloaded journal struct, poke about in the -+ * on-disk structure to update it to the most recent supported version. -+ */ -+ -+int journal_update_format (journal_t *journal) -+{ -+ journal_superblock_t *sb; -+ int err; -+ -+ err = journal_get_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ switch (ntohl(sb->s_header.h_blocktype)) { -+ case JFS_SUPERBLOCK_V2: -+ return 0; -+ case JFS_SUPERBLOCK_V1: -+ return journal_convert_superblock_v1(journal, sb); -+ default: -+ break; -+ } -+ return -EINVAL; -+} -+ -+static int journal_convert_superblock_v1(journal_t *journal, -+ journal_superblock_t *sb) -+{ -+ int offset, blocksize; -+ struct buffer_head *bh; -+ -+ printk(KERN_WARNING -+ "JBD: Converting superblock from version 1 to 2.\n"); -+ -+ /* Pre-initialise new fields to zero */ -+ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); -+ blocksize = ntohl(sb->s_blocksize); -+ memset(&sb->s_feature_compat, 0, blocksize-offset); -+ -+ sb->s_nr_users = cpu_to_be32(1); -+ sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); -+ journal->j_format_version = 2; -+ -+ bh = journal->j_sb_buffer; -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ ll_rw_block(WRITE, 1, &bh); -+ 
wait_on_buffer(bh); -+ return 0; -+} -+ -+ -+/* -+ * Flush all data for a given journal to disk and empty the journal. -+ * Filesystems can use this when remounting readonly to ensure that -+ * recovery does not need to happen on remount. -+ */ -+ -+int journal_flush (journal_t *journal) -+{ -+ int err = 0; -+ transaction_t *transaction = NULL; -+ unsigned long old_tail; -+ -+ lock_kernel(); -+ -+ /* Force everything buffered to the log... */ -+ if (journal->j_running_transaction) { -+ transaction = journal->j_running_transaction; -+ log_start_commit(journal, transaction); -+ } else if (journal->j_committing_transaction) -+ transaction = journal->j_committing_transaction; -+ -+ /* Wait for the log commit to complete... */ -+ if (transaction) -+ log_wait_commit(journal, transaction->t_tid); -+ -+ /* ...and flush everything in the log out to disk. */ -+ lock_journal(journal); -+ while (!err && journal->j_checkpoint_transactions != NULL) -+ err = log_do_checkpoint(journal, journal->j_maxlen); -+ cleanup_journal_tail(journal); -+ -+ /* Finally, mark the journal as really needing no recovery. -+ * This sets s_start==0 in the underlying superblock, which is -+ * the magic code for a fully-recovered superblock. Any future -+ * commits of data to the journal will restore the current -+ * s_start value. */ -+ old_tail = journal->j_tail; -+ journal->j_tail = 0; -+ journal_update_superblock(journal, 1); -+ journal->j_tail = old_tail; -+ -+ unlock_journal(journal); -+ -+ J_ASSERT(!journal->j_running_transaction); -+ J_ASSERT(!journal->j_committing_transaction); -+ J_ASSERT(!journal->j_checkpoint_transactions); -+ J_ASSERT(journal->j_head == journal->j_tail); -+ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); -+ -+ unlock_kernel(); -+ -+ return err; -+} -+ -+/* -+ * Wipe out all of the contents of a journal, safely. This will produce -+ * a warning if the journal contains any valid recovery information. 
-+ * Must be called between journal_init_*() and journal_load(). -+ * -+ * If (write) is non-zero, then we wipe out the journal on disk; otherwise -+ * we merely suppress recovery. -+ */ -+ -+int journal_wipe (journal_t *journal, int write) -+{ -+ journal_superblock_t *sb; -+ int err = 0; -+ -+ J_ASSERT (!(journal->j_flags & JFS_LOADED)); -+ -+ err = load_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ if (!journal->j_tail) -+ goto no_recovery; -+ -+ printk (KERN_WARNING "JBD: %s recovery information on journal\n", -+ write ? "Clearing" : "Ignoring"); -+ -+ err = journal_skip_recovery(journal); -+ if (write) -+ journal_update_superblock(journal, 1); -+ -+ no_recovery: -+ return err; -+} -+ -+/* -+ * journal_dev_name: format a character string to describe on what -+ * device this journal is present. -+ */ -+ -+const char * journal_dev_name(journal_t *journal) -+{ -+ kdev_t dev; -+ -+ if (journal->j_inode) -+ dev = journal->j_inode->i_dev; -+ else -+ dev = journal->j_dev; -+ -+ return bdevname(dev); -+} -+ -+/* -+ * journal_abort: perform a complete, immediate shutdown of the ENTIRE -+ * journal (not of a single transaction). This operation cannot be -+ * undone without closing and reopening the journal. -+ * -+ * The journal_abort function is intended to support higher level error -+ * recovery mechanisms such as the ext2/ext3 remount-readonly error -+ * mode. -+ * -+ * Journal abort has very specific semantics. Any existing dirty, -+ * unjournaled buffers in the main filesystem will still be written to -+ * disk by bdflush, but the journaling mechanism will be suspended -+ * immediately and no further transaction commits will be honoured. -+ * -+ * Any dirty, journaled buffers will be written back to disk without -+ * hitting the journal. Atomicity cannot be guaranteed on an aborted -+ * filesystem, but we _do_ attempt to leave as much data as possible -+ * behind for fsck to use for cleanup. 
-+ * -+ * Any attempt to get a new transaction handle on a journal which is in -+ * ABORT state will just result in an -EROFS error return. A -+ * journal_stop on an existing handle will return -EIO if we have -+ * entered abort state during the update. -+ * -+ * Recursive transactions are not disturbed by journal abort until the -+ * final journal_stop, which will receive the -EIO error. -+ * -+ * Finally, the journal_abort call allows the caller to supply an errno -+ * which will be recored (if possible) in the journal superblock. This -+ * allows a client to record failure conditions in the middle of a -+ * transaction without having to complete the transaction to record the -+ * failure to disk. ext3_error, for example, now uses this -+ * functionality. -+ * -+ * Errors which originate from within the journaling layer will NOT -+ * supply an errno; a null errno implies that absolutely no further -+ * writes are done to the journal (unless there are any already in -+ * progress). -+ */ -+ -+/* Quick version for internal journal use (doesn't lock the journal). -+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, -+ * and don't attempt to make any other journal updates. */ -+void __journal_abort_hard (journal_t *journal) -+{ -+ transaction_t *transaction; -+ -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ -+ printk (KERN_ERR "Aborting journal on device %s.\n", -+ journal_dev_name(journal)); -+ -+ journal->j_flags |= JFS_ABORT; -+ transaction = journal->j_running_transaction; -+ if (transaction) -+ log_start_commit(journal, transaction); -+} -+ -+/* Soft abort: record the abort error status in the journal superblock, -+ * but don't do any other IO. 
*/ -+void __journal_abort_soft (journal_t *journal, int errno) -+{ -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ -+ if (!journal->j_errno) -+ journal->j_errno = errno; -+ -+ __journal_abort_hard(journal); -+ -+ if (errno) -+ journal_update_superblock(journal, 1); -+} -+ -+/* Full version for external use */ -+void journal_abort (journal_t *journal, int errno) -+{ -+ lock_journal(journal); -+ __journal_abort_soft(journal, errno); -+ unlock_journal(journal); -+} -+ -+int journal_errno (journal_t *journal) -+{ -+ int err; -+ -+ lock_journal(journal); -+ if (journal->j_flags & JFS_ABORT) -+ err = -EROFS; -+ else -+ err = journal->j_errno; -+ unlock_journal(journal); -+ return err; -+} -+ -+int journal_clear_err (journal_t *journal) -+{ -+ int err = 0; -+ -+ lock_journal(journal); -+ if (journal->j_flags & JFS_ABORT) -+ err = -EROFS; -+ else -+ journal->j_errno = 0; -+ unlock_journal(journal); -+ return err; -+} -+ -+void journal_ack_err (journal_t *journal) -+{ -+ lock_journal(journal); -+ if (journal->j_errno) -+ journal->j_flags |= JFS_ACK_ERR; -+ unlock_journal(journal); -+} -+ -+int journal_blocks_per_page(struct inode *inode) -+{ -+ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -+} -+ -+/* -+ * shrink_journal_memory(). -+ * Called when we're under memory pressure. Free up all the written-back -+ * checkpointed metadata buffers. -+ */ -+void shrink_journal_memory(void) -+{ -+ struct list_head *list; -+ -+ lock_kernel(); -+ list_for_each(list, &all_journals) { -+ journal_t *journal = -+ list_entry(list, journal_t, j_all_journals); -+ spin_lock(&journal_datalist_lock); -+ __journal_clean_checkpoint_list(journal); -+ spin_unlock(&journal_datalist_lock); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Simple support for retying memory allocations. Introduced to help to -+ * debug different VM deadlock avoidance strategies. -+ */ -+/* -+ * Simple support for retying memory allocations. 
Introduced to help to -+ * debug different VM deadlock avoidance strategies. -+ */ -+void * __jbd_kmalloc (char *where, size_t size, int flags, int retry) -+{ -+ void *p; -+ static unsigned long last_warning; -+ -+ while (1) { -+ p = kmalloc(size, flags); -+ if (p) -+ return p; -+ if (!retry) -+ return NULL; -+ /* Log every retry for debugging. Also log them to the -+ * syslog, but do rate-limiting on the non-debugging -+ * messages. */ -+ jbd_debug(1, "ENOMEM in %s, retrying.\n", where); -+ -+ if (time_after(jiffies, last_warning + 5*HZ)) { -+ printk(KERN_NOTICE -+ "ENOMEM in %s, retrying.\n", where); -+ last_warning = jiffies; -+ } -+ -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } -+} -+ -+/* -+ * Journal_head storage management -+ */ -+static kmem_cache_t *journal_head_cache; -+#ifdef CONFIG_JBD_DEBUG -+static atomic_t nr_journal_heads = ATOMIC_INIT(0); -+#endif -+ -+static int journal_init_journal_head_cache(void) -+{ -+ int retval; -+ -+ J_ASSERT(journal_head_cache == 0); -+ journal_head_cache = kmem_cache_create("journal_head", -+ sizeof(struct journal_head), -+ 0, /* offset */ -+ 0, /* flags */ -+ NULL, /* ctor */ -+ NULL); /* dtor */ -+ retval = 0; -+ if (journal_head_cache == 0) { -+ retval = -ENOMEM; -+ printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); -+ } -+ return retval; -+} -+ -+static void journal_destroy_journal_head_cache(void) -+{ -+ J_ASSERT(journal_head_cache != NULL); -+ kmem_cache_destroy(journal_head_cache); -+ journal_head_cache = 0; -+} -+ -+/* -+ * journal_head splicing and dicing -+ */ -+static struct journal_head *journal_alloc_journal_head(void) -+{ -+ struct journal_head *ret; -+ static unsigned long last_warning; -+ -+#ifdef CONFIG_JBD_DEBUG -+ atomic_inc(&nr_journal_heads); -+#endif -+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); -+ if (ret == 0) { -+ jbd_debug(1, "out of memory for journal_head\n"); -+ if (time_after(jiffies, last_warning + 5*HZ)) { -+ printk(KERN_NOTICE "ENOMEM in " __FUNCTION__ -+ ", 
retrying.\n"); -+ last_warning = jiffies; -+ } -+ while (ret == 0) { -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); -+ } -+ } -+ return ret; -+} -+ -+static void journal_free_journal_head(struct journal_head *jh) -+{ -+#ifdef CONFIG_JBD_DEBUG -+ atomic_dec(&nr_journal_heads); -+ memset(jh, 0x5b, sizeof(*jh)); -+#endif -+ kmem_cache_free(journal_head_cache, jh); -+} -+ -+/* -+ * A journal_head is attached to a buffer_head whenever JBD has an -+ * interest in the buffer. -+ * -+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit -+ * is set. This bit is tested in core kernel code where we need to take -+ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable -+ * there. -+ * -+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. -+ * -+ * When a buffer has its BH_JBD bit set it is immune from being released by -+ * core kernel code, mainly via ->b_count. -+ * -+ * A journal_head may be detached from its buffer_head when the journal_head's -+ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. -+ * Various places in JBD call journal_remove_journal_head() to indicate that the -+ * journal_head can be dropped if needed. -+ * -+ * Various places in the kernel want to attach a journal_head to a buffer_head -+ * _before_ attaching the journal_head to a transaction. To protect the -+ * journal_head in this situation, journal_add_journal_head elevates the -+ * journal_head's b_jcount refcount by one. The caller must call -+ * journal_unlock_journal_head() to undo this. -+ * -+ * So the typical usage would be: -+ * -+ * (Attach a journal_head if needed. Increments b_jcount) -+ * struct journal_head *jh = journal_add_journal_head(bh); -+ * ... 
-+ * jh->b_transaction = xxx; -+ * journal_unlock_journal_head(jh); -+ * -+ * Now, the journal_head's b_jcount is zero, but it is safe from being released -+ * because it has a non-zero b_transaction. -+ */ -+ -+/* -+ * Give a buffer_head a journal_head. -+ * -+ * Doesn't need the journal lock. -+ * May sleep. -+ * Cannot be called with journal_datalist_lock held. -+ */ -+struct journal_head *journal_add_journal_head(struct buffer_head *bh) -+{ -+ struct journal_head *jh; -+ -+ spin_lock(&journal_datalist_lock); -+ if (buffer_jbd(bh)) { -+ jh = bh2jh(bh); -+ } else { -+ J_ASSERT_BH(bh, -+ (atomic_read(&bh->b_count) > 0) || -+ (bh->b_page && bh->b_page->mapping)); -+ spin_unlock(&journal_datalist_lock); -+ jh = journal_alloc_journal_head(); -+ memset(jh, 0, sizeof(*jh)); -+ spin_lock(&journal_datalist_lock); -+ -+ if (buffer_jbd(bh)) { -+ /* Someone did it for us! */ -+ J_ASSERT_BH(bh, bh->b_private != NULL); -+ journal_free_journal_head(jh); -+ jh = bh->b_private; -+ } else { -+ /* -+ * We actually don't need jh_splice_lock when -+ * adding a journal_head - only on removal. -+ */ -+ spin_lock(&jh_splice_lock); -+ set_bit(BH_JBD, &bh->b_state); -+ bh->b_private = jh; -+ jh->b_bh = bh; -+ atomic_inc(&bh->b_count); -+ spin_unlock(&jh_splice_lock); -+ BUFFER_TRACE(bh, "added journal_head"); -+ } -+ } -+ jh->b_jcount++; -+ spin_unlock(&journal_datalist_lock); -+ return bh->b_private; -+} -+ -+/* -+ * journal_remove_journal_head(): if the buffer isn't attached to a transaction -+ * and has a zero b_jcount then remove and release its journal_head. If we did -+ * see that the buffer is not used by any transaction we also "logically" -+ * decrement ->b_count. -+ * -+ * We in fact take an additional increment on ->b_count as a convenience, -+ * because the caller usually wants to do additional things with the bh -+ * after calling here. -+ * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some -+ * time. 
Once the caller has run __brelse(), the buffer is eligible for -+ * reaping by try_to_free_buffers(). -+ * -+ * Requires journal_datalist_lock. -+ */ -+void __journal_remove_journal_head(struct buffer_head *bh) -+{ -+ struct journal_head *jh = bh2jh(bh); -+ -+ assert_spin_locked(&journal_datalist_lock); -+ J_ASSERT_JH(jh, jh->b_jcount >= 0); -+ atomic_inc(&bh->b_count); -+ if (jh->b_jcount == 0) { -+ if (jh->b_transaction == NULL && -+ jh->b_next_transaction == NULL && -+ jh->b_cp_transaction == NULL) { -+ J_ASSERT_BH(bh, buffer_jbd(bh)); -+ J_ASSERT_BH(bh, jh2bh(jh) == bh); -+ BUFFER_TRACE(bh, "remove journal_head"); -+ spin_lock(&jh_splice_lock); -+ bh->b_private = NULL; -+ jh->b_bh = NULL; /* debug, really */ -+ clear_bit(BH_JBD, &bh->b_state); -+ __brelse(bh); -+ spin_unlock(&jh_splice_lock); -+ journal_free_journal_head(jh); -+ } else { -+ BUFFER_TRACE(bh, "journal_head was locked"); -+ } -+ } -+} -+ -+void journal_unlock_journal_head(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ J_ASSERT_JH(jh, jh->b_jcount > 0); -+ --jh->b_jcount; -+ if (!jh->b_jcount && !jh->b_transaction) { -+ struct buffer_head *bh; -+ bh = jh2bh(jh); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ -+ spin_unlock(&journal_datalist_lock); -+} -+ -+void journal_remove_journal_head(struct buffer_head *bh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_remove_journal_head(bh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * /proc tunables -+ */ -+#if defined(CONFIG_JBD_DEBUG) -+int journal_enable_debug; -+EXPORT_SYMBOL(journal_enable_debug); -+#endif -+ -+#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) -+ -+static struct proc_dir_entry *proc_jbd_debug; -+ -+int read_jbd_debug(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int ret; -+ -+ ret = sprintf(page + off, "%d\n", journal_enable_debug); -+ *eof = 1; -+ return ret; -+} -+ -+int write_jbd_debug(struct file *file, const char *buffer, -+ 
unsigned long count, void *data) -+{ -+ char buf[32]; -+ -+ if (count > ARRAY_SIZE(buf) - 1) -+ count = ARRAY_SIZE(buf) - 1; -+ if (copy_from_user(buf, buffer, count)) -+ return -EFAULT; -+ buf[ARRAY_SIZE(buf) - 1] = '\0'; -+ journal_enable_debug = simple_strtoul(buf, NULL, 10); -+ return count; -+} -+ -+#define JBD_PROC_NAME "sys/fs/jbd-debug" -+ -+static void __init create_jbd_proc_entry(void) -+{ -+ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); -+ if (proc_jbd_debug) { -+ /* Why is this so hard? */ -+ proc_jbd_debug->read_proc = read_jbd_debug; -+ proc_jbd_debug->write_proc = write_jbd_debug; -+ } -+} -+ -+static void __exit remove_jbd_proc_entry(void) -+{ -+ if (proc_jbd_debug) -+ remove_proc_entry(JBD_PROC_NAME, NULL); -+} -+ -+#else -+ -+#define create_jbd_proc_entry() do {} while (0) -+#define remove_jbd_proc_entry() do {} while (0) -+ -+#endif -+ -+/* -+ * Module startup and shutdown -+ */ -+ -+static int __init journal_init_caches(void) -+{ -+ int ret; -+ -+ ret = journal_init_revoke_caches(); -+ if (ret == 0) -+ ret = journal_init_journal_head_cache(); -+ return ret; -+} -+ -+static void journal_destroy_caches(void) -+{ -+ journal_destroy_revoke_caches(); -+ journal_destroy_journal_head_cache(); -+} -+ -+static int __init journal_init(void) -+{ -+ int ret; -+ -+ printk(KERN_INFO "Journalled Block Device driver loaded\n"); -+ ret = journal_init_caches(); -+ if (ret != 0) -+ journal_destroy_caches(); -+ create_jbd_proc_entry(); -+ return ret; -+} -+ -+static void __exit journal_exit(void) -+{ -+#ifdef CONFIG_JBD_DEBUG -+ int n = atomic_read(&nr_journal_heads); -+ if (n) -+ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); -+#endif -+ remove_jbd_proc_entry(); -+ journal_destroy_caches(); -+} -+ -+MODULE_LICENSE("GPL"); -+module_init(journal_init); -+module_exit(journal_exit); -+ -diff -ruP linux.mcp2/fs/jbd/recovery.c linuxppc_2.4.19_final/fs/jbd/recovery.c ---- linux.mcp2/fs/jbd/recovery.c 1969-12-31 16:00:00.000000000 -0800 -+++ 
linuxppc_2.4.19_final/fs/jbd/recovery.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,589 @@ -+/* -+ * linux/fs/recovery.c -+ * -+ * Written by Stephen C. Tweedie , 1999 -+ * -+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal recovery routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. -+ */ -+ -+#ifndef __KERNEL__ -+#include "jfs_user.h" -+#else -+#include -+#include -+#include -+#include -+#include -+#include -+#endif -+ -+/* -+ * Maintain information about the progress of the recovery job, so that -+ * the different passes can carry information between them. -+ */ -+struct recovery_info -+{ -+ tid_t start_transaction; -+ tid_t end_transaction; -+ -+ int nr_replays; -+ int nr_revokes; -+ int nr_revoke_hits; -+}; -+ -+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; -+static int do_one_pass(journal_t *journal, -+ struct recovery_info *info, enum passtype pass); -+static int scan_revoke_records(journal_t *, struct buffer_head *, -+ tid_t, struct recovery_info *); -+ -+#ifdef __KERNEL__ -+ -+/* Release readahead buffers after use */ -+void journal_brelse_array(struct buffer_head *b[], int n) -+{ -+ while (--n >= 0) -+ brelse (b[n]); -+} -+ -+ -+/* -+ * When reading from the journal, we are going through the block device -+ * layer directly and so there is no readahead being done for us. We -+ * need to implement any readahead ourselves if we want it to happen at -+ * all. Recovery is basically one long sequential read, so make sure we -+ * do the IO in reasonably large chunks. -+ * -+ * This is not so critical that we need to be enormously clever about -+ * the readahead size, though. 128K is a purely arbitrary, good-enough -+ * fixed value. 
-+ */ -+ -+#define MAXBUF 8 -+static int do_readahead(journal_t *journal, unsigned int start) -+{ -+ int err; -+ unsigned int max, nbufs, next; -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ -+ struct buffer_head * bufs[MAXBUF]; -+ -+ /* Do up to 128K of readahead */ -+ max = start + (128 * 1024 / journal->j_blocksize); -+ if (max > journal->j_maxlen) -+ max = journal->j_maxlen; -+ -+ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at -+ * a time to the block device IO layer. */ -+ -+ nbufs = 0; -+ -+ for (next = start; next < max; next++) { -+ err = journal_bmap(journal, next, &blocknr); -+ -+ if (err) { -+ printk (KERN_ERR "JBD: bad block at offset %u\n", -+ next); -+ goto failed; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ if (!bh) { -+ err = -ENOMEM; -+ goto failed; -+ } -+ -+ if (!buffer_uptodate(bh) && !buffer_locked(bh)) { -+ bufs[nbufs++] = bh; -+ if (nbufs == MAXBUF) { -+ ll_rw_block(READ, nbufs, bufs); -+ journal_brelse_array(bufs, nbufs); -+ nbufs = 0; -+ } -+ } else -+ brelse(bh); -+ } -+ -+ if (nbufs) -+ ll_rw_block(READ, nbufs, bufs); -+ err = 0; -+ -+failed: -+ if (nbufs) -+ journal_brelse_array(bufs, nbufs); -+ return err; -+} -+ -+#endif /* __KERNEL__ */ -+ -+ -+/* -+ * Read a block from the journal -+ */ -+ -+static int jread(struct buffer_head **bhp, journal_t *journal, -+ unsigned int offset) -+{ -+ int err; -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ -+ *bhp = NULL; -+ -+ J_ASSERT (offset < journal->j_maxlen); -+ -+ err = journal_bmap(journal, offset, &blocknr); -+ -+ if (err) { -+ printk (KERN_ERR "JBD: bad block at offset %u\n", -+ offset); -+ return err; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ if (!bh) -+ return -ENOMEM; -+ -+ if (!buffer_uptodate(bh)) { -+ /* If this is a brand new buffer, start readahead. -+ Otherwise, we assume we are already reading it. 
*/ -+ if (!buffer_req(bh)) -+ do_readahead(journal, offset); -+ wait_on_buffer(bh); -+ } -+ -+ if (!buffer_uptodate(bh)) { -+ printk (KERN_ERR "JBD: Failed to read block at offset %u\n", -+ offset); -+ brelse(bh); -+ return -EIO; -+ } -+ -+ *bhp = bh; -+ return 0; -+} -+ -+ -+/* -+ * Count the number of in-use tags in a journal descriptor block. -+ */ -+ -+static int count_tags(struct buffer_head *bh, int size) -+{ -+ char * tagp; -+ journal_block_tag_t * tag; -+ int nr = 0; -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ -+ while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { -+ tag = (journal_block_tag_t *) tagp; -+ -+ nr++; -+ tagp += sizeof(journal_block_tag_t); -+ if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID))) -+ tagp += 16; -+ -+ if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG)) -+ break; -+ } -+ -+ return nr; -+} -+ -+ -+/* Make sure we wrap around the log correctly! */ -+#define wrap(journal, var) \ -+do { \ -+ if (var >= (journal)->j_last) \ -+ var -= ((journal)->j_last - (journal)->j_first); \ -+} while (0) -+ -+/* -+ * journal_recover -+ * -+ * The primary function for recovering the log contents when mounting a -+ * journaled device. -+ * -+ * Recovery is done in three passes. In the first pass, we look for the -+ * end of the log. In the second, we assemble the list of revoke -+ * blocks. In the third and final pass, we replay any un-revoked blocks -+ * in the log. -+ */ -+ -+int journal_recover(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t * sb; -+ -+ struct recovery_info info; -+ -+ memset(&info, 0, sizeof(info)); -+ sb = journal->j_superblock; -+ -+ /* -+ * The journal superblock's s_start field (the current log head) -+ * is always zero if, and only if, the journal was cleanly -+ * unmounted. 
-+ */ -+ -+ if (!sb->s_start) { -+ jbd_debug(1, "No recovery required, last transaction %d\n", -+ ntohl(sb->s_sequence)); -+ journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1; -+ return 0; -+ } -+ -+ -+ err = do_one_pass(journal, &info, PASS_SCAN); -+ if (!err) -+ err = do_one_pass(journal, &info, PASS_REVOKE); -+ if (!err) -+ err = do_one_pass(journal, &info, PASS_REPLAY); -+ -+ jbd_debug(0, "JBD: recovery, exit status %d, " -+ "recovered transactions %u to %u\n", -+ err, info.start_transaction, info.end_transaction); -+ jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", -+ info.nr_replays, info.nr_revoke_hits, info.nr_revokes); -+ -+ /* Restart the log at the next transaction ID, thus invalidating -+ * any existing commit records in the log. */ -+ journal->j_transaction_sequence = ++info.end_transaction; -+ -+ journal_clear_revoke(journal); -+ fsync_no_super(journal->j_fs_dev); -+ return err; -+} -+ -+/* -+ * journal_skip_recovery -+ * -+ * Locate any valid recovery information from the journal and set up the -+ * journal structures in memory to ignore it (presumably because the -+ * caller has evidence that it is out of date). -+ * -+ * We perform one pass over the journal to allow us to tell the user how -+ * much recovery information is being erased, and to let us initialise -+ * the journal transaction sequence numbers to the next unused ID. -+ */ -+ -+int journal_skip_recovery(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t * sb; -+ -+ struct recovery_info info; -+ -+ memset (&info, 0, sizeof(info)); -+ sb = journal->j_superblock; -+ -+ err = do_one_pass(journal, &info, PASS_SCAN); -+ -+ if (err) { -+ printk(KERN_ERR "JBD: error %d scanning journal\n", err); -+ ++journal->j_transaction_sequence; -+ } else { -+#ifdef CONFIG_JBD_DEBUG -+ int dropped = info.end_transaction - ntohl(sb->s_sequence); -+#endif -+ -+ jbd_debug(0, -+ "JBD: ignoring %d transaction%s from the journal.\n", -+ dropped, (dropped == 1) ? 
"" : "s"); -+ journal->j_transaction_sequence = ++info.end_transaction; -+ } -+ -+ journal->j_tail = 0; -+ -+ return err; -+} -+ -+static int do_one_pass(journal_t *journal, -+ struct recovery_info *info, enum passtype pass) -+{ -+ -+ unsigned int first_commit_ID, next_commit_ID; -+ unsigned long next_log_block; -+ int err, success = 0; -+ journal_superblock_t * sb; -+ journal_header_t * tmp; -+ struct buffer_head * bh; -+ unsigned int sequence; -+ int blocktype; -+ -+ /* Precompute the maximum metadata descriptors in a descriptor block */ -+ int MAX_BLOCKS_PER_DESC; -+ MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) -+ / sizeof(journal_block_tag_t)); -+ -+ /* -+ * First thing is to establish what we expect to find in the log -+ * (in terms of transaction IDs), and where (in terms of log -+ * block offsets): query the superblock. -+ */ -+ -+ sb = journal->j_superblock; -+ next_commit_ID = ntohl(sb->s_sequence); -+ next_log_block = ntohl(sb->s_start); -+ -+ first_commit_ID = next_commit_ID; -+ if (pass == PASS_SCAN) -+ info->start_transaction = first_commit_ID; -+ -+ jbd_debug(1, "Starting recovery pass %d\n", pass); -+ -+ /* -+ * Now we walk through the log, transaction by transaction, -+ * making sure that each transaction has a commit block in the -+ * expected place. Each complete transaction gets replayed back -+ * into the main filesystem. -+ */ -+ -+ while (1) { -+ int flags; -+ char * tagp; -+ journal_block_tag_t * tag; -+ struct buffer_head * obh; -+ struct buffer_head * nbh; -+ -+ /* If we already know where to stop the log traversal, -+ * check right now that we haven't gone past the end of -+ * the log. 
*/ -+ -+ if (pass != PASS_SCAN) -+ if (tid_geq(next_commit_ID, info->end_transaction)) -+ break; -+ -+ jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", -+ next_commit_ID, next_log_block, journal->j_last); -+ -+ /* Skip over each chunk of the transaction looking -+ * either the next descriptor block or the final commit -+ * record. */ -+ -+ jbd_debug(3, "JBD: checking block %ld\n", next_log_block); -+ err = jread(&bh, journal, next_log_block); -+ if (err) -+ goto failed; -+ -+ next_log_block++; -+ wrap(journal, next_log_block); -+ -+ /* What kind of buffer is it? -+ * -+ * If it is a descriptor block, check that it has the -+ * expected sequence number. Otherwise, we're all done -+ * here. */ -+ -+ tmp = (journal_header_t *)bh->b_data; -+ -+ if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) { -+ brelse(bh); -+ break; -+ } -+ -+ blocktype = ntohl(tmp->h_blocktype); -+ sequence = ntohl(tmp->h_sequence); -+ jbd_debug(3, "Found magic %d, sequence %d\n", -+ blocktype, sequence); -+ -+ if (sequence != next_commit_ID) { -+ brelse(bh); -+ break; -+ } -+ -+ /* OK, we have a valid descriptor block which matches -+ * all of the sequence number checks. What are we going -+ * to do with it? That depends on the pass... */ -+ -+ switch(blocktype) { -+ case JFS_DESCRIPTOR_BLOCK: -+ /* If it is a valid descriptor block, replay it -+ * in pass REPLAY; otherwise, just skip over the -+ * blocks it describes. */ -+ if (pass != PASS_REPLAY) { -+ next_log_block += -+ count_tags(bh, journal->j_blocksize); -+ wrap(journal, next_log_block); -+ brelse(bh); -+ continue; -+ } -+ -+ /* A descriptor block: we can now write all of -+ * the data blocks. Yay, useful work is finally -+ * getting done here! 
*/ -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) -+ <= journal->j_blocksize) { -+ unsigned long io_block; -+ -+ tag = (journal_block_tag_t *) tagp; -+ flags = ntohl(tag->t_flags); -+ -+ io_block = next_log_block++; -+ wrap(journal, next_log_block); -+ err = jread(&obh, journal, io_block); -+ if (err) { -+ /* Recover what we can, but -+ * report failure at the end. */ -+ success = err; -+ printk (KERN_ERR -+ "JBD: IO error %d recovering " -+ "block %ld in log\n", -+ err, io_block); -+ } else { -+ unsigned long blocknr; -+ -+ J_ASSERT(obh != NULL); -+ blocknr = ntohl(tag->t_blocknr); -+ -+ /* If the block has been -+ * revoked, then we're all done -+ * here. */ -+ if (journal_test_revoke -+ (journal, blocknr, -+ next_commit_ID)) { -+ brelse(obh); -+ ++info->nr_revoke_hits; -+ goto skip_write; -+ } -+ -+ /* Find a buffer for the new -+ * data being restored */ -+ nbh = getblk(journal->j_fs_dev, blocknr, -+ journal->j_blocksize); -+ if (nbh == NULL) { -+ printk(KERN_ERR -+ "JBD: Out of memory " -+ "during recovery.\n"); -+ err = -ENOMEM; -+ brelse(bh); -+ brelse(obh); -+ goto failed; -+ } -+ -+ lock_buffer(nbh); -+ memcpy(nbh->b_data, obh->b_data, -+ journal->j_blocksize); -+ if (flags & JFS_FLAG_ESCAPE) { -+ *((unsigned int *)bh->b_data) = -+ htonl(JFS_MAGIC_NUMBER); -+ } -+ -+ BUFFER_TRACE(nbh, "marking dirty"); -+ mark_buffer_dirty(nbh); -+ BUFFER_TRACE(nbh, "marking uptodate"); -+ mark_buffer_uptodate(nbh, 1); -+ unlock_buffer(nbh); -+ ++info->nr_replays; -+ /* ll_rw_block(WRITE, 1, &nbh); */ -+ brelse(obh); -+ brelse(nbh); -+ } -+ -+ skip_write: -+ tagp += sizeof(journal_block_tag_t); -+ if (!(flags & JFS_FLAG_SAME_UUID)) -+ tagp += 16; -+ -+ if (flags & JFS_FLAG_LAST_TAG) -+ break; -+ } -+ -+ brelse(bh); -+ continue; -+ -+ case JFS_COMMIT_BLOCK: -+ /* Found an expected commit block: not much to -+ * do other than move on to the next sequence -+ * number. 
*/ -+ brelse(bh); -+ next_commit_ID++; -+ continue; -+ -+ case JFS_REVOKE_BLOCK: -+ /* If we aren't in the REVOKE pass, then we can -+ * just skip over this block. */ -+ if (pass != PASS_REVOKE) { -+ brelse(bh); -+ continue; -+ } -+ -+ err = scan_revoke_records(journal, bh, -+ next_commit_ID, info); -+ brelse(bh); -+ if (err) -+ goto failed; -+ continue; -+ -+ default: -+ jbd_debug(3, "Unrecognised magic %d, end of scan.\n", -+ blocktype); -+ goto done; -+ } -+ } -+ -+ done: -+ /* -+ * We broke out of the log scan loop: either we came to the -+ * known end of the log or we found an unexpected block in the -+ * log. If the latter happened, then we know that the "current" -+ * transaction marks the end of the valid log. -+ */ -+ -+ if (pass == PASS_SCAN) -+ info->end_transaction = next_commit_ID; -+ else { -+ /* It's really bad news if different passes end up at -+ * different places (but possible due to IO errors). */ -+ if (info->end_transaction != next_commit_ID) { -+ printk (KERN_ERR "JBD: recovery pass %d ended at " -+ "transaction %u, expected %u\n", -+ pass, next_commit_ID, info->end_transaction); -+ if (!success) -+ success = -EIO; -+ } -+ } -+ -+ return success; -+ -+ failed: -+ return err; -+} -+ -+ -+/* Scan a revoke record, marking all blocks mentioned as revoked. 
*/ -+ -+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, -+ tid_t sequence, struct recovery_info *info) -+{ -+ journal_revoke_header_t *header; -+ int offset, max; -+ -+ header = (journal_revoke_header_t *) bh->b_data; -+ offset = sizeof(journal_revoke_header_t); -+ max = ntohl(header->r_count); -+ -+ while (offset < max) { -+ unsigned long blocknr; -+ int err; -+ -+ blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset))); -+ offset += 4; -+ err = journal_set_revoke(journal, blocknr, sequence); -+ if (err) -+ return err; -+ ++info->nr_revokes; -+ } -+ return 0; -+} -diff -ruP linux.mcp2/fs/jbd/revoke.c linuxppc_2.4.19_final/fs/jbd/revoke.c ---- linux.mcp2/fs/jbd/revoke.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/revoke.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,636 @@ -+/* -+ * linux/fs/revoke.c -+ * -+ * Written by Stephen C. Tweedie , 2000 -+ * -+ * Copyright 2000 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal revoke routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. -+ * -+ * Revoke is the mechanism used to prevent old log records for deleted -+ * metadata from being replayed on top of newer data using the same -+ * blocks. The revoke mechanism is used in two separate places: -+ * -+ * + Commit: during commit we write the entire list of the current -+ * transaction's revoked blocks to the journal -+ * -+ * + Recovery: during recovery we record the transaction ID of all -+ * revoked blocks. If there are multiple revoke records in the log -+ * for a single block, only the last one counts, and if there is a log -+ * entry for a block beyond the last revoke, then that log entry still -+ * gets replayed. 
-+ * -+ * We can get interactions between revokes and new log data within a -+ * single transaction: -+ * -+ * Block is revoked and then journaled: -+ * The desired end result is the journaling of the new block, so we -+ * cancel the revoke before the transaction commits. -+ * -+ * Block is journaled and then revoked: -+ * The revoke must take precedence over the write of the block, so we -+ * need either to cancel the journal entry or to write the revoke -+ * later in the log than the log block. In this case, we choose the -+ * latter: journaling a block cancels any revoke record for that block -+ * in the current transaction, so any revoke for that block in the -+ * transaction must have happened after the block was journaled and so -+ * the revoke must take precedence. -+ * -+ * Block is revoked and then written as data: -+ * The data write is allowed to succeed, but the revoke is _not_ -+ * cancelled. We still need to prevent old log records from -+ * overwriting the new data. We don't even need to clear the revoke -+ * bit here. -+ * -+ * Revoke information on buffers is a tri-state value: -+ * -+ * RevokeValid clear: no cached revoke status, need to look it up -+ * RevokeValid set, Revoked clear: -+ * buffer has not been revoked, and cancel_revoke -+ * need do nothing. -+ * RevokeValid set, Revoked set: -+ * buffer has been revoked. -+ */ -+ -+#ifndef __KERNEL__ -+#include "jfs_user.h" -+#else -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#endif -+ -+static kmem_cache_t *revoke_record_cache; -+static kmem_cache_t *revoke_table_cache; -+ -+/* Each revoke record represents one single revoked block. During -+ journal replay, this involves recording the transaction ID of the -+ last transaction to revoke this block. 
*/ -+ -+struct jbd_revoke_record_s -+{ -+ struct list_head hash; -+ tid_t sequence; /* Used for recovery only */ -+ unsigned long blocknr; -+}; -+ -+ -+/* The revoke table is just a simple hash table of revoke records. */ -+struct jbd_revoke_table_s -+{ -+ /* It is conceivable that we might want a larger hash table -+ * for recovery. Must be a power of two. */ -+ int hash_size; -+ int hash_shift; -+ struct list_head *hash_table; -+}; -+ -+ -+#ifdef __KERNEL__ -+static void write_one_revoke_record(journal_t *, transaction_t *, -+ struct journal_head **, int *, -+ struct jbd_revoke_record_s *); -+static void flush_descriptor(journal_t *, struct journal_head *, int); -+#endif -+ -+/* Utility functions to maintain the revoke table */ -+ -+/* Borrowed from buffer.c: this is a tried and tested block hash function */ -+static inline int hash(journal_t *journal, unsigned long block) -+{ -+ struct jbd_revoke_table_s *table = journal->j_revoke; -+ int hash_shift = table->hash_shift; -+ -+ return ((block << (hash_shift - 6)) ^ -+ (block >> 13) ^ -+ (block << (hash_shift - 12))) & (table->hash_size - 1); -+} -+ -+int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq) -+{ -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ -+repeat: -+ record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); -+ if (!record) -+ goto oom; -+ -+ record->sequence = seq; -+ record->blocknr = blocknr; -+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -+ list_add(&record->hash, hash_list); -+ return 0; -+ -+oom: -+ if (!journal_oom_retry) -+ return -ENOMEM; -+ jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ goto repeat; -+} -+ -+/* Find a revoke record in the journal's hash table. 
*/ -+ -+static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, -+ unsigned long blocknr) -+{ -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ -+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -+ -+ record = (struct jbd_revoke_record_s *) hash_list->next; -+ while (&(record->hash) != hash_list) { -+ if (record->blocknr == blocknr) -+ return record; -+ record = (struct jbd_revoke_record_s *) record->hash.next; -+ } -+ return NULL; -+} -+ -+int __init journal_init_revoke_caches(void) -+{ -+ revoke_record_cache = kmem_cache_create("revoke_record", -+ sizeof(struct jbd_revoke_record_s), -+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (revoke_record_cache == 0) -+ return -ENOMEM; -+ -+ revoke_table_cache = kmem_cache_create("revoke_table", -+ sizeof(struct jbd_revoke_table_s), -+ 0, 0, NULL, NULL); -+ if (revoke_table_cache == 0) { -+ kmem_cache_destroy(revoke_record_cache); -+ revoke_record_cache = NULL; -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+void journal_destroy_revoke_caches(void) -+{ -+ kmem_cache_destroy(revoke_record_cache); -+ revoke_record_cache = 0; -+ kmem_cache_destroy(revoke_table_cache); -+ revoke_table_cache = 0; -+} -+ -+/* Initialise the revoke table for a given journal to a given size. 
*/ -+ -+int journal_init_revoke(journal_t *journal, int hash_size) -+{ -+ int shift, tmp; -+ -+ J_ASSERT (journal->j_revoke == NULL); -+ -+ journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); -+ if (!journal->j_revoke) -+ return -ENOMEM; -+ -+ /* Check that the hash_size is a power of two */ -+ J_ASSERT ((hash_size & (hash_size-1)) == 0); -+ -+ journal->j_revoke->hash_size = hash_size; -+ -+ shift = 0; -+ tmp = hash_size; -+ while((tmp >>= 1UL) != 0UL) -+ shift++; -+ journal->j_revoke->hash_shift = shift; -+ -+ journal->j_revoke->hash_table = -+ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); -+ if (!journal->j_revoke->hash_table) { -+ kmem_cache_free(revoke_table_cache, journal->j_revoke); -+ journal->j_revoke = NULL; -+ return -ENOMEM; -+ } -+ -+ for (tmp = 0; tmp < hash_size; tmp++) -+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); -+ -+ return 0; -+} -+ -+/* Destoy a journal's revoke table. The table must already be empty! */ -+ -+void journal_destroy_revoke(journal_t *journal) -+{ -+ struct jbd_revoke_table_s *table; -+ struct list_head *hash_list; -+ int i; -+ -+ table = journal->j_revoke; -+ if (!table) -+ return; -+ -+ for (i=0; ihash_size; i++) { -+ hash_list = &table->hash_table[i]; -+ J_ASSERT (list_empty(hash_list)); -+ } -+ -+ kfree(table->hash_table); -+ kmem_cache_free(revoke_table_cache, table); -+ journal->j_revoke = NULL; -+} -+ -+ -+#ifdef __KERNEL__ -+ -+/* -+ * journal_revoke: revoke a given buffer_head from the journal. This -+ * prevents the block from being replayed during recovery if we take a -+ * crash after this current transaction commits. Any subsequent -+ * metadata writes of the buffer in this transaction cancel the -+ * revoke. -+ * -+ * Note that this call may block --- it is up to the caller to make -+ * sure that there are no further calls to journal_write_metadata -+ * before the revoke is complete. 
In ext3, this implies calling the -+ * revoke before clearing the block bitmap when we are deleting -+ * metadata. -+ * -+ * Revoke performs a journal_forget on any buffer_head passed in as a -+ * parameter, but does _not_ forget the buffer_head if the bh was only -+ * found implicitly. -+ * -+ * bh_in may not be a journalled buffer - it may have come off -+ * the hash tables without an attached journal_head. -+ * -+ * If bh_in is non-zero, journal_revoke() will decrement its b_count -+ * by one. -+ */ -+ -+int journal_revoke(handle_t *handle, unsigned long blocknr, -+ struct buffer_head *bh_in) -+{ -+ struct buffer_head *bh = NULL; -+ journal_t *journal; -+ kdev_t dev; -+ int err; -+ -+ if (bh_in) -+ BUFFER_TRACE(bh_in, "enter"); -+ -+ journal = handle->h_transaction->t_journal; -+ if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ -+ J_ASSERT (!"Cannot set revoke feature!"); -+ return -EINVAL; -+ } -+ -+ dev = journal->j_fs_dev; -+ bh = bh_in; -+ -+ if (!bh) { -+ bh = get_hash_table(dev, blocknr, journal->j_blocksize); -+ if (bh) -+ BUFFER_TRACE(bh, "found on hash"); -+ } -+#ifdef JBD_EXPENSIVE_CHECKING -+ else { -+ struct buffer_head *bh2; -+ -+ /* If there is a different buffer_head lying around in -+ * memory anywhere... */ -+ bh2 = get_hash_table(dev, blocknr, journal->j_blocksize); -+ if (bh2) { -+ /* ... and it has RevokeValid status... */ -+ if ((bh2 != bh) && -+ test_bit(BH_RevokeValid, &bh2->b_state)) -+ /* ...then it better be revoked too, -+ * since it's illegal to create a revoke -+ * record against a buffer_head which is -+ * not marked revoked --- that would -+ * risk missing a subsequent revoke -+ * cancel. */ -+ J_ASSERT_BH(bh2, test_bit(BH_Revoked, & -+ bh2->b_state)); -+ __brelse(bh2); -+ } -+ } -+#endif -+ -+ /* We really ought not ever to revoke twice in a row without -+ first having the revoke cancelled: it's illegal to free a -+ block twice without allocating it in between! 
*/ -+ if (bh) { -+ J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state)); -+ set_bit(BH_Revoked, &bh->b_state); -+ set_bit(BH_RevokeValid, &bh->b_state); -+ if (bh_in) { -+ BUFFER_TRACE(bh_in, "call journal_forget"); -+ journal_forget(handle, bh_in); -+ } else { -+ BUFFER_TRACE(bh, "call brelse"); -+ __brelse(bh); -+ } -+ } -+ -+ lock_journal(journal); -+ jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); -+ err = insert_revoke_hash(journal, blocknr, -+ handle->h_transaction->t_tid); -+ unlock_journal(journal); -+ BUFFER_TRACE(bh_in, "exit"); -+ return err; -+} -+ -+/* -+ * Cancel an outstanding revoke. For use only internally by the -+ * journaling code (called from journal_get_write_access). -+ * -+ * We trust the BH_Revoked bit on the buffer if the buffer is already -+ * being journaled: if there is no revoke pending on the buffer, then we -+ * don't do anything here. -+ * -+ * This would break if it were possible for a buffer to be revoked and -+ * discarded, and then reallocated within the same transaction. In such -+ * a case we would have lost the revoked bit, but when we arrived here -+ * the second time we would still have a pending revoke to cancel. So, -+ * do not trust the Revoked bit on buffers unless RevokeValid is also -+ * set. -+ * -+ * The caller must have the journal locked. -+ */ -+int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) -+{ -+ struct jbd_revoke_record_s *record; -+ journal_t *journal = handle->h_transaction->t_journal; -+ int need_cancel; -+ int did_revoke = 0; /* akpm: debug */ -+ struct buffer_head *bh = jh2bh(jh); -+ -+ jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); -+ -+ /* Is the existing Revoke bit valid? If so, we trust it, and -+ * only perform the full cancel if the revoke bit is set. If -+ * not, we can't trust the revoke bit, and we need to do the -+ * full search for a revoke record. 
*/ -+ if (test_and_set_bit(BH_RevokeValid, &bh->b_state)) -+ need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state)); -+ else { -+ need_cancel = 1; -+ clear_bit(BH_Revoked, &bh->b_state); -+ } -+ -+ if (need_cancel) { -+ record = find_revoke_record(journal, bh->b_blocknr); -+ if (record) { -+ jbd_debug(4, "cancelled existing revoke on " -+ "blocknr %lu\n", bh->b_blocknr); -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ did_revoke = 1; -+ } -+ } -+ -+#ifdef JBD_EXPENSIVE_CHECKING -+ /* There better not be one left behind by now! */ -+ record = find_revoke_record(journal, bh->b_blocknr); -+ J_ASSERT_JH(jh, record == NULL); -+#endif -+ -+ /* Finally, have we just cleared revoke on an unhashed -+ * buffer_head? If so, we'd better make sure we clear the -+ * revoked status on any hashed alias too, otherwise the revoke -+ * state machine will get very upset later on. */ -+ if (need_cancel && !bh->b_pprev) { -+ struct buffer_head *bh2; -+ bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); -+ if (bh2) { -+ clear_bit(BH_Revoked, &bh2->b_state); -+ __brelse(bh2); -+ } -+ } -+ -+ return did_revoke; -+} -+ -+ -+/* -+ * Write revoke records to the journal for all entries in the current -+ * revoke hash, deleting the entries as we go. -+ * -+ * Called with the journal lock held. 
-+ */ -+ -+void journal_write_revoke_records(journal_t *journal, -+ transaction_t *transaction) -+{ -+ struct journal_head *descriptor; -+ struct jbd_revoke_record_s *record; -+ struct jbd_revoke_table_s *revoke; -+ struct list_head *hash_list; -+ int i, offset, count; -+ -+ descriptor = NULL; -+ offset = 0; -+ count = 0; -+ revoke = journal->j_revoke; -+ -+ for (i = 0; i < revoke->hash_size; i++) { -+ hash_list = &revoke->hash_table[i]; -+ -+ while (!list_empty(hash_list)) { -+ record = (struct jbd_revoke_record_s *) -+ hash_list->next; -+ write_one_revoke_record(journal, transaction, -+ &descriptor, &offset, -+ record); -+ count++; -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ } -+ } -+ if (descriptor) -+ flush_descriptor(journal, descriptor, offset); -+ jbd_debug(1, "Wrote %d revoke records\n", count); -+} -+ -+/* -+ * Write out one revoke record. We need to create a new descriptor -+ * block if the old one is full or if we have not already created one. -+ */ -+ -+static void write_one_revoke_record(journal_t *journal, -+ transaction_t *transaction, -+ struct journal_head **descriptorp, -+ int *offsetp, -+ struct jbd_revoke_record_s *record) -+{ -+ struct journal_head *descriptor; -+ int offset; -+ journal_header_t *header; -+ -+ /* If we are already aborting, this all becomes a noop. We -+ still need to go round the loop in -+ journal_write_revoke_records in order to free all of the -+ revoke records: only the IO to the journal is omitted. 
*/ -+ if (is_journal_aborted(journal)) -+ return; -+ -+ descriptor = *descriptorp; -+ offset = *offsetp; -+ -+ /* Make sure we have a descriptor with space left for the record */ -+ if (descriptor) { -+ if (offset == journal->j_blocksize) { -+ flush_descriptor(journal, descriptor, offset); -+ descriptor = NULL; -+ } -+ } -+ -+ if (!descriptor) { -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) -+ return; -+ header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; -+ header->h_magic = htonl(JFS_MAGIC_NUMBER); -+ header->h_blocktype = htonl(JFS_REVOKE_BLOCK); -+ header->h_sequence = htonl(transaction->t_tid); -+ -+ /* Record it so that we can wait for IO completion later */ -+ JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); -+ journal_file_buffer(descriptor, transaction, BJ_LogCtl); -+ -+ offset = sizeof(journal_revoke_header_t); -+ *descriptorp = descriptor; -+ } -+ -+ * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) = -+ htonl(record->blocknr); -+ offset += 4; -+ *offsetp = offset; -+} -+ -+/* -+ * Flush a revoke descriptor out to the journal. If we are aborting, -+ * this is a noop; otherwise we are generating a buffer which needs to -+ * be waited for during commit, so it has to go onto the appropriate -+ * journal buffer list. 
-+ */ -+ -+static void flush_descriptor(journal_t *journal, -+ struct journal_head *descriptor, -+ int offset) -+{ -+ journal_revoke_header_t *header; -+ -+ if (is_journal_aborted(journal)) { -+ JBUFFER_TRACE(descriptor, "brelse"); -+ unlock_buffer(jh2bh(descriptor)); -+ __brelse(jh2bh(descriptor)); -+ return; -+ } -+ -+ header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; -+ header->r_count = htonl(offset); -+ set_bit(BH_JWrite, &jh2bh(descriptor)->b_state); -+ { -+ struct buffer_head *bh = jh2bh(descriptor); -+ BUFFER_TRACE(bh, "write"); -+ clear_bit(BH_Dirty, &bh->b_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ } -+} -+ -+#endif -+ -+/* -+ * Revoke support for recovery. -+ * -+ * Recovery needs to be able to: -+ * -+ * record all revoke records, including the tid of the latest instance -+ * of each revoke in the journal -+ * -+ * check whether a given block in a given transaction should be replayed -+ * (ie. has not been revoked by a revoke record in that or a subsequent -+ * transaction) -+ * -+ * empty the revoke table after recovery. -+ */ -+ -+/* -+ * First, setting revoke records. We create a new revoke record for -+ * every block ever revoked in the log as we scan it for recovery, and -+ * we update the existing records if we find multiple revokes for a -+ * single block. -+ */ -+ -+int journal_set_revoke(journal_t *journal, -+ unsigned long blocknr, -+ tid_t sequence) -+{ -+ struct jbd_revoke_record_s *record; -+ -+ record = find_revoke_record(journal, blocknr); -+ if (record) { -+ /* If we have multiple occurences, only record the -+ * latest sequence number in the hashed record */ -+ if (tid_gt(sequence, record->sequence)) -+ record->sequence = sequence; -+ return 0; -+ } -+ return insert_revoke_hash(journal, blocknr, sequence); -+} -+ -+/* -+ * Test revoke records. For a given block referenced in the log, has -+ * that block been revoked? 
A revoke record with a given transaction -+ * sequence number revokes all blocks in that transaction and earlier -+ * ones, but later transactions still need replayed. -+ */ -+ -+int journal_test_revoke(journal_t *journal, -+ unsigned long blocknr, -+ tid_t sequence) -+{ -+ struct jbd_revoke_record_s *record; -+ -+ record = find_revoke_record(journal, blocknr); -+ if (!record) -+ return 0; -+ if (tid_gt(sequence, record->sequence)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * Finally, once recovery is over, we need to clear the revoke table so -+ * that it can be reused by the running filesystem. -+ */ -+ -+void journal_clear_revoke(journal_t *journal) -+{ -+ int i; -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ struct jbd_revoke_table_s *revoke; -+ -+ revoke = journal->j_revoke; -+ -+ for (i = 0; i < revoke->hash_size; i++) { -+ hash_list = &revoke->hash_table[i]; -+ while (!list_empty(hash_list)) { -+ record = (struct jbd_revoke_record_s*) hash_list->next; -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ } -+ } -+} -+ -diff -ruP linux.mcp2/fs/jbd/transaction.c linuxppc_2.4.19_final/fs/jbd/transaction.c ---- linux.mcp2/fs/jbd/transaction.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/transaction.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,2055 @@ -+/* -+ * linux/fs/transaction.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Generic filesystem transaction handling code; part of the ext2fs -+ * journaling system. -+ * -+ * This file manages transactions (compound commits managed by the -+ * journaling code) and handles (individual atomic operations by the -+ * filesystem). 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * get_transaction: obtain a new transaction_t object. -+ * -+ * Simply allocate and initialise a new transaction. Create it in -+ * RUNNING state and add it to the current journal (which should not -+ * have an existing running transaction: we only make a new transaction -+ * once we have started to commit the old one). -+ * -+ * Preconditions: -+ * The journal MUST be locked. We don't perform atomic mallocs on the -+ * new transaction and we can't block without protecting against other -+ * processes trying to touch the journal while it is in transition. -+ */ -+ -+static transaction_t * get_transaction (journal_t * journal, int is_try) -+{ -+ transaction_t * transaction; -+ -+ transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS); -+ if (!transaction) -+ return NULL; -+ -+ memset (transaction, 0, sizeof (transaction_t)); -+ -+ transaction->t_journal = journal; -+ transaction->t_state = T_RUNNING; -+ transaction->t_tid = journal->j_transaction_sequence++; -+ transaction->t_expires = jiffies + journal->j_commit_interval; -+ -+ /* Set up the commit timer for the new transaction. */ -+ J_ASSERT (!journal->j_commit_timer_active); -+ journal->j_commit_timer_active = 1; -+ journal->j_commit_timer->expires = transaction->t_expires; -+ add_timer(journal->j_commit_timer); -+ -+ J_ASSERT (journal->j_running_transaction == NULL); -+ journal->j_running_transaction = transaction; -+ -+ return transaction; -+} -+ -+/* -+ * Handle management. -+ * -+ * A handle_t is an object which represents a single atomic update to a -+ * filesystem, and which tracks all of the modifications which form part -+ * of that one update. -+ */ -+ -+/* -+ * start_this_handle: Given a handle, deal with any locking or stalling -+ * needed to make sure that there is enough journal space for the handle -+ * to begin. 
Attach the handle to a transaction and set up the -+ * transaction's buffer credits. -+ */ -+ -+static int start_this_handle(journal_t *journal, handle_t *handle) -+{ -+ transaction_t *transaction; -+ int needed; -+ int nblocks = handle->h_buffer_credits; -+ -+ jbd_debug(3, "New handle %p going live.\n", handle); -+ -+repeat: -+ -+ lock_journal(journal); -+ -+repeat_locked: -+ -+ if (is_journal_aborted(journal) || -+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { -+ unlock_journal(journal); -+ return -EROFS; -+ } -+ -+ /* Wait on the journal's transaction barrier if necessary */ -+ if (journal->j_barrier_count) { -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_transaction_locked); -+ goto repeat; -+ } -+ -+ if (!journal->j_running_transaction) -+ get_transaction(journal, 0); -+ /* @@@ Error? */ -+ J_ASSERT(journal->j_running_transaction); -+ -+ transaction = journal->j_running_transaction; -+ -+ /* If the current transaction is locked down for commit, wait -+ * for the lock to be released. */ -+ -+ if (transaction->t_state == T_LOCKED) { -+ unlock_journal(journal); -+ jbd_debug(3, "Handle %p stalling...\n", handle); -+ sleep_on(&journal->j_wait_transaction_locked); -+ goto repeat; -+ } -+ -+ /* If there is not enough space left in the log to write all -+ * potential buffers requested by this operation, we need to -+ * stall pending a log checkpoint to free some more log -+ * space. */ -+ -+ needed = transaction->t_outstanding_credits + nblocks; -+ -+ if (needed > journal->j_max_transaction_buffers) { -+ /* If the current transaction is already too large, then -+ * start to commit it: we can then go back and attach -+ * this handle to a new transaction. 
*/ -+ -+ jbd_debug(2, "Handle %p starting new commit...\n", handle); -+ log_start_commit(journal, transaction); -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_transaction_locked); -+ lock_journal(journal); -+ goto repeat_locked; -+ } -+ -+ /* -+ * The commit code assumes that it can get enough log space -+ * without forcing a checkpoint. This is *critical* for -+ * correctness: a checkpoint of a buffer which is also -+ * associated with a committing transaction creates a deadlock, -+ * so commit simply cannot force through checkpoints. -+ * -+ * We must therefore ensure the necessary space in the journal -+ * *before* starting to dirty potentially checkpointed buffers -+ * in the new transaction. -+ * -+ * The worst part is, any transaction currently committing can -+ * reduce the free space arbitrarily. Be careful to account for -+ * those buffers when checkpointing. -+ */ -+ -+ /* -+ * @@@ AKPM: This seems rather over-defensive. We're giving commit -+ * a _lot_ of headroom: 1/4 of the journal plus the size of -+ * the committing transaction. Really, we only need to give it -+ * committing_transaction->t_outstanding_credits plus "enough" for -+ * the log control blocks. -+ * Also, this test is inconsitent with the matching one in -+ * journal_extend(). -+ */ -+ needed = journal->j_max_transaction_buffers; -+ if (journal->j_committing_transaction) -+ needed += journal->j_committing_transaction-> -+ t_outstanding_credits; -+ -+ if (log_space_left(journal) < needed) { -+ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); -+ log_wait_for_space(journal, needed); -+ goto repeat_locked; -+ } -+ -+ /* OK, account for the buffers that this operation expects to -+ * use and add the handle to the running transaction. 
*/ -+ -+ handle->h_transaction = transaction; -+ transaction->t_outstanding_credits += nblocks; -+ transaction->t_updates++; -+ transaction->t_handle_count++; -+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", -+ handle, nblocks, transaction->t_outstanding_credits, -+ log_space_left(journal)); -+ -+ unlock_journal(journal); -+ -+ return 0; -+} -+ -+/* -+ * Obtain a new handle. -+ * -+ * We make sure that the transaction can guarantee at least nblocks of -+ * modified buffers in the log. We block until the log can guarantee -+ * that much space. -+ * -+ * This function is visible to journal users (like ext2fs), so is not -+ * called with the journal already locked. -+ * -+ * Return a pointer to a newly allocated handle, or NULL on failure -+ */ -+ -+handle_t *journal_start(journal_t *journal, int nblocks) -+{ -+ handle_t *handle = journal_current_handle(); -+ int err; -+ -+ if (!journal) -+ return ERR_PTR(-EROFS); -+ -+ if (handle) { -+ J_ASSERT(handle->h_transaction->t_journal == journal); -+ handle->h_ref++; -+ return handle; -+ } -+ -+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return ERR_PTR(-ENOMEM); -+ memset (handle, 0, sizeof (handle_t)); -+ -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ current->journal_info = handle; -+ -+ err = start_this_handle(journal, handle); -+ if (err < 0) { -+ kfree(handle); -+ current->journal_info = NULL; -+ return ERR_PTR(err); -+ } -+ -+ return handle; -+} -+ -+/* -+ * Return zero on success -+ */ -+static int try_start_this_handle(journal_t *journal, handle_t *handle) -+{ -+ transaction_t *transaction; -+ int needed; -+ int nblocks = handle->h_buffer_credits; -+ int ret = 0; -+ -+ jbd_debug(3, "New handle %p maybe going live.\n", handle); -+ -+ lock_journal(journal); -+ -+ if (is_journal_aborted(journal) || -+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { -+ ret = -EROFS; -+ goto fail_unlock; -+ } -+ -+ if (journal->j_barrier_count) -+ goto 
fail_unlock; -+ -+ if (!journal->j_running_transaction && get_transaction(journal, 1) == 0) -+ goto fail_unlock; -+ -+ transaction = journal->j_running_transaction; -+ if (transaction->t_state == T_LOCKED) -+ goto fail_unlock; -+ -+ needed = transaction->t_outstanding_credits + nblocks; -+ /* We could run log_start_commit here */ -+ if (needed > journal->j_max_transaction_buffers) -+ goto fail_unlock; -+ -+ needed = journal->j_max_transaction_buffers; -+ if (journal->j_committing_transaction) -+ needed += journal->j_committing_transaction-> -+ t_outstanding_credits; -+ -+ if (log_space_left(journal) < needed) -+ goto fail_unlock; -+ -+ handle->h_transaction = transaction; -+ transaction->t_outstanding_credits += nblocks; -+ transaction->t_updates++; -+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", -+ handle, nblocks, transaction->t_outstanding_credits, -+ log_space_left(journal)); -+ unlock_journal(journal); -+ return 0; -+ -+fail_unlock: -+ unlock_journal(journal); -+ if (ret >= 0) -+ ret = -1; -+ return ret; -+} -+ -+/* -+ * Try to start a handle, but non-blockingly. If we weren't able -+ * to, return an ERR_PTR value. 
-+ */ -+handle_t *journal_try_start(journal_t *journal, int nblocks) -+{ -+ handle_t *handle = journal_current_handle(); -+ int err; -+ -+ if (!journal) -+ return ERR_PTR(-EROFS); -+ -+ if (handle) { -+ jbd_debug(4, "h_ref %d -> %d\n", -+ handle->h_ref, -+ handle->h_ref + 1); -+ J_ASSERT(handle->h_transaction->t_journal == journal); -+ if (is_handle_aborted(handle)) -+ return ERR_PTR(-EIO); -+ handle->h_ref++; -+ return handle; -+ } else { -+ jbd_debug(4, "no current transaction\n"); -+ } -+ -+ if (is_journal_aborted(journal)) -+ return ERR_PTR(-EIO); -+ -+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return ERR_PTR(-ENOMEM); -+ memset (handle, 0, sizeof (handle_t)); -+ -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ current->journal_info = handle; -+ -+ err = try_start_this_handle(journal, handle); -+ if (err < 0) { -+ kfree(handle); -+ current->journal_info = NULL; -+ return ERR_PTR(err); -+ } -+ -+ return handle; -+} -+ -+/* -+ * journal_extend: extend buffer credits. -+ * -+ * Some transactions, such as large extends and truncates, can be done -+ * atomically all at once or in several stages. The operation requests -+ * a credit for a number of buffer modications in advance, but can -+ * extend its credit if it needs more. -+ * -+ * journal_extend tries to give the running handle more buffer credits. -+ * It does not guarantee that allocation: this is a best-effort only. -+ * The calling process MUST be able to deal cleanly with a failure to -+ * extend here. -+ * -+ * Return 0 on success, non-zero on failure. -+ * -+ * return code < 0 implies an error -+ * return code > 0 implies normal transaction-full status. 
-+ */ -+ -+int journal_extend (handle_t *handle, int nblocks) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int result; -+ int wanted; -+ -+ lock_journal (journal); -+ -+ result = -EIO; -+ if (is_handle_aborted(handle)) -+ goto error_out; -+ -+ result = 1; -+ -+ /* Don't extend a locked-down transaction! */ -+ if (handle->h_transaction->t_state != T_RUNNING) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "transaction not running\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ wanted = transaction->t_outstanding_credits + nblocks; -+ -+ if (wanted > journal->j_max_transaction_buffers) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "transaction too large\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ if (wanted > log_space_left(journal)) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "insufficient log space\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ handle->h_buffer_credits += nblocks; -+ transaction->t_outstanding_credits += nblocks; -+ result = 0; -+ -+ jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); -+ -+error_out: -+ unlock_journal (journal); -+ return result; -+} -+ -+ -+/* -+ * journal_restart: restart a handle for a multi-transaction filesystem -+ * operation. -+ * -+ * If the journal_extend() call above fails to grant new buffer credits -+ * to a running handle, a call to journal_restart will commit the -+ * handle's transaction so far and reattach the handle to a new -+ * transaction capabable of guaranteeing the requested number of -+ * credits. -+ */ -+ -+int journal_restart(handle_t *handle, int nblocks) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int ret; -+ -+ /* If we've had an abort of any type, don't even think about -+ * actually doing the restart! 
*/ -+ if (is_handle_aborted(handle)) -+ return 0; -+ -+ /* First unlink the handle from its current transaction, and -+ * start the commit on that. */ -+ -+ J_ASSERT (transaction->t_updates > 0); -+ J_ASSERT (journal_current_handle() == handle); -+ -+ transaction->t_outstanding_credits -= handle->h_buffer_credits; -+ transaction->t_updates--; -+ -+ if (!transaction->t_updates) -+ wake_up(&journal->j_wait_updates); -+ -+ jbd_debug(2, "restarting handle %p\n", handle); -+ log_start_commit(journal, transaction); -+ -+ handle->h_buffer_credits = nblocks; -+ ret = start_this_handle(journal, handle); -+ return ret; -+} -+ -+ -+/* -+ * Barrier operation: establish a transaction barrier. -+ * -+ * This locks out any further updates from being started, and blocks -+ * until all existing updates have completed, returning only once the -+ * journal is in a quiescent state with no updates running. -+ * -+ * The journal lock should not be held on entry. -+ */ -+ -+void journal_lock_updates (journal_t *journal) -+{ -+ lock_journal(journal); -+ ++journal->j_barrier_count; -+ -+ /* Wait until there are no running updates */ -+ while (1) { -+ transaction_t *transaction = journal->j_running_transaction; -+ if (!transaction) -+ break; -+ if (!transaction->t_updates) -+ break; -+ -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_updates); -+ lock_journal(journal); -+ } -+ -+ unlock_journal(journal); -+ -+ /* We have now established a barrier against other normal -+ * updates, but we also need to barrier against other -+ * journal_lock_updates() calls to make sure that we serialise -+ * special journal-locked operations too. */ -+ down(&journal->j_barrier); -+} -+ -+/* -+ * Release a transaction barrier obtained with journal_lock_updates(). -+ * -+ * Should be called without the journal lock held. 
-+ */ -+ -+void journal_unlock_updates (journal_t *journal) -+{ -+ lock_journal(journal); -+ -+ J_ASSERT (journal->j_barrier_count != 0); -+ -+ up(&journal->j_barrier); -+ --journal->j_barrier_count; -+ wake_up(&journal->j_wait_transaction_locked); -+ unlock_journal(journal); -+} -+ -+/* -+ * journal_get_write_access: notify intent to modify a buffer for metadata -+ * (not data) update. -+ * -+ * If the buffer is already part of the current transaction, then there -+ * is nothing we need to do. If it is already part of a prior -+ * transaction which we are still committing to disk, then we need to -+ * make sure that we do not overwrite the old copy: we do copy-out to -+ * preserve the copy going to disk. We also account the buffer against -+ * the handle's metadata buffer credits (unless the buffer is already -+ * part of the transaction, that is). -+ * -+ * Returns an error code or 0 on success. -+ * -+ * In full data journalling mode the buffer may be of type BJ_AsyncData, -+ * because we're write()ing a buffer which is also part of a shared mapping. -+ */ -+ -+static int -+do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int error; -+ char *frozen_buffer = NULL; -+ int need_copy = 0; -+ -+ jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); -+ -+ JBUFFER_TRACE(jh, "entry"); -+repeat: -+ /* @@@ Need to check for errors here at some point. */ -+ -+ /* -+ * AKPM: neither bdflush nor kupdate run with the BKL. There's -+ * nothing we can do to prevent them from starting writeout of a -+ * BUF_DIRTY buffer at any time. And checkpointing buffers are on -+ * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. -+ * -+ * However. It is very wrong for us to allow ext3 to start directly -+ * altering the ->b_data of buffers which may at that very time be -+ * undergoing writeout to the client filesystem. 
This can leave -+ * the filesystem in an inconsistent, transient state if we crash. -+ * So what we do is to steal the buffer if it is in checkpoint -+ * mode and dirty. The journal lock will keep out checkpoint-mode -+ * state transitions within journal_remove_checkpoint() and the buffer -+ * is locked to keep bdflush/kupdate/whoever away from it as well. -+ * -+ * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a -+ * simple lock_journal(). This code here will care for locked buffers. -+ */ -+ /* -+ * The buffer_locked() || buffer_dirty() tests here are simply an -+ * optimisation tweak. If anyone else in the system decides to -+ * lock this buffer later on, we'll blow up. There doesn't seem -+ * to be a good reason why they should do this. -+ */ -+ if (jh->b_cp_transaction && -+ (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { -+ unlock_journal(journal); -+ lock_buffer(jh2bh(jh)); -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { -+ /* OK, we need to steal it */ -+ JBUFFER_TRACE(jh, "stealing from checkpoint mode"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, jh->b_frozen_data == NULL); -+ -+ J_ASSERT(handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ /* This will clear BH_Dirty and set BH_JBDDirty. */ -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ -+ /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ -+ refile_buffer(jh2bh(jh)); -+ -+ /* -+ * The buffer is now hidden from bdflush. It is -+ * metadata against the current transaction. 
-+ */ -+ JBUFFER_TRACE(jh, "steal from cp mode is complete"); -+ } -+ spin_unlock(&journal_datalist_lock); -+ unlock_buffer(jh2bh(jh)); -+ lock_journal(journal); -+ goto repeat; -+ } -+ -+ J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); -+ -+ error = -EROFS; -+ if (is_handle_aborted(handle)) -+ goto out_unlocked; -+ error = 0; -+ -+ spin_lock(&journal_datalist_lock); -+ -+ /* The buffer is already part of this transaction if -+ * b_transaction or b_next_transaction points to it. */ -+ -+ if (jh->b_transaction == transaction || -+ jh->b_next_transaction == transaction) -+ goto done_locked; -+ -+ /* If there is already a copy-out version of this buffer, then -+ * we don't need to make another one. */ -+ -+ if (jh->b_frozen_data) { -+ JBUFFER_TRACE(jh, "has frozen data"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ jh->b_next_transaction = transaction; -+ -+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ goto done_locked; -+ } -+ -+ /* Is there data here we need to preserve? */ -+ -+ if (jh->b_transaction && jh->b_transaction != transaction) { -+ JBUFFER_TRACE(jh, "owned by older transaction"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ -+ /* There is one case we have to be very careful about. -+ * If the committing transaction is currently writing -+ * this buffer out to disk and has NOT made a copy-out, -+ * then we cannot modify the buffer contents at all -+ * right now. The essence of copy-out is that it is the -+ * extra copy, not the primary copy, which gets -+ * journaled. If the primary copy is already going to -+ * disk then we cannot do copy-out here. 
*/ -+ -+ if (jh->b_jlist == BJ_Shadow) { -+ JBUFFER_TRACE(jh, "on shadow: sleep"); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ /* commit wakes up all shadow buffers after IO */ -+ sleep_on(&jh2bh(jh)->b_wait); -+ lock_journal(journal); -+ goto repeat; -+ } -+ -+ /* Only do the copy if the currently-owning transaction -+ * still needs it. If it is on the Forget list, the -+ * committing transaction is past that stage. The -+ * buffer had better remain locked during the kmalloc, -+ * but that should be true --- we hold the journal lock -+ * still and the buffer is already on the BUF_JOURNAL -+ * list so won't be flushed. -+ * -+ * Subtle point, though: if this is a get_undo_access, -+ * then we will be relying on the frozen_data to contain -+ * the new value of the committed_data record after the -+ * transaction, so we HAVE to force the frozen_data copy -+ * in that case. */ -+ -+ if (jh->b_jlist != BJ_Forget || force_copy) { -+ JBUFFER_TRACE(jh, "generate frozen data"); -+ if (!frozen_buffer) { -+ JBUFFER_TRACE(jh, "allocate memory for buffer"); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, -+ GFP_NOFS); -+ lock_journal(journal); -+ if (!frozen_buffer) { -+ printk(KERN_EMERG __FUNCTION__ -+ "OOM for frozen_buffer\n"); -+ JBUFFER_TRACE(jh, "oom!"); -+ error = -ENOMEM; -+ spin_lock(&journal_datalist_lock); -+ goto done_locked; -+ } -+ goto repeat; -+ } -+ -+ jh->b_frozen_data = frozen_buffer; -+ frozen_buffer = NULL; -+ need_copy = 1; -+ } -+ jh->b_next_transaction = transaction; -+ } -+ -+ J_ASSERT(handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ /* Finally, if the buffer is not journaled right now, we need to -+ * make sure it doesn't get written to disk before the caller -+ * actually commits the new data. 
*/ -+ -+ if (!jh->b_transaction) { -+ JBUFFER_TRACE(jh, "no transaction"); -+ J_ASSERT_JH(jh, !jh->b_next_transaction); -+ jh->b_transaction = transaction; -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ } -+ -+done_locked: -+ spin_unlock(&journal_datalist_lock); -+ if (need_copy) { -+ struct page *page; -+ int offset; -+ char *source; -+ -+ J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh))); -+ page = jh2bh(jh)->b_page; -+ offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; -+ source = kmap(page); -+ memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); -+ kunmap(page); -+ } -+ -+ -+ /* If we are about to journal a buffer, then any revoke pending -+ on it is no longer valid. */ -+ journal_cancel_revoke(handle, jh); -+ -+out_unlocked: -+ if (frozen_buffer) -+ kfree(frozen_buffer); -+ -+ JBUFFER_TRACE(jh, "exit"); -+ return error; -+} -+ -+int journal_get_write_access (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ int rc; -+ -+ /* We do not want to get caught playing with fields which the -+ * log thread also manipulates. Make sure that the buffer -+ * completes any outstanding IO before proceeding. */ -+ lock_journal(journal); -+ rc = do_get_write_access(handle, jh, 0); -+ journal_unlock_journal_head(jh); -+ unlock_journal(journal); -+ return rc; -+} -+ -+ -+/* -+ * When the user wants to journal a newly created buffer_head -+ * (ie. getblk() returned a new buffer and we are going to populate it -+ * manually rather than reading off disk), then we need to keep the -+ * buffer_head locked until it has been completely filled with new -+ * data. In this case, we should be able to make the assertion that -+ * the bh is not already part of an existing transaction. -+ * -+ * The buffer should already be locked by the caller by this point. 
-+ * There is no lock ranking violation: it was a newly created, -+ * unlocked buffer beforehand. */ -+ -+int journal_get_create_access (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ int err; -+ -+ jbd_debug(5, "journal_head %p\n", jh); -+ lock_journal(journal); -+ err = -EROFS; -+ if (is_handle_aborted(handle)) -+ goto out; -+ err = 0; -+ -+ JBUFFER_TRACE(jh, "entry"); -+ /* The buffer may already belong to this transaction due to -+ * pre-zeroing in the filesystem's new_block code. It may also -+ * be on the previous, committing transaction's lists, but it -+ * HAS to be in Forget state in that case: the transaction must -+ * have deleted the buffer for it to be reused here. */ -+ J_ASSERT_JH(jh, (jh->b_transaction == transaction || -+ jh->b_transaction == NULL || -+ (jh->b_transaction == journal->j_committing_transaction && -+ jh->b_jlist == BJ_Forget))); -+ -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); -+ -+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_transaction == NULL) { -+ jh->b_transaction = transaction; -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ JBUFFER_TRACE(jh, "refile"); -+ refile_buffer(jh2bh(jh)); -+ } else if (jh->b_transaction == journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "set next transaction"); -+ jh->b_next_transaction = transaction; -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ /* -+ * akpm: I added this. ext3_alloc_branch can pick up new indirect -+ * blocks which contain freed but then revoked metadata. 
We need -+ * to cancel the revoke in case we end up freeing it yet again -+ * and the reallocating as data - this would cause a second revoke, -+ * which hits an assertion error. -+ */ -+ JBUFFER_TRACE(jh, "cancelling revoke"); -+ journal_cancel_revoke(handle, jh); -+ journal_unlock_journal_head(jh); -+out: -+ unlock_journal(journal); -+ return err; -+} -+ -+ -+ -+/* -+ * journal_get_undo_access: Notify intent to modify metadata with non- -+ * rewindable consequences -+ * -+ * Sometimes there is a need to distinguish between metadata which has -+ * been committed to disk and that which has not. The ext3fs code uses -+ * this for freeing and allocating space: we have to make sure that we -+ * do not reuse freed space until the deallocation has been committed, -+ * since if we overwrote that space we would make the delete -+ * un-rewindable in case of a crash. -+ * -+ * To deal with that, journal_get_undo_access requests write access to a -+ * buffer for parts of non-rewindable operations such as delete -+ * operations on the bitmaps. The journaling code must keep a copy of -+ * the buffer's contents prior to the undo_access call until such time -+ * as we know that the buffer has definitely been committed to disk. -+ * -+ * We never need to know which transaction the committed data is part -+ * of: buffers touched here are guaranteed to be dirtied later and so -+ * will be committed to a new transaction in due course, at which point -+ * we can discard the old committed data pointer. -+ * -+ * Returns error number or 0 on success. -+ */ -+ -+int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) -+{ -+ journal_t *journal = handle->h_transaction->t_journal; -+ int err; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ -+ JBUFFER_TRACE(jh, "entry"); -+ lock_journal(journal); -+ -+ /* Do this first --- it can drop the journal lock, so we want to -+ * make sure that obtaining the committed_data is done -+ * atomically wrt. 
completion of any outstanding commits. */ -+ err = do_get_write_access (handle, jh, 1); -+ if (err) -+ goto out; -+ -+ if (!jh->b_committed_data) { -+ /* Copy out the current buffer contents into the -+ * preserved, committed copy. */ -+ JBUFFER_TRACE(jh, "generate b_committed data"); -+ jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size, -+ GFP_NOFS); -+ if (!jh->b_committed_data) { -+ printk(KERN_EMERG __FUNCTION__ -+ ": No memory for committed data!\n"); -+ err = -ENOMEM; -+ goto out; -+ } -+ -+ memcpy (jh->b_committed_data, jh2bh(jh)->b_data, -+ jh2bh(jh)->b_size); -+ } -+ -+out: -+ if (!err) -+ J_ASSERT_JH(jh, jh->b_committed_data); -+ journal_unlock_journal_head(jh); -+ unlock_journal(journal); -+ return err; -+} -+ -+/* -+ * journal_dirty_data: mark a buffer as containing dirty data which -+ * needs to be flushed before we can commit the current transaction. -+ * -+ * The buffer is placed on the transaction's data list and is marked as -+ * belonging to the transaction. -+ * -+ * If `async' is set then the writebask will be initiated by the caller -+ * using submit_bh -> end_buffer_io_async. We put the buffer onto -+ * t_async_datalist. -+ * -+ * Returns error number or 0 on success. -+ * -+ * journal_dirty_data() can be called via page_launder->ext3_writepage -+ * by kswapd. So it cannot block. Happily, there's nothing here -+ * which needs lock_journal if `async' is set. -+ * -+ * When the buffer is on the current transaction we freely move it -+ * between BJ_AsyncData and BJ_SyncData according to who tried to -+ * change its state last. -+ */ -+ -+int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async) -+{ -+ journal_t *journal = handle->h_transaction->t_journal; -+ int need_brelse = 0; -+ int wanted_jlist = async ? 
BJ_AsyncData : BJ_SyncData; -+ struct journal_head *jh; -+ -+ if (is_handle_aborted(handle)) -+ return 0; -+ -+ jh = journal_add_journal_head(bh); -+ JBUFFER_TRACE(jh, "entry"); -+ -+ /* -+ * The buffer could *already* be dirty. Writeout can start -+ * at any time. -+ */ -+ jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); -+ -+ /* -+ * What if the buffer is already part of a running transaction? -+ * -+ * There are two cases: -+ * 1) It is part of the current running transaction. Refile it, -+ * just in case we have allocated it as metadata, deallocated -+ * it, then reallocated it as data. -+ * 2) It is part of the previous, still-committing transaction. -+ * If all we want to do is to guarantee that the buffer will be -+ * written to disk before this new transaction commits, then -+ * being sure that the *previous* transaction has this same -+ * property is sufficient for us! Just leave it on its old -+ * transaction. -+ * -+ * In case (2), the buffer must not already exist as metadata -+ * --- that would violate write ordering (a transaction is free -+ * to write its data at any point, even before the previous -+ * committing transaction has committed). The caller must -+ * never, ever allow this to happen: there's nothing we can do -+ * about it in this layer. -+ */ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_transaction) { -+ JBUFFER_TRACE(jh, "has transaction"); -+ if (jh->b_transaction != handle->h_transaction) { -+ JBUFFER_TRACE(jh, "belongs to older transaction"); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ -+ /* @@@ IS THIS TRUE ? */ -+ /* -+ * Not any more. Scenario: someone does a write() -+ * in data=journal mode. The buffer's transaction has -+ * moved into commit. Then someone does another -+ * write() to the file. We do the frozen data copyout -+ * and set b_next_transaction to point to j_running_t. 
-+ * And while we're in that state, someone does a -+ * writepage() in an attempt to pageout the same area -+ * of the file via a shared mapping. At present that -+ * calls journal_dirty_data(), and we get right here. -+ * It may be too late to journal the data. Simply -+ * falling through to the next test will suffice: the -+ * data will be dirty and wil be checkpointed. The -+ * ordering comments in the next comment block still -+ * apply. -+ */ -+ //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ -+ /* -+ * If we're journalling data, and this buffer was -+ * subject to a write(), it could be metadata, forget -+ * or shadow against the committing transaction. Now, -+ * someone has dirtied the same darn page via a mapping -+ * and it is being writepage()'d. -+ * We *could* just steal the page from commit, with some -+ * fancy locking there. Instead, we just skip it - -+ * don't tie the page's buffers to the new transaction -+ * at all. -+ * Implication: if we crash before the writepage() data -+ * is written into the filesystem, recovery will replay -+ * the write() data. -+ */ -+ if (jh->b_jlist != BJ_None && -+ jh->b_jlist != BJ_SyncData && -+ jh->b_jlist != BJ_AsyncData) { -+ JBUFFER_TRACE(jh, "Not stealing"); -+ goto no_journal; -+ } -+ -+ /* -+ * This buffer may be undergoing writeout in commit. We -+ * can't return from here and let the caller dirty it -+ * again because that can cause the write-out loop in -+ * commit to never terminate. 
-+ */ -+ if (!async && buffer_dirty(bh)) { -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ need_brelse = 1; -+ ll_rw_block(WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ spin_lock(&journal_datalist_lock); -+ /* The buffer may become locked again at any -+ time if it is redirtied */ -+ } -+ -+ /* journal_clean_data_list() may have got there first */ -+ if (jh->b_transaction != NULL) { -+ JBUFFER_TRACE(jh, "unfile from commit"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ } -+ /* The buffer will be refiled below */ -+ -+ } -+ /* -+ * Special case --- the buffer might actually have been -+ * allocated and then immediately deallocated in the previous, -+ * committing transaction, so might still be left on that -+ * transaction's metadata lists. -+ */ -+ if (jh->b_jlist != wanted_jlist) { -+ JBUFFER_TRACE(jh, "not on correct data list: unfile"); -+ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ JBUFFER_TRACE(jh, "file as data"); -+ __journal_file_buffer(jh, handle->h_transaction, -+ wanted_jlist); -+ } -+ } else { -+ JBUFFER_TRACE(jh, "not on a transaction"); -+ __journal_file_buffer(jh, handle->h_transaction, wanted_jlist); -+ } -+no_journal: -+ spin_unlock(&journal_datalist_lock); -+ if (need_brelse) { -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ } -+ JBUFFER_TRACE(jh, "exit"); -+ journal_unlock_journal_head(jh); -+ return 0; -+} -+ -+/* -+ * journal_dirty_metadata: mark a buffer as containing dirty metadata -+ * which needs to be journaled as part of the current transaction. -+ * -+ * The buffer is placed on the transaction's metadata list and is marked -+ * as belonging to the transaction. -+ * -+ * Special care needs to be taken if the buffer already belongs to the -+ * current committing transaction (in which case we should have frozen -+ * data present for that commit). 
In that case, we don't relink the -+ * buffer: that only gets done when the old transaction finally -+ * completes its commit. -+ * -+ * Returns error number or 0 on success. -+ */ -+ -+int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = bh2jh(bh); -+ -+ jbd_debug(5, "journal_head %p\n", jh); -+ JBUFFER_TRACE(jh, "entry"); -+ lock_journal(journal); -+ if (is_handle_aborted(handle)) -+ goto out_unlock; -+ -+ spin_lock(&journal_datalist_lock); -+ set_bit(BH_JBDDirty, &bh->b_state); -+ set_buffer_flushtime(bh); -+ -+ J_ASSERT_JH(jh, jh->b_transaction != NULL); -+ -+ /* -+ * Metadata already on the current transaction list doesn't -+ * need to be filed. Metadata on another transaction's list must -+ * be committing, and will be refiled once the commit completes: -+ * leave it alone for now. -+ */ -+ -+ if (jh->b_transaction != transaction) { -+ JBUFFER_TRACE(jh, "already on other transaction"); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ J_ASSERT_JH(jh, jh->b_next_transaction == transaction); -+ /* And this case is illegal: we can't reuse another -+ * transaction's data buffer, ever. */ -+ /* FIXME: writepage() should be journalled */ -+ J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData); -+ goto done_locked; -+ } -+ -+ /* That test should have eliminated the following case: */ -+ J_ASSERT_JH(jh, jh->b_frozen_data == 0); -+ -+ JBUFFER_TRACE(jh, "file as BJ_Metadata"); -+ __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); -+ -+done_locked: -+ spin_unlock(&journal_datalist_lock); -+ JBUFFER_TRACE(jh, "exit"); -+out_unlock: -+ unlock_journal(journal); -+ return 0; -+} -+ -+#if 0 -+/* -+ * journal_release_buffer: undo a get_write_access without any buffer -+ * updates, if the update decided in the end that it didn't need access. 
-+ * -+ * journal_get_write_access() can block, so it is quite possible for a -+ * journaling component to decide after the write access is returned -+ * that global state has changed and the update is no longer required. */ -+ -+void journal_release_buffer (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = bh2jh(bh); -+ -+ lock_journal(journal); -+ JBUFFER_TRACE(jh, "entry"); -+ -+ /* If the buffer is reserved but not modified by this -+ * transaction, then it is safe to release it. In all other -+ * cases, just leave the buffer as it is. */ -+ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction && -+ !buffer_jdirty(jh2bh(jh))) { -+ JBUFFER_TRACE(jh, "unused: refiling it"); -+ handle->h_buffer_credits++; -+ __journal_refile_buffer(jh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ JBUFFER_TRACE(jh, "exit"); -+ unlock_journal(journal); -+} -+#endif -+ -+/* -+ * journal_forget: bforget() for potentially-journaled buffers. We can -+ * only do the bforget if there are no commits pending against the -+ * buffer. If the buffer is dirty in the current running transaction we -+ * can safely unlink it. -+ * -+ * bh may not be a journalled buffer at all - it may be a non-JBD -+ * buffer which came off the hashtable. Check for this. -+ * -+ * Decrements bh->b_count by one. -+ * -+ * Allow this call even if the handle has aborted --- it may be part of -+ * the caller's cleanup after an abort. 
-+ */ -+ -+void journal_forget (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh; -+ -+ BUFFER_TRACE(bh, "entry"); -+ -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ -+ if (!buffer_jbd(bh)) -+ goto not_jbd; -+ jh = bh2jh(bh); -+ -+ if (jh->b_transaction == handle->h_transaction) { -+ J_ASSERT_JH(jh, !jh->b_frozen_data); -+ -+ /* If we are forgetting a buffer which is already part -+ * of this transaction, then we can just drop it from -+ * the transaction immediately. */ -+ clear_bit(BH_Dirty, &bh->b_state); -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ -+ JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); -+ J_ASSERT_JH(jh, !jh->b_committed_data); -+ -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ -+ /* -+ * We are no longer going to journal this buffer. -+ * However, the commit of this transaction is still -+ * important to the buffer: the delete that we are now -+ * processing might obsolete an old log entry, so by -+ * committing, we can satisfy the buffer's checkpoint. -+ * -+ * So, if we have a checkpoint on the buffer, we should -+ * now refile the buffer on our BJ_Forget list so that -+ * we know to remove the checkpoint after we commit. -+ */ -+ -+ if (jh->b_cp_transaction) { -+ __journal_file_buffer(jh, transaction, BJ_Forget); -+ } else { -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ if (!buffer_jbd(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ __bforget(bh); -+ return; -+ } -+ } -+ -+ } else if (jh->b_transaction) { -+ J_ASSERT_JH(jh, (jh->b_transaction == -+ journal->j_committing_transaction)); -+ /* However, if the buffer is still owned by a prior -+ * (committing) transaction, we can't drop it yet... */ -+ JBUFFER_TRACE(jh, "belongs to older transaction"); -+ /* ... 
but we CAN drop it from the new transaction if we -+ * have also modified it since the original commit. */ -+ -+ if (jh->b_next_transaction) { -+ J_ASSERT(jh->b_next_transaction == transaction); -+ jh->b_next_transaction = NULL; -+ } -+ } -+ -+not_jbd: -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ __brelse(bh); -+ return; -+} -+ -+#if 0 /* Unused */ -+/* -+ * journal_sync_buffer: flush a potentially-journaled buffer to disk. -+ * -+ * Used for O_SYNC filesystem operations. If the buffer is journaled, -+ * we need to complete the O_SYNC by waiting for the transaction to -+ * complete. It is an error to call journal_sync_buffer before -+ * journal_stop! -+ */ -+ -+void journal_sync_buffer(struct buffer_head *bh) -+{ -+ transaction_t *transaction; -+ journal_t *journal; -+ long sequence; -+ struct journal_head *jh; -+ -+ /* If the buffer isn't journaled, this is easy: just sync it to -+ * disk. */ -+ BUFFER_TRACE(bh, "entry"); -+ -+ spin_lock(&journal_datalist_lock); -+ if (!buffer_jbd(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ return; -+ } -+ jh = bh2jh(bh); -+ if (jh->b_transaction == NULL) { -+ /* If the buffer has already been journaled, then this -+ * is a noop. */ -+ if (jh->b_cp_transaction == NULL) { -+ spin_unlock(&journal_datalist_lock); -+ return; -+ } -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ ll_rw_block (WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ __brelse(bh); -+ goto out; -+ } -+ -+ /* Otherwise, just wait until the transaction is synced to disk. 
*/ -+ transaction = jh->b_transaction; -+ journal = transaction->t_journal; -+ sequence = transaction->t_tid; -+ spin_unlock(&journal_datalist_lock); -+ -+ jbd_debug(2, "requesting commit for jh %p\n", jh); -+ log_start_commit (journal, transaction); -+ -+ while (tid_gt(sequence, journal->j_commit_sequence)) { -+ wake_up(&journal->j_wait_done_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+ JBUFFER_TRACE(jh, "exit"); -+out: -+ return; -+} -+#endif -+ -+/* -+ * All done for a particular handle. -+ * -+ * There is not much action needed here. We just return any remaining -+ * buffer credits to the transaction and remove the handle. The only -+ * complication is that we need to start a commit operation if the -+ * filesystem is marked for synchronous update. -+ * -+ * journal_stop itself will not usually return an error, but it may -+ * do so in unusual circumstances. In particular, expect it to -+ * return -EIO if a journal_abort has been executed since the -+ * transaction began. -+ */ -+ -+int journal_stop(handle_t *handle) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int old_handle_count, err; -+ -+ if (!handle) -+ return 0; -+ -+ J_ASSERT (transaction->t_updates > 0); -+ J_ASSERT (journal_current_handle() == handle); -+ -+ if (is_handle_aborted(handle)) -+ err = -EIO; -+ else -+ err = 0; -+ -+ if (--handle->h_ref > 0) { -+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, -+ handle->h_ref); -+ return err; -+ } -+ -+ jbd_debug(4, "Handle %p going down\n", handle); -+ -+ /* -+ * Implement synchronous transaction batching. If the handle -+ * was synchronous, don't force a commit immediately. Let's -+ * yield and let another thread piggyback onto this transaction. -+ * Keep doing that while new threads continue to arrive. -+ * It doesn't cost much - we're about to run a commit and sleep -+ * on IO anyway. Speeds up many-threaded, many-dir operations -+ * by 30x or more... 
-+ */ -+ if (handle->h_sync) { -+ do { -+ old_handle_count = transaction->t_handle_count; -+ set_current_state(TASK_RUNNING); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } while (old_handle_count != transaction->t_handle_count); -+ } -+ -+ current->journal_info = NULL; -+ transaction->t_outstanding_credits -= handle->h_buffer_credits; -+ transaction->t_updates--; -+ if (!transaction->t_updates) { -+ wake_up(&journal->j_wait_updates); -+ if (journal->j_barrier_count) -+ wake_up(&journal->j_wait_transaction_locked); -+ } -+ -+ /* -+ * If the handle is marked SYNC, we need to set another commit -+ * going! We also want to force a commit if the current -+ * transaction is occupying too much of the log, or if the -+ * transaction is too old now. -+ */ -+ if (handle->h_sync || -+ transaction->t_outstanding_credits > -+ journal->j_max_transaction_buffers || -+ time_after_eq(jiffies, transaction->t_expires)) { -+ /* Do this even for aborted journals: an abort still -+ * completes the commit thread, it just doesn't write -+ * anything to disk. */ -+ tid_t tid = transaction->t_tid; -+ -+ jbd_debug(2, "transaction too old, requesting commit for " -+ "handle %p\n", handle); -+ /* This is non-blocking */ -+ log_start_commit(journal, transaction); -+ -+ /* -+ * Special case: JFS_SYNC synchronous updates require us -+ * to wait for the commit to complete. -+ */ -+ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) -+ log_wait_commit(journal, tid); -+ } -+ kfree(handle); -+ return err; -+} -+ -+/* -+ * For synchronous operations: force any uncommitted trasnactions -+ * to disk. May seem kludgy, but it reuses all the handle batching -+ * code in a very simple manner. 
-+ */ -+int journal_force_commit(journal_t *journal) -+{ -+ handle_t *handle; -+ int ret = 0; -+ -+ lock_kernel(); -+ handle = journal_start(journal, 1); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out; -+ } -+ handle->h_sync = 1; -+ journal_stop(handle); -+out: -+ unlock_kernel(); -+ return ret; -+} -+ -+/* -+ * -+ * List management code snippets: various functions for manipulating the -+ * transaction buffer lists. -+ * -+ */ -+ -+/* -+ * Append a buffer to a transaction list, given the transaction's list head -+ * pointer. -+ * journal_datalist_lock is held. -+ */ -+ -+static inline void -+__blist_add_buffer(struct journal_head **list, struct journal_head *jh) -+{ -+ if (!*list) { -+ jh->b_tnext = jh->b_tprev = jh; -+ *list = jh; -+ } else { -+ /* Insert at the tail of the list to preserve order */ -+ struct journal_head *first = *list, *last = first->b_tprev; -+ jh->b_tprev = last; -+ jh->b_tnext = first; -+ last->b_tnext = first->b_tprev = jh; -+ } -+} -+ -+/* -+ * Remove a buffer from a transaction list, given the transaction's list -+ * head pointer. -+ * -+ * Called with journal_datalist_lock held, and the journal may not -+ * be locked. -+ */ -+ -+static inline void -+__blist_del_buffer(struct journal_head **list, struct journal_head *jh) -+{ -+ if (*list == jh) { -+ *list = jh->b_tnext; -+ if (*list == jh) -+ *list = 0; -+ } -+ jh->b_tprev->b_tnext = jh->b_tnext; -+ jh->b_tnext->b_tprev = jh->b_tprev; -+} -+ -+/* -+ * Remove a buffer from the appropriate transaction list. -+ * -+ * Note that this function can *change* the value of -+ * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget, -+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller -+ * is holding onto a copy of one of thee pointers, it could go bad. -+ * Generally the caller needs to re-read the pointer from the transaction_t. 
-+ * -+ * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called -+ * via journal_try_to_free_buffer() or journal_clean_data_list(). In that -+ * case, journal_datalist_lock will be held, and the journal may not be locked. -+ */ -+void __journal_unfile_buffer(struct journal_head *jh) -+{ -+ struct journal_head **list = 0; -+ transaction_t * transaction; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ transaction = jh->b_transaction; -+ -+#ifdef __SMP__ -+ J_ASSERT (current->lock_depth >= 0); -+#endif -+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); -+ -+ if (jh->b_jlist != BJ_None) -+ J_ASSERT_JH(jh, transaction != 0); -+ -+ switch (jh->b_jlist) { -+ case BJ_None: -+ return; -+ case BJ_SyncData: -+ list = &transaction->t_sync_datalist; -+ break; -+ case BJ_AsyncData: -+ list = &transaction->t_async_datalist; -+ break; -+ case BJ_Metadata: -+ transaction->t_nr_buffers--; -+ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); -+ list = &transaction->t_buffers; -+ break; -+ case BJ_Forget: -+ list = &transaction->t_forget; -+ break; -+ case BJ_IO: -+ list = &transaction->t_iobuf_list; -+ break; -+ case BJ_Shadow: -+ list = &transaction->t_shadow_list; -+ break; -+ case BJ_LogCtl: -+ list = &transaction->t_log_list; -+ break; -+ case BJ_Reserved: -+ list = &transaction->t_reserved_list; -+ break; -+ } -+ -+ __blist_del_buffer(list, jh); -+ jh->b_jlist = BJ_None; -+ if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) { -+ set_bit(BH_Dirty, &jh2bh(jh)->b_state); -+ } -+} -+ -+void journal_unfile_buffer(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_unfile_buffer(jh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * Called from journal_try_to_free_buffers(). The journal is not -+ * locked. lru_list_lock is not held. -+ * -+ * Here we see why journal_datalist_lock is global and not per-journal. 
-+ * We cannot get back to this buffer's journal pointer without locking -+ * out journal_clean_data_list() in some manner. -+ * -+ * One could use journal_datalist_lock to get unracy access to a -+ * per-journal lock. -+ * -+ * Called with journal_datalist_lock held. -+ * -+ * Returns non-zero iff we were able to free the journal_head. -+ */ -+static int __journal_try_to_free_buffer(struct buffer_head *bh, -+ int *locked_or_dirty) -+{ -+ struct journal_head *jh; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+ jh = bh2jh(bh); -+ -+ if (buffer_locked(bh) || buffer_dirty(bh)) { -+ *locked_or_dirty = 1; -+ goto out; -+ } -+ -+ if (!buffer_uptodate(bh)) -+ goto out; -+ -+ if (jh->b_next_transaction != 0) -+ goto out; -+ -+ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { -+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) { -+ /* A written-back ordered data buffer */ -+ JBUFFER_TRACE(jh, "release data"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ } -+ else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { -+ /* written-back checkpointed metadata buffer */ -+ if (jh->b_jlist == BJ_None) { -+ JBUFFER_TRACE(jh, "remove from checkpoint list"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ } -+ return !buffer_jbd(bh); -+ -+out: -+ return 0; -+} -+ -+/* -+ * journal_try_to_free_buffers(). For all the buffers on this page, -+ * if they are fully written out ordered data, move them onto BUF_CLEAN -+ * so try_to_free_buffers() can reap them. Called with lru_list_lock -+ * not held. Does its own locking. -+ * -+ * This complicates JBD locking somewhat. We aren't protected by the -+ * BKL here. We wish to remove the buffer from its committing or -+ * running transaction's ->t_datalist via __journal_unfile_buffer. 
-+ * -+ * This may *change* the value of transaction_t->t_datalist, so anyone -+ * who looks at t_datalist needs to lock against this function. -+ * -+ * Even worse, someone may be doing a journal_dirty_data on this -+ * buffer. So we need to lock against that. journal_dirty_data() -+ * will come out of the lock with the buffer dirty, which makes it -+ * ineligible for release here. -+ * -+ * Who else is affected by this? hmm... Really the only contender -+ * is do_get_write_access() - it could be looking at the buffer while -+ * journal_try_to_free_buffer() is changing its state. But that -+ * cannot happen because we never reallocate freed data as metadata -+ * while the data is part of a transaction. Yes? -+ * -+ * This function returns non-zero if we wish try_to_free_buffers() -+ * to be called. We do this is the page is releasable by try_to_free_buffers(). -+ * We also do it if the page has locked or dirty buffers and the caller wants -+ * us to perform sync or async writeout. -+ */ -+int journal_try_to_free_buffers(journal_t *journal, -+ struct page *page, int gfp_mask) -+{ -+ struct buffer_head *bh; -+ struct buffer_head *tmp; -+ int locked_or_dirty = 0; -+ int call_ttfb = 1; -+ -+ J_ASSERT(PageLocked(page)); -+ -+ bh = page->buffers; -+ tmp = bh; -+ spin_lock(&journal_datalist_lock); -+ do { -+ struct buffer_head *p = tmp; -+ -+ tmp = tmp->b_this_page; -+ if (buffer_jbd(p)) -+ if (!__journal_try_to_free_buffer(p, &locked_or_dirty)) -+ call_ttfb = 0; -+ } while (tmp != bh); -+ spin_unlock(&journal_datalist_lock); -+ -+ if (!(gfp_mask & (__GFP_IO|__GFP_WAIT))) -+ goto out; -+ if (!locked_or_dirty) -+ goto out; -+ /* -+ * The VM wants us to do writeout, or to block on IO, or both. -+ * So we allow try_to_free_buffers to be called even if the page -+ * still has journalled buffers. -+ */ -+ call_ttfb = 1; -+out: -+ return call_ttfb; -+} -+ -+/* -+ * This buffer is no longer needed. 
If it is on an older transaction's -+ * checkpoint list we need to record it on this transaction's forget list -+ * to pin this buffer (and hence its checkpointing transaction) down until -+ * this transaction commits. If the buffer isn't on a checkpoint list, we -+ * release it. -+ * Returns non-zero if JBD no longer has an interest in the buffer. -+ */ -+static int dispose_buffer(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ int may_free = 1; -+ struct buffer_head *bh = jh2bh(jh); -+ -+ spin_lock(&journal_datalist_lock); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ -+ if (jh->b_cp_transaction) { -+ JBUFFER_TRACE(jh, "on running+cp transaction"); -+ __journal_file_buffer(jh, transaction, BJ_Forget); -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ may_free = 0; -+ } else { -+ JBUFFER_TRACE(jh, "on running transaction"); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ return may_free; -+} -+ -+/* -+ * journal_flushpage -+ * -+ * This code is tricky. It has a number of cases to deal with. -+ * -+ * There are two invariants which this code relies on: -+ * -+ * i_size must be updated on disk before we start calling flushpage on the -+ * data. -+ * -+ * This is done in ext3 by defining an ext3_setattr method which -+ * updates i_size before truncate gets going. By maintaining this -+ * invariant, we can be sure that it is safe to throw away any buffers -+ * attached to the current transaction: once the transaction commits, -+ * we know that the data will not be needed. -+ * -+ * Note however that we can *not* throw away data belonging to the -+ * previous, committing transaction! 
-+ * -+ * Any disk blocks which *are* part of the previous, committing -+ * transaction (and which therefore cannot be discarded immediately) are -+ * not going to be reused in the new running transaction -+ * -+ * The bitmap committed_data images guarantee this: any block which is -+ * allocated in one transaction and removed in the next will be marked -+ * as in-use in the committed_data bitmap, so cannot be reused until -+ * the next transaction to delete the block commits. This means that -+ * leaving committing buffers dirty is quite safe: the disk blocks -+ * cannot be reallocated to a different file and so buffer aliasing is -+ * not possible. -+ * -+ * -+ * The above applies mainly to ordered data mode. In writeback mode we -+ * don't make guarantees about the order in which data hits disk --- in -+ * particular we don't guarantee that new dirty data is flushed before -+ * transaction commit --- so it is always safe just to discard data -+ * immediately in that mode. --sct -+ */ -+ -+/* -+ * The journal_unmap_buffer helper function returns zero if the buffer -+ * concerned remains pinned as an anonymous buffer belonging to an older -+ * transaction. -+ * -+ * We're outside-transaction here. Either or both of j_running_transaction -+ * and j_committing_transaction may be NULL. -+ */ -+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) -+{ -+ transaction_t *transaction; -+ struct journal_head *jh; -+ int may_free = 1; -+ -+ BUFFER_TRACE(bh, "entry"); -+ -+ if (!buffer_mapped(bh)) -+ return 1; -+ -+ /* It is safe to proceed here without the -+ * journal_datalist_spinlock because the buffers cannot be -+ * stolen by try_to_free_buffers as long as we are holding the -+ * page lock. --sct */ -+ -+ if (!buffer_jbd(bh)) -+ goto zap_buffer; -+ -+ jh = bh2jh(bh); -+ transaction = jh->b_transaction; -+ if (transaction == NULL) { -+ /* First case: not on any transaction. 
If it -+ * has no checkpoint link, then we can zap it: -+ * it's a writeback-mode buffer so we don't care -+ * if it hits disk safely. */ -+ if (!jh->b_cp_transaction) { -+ JBUFFER_TRACE(jh, "not on any transaction: zap"); -+ goto zap_buffer; -+ } -+ -+ if (!buffer_dirty(bh)) { -+ /* bdflush has written it. We can drop it now */ -+ goto zap_buffer; -+ } -+ -+ /* OK, it must be in the journal but still not -+ * written fully to disk: it's metadata or -+ * journaled data... */ -+ -+ if (journal->j_running_transaction) { -+ /* ... and once the current transaction has -+ * committed, the buffer won't be needed any -+ * longer. */ -+ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); -+ return dispose_buffer(jh, -+ journal->j_running_transaction); -+ } else { -+ /* There is no currently-running transaction. So the -+ * orphan record which we wrote for this file must have -+ * passed into commit. We must attach this buffer to -+ * the committing transaction, if it exists. */ -+ if (journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "give to committing trans"); -+ return dispose_buffer(jh, -+ journal->j_committing_transaction); -+ } else { -+ /* The orphan record's transaction has -+ * committed. We can cleanse this buffer */ -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ goto zap_buffer; -+ } -+ } -+ } else if (transaction == journal->j_committing_transaction) { -+ /* If it is committing, we simply cannot touch it. We -+ * can remove it's next_transaction pointer from the -+ * running transaction if that is set, but nothing -+ * else. */ -+ JBUFFER_TRACE(jh, "on committing transaction"); -+ if (jh->b_next_transaction) { -+ J_ASSERT(jh->b_next_transaction == -+ journal->j_running_transaction); -+ jh->b_next_transaction = NULL; -+ } -+ return 0; -+ } else { -+ /* Good, the buffer belongs to the running transaction. 
-+ * We are writing our own transaction's data, not any -+ * previous one's, so it is safe to throw it away -+ * (remember that we expect the filesystem to have set -+ * i_size already for this truncate so recovery will not -+ * expose the disk blocks we are discarding here.) */ -+ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); -+ may_free = dispose_buffer(jh, transaction); -+ } -+ -+zap_buffer: -+ if (buffer_dirty(bh)) -+ mark_buffer_clean(bh); -+ J_ASSERT_BH(bh, !buffer_jdirty(bh)); -+ clear_bit(BH_Uptodate, &bh->b_state); -+ clear_bit(BH_Mapped, &bh->b_state); -+ clear_bit(BH_Req, &bh->b_state); -+ clear_bit(BH_New, &bh->b_state); -+ return may_free; -+} -+ -+/* -+ * Return non-zero if the page's buffers were successfully reaped -+ */ -+int journal_flushpage(journal_t *journal, -+ struct page *page, -+ unsigned long offset) -+{ -+ struct buffer_head *head, *bh, *next; -+ unsigned int curr_off = 0; -+ int may_free = 1; -+ -+ if (!PageLocked(page)) -+ BUG(); -+ if (!page->buffers) -+ return 1; -+ -+ /* We will potentially be playing with lists other than just the -+ * data lists (especially for journaled data mode), so be -+ * cautious in our locking. */ -+ lock_journal(journal); -+ -+ head = bh = page->buffers; -+ do { -+ unsigned int next_off = curr_off + bh->b_size; -+ next = bh->b_this_page; -+ -+ /* AKPM: doing lock_buffer here may be overly paranoid */ -+ if (offset <= curr_off) { -+ /* This block is wholly outside the truncation point */ -+ lock_buffer(bh); -+ may_free &= journal_unmap_buffer(journal, bh); -+ unlock_buffer(bh); -+ } -+ curr_off = next_off; -+ bh = next; -+ -+ } while (bh != head); -+ -+ unlock_journal(journal); -+ -+ if (!offset) { -+ if (!may_free || !try_to_free_buffers(page, 0)) -+ return 0; -+ J_ASSERT(page->buffers == NULL); -+ } -+ return 1; -+} -+ -+/* -+ * File a buffer on the given transaction list. 
-+ */ -+void __journal_file_buffer(struct journal_head *jh, -+ transaction_t *transaction, int jlist) -+{ -+ struct journal_head **list = 0; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+#ifdef __SMP__ -+ J_ASSERT (current->lock_depth >= 0); -+#endif -+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); -+ J_ASSERT_JH(jh, jh->b_transaction == transaction || -+ jh->b_transaction == 0); -+ -+ if (jh->b_transaction) { -+ if (jh->b_jlist == jlist) -+ return; -+ __journal_unfile_buffer(jh); -+ } else { -+ jh->b_transaction = transaction; -+ } -+ -+ switch (jlist) { -+ case BJ_None: -+ J_ASSERT_JH(jh, !jh->b_committed_data); -+ J_ASSERT_JH(jh, !jh->b_frozen_data); -+ return; -+ case BJ_SyncData: -+ list = &transaction->t_sync_datalist; -+ break; -+ case BJ_AsyncData: -+ list = &transaction->t_async_datalist; -+ break; -+ case BJ_Metadata: -+ transaction->t_nr_buffers++; -+ list = &transaction->t_buffers; -+ break; -+ case BJ_Forget: -+ list = &transaction->t_forget; -+ break; -+ case BJ_IO: -+ list = &transaction->t_iobuf_list; -+ break; -+ case BJ_Shadow: -+ list = &transaction->t_shadow_list; -+ break; -+ case BJ_LogCtl: -+ list = &transaction->t_log_list; -+ break; -+ case BJ_Reserved: -+ list = &transaction->t_reserved_list; -+ break; -+ } -+ -+ __blist_add_buffer(list, jh); -+ jh->b_jlist = jlist; -+ -+ if (jlist == BJ_Metadata || jlist == BJ_Reserved || -+ jlist == BJ_Shadow || jlist == BJ_Forget) { -+ if (atomic_set_buffer_clean(jh2bh(jh))) { -+ set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); -+ } -+ } -+} -+ -+void journal_file_buffer(struct journal_head *jh, -+ transaction_t *transaction, int jlist) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_file_buffer(jh, transaction, jlist); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * Remove a buffer from its current buffer list in preparation for -+ * dropping it from its current transaction entirely. 
If the buffer has -+ * already started to be used by a subsequent transaction, refile the -+ * buffer on that transaction's metadata list. -+ */ -+ -+void __journal_refile_buffer(struct journal_head *jh) -+{ -+ assert_spin_locked(&journal_datalist_lock); -+#ifdef __SMP__ -+ J_ASSERT_JH(jh, current->lock_depth >= 0); -+#endif -+ __journal_unfile_buffer(jh); -+ -+ /* If the buffer is now unused, just drop it. If it has been -+ modified by a later transaction, add it to the new -+ transaction's metadata list. */ -+ -+ jh->b_transaction = jh->b_next_transaction; -+ jh->b_next_transaction = NULL; -+ -+ if (jh->b_transaction != NULL) { -+ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); -+ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); -+ } else { -+ /* Onto BUF_DIRTY for writeback */ -+ refile_buffer(jh2bh(jh)); -+ } -+} -+ -+/* -+ * For the unlocked version of this call, also make sure that any -+ * hanging journal_head is cleaned up if necessary. -+ * -+ * __journal_refile_buffer is usually called as part of a single locked -+ * operation on a buffer_head, in which the caller is probably going to -+ * be hooking the journal_head onto other lists. In that case it is up -+ * to the caller to remove the journal_head if necessary. For the -+ * unlocked journal_refile_buffer call, the caller isn't going to be -+ * doing anything else to the buffer so we need to do the cleanup -+ * ourselves to avoid a jh leak. -+ * -+ * *** The journal_head may be freed by this call! 
*** -+ */ -+void journal_refile_buffer(struct journal_head *jh) -+{ -+ struct buffer_head *bh; -+ -+ spin_lock(&journal_datalist_lock); -+ bh = jh2bh(jh); -+ -+ __journal_refile_buffer(jh); -+ __journal_remove_journal_head(bh); -+ -+ spin_unlock(&journal_datalist_lock); -+ __brelse(bh); -+} diff --git a/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch b/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch deleted file mode 100644 index 9bb754a..0000000 --- a/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch +++ /dev/null @@ -1,15 +0,0 @@ - include/linux/mm.h | 1 + - 1 files changed, 1 insertion(+) - -Index: linux.mcp2/include/linux/mm.h -=================================================================== ---- linux.mcp2.orig/include/linux/mm.h 2004-05-05 14:32:29.000000000 -0700 -+++ linux.mcp2/include/linux/mm.h 2004-05-05 14:46:54.000000000 -0700 -@@ -162,6 +162,7 @@ - protected by pagemap_lru_lock !! */ - struct page **pprev_hash; /* Complement to *next_hash. */ - struct buffer_head * buffers; /* Buffer maps us to a disk block. 
*/ -+ unsigned long private; - - /* - * On machines where all RAM is mapped into kernel address space, diff --git a/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch b/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch deleted file mode 100644 index a7bdb63..0000000 --- a/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch +++ /dev/null @@ -1,32 +0,0 @@ -Index: linux-bgl/kernel/sched.c -=================================================================== ---- linux-bgl.orig/kernel/sched.c 2003-07-02 08:43:33.000000000 -0700 -+++ linux-bgl/kernel/sched.c 2004-10-26 23:37:44.314193755 -0700 -@@ -1124,7 +1124,7 @@ - return retval; - } - --static void show_task(struct task_struct * p) -+void show_task(struct task_struct * p) - { - unsigned long free = 0; - int state; -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 23:23:00.518654978 -0700 -+++ linux-bgl/kernel/ksyms.c 2004-10-26 23:38:29.289071295 -0700 -@@ -76,6 +76,7 @@ - }; - #endif - -+void show_task(struct task_struct *); - - EXPORT_SYMBOL(inter_module_register); - EXPORT_SYMBOL(inter_module_unregister); -@@ -595,3 +596,6 @@ - - EXPORT_SYMBOL(tasklist_lock); - EXPORT_SYMBOL(pidhash); -+ -+/* debug */ -+EXPORT_SYMBOL(show_task); diff --git a/lustre/kernel_patches/patches/export-truncate-bgl.patch b/lustre/kernel_patches/patches/export-truncate-bgl.patch deleted file mode 100644 index 9508215..0000000 --- a/lustre/kernel_patches/patches/export-truncate-bgl.patch +++ /dev/null @@ -1,37 +0,0 @@ - include/linux/mm.h | 1 + - mm/filemap.c | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -Index: linux-ion/include/linux/mm.h -=================================================================== ---- linux-ion.orig/include/linux/mm.h 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/include/linux/mm.h 2004-09-27 15:07:50.000000000 -0700 -@@ -593,6 +593,7 @@ - /* filemap.c */ - extern 
void remove_inode_page(struct page *); - extern unsigned long page_unuse(struct page *); -+extern void truncate_complete_page(struct page *); - extern void truncate_inode_pages(struct address_space *, loff_t); - - /* generic vm_area_ops exported for stackable file systems */ -Index: linux-ion/mm/filemap.c -=================================================================== ---- linux-ion.orig/mm/filemap.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/filemap.c 2004-09-27 15:08:13.000000000 -0700 -@@ -231,7 +231,7 @@ - do_flushpage(page, partial); - } - --static void truncate_complete_page(struct page *page) -+void truncate_complete_page(struct page *page) - { - /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) -@@ -249,6 +249,7 @@ - remove_inode_page(page); - page_cache_release(page); - } -+EXPORT_SYMBOL(truncate_complete_page); - - static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); - static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) diff --git a/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch b/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch deleted file mode 100644 index 82a0182..0000000 --- a/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch +++ /dev/null @@ -1,42 +0,0 @@ - - - -Index: linux-ion/kernel/ksyms.c -=================================================================== ---- linux-ion.orig/kernel/ksyms.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/kernel/ksyms.c 2004-09-27 15:04:52.000000000 -0700 -@@ -286,6 +286,10 @@ - EXPORT_SYMBOL(dcache_readdir); - EXPORT_SYMBOL(dcache_dir_ops); - -+/* lustre */ -+EXPORT_SYMBOL(panic_notifier_list); -+EXPORT_SYMBOL(do_kern_mount); -+ - /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ - EXPORT_SYMBOL(default_llseek); - EXPORT_SYMBOL(dentry_open); -Index: linux-ion/include/linux/fs.h -=================================================================== ---- linux-ion.orig/include/linux/fs.h 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/include/linux/fs.h 2004-09-27 15:04:52.000000000 -0700 -@@ -1050,6 +1050,7 @@ - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); - extern long do_mount(char *, char *, char *, unsigned long, void *); -+struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data); - extern void umount_tree(struct vfsmount *); - - #define kern_umount mntput -Index: linux-ion/mm/memory.c -=================================================================== ---- linux-ion.orig/mm/memory.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/memory.c 2004-09-27 15:05:56.000000000 -0700 -@@ -401,6 +401,7 @@ - mm->rss = 0; - spin_unlock(&mm->page_table_lock); - } -+EXPORT_SYMBOL(zap_page_range); - - /* - * Do a quick page-table lookup for a single page. 
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch deleted file mode 100644 index 1cdaa93..0000000 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch +++ /dev/null @@ -1,2560 +0,0 @@ - fs/ext3/Makefile | 2 - fs/ext3/dir.c | 299 +++++++++ - fs/ext3/file.c | 3 - fs/ext3/hash.c | 215 ++++++ - fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++----- - fs/ext3/super.c | 7 - include/linux/ext3_fs.h | 85 ++ - include/linux/ext3_fs_sb.h | 2 - include/linux/ext3_jbd.h | 2 - include/linux/rbtree.h | 2 - lib/rbtree.c | 42 + - 11 files changed, 1887 insertions(+), 160 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext3/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-05-27 11:07:21.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-05-27 11:08:28.000000000 -0700 -@@ -12,7 +12,7 @@ - export-objs := super.o inode.o - - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o -+ ioctl.o namei.o super.o symlink.o hash.o - obj-m := $(O_TARGET) - - obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o -Index: linux-2.4.19.SuSE/fs/ext3/dir.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/dir.c 2004-05-27 11:08:28.000000000 -0700 -@@ -21,12 +21,16 @@ - #include - #include - #include -+#include -+#include - - static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK - }; - - static int ext3_readdir(struct file *, void *, filldir_t); -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir); - - struct file_operations ext3_dir_operations = { - read: generic_read_dir, -@@ -35,6 +39,17 @@ - fsync: ext3_sync_file, /* BKL held */ - }; - -+ 
-+static unsigned char get_dtype(struct super_block *sb, int filetype) -+{ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || -+ (filetype >= EXT3_FT_MAX)) -+ return DT_UNKNOWN; -+ -+ return (ext3_filetype_table[filetype]); -+} -+ -+ - int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, -@@ -79,6 +94,16 @@ - - sb = inode->i_sb; - -+ if (is_dx(inode)) { -+ err = ext3_dx_readdir(filp, dirent, filldir); -+ if (err != ERR_BAD_DX_DIR) -+ return err; -+ /* -+ * We don't set the inode dirty flag since it's not -+ * critical that it get flushed back to the disk. -+ */ -+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; -+ } - stored = 0; - bh = NULL; - offset = filp->f_pos & (sb->s_blocksize - 1); -@@ -162,18 +187,12 @@ - * during the copy operation. - */ - unsigned long version = filp->f_version; -- unsigned char d_type = DT_UNKNOWN; - -- if (EXT3_HAS_INCOMPAT_FEATURE(sb, -- EXT3_FEATURE_INCOMPAT_FILETYPE) -- && de->file_type < EXT3_FT_MAX) -- d_type = -- ext3_filetype_table[de->file_type]; - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), -- d_type); -+ get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) -@@ -188,3 +207,269 @@ - UPDATE_ATIME(inode); - return 0; - } -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * These functions convert from the major/minor hash to an f_pos -+ * value. -+ * -+ * Currently we only use major hash numer. This is unfortunate, but -+ * on 32-bit machines, the same VFS interface is used for lseek and -+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of -+ * lseek/telldir/seekdir will blow out spectacularly, and from within -+ * the ext2 low-level routine, we don't know if we're being called by -+ * a 64-bit version of the system call or the 32-bit version of the -+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir -+ * cookie. Sigh. 
-+ */ -+#define hash2pos(major, minor) (major >> 1) -+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -+#define pos2min_hash(pos) (0) -+ -+/* -+ * This structure holds the nodes of the red-black tree used to store -+ * the directory entry in hash order. -+ */ -+struct fname { -+ __u32 hash; -+ __u32 minor_hash; -+ rb_node_t rb_hash; -+ struct fname *next; -+ __u32 inode; -+ __u8 name_len; -+ __u8 file_type; -+ char name[0]; -+}; -+ -+/* -+ * This functoin implements a non-recursive way of freeing all of the -+ * nodes in the red-black tree. -+ */ -+static void free_rb_tree_fname(rb_root_t *root) -+{ -+ rb_node_t *n = root->rb_node; -+ rb_node_t *parent; -+ struct fname *fname; -+ -+ while (n) { -+ /* Do the node's children first */ -+ if ((n)->rb_left) { -+ n = n->rb_left; -+ continue; -+ } -+ if (n->rb_right) { -+ n = n->rb_right; -+ continue; -+ } -+ /* -+ * The node has no children; free it, and then zero -+ * out parent's link to it. Finally go to the -+ * beginning of the loop and try to free the parent -+ * node. -+ */ -+ parent = n->rb_parent; -+ fname = rb_entry(n, struct fname, rb_hash); -+ kfree(fname); -+ if (!parent) -+ root->rb_node = 0; -+ else if (parent->rb_left == n) -+ parent->rb_left = 0; -+ else if (parent->rb_right == n) -+ parent->rb_right = 0; -+ n = parent; -+ } -+ root->rb_node = 0; -+} -+ -+ -+struct dir_private_info *create_dir_info(loff_t pos) -+{ -+ struct dir_private_info *p; -+ -+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); -+ if (!p) -+ return NULL; -+ p->root.rb_node = 0; -+ p->curr_node = 0; -+ p->extra_fname = 0; -+ p->last_pos = 0; -+ p->curr_hash = pos2maj_hash(pos); -+ p->curr_minor_hash = pos2min_hash(pos); -+ p->next_hash = 0; -+ return p; -+} -+ -+void ext3_htree_free_dir_info(struct dir_private_info *p) -+{ -+ free_rb_tree_fname(&p->root); -+ kfree(p); -+} -+ -+/* -+ * Given a directory entry, enter it into the fname rb tree. 
-+ */ -+void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent) -+{ -+ rb_node_t **p, *parent = NULL; -+ struct fname * fname, *new_fn; -+ struct dir_private_info *info; -+ int len; -+ -+ info = (struct dir_private_info *) dir_file->private_data; -+ p = &info->root.rb_node; -+ -+ /* Create and allocate the fname structure */ -+ len = sizeof(struct fname) + dirent->name_len + 1; -+ new_fn = kmalloc(len, GFP_KERNEL); -+ memset(new_fn, 0, len); -+ new_fn->hash = hash; -+ new_fn->minor_hash = minor_hash; -+ new_fn->inode = le32_to_cpu(dirent->inode); -+ new_fn->name_len = dirent->name_len; -+ new_fn->file_type = dirent->file_type; -+ memcpy(new_fn->name, dirent->name, dirent->name_len); -+ new_fn->name[dirent->name_len] = 0; -+ -+ while (*p) { -+ parent = *p; -+ fname = rb_entry(parent, struct fname, rb_hash); -+ -+ /* -+ * If the hash and minor hash match up, then we put -+ * them on a linked list. This rarely happens... -+ */ -+ if ((new_fn->hash == fname->hash) && -+ (new_fn->minor_hash == fname->minor_hash)) { -+ new_fn->next = fname->next; -+ fname->next = new_fn; -+ return; -+ } -+ -+ if (new_fn->hash < fname->hash) -+ p = &(*p)->rb_left; -+ else if (new_fn->hash > fname->hash) -+ p = &(*p)->rb_right; -+ else if (new_fn->minor_hash < fname->minor_hash) -+ p = &(*p)->rb_left; -+ else /* if (new_fn->minor_hash > fname->minor_hash) */ -+ p = &(*p)->rb_right; -+ } -+ -+ rb_link_node(&new_fn->rb_hash, parent, p); -+ rb_insert_color(&new_fn->rb_hash, &info->root); -+} -+ -+ -+ -+/* -+ * This is a helper function for ext3_dx_readdir. It calls filldir -+ * for all entres on the fname linked list. (Normally there is only -+ * one entry on the linked list, unless there are 62 bit hash collisions.) 
-+ */ -+static int call_filldir(struct file * filp, void * dirent, -+ filldir_t filldir, struct fname *fname) -+{ -+ struct dir_private_info *info = filp->private_data; -+ loff_t curr_pos; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct super_block * sb; -+ int error; -+ -+ sb = inode->i_sb; -+ -+ if (!fname) { -+ printk("call_filldir: called with null fname?!?\n"); -+ return 0; -+ } -+ curr_pos = hash2pos(fname->hash, fname->minor_hash); -+ while (fname) { -+ error = filldir(dirent, fname->name, -+ fname->name_len, curr_pos, -+ fname->inode, -+ get_dtype(sb, fname->file_type)); -+ if (error) { -+ filp->f_pos = curr_pos; -+ info->extra_fname = fname->next; -+ return error; -+ } -+ fname = fname->next; -+ } -+ return 0; -+} -+ -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ struct dir_private_info *info = filp->private_data; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct fname *fname; -+ int ret; -+ -+ if (!info) { -+ info = create_dir_info(filp->f_pos); -+ if (!info) -+ return -ENOMEM; -+ filp->private_data = info; -+ } -+ -+ /* Some one has messed with f_pos; reset the world */ -+ if (info->last_pos != filp->f_pos) { -+ free_rb_tree_fname(&info->root); -+ info->curr_node = 0; -+ info->extra_fname = 0; -+ info->curr_hash = pos2maj_hash(filp->f_pos); -+ info->curr_minor_hash = pos2min_hash(filp->f_pos); -+ } -+ -+ /* -+ * If there are any leftover names on the hash collision -+ * chain, return them first. -+ */ -+ if (info->extra_fname && -+ call_filldir(filp, dirent, filldir, info->extra_fname)) -+ goto finished; -+ -+ if (!info->curr_node) -+ info->curr_node = rb_get_first(&info->root); -+ -+ while (1) { -+ /* -+ * Fill the rbtree if we have no more entries, -+ * or the inode has changed since we last read in the -+ * cached entries. 
-+ */ -+ if ((!info->curr_node) || -+ (filp->f_version != inode->i_version)) { -+ info->curr_node = 0; -+ free_rb_tree_fname(&info->root); -+ filp->f_version = inode->i_version; -+ ret = ext3_htree_fill_tree(filp, info->curr_hash, -+ info->curr_minor_hash, -+ &info->next_hash); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ break; -+ info->curr_node = rb_get_first(&info->root); -+ } -+ -+ fname = rb_entry(info->curr_node, struct fname, rb_hash); -+ info->curr_hash = fname->hash; -+ info->curr_minor_hash = fname->minor_hash; -+ if (call_filldir(filp, dirent, filldir, fname)) -+ break; -+ -+ info->curr_node = rb_get_next(info->curr_node); -+ if (!info->curr_node) { -+ info->curr_hash = info->next_hash; -+ info->curr_minor_hash = 0; -+ } -+ } -+finished: -+ info->last_pos = filp->f_pos; -+ UPDATE_ATIME(inode); -+ return 0; -+} -+#endif -Index: linux-2.4.19.SuSE/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c 2002-12-04 09:46:03.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c 2004-05-27 11:08:52.000000000 -0700 -@@ -16,6 +16,12 @@ - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ * Hash Tree Directory indexing (c) -+ * Daniel Phillips, 2001 -+ * Hash Tree Directory indexing porting -+ * Christopher Li, 2002 -+ * Hash Tree Directory indexing cleanup -+ * Theodore Ts'o, 2002 - */ - - #include -@@ -40,6 +46,630 @@ - #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - -+static struct buffer_head *ext3_append(handle_t *handle, -+ struct inode *inode, -+ u32 *block, int *err) -+{ -+ struct buffer_head *bh; -+ -+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ -+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { -+ inode->i_size += inode->i_sb->s_blocksize; -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_journal_get_write_access(handle,bh); -+ } -+ return bh; -+} -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#ifndef swap -+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -+#endif -+ -+typedef struct { u32 v; } le_u32; -+typedef struct { u16 v; } le_u16; -+ -+#ifdef DX_DEBUG -+#define dxtrace(command) command -+#else -+#define dxtrace(command) -+#endif -+ -+struct fake_dirent -+{ -+ /*le*/u32 inode; -+ /*le*/u16 rec_len; -+ u8 name_len; -+ u8 file_type; -+}; -+ -+struct dx_countlimit -+{ -+ le_u16 limit; -+ le_u16 count; -+}; -+ -+struct dx_entry -+{ -+ le_u32 hash; -+ le_u32 block; -+}; -+ -+/* -+ * dx_root_info is laid out so that if it should somehow get overlaid by a -+ * dirent the two low bits of the hash version will be zero. Therefore, the -+ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
-+ */ -+ -+struct dx_root -+{ -+ struct fake_dirent dot; -+ char dot_name[4]; -+ struct fake_dirent dotdot; -+ char dotdot_name[4]; -+ struct dx_root_info -+ { -+ le_u32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; -+ } -+ info; -+ struct dx_entry entries[0]; -+}; -+ -+struct dx_node -+{ -+ struct fake_dirent fake; -+ struct dx_entry entries[0]; -+}; -+ -+ -+struct dx_frame -+{ -+ struct buffer_head *bh; -+ struct dx_entry *entries; -+ struct dx_entry *at; -+}; -+ -+struct dx_map_entry -+{ -+ u32 hash; -+ u32 offs; -+}; -+ -+#ifdef CONFIG_EXT3_INDEX -+static inline unsigned dx_get_block (struct dx_entry *entry); -+static void dx_set_block (struct dx_entry *entry, unsigned value); -+static inline unsigned dx_get_hash (struct dx_entry *entry); -+static void dx_set_hash (struct dx_entry *entry, unsigned value); -+static unsigned dx_get_count (struct dx_entry *entries); -+static unsigned dx_get_limit (struct dx_entry *entries); -+static void dx_set_count (struct dx_entry *entries, unsigned value); -+static void dx_set_limit (struct dx_entry *entries, unsigned value); -+static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -+static unsigned dx_node_limit (struct inode *dir); -+static struct dx_frame *dx_probe(struct dentry *dentry, -+ struct inode *dir, -+ struct dx_hash_info *hinfo, -+ struct dx_frame *frame, -+ int *err); -+static void dx_release (struct dx_frame *frames); -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry map[]); -+static void dx_sort_map(struct dx_map_entry *map, unsigned count); -+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, -+ struct dx_map_entry *offsets, int count); -+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); -+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ 
struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash); -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err); -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+ -+/* -+ * Future: use high four bits of block for coalesce-on-delete flags -+ * Mask them off for now. -+ */ -+ -+static inline unsigned dx_get_block (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->block.v) & 0x00ffffff; -+} -+ -+static inline void dx_set_block (struct dx_entry *entry, unsigned value) -+{ -+ entry->block.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_hash (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->hash.v); -+} -+ -+static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+{ -+ entry->hash.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_count (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); -+} -+ -+static inline unsigned dx_get_limit (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); -+} -+ -+static inline void dx_set_count (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); -+} -+ -+static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); -+} -+ -+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - -+ EXT3_DIR_REC_LEN(2) - infosize; -+ return 0? 20: entry_space / sizeof(struct dx_entry); -+} -+ -+static inline unsigned dx_node_limit (struct inode *dir) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); -+ return 0? 
22: entry_space / sizeof(struct dx_entry); -+} -+ -+/* -+ * Debug -+ */ -+#ifdef DX_DEBUG -+struct stats -+{ -+ unsigned names; -+ unsigned space; -+ unsigned bcount; -+}; -+ -+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, -+ int size, int show_names) -+{ -+ unsigned names = 0, space = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ printk("names: "); -+ while ((char *) de < base + size) -+ { -+ if (de->inode) -+ { -+ if (show_names) -+ { -+ int len = de->name_len; -+ char *name = de->name; -+ while (len--) printk("%c", *name++); -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ printk(":%x.%u ", h.hash, -+ ((char *) de - base)); -+ } -+ space += EXT3_DIR_REC_LEN(de->name_len); -+ names++; -+ } -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ printk("(%i)\n", names); -+ return (struct stats) { names, space, 1 }; -+} -+ -+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, -+ struct dx_entry *entries, int levels) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count = dx_get_count (entries), names = 0, space = 0, i; -+ unsigned bcount = 0; -+ struct buffer_head *bh; -+ int err; -+ printk("%i indexed blocks...\n", count); -+ for (i = 0; i < count; i++, entries++) -+ { -+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; -+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; -+ struct stats stats; -+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); -+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; -+ stats = levels? 
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): -+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); -+ names += stats.names; -+ space += stats.space; -+ bcount += stats.bcount; -+ brelse (bh); -+ } -+ if (bcount) -+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", -+ names, space/bcount,(space/bcount)*100/blocksize); -+ return (struct stats) { names, space, bcount}; -+} -+#endif /* DX_DEBUG */ -+ -+/* -+ * Probe for a directory leaf block to search. -+ * -+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -+ * error in the directory index, and the caller should fall back to -+ * searching the directory normally. The callers of dx_probe **MUST** -+ * check for this error code, and make sure it never gets reflected -+ * back to userspace. -+ */ -+static struct dx_frame * -+dx_probe(struct dentry *dentry, struct inode *dir, -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+{ -+ unsigned count, indirect; -+ struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_root *root; -+ struct buffer_head *bh; -+ struct dx_frame *frame = frame_in; -+ u32 hash; -+ -+ frame->bh = NULL; -+ if (dentry) -+ dir = dentry->d_parent->d_inode; -+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) -+ goto fail; -+ root = (struct dx_root *) bh->b_data; -+ if (root->info.hash_version != DX_HASH_TEA && -+ root->info.hash_version != DX_HASH_HALF_MD4 && -+ root->info.hash_version != DX_HASH_LEGACY) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unrecognised inode hash code %d", -+ root->info.hash_version); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ hinfo->hash_version = root->info.hash_version; -+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ if (dentry) -+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ hash = hinfo->hash; -+ -+ if (root->info.unused_flags & 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash flags: 
%#06x", -+ root->info.unused_flags); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ if ((indirect = root->info.indirect_levels) > 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash depth: %#06x", -+ root->info.indirect_levels); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ while (1) -+ { -+ count = dx_get_count(entries); -+ assert (count && count <= dx_get_limit(entries)); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ dxtrace(printk(".")); -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ -+ if (0) // linear search cross check -+ { -+ unsigned n = count - 1; -+ at = entries; -+ while (n--) -+ { -+ dxtrace(printk(",")); -+ if (dx_get_hash(++at) > hash) -+ { -+ at--; -+ break; -+ } -+ } -+ assert (at == p - 1); -+ } -+ -+ at = p - 1; -+ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); -+ frame->bh = bh; -+ frame->entries = entries; -+ frame->at = at; -+ if (!indirect--) return frame; -+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) -+ goto fail2; -+ at = entries = ((struct dx_node *) bh->b_data)->entries; -+ assert (dx_get_limit(entries) == dx_node_limit (dir)); -+ frame++; -+ } -+fail2: -+ while (frame >= frame_in) { -+ brelse(frame->bh); -+ frame--; -+ } -+fail: -+ return NULL; -+} -+ -+static void dx_release (struct dx_frame *frames) -+{ -+ if (frames[0].bh == NULL) -+ return; -+ -+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ brelse(frames[1].bh); -+ brelse(frames[0].bh); -+} -+ -+/* -+ * This function increments the frame pointer to search the next leaf -+ * block, and reads in the necessary intervening nodes if the search -+ * should be necessary. Whether or not the search is necessary is -+ * controlled by the hash parameter. If the hash value is even, then -+ * the search is only continued if the next block starts with that -+ * hash value. This is used if we are searching for a specific file. -+ * -+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. -+ * -+ * This function returns 1 if the caller should continue to search, -+ * or 0 if it should not. If there is an error reading one of the -+ * index blocks, it will return -1. -+ * -+ * If start_hash is non-null, it will be filled in with the starting -+ * hash of the next page. -+ */ -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash) -+{ -+ struct dx_frame *p; -+ struct buffer_head *bh; -+ int num_frames = 0; -+ __u32 bhash; -+ -+ *err = ENOENT; -+ p = frame; -+ /* -+ * Find the next leaf page by incrementing the frame pointer. -+ * If we run out of entries in the interior node, loop around and -+ * increment pointer in the parent node. 
When we break out of -+ * this loop, num_frames indicates the number of interior -+ * nodes need to be read. -+ */ -+ while (1) { -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ if (p == frames) -+ return 0; -+ num_frames++; -+ p--; -+ } -+ -+ /* -+ * If the hash is 1, then continue only if the next page has a -+ * continuation hash of any value. This is used for readdir -+ * handling. Otherwise, check to see if the hash matches the -+ * desired contiuation hash. If it doesn't, return since -+ * there's no point to read in the successive index pages. -+ */ -+ bhash = dx_get_hash(p->at); -+ if (start_hash) -+ *start_hash = bhash; -+ if ((hash & 1) == 0) { -+ if ((bhash & ~1) != hash) -+ return 0; -+ } -+ /* -+ * If the hash is HASH_NB_ALWAYS, we always go to the next -+ * block so no check is necessary -+ */ -+ while (num_frames--) { -+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), -+ 0, err))) -+ return -1; /* Failure */ -+ p++; -+ brelse (p->bh); -+ p->bh = bh; -+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ } -+ return 1; -+} -+ -+ -+/* -+ * p is at least 6 bytes before the end of page -+ */ -+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) -+{ -+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); -+} -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory. We start scanning the directory in hash order, starting -+ * at start_hash and start_minor_hash. -+ * -+ * This function returns the number of entries inserted into the tree, -+ * or a negative error code. 
-+ */ -+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash) -+{ -+ struct dx_hash_info hinfo; -+ struct buffer_head *bh; -+ struct ext3_dir_entry_2 *de, *top; -+ static struct dx_frame frames[2], *frame; -+ struct inode *dir; -+ int block, err; -+ int count = 0; -+ int ret; -+ __u32 hashval; -+ -+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, -+ start_minor_hash)); -+ dir = dir_file->f_dentry->d_inode; -+ hinfo.hash = start_hash; -+ hinfo.minor_hash = 0; -+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ -+ while (1) { -+ block = dx_get_block(frame->at); -+ dxtrace(printk("Reading block %d\n", block)); -+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) -+ goto errout; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) { -+ ext3fs_dirhash(de->name, de->name_len, &hinfo); -+ if ((hinfo.hash < start_hash) || -+ ((hinfo.hash == start_hash) && -+ (hinfo.minor_hash < start_minor_hash))) -+ continue; -+ ext3_htree_store_dirent(dir_file, hinfo.hash, -+ hinfo.minor_hash, de); -+ count++; -+ } -+ brelse (bh); -+ hashval = ~1; -+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, -+ frame, frames, &err, &hashval); -+ if (next_hash) -+ *next_hash = hashval; -+ if (ret == -1) -+ goto errout; -+ /* -+ * Stop if: (a) there are no more entries, or -+ * (b) we have inserted at least one entry and the -+ * next hash value is not a continuation -+ */ -+ if ((ret == 0) || -+ (count && ((hashval & 1) == 0))) -+ break; -+ } -+ dx_release(frames); -+ dxtrace(printk("Fill tree: returned %d entries\n", count)); -+ return count; -+errout: -+ dx_release(frames); -+ return (err); -+} -+ -+ -+/* -+ * Directory block splitting, compacting -+ */ -+ -+static int dx_make_map (struct ext3_dir_entry_2 *de, int 
size, -+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -+{ -+ int count = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ while ((char *) de < base + size) -+ { -+ if (de->name_len && de->inode) { -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ map_tail--; -+ map_tail->hash = h.hash; -+ map_tail->offs = (u32) ((char *) de - base); -+ count++; -+ } -+ /* XXX: do we need to check rec_len == 0 case? -Chris */ -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return count; -+} -+ -+static void dx_sort_map (struct dx_map_entry *map, unsigned count) -+{ -+ struct dx_map_entry *p, *q, *top = map + count - 1; -+ int more; -+ /* Combsort until bubble sort doesn't suck */ -+ while (count > 2) -+ { -+ count = count*10/13; -+ if (count - 9 < 2) /* 9, 10 -> 11 */ -+ count = 11; -+ for (p = top, q = p - count; q >= map; p--, q--) -+ if (p->hash < q->hash) -+ swap(*p, *q); -+ } -+ /* Garden variety bubble sort */ -+ do { -+ more = 0; -+ q = top; -+ while (q-- > map) -+ { -+ if (q[1].hash >= q[0].hash) -+ continue; -+ swap(*(q+1), *q); -+ more = 1; -+ } -+ } while(more); -+} -+ -+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+{ -+ struct dx_entry *entries = frame->entries; -+ struct dx_entry *old = frame->at, *new = old + 1; -+ int count = dx_get_count(entries); -+ -+ assert(count < dx_get_limit(entries)); -+ assert(old < entries + count); -+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); -+ dx_set_hash(new, hash); -+ dx_set_block(new, block); -+ dx_set_count(entries, count + 1); -+} -+#endif -+ -+ -+static void ext3_update_dx_flag(struct inode *inode) -+{ -+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -+} -+ - /* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. 
- * -@@ -96,6 +726,7 @@ - return 0; - } - -+ - /* - * ext3_find_entry() - * -@@ -107,6 +738,8 @@ - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -+ -+ - static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) - { -@@ -121,12 +754,32 @@ - int num = 0; - int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; - - *res_dir = NULL; - sb = dir->i_sb; -- -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3_NAME_LEN) -+ return NULL; -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ bh = ext3_dx_find_entry(dentry, res_dir, &err); -+ /* -+ * On success, or if the error was file not found, -+ * return. Otherwise, fall back to doing a search the -+ * old fashioned way. -+ */ -+ if (bh || (err != ERR_BAD_DX_DIR)) -+ return bh; -+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); -+ } -+#endif - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -- start = dir->u.ext3_i.i_dir_start_lookup; -+ start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -@@ -167,7 +820,7 @@ - i = search_dirblock(bh, dir, dentry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { -- dir->u.ext3_i.i_dir_start_lookup = block; -+ EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { -@@ -198,6 +851,74 @@ - return ret; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err) -+{ -+ struct super_block * sb; -+ struct dx_hash_info hinfo; -+ u32 hash; -+ struct dx_frame frames[2], *frame; -+ struct ext3_dir_entry_2 *de, *top; -+ struct buffer_head *bh; -+ unsigned long block; -+ int retval; -+ int namelen = dentry->d_name.len; -+ const u8 *name = 
dentry->d_name.name; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ sb = dir->i_sb; -+ /* NFS may look up ".." - look at dx_root directory block */ -+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ -+ if (!(frame = dx_probe(dentry, 0, &hinfo, frames, err))) -+ return NULL; -+ } else { -+ frame = frames; -+ frame->bh = NULL; /* for dx_release() */ -+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ -+ dx_set_block(frame->at, 0); /* dx_root block is 0 */ -+ } -+ hash = hinfo.hash; -+ do { -+ block = dx_get_block(frame->at); -+ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) -+ goto errout; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *)((char *)de + sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) -+ if (ext3_match (namelen, name, de)) { -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, -+ (block<b_data))) { -+ brelse (bh); -+ goto errout; -+ } -+ *res_dir = de; -+ dx_release (frames); -+ return bh; -+ } -+ brelse (bh); -+ /* Check to see if we should continue to search */ -+ retval = ext3_htree_next_block(dir, hash, frame, -+ frames, err, 0); -+ if (retval == -1) { -+ ext3_warning(sb, __FUNCTION__, -+ "error reading index page in directory #%lu", -+ dir->i_ino); -+ goto errout; -+ } -+ } while (retval == 1); -+ -+ *err = -ENOENT; -+errout: -+ dxtrace(printk("%s not found\n", name)); -+ dx_release (frames); -+ return NULL; -+} -+#endif -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) - { - struct inode * inode; -@@ -214,8 +927,9 @@ - brelse (bh); - inode = iget(dir->i_sb, ino); - -- if (!inode) -+ if (!inode) { - return ERR_PTR(-EACCES); -+ } - } - d_add(dentry, inode); - return NULL; -@@ -239,6 +953,301 @@ - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct ext3_dir_entry_2 * -+dx_move_dirents(char *from, char *to, struct dx_map_entry 
*map, int count) -+{ -+ unsigned rec_len = 0; -+ -+ while (count--) { -+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); -+ de->inode = 0; -+ map++; -+ to += rec_len; -+ } -+ return (struct ext3_dir_entry_2 *) (to - rec_len); -+} -+ -+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) -+{ -+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; -+ unsigned rec_len = 0; -+ -+ prev = to = de; -+ while ((char*)de < base + size) { -+ next = (struct ext3_dir_entry_2 *) ((char *) de + -+ le16_to_cpu(de->rec_len)); -+ if (de->inode && de->name_len) { -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ if (de > to) -+ memmove(to, de, rec_len); -+ to->rec_len = cpu_to_le16(rec_len); -+ prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); -+ } -+ de = next; -+ } -+ return prev; -+} -+ -+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -+ struct buffer_head **bh,struct dx_frame *frame, -+ struct dx_hash_info *hinfo, int *error) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; -+ struct buffer_head *bh2; -+ u32 newblock; -+ u32 hash2; -+ struct dx_map_entry *map; -+ char *data1 = (*bh)->b_data, *data2; -+ unsigned split; -+ struct ext3_dir_entry_2 *de = NULL, *de2; -+ int err; -+ -+ bh2 = ext3_append (handle, dir, &newblock, error); -+ if (!(bh2)) { -+ brelse(*bh); -+ *bh = NULL; -+ goto errout; -+ } -+ -+ BUFFER_TRACE(*bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, *bh); -+ if (err) { -+ journal_error: -+ brelse(*bh); -+ brelse(bh2); -+ *bh = NULL; -+ ext3_std_error(dir->i_sb, err); -+ goto errout; -+ } -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ -+ data2 = 
bh2->b_data; -+ -+ /* create map in the end of data2 block */ -+ map = (struct dx_map_entry *) (data2 + blocksize); -+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, -+ blocksize, hinfo, map); -+ map -= count; -+ split = count/2; // need to adjust to actual middle -+ dx_sort_map (map, count); -+ hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; -+ dxtrace(printk("Split block %i at %x, %i/%i\n", -+ dx_get_block(frame->at), hash2, split, count-split)); -+ -+ /* Fancy dance to stay within two buffers */ -+ de2 = dx_move_dirents(data1, data2, map + split, count - split); -+ de = dx_pack_dirents(data1,blocksize); -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); -+ -+ /* Which block gets the new entry? */ -+ if (hinfo->hash >= hash2) -+ { -+ swap(*bh, bh2); -+ de = de2; -+ } -+ dx_insert_block (frame, hash2 + continued, newblock); -+ err = ext3_journal_dirty_metadata (handle, bh2); -+ if (err) -+ goto journal_error; -+ err = ext3_journal_dirty_metadata (handle, frame->bh); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ dxtrace(dx_show_index ("frame", frame->entries)); -+errout: -+ return de; -+} -+#endif -+ -+ -+/* -+ * Add a new entry into a directory (leaf) block. If de is non-NULL, -+ * it points to a directory entry which is guaranteed to be large -+ * enough for new directory entry. If de is NULL, then -+ * add_dirent_to_buf will attempt search the directory block for -+ * space. It will return -ENOSPC if no space is available, and -EIO -+ * and -EEXIST if directory entry already exists. -+ * -+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In -+ * all other cases bh is released. 
-+ */ -+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct ext3_dir_entry_2 *de, -+ struct buffer_head * bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset = 0; -+ unsigned short reclen; -+ int nlen, rlen, err; -+ char *top; -+ -+ reclen = EXT3_DIR_REC_LEN(namelen); -+ if (!de) { -+ de = (struct ext3_dir_entry_2 *)bh->b_data; -+ top = bh->b_data + dir->i_sb->s_blocksize - reclen; -+ while ((char *) de <= top) { -+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, -+ bh, offset)) { -+ brelse (bh); -+ return -EIO; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? rlen - nlen: rlen) >= reclen) -+ break; -+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); -+ offset += rlen; -+ } -+ if ((char *) de > top) -+ return -ENOSPC; -+ } -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return err; -+ } -+ -+ /* By now the buffer is marked for journaling */ -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if (de->inode) { -+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); -+ de1->rec_len = cpu_to_le16(rlen - nlen); -+ de->rec_len = cpu_to_le16(nlen); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. 
-+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. -+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ ext3_update_dx_flag(dir); -+ dir->i_version = ++event; -+ ext3_mark_inode_dirty(handle, dir); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return 0; -+} -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * This converts a one block unindexed directory to a 3 block indexed -+ * directory, and adds the dentry to the indexed directory. -+ */ -+static int make_indexed_dir(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct buffer_head *bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ struct buffer_head *bh2; -+ struct dx_root *root; -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries; -+ struct ext3_dir_entry_2 *de, *de2; -+ char *data1, *top; -+ unsigned len; -+ int retval; -+ unsigned blocksize; -+ struct dx_hash_info hinfo; -+ u32 block; -+ -+ blocksize = dir->i_sb->s_blocksize; -+ dxtrace(printk("Creating index\n")); -+ retval = ext3_journal_get_write_access(handle, bh); -+ if (retval) { -+ ext3_std_error(dir->i_sb, retval); -+ brelse(bh); -+ return retval; -+ } -+ root = (struct dx_root *) bh->b_data; -+ -+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; -+ bh2 = ext3_append (handle, dir, &block, &retval); -+ if (!(bh2)) { -+ brelse(bh); -+ return retval; -+ } -+ data1 = bh2->b_data; -+ -+ /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *)&root->dotdot; -+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); -+ len = ((char *) root) + blocksize - (char *) 
de; -+ memcpy (data1, de, len); -+ de = (struct ext3_dir_entry_2 *) data1; -+ top = data1 + len; -+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) -+ de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ /* Initialize the root; the dot dirents already exist */ -+ de = (struct ext3_dir_entry_2 *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); -+ memset (&root->info, 0, sizeof(root->info)); -+ root->info.info_length = sizeof(root->info); -+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; -+ entries = root->entries; -+ dx_set_block (entries, 1); -+ dx_set_count (entries, 1); -+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); -+ -+ /* Initialize as for dx_probe */ -+ hinfo.hash_version = root->info.hash_version; -+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ ext3fs_dirhash(name, namelen, &hinfo); -+ frame = frames; -+ frame->entries = entries; -+ frame->at = entries; -+ frame->bh = bh; -+ bh = bh2; -+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ dx_release (frames); -+ if (!(de)) -+ return retval; -+ -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+#endif -+ - /* - * ext3_add_entry() - * -@@ -249,127 +1258,198 @@ - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. 
- */ -- --/* -- * AKPM: the journalling code here looks wrong on the error paths -- */ - static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; -- const char *name = dentry->d_name.name; -- int namelen = dentry->d_name.len; - unsigned long offset; -- unsigned short rec_len; - struct buffer_head * bh; -- struct ext3_dir_entry_2 * de, * de1; -+ struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; -+#ifdef CONFIG_EXT3_INDEX -+ int dx_fallback=0; -+#endif -+ unsigned blocksize; -+ unsigned nlen, rlen; -+ u32 block, blocks; - - sb = dir->i_sb; -- -- if (!namelen) -+ blocksize = sb->s_blocksize; -+ if (!dentry->d_name.len) - return -EINVAL; -- bh = ext3_bread (handle, dir, 0, 0, &retval); -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ retval = ext3_dx_add_entry(handle, dentry, inode); -+ if (!retval || (retval != ERR_BAD_DX_DIR)) -+ return retval; -+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; -+ dx_fallback++; -+ ext3_mark_inode_dirty(handle, dir); -+ } -+#endif -+ blocks = dir->i_size >> sb->s_blocksize_bits; -+ for (block = 0, offset = 0; block < blocks; block++) { -+ bh = ext3_bread(handle, dir, block, 0, &retval); -+ if(!bh) -+ return retval; -+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (retval != -ENOSPC) -+ return retval; -+ -+#ifdef CONFIG_EXT3_INDEX -+ if (blocks == 1 && !dx_fallback && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ return make_indexed_dir(handle, dentry, inode, bh); -+#endif -+ brelse(bh); -+ } -+ bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; -- rec_len = EXT3_DIR_REC_LEN(namelen); -- offset = 0; - de = (struct ext3_dir_entry_2 *) bh->b_data; -- while (1) { -- if ((char *)de >= sb->s_blocksize + bh->b_data) { -- brelse (bh); -- bh = NULL; -- bh = ext3_bread (handle, dir, -- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -- if (!bh) -- return retval; -- if (dir->i_size 
<= offset) { -- if (dir->i_size == 0) { -- brelse(bh); -- return -ENOENT; -- } -- -- ext3_debug ("creating next block\n"); -- -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- de->inode = 0; -- de->rec_len = le16_to_cpu(sb->s_blocksize); -- dir->u.ext3_i.i_disksize = -- dir->i_size = offset + sb->s_blocksize; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- } else { -- -- ext3_debug ("skipping to next block\n"); -+ de->inode = 0; -+ de->rec_len = cpu_to_le16(rlen = blocksize); -+ nlen = 0; -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} - -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- } -- } -- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -- offset)) { -- brelse (bh); -- return -ENOENT; -- } -- if (ext3_match (namelen, name, de)) { -- brelse (bh); -- return -EEXIST; -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * Returns 0 for success, or a negative error value -+ */ -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries, *at; -+ struct dx_hash_info hinfo; -+ struct buffer_head * bh; -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block * sb = dir->i_sb; -+ struct ext3_dir_entry_2 *de; -+ int err; -+ -+ frame = dx_probe(dentry, 0, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ entries = frame->entries; -+ at = frame->at; -+ -+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ goto cleanup; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto journal_error; -+ -+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (err != -ENOSPC) { -+ bh = 0; -+ goto cleanup; -+ } -+ -+ /* Block full, should compress but for now just split */ -+ dxtrace(printk("using %u of %u node entries\n", -+ 
dx_get_count(entries), dx_get_limit(entries))); -+ /* Need to split index? */ -+ if (dx_get_count(entries) == dx_get_limit(entries)) { -+ u32 newblock; -+ unsigned icount = dx_get_count(entries); -+ int levels = frame - frames; -+ struct dx_entry *entries2; -+ struct dx_node *node2; -+ struct buffer_head *bh2; -+ -+ if (levels && (dx_get_count(frames->entries) == -+ dx_get_limit(frames->entries))) { -+ ext3_warning(sb, __FUNCTION__, -+ "Directory index full!\n"); -+ err = -ENOSPC; -+ goto cleanup; - } -- if ((le32_to_cpu(de->inode) == 0 && -- le16_to_cpu(de->rec_len) >= rec_len) || -- (le16_to_cpu(de->rec_len) >= -- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- /* By now the buffer is marked for journaling */ -- offset += le16_to_cpu(de->rec_len); -- if (le32_to_cpu(de->inode)) { -- de1 = (struct ext3_dir_entry_2 *) ((char *) de + -- EXT3_DIR_REC_LEN(de->name_len)); -- de1->rec_len = -- cpu_to_le16(le16_to_cpu(de->rec_len) - -- EXT3_DIR_REC_LEN(de->name_len)); -- de->rec_len = cpu_to_le16( -- EXT3_DIR_REC_LEN(de->name_len)); -- de = de1; -+ -+ bh2 = ext3_append (handle, dir, &newblock, &err); -+ if (!(bh2)) -+ goto cleanup; -+ node2 = (struct dx_node *)(bh2->b_data); -+ entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -+ node2->fake.inode = 0; -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ if (levels) { -+ unsigned icount1 = icount/2, icount2 = icount - icount1; -+ unsigned hash2 = dx_get_hash(entries + icount1); -+ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ err = ext3_journal_get_write_access(handle, -+ frames[0].bh); -+ if (err) -+ goto journal_error; -+ -+ memcpy ((char *) entries2, (char *) (entries + icount1),+ icount2 * sizeof(struct dx_entry)); -+ 
dx_set_count (entries, icount1); -+ dx_set_count (entries2, icount2); -+ dx_set_limit (entries2, dx_node_limit(dir)); -+ -+ /* Which index block gets the new entry? */ -+ if (at - entries >= icount1) { -+ frame->at = at = at - entries - icount1 + entries2; -+ frame->entries = entries = entries2; -+ swap(frame->bh, bh2); - } -- de->file_type = EXT3_FT_UNKNOWN; -- if (inode) { -- de->inode = cpu_to_le32(inode->i_ino); -- ext3_set_de_type(dir->i_sb, de, inode->i_mode); -- } else -- de->inode = 0; -- de->name_len = namelen; -- memcpy (de->name, name, namelen); -- /* -- * XXX shouldn't update any times until successful -- * completion of syscall, but too many callers depend -- * on this. -- * -- * XXX similarly, too many callers depend on -- * ext3_new_inode() setting the times, but error -- * recovery deletes the inode, so the worst that can -- * happen is that the times are slightly out of date -- * and/or different from the directory change time. -- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- dir->i_version = ++event; -- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -- ext3_journal_dirty_metadata(handle, bh); -- brelse(bh); -- return 0; -+ dx_insert_block (frames + 0, hash2, newblock); -+ dxtrace(dx_show_index ("node", frames[1].entries)); -+ dxtrace(dx_show_index ("node", -+ ((struct dx_node *) bh2->b_data)->entries)); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ } else { -+ dxtrace(printk("Creating second level index...\n")); -+ memcpy((char *) entries2, (char *) entries, -+ icount * sizeof(struct dx_entry)); -+ dx_set_limit(entries2, dx_node_limit(dir)); -+ -+ /* Set up root */ -+ dx_set_count(entries, 1); -+ dx_set_block(entries + 0, newblock); -+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ -+ /* Add new access path frame */ -+ frame = frames + 1; -+ frame->at = at = at - 
entries + entries2; -+ frame->entries = entries = entries2; -+ frame->bh = bh2; -+ err = ext3_journal_get_write_access(handle, -+ frame->bh); -+ if (err) -+ goto journal_error; - } -- offset += le16_to_cpu(de->rec_len); -- de = (struct ext3_dir_entry_2 *) -- ((char *) de + le16_to_cpu(de->rec_len)); -+ ext3_journal_dirty_metadata(handle, frames[0].bh); - } -- brelse (bh); -- return -ENOSPC; -+ de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ if (!de) -+ goto cleanup; -+ err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ bh = 0; -+ goto cleanup; -+ -+journal_error: -+ ext3_std_error(dir->i_sb, err); -+cleanup: -+ if (bh) -+ brelse(bh); -+ dx_release(frames); -+ return err; - } -+#endif - - /* - * ext3_delete_entry deletes a directory entry by merging it with the -@@ -453,9 +1533,11 @@ - struct inode * inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -480,9 +1562,11 @@ - struct inode *inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -512,9 +1596,11 @@ - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -526,7 +1612,8 @@ - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; -- inode->i_size = 
inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; -+ inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - inode->i_nlink--; /* is this nlink == 0? */ -@@ -555,21 +1642,19 @@ - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); -- if (err) -- goto out_no_entry; -+ if (err) { -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - dir->i_nlink++; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- inode->i_nlink = 0; -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - /* -@@ -656,7 +1741,7 @@ - int err = 0, rc; - - lock_super(sb); -- if (!list_empty(&inode->u.ext3_i.i_orphan)) -+ if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks -@@ -697,7 +1782,7 @@ - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. 
*/ - if (!err) -- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); - jbd_debug(4, "orphan inode %ld will point to %d\n", -@@ -715,25 +1800,26 @@ - int ext3_orphan_del(handle_t *handle, struct inode *inode) - { - struct list_head *prev; -+ struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - ino_t ino_next; - struct ext3_iloc iloc; - int err = 0; - - lock_super(inode->i_sb); -- if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); -- prev = inode->u.ext3_i.i_orphan.prev; -+ prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); - -- list_del(&inode->u.ext3_i.i_orphan); -- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ list_del(&ei->i_orphan); -+ INIT_LIST_HEAD(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on -@@ -794,8 +1880,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); -@@ -833,7 +1920,7 @@ - ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - - end_rmdir: -@@ -851,8 +1938,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -879,7 +1967,7 @@ - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- 
dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - inode->i_nlink--; - if (!inode->i_nlink) -@@ -905,9 +1993,11 @@ - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -917,7 +2007,7 @@ - if (IS_ERR(inode)) - goto out_stop; - -- if (l > sizeof (inode->u.ext3_i.i_data)) { -+ if (l > sizeof (EXT3_I(inode)->i_data)) { - inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* -@@ -926,25 +2016,23 @@ - * i_size in generic_commit_write(). - */ - err = block_symlink(inode, symname, l); -- if (err) -- goto out_no_entry; -+ if (err) { -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; -- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -+ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); - inode->i_size = l-1; - } -- inode->u.ext3_i.i_disksize = inode->i_size; -+ EXT3_I(inode)->i_disksize = inode->i_size; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- ext3_dec_count(handle, inode); -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - static int ext3_link (struct dentry * old_dentry, -@@ -957,12 +2045,15 @@ - if (S_ISDIR(inode->i_mode)) - return -EPERM; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (inode->i_nlink >= EXT3_LINK_MAX) { - return -EMLINK; -+ } - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ 
EXT3_INDEX_EXTRA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -996,9 +2087,11 @@ - - old_bh = new_bh = dir_bh = NULL; - -- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) - handle->h_sync = 1; -@@ -1078,7 +2171,7 @@ - new_inode->i_ctime = CURRENT_TIME; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); -@@ -1090,7 +2183,7 @@ - new_inode->i_nlink--; - } else { - new_dir->i_nlink++; -- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-05-27 11:07:21.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-05-27 11:08:28.000000000 -0700 -@@ -741,6 +741,7 @@ - es->s_mtime = cpu_to_le32(CURRENT_TIME); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ - ext3_commit_super (sb, es, 1); - if (test_opt (sb, DEBUG)) - printk (KERN_INFO -@@ -751,6 +752,7 @@ - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); -+ - printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", - bdevname(sb->s_dev)); - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -@@ -925,6 +927,7 @@ - return res; - } - -+ - struct super_block * ext3_read_super (struct super_block * sb, void * data, - int silent) - { -@@ -1113,6 +1116,9 @@ - sbi->s_mount_state = 
le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); -+ for (i=0; i < 4; i++) -+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); -+ sbi->s_def_hash_version = es->s_def_hash_version; - - if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR -@@ -1821,6 +1827,7 @@ - exit_ext3_xattr(); - } - -+EXPORT_SYMBOL(ext3_force_commit); - EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -Index: linux-2.4.19.SuSE/fs/ext3/file.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/file.c 2002-12-04 09:46:18.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/file.c 2004-05-27 11:08:28.000000000 -0700 -@@ -38,6 +38,9 @@ - { - if (filp->f_mode & FMODE_WRITE) - ext3_discard_prealloc (inode); -+ if (is_dx(inode) && filp->private_data) -+ ext3_htree_free_dir_info(filp->private_data); -+ - return 0; - } - -Index: linux-2.4.19.SuSE/fs/ext3/hash.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/hash.c 1970-01-02 14:15:01.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/hash.c 2004-05-27 11:08:28.000000000 -0700 -@@ -0,0 +1,215 @@ -+/* -+ * linux/fs/ext3/hash.c -+ * -+ * Copyright (C) 2002 by Theodore Ts'o -+ * -+ * This file is released under the GPL v2. -+ * -+ * This file may be redistributed under the terms of the GNU Public -+ * License. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define DELTA 0x9E3779B9 -+ -+static void TEA_transform(__u32 buf[4], __u32 const in[]) -+{ -+ __u32 sum = 0; -+ __u32 b0 = buf[0], b1 = buf[1]; -+ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; -+ int n = 16; -+ -+ do { -+ sum += DELTA; -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); -+ } while(--n); -+ -+ buf[0] += b0; -+ buf[1] += b1; -+} -+ -+/* F, G and H are basic MD4 functions: selection, majority, parity */ -+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -+#define H(x, y, z) ((x) ^ (y) ^ (z)) -+ -+/* -+ * The generic round function. The application is so specific that -+ * we don't bother protecting all the arguments with parens, as is generally -+ * good macro practice, in favor of extra legibility. -+ * Rotation is separate from addition to prevent recomputation -+ */ -+#define ROUND(f, a, b, c, d, x, s) \ -+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) -+#define K1 0 -+#define K2 013240474631UL -+#define K3 015666365641UL -+ -+/* -+ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
-+ */ -+static void halfMD4Transform (__u32 buf[4], __u32 const in[]) -+{ -+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; -+ -+ /* Round 1 */ -+ ROUND(F, a, b, c, d, in[0] + K1, 3); -+ ROUND(F, d, a, b, c, in[1] + K1, 7); -+ ROUND(F, c, d, a, b, in[2] + K1, 11); -+ ROUND(F, b, c, d, a, in[3] + K1, 19); -+ ROUND(F, a, b, c, d, in[4] + K1, 3); -+ ROUND(F, d, a, b, c, in[5] + K1, 7); -+ ROUND(F, c, d, a, b, in[6] + K1, 11); -+ ROUND(F, b, c, d, a, in[7] + K1, 19); -+ -+ /* Round 2 */ -+ ROUND(G, a, b, c, d, in[1] + K2, 3); -+ ROUND(G, d, a, b, c, in[3] + K2, 5); -+ ROUND(G, c, d, a, b, in[5] + K2, 9); -+ ROUND(G, b, c, d, a, in[7] + K2, 13); -+ ROUND(G, a, b, c, d, in[0] + K2, 3); -+ ROUND(G, d, a, b, c, in[2] + K2, 5); -+ ROUND(G, c, d, a, b, in[4] + K2, 9); -+ ROUND(G, b, c, d, a, in[6] + K2, 13); -+ -+ /* Round 3 */ -+ ROUND(H, a, b, c, d, in[3] + K3, 3); -+ ROUND(H, d, a, b, c, in[7] + K3, 9); -+ ROUND(H, c, d, a, b, in[2] + K3, 11); -+ ROUND(H, b, c, d, a, in[6] + K3, 15); -+ ROUND(H, a, b, c, d, in[1] + K3, 3); -+ ROUND(H, d, a, b, c, in[5] + K3, 9); -+ ROUND(H, c, d, a, b, in[0] + K3, 11); -+ ROUND(H, b, c, d, a, in[4] + K3, 15); -+ -+ buf[0] += a; -+ buf[1] += b; -+ buf[2] += c; -+ buf[3] += d; -+} -+ -+#undef ROUND -+#undef F -+#undef G -+#undef H -+#undef K1 -+#undef K2 -+#undef K3 -+ -+/* The old legacy hash */ -+static __u32 dx_hack_hash (const char *name, int len) -+{ -+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; -+ while (len--) { -+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); -+ -+ if (hash & 0x80000000) hash -= 0x7fffffff; -+ hash1 = hash0; -+ hash0 = hash; -+ } -+ return (hash0 << 1); -+} -+ -+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -+{ -+ __u32 pad, val; -+ int i; -+ -+ pad = (__u32)len | ((__u32)len << 8); -+ pad |= pad << 16; -+ -+ val = pad; -+ if (len > num*4) -+ len = num * 4; -+ for (i=0; i < len; i++) { -+ if ((i % 4) == 0) -+ val = pad; -+ val = msg[i] + (val << 8); -+ if ((i % 4) == 3) { -+ 
*buf++ = val; -+ val = pad; -+ num--; -+ } -+ } -+ if (--num >= 0) -+ *buf++ = val; -+ while (--num >= 0) -+ *buf++ = pad; -+} -+ -+/* -+ * Returns the hash of a filename. If len is 0 and name is NULL, then -+ * this function can be used to test whether or not a hash version is -+ * supported. -+ * -+ * The seed is an 4 longword (32 bits) "secret" which can be used to -+ * uniquify a hash. If the seed is all zero's, then some default seed -+ * may be used. -+ * -+ * A particular hash version specifies whether or not the seed is -+ * represented, and whether or not the returned hash is 32 bits or 64 -+ * bits. 32 bit hashes will return 0 for the minor hash. -+ */ -+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -+{ -+ __u32 hash; -+ __u32 minor_hash = 0; -+ const char *p; -+ int i; -+ __u32 in[8], buf[4]; -+ -+ /* Initialize the default seed for the hash checksum functions */ -+ buf[0] = 0x67452301; -+ buf[1] = 0xefcdab89; -+ buf[2] = 0x98badcfe; -+ buf[3] = 0x10325476; -+ -+ /* Check to see if the seed is all zero's */ -+ if (hinfo->seed) { -+ for (i=0; i < 4; i++) { -+ if (hinfo->seed[i]) -+ break; -+ } -+ if (i < 4) -+ memcpy(buf, hinfo->seed, sizeof(buf)); -+ } -+ -+ switch (hinfo->hash_version) { -+ case DX_HASH_LEGACY: -+ hash = dx_hack_hash(name, len); -+ break; -+ case DX_HASH_HALF_MD4: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 8); -+ halfMD4Transform(buf, in); -+ len -= 32; -+ p += 32; -+ } -+ minor_hash = buf[2]; -+ hash = buf[1]; -+ break; -+ case DX_HASH_TEA: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 4); -+ TEA_transform(buf, in); -+ len -= 16; -+ p += 16; -+ } -+ hash = buf[0]; -+ minor_hash = buf[1]; -+ break; -+ default: -+ hinfo->hash = 0; -+ return -1; -+ } -+ hinfo->hash = hash & ~1; -+ hinfo->minor_hash = minor_hash; -+ return 0; -+} -Index: linux-2.4.19.SuSE/lib/rbtree.c -=================================================================== ---- linux-2.4.19.SuSE.orig/lib/rbtree.c 
2002-08-02 17:39:46.000000000 -0700 -+++ linux-2.4.19.SuSE/lib/rbtree.c 2004-05-27 11:08:28.000000000 -0700 -@@ -17,6 +17,8 @@ - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - linux/lib/rbtree.c -+ -+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 - */ - - #include -@@ -294,3 +296,43 @@ - __rb_erase_color(child, parent, root); - } - EXPORT_SYMBOL(rb_erase); -+ -+/* -+ * This function returns the first node (in sort order) of the tree. -+ */ -+rb_node_t *rb_get_first(rb_root_t *root) -+{ -+ rb_node_t *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return 0; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+EXPORT_SYMBOL(rb_get_first); -+ -+/* -+ * Given a node, this function will return the next node in the tree. -+ */ -+rb_node_t *rb_get_next(rb_node_t *n) -+{ -+ rb_node_t *parent; -+ -+ if (n->rb_right) { -+ n = n->rb_right; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+ } else { -+ while ((parent = n->rb_parent)) { -+ if (n == parent->rb_left) -+ return parent; -+ n = parent; -+ } -+ return 0; -+ } -+} -+EXPORT_SYMBOL(rb_get_next); -+ -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h 2003-10-05 09:30:34.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h 2004-05-27 11:08:28.000000000 -0700 -@@ -40,6 +40,11 @@ - #define EXT3FS_VERSION "2.4-0.9.18" - - /* -+ * Always enable hashed directories -+ */ -+#define CONFIG_EXT3_INDEX -+ -+/* - * Debug code - */ - #ifdef EXT3FS_DEBUG -@@ -414,8 +419,11 @@ - /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ - __u32 s_journal_dev; /* device number of journal file */ - __u32 s_last_orphan; /* start of list of inodes to delete */ -- --/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ -+ __u32 s_hash_seed[4]; /* HTREE hash seed */ -+ __u8 s_def_hash_version; /* Default hash version to use */ -+ __u8 
s_reserved_char_pad; -+ __u16 s_reserved_word_pad; -+ __u32 s_reserved[192]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -552,9 +560,46 @@ - #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) - #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -+/* -+ * Hash Tree Directory indexing -+ * (c) Daniel Phillips, 2001 -+ */ -+ -+#ifdef CONFIG_EXT3_INDEX -+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#else -+ #define is_dx(dir) 0 -+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) -+#endif -+ -+/* Legal values for the dx_root hash_version field: */ -+ -+#define DX_HASH_LEGACY 0 -+#define DX_HASH_HALF_MD4 1 -+#define DX_HASH_TEA 2 -+ -+/* hash info structure used by the directory hash */ -+struct dx_hash_info -+{ -+ u32 hash; -+ u32 minor_hash; -+ int hash_version; -+ u32 *seed; -+}; - - #ifdef __KERNEL__ - /* -+ * Control parameters used by ext3_htree_next_block -+ */ -+#define HASH_NB_ALWAYS 1 -+ -+ -+/* - * Describe an inode's exact location on disk and in memory - */ - struct ext3_iloc -@@ -564,6 +609,27 @@ - unsigned long block_group; - }; - -+ -+/* -+ * This structure is stuffed into the struct file's private_data field -+ * for directories. It is where we put information so that we can do -+ * readdir operations in hash tree order. -+ */ -+struct dir_private_info { -+ rb_root_t root; -+ rb_node_t *curr_node; -+ struct fname *extra_fname; -+ loff_t last_pos; -+ __u32 curr_hash; -+ __u32 curr_minor_hash; -+ __u32 next_hash; -+}; -+ -+/* -+ * Special error return code only used by dx_probe() and its callers. 
-+ */ -+#define ERR_BAD_DX_DIR -75000 -+ - /* - * Function prototypes - */ -@@ -591,11 +657,20 @@ - - /* dir.c */ - extern int ext3_check_dir_entry(const char *, struct inode *, -- struct ext3_dir_entry_2 *, struct buffer_head *, -- unsigned long); -+ struct ext3_dir_entry_2 *, -+ struct buffer_head *, unsigned long); -+extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent); -+extern void ext3_htree_free_dir_info(struct dir_private_info *p); -+ - /* fsync.c */ - extern int ext3_sync_file (struct file *, struct dentry *, int); - -+/* hash.c */ -+extern int ext3fs_dirhash(const char *name, int len, struct -+ dx_hash_info *hinfo); -+ - /* ialloc.c */ - extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); - extern void ext3_free_inode (handle_t *, struct inode *); -@@ -628,6 +703,8 @@ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash); - - /* super.c */ - extern void ext3_error (struct super_block *, const char *, const char *, ...) 
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h 2003-10-05 09:16:36.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h 2004-05-27 11:08:28.000000000 -0700 -@@ -62,6 +62,8 @@ - int s_inode_size; - int s_first_ino; - u32 s_next_generation; -+ u32 s_hash_seed[4]; -+ int s_def_hash_version; - - /* Journaling */ - struct inode * s_journal_inode; -Index: linux-2.4.19.SuSE/include/linux/ext3_jbd.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_jbd.h 2003-10-05 09:30:34.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_jbd.h 2004-05-27 11:08:28.000000000 -0700 -@@ -69,6 +69,8 @@ - - #define EXT3_RESERVE_TRANS_BLOCKS 12 - -+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 -+ - int - ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, -Index: linux-2.4.19.SuSE/include/linux/rbtree.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/rbtree.h 2003-10-05 09:16:36.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/rbtree.h 2004-05-27 11:08:28.000000000 -0700 -@@ -120,6 +120,8 @@ - - extern void rb_insert_color(rb_node_t *, rb_root_t *); - extern void rb_erase(rb_node_t *, rb_root_t *); -+extern rb_node_t *rb_get_first(rb_root_t *root); -+extern rb_node_t *rb_get_next(rb_node_t *n); - - static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) - { diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch deleted file mode 100644 index 4bcefce..0000000 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch +++ /dev/null @@ -1,481 +0,0 @@ - fs/ext3/file.c | 4 - fs/ext3/inode.c | 116 ++++++++++++++++++++++ - fs/ext3/super.c | 230 
+++++++++++++++++++++++++++++++++++++++++++++ - include/linux/ext3_fs.h | 5 - include/linux/ext3_fs_sb.h | 10 + - 5 files changed, 365 insertions(+) - -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:18:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:19:22 2003 -@@ -401,6 +401,220 @@ - } - } - -+#ifdef EXT3_DELETE_THREAD -+/* -+ * Delete inodes in a loop until there are no more to be deleted. -+ * Normally, we run in the background doing the deletes and sleeping again, -+ * and clients just add new inodes to be deleted onto the end of the list. -+ * If someone is concerned about free space (e.g. block allocation or similar) -+ * then they can sleep on s_delete_waiter_queue and be woken up when space -+ * has been freed. -+ */ -+int ext3_delete_thread(void *data) -+{ -+ struct super_block *sb = data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct task_struct *tsk = current; -+ -+ /* Almost like daemonize, but not quite */ -+ exit_mm(current); -+ tsk->session = 1; -+ tsk->pgrp = 1; -+ tsk->tty = NULL; -+ exit_files(current); -+ reparent_to_init(); -+ -+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); -+ sigfillset(&tsk->blocked); -+ -+ /*tsk->flags |= PF_KERNTHREAD;*/ -+ -+ INIT_LIST_HEAD(&sbi->s_delete_list); -+ wake_up(&sbi->s_delete_waiter_queue); -+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); -+ -+ /* main loop */ -+ for (;;) { -+ wait_event_interruptible(sbi->s_delete_thread_queue, -+ !list_empty(&sbi->s_delete_list) || -+ !test_opt(sb, ASYNCDEL)); -+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", -+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); -+ -+ spin_lock(&sbi->s_delete_lock); -+ if (list_empty(&sbi->s_delete_list)) { -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ memset(&sbi->s_delete_list, 0, -+ sizeof(sbi->s_delete_list)); -+ spin_unlock(&sbi->s_delete_lock); -+ 
ext3_debug("delete thread on %s exiting\n", -+ kdevname(sb->s_dev)); -+ wake_up(&sbi->s_delete_waiter_queue); -+ break; -+ } -+ -+ while (!list_empty(&sbi->s_delete_list)) { -+ struct inode *inode=list_entry(sbi->s_delete_list.next, -+ struct inode, i_dentry); -+ unsigned long blocks = inode->i_blocks >> -+ (inode->i_blkbits - 9); -+ -+ list_del_init(&inode->i_dentry); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("%s delete ino %lu blk %lu\n", -+ tsk->comm, inode->i_ino, blocks); -+ -+ iput(inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ sbi->s_delete_blocks -= blocks; -+ sbi->s_delete_inodes--; -+ } -+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { -+ ext3_warning(sb, __FUNCTION__, -+ "%lu blocks, %lu inodes on list?\n", -+ sbi->s_delete_blocks,sbi->s_delete_inodes); -+ sbi->s_delete_blocks = 0; -+ sbi->s_delete_inodes = 0; -+ } -+ spin_unlock(&sbi->s_delete_lock); -+ wake_up(&sbi->s_delete_waiter_queue); -+ } -+ -+ return 0; -+} -+ -+static void ext3_start_delete_thread(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int rc; -+ -+ spin_lock_init(&sbi->s_delete_lock); -+ init_waitqueue_head(&sbi->s_delete_thread_queue); -+ init_waitqueue_head(&sbi->s_delete_waiter_queue); -+ -+ if (!test_opt(sb, ASYNCDEL)) -+ return; -+ -+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); -+ if (rc < 0) -+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", -+ rc); -+ else -+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); -+} -+ -+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) -+{ -+ if (sbi->s_delete_list.next == 0) /* thread never started */ -+ return; -+ -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ wake_up(&sbi->s_delete_thread_queue); -+ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); -+} -+ -+/* Instead of playing games with the inode flags, destruction, etc we just -+ * create a new inode locally and put it on a list for the truncate thread. 
-+ * We need large parts of the inode struct in order to complete the -+ * truncate and unlink, so we may as well just have a real inode to do it. -+ * -+ * If we have any problem deferring the delete, just delete it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+static void ext3_delete_inode_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (is_bad_inode(old_inode)) { -+ clear_inode(old_inode); -+ return; -+ } -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_delete; -+ -+ /* We may want to delete the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) -+ goto out_delete; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_delete; -+ } -+ -+ /* We can iget this inode again here, because our caller has unhashed -+ * old_inode, so new_inode will be in a different inode struct. -+ * -+ * We need to ensure that the i_orphan pointers in the other inodes -+ * point at the new inode copy instead of the old one so the orphan -+ * list doesn't get corrupted when the old orphan inode is freed. 
-+ */ -+ down(&sbi->s_orphan_lock); -+ -+ sbi->s_mount_state |= EXT3_ORPHAN_FS; -+ new_inode = iget(old_inode->i_sb, old_inode->i_ino); -+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (is_bad_inode(new_inode)) { -+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); -+ iput(new_inode); -+ new_inode = NULL; -+ } -+ if (!new_inode) { -+ up(&sbi->s_orphan_lock); -+ ext3_debug("delete inode %lu directly (bad read)\n", -+ old_inode->i_ino); -+ goto out_delete; -+ } -+ J_ASSERT(new_inode != old_inode); -+ -+ J_ASSERT(!list_empty(&oei->i_orphan)); -+ -+ nei = EXT3_I(new_inode); -+ /* Ugh. We need to insert new_inode into the same spot on the list -+ * as old_inode was, to ensure the in-memory orphan list is still -+ * in the same order as the on-disk orphan list (badness otherwise). -+ */ -+ nei->i_orphan = oei->i_orphan; -+ nei->i_orphan.next->prev = &nei->i_orphan; -+ nei->i_orphan.prev->next = &nei->i_orphan; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up(&sbi->s_orphan_lock); -+ -+ clear_inode(old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_delete: -+ ext3_delete_inode(old_inode); -+} -+#else -+#define ext3_start_delete_thread(sbi) do {} while(0) -+#define ext3_stop_delete_thread(sbi) do {} while(0) -+#endif /* EXT3_DELETE_THREAD */ -+ - void ext3_put_super (struct super_block * sb) - { - struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -408,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_stop_delete_thread(sbi); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -476,7 +691,11 @@ - write_inode: ext3_write_inode, /* BKL not held. 
Don't need */ - dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ - put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+#ifdef EXT3_DELETE_THREAD -+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ -+#else - delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+#endif - put_super: ext3_put_super, /* BKL held */ - write_super: ext3_write_super, /* BKL held */ - sync_fs: ext3_sync_fs, -@@ -553,6 +772,13 @@ - clear_opt (*mount_options, POSIX_ACL); - else - #endif -+#ifdef EXT3_DELETE_THREAD -+ if (!strcmp(this_char, "asyncdel")) -+ set_opt(*mount_options, ASYNCDEL); -+ else if (!strcmp(this_char, "noasyncdel")) -+ clear_opt(*mount_options, ASYNCDEL); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -1254,6 +1480,7 @@ - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ ext3_start_delete_thread(sb); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -@@ -1692,6 +1919,9 @@ - if (!parse_options(data, &tmp, sbi, &tmp, 1)) - return -EINVAL; - -+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) -+ ext3_stop_delete_thread(sbi); -+ - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); - -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:02:56 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:19:22 2003 -@@ -2114,6 +2114,118 @@ - ext3_journal_stop(handle, inode); - } - -+#ifdef EXT3_DELETE_THREAD -+/* Move blocks from to-be-truncated inode over to a new inode, and delete -+ * that one from the delete thread instead. This avoids a lot of latency -+ * when truncating large files. 
-+ * -+ * If we have any problem deferring the truncate, just truncate it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+void ext3_truncate_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ handle_t *handle; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_truncate; -+ -+ /* XXX This is a temporary limitation for code simplicity. -+ * We could truncate to arbitrary sizes at some later time. -+ */ -+ if (old_inode->i_size != 0) -+ goto out_truncate; -+ -+ /* We may want to truncate the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || -+ old_inode->i_size > oei->i_disksize) -+ goto out_truncate; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. 
-+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_truncate; -+ } -+ -+ ext3_discard_prealloc(old_inode); -+ -+ /* old_inode = 1 -+ * new_inode = sb + GDT + ibitmap -+ * orphan list = 1 inode/superblock for add, 2 inodes for del -+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ */ -+ handle = ext3_journal_start(old_inode, 7); -+ if (IS_ERR(handle)) -+ goto out_truncate; -+ -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ if (IS_ERR(new_inode)) { -+ ext3_debug("truncate inode %lu directly (no new inodes)\n", -+ old_inode->i_ino); -+ goto out_journal; -+ } -+ -+ nei = EXT3_I(new_inode); -+ -+ down_write(&oei->truncate_sem); -+ new_inode->i_size = old_inode->i_size; -+ new_inode->i_blocks = old_inode->i_blocks; -+ new_inode->i_uid = old_inode->i_uid; -+ new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; -+ -+ /* FIXME when we do arbitrary truncates */ -+ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; -+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; -+ -+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); -+ memset(oei->i_data, 0, sizeof(oei->i_data)); -+ -+ nei->i_disksize = oei->i_disksize; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up_write(&oei->truncate_sem); -+ -+ if (ext3_orphan_add(handle, new_inode) < 0) -+ goto out_journal; -+ -+ if (ext3_orphan_del(handle, old_inode) < 0) { -+ ext3_orphan_del(handle, new_inode); -+ iput(new_inode); -+ goto out_journal; -+ } -+ -+ ext3_journal_stop(handle, old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_journal: -+ ext3_journal_stop(handle, old_inode); -+out_truncate: -+ ext3_truncate(old_inode); -+} -+#endif /* EXT3_DELETE_THREAD */ -+ - /* - * ext3_get_inode_loc returns with an extra refcount against the - * inode's underlying buffer_head on success. 
-Index: linux-2.4.19.SuSE/fs/ext3/file.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/file.c Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/file.c Sun Nov 16 01:19:22 2003 -@@ -132,7 +132,11 @@ - }; - - struct inode_operations ext3_file_inode_operations = { -+#ifdef EXT3_DELETE_THREAD -+ truncate: ext3_truncate_thread, /* BKL held */ -+#else - truncate: ext3_truncate, /* BKL held */ -+#endif - setattr: ext3_setattr, /* BKL held */ - setxattr: ext3_setxattr, /* BKL held */ - getxattr: ext3_getxattr, /* BKL held */ -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:02:51 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:20:06 2003 -@@ -193,6 +193,7 @@ - */ - #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ - #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ - - /* - * ioctl commands -@@ -321,6 +322,7 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ -+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -695,6 +697,9 @@ - extern void ext3_dirty_inode(struct inode *); - extern int ext3_change_inode_journal_flag(struct inode *, int); - extern void ext3_truncate (struct inode *); -+#ifdef EXT3_DELETE_THREAD -+extern void ext3_truncate_thread(struct inode *inode); -+#endif - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h -=================================================================== ---- 
linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h Sun Nov 16 01:18:41 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h Sun Nov 16 01:19:22 2003 -@@ -29,6 +29,8 @@ - - #define EXT3_MAX_GROUP_LOADED 8 - -+#define EXT3_DELETE_THREAD -+ - /* - * third extended-fs super-block data in memory - */ -@@ -75,6 +77,14 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+#ifdef EXT3_DELETE_THREAD -+ spinlock_t s_delete_lock; -+ struct list_head s_delete_list; -+ unsigned long s_delete_blocks; -+ unsigned long s_delete_inodes; -+ wait_queue_head_t s_delete_thread_queue; -+ wait_queue_head_t s_delete_waiter_queue; -+#endif - }; - - #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch deleted file mode 100644 index ca05893..0000000 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch +++ /dev/null @@ -1,541 +0,0 @@ - fs/ext3/file.c | 4 - fs/ext3/inode.c | 116 ++++++++++++++++++++++ - fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ - include/linux/ext3_fs.h | 5 - include/linux/ext3_fs_sb.h | 10 + - 5 files changed, 365 insertions(+) - -Index: linux-2.4.20/fs/ext3/super.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/super.c 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/fs/ext3/super.c 2004-01-13 16:59:54.000000000 +0300 -@@ -48,6 +48,8 @@ - static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es); - -+static int ext3_sync_fs(struct super_block * sb); -+ - #ifdef CONFIG_JBD_DEBUG - int journal_no_write[2]; - -@@ -398,6 +400,221 @@ - } - } - -+#ifdef EXT3_DELETE_THREAD -+/* -+ * Delete inodes in a loop until there are no more to be deleted. 
-+ * Normally, we run in the background doing the deletes and sleeping again, -+ * and clients just add new inodes to be deleted onto the end of the list. -+ * If someone is concerned about free space (e.g. block allocation or similar) -+ * then they can sleep on s_delete_waiter_queue and be woken up when space -+ * has been freed. -+ */ -+int ext3_delete_thread(void *data) -+{ -+ struct super_block *sb = data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct task_struct *tsk = current; -+ -+ /* Almost like daemonize, but not quite */ -+ exit_mm(current); -+ tsk->session = 1; -+ tsk->pgrp = 1; -+ tsk->tty = NULL; -+ exit_files(current); -+ reparent_to_init(); -+ -+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); -+ sigfillset(&tsk->blocked); -+ -+ /*tsk->flags |= PF_KERNTHREAD;*/ -+ -+ INIT_LIST_HEAD(&sbi->s_delete_list); -+ wake_up(&sbi->s_delete_waiter_queue); -+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); -+ -+ /* main loop */ -+ for (;;) { -+ wait_event_interruptible(sbi->s_delete_thread_queue, -+ !list_empty(&sbi->s_delete_list) || -+ !test_opt(sb, ASYNCDEL)); -+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", -+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); -+ -+ spin_lock(&sbi->s_delete_lock); -+ if (list_empty(&sbi->s_delete_list)) { -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ memset(&sbi->s_delete_list, 0, -+ sizeof(sbi->s_delete_list)); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("delete thread on %s exiting\n", -+ kdevname(sb->s_dev)); -+ wake_up(&sbi->s_delete_waiter_queue); -+ break; -+ } -+ -+ while (!list_empty(&sbi->s_delete_list)) { -+ struct inode *inode=list_entry(sbi->s_delete_list.next, -+ struct inode, i_dentry); -+ unsigned long blocks = inode->i_blocks >> -+ (inode->i_blkbits - 9); -+ -+ list_del_init(&inode->i_dentry); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("%s delete ino %lu blk %lu\n", -+ tsk->comm, inode->i_ino, blocks); -+ -+ iput(inode); -+ -+ 
spin_lock(&sbi->s_delete_lock); -+ sbi->s_delete_blocks -= blocks; -+ sbi->s_delete_inodes--; -+ } -+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { -+ ext3_warning(sb, __FUNCTION__, -+ "%lu blocks, %lu inodes on list?\n", -+ sbi->s_delete_blocks,sbi->s_delete_inodes); -+ sbi->s_delete_blocks = 0; -+ sbi->s_delete_inodes = 0; -+ } -+ spin_unlock(&sbi->s_delete_lock); -+ wake_up(&sbi->s_delete_waiter_queue); -+ } -+ -+ return 0; -+} -+ -+static void ext3_start_delete_thread(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int rc; -+ -+ spin_lock_init(&sbi->s_delete_lock); -+ init_waitqueue_head(&sbi->s_delete_thread_queue); -+ init_waitqueue_head(&sbi->s_delete_waiter_queue); -+ -+ if (!test_opt(sb, ASYNCDEL)) -+ return; -+ -+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); -+ if (rc < 0) -+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", -+ rc); -+ else -+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); -+} -+ -+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) -+{ -+ if (sbi->s_delete_list.next == 0) /* thread never started */ -+ return; -+ -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ wake_up(&sbi->s_delete_thread_queue); -+ wait_event(sbi->s_delete_waiter_queue, -+ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); -+} -+ -+/* Instead of playing games with the inode flags, destruction, etc we just -+ * create a new inode locally and put it on a list for the truncate thread. -+ * We need large parts of the inode struct in order to complete the -+ * truncate and unlink, so we may as well just have a real inode to do it. -+ * -+ * If we have any problem deferring the delete, just delete it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. 
-+ */ -+static void ext3_delete_inode_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (is_bad_inode(old_inode)) { -+ clear_inode(old_inode); -+ return; -+ } -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_delete; -+ -+ /* We may want to delete the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) -+ goto out_delete; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_delete; -+ } -+ -+ /* We can iget this inode again here, because our caller has unhashed -+ * old_inode, so new_inode will be in a different inode struct. -+ * -+ * We need to ensure that the i_orphan pointers in the other inodes -+ * point at the new inode copy instead of the old one so the orphan -+ * list doesn't get corrupted when the old orphan inode is freed. -+ */ -+ down(&sbi->s_orphan_lock); -+ -+ sbi->s_mount_state |= EXT3_ORPHAN_FS; -+ new_inode = iget(old_inode->i_sb, old_inode->i_ino); -+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (is_bad_inode(new_inode)) { -+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); -+ iput(new_inode); -+ new_inode = NULL; -+ } -+ if (!new_inode) { -+ up(&sbi->s_orphan_lock); -+ ext3_debug("delete inode %lu directly (bad read)\n", -+ old_inode->i_ino); -+ goto out_delete; -+ } -+ J_ASSERT(new_inode != old_inode); -+ -+ J_ASSERT(!list_empty(&oei->i_orphan)); -+ -+ nei = EXT3_I(new_inode); -+ /* Ugh. 
We need to insert new_inode into the same spot on the list -+ * as old_inode was, to ensure the in-memory orphan list is still -+ * in the same order as the on-disk orphan list (badness otherwise). -+ */ -+ nei->i_orphan = oei->i_orphan; -+ nei->i_orphan.next->prev = &nei->i_orphan; -+ nei->i_orphan.prev->next = &nei->i_orphan; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up(&sbi->s_orphan_lock); -+ -+ clear_inode(old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_delete: -+ ext3_delete_inode(old_inode); -+} -+#else -+#define ext3_start_delete_thread(sbi) do {} while(0) -+#define ext3_stop_delete_thread(sbi) do {} while(0) -+#endif /* EXT3_DELETE_THREAD */ -+ - void ext3_put_super (struct super_block * sb) - { - struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -405,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ J_ASSERT(sbi->s_delete_inodes == 0); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -453,9 +671,14 @@ - write_inode: ext3_write_inode, /* BKL not held. Don't need */ - dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ - put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+#ifdef EXT3_DELETE_THREAD -+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ -+#else - delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+#endif - put_super: ext3_put_super, /* BKL held */ - write_super: ext3_write_super, /* BKL held */ -+ sync_fs: ext3_sync_fs, - write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ - unlockfs: ext3_unlockfs, /* BKL not held. 
We take it */ - statfs: ext3_statfs, /* BKL held */ -@@ -521,6 +744,13 @@ - clear_opt (*mount_options, XATTR_USER); - else - #endif -+#ifdef EXT3_DELETE_THREAD -+ if (!strcmp(this_char, "asyncdel")) -+ set_opt(*mount_options, ASYNCDEL); -+ else if (!strcmp(this_char, "noasyncdel")) -+ clear_opt(*mount_options, ASYNCDEL); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -1220,6 +1450,7 @@ - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ ext3_start_delete_thread(sb); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -@@ -1625,6 +1856,21 @@ - } - } - -+static int ext3_sync_fs(struct super_block *sb) -+{ -+ tid_t target; -+ -+ if (atomic_read(&sb->s_active) == 0) { -+ /* fs is being umounted: time to stop delete thread */ -+ ext3_stop_delete_thread(EXT3_SB(sb)); -+ } -+ -+ sb->s_dirt = 0; -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); -+ log_wait_commit(EXT3_SB(sb)->s_journal, target); -+ return 0; -+} -+ - /* - * LVM calls this function before a (read-only) snapshot is created. This - * gives us a chance to flush the journal completely and mark the fs clean. 
-@@ -1682,6 +1928,9 @@ - if (!parse_options(data, &tmp, sbi, &tmp, 1)) - return -EINVAL; - -+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) -+ ext3_stop_delete_thread(sbi); -+ - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); - -Index: linux-2.4.20/fs/ext3/inode.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/fs/ext3/inode.c 2004-01-13 16:55:45.000000000 +0300 -@@ -2552,6 +2552,118 @@ - return err; - } - -+#ifdef EXT3_DELETE_THREAD -+/* Move blocks from to-be-truncated inode over to a new inode, and delete -+ * that one from the delete thread instead. This avoids a lot of latency -+ * when truncating large files. -+ * -+ * If we have any problem deferring the truncate, just truncate it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+void ext3_truncate_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ handle_t *handle; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_truncate; -+ -+ /* XXX This is a temporary limitation for code simplicity. -+ * We could truncate to arbitrary sizes at some later time. 
-+ */ -+ if (old_inode->i_size != 0) -+ goto out_truncate; -+ -+ /* We may want to truncate the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || -+ old_inode->i_size > oei->i_disksize) -+ goto out_truncate; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_truncate; -+ } -+ -+ ext3_discard_prealloc(old_inode); -+ -+ /* old_inode = 1 -+ * new_inode = sb + GDT + ibitmap -+ * orphan list = 1 inode/superblock for add, 2 inodes for del -+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ */ -+ handle = ext3_journal_start(old_inode, 7); -+ if (IS_ERR(handle)) -+ goto out_truncate; -+ -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ if (IS_ERR(new_inode)) { -+ ext3_debug("truncate inode %lu directly (no new inodes)\n", -+ old_inode->i_ino); -+ goto out_journal; -+ } -+ -+ nei = EXT3_I(new_inode); -+ -+ down_write(&oei->truncate_sem); -+ new_inode->i_size = old_inode->i_size; -+ new_inode->i_blocks = old_inode->i_blocks; -+ new_inode->i_uid = old_inode->i_uid; -+ new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; -+ -+ /* FIXME when we do arbitrary truncates */ -+ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; -+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; -+ -+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); -+ memset(oei->i_data, 0, sizeof(oei->i_data)); -+ -+ nei->i_disksize = oei->i_disksize; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up_write(&oei->truncate_sem); -+ -+ if (ext3_orphan_add(handle, new_inode) < 0) -+ goto out_journal; -+ -+ if (ext3_orphan_del(handle, old_inode) < 0) { -+ ext3_orphan_del(handle, new_inode); -+ iput(new_inode); -+ goto out_journal; -+ } -+ -+ ext3_journal_stop(handle, old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_journal: -+ ext3_journal_stop(handle, old_inode); -+out_truncate: -+ ext3_truncate(old_inode); -+} -+#endif /* EXT3_DELETE_THREAD */ -+ - /* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. 
-Index: linux-2.4.20/fs/ext3/file.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/file.c 2004-01-12 20:13:36.000000000 +0300 -+++ linux-2.4.20/fs/ext3/file.c 2004-01-13 16:55:45.000000000 +0300 -@@ -125,7 +125,11 @@ - }; - - struct inode_operations ext3_file_inode_operations = { -+#ifdef EXT3_DELETE_THREAD -+ truncate: ext3_truncate_thread, /* BKL held */ -+#else - truncate: ext3_truncate, /* BKL held */ -+#endif - setattr: ext3_setattr, /* BKL held */ - setxattr: ext3_setxattr, /* BKL held */ - getxattr: ext3_getxattr, /* BKL held */ -Index: linux-2.4.20/fs/buffer.c -=================================================================== ---- linux-2.4.20.orig/fs/buffer.c 2003-05-16 05:29:12.000000000 +0400 -+++ linux-2.4.20/fs/buffer.c 2004-01-13 16:55:45.000000000 +0300 -@@ -328,6 +328,8 @@ - if (sb->s_dirt && sb->s_op && sb->s_op->write_super) - sb->s_op->write_super(sb); - unlock_super(sb); -+ if (sb->s_op && sb->s_op->sync_fs) -+ sb->s_op->sync_fs(sb); - unlock_kernel(); - - return sync_buffers(dev, 1); -Index: linux-2.4.20/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-13 16:55:45.000000000 +0300 -@@ -193,6 +193,7 @@ - */ - #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ - #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ - - /* - * ioctl commands -@@ -320,6 +321,7 @@ - #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ -+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H 
-@@ -696,6 +698,9 @@ - extern void ext3_dirty_inode(struct inode *); - extern int ext3_change_inode_journal_flag(struct inode *, int); - extern void ext3_truncate (struct inode *); -+#ifdef EXT3_DELETE_THREAD -+extern void ext3_truncate_thread(struct inode *inode); -+#endif - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -Index: linux-2.4.20/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.20.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/include/linux/ext3_fs_sb.h 2004-01-13 16:55:45.000000000 +0300 -@@ -29,6 +29,8 @@ - - #define EXT3_MAX_GROUP_LOADED 8 - -+#define EXT3_DELETE_THREAD -+ - /* - * third extended-fs super-block data in memory - */ -@@ -76,6 +78,14 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+#ifdef EXT3_DELETE_THREAD -+ spinlock_t s_delete_lock; -+ struct list_head s_delete_list; -+ unsigned long s_delete_blocks; -+ unsigned long s_delete_inodes; -+ wait_queue_head_t s_delete_thread_queue; -+ wait_queue_head_t s_delete_waiter_queue; -+#endif - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.4.20/include/linux/fs.h -=================================================================== ---- linux-2.4.20.orig/include/linux/fs.h 2004-01-12 20:13:36.000000000 +0300 -+++ linux-2.4.20/include/linux/fs.h 2004-01-13 16:55:45.000000000 +0300 -@@ -917,6 +917,7 @@ - void (*delete_inode) (struct inode *); - void (*put_super) (struct super_block *); - void (*write_super) (struct super_block *); -+ int (*sync_fs) (struct super_block *); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct statfs *); diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch 
b/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch deleted file mode 100644 index 6e4c834..0000000 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch +++ /dev/null @@ -1,2584 +0,0 @@ - fs/ext3/Makefile | 2 - fs/ext3/dir.c | 302 +++++++++ - fs/ext3/file.c | 3 - fs/ext3/hash.c | 215 ++++++ - fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++----- - fs/ext3/super.c | 7 - include/linux/ext3_fs.h | 85 ++ - include/linux/ext3_fs_sb.h | 2 - include/linux/ext3_jbd.h | 2 - include/linux/rbtree.h | 2 - lib/rbtree.c | 42 + - 11 files changed, 1921 insertions(+), 161 deletions(-) - -Index: linux.mcp2/fs/ext3/dir.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/dir.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/dir.c 2004-05-17 15:07:06.000000000 -0700 -@@ -21,12 +21,16 @@ - #include - #include - #include -+#include -+#include - - static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK - }; - - static int ext3_readdir(struct file *, void *, filldir_t); -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir); - - struct file_operations ext3_dir_operations = { - read: generic_read_dir, -@@ -35,6 +39,17 @@ - fsync: ext3_sync_file, /* BKL held */ - }; - -+ -+static unsigned char get_dtype(struct super_block *sb, int filetype) -+{ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || -+ (filetype >= EXT3_FT_MAX)) -+ return DT_UNKNOWN; -+ -+ return (ext3_filetype_table[filetype]); -+} -+ -+ - int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, -@@ -79,6 +94,16 @@ - - sb = inode->i_sb; - -+ if (is_dx(inode)) { -+ err = ext3_dx_readdir(filp, dirent, filldir); -+ if (err != ERR_BAD_DX_DIR) -+ return err; -+ /* -+ * We don't set the inode dirty flag since it's not -+ * critical that it get flushed back to 
the disk. -+ */ -+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; -+ } - stored = 0; - bh = NULL; - offset = filp->f_pos & (sb->s_blocksize - 1); -@@ -162,18 +187,12 @@ - * during the copy operation. - */ - unsigned long version = filp->f_version; -- unsigned char d_type = DT_UNKNOWN; - -- if (EXT3_HAS_INCOMPAT_FEATURE(sb, -- EXT3_FEATURE_INCOMPAT_FILETYPE) -- && de->file_type < EXT3_FT_MAX) -- d_type = -- ext3_filetype_table[de->file_type]; - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), -- d_type); -+ get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) -@@ -188,3 +207,272 @@ - UPDATE_ATIME(inode); - return 0; - } -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * These functions convert from the major/minor hash to an f_pos -+ * value. -+ * -+ * Currently we only use major hash numer. This is unfortunate, but -+ * on 32-bit machines, the same VFS interface is used for lseek and -+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of -+ * lseek/telldir/seekdir will blow out spectacularly, and from within -+ * the ext2 low-level routine, we don't know if we're being called by -+ * a 64-bit version of the system call or the 32-bit version of the -+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir -+ * cookie. Sigh. -+ */ -+#define hash2pos(major, minor) (major >> 1) -+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -+#define pos2min_hash(pos) (0) -+ -+/* -+ * This structure holds the nodes of the red-black tree used to store -+ * the directory entry in hash order. -+ */ -+struct fname { -+ __u32 hash; -+ __u32 minor_hash; -+ rb_node_t rb_hash; -+ struct fname *next; -+ __u32 inode; -+ __u8 name_len; -+ __u8 file_type; -+ char name[0]; -+}; -+ -+/* -+ * This functoin implements a non-recursive way of freeing all of the -+ * nodes in the red-black tree. 
-+ */ -+static void free_rb_tree_fname(rb_root_t *root) -+{ -+ rb_node_t *n = root->rb_node; -+ rb_node_t *parent; -+ struct fname *fname; -+ -+ while (n) { -+ /* Do the node's children first */ -+ if ((n)->rb_left) { -+ n = n->rb_left; -+ continue; -+ } -+ if (n->rb_right) { -+ n = n->rb_right; -+ continue; -+ } -+ /* -+ * The node has no children; free it, and then zero -+ * out parent's link to it. Finally go to the -+ * beginning of the loop and try to free the parent -+ * node. -+ */ -+ parent = n->rb_parent; -+ fname = rb_entry(n, struct fname, rb_hash); -+ kfree(fname); -+ if (!parent) -+ root->rb_node = 0; -+ else if (parent->rb_left == n) -+ parent->rb_left = 0; -+ else if (parent->rb_right == n) -+ parent->rb_right = 0; -+ n = parent; -+ } -+ root->rb_node = 0; -+} -+ -+ -+struct dir_private_info *create_dir_info(loff_t pos) -+{ -+ struct dir_private_info *p; -+ -+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); -+ if (!p) -+ return NULL; -+ p->root.rb_node = 0; -+ p->curr_node = 0; -+ p->extra_fname = 0; -+ p->last_pos = 0; -+ p->curr_hash = pos2maj_hash(pos); -+ p->curr_minor_hash = pos2min_hash(pos); -+ p->next_hash = 0; -+ return p; -+} -+ -+void ext3_htree_free_dir_info(struct dir_private_info *p) -+{ -+ free_rb_tree_fname(&p->root); -+ kfree(p); -+} -+ -+/* -+ * Given a directory entry, enter it into the fname rb tree. 
-+ */ -+int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent) -+{ -+ rb_node_t **p, *parent = NULL; -+ struct fname * fname, *new_fn; -+ struct dir_private_info *info; -+ int len; -+ -+ info = (struct dir_private_info *) dir_file->private_data; -+ p = &info->root.rb_node; -+ -+ /* Create and allocate the fname structure */ -+ len = sizeof(struct fname) + dirent->name_len + 1; -+ new_fn = kmalloc(len, GFP_KERNEL); -+ if (!new_fn) -+ return -ENOMEM; -+ memset(new_fn, 0, len); -+ new_fn->hash = hash; -+ new_fn->minor_hash = minor_hash; -+ new_fn->inode = le32_to_cpu(dirent->inode); -+ new_fn->name_len = dirent->name_len; -+ new_fn->file_type = dirent->file_type; -+ memcpy(new_fn->name, dirent->name, dirent->name_len); -+ new_fn->name[dirent->name_len] = 0; -+ -+ while (*p) { -+ parent = *p; -+ fname = rb_entry(parent, struct fname, rb_hash); -+ -+ /* -+ * If the hash and minor hash match up, then we put -+ * them on a linked list. This rarely happens... -+ */ -+ if ((new_fn->hash == fname->hash) && -+ (new_fn->minor_hash == fname->minor_hash)) { -+ new_fn->next = fname->next; -+ fname->next = new_fn; -+ return 0; -+ } -+ -+ if (new_fn->hash < fname->hash) -+ p = &(*p)->rb_left; -+ else if (new_fn->hash > fname->hash) -+ p = &(*p)->rb_right; -+ else if (new_fn->minor_hash < fname->minor_hash) -+ p = &(*p)->rb_left; -+ else /* if (new_fn->minor_hash > fname->minor_hash) */ -+ p = &(*p)->rb_right; -+ } -+ -+ rb_link_node(&new_fn->rb_hash, parent, p); -+ rb_insert_color(&new_fn->rb_hash, &info->root); -+ return 0; -+} -+ -+ -+ -+/* -+ * This is a helper function for ext3_dx_readdir. It calls filldir -+ * for all entres on the fname linked list. (Normally there is only -+ * one entry on the linked list, unless there are 62 bit hash collisions.) 
-+ */ -+static int call_filldir(struct file * filp, void * dirent, -+ filldir_t filldir, struct fname *fname) -+{ -+ struct dir_private_info *info = filp->private_data; -+ loff_t curr_pos; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct super_block * sb; -+ int error; -+ -+ sb = inode->i_sb; -+ -+ if (!fname) { -+ printk("call_filldir: called with null fname?!?\n"); -+ return 0; -+ } -+ curr_pos = hash2pos(fname->hash, fname->minor_hash); -+ while (fname) { -+ error = filldir(dirent, fname->name, -+ fname->name_len, curr_pos, -+ fname->inode, -+ get_dtype(sb, fname->file_type)); -+ if (error) { -+ filp->f_pos = curr_pos; -+ info->extra_fname = fname->next; -+ return error; -+ } -+ fname = fname->next; -+ } -+ return 0; -+} -+ -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ struct dir_private_info *info = filp->private_data; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct fname *fname; -+ int ret; -+ -+ if (!info) { -+ info = create_dir_info(filp->f_pos); -+ if (!info) -+ return -ENOMEM; -+ filp->private_data = info; -+ } -+ -+ /* Some one has messed with f_pos; reset the world */ -+ if (info->last_pos != filp->f_pos) { -+ free_rb_tree_fname(&info->root); -+ info->curr_node = 0; -+ info->extra_fname = 0; -+ info->curr_hash = pos2maj_hash(filp->f_pos); -+ info->curr_minor_hash = pos2min_hash(filp->f_pos); -+ } -+ -+ /* -+ * If there are any leftover names on the hash collision -+ * chain, return them first. -+ */ -+ if (info->extra_fname && -+ call_filldir(filp, dirent, filldir, info->extra_fname)) -+ goto finished; -+ -+ if (!info->curr_node) -+ info->curr_node = rb_get_first(&info->root); -+ -+ while (1) { -+ /* -+ * Fill the rbtree if we have no more entries, -+ * or the inode has changed since we last read in the -+ * cached entries. 
-+ */ -+ if ((!info->curr_node) || -+ (filp->f_version != inode->i_version)) { -+ info->curr_node = 0; -+ free_rb_tree_fname(&info->root); -+ filp->f_version = inode->i_version; -+ ret = ext3_htree_fill_tree(filp, info->curr_hash, -+ info->curr_minor_hash, -+ &info->next_hash); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ break; -+ info->curr_node = rb_get_first(&info->root); -+ } -+ -+ fname = rb_entry(info->curr_node, struct fname, rb_hash); -+ info->curr_hash = fname->hash; -+ info->curr_minor_hash = fname->minor_hash; -+ if (call_filldir(filp, dirent, filldir, fname)) -+ break; -+ -+ info->curr_node = rb_get_next(info->curr_node); -+ if (!info->curr_node) { -+ info->curr_hash = info->next_hash; -+ info->curr_minor_hash = 0; -+ } -+ } -+finished: -+ info->last_pos = filp->f_pos; -+ UPDATE_ATIME(inode); -+ return 0; -+} -+#endif -Index: linux.mcp2/fs/ext3/file.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/file.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/file.c 2004-05-17 15:07:06.000000000 -0700 -@@ -35,6 +35,9 @@ - { - if (filp->f_mode & FMODE_WRITE) - ext3_discard_prealloc (inode); -+ if (is_dx(inode) && filp->private_data) -+ ext3_htree_free_dir_info(filp->private_data); -+ - return 0; - } - -Index: linux.mcp2/fs/ext3/hash.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/hash.c 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/hash.c 2004-05-17 15:07:06.000000000 -0700 -@@ -0,0 +1,215 @@ -+/* -+ * linux/fs/ext3/hash.c -+ * -+ * Copyright (C) 2002 by Theodore Ts'o -+ * -+ * This file is released under the GPL v2. -+ * -+ * This file may be redistributed under the terms of the GNU Public -+ * License. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define DELTA 0x9E3779B9 -+ -+static void TEA_transform(__u32 buf[4], __u32 const in[]) -+{ -+ __u32 sum = 0; -+ __u32 b0 = buf[0], b1 = buf[1]; -+ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; -+ int n = 16; -+ -+ do { -+ sum += DELTA; -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); -+ } while(--n); -+ -+ buf[0] += b0; -+ buf[1] += b1; -+} -+ -+/* F, G and H are basic MD4 functions: selection, majority, parity */ -+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -+#define H(x, y, z) ((x) ^ (y) ^ (z)) -+ -+/* -+ * The generic round function. The application is so specific that -+ * we don't bother protecting all the arguments with parens, as is generally -+ * good macro practice, in favor of extra legibility. -+ * Rotation is separate from addition to prevent recomputation -+ */ -+#define ROUND(f, a, b, c, d, x, s) \ -+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) -+#define K1 0 -+#define K2 013240474631UL -+#define K3 015666365641UL -+ -+/* -+ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
-+ */ -+static void halfMD4Transform (__u32 buf[4], __u32 const in[]) -+{ -+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; -+ -+ /* Round 1 */ -+ ROUND(F, a, b, c, d, in[0] + K1, 3); -+ ROUND(F, d, a, b, c, in[1] + K1, 7); -+ ROUND(F, c, d, a, b, in[2] + K1, 11); -+ ROUND(F, b, c, d, a, in[3] + K1, 19); -+ ROUND(F, a, b, c, d, in[4] + K1, 3); -+ ROUND(F, d, a, b, c, in[5] + K1, 7); -+ ROUND(F, c, d, a, b, in[6] + K1, 11); -+ ROUND(F, b, c, d, a, in[7] + K1, 19); -+ -+ /* Round 2 */ -+ ROUND(G, a, b, c, d, in[1] + K2, 3); -+ ROUND(G, d, a, b, c, in[3] + K2, 5); -+ ROUND(G, c, d, a, b, in[5] + K2, 9); -+ ROUND(G, b, c, d, a, in[7] + K2, 13); -+ ROUND(G, a, b, c, d, in[0] + K2, 3); -+ ROUND(G, d, a, b, c, in[2] + K2, 5); -+ ROUND(G, c, d, a, b, in[4] + K2, 9); -+ ROUND(G, b, c, d, a, in[6] + K2, 13); -+ -+ /* Round 3 */ -+ ROUND(H, a, b, c, d, in[3] + K3, 3); -+ ROUND(H, d, a, b, c, in[7] + K3, 9); -+ ROUND(H, c, d, a, b, in[2] + K3, 11); -+ ROUND(H, b, c, d, a, in[6] + K3, 15); -+ ROUND(H, a, b, c, d, in[1] + K3, 3); -+ ROUND(H, d, a, b, c, in[5] + K3, 9); -+ ROUND(H, c, d, a, b, in[0] + K3, 11); -+ ROUND(H, b, c, d, a, in[4] + K3, 15); -+ -+ buf[0] += a; -+ buf[1] += b; -+ buf[2] += c; -+ buf[3] += d; -+} -+ -+#undef ROUND -+#undef F -+#undef G -+#undef H -+#undef K1 -+#undef K2 -+#undef K3 -+ -+/* The old legacy hash */ -+static __u32 dx_hack_hash (const char *name, int len) -+{ -+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; -+ while (len--) { -+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); -+ -+ if (hash & 0x80000000) hash -= 0x7fffffff; -+ hash1 = hash0; -+ hash0 = hash; -+ } -+ return (hash0 << 1); -+} -+ -+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -+{ -+ __u32 pad, val; -+ int i; -+ -+ pad = (__u32)len | ((__u32)len << 8); -+ pad |= pad << 16; -+ -+ val = pad; -+ if (len > num*4) -+ len = num * 4; -+ for (i=0; i < len; i++) { -+ if ((i % 4) == 0) -+ val = pad; -+ val = msg[i] + (val << 8); -+ if ((i % 4) == 3) { -+ 
*buf++ = val; -+ val = pad; -+ num--; -+ } -+ } -+ if (--num >= 0) -+ *buf++ = val; -+ while (--num >= 0) -+ *buf++ = pad; -+} -+ -+/* -+ * Returns the hash of a filename. If len is 0 and name is NULL, then -+ * this function can be used to test whether or not a hash version is -+ * supported. -+ * -+ * The seed is an 4 longword (32 bits) "secret" which can be used to -+ * uniquify a hash. If the seed is all zero's, then some default seed -+ * may be used. -+ * -+ * A particular hash version specifies whether or not the seed is -+ * represented, and whether or not the returned hash is 32 bits or 64 -+ * bits. 32 bit hashes will return 0 for the minor hash. -+ */ -+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -+{ -+ __u32 hash; -+ __u32 minor_hash = 0; -+ const char *p; -+ int i; -+ __u32 in[8], buf[4]; -+ -+ /* Initialize the default seed for the hash checksum functions */ -+ buf[0] = 0x67452301; -+ buf[1] = 0xefcdab89; -+ buf[2] = 0x98badcfe; -+ buf[3] = 0x10325476; -+ -+ /* Check to see if the seed is all zero's */ -+ if (hinfo->seed) { -+ for (i=0; i < 4; i++) { -+ if (hinfo->seed[i]) -+ break; -+ } -+ if (i < 4) -+ memcpy(buf, hinfo->seed, sizeof(buf)); -+ } -+ -+ switch (hinfo->hash_version) { -+ case DX_HASH_LEGACY: -+ hash = dx_hack_hash(name, len); -+ break; -+ case DX_HASH_HALF_MD4: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 8); -+ halfMD4Transform(buf, in); -+ len -= 32; -+ p += 32; -+ } -+ minor_hash = buf[2]; -+ hash = buf[1]; -+ break; -+ case DX_HASH_TEA: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 4); -+ TEA_transform(buf, in); -+ len -= 16; -+ p += 16; -+ } -+ hash = buf[0]; -+ minor_hash = buf[1]; -+ break; -+ default: -+ hinfo->hash = 0; -+ return -1; -+ } -+ hinfo->hash = hash & ~1; -+ hinfo->minor_hash = minor_hash; -+ return 0; -+} -Index: linux.mcp2/fs/ext3/Makefile -=================================================================== ---- linux.mcp2.orig/fs/ext3/Makefile 
2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:07:06.000000000 -0700 -@@ -10,7 +10,7 @@ - O_TARGET := ext3.o - - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o -+ ioctl.o namei.o super.o symlink.o hash.o - obj-m := $(O_TARGET) - - include $(TOPDIR)/Rules.make -Index: linux.mcp2/fs/ext3/namei.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:07:06.000000000 -0700 -@@ -16,6 +16,12 @@ - * David S. Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ * Hash Tree Directory indexing (c) -+ * Daniel Phillips, 2001 -+ * Hash Tree Directory indexing porting -+ * Christopher Li, 2002 -+ * Hash Tree Directory indexing cleanup -+ * Theodore Ts'o, 2002 - */ - - #include -@@ -38,6 +44,642 @@ - #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - -+static struct buffer_head *ext3_append(handle_t *handle, -+ struct inode *inode, -+ u32 *block, int *err) -+{ -+ struct buffer_head *bh; -+ -+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ -+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { -+ inode->i_size += inode->i_sb->s_blocksize; -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_journal_get_write_access(handle,bh); -+ } -+ return bh; -+} -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#ifndef swap -+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -+#endif -+ -+typedef struct { u32 v; } le_u32; -+typedef struct { u16 v; } le_u16; -+ -+#ifdef DX_DEBUG -+#define dxtrace(command) command -+#else -+#define dxtrace(command) -+#endif -+ -+struct fake_dirent -+{ -+ /*le*/u32 inode; -+ /*le*/u16 rec_len; 
-+ u8 name_len; -+ u8 file_type; -+}; -+ -+struct dx_countlimit -+{ -+ le_u16 limit; -+ le_u16 count; -+}; -+ -+struct dx_entry -+{ -+ le_u32 hash; -+ le_u32 block; -+}; -+ -+/* -+ * dx_root_info is laid out so that if it should somehow get overlaid by a -+ * dirent the two low bits of the hash version will be zero. Therefore, the -+ * hash version mod 4 should never be 0. Sincerely, the paranoia department. -+ */ -+ -+struct dx_root -+{ -+ struct fake_dirent dot; -+ char dot_name[4]; -+ struct fake_dirent dotdot; -+ char dotdot_name[4]; -+ struct dx_root_info -+ { -+ le_u32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; -+ } -+ info; -+ struct dx_entry entries[0]; -+}; -+ -+struct dx_node -+{ -+ struct fake_dirent fake; -+ struct dx_entry entries[0]; -+}; -+ -+ -+struct dx_frame -+{ -+ struct buffer_head *bh; -+ struct dx_entry *entries; -+ struct dx_entry *at; -+}; -+ -+struct dx_map_entry -+{ -+ u32 hash; -+ u32 offs; -+}; -+ -+#ifdef CONFIG_EXT3_INDEX -+static inline unsigned dx_get_block (struct dx_entry *entry); -+static void dx_set_block (struct dx_entry *entry, unsigned value); -+static inline unsigned dx_get_hash (struct dx_entry *entry); -+static void dx_set_hash (struct dx_entry *entry, unsigned value); -+static unsigned dx_get_count (struct dx_entry *entries); -+static unsigned dx_get_limit (struct dx_entry *entries); -+static void dx_set_count (struct dx_entry *entries, unsigned value); -+static void dx_set_limit (struct dx_entry *entries, unsigned value); -+static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -+static unsigned dx_node_limit (struct inode *dir); -+static struct dx_frame *dx_probe(struct dentry *dentry, -+ struct inode *dir, -+ struct dx_hash_info *hinfo, -+ struct dx_frame *frame, -+ int *err); -+static void dx_release (struct dx_frame *frames); -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry 
map[]); -+static void dx_sort_map(struct dx_map_entry *map, unsigned count); -+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, -+ struct dx_map_entry *offsets, int count); -+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); -+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash); -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err); -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+ -+/* -+ * Future: use high four bits of block for coalesce-on-delete flags -+ * Mask them off for now. -+ */ -+ -+static inline unsigned dx_get_block (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->block.v) & 0x00ffffff; -+} -+ -+static inline void dx_set_block (struct dx_entry *entry, unsigned value) -+{ -+ entry->block.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_hash (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->hash.v); -+} -+ -+static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+{ -+ entry->hash.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_count (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); -+} -+ -+static inline unsigned dx_get_limit (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); -+} -+ -+static inline void dx_set_count (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); -+} -+ -+static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); -+} -+ -+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 
-+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - -+ EXT3_DIR_REC_LEN(2) - infosize; -+ return 0? 20: entry_space / sizeof(struct dx_entry); -+} -+ -+static inline unsigned dx_node_limit (struct inode *dir) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); -+ return 0? 22: entry_space / sizeof(struct dx_entry); -+} -+ -+/* -+ * Debug -+ */ -+#ifdef DX_DEBUG -+struct stats -+{ -+ unsigned names; -+ unsigned space; -+ unsigned bcount; -+}; -+ -+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, -+ int size, int show_names) -+{ -+ unsigned names = 0, space = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ printk("names: "); -+ while ((char *) de < base + size) -+ { -+ if (de->inode) -+ { -+ if (show_names) -+ { -+ int len = de->name_len; -+ char *name = de->name; -+ while (len--) printk("%c", *name++); -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ printk(":%x.%u ", h.hash, -+ ((char *) de - base)); -+ } -+ space += EXT3_DIR_REC_LEN(de->name_len); -+ names++; -+ } -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ printk("(%i)\n", names); -+ return (struct stats) { names, space, 1 }; -+} -+ -+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, -+ struct dx_entry *entries, int levels) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count = dx_get_count (entries), names = 0, space = 0, i; -+ unsigned bcount = 0; -+ struct buffer_head *bh; -+ int err; -+ printk("%i indexed blocks...\n", count); -+ for (i = 0; i < count; i++, entries++) -+ { -+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; -+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; -+ struct stats stats; -+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); -+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; -+ stats = levels? 
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): -+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); -+ names += stats.names; -+ space += stats.space; -+ bcount += stats.bcount; -+ brelse (bh); -+ } -+ if (bcount) -+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", -+ names, space/bcount,(space/bcount)*100/blocksize); -+ return (struct stats) { names, space, bcount}; -+} -+#endif /* DX_DEBUG */ -+ -+/* -+ * Probe for a directory leaf block to search. -+ * -+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -+ * error in the directory index, and the caller should fall back to -+ * searching the directory normally. The callers of dx_probe **MUST** -+ * check for this error code, and make sure it never gets reflected -+ * back to userspace. -+ */ -+static struct dx_frame * -+dx_probe(struct dentry *dentry, struct inode *dir, -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+{ -+ unsigned count, indirect; -+ struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_root *root; -+ struct buffer_head *bh; -+ struct dx_frame *frame = frame_in; -+ u32 hash; -+ -+ frame->bh = NULL; -+ if (dentry) -+ dir = dentry->d_parent->d_inode; -+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) -+ goto fail; -+ root = (struct dx_root *) bh->b_data; -+ if (root->info.hash_version != DX_HASH_TEA && -+ root->info.hash_version != DX_HASH_HALF_MD4 && -+ root->info.hash_version != DX_HASH_LEGACY) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unrecognised inode hash code %d", -+ root->info.hash_version); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ hinfo->hash_version = root->info.hash_version; -+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ if (dentry) -+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ hash = hinfo->hash; -+ -+ if (root->info.unused_flags & 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash flags: 
%#06x", -+ root->info.unused_flags); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ if ((indirect = root->info.indirect_levels) > 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash depth: %#06x", -+ root->info.indirect_levels); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ while (1) -+ { -+ count = dx_get_count(entries); -+ assert (count && count <= dx_get_limit(entries)); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ dxtrace(printk(".")); -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ -+ if (0) // linear search cross check -+ { -+ unsigned n = count - 1; -+ at = entries; -+ while (n--) -+ { -+ dxtrace(printk(",")); -+ if (dx_get_hash(++at) > hash) -+ { -+ at--; -+ break; -+ } -+ } -+ assert (at == p - 1); -+ } -+ -+ at = p - 1; -+ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); -+ frame->bh = bh; -+ frame->entries = entries; -+ frame->at = at; -+ if (!indirect--) return frame; -+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) -+ goto fail2; -+ at = entries = ((struct dx_node *) bh->b_data)->entries; -+ assert (dx_get_limit(entries) == dx_node_limit (dir)); -+ frame++; -+ } -+fail2: -+ while (frame >= frame_in) { -+ brelse(frame->bh); -+ frame--; -+ } -+fail: -+ return NULL; -+} -+ -+static void dx_release (struct dx_frame *frames) -+{ -+ if (frames[0].bh == NULL) -+ return; -+ -+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ brelse(frames[1].bh); -+ brelse(frames[0].bh); -+} -+ -+/* -+ * This function increments the frame pointer to search the next leaf -+ * block, and reads in the necessary intervening nodes if the search -+ * should be necessary. Whether or not the search is necessary is -+ * controlled by the hash parameter. If the hash value is even, then -+ * the search is only continued if the next block starts with that -+ * hash value. This is used if we are searching for a specific file. -+ * -+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. -+ * -+ * This function returns 1 if the caller should continue to search, -+ * or 0 if it should not. If there is an error reading one of the -+ * index blocks, it will return -1. -+ * -+ * If start_hash is non-null, it will be filled in with the starting -+ * hash of the next page. -+ */ -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash) -+{ -+ struct dx_frame *p; -+ struct buffer_head *bh; -+ int num_frames = 0; -+ __u32 bhash; -+ -+ *err = ENOENT; -+ p = frame; -+ /* -+ * Find the next leaf page by incrementing the frame pointer. -+ * If we run out of entries in the interior node, loop around and -+ * increment pointer in the parent node. 
When we break out of -+ * this loop, num_frames indicates the number of interior -+ * nodes need to be read. -+ */ -+ while (1) { -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ if (p == frames) -+ return 0; -+ num_frames++; -+ p--; -+ } -+ -+ /* -+ * If the hash is 1, then continue only if the next page has a -+ * continuation hash of any value. This is used for readdir -+ * handling. Otherwise, check to see if the hash matches the -+ * desired contiuation hash. If it doesn't, return since -+ * there's no point to read in the successive index pages. -+ */ -+ bhash = dx_get_hash(p->at); -+ if (start_hash) -+ *start_hash = bhash; -+ if ((hash & 1) == 0) { -+ if ((bhash & ~1) != hash) -+ return 0; -+ } -+ /* -+ * If the hash is HASH_NB_ALWAYS, we always go to the next -+ * block so no check is necessary -+ */ -+ while (num_frames--) { -+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), -+ 0, err))) -+ return -1; /* Failure */ -+ p++; -+ brelse (p->bh); -+ p->bh = bh; -+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ } -+ return 1; -+} -+ -+ -+/* -+ * p is at least 6 bytes before the end of page -+ */ -+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) -+{ -+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); -+} -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory. We start scanning the directory in hash order, starting -+ * at start_hash and start_minor_hash. -+ * -+ * This function returns the number of entries inserted into the tree, -+ * or a negative error code. 
-+ */ -+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash) -+{ -+ struct dx_hash_info hinfo; -+ struct buffer_head *bh; -+ struct ext3_dir_entry_2 *de, *top; -+ static struct dx_frame frames[2], *frame; -+ struct inode *dir; -+ int block, err; -+ int count = 0; -+ int ret; -+ __u32 hashval; -+ -+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, -+ start_minor_hash)); -+ dir = dir_file->f_dentry->d_inode; -+ hinfo.hash = start_hash; -+ hinfo.minor_hash = 0; -+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ -+ /* Add '.' and '..' from the htree header */ -+ if (!start_hash && !start_minor_hash) { -+ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; -+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) -+ goto errout; -+ de = ext3_next_entry(de); -+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) -+ goto errout; -+ count += 2; -+ } -+ -+ while (1) { -+ block = dx_get_block(frame->at); -+ dxtrace(printk("Reading block %d\n", block)); -+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) -+ goto errout; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) { -+ ext3fs_dirhash(de->name, de->name_len, &hinfo); -+ if ((hinfo.hash < start_hash) || -+ ((hinfo.hash == start_hash) && -+ (hinfo.minor_hash < start_minor_hash))) -+ continue; -+ if ((err = ext3_htree_store_dirent(dir_file, -+ hinfo.hash, hinfo.minor_hash, de)) != 0) -+ goto errout; -+ count++; -+ } -+ brelse (bh); -+ hashval = ~1; -+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, -+ frame, frames, &err, &hashval); -+ if (next_hash) -+ *next_hash = hashval; -+ if (ret == -1) -+ goto errout; -+ /* -+ * Stop if: (a) there are no more entries, or -+ * (b) we have inserted at least one entry and the -+ * 
next hash value is not a continuation -+ */ -+ if ((ret == 0) || -+ (count && ((hashval & 1) == 0))) -+ break; -+ } -+ dx_release(frames); -+ dxtrace(printk("Fill tree: returned %d entries\n", count)); -+ return count; -+errout: -+ dx_release(frames); -+ return (err); -+} -+ -+ -+/* -+ * Directory block splitting, compacting -+ */ -+ -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -+{ -+ int count = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ while ((char *) de < base + size) -+ { -+ if (de->name_len && de->inode) { -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ map_tail--; -+ map_tail->hash = h.hash; -+ map_tail->offs = (u32) ((char *) de - base); -+ count++; -+ } -+ /* XXX: do we need to check rec_len == 0 case? -Chris */ -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return count; -+} -+ -+static void dx_sort_map (struct dx_map_entry *map, unsigned count) -+{ -+ struct dx_map_entry *p, *q, *top = map + count - 1; -+ int more; -+ /* Combsort until bubble sort doesn't suck */ -+ while (count > 2) -+ { -+ count = count*10/13; -+ if (count - 9 < 2) /* 9, 10 -> 11 */ -+ count = 11; -+ for (p = top, q = p - count; q >= map; p--, q--) -+ if (p->hash < q->hash) -+ swap(*p, *q); -+ } -+ /* Garden variety bubble sort */ -+ do { -+ more = 0; -+ q = top; -+ while (q-- > map) -+ { -+ if (q[1].hash >= q[0].hash) -+ continue; -+ swap(*(q+1), *q); -+ more = 1; -+ } -+ } while(more); -+} -+ -+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+{ -+ struct dx_entry *entries = frame->entries; -+ struct dx_entry *old = frame->at, *new = old + 1; -+ int count = dx_get_count(entries); -+ -+ assert(count < dx_get_limit(entries)); -+ assert(old < entries + count); -+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); -+ dx_set_hash(new, hash); -+ dx_set_block(new, block); -+ dx_set_count(entries, 
count + 1); -+} -+#endif -+ -+ -+static void ext3_update_dx_flag(struct inode *inode) -+{ -+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -+} -+ - /* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. - * -@@ -94,6 +736,7 @@ - return 0; - } - -+ - /* - * ext3_find_entry() - * -@@ -105,6 +748,8 @@ - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -+ -+ - static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) - { -@@ -119,12 +764,32 @@ - int num = 0; - int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; - - *res_dir = NULL; - sb = dir->i_sb; -- -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3_NAME_LEN) -+ return NULL; -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ bh = ext3_dx_find_entry(dentry, res_dir, &err); -+ /* -+ * On success, or if the error was file not found, -+ * return. Otherwise, fall back to doing a search the -+ * old fashioned way. 
-+ */ -+ if (bh || (err != ERR_BAD_DX_DIR)) -+ return bh; -+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); -+ } -+#endif - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -- start = dir->u.ext3_i.i_dir_start_lookup; -+ start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -@@ -165,7 +830,7 @@ - i = search_dirblock(bh, dir, dentry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { -- dir->u.ext3_i.i_dir_start_lookup = block; -+ EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { -@@ -196,6 +861,66 @@ - return ret; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err) -+{ -+ struct super_block * sb; -+ struct dx_hash_info hinfo; -+ u32 hash; -+ struct dx_frame frames[2], *frame; -+ struct ext3_dir_entry_2 *de, *top; -+ struct buffer_head *bh; -+ unsigned long block; -+ int retval; -+ int namelen = dentry->d_name.len; -+ const u8 *name = dentry->d_name.name; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ sb = dir->i_sb; -+ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) -+ return NULL; -+ hash = hinfo.hash; -+ do { -+ block = dx_get_block(frame->at); -+ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) -+ goto errout; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) -+ if (ext3_match (namelen, name, de)) { -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, -+ (block<b_data))) { -+ brelse (bh); -+ goto errout; -+ } -+ *res_dir = de; -+ dx_release (frames); -+ return bh; -+ } -+ brelse (bh); -+ /* Check to see if we should continue to search */ -+ retval = ext3_htree_next_block(dir, hash, frame, -+ frames, err, 0); -+ if (retval == -1) { -+ ext3_warning(sb, __FUNCTION__, -+ "error reading index 
page in directory #%lu", -+ dir->i_ino); -+ goto errout; -+ } -+ } while (retval == 1); -+ -+ *err = -ENOENT; -+errout: -+ dxtrace(printk("%s not found\n", name)); -+ dx_release (frames); -+ return NULL; -+} -+#endif -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) - { - struct inode * inode; -@@ -212,8 +937,9 @@ - brelse (bh); - inode = iget(dir->i_sb, ino); - -- if (!inode) -+ if (!inode) { - return ERR_PTR(-EACCES); -+ } - } - d_add(dentry, inode); - return NULL; -@@ -237,6 +963,301 @@ - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct ext3_dir_entry_2 * -+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) -+{ -+ unsigned rec_len = 0; -+ -+ while (count--) { -+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); -+ de->inode = 0; -+ map++; -+ to += rec_len; -+ } -+ return (struct ext3_dir_entry_2 *) (to - rec_len); -+} -+ -+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) -+{ -+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; -+ unsigned rec_len = 0; -+ -+ prev = to = de; -+ while ((char*)de < base + size) { -+ next = (struct ext3_dir_entry_2 *) ((char *) de + -+ le16_to_cpu(de->rec_len)); -+ if (de->inode && de->name_len) { -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ if (de > to) -+ memmove(to, de, rec_len); -+ to->rec_len = cpu_to_le16(rec_len); -+ prev = to; -+ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); -+ } -+ de = next; -+ } -+ return prev; -+} -+ -+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -+ struct buffer_head **bh,struct dx_frame *frame, -+ struct dx_hash_info *hinfo, int *error) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; -+ 
struct buffer_head *bh2; -+ u32 newblock; -+ u32 hash2; -+ struct dx_map_entry *map; -+ char *data1 = (*bh)->b_data, *data2; -+ unsigned split; -+ struct ext3_dir_entry_2 *de = NULL, *de2; -+ int err; -+ -+ bh2 = ext3_append (handle, dir, &newblock, error); -+ if (!(bh2)) { -+ brelse(*bh); -+ *bh = NULL; -+ goto errout; -+ } -+ -+ BUFFER_TRACE(*bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, *bh); -+ if (err) { -+ journal_error: -+ brelse(*bh); -+ brelse(bh2); -+ *bh = NULL; -+ ext3_std_error(dir->i_sb, err); -+ goto errout; -+ } -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ -+ data2 = bh2->b_data; -+ -+ /* create map in the end of data2 block */ -+ map = (struct dx_map_entry *) (data2 + blocksize); -+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, -+ blocksize, hinfo, map); -+ map -= count; -+ split = count/2; // need to adjust to actual middle -+ dx_sort_map (map, count); -+ hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; -+ dxtrace(printk("Split block %i at %x, %i/%i\n", -+ dx_get_block(frame->at), hash2, split, count-split)); -+ -+ /* Fancy dance to stay within two buffers */ -+ de2 = dx_move_dirents(data1, data2, map + split, count - split); -+ de = dx_pack_dirents(data1,blocksize); -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); -+ -+ /* Which block gets the new entry? 
*/ -+ if (hinfo->hash >= hash2) -+ { -+ swap(*bh, bh2); -+ de = de2; -+ } -+ dx_insert_block (frame, hash2 + continued, newblock); -+ err = ext3_journal_dirty_metadata (handle, bh2); -+ if (err) -+ goto journal_error; -+ err = ext3_journal_dirty_metadata (handle, frame->bh); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ dxtrace(dx_show_index ("frame", frame->entries)); -+errout: -+ return de; -+} -+#endif -+ -+ -+/* -+ * Add a new entry into a directory (leaf) block. If de is non-NULL, -+ * it points to a directory entry which is guaranteed to be large -+ * enough for new directory entry. If de is NULL, then -+ * add_dirent_to_buf will attempt search the directory block for -+ * space. It will return -ENOSPC if no space is available, and -EIO -+ * and -EEXIST if directory entry already exists. -+ * -+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In -+ * all other cases bh is released. -+ */ -+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct ext3_dir_entry_2 *de, -+ struct buffer_head * bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset = 0; -+ unsigned short reclen; -+ int nlen, rlen, err; -+ char *top; -+ -+ reclen = EXT3_DIR_REC_LEN(namelen); -+ if (!de) { -+ de = (struct ext3_dir_entry_2 *)bh->b_data; -+ top = bh->b_data + dir->i_sb->s_blocksize - reclen; -+ while ((char *) de <= top) { -+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, -+ bh, offset)) { -+ brelse (bh); -+ return -EIO; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? 
rlen - nlen: rlen) >= reclen) -+ break; -+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); -+ offset += rlen; -+ } -+ if ((char *) de > top) -+ return -ENOSPC; -+ } -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return err; -+ } -+ -+ /* By now the buffer is marked for journaling */ -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if (de->inode) { -+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); -+ de1->rec_len = cpu_to_le16(rlen - nlen); -+ de->rec_len = cpu_to_le16(nlen); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. -+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. -+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ ext3_update_dx_flag(dir); -+ dir->i_version = ++event; -+ ext3_mark_inode_dirty(handle, dir); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return 0; -+} -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * This converts a one block unindexed directory to a 3 block indexed -+ * directory, and adds the dentry to the indexed directory. 
-+ */ -+static int make_indexed_dir(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct buffer_head *bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ struct buffer_head *bh2; -+ struct dx_root *root; -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries; -+ struct ext3_dir_entry_2 *de, *de2; -+ char *data1, *top; -+ unsigned len; -+ int retval; -+ unsigned blocksize; -+ struct dx_hash_info hinfo; -+ u32 block; -+ -+ blocksize = dir->i_sb->s_blocksize; -+ dxtrace(printk("Creating index\n")); -+ retval = ext3_journal_get_write_access(handle, bh); -+ if (retval) { -+ ext3_std_error(dir->i_sb, retval); -+ brelse(bh); -+ return retval; -+ } -+ root = (struct dx_root *) bh->b_data; -+ -+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; -+ bh2 = ext3_append (handle, dir, &block, &retval); -+ if (!(bh2)) { -+ brelse(bh); -+ return retval; -+ } -+ data1 = bh2->b_data; -+ -+ /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *)&root->dotdot; -+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); -+ len = ((char *) root) + blocksize - (char *) de; -+ memcpy (data1, de, len); -+ de = (struct ext3_dir_entry_2 *) data1; -+ top = data1 + len; -+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) -+ de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ /* Initialize the root; the dot dirents already exist */ -+ de = (struct ext3_dir_entry_2 *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); -+ memset (&root->info, 0, sizeof(root->info)); -+ root->info.info_length = sizeof(root->info); -+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; -+ entries = root->entries; -+ dx_set_block (entries, 1); -+ dx_set_count (entries, 1); -+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); -+ -+ /* Initialize as for dx_probe 
*/ -+ hinfo.hash_version = root->info.hash_version; -+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ ext3fs_dirhash(name, namelen, &hinfo); -+ frame = frames; -+ frame->entries = entries; -+ frame->at = entries; -+ frame->bh = bh; -+ bh = bh2; -+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ dx_release (frames); -+ if (!(de)) -+ return retval; -+ -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+#endif -+ - /* - * ext3_add_entry() - * -@@ -247,127 +1268,198 @@ - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ -- --/* -- * AKPM: the journalling code here looks wrong on the error paths -- */ - static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; -- const char *name = dentry->d_name.name; -- int namelen = dentry->d_name.len; - unsigned long offset; -- unsigned short rec_len; - struct buffer_head * bh; -- struct ext3_dir_entry_2 * de, * de1; -+ struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; -+#ifdef CONFIG_EXT3_INDEX -+ int dx_fallback=0; -+#endif -+ unsigned blocksize; -+ unsigned nlen, rlen; -+ u32 block, blocks; - - sb = dir->i_sb; -- -- if (!namelen) -+ blocksize = sb->s_blocksize; -+ if (!dentry->d_name.len) - return -EINVAL; -- bh = ext3_bread (handle, dir, 0, 0, &retval); -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ retval = ext3_dx_add_entry(handle, dentry, inode); -+ if (!retval || (retval != ERR_BAD_DX_DIR)) -+ return retval; -+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; -+ dx_fallback++; -+ ext3_mark_inode_dirty(handle, dir); -+ } -+#endif -+ blocks = dir->i_size >> sb->s_blocksize_bits; -+ for (block = 0, offset = 0; block < blocks; block++) { -+ bh = ext3_bread(handle, dir, block, 0, &retval); -+ if(!bh) -+ return retval; -+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (retval != -ENOSPC) -+ return 
retval; -+ -+#ifdef CONFIG_EXT3_INDEX -+ if (blocks == 1 && !dx_fallback && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ return make_indexed_dir(handle, dentry, inode, bh); -+#endif -+ brelse(bh); -+ } -+ bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; -- rec_len = EXT3_DIR_REC_LEN(namelen); -- offset = 0; - de = (struct ext3_dir_entry_2 *) bh->b_data; -- while (1) { -- if ((char *)de >= sb->s_blocksize + bh->b_data) { -- brelse (bh); -- bh = NULL; -- bh = ext3_bread (handle, dir, -- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -- if (!bh) -- return retval; -- if (dir->i_size <= offset) { -- if (dir->i_size == 0) { -- brelse(bh); -- return -ENOENT; -- } -+ de->inode = 0; -+ de->rec_len = cpu_to_le16(rlen = blocksize); -+ nlen = 0; -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} - -- ext3_debug ("creating next block\n"); -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * Returns 0 for success, or a negative error value -+ */ -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries, *at; -+ struct dx_hash_info hinfo; -+ struct buffer_head * bh; -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block * sb = dir->i_sb; -+ struct ext3_dir_entry_2 *de; -+ int err; - -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- de->inode = 0; -- de->rec_len = le16_to_cpu(sb->s_blocksize); -- dir->u.ext3_i.i_disksize = -- dir->i_size = offset + sb->s_blocksize; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- } else { -+ frame = dx_probe(dentry, 0, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ entries = frame->entries; -+ at = frame->at; - -- ext3_debug ("skipping to next block\n"); -+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ goto cleanup; - -- 
de = (struct ext3_dir_entry_2 *) bh->b_data; -- } -- } -- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -- offset)) { -- brelse (bh); -- return -ENOENT; -- } -- if (ext3_match (namelen, name, de)) { -- brelse (bh); -- return -EEXIST; -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto journal_error; -+ -+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (err != -ENOSPC) { -+ bh = 0; -+ goto cleanup; -+ } -+ -+ /* Block full, should compress but for now just split */ -+ dxtrace(printk("using %u of %u node entries\n", -+ dx_get_count(entries), dx_get_limit(entries))); -+ /* Need to split index? */ -+ if (dx_get_count(entries) == dx_get_limit(entries)) { -+ u32 newblock; -+ unsigned icount = dx_get_count(entries); -+ int levels = frame - frames; -+ struct dx_entry *entries2; -+ struct dx_node *node2; -+ struct buffer_head *bh2; -+ -+ if (levels && (dx_get_count(frames->entries) == -+ dx_get_limit(frames->entries))) { -+ ext3_warning(sb, __FUNCTION__, -+ "Directory index full!\n"); -+ err = -ENOSPC; -+ goto cleanup; - } -- if ((le32_to_cpu(de->inode) == 0 && -- le16_to_cpu(de->rec_len) >= rec_len) || -- (le16_to_cpu(de->rec_len) >= -- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- /* By now the buffer is marked for journaling */ -- offset += le16_to_cpu(de->rec_len); -- if (le32_to_cpu(de->inode)) { -- de1 = (struct ext3_dir_entry_2 *) ((char *) de + -- EXT3_DIR_REC_LEN(de->name_len)); -- de1->rec_len = -- cpu_to_le16(le16_to_cpu(de->rec_len) - -- EXT3_DIR_REC_LEN(de->name_len)); -- de->rec_len = cpu_to_le16( -- EXT3_DIR_REC_LEN(de->name_len)); -- de = de1; -+ bh2 = ext3_append (handle, dir, &newblock, &err); -+ if (!(bh2)) -+ goto cleanup; -+ node2 = (struct dx_node *)(bh2->b_data); -+ entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -+ node2->fake.inode = 0; -+ 
BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ if (levels) { -+ unsigned icount1 = icount/2, icount2 = icount - icount1; -+ unsigned hash2 = dx_get_hash(entries + icount1); -+ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ err = ext3_journal_get_write_access(handle, -+ frames[0].bh); -+ if (err) -+ goto journal_error; -+ -+ memcpy ((char *) entries2, (char *) (entries + icount1), -+ icount2 * sizeof(struct dx_entry)); -+ dx_set_count (entries, icount1); -+ dx_set_count (entries2, icount2); -+ dx_set_limit (entries2, dx_node_limit(dir)); -+ -+ /* Which index block gets the new entry? */ -+ if (at - entries >= icount1) { -+ frame->at = at = at - entries - icount1 + entries2; -+ frame->entries = entries = entries2; -+ swap(frame->bh, bh2); - } -- de->file_type = EXT3_FT_UNKNOWN; -- if (inode) { -- de->inode = cpu_to_le32(inode->i_ino); -- ext3_set_de_type(dir->i_sb, de, inode->i_mode); -- } else -- de->inode = 0; -- de->name_len = namelen; -- memcpy (de->name, name, namelen); -- /* -- * XXX shouldn't update any times until successful -- * completion of syscall, but too many callers depend -- * on this. -- * -- * XXX similarly, too many callers depend on -- * ext3_new_inode() setting the times, but error -- * recovery deletes the inode, so the worst that can -- * happen is that the times are slightly out of date -- * and/or different from the directory change time. 
-- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- dir->i_version = ++event; -- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -- ext3_journal_dirty_metadata(handle, bh); -- brelse(bh); -- return 0; -+ dx_insert_block (frames + 0, hash2, newblock); -+ dxtrace(dx_show_index ("node", frames[1].entries)); -+ dxtrace(dx_show_index ("node", -+ ((struct dx_node *) bh2->b_data)->entries)); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ } else { -+ dxtrace(printk("Creating second level index...\n")); -+ memcpy((char *) entries2, (char *) entries, -+ icount * sizeof(struct dx_entry)); -+ dx_set_limit(entries2, dx_node_limit(dir)); -+ -+ /* Set up root */ -+ dx_set_count(entries, 1); -+ dx_set_block(entries + 0, newblock); -+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ -+ /* Add new access path frame */ -+ frame = frames + 1; -+ frame->at = at = at - entries + entries2; -+ frame->entries = entries = entries2; -+ frame->bh = bh2; -+ err = ext3_journal_get_write_access(handle, -+ frame->bh); -+ if (err) -+ goto journal_error; - } -- offset += le16_to_cpu(de->rec_len); -- de = (struct ext3_dir_entry_2 *) -- ((char *) de + le16_to_cpu(de->rec_len)); -+ ext3_journal_dirty_metadata(handle, frames[0].bh); - } -- brelse (bh); -- return -ENOSPC; -+ de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ if (!de) -+ goto cleanup; -+ err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ bh = 0; -+ goto cleanup; -+ -+journal_error: -+ ext3_std_error(dir->i_sb, err); -+cleanup: -+ if (bh) -+ brelse(bh); -+ dx_release(frames); -+ return err; - } -+#endif - - /* - * ext3_delete_entry deletes a directory entry by merging it with the -@@ -451,9 +1543,11 @@ - struct inode * inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = 
ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -478,9 +1572,11 @@ - struct inode *inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -507,9 +1603,11 @@ - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -521,7 +1619,7 @@ - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; -- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { -@@ -554,21 +1652,19 @@ - inode->i_mode |= S_ISGID; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); -- if (err) -- goto out_no_entry; -+ if (err) { -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - dir->i_nlink++; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- inode->i_nlink = 0; -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - /* -@@ -655,7 +1751,7 @@ - int err = 0, rc; - - lock_super(sb); -- if 
(!list_empty(&inode->u.ext3_i.i_orphan)) -+ if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks -@@ -696,7 +1792,7 @@ - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ - if (!err) -- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); - jbd_debug(4, "orphan inode %ld will point to %d\n", -@@ -714,25 +1810,26 @@ - int ext3_orphan_del(handle_t *handle, struct inode *inode) - { - struct list_head *prev; -+ struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - ino_t ino_next; - struct ext3_iloc iloc; - int err = 0; - - lock_super(inode->i_sb); -- if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); -- prev = inode->u.ext3_i.i_orphan.prev; -+ prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); - -- list_del(&inode->u.ext3_i.i_orphan); -- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ list_del(&ei->i_orphan); -+ INIT_LIST_HEAD(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on -@@ -793,8 +1890,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); -@@ -832,7 +1930,7 @@ - ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - - end_rmdir: -@@ -850,8 +1948,9 @@ - handle_t *handle; - - 
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -878,7 +1977,7 @@ - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - inode->i_nlink--; - if (!inode->i_nlink) -@@ -904,9 +2003,11 @@ - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -916,7 +2017,7 @@ - if (IS_ERR(inode)) - goto out_stop; - -- if (l > sizeof (inode->u.ext3_i.i_data)) { -+ if (l > sizeof (EXT3_I(inode)->i_data)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* -@@ -925,8 +2026,12 @@ - * i_size in generic_commit_write(). 
- */ - err = block_symlink(inode, symname, l); -- if (err) -- goto out_no_entry; -+ if (err) { -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; - memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -@@ -938,12 +2043,6 @@ - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- ext3_dec_count(handle, inode); -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - static int ext3_link (struct dentry * old_dentry, -@@ -956,12 +2055,15 @@ - if (S_ISDIR(inode->i_mode)) - return -EPERM; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (inode->i_nlink >= EXT3_LINK_MAX) { - return -EMLINK; -+ } - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -995,9 +2097,11 @@ - - old_bh = new_bh = dir_bh = NULL; - -- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) - handle->h_sync = 1; -@@ -1070,14 +2174,33 @@ - /* - * ok, that's it - */ -- ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. 
-+ */ -+ struct buffer_head *old_bh2; -+ struct ext3_dir_entry_2 *old_de2; -+ -+ old_bh2 = ext3_find_entry(old_dentry, &old_de2); -+ if (old_bh2) { -+ retval = ext3_delete_entry(handle, old_dir, -+ old_de2, old_bh2); -+ brelse(old_bh2); -+ } -+ } -+ if (retval) { -+ ext3_warning(old_dir->i_sb, "ext3_rename", -+ "Deleting old file (%lu), %d, error=%d", -+ old_dir->i_ino, old_dir->i_nlink, retval); -+ } - - if (new_inode) { - new_inode->i_nlink--; - new_inode->i_ctime = CURRENT_TIME; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); -@@ -1089,7 +2212,7 @@ - new_inode->i_nlink--; - } else { - new_dir->i_nlink++; -- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } -Index: linux.mcp2/fs/ext3/super.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:08:50.000000000 -0700 -@@ -702,6 +702,7 @@ - es->s_mtime = cpu_to_le32(CURRENT_TIME); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ - ext3_commit_super (sb, es, 1); - if (test_opt (sb, DEBUG)) - printk (KERN_INFO -@@ -712,6 +713,7 @@ - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); -+ - printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", - bdevname(sb->s_dev)); - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -@@ -886,6 +888,7 @@ - return res; - } - -+ - struct super_block * ext3_read_super (struct super_block * sb, void * data, - int silent) - { -@@ -1062,6 +1065,9 @@ - sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = 
log2(EXT3_DESC_PER_BLOCK(sb)); -+ for (i=0; i < 4; i++) -+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); -+ sbi->s_def_hash_version = es->s_def_hash_version; - - if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR -@@ -1744,7 +1750,7 @@ - unregister_filesystem(&ext3_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(ext3_force_commit); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -Index: linux.mcp2/include/linux/ext3_fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 14:53:17.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:07:07.000000000 -0700 -@@ -40,6 +40,11 @@ - #define EXT3FS_VERSION "2.4-0.9.17" - - /* -+ * Always enable hashed directories -+ */ -+#define CONFIG_EXT3_INDEX -+ -+/* - * Debug code - */ - #ifdef EXT3FS_DEBUG -@@ -437,8 +442,11 @@ - /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ - __u32 s_journal_dev; /* device number of journal file */ - __u32 s_last_orphan; /* start of list of inodes to delete */ -- --/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ -+ __u32 s_hash_seed[4]; /* HTREE hash seed */ -+ __u8 s_def_hash_version; /* Default hash version to use */ -+ __u8 s_reserved_char_pad; -+ __u16 s_reserved_word_pad; -+ __u32 s_reserved[192]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -575,9 +583,46 @@ - #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) - #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -+/* -+ * Hash Tree Directory indexing -+ * (c) Daniel Phillips, 2001 -+ */ -+ -+#ifdef CONFIG_EXT3_INDEX -+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -+#define EXT3_DIR_LINK_MAX(dir) 
(!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#else -+ #define is_dx(dir) 0 -+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) -+#endif -+ -+/* Legal values for the dx_root hash_version field: */ -+ -+#define DX_HASH_LEGACY 0 -+#define DX_HASH_HALF_MD4 1 -+#define DX_HASH_TEA 2 -+ -+/* hash info structure used by the directory hash */ -+struct dx_hash_info -+{ -+ u32 hash; -+ u32 minor_hash; -+ int hash_version; -+ u32 *seed; -+}; - - #ifdef __KERNEL__ - /* -+ * Control parameters used by ext3_htree_next_block -+ */ -+#define HASH_NB_ALWAYS 1 -+ -+ -+/* - * Describe an inode's exact location on disk and in memory - */ - struct ext3_iloc -@@ -587,6 +632,27 @@ - unsigned long block_group; - }; - -+ -+/* -+ * This structure is stuffed into the struct file's private_data field -+ * for directories. It is where we put information so that we can do -+ * readdir operations in hash tree order. -+ */ -+struct dir_private_info { -+ rb_root_t root; -+ rb_node_t *curr_node; -+ struct fname *extra_fname; -+ loff_t last_pos; -+ __u32 curr_hash; -+ __u32 curr_minor_hash; -+ __u32 next_hash; -+}; -+ -+/* -+ * Special error return code only used by dx_probe() and its callers. 
-+ */ -+#define ERR_BAD_DX_DIR -75000 -+ - /* - * Function prototypes - */ -@@ -614,11 +680,20 @@ - - /* dir.c */ - extern int ext3_check_dir_entry(const char *, struct inode *, -- struct ext3_dir_entry_2 *, struct buffer_head *, -- unsigned long); -+ struct ext3_dir_entry_2 *, -+ struct buffer_head *, unsigned long); -+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent); -+extern void ext3_htree_free_dir_info(struct dir_private_info *p); -+ - /* fsync.c */ - extern int ext3_sync_file (struct file *, struct dentry *, int); - -+/* hash.c */ -+extern int ext3fs_dirhash(const char *name, int len, struct -+ dx_hash_info *hinfo); -+ - /* ialloc.c */ - extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); - extern void ext3_free_inode (handle_t *, struct inode *); -@@ -650,6 +725,8 @@ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash); - - /* super.c */ - extern void ext3_error (struct super_block *, const char *, const char *, ...) 
-Index: linux.mcp2/include/linux/ext3_fs_sb.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs_sb.h 2004-05-17 14:41:25.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs_sb.h 2004-05-17 15:07:07.000000000 -0700 -@@ -62,6 +62,8 @@ - int s_inode_size; - int s_first_ino; - u32 s_next_generation; -+ u32 s_hash_seed[4]; -+ int s_def_hash_version; - - /* Journaling */ - struct inode * s_journal_inode; -Index: linux.mcp2/include/linux/ext3_jbd.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_jbd.h 2004-05-17 14:53:17.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_jbd.h 2004-05-17 15:07:07.000000000 -0700 -@@ -63,6 +63,8 @@ - - #define EXT3_RESERVE_TRANS_BLOCKS 12 - -+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 -+ - int - ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, -Index: linux.mcp2/include/linux/rbtree.h -=================================================================== ---- linux.mcp2.orig/include/linux/rbtree.h 2004-05-17 14:41:25.000000000 -0700 -+++ linux.mcp2/include/linux/rbtree.h 2004-05-17 15:07:07.000000000 -0700 -@@ -120,6 +120,8 @@ - - extern void rb_insert_color(rb_node_t *, rb_root_t *); - extern void rb_erase(rb_node_t *, rb_root_t *); -+extern rb_node_t *rb_get_first(rb_root_t *root); -+extern rb_node_t *rb_get_next(rb_node_t *n); - - static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) - { -Index: linux.mcp2/lib/rbtree.c -=================================================================== ---- linux.mcp2.orig/lib/rbtree.c 2004-01-19 07:49:44.000000000 -0800 -+++ linux.mcp2/lib/rbtree.c 2004-05-17 15:10:39.000000000 -0700 -@@ -17,6 +17,8 @@ - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - linux/lib/rbtree.c -+ -+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 - */ - - #include -@@ -294,3 +296,42 @@ - __rb_erase_color(child, 
parent, root); - } - EXPORT_SYMBOL(rb_erase); -+ -+/* -+ * This function returns the first node (in sort order) of the tree. -+ */ -+rb_node_t *rb_get_first(rb_root_t *root) -+{ -+ rb_node_t *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return 0; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+EXPORT_SYMBOL(rb_get_first); -+ -+/* -+ * Given a node, this function will return the next node in the tree. -+ */ -+rb_node_t *rb_get_next(rb_node_t *n) -+{ -+ rb_node_t *parent; -+ -+ if (n->rb_right) { -+ n = n->rb_right; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+ } else { -+ while ((parent = n->rb_parent)) { -+ if (n == parent->rb_left) -+ return parent; -+ n = parent; -+ } -+ return 0; -+ } -+} -+EXPORT_SYMBOL(rb_get_next); diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch deleted file mode 100644 index e20aaca..0000000 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch +++ /dev/null @@ -1,161 +0,0 @@ -Index: linux-2.6.7/fs/ext3/namei.c -=================================================================== ---- linux-2.6.7.orig/fs/ext3/namei.c 2004-06-15 23:19:36.000000000 -0600 -+++ linux-2.6.7/fs/ext3/namei.c 2004-08-20 17:48:54.000000000 -0600 -@@ -1596,11 +1596,17 @@ static int ext3_delete_entry (handle_t * - static inline void ext3_inc_count(handle_t *handle, struct inode *inode) - { - inode->i_nlink++; -+ if (is_dx(inode) && inode->i_nlink > 1) { -+ /* limit is 16-bit i_links_count */ -+ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) -+ inode->i_nlink = 1; -+ } - } - - static inline void ext3_dec_count(handle_t *handle, struct inode *inode) - { -- inode->i_nlink--; -+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) -+ inode->i_nlink--; - } - - static int ext3_add_nondir(handle_t *handle, -@@ -1693,7 +1698,7 @@ static int ext3_mkdir(struct inode * dir - struct ext3_dir_entry_2 * de; - int err; - -- if (dir->i_nlink >= EXT3_LINK_MAX) -+ if 
(EXT3_DIR_LINK_MAXED(dir)) - return -EMLINK; - - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -@@ -1715,7 +1720,7 @@ static int ext3_mkdir(struct inode * dir - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { -- inode->i_nlink--; /* is this nlink == 0? */ -+ ext3_dec_count(handle, inode); /* is this nlink == 0? */ - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; -@@ -1747,7 +1752,7 @@ static int ext3_mkdir(struct inode * dir - iput (inode); - goto out_stop; - } -- dir->i_nlink++; -+ ext3_inc_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); -@@ -2010,10 +2015,10 @@ static int ext3_rmdir (struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; -- if (inode->i_nlink != 2) -- ext3_warning (inode->i_sb, "ext3_rmdir", -- "empty directory has nlink!=2 (%d)", -- inode->i_nlink); -+ if (!EXT3_DIR_LINK_EMPTY(inode)) -+ ext3_warning(inode->i_sb, "ext3_rmdir", -+ "empty directory has too many links (%d)", -+ inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; - /* There's no need to set i_disksize: the fact that i_nlink is -@@ -2023,7 +2028,7 @@ static int ext3_rmdir (struct inode * di - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); -- dir->i_nlink--; -+ ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - -@@ -2074,7 +2079,7 @@ static int ext3_unlink(struct inode * di - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); -- inode->i_nlink--; -+ ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; -@@ -2146,7 +2151,7 @@ static int ext3_link (struct dentry * ol - struct 
inode *inode = old_dentry->d_inode; - int err; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) - return -EMLINK; - - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -@@ -2230,8 +2235,8 @@ static int ext3_rename (struct inode * o - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; -- if (!new_inode && new_dir!=old_dir && -- new_dir->i_nlink >= EXT3_LINK_MAX) -+ if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) - goto end_rename; - } - if (!new_bh) { -@@ -2288,7 +2293,7 @@ static int ext3_rename (struct inode * o - } - - if (new_inode) { -- new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -@@ -2299,11 +2304,13 @@ static int ext3_rename (struct inode * o - PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); -- old_dir->i_nlink--; -+ ext3_dec_count(handle, old_dir); - if (new_inode) { -- new_inode->i_nlink--; -+ /* checked empty_dir above, can't have another parent, -+ * ext3_dec_count() won't work for many-linked dirs */ -+ new_inode->i_nlink = 0; - } else { -- new_dir->i_nlink++; -+ ext3_inc_count(handle, new_dir); - ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } ---- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 -+++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 -@@ -79,7 +81,7 @@ - /* - * Maximal count of links to a file - */ --#define EXT3_LINK_MAX 32000 -+#define EXT3_LINK_MAX 65000 - - /* - * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define 
is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/lustre/kernel_patches/patches/ext3-no-write-super.patch b/lustre/kernel_patches/patches/ext3-no-write-super.patch deleted file mode 100644 index d2dcdae..0000000 --- a/lustre/kernel_patches/patches/ext3-no-write-super.patch +++ /dev/null @@ -1,22 +0,0 @@ - 0 files changed - ---- linux-2.4.20/fs/ext3/super.c~ext3-no-write-super 2003-08-11 13:20:17.000000000 +0400 -+++ linux-2.4.20-alexey/fs/ext3/super.c 2003-08-11 13:31:35.000000000 +0400 -@@ -1849,7 +1849,6 @@ void ext3_write_super (struct super_bloc - if (down_trylock(&sb->s_lock) == 0) - BUG(); /* aviro detector */ - sb->s_dirt = 0; -- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - - /* - * Tricky --- if we are unmounting, the write really does need -@@ -1857,6 +1856,7 @@ void ext3_write_super (struct super_bloc - * sb->s_root. 
- */ - if (do_sync_supers || !sb->s_root) { -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - -_ diff --git a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch deleted file mode 100644 index 4c16fe6..0000000 --- a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch +++ /dev/null @@ -1,85 +0,0 @@ -Index: linux-2.4.19/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 -+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:37:37.000000000 -0400 -@@ -1751,8 +1751,8 @@ - struct super_block *sb = inode->i_sb; - struct ext3_iloc iloc; - int err = 0, rc; -- -- lock_super(sb); -+ -+ down(&EXT3_SB(sb)->s_orphan_lock); - if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - -@@ -1800,7 +1800,7 @@ - jbd_debug(4, "orphan inode %ld will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); - out_unlock: -- unlock_super(sb); -+ up(&EXT3_SB(sb)->s_orphan_lock); - ext3_std_error(inode->i_sb, err); - return err; - } -@@ -1813,20 +1813,19 @@ - { - struct list_head *prev; - struct ext3_inode_info *ei = EXT3_I(inode); -- struct ext3_sb_info *sbi; -+ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); - unsigned long ino_next; - struct ext3_iloc iloc; - int err = 0; - -- lock_super(inode->i_sb); -+ down(&sbi->s_orphan_lock); - if (list_empty(&ei->i_orphan)) { -- unlock_super(inode->i_sb); -+ up(&sbi->s_orphan_lock); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; -- sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - -@@ -1872,10 +1871,10 @@ - if (err) - goto out_brelse; - --out_err: -+out_err: - ext3_std_error(inode->i_sb, err); - out: -- unlock_super(inode->i_sb); -+ up(&sbi->s_orphan_lock); - return err; - 
- out_brelse: -Index: linux-2.4.19/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 22:30:41.000000000 -0400 -+++ linux-2.4.19/fs/ext3/super.c 2004-04-23 22:36:22.000000000 -0400 -@@ -1179,6 +1179,7 @@ - */ - sb->s_op = &ext3_sops; - INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ -+ sema_init(&sbi->s_orphan_lock, 1); - - sb->s_root = 0; - -Index: linux-2.4.19/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.19.orig/include/linux/ext3_fs_sb.h 2004-04-23 18:26:27.000000000 -0400 -+++ linux-2.4.19/include/linux/ext3_fs_sb.h 2004-04-23 22:36:22.000000000 -0400 -@@ -69,6 +69,7 @@ - struct inode * s_journal_inode; - struct journal_s * s_journal; - struct list_head s_orphan; -+ struct semaphore s_orphan_lock; - struct block_device *journal_bdev; - #ifdef CONFIG_JBD_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ diff --git a/lustre/kernel_patches/patches/ext3-unmount_sync.patch b/lustre/kernel_patches/patches/ext3-unmount_sync.patch deleted file mode 100644 index c57903c..0000000 --- a/lustre/kernel_patches/patches/ext3-unmount_sync.patch +++ /dev/null @@ -1,21 +0,0 @@ - fs/ext3/super.c | 7 ++++++- - 1 files changed, 6 insertions(+), 1 deletion(-) - ---- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600 -+++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600 -@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc - sb->s_dirt = 0; - target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - -- if (do_sync_supers) { -+ /* -+ * Tricky --- if we are unmounting, the write really does need -+ * to be synchronous. We can detect that by looking for NULL in -+ * sb->s_root. 
-+ */ -+ if (do_sync_supers || !sb->s_root) { - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - -_ diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch b/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch deleted file mode 100644 index 595db54..0000000 --- a/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch +++ /dev/null @@ -1,53 +0,0 @@ - ./fs/ext3/namei.c | 11 +++++------ - 1 files changed, 5 insertions(+), 6 deletions(-) - -Index: linux-2.4.19-pre1/./fs/ext3/namei.c -=================================================================== ---- linux-2.4.19-pre1.orig/./fs/ext3/namei.c 2003-11-21 01:52:06.000000000 +0300 -+++ linux-2.4.19-pre1/./fs/ext3/namei.c 2003-11-21 01:58:15.000000000 +0300 -@@ -1522,8 +1522,11 @@ - { - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { -- d_instantiate(dentry, inode); -- return 0; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } - } - ext3_dec_count(handle, inode); - iput(inode); -@@ -1559,7 +1562,6 @@ - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -1586,7 +1588,6 @@ - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, mode, rdev); -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -2035,7 +2036,6 @@ - inode->i_size = l-1; - } - inode->u.ext3_i.i_disksize = inode->i_size; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); -@@ -2069,7 +2069,6 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- ext3_mark_inode_dirty(handle, inode); - err = 
ext3_add_nondir(handle, dentry, inode); - ext3_journal_stop(handle, dir); - return err; diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch b/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch deleted file mode 100644 index 7899354..0000000 --- a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch +++ /dev/null @@ -1,53 +0,0 @@ - ./fs/ext3/namei.c | 11 +++++------ - 1 files changed, 5 insertions(+), 6 deletions(-) - -Index: linux-2.4.19/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:30:41.000000000 -0400 -+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 -@@ -1522,8 +1522,11 @@ - { - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { -- d_instantiate(dentry, inode); -- return 0; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } - } - ext3_dec_count(handle, inode); - iput(inode); -@@ -1559,7 +1562,6 @@ - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -1589,7 +1591,6 @@ - #ifdef CONFIG_EXT3_FS_XATTR - inode->i_op = &ext3_special_inode_operations; - #endif -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -2039,7 +2040,6 @@ - inode->i_size = l-1; - } - EXT3_I(inode)->i_disksize = inode->i_size; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); -@@ -2073,7 +2073,6 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - ext3_journal_stop(handle, dir); - return err; diff --git 
a/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch b/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch deleted file mode 100644 index 02cfef1..0000000 --- a/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch +++ /dev/null @@ -1,226 +0,0 @@ - fs/ext3/ialloc.c | 40 ++++++++++++++++++++++++++++++++++++++-- - fs/ext3/inode.c | 2 +- - fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ - fs/ext3/namei.c | 21 +++++++++++++++++---- - include/linux/dcache.h | 5 +++++ - include/linux/ext3_fs.h | 5 ++++- - 6 files changed, 90 insertions(+), 8 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:18:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 -@@ -1534,6 +1534,19 @@ - return err; - } - -+static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, -+ int mode, struct dentry *dentry) -+{ -+ unsigned long inum = 0; -+ -+ if (dentry->d_fsdata != NULL) { -+ struct dentry_params *param = -+ (struct dentry_params *) dentry->d_fsdata; -+ inum = param->p_inum; -+ } -+ return ext3_new_inode(handle, dir, mode, inum); -+} -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1557,7 +1570,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; -@@ -1585,7 +1598,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -@@ -1618,7 +1631,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, 
S_IFDIR | mode); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -2013,7 +2026,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -Index: linux-2.4.19.SuSE/fs/ext3/ialloc.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ialloc.c Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/fs/ext3/ialloc.c Sun Nov 16 01:24:49 2003 -@@ -330,7 +330,8 @@ - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ --struct inode * ext3_new_inode (handle_t *handle, struct inode * dir, int mode) -+struct inode * ext3_new_inode(handle_t *handle, const struct inode * dir, -+ int mode, unsigned long goal) - { - struct super_block * sb; - struct buffer_head * bh; -@@ -355,7 +356,41 @@ - init_rwsem(&inode->u.ext3_i.truncate_sem); - - lock_super (sb); -- es = sb->u.ext3_sb.s_es; -+ es = EXT3_SB(sb)->s_es; -+ -+ if (goal) { -+ i = (goal - 1) / EXT3_INODES_PER_GROUP(sb); -+ j = (goal - 1) % EXT3_INODES_PER_GROUP(sb); -+ gdp = ext3_get_group_desc(sb, i, &bh2); -+ -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) { -+ err = bitmap_nr; -+ goto fail; -+ } -+ -+ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit(j, bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable\n", goal); -+ /* Oh well, we tried. 
*/ -+ goto repeat; -+ } -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) goto fail; -+ -+ /* We've shortcircuited the allocation system successfully, -+ * now finish filling in the inode. -+ */ -+ goto have_bit_and_group; -+ } -+ - repeat: - gdp = NULL; - i = 0; -@@ -470,6 +505,7 @@ - } - goto repeat; - } -+ have_bit_and_group: - j += i * EXT3_INODES_PER_GROUP(sb) + 1; - if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_new_inode", -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:23:20 2003 -@@ -2168,7 +2168,7 @@ - if (IS_ERR(handle)) - goto out_truncate; - -- new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0); - if (IS_ERR(new_inode)) { - ext3_debug("truncate inode %lu directly (no new inodes)\n", - old_inode->i_ino); -Index: linux-2.4.19.SuSE/fs/ext3/ioctl.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ioctl.c Fri Nov 9 14:25:04 2001 -+++ linux-2.4.19.SuSE/fs/ext3/ioctl.c Sun Nov 16 01:23:20 2003 -@@ -23,6 +23,31 @@ - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { -+ case EXT3_IOC_CREATE_INUM: { -+ char name[32]; -+ struct dentry *dchild, *dparent; -+ int rc = 0; -+ -+ dparent = list_entry(inode->i_dentry.next, struct dentry, -+ d_alias); -+ snprintf(name, sizeof name, "%lu", arg); -+ dchild = lookup_one_len(name, dparent, strlen(name)); -+ if (dchild->d_inode) { -+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", -+ dparent->d_name.len, dparent->d_name.name, arg, -+ dchild->d_inode->i_ino); -+ rc = -EEXIST; -+ } else { -+ dchild->d_fsdata = (void *)arg; -+ rc = vfs_create(inode, dchild, 0644); -+ if 
(rc) -+ printk(KERN_ERR "vfs_create: %d\n", rc); -+ else if (dchild->d_inode->i_ino != arg) -+ rc = -EEXIST; -+ } -+ dput(dchild); -+ return rc; -+ } - case EXT3_IOC_GETFLAGS: - flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int *) arg); -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003 -@@ -202,6 +202,7 @@ - #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) - #define EXT3_IOC_GETVERSION _IOR('f', 3, long) - #define EXT3_IOC_SETVERSION _IOW('f', 4, long) -+/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ - #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) - #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) - #ifdef CONFIG_JBD_DEBUG -@@ -674,7 +675,8 @@ - dx_hash_info *hinfo); - - /* ialloc.c */ --extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); -+extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, -+ unsigned long); - extern void ext3_free_inode (handle_t *, struct inode *); - extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); - extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -765,4 +767,5 @@ - - #endif /* __KERNEL__ */ - -+#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) - #endif /* _LINUX_EXT3_FS_H */ -Index: linux-2.4.19.SuSE/include/linux/dcache.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Sat Nov 15 17:35:46 2003 -+++ linux-2.4.19.SuSE/include/linux/dcache.h Sun Nov 16 01:23:20 2003 -@@ -62,6 +62,11 @@ - - #define IS_ROOT(x) ((x) == (x)->d_parent) - -+struct dentry_params { -+ unsigned long p_inum; -+ void *p_ptr; -+}; -+ - /* - * "quick string" -- eases parameter passing, but more importantly - * saves "metadata" 
about the string (ie length and the hash). diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch deleted file mode 100644 index 85bdf9e..0000000 --- a/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch +++ /dev/null @@ -1,121 +0,0 @@ - - - - fs/inode.c | 21 ++++++++++++++------- - fs/smbfs/inode.c | 2 +- - fs/super.c | 4 ++-- - include/linux/fs.h | 2 +- - 4 files changed, 18 insertions(+), 11 deletions(-) - -Index: linux.mcp2/fs/inode.c -=================================================================== ---- linux.mcp2.orig/fs/inode.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/inode.c 2004-05-05 14:31:31.000000000 -0700 -@@ -553,7 +553,8 @@ - /* - * Invalidate all inodes for a device. - */ --static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) -+static int invalidate_list(struct list_head *head, struct super_block * sb, -+ struct list_head * dispose, int show) - { - struct list_head *next; - int busy = 0, count = 0; -@@ -578,6 +579,11 @@ - count++; - continue; - } -+ if (show) -+ printk(KERN_ERR -+ "inode busy: dev %s:%lu (%p) mode %o count %u\n", -+ kdevname(sb->s_dev), inode->i_ino, inode, -+ inode->i_mode, atomic_read(&inode->i_count)); - busy = 1; - } - /* only unused inodes may be cached with i_count zero */ -@@ -596,22 +602,23 @@ - /** - * invalidate_inodes - discard the inodes on a device - * @sb: superblock -+ * @show: whether we should display any busy inodes found - * - * Discard all of the inodes for a given superblock. If the discard - * fails because there are busy inodes then a non zero value is returned. - * If the discard is successful all the inodes have been discarded. 
- */ - --int invalidate_inodes(struct super_block * sb) -+int invalidate_inodes(struct super_block * sb, int show) - { - int busy; - LIST_HEAD(throw_away); - - spin_lock(&inode_lock); -- busy = invalidate_list(&inode_in_use, sb, &throw_away); -- busy |= invalidate_list(&inode_unused, sb, &throw_away); -- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); -- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); -+ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); -+ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); -+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); -+ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); - spin_unlock(&inode_lock); - - dispose_list(&throw_away); -@@ -637,7 +644,7 @@ - * hold). - */ - shrink_dcache_sb(sb); -- res = invalidate_inodes(sb); -+ res = invalidate_inodes(sb, 0); - drop_super(sb); - } - invalidate_buffers(dev); -Index: linux.mcp2/fs/super.c -=================================================================== ---- linux.mcp2.orig/fs/super.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/super.c 2004-05-05 14:32:06.000000000 -0700 -@@ -838,7 +838,7 @@ - lock_super(sb); - lock_kernel(); - sb->s_flags &= ~MS_ACTIVE; -- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ -+ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ - if (sop) { - if (sop->write_super && sb->s_dirt) - sop->write_super(sb); -@@ -847,7 +847,7 @@ - } - - /* Forget any remaining inodes */ -- if (invalidate_inodes(sb)) { -+ if (invalidate_inodes(sb, 1)) { - printk(KERN_ERR "VFS: Busy inodes after unmount. " - "Self-destruct in 5 seconds. 
Have a nice day...\n"); - } -Index: linux.mcp2/fs/smbfs/inode.c -=================================================================== ---- linux.mcp2.orig/fs/smbfs/inode.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/smbfs/inode.c 2004-05-05 14:31:31.000000000 -0700 -@@ -166,7 +166,7 @@ - { - VERBOSE("\n"); - shrink_dcache_sb(SB_of(server)); -- invalidate_inodes(SB_of(server)); -+ invalidate_inodes(SB_of(server), 0); - } - - /* -Index: linux.mcp2/include/linux/fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:31:06.000000000 -0700 -+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:31:31.000000000 -0700 -@@ -1283,7 +1283,7 @@ - extern void set_buffer_flushtime(struct buffer_head *); - extern void balance_dirty(void); - extern int check_disk_change(kdev_t); --extern int invalidate_inodes(struct super_block *); -+extern int invalidate_inodes(struct super_block *, int); - extern int invalidate_device(kdev_t, int); - extern void invalidate_inode_pages(struct inode *); - extern void invalidate_inode_pages2(struct address_space *); diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch deleted file mode 100644 index 2466af6..0000000 --- a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch +++ /dev/null @@ -1,52 +0,0 @@ - fs/Makefile | 2 +- - fs/inode.c | 4 +++- - mm/page_alloc.c | 1 + - 3 files changed, 5 insertions(+), 2 deletions(-) - -Index: linux-ion/fs/inode.c -=================================================================== ---- linux-ion.orig/fs/inode.c 2004-09-27 14:58:03.000000000 -0700 -+++ linux-ion/fs/inode.c 2004-09-27 14:58:34.000000000 -0700 -@@ -5,6 +5,7 @@ - */ - - #include -+#include - #include - #include - #include -@@ -66,7 +67,8 @@ - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. 
- */ --static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+EXPORT_SYMBOL(inode_lock); - - /* - * Statistics gathering.. -Index: linux-ion/fs/Makefile -=================================================================== ---- linux-ion.orig/fs/Makefile 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/fs/Makefile 2004-09-27 14:59:37.000000000 -0700 -@@ -7,7 +7,7 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-ion/mm/page_alloc.c -=================================================================== ---- linux-ion.orig/mm/page_alloc.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/page_alloc.c 2004-09-27 14:58:34.000000000 -0700 -@@ -28,6 +28,7 @@ - LIST_HEAD(inactive_list); - LIST_HEAD(active_list); - pg_data_t *pgdat_list; -+EXPORT_SYMBOL(pgdat_list); - - /* Used to look up the address of the struct zone encoded in page->zone */ - zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch deleted file mode 100644 index 2040fcd..0000000 --- a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch +++ /dev/null @@ -1,52 +0,0 @@ - fs/Makefile | 2 +- - fs/inode.c | 4 +++- - mm/page_alloc.c | 1 + - 3 files changed, 5 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/fs/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/inode.c Sat Nov 15 18:02:13 2003 -+++ linux-2.4.19.SuSE/fs/inode.c Sat Nov 15 18:03:04 2003 -@@ -5,6 +5,7 @@ - */ - - #include -+#include - #include - #include - #include -@@ -67,7 +68,8 @@ - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. 
- */ --static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+EXPORT_SYMBOL(inode_lock); - - /* - * Statistics gathering.. -Index: linux-2.4.19.SuSE/fs/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Makefile Mon Jan 27 05:08:56 2003 -+++ linux-2.4.19.SuSE/fs/Makefile Sat Nov 15 18:03:54 2003 -@@ -7,7 +7,7 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-2.4.19.SuSE/mm/page_alloc.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/page_alloc.c Mon Jan 27 05:08:55 2003 -+++ linux-2.4.19.SuSE/mm/page_alloc.c Sat Nov 15 18:03:04 2003 -@@ -32,6 +32,7 @@ - LIST_HEAD(inactive_list); - LIST_HEAD(active_list); - pg_data_t *pgdat_list; -+EXPORT_SYMBOL(pgdat_list); - - /* Used to look up the address of the struct zone encoded in page->zone */ - zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch deleted file mode 100644 index b3ac80a..0000000 --- a/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch +++ /dev/null @@ -1,497 +0,0 @@ - Documentation/filesystems/ext2.txt | 16 ++ - fs/ext3/Makefile | 2 - fs/ext3/inode.c | 4 - fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 13 + - fs/ext3/namei.c | 13 + - fs/ext3/super.c | 11 + - include/linux/ext3_fs.h | 2 - 8 files changed, 318 insertions(+), 2 deletions(-) - -Index: linux-2.4.19/Documentation/filesystems/ext2.txt -=================================================================== ---- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 -+++ linux-2.4.19/Documentation/filesystems/ext2.txt 
2004-04-23 22:37:48.000000000 -0400 -@@ -35,6 +35,22 @@ - - sb=n Use alternate superblock at this location. - -+iopen Makes an invisible pseudo-directory called -+ __iopen__ available in the root directory -+ of the filesystem. Allows open-by-inode- -+ number. i.e., inode 3145 can be accessed -+ via /mntpt/__iopen__/3145 -+ -+iopen_nopriv This option makes the iopen directory be -+ world-readable. This may be safer since it -+ allows daemons to run as an unprivileged user, -+ however it significantly changes the security -+ model of a Unix filesystem, since previously -+ all files under a mode 700 directory were not -+ generally avilable even if the -+ permissions on the file itself is -+ world-readable. -+ - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. - - -Index: linux.mcp2/fs/ext3/Makefile -=================================================================== ---- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:20:52.000000000 -0700 -+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:21:55.000000000 -0700 -@@ -11,7 +11,7 @@ - - export-objs := ext3-exports.o - --obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -Index: linux.mcp2/fs/ext3/inode.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/inode.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/inode.c 2004-05-17 15:21:55.000000000 -0700 -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include "iopen.h" - - /* - * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2125,6 +2126,9 @@ - struct buffer_head *bh; - int block; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - if(ext3_get_inode_loc(inode, &iloc)) - goto bad_inode; - bh = iloc.bh; -Index: linux.mcp2/fs/ext3/iopen.c 
-=================================================================== ---- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700 -@@ -0,0 +1,285 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. 
-+ */ -+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ alternate->d_vfs_flags |= DCACHE_REFERENCED; -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ __typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static 
inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue - like d_move() */ -+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; -+ list_del_init(&goal->d_hash); -+ -+ list_del(&goal->d_child); -+ list_del(&dentry->d_child); -+ -+ /* Switch the parents and the names.. 
*/ -+ switch_names(goal, dentry); -+ do_switch(goal->d_parent, dentry->d_parent); -+ do_switch(goal->d_name.len, dentry->d_name.len); -+ do_switch(goal->d_name.hash, dentry->d_name.hash); -+ -+ /* And add them back to the (new) parent lists */ -+ list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); -+ __d_rehash(goal, 0); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ inode->u.ext3_i.i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: linux.mcp2/fs/ext3/iopen.h -=================================================================== ---- linux.mcp2.orig/fs/ext3/iopen.h 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/iopen.h 2004-05-17 15:21:55.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux.mcp2/fs/ext3/namei.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:21:55.000000000 -0700 -@@ -35,7 +35,7 @@ - #include - #include - #include -- -+#include "iopen.h" - - /* - * define how far ahead to read directories while searching them. 
-@@ -931,6 +931,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -942,8 +945,8 @@ - return ERR_PTR(-EACCES); - } - } -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - #define S_SHIFT 12 -@@ -1932,10 +1935,6 @@ - inode->i_nlink); - inode->i_version = ++event; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. */ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; -@@ -2054,6 +2053,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2081,7 +2097,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } -Index: linux.mcp2/fs/ext3/super.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:21:55.000000000 -0700 -@@ -836,6 +836,18 @@ - || !strcmp (this_char, "quota") - || !strcmp (this_char, "usrquota")) - /* Don't do anything ;-) */ ; -+ else if 
(!strcmp (this_char, "iopen")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "noiopen")) { -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "iopen_nopriv")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } - else if (!strcmp (this_char, "journal")) { - /* @@@ FIXME */ - /* Eventually we will want to be able to create -Index: linux.mcp2/include/linux/ext3_fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:21:55.000000000 -0700 -@@ -323,6 +323,8 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch b/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch deleted file mode 100644 index 5d5b8ff..0000000 --- a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch +++ /dev/null @@ -1,497 +0,0 @@ - Documentation/filesystems/ext2.txt | 16 ++ - fs/ext3/Makefile | 2 - fs/ext3/inode.c | 4 - fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 13 + - fs/ext3/namei.c | 13 + - fs/ext3/super.c | 11 + - include/linux/ext3_fs.h | 2 - 8 files changed, 318 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt -=================================================================== ---- 
linux-2.4.19.SuSE.orig/Documentation/filesystems/ext2.txt Wed Jul 11 15:44:45 2001 -+++ linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt Sun Nov 16 01:27:31 2003 -@@ -35,6 +35,22 @@ - - sb=n Use alternate superblock at this location. - -+iopen Makes an invisible pseudo-directory called -+ __iopen__ available in the root directory -+ of the filesystem. Allows open-by-inode- -+ number. i.e., inode 3145 can be accessed -+ via /mntpt/__iopen__/3145 -+ -+iopen_nopriv This option makes the iopen directory be -+ world-readable. This may be safer since it -+ allows daemons to run as an unprivileged user, -+ however it significantly changes the security -+ model of a Unix filesystem, since previously -+ all files under a mode 700 directory were not -+ generally avilable even if the -+ permissions on the file itself is -+ world-readable. -+ - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. - - -Index: linux-2.4.19.SuSE/fs/ext3/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/Makefile Sun Nov 16 01:27:31 2003 -@@ -11,7 +11,7 @@ - - export-objs := ext3-exports.o - --obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:26:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:27:31 2003 -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include "iopen.h" - - /* - * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2350,6 +2351,9 @@ - struct buffer_head *bh; - int block; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - 
if(ext3_get_inode_loc(inode, &iloc)) - goto bad_inode; - bh = iloc.bh; -Index: lum/fs/ext3/iopen.c -=================================================================== ---- lum.orig/fs/ext3/iopen.c 2004-03-09 16:46:37.000000000 -0700 -+++ lum/fs/ext3/iopen.c 2004-03-09 16:48:03.000000000 -0700 -@@ -0,0 +1,285 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. 
-+ */ -+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ alternate->d_vfs_flags |= DCACHE_REFERENCED; -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ __typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static 
inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue - like d_move() */ -+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; -+ list_del_init(&goal->d_hash); -+ -+ list_del(&goal->d_child); -+ list_del(&dentry->d_child); -+ -+ /* Switch the parents and the names.. 
*/ -+ switch_names(goal, dentry); -+ do_switch(goal->d_parent, dentry->d_parent); -+ do_switch(goal->d_name.len, dentry->d_name.len); -+ do_switch(goal->d_name.hash, dentry->d_name.hash); -+ -+ /* And add them back to the (new) parent lists */ -+ list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); -+ __d_rehash(goal, 0); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ inode->u.ext3_i.i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: lum/fs/ext3/iopen.h -=================================================================== ---- lum.orig/fs/ext3/iopen.h 2004-03-09 16:46:37.000000000 -0700 -+++ lum/fs/ext3/iopen.h 2004-03-09 16:48:03.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux-2.4.19.SuSE/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:27:31 2003 -@@ -36,7 +36,7 @@ - #include - #include - #include -- -+#include "iopen.h" - - /* - * define how far ahead to read directories while searching them. 
-@@ -926,6 +927,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -943,8 +948,8 @@ - return ERR_PTR(-EACCES); - } - } -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - #define S_SHIFT 12 -@@ -1932,10 +1935,6 @@ - inode->i_nlink); - inode->i_version = ++event; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. */ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -@@ -2086,6 +2085,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2113,7 +2129,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:27:31 2003 -@@ -864,6 +864,18 @@ - || !strcmp (this_char, "quota") - || !strcmp (this_char, "usrquota")) - /* Don't do 
anything ;-) */ ; -+ else if (!strcmp (this_char, "iopen")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "noiopen")) { -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "iopen_nopriv")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } - else if (!strcmp (this_char, "journal")) { - /* @@@ FIXME */ - /* Eventually we will want to be able to create -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:30:05 2003 -@@ -324,6 +324,8 @@ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch deleted file mode 100644 index 81b4136..0000000 --- a/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch +++ /dev/null @@ -1,274 +0,0 @@ -Index: linux-2.4.19.SuSE/include/linux/jbd.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/jbd.h Sun Nov 16 13:51:03 2003 -+++ linux-2.4.19.SuSE/include/linux/jbd.h Sun Nov 16 15:10:48 2003 -@@ -283,6 +283,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, 
int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -313,6 +320,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. -+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -432,6 +445,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -676,6 +693,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); -Index: linux-2.4.19.SuSE/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/checkpoint.c Mon Feb 25 11:38:08 2002 -+++ linux-2.4.19.SuSE/fs/jbd/checkpoint.c Sun Nov 16 15:10:48 2003 -@@ -594,7 +594,8 @@ - J_ASSERT (transaction->t_log_list == NULL); - J_ASSERT (transaction->t_checkpoint_list == NULL); - J_ASSERT (transaction->t_updates == 0); -- -+ J_ASSERT (list_empty(&transaction->t_jcb)); -+ - J_ASSERT (transaction->t_journal->j_committing_transaction != - transaction); - -Index: linux-2.4.19.SuSE/fs/jbd/commit.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/commit.c Mon Jan 27 05:08:04 2003 -+++ linux-2.4.19.SuSE/fs/jbd/commit.c Sun Nov 16 15:13:53 2003 
-@@ -485,7 +485,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ -@@ -576,8 +576,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -587,9 +589,10 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } -- -+ - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { - journal_header_t *tmp = -@@ -610,14 +614,32 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - --skip_commit: -+skip_commit: /* The journal should be unlocked by now. */ -+ -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. 
-+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); - - jbd_debug(3, "JBD: commit phase 7\n"); - -Index: linux-2.4.19.SuSE/fs/jbd/journal.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/journal.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/jbd/journal.c Sun Nov 16 15:10:48 2003 -@@ -59,6 +59,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); -Index: linux-2.4.19.SuSE/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:45:26 2003 -+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 15:15:34 2003 -@@ -58,6 +58,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + bdflush_interval(); -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -91,7 +92,14 @@ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; -- -+ -+ if (nblocks > journal->j_max_transaction_buffers) { -+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n", -+ current->comm, nblocks, -+ journal->j_max_transaction_buffers); -+ return -ENOSPC; -+ } -+ - jbd_debug(3, "New handle %p going live.\n", handle); - - repeat: -@@ -202,6 +210,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... 
*/ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. - * -@@ -228,14 +250,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -334,14 +353,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1321,6 +1337,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. 
-+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining -@@ -1385,7 +1423,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the diff --git a/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch deleted file mode 100644 index bbbf613..0000000 --- a/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch +++ /dev/null @@ -1,274 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/jbd.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/jbd.h 2003-11-21 03:00:11.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/jbd.h 2003-11-21 03:04:47.000000000 +0300 -@@ -275,6 +275,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -305,6 +312,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. 
-+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -424,6 +437,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -672,6 +689,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); -Index: linux-2.4.19-pre1/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/checkpoint.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/checkpoint.c 2003-11-21 03:04:47.000000000 +0300 -@@ -601,7 +601,8 @@ - J_ASSERT (transaction->t_log_list == NULL); - J_ASSERT (transaction->t_checkpoint_list == NULL); - J_ASSERT (transaction->t_updates == 0); -- -+ J_ASSERT (list_empty(&transaction->t_jcb)); -+ - J_ASSERT (transaction->t_journal->j_committing_transaction != - transaction); - -Index: linux-2.4.19-pre1/fs/jbd/commit.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/commit.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/commit.c 2003-11-21 03:04:47.000000000 +0300 -@@ -480,7 +480,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. 
- */ -@@ -571,8 +571,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -582,9 +584,10 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } -- -+ - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { - journal_header_t *tmp = -@@ -605,14 +608,32 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - --skip_commit: -+skip_commit: /* The journal should be unlocked by now. */ -+ -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. 
-+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); - - jbd_debug(3, "JBD: commit phase 7\n"); - -Index: linux-2.4.19-pre1/fs/jbd/journal.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/journal.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/journal.c 2003-11-21 03:04:47.000000000 +0300 -@@ -58,6 +58,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); -Index: linux-2.4.19-pre1/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/transaction.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/transaction.c 2003-11-21 03:05:14.000000000 +0300 -@@ -57,6 +57,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -90,7 +91,14 @@ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; -- -+ -+ if (nblocks > journal->j_max_transaction_buffers) { -+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n", -+ current->comm, nblocks, -+ journal->j_max_transaction_buffers); -+ return -ENOSPC; -+ } -+ - jbd_debug(3, "New handle %p going live.\n", handle); - - repeat: -@@ -196,6 +204,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... 
*/ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. - * -@@ -222,14 +244,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -328,14 +347,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1324,6 +1340,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. 
-+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining -@@ -1389,7 +1427,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the diff --git a/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch b/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch deleted file mode 100644 index 8411137..0000000 --- a/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch +++ /dev/null @@ -1,35 +0,0 @@ -Index: linux-2.4.19.SuSE/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:38:25 2003 -+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 01:44:26 2003 -@@ -1094,7 +1094,6 @@ - - spin_lock(&journal_datalist_lock); - set_bit(BH_JBDDirty, &bh->b_state); -- set_buffer_flushtime(bh); - - J_ASSERT_JH(jh, jh->b_transaction != NULL); - -@@ -1995,6 +1994,13 @@ - spin_unlock(&journal_datalist_lock); - } - -+static void jbd_refile_buffer(struct buffer_head *bh) -+{ -+ if (buffer_dirty(bh) && (bh->b_list != BUF_DIRTY)) -+ set_buffer_flushtime(bh); -+ refile_buffer(bh); -+} -+ - /* - * Remove a buffer from its current buffer list in preparation for - * dropping it from its current transaction entirely. 
If the buffer has -@@ -2022,7 +2028,7 @@ - J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); - } else { - /* Onto BUF_DIRTY for writeback */ -- refile_buffer(jh2bh(jh)); -+ jbd_refile_buffer(jh2bh(jh)); - } - } - diff --git a/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch b/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch deleted file mode 100644 index 25f7954..0000000 --- a/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch +++ /dev/null @@ -1,685 +0,0 @@ -Index: linux-bgl/arch/arm/vmlinux-armo.lds.in -=================================================================== ---- linux-bgl.orig/arch/arm/vmlinux-armo.lds.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/arm/vmlinux-armo.lds.in 2004-10-26 22:52:50.037677957 -0700 -@@ -62,6 +62,10 @@ - *(__ksymtab) - __stop___ksymtab = .; - -+ __start___kallsyms = .; /* All kernel symbols */ -+ *(__kallsyms) -+ __stop___kallsyms = .; -+ - *(.got) /* Global offset table */ - - _etext = .; /* End of text section */ -Index: linux-bgl/arch/arm/vmlinux-armv.lds.in -=================================================================== ---- linux-bgl.orig/arch/arm/vmlinux-armv.lds.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/arm/vmlinux-armv.lds.in 2004-10-26 22:52:50.038677801 -0700 -@@ -67,6 +67,12 @@ - __stop___ksymtab = .; - } - -+ __kallsyms : { /* Kernel debugging table */ -+ __start___kallsyms = .; /* All kernel symbols */ -+ *(__kallsyms) -+ __stop___kallsyms = .; -+ } -+ - . 
= ALIGN(8192); - - .data : { -Index: linux-bgl/arch/ppc/config.in -=================================================================== ---- linux-bgl.orig/arch/ppc/config.in 2004-10-04 09:55:49.000000000 -0700 -+++ linux-bgl/arch/ppc/config.in 2004-10-26 23:11:56.416643929 -0700 -@@ -732,6 +732,7 @@ - string 'Additional compile arguments' CONFIG_COMPILE_OPTIONS "-g -ggdb" - fi - fi -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - - if [ "$CONFIG_ALL_PPC" = "y" ]; then - bool 'Support for early boot text console (BootX or OpenFirmware only)' CONFIG_BOOTX_TEXT -Index: linux-bgl/arch/ppc/vmlinux.lds -=================================================================== ---- linux-bgl.orig/arch/ppc/vmlinux.lds 2003-07-02 08:43:30.000000000 -0700 -+++ linux-bgl/arch/ppc/vmlinux.lds 2004-10-26 22:52:50.043677020 -0700 -@@ -73,6 +73,10 @@ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; - -+ __start___kallsyms = .; /* All kernel symbols */ -+ __kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; -+ - __start___ftr_fixup = .; - __ftr_fixup : { *(__ftr_fixup) } - __stop___ftr_fixup = .; -Index: linux-bgl/arch/i386/config.in -=================================================================== ---- linux-bgl.orig/arch/i386/config.in 2003-07-02 08:43:46.000000000 -0700 -+++ linux-bgl/arch/i386/config.in 2004-10-26 22:52:50.040677488 -0700 -@@ -363,6 +363,7 @@ - if [ "$CONFIG_ISDN" != "n" ]; then - source drivers/isdn/Config.in - fi -+ bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - fi - endmenu - -Index: linux-bgl/arch/i386/vmlinux.lds -=================================================================== ---- linux-bgl.orig/arch/i386/vmlinux.lds 2003-07-02 08:44:32.000000000 -0700 -+++ linux-bgl/arch/i386/vmlinux.lds 2004-10-26 22:52:50.040677488 -0700 -@@ -27,6 +27,9 @@ - __start___ksymtab = .; /* Kernel symbol table */ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; -+ __start___kallsyms = .; /* All kernel symbols */ -+ 
__kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; - - .data : { /* Data */ - *(.data) -Index: linux-bgl/arch/ia64/config.in -=================================================================== ---- linux-bgl.orig/arch/ia64/config.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/ia64/config.in 2004-10-26 22:52:50.055675147 -0700 -@@ -278,4 +278,6 @@ - bool ' Turn on irq debug checks (slow!)' CONFIG_IA64_DEBUG_IRQ - fi - -+bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS -+ - endmenu -Index: linux-bgl/arch/alpha/vmlinux.lds.in -=================================================================== ---- linux-bgl.orig/arch/alpha/vmlinux.lds.in 2003-07-02 08:43:45.000000000 -0700 -+++ linux-bgl/arch/alpha/vmlinux.lds.in 2004-10-26 22:52:50.036678113 -0700 -@@ -28,6 +28,10 @@ - __stop___ksymtab = .; - .kstrtab : { *(.kstrtab) } - -+ __start___kallsyms = .; /* All kernel symbols */ -+ __kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; -+ - /* Startup code */ - . 
= ALIGN(8192); - __init_begin = .; -Index: linux-bgl/Makefile -=================================================================== ---- linux-bgl.orig/Makefile 2004-10-04 09:55:49.000000000 -0700 -+++ linux-bgl/Makefile 2004-10-26 22:54:44.018588371 -0700 -@@ -38,10 +38,13 @@ - MAKEFILES = $(TOPDIR)/.config - GENKSYMS = /sbin/genksyms - DEPMOD = /sbin/depmod -+KALLSYMS = /sbin/kallsyms - MODFLAGS = -DMODULE - CFLAGS_KERNEL = - PERL = perl - -+TMPPREFIX = -+ - export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ - CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ - CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL -@@ -198,7 +201,7 @@ - CLEAN_FILES = \ - kernel/ksyms.lst include/linux/compile.h \ - vmlinux System.map \ -- .tmp* \ -+ $(TMPPREFIX).tmp* \ - drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \ - drivers/char/conmakehash \ - drivers/char/drm/*-mod.c \ -@@ -278,16 +281,39 @@ - boot: vmlinux - @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot - -+LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ -+ --start-group \ -+ $(CORE_FILES) \ -+ $(DRIVERS) \ -+ $(NETWORKS) \ -+ $(LIBS) \ -+ --end-group -+ifeq ($(CONFIG_KALLSYMS),y) -+LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o -+else -+LD_VMLINUX_KALLSYMS := -+endif -+ - vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs -- $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ -- --start-group \ -- $(CORE_FILES) \ -- $(DRIVERS) \ -- $(NETWORKS) \ -- $(LIBS) \ -- --end-group \ -- -o vmlinux -+ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms -+ -+.PHONY: kallsyms -+ -+kallsyms: -+ifeq ($(CONFIG_KALLSYMS),y) -+ @echo kallsyms pass 1 -+ $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o -+ @echo kallsyms pass 2 -+ @$(LD_VMLINUX) 
$(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o -+ @echo kallsyms pass 3 -+ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o -+endif -+ $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o vmlinux - $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map -+ @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms* - - symlinks: - rm -f include/asm -Index: linux-bgl/kernel/Makefile -=================================================================== ---- linux-bgl.orig/kernel/Makefile 2003-07-02 08:44:29.000000000 -0700 -+++ linux-bgl/kernel/Makefile 2004-10-26 22:59:34.101037916 -0700 -@@ -19,6 +19,7 @@ - obj-$(CONFIG_UID16) += uid16.o - obj-$(CONFIG_MODULES) += ksyms.o - obj-$(CONFIG_PM) += pm.o -+obj-$(CONFIG_KALLSYMS) += kallsyms.o - - ifneq ($(CONFIG_IA64),y) - # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 21:49:59.922431839 -0700 -+++ linux-bgl/kernel/ksyms.c 2004-10-26 22:52:50.050675927 -0700 -@@ -56,6 +56,9 @@ - #ifdef CONFIG_KMOD - #include <linux/kmod.h> - #endif -+#ifdef CONFIG_KALLSYMS -+#include <linux/kallsyms.h> -+#endif - - extern void set_device_ro(kdev_t dev,int flag); - -@@ -81,6 +84,15 @@ - EXPORT_SYMBOL(inter_module_put); - EXPORT_SYMBOL(try_inc_mod_count); - -+#ifdef CONFIG_KALLSYMS -+extern const char __start___kallsyms[]; -+extern const char __stop___kallsyms[]; -+EXPORT_SYMBOL(__start___kallsyms); -+EXPORT_SYMBOL(__stop___kallsyms); -+ -+ -+#endif -+ - /* process memory management */ - EXPORT_SYMBOL(do_mmap_pgoff); - EXPORT_SYMBOL(do_munmap); -Index: linux-bgl/kernel/kallsyms.c -=================================================================== ---- linux-bgl.orig/kernel/kallsyms.c 2004-10-26
17:10:51.404753448 -0700 -+++ linux-bgl/kernel/kallsyms.c 2004-10-26 22:52:50.048676240 -0700 -@@ -0,0 +1,306 @@ -+/* An example of using kallsyms data in a kernel debugger. -+ -+ Copyright 2000 Keith Owens <kaos@ocs.com.au> April 2000 -+ -+ This file is part of the Linux modutils. -+ -+ This program is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by the -+ Free Software Foundation; either version 2 of the License, or (at your -+ option) any later version. -+ -+ This program is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software Foundation, -+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ */ -+ -+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $" -+ -+/* -+ This code uses the list of all kernel and module symbols to :- -+ -+ * Find any non-stack symbol in a kernel or module. Symbols do -+ not have to be exported for debugging. -+ -+ * Convert an address to the module (or kernel) that owns it, the -+ section it is in and the nearest symbol. This finds all non-stack -+ symbols, not just exported ones. -+ -+ You need modutils >= 2.3.11 and a kernel with the kallsyms patch -+ which was compiled with CONFIG_KALLSYMS. -+ */ -+ -+#include <linux/elf.h> -+#include <linux/kernel.h> -+#include <linux/module.h> -+#include <linux/string.h> -+#include <linux/kallsyms.h> -+ -+/* These external symbols are only set on kernels compiled with -+ * CONFIG_KALLSYMS.
-+ */ -+ -+extern const char __start___kallsyms[]; -+extern const char __stop___kallsyms[]; -+ -+static struct module **kallsyms_module_list; -+ -+static void kallsyms_get_module_list(void) -+{ -+ const struct kallsyms_header *ka_hdr; -+ const struct kallsyms_section *ka_sec; -+ const struct kallsyms_symbol *ka_sym; -+ const char *ka_str; -+ int i; -+ const char *p; -+ -+ if (__start___kallsyms >= __stop___kallsyms) -+ return; -+ ka_hdr = (struct kallsyms_header *)__start___kallsyms; -+ ka_sec = (struct kallsyms_section *) -+ ((char *)(ka_hdr) + ka_hdr->section_off); -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ -+ for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) { -+ p = ka_str + ka_sym->name_off; -+ if (strcmp(p, "module_list") == 0) { -+ if (ka_sym->symbol_addr) -+ kallsyms_module_list = (struct module **)(ka_sym->symbol_addr); -+ break; -+ } -+ } -+} -+ -+static inline void kallsyms_do_first_time(void) -+{ -+ static int first_time = 1; -+ if (first_time) -+ kallsyms_get_module_list(); -+ first_time = 0; -+} -+ -+/* A symbol can appear in more than one module. A token is used to -+ * restart the scan at the next module, set the token to 0 for the -+ * first scan of each symbol. 
-+ */ -+ -+int kallsyms_symbol_to_address( -+ const char *name, /* Name to lookup */ -+ unsigned long *token, /* Which module to start at */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec; -+ const struct kallsyms_symbol *ka_sym = NULL; -+ const char *ka_str = NULL; -+ const struct module *m; -+ int i = 0, l; -+ const char *p, *pt_R; -+ char *p2; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ /* Restart? 
*/ -+ m = *kallsyms_module_list; -+ if (token && *token) { -+ for (; m; m = m->next) -+ if ((unsigned long)m == *token) -+ break; -+ if (m) -+ m = m->next; -+ } -+ -+ for (; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { -+ p = ka_str + ka_sym->name_off; -+ if (strcmp(p, name) == 0) -+ break; -+ /* Unversioned requests match versioned names */ -+ if (!(pt_R = strstr(p, "_R"))) -+ continue; -+ l = strlen(pt_R); -+ if (l < 10) -+ continue; /* Not _R.*xxxxxxxx */ -+ (void)simple_strtoul(pt_R+l-8, &p2, 16); -+ if (*p2) -+ continue; /* Not _R.*xxxxxxxx */ -+ if (strncmp(p, name, pt_R-p) == 0) -+ break; /* Match with version */ -+ } -+ if (i < ka_hdr->symbols) -+ break; -+ } -+ -+ if (token) -+ *token = (unsigned long)m; -+ if (!m) -+ return(0); /* not found */ -+ -+ ka_sec = (const struct kallsyms_section *) -+ ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off); -+ *mod_name = *(m->name) ? 
m->name : "kernel"; -+ *mod_start = ka_hdr->start; -+ *mod_end = ka_hdr->end; -+ *sec_name = ka_sec->name_off + ka_str; -+ *sec_start = ka_sec->start; -+ *sec_end = ka_sec->start + ka_sec->size; -+ *sym_name = ka_sym->name_off + ka_str; -+ *sym_start = ka_sym->symbol_addr; -+ if (i < ka_hdr->symbols-1) { -+ const struct kallsyms_symbol *ka_symn = ka_sym; -+ kallsyms_next_sym(ka_hdr, ka_symn); -+ *sym_end = ka_symn->symbol_addr; -+ } -+ else -+ *sym_end = *sec_end; -+ return(1); -+} -+ -+int kallsyms_address_to_symbol( -+ unsigned long address, /* Address to lookup */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec = NULL; -+ const struct kallsyms_symbol *ka_sym; -+ const char *ka_str; -+ const struct module *m; -+ int i; -+ unsigned long end; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ for (m = *kallsyms_module_list; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sec = (const struct kallsyms_section *) -+ ((char *)ka_hdr + ka_hdr->section_off); -+ /* Is the address in any section in this module? 
*/ -+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { -+ if (ka_sec->start <= address && -+ (ka_sec->start + ka_sec->size) > address) -+ break; -+ } -+ if (i < ka_hdr->sections) -+ break; /* Found a matching section */ -+ } -+ -+ if (!m) -+ return(0); /* not found */ -+ -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ *mod_name = *(m->name) ? m->name : "kernel"; -+ *mod_start = ka_hdr->start; -+ *mod_end = ka_hdr->end; -+ *sec_name = ka_sec->name_off + ka_str; -+ *sec_start = ka_sec->start; -+ *sec_end = ka_sec->start + ka_sec->size; -+ *sym_name = *sec_name; /* In case we find no matching symbol */ -+ *sym_start = *sec_start; -+ *sym_end = *sec_end; -+ -+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { -+ if (ka_sym->symbol_addr > address) -+ continue; -+ if (i < ka_hdr->symbols-1) { -+ const struct kallsyms_symbol *ka_symn = ka_sym; -+ kallsyms_next_sym(ka_hdr, ka_symn); -+ end = ka_symn->symbol_addr; -+ } -+ else -+ end = *sec_end; -+ if (end <= address) -+ continue; -+ if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off -+ != (char *)ka_sec) -+ continue; /* wrong section */ -+ *sym_name = ka_str + ka_sym->name_off; -+ *sym_start = ka_sym->symbol_addr; -+ *sym_end = end; -+ break; -+ } -+ return(1); -+} -+ -+/* List all sections in all modules. The callback routine is invoked with -+ * token, module name, section name, section start, section end, section flags. 
-+ */ -+int kallsyms_sections(void *token, -+ int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word))) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec = NULL; -+ const char *ka_str; -+ const struct module *m; -+ int i; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ for (m = *kallsyms_module_list; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off); -+ ka_str = ((char *)(ka_hdr) + ka_hdr->string_off); -+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { -+ if (callback( -+ token, -+ *(m->name) ? m->name : "kernel", -+ ka_sec->name_off + ka_str, -+ ka_sec->start, -+ ka_sec->start + ka_sec->size, -+ ka_sec->flags)) -+ return(0); -+ } -+ } -+ return(1); -+} -Index: linux-bgl/include/linux/kallsyms.h -=================================================================== ---- linux-bgl.orig/include/linux/kallsyms.h 2004-10-26 17:10:51.404753448 -0700 -+++ linux-bgl/include/linux/kallsyms.h 2004-10-26 22:52:50.045676708 -0700 -@@ -0,0 +1,141 @@ -+/* kallsyms headers -+ Copyright 2000 Keith Owens -+ -+ This file is part of the Linux modutils. It is exported to kernel -+ space so debuggers can access the kallsyms data. -+ -+ The kallsyms data contains all the non-stack symbols from a kernel -+ or a module. The kernel symbols are held between __start___kallsyms -+ and __stop___kallsyms. The symbols for a module are accessed via -+ the struct module chain which is based at module_list. 
-+ -+ This program is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by the -+ Free Software Foundation; either version 2 of the License, or (at your -+ option) any later version. -+ -+ This program is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software Foundation, -+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ */ -+ -+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $" -+ -+#ifndef MODUTILS_KALLSYMS_H -+#define MODUTILS_KALLSYMS_H 1 -+ -+/* Have to (re)define these ElfW entries here because external kallsyms -+ * code does not have access to modutils/include/obj.h. This code is -+ * included from user spaces tools (modutils) and kernel, they need -+ * different includes. -+ */ -+ -+#ifndef ELFCLASS32 -+#ifdef __KERNEL__ -+#include -+#else /* __KERNEL__ */ -+#include -+#endif /* __KERNEL__ */ -+#endif /* ELFCLASS32 */ -+ -+#ifndef ELFCLASSM -+#define ELFCLASSM ELF_CLASS -+#endif -+ -+#ifndef ElfW -+# if ELFCLASSM == ELFCLASS32 -+# define ElfW(x) Elf32_ ## x -+# define ELFW(x) ELF32_ ## x -+# else -+# define ElfW(x) Elf64_ ## x -+# define ELFW(x) ELF64_ ## x -+# endif -+#endif -+ -+/* Format of data in the kallsyms section. -+ * Most of the fields are small numbers but the total size and all -+ * offsets can be large so use the 32/64 bit types for these fields. -+ * -+ * Do not use sizeof() on these structures, modutils may be using extra -+ * fields. Instead use the size fields in the header to access the -+ * other bits of data. 
-+ */ -+ -+struct kallsyms_header { -+ int size; /* Size of this header */ -+ ElfW(Word) total_size; /* Total size of kallsyms data */ -+ int sections; /* Number of section entries */ -+ ElfW(Off) section_off; /* Offset to first section entry */ -+ int section_size; /* Size of one section entry */ -+ int symbols; /* Number of symbol entries */ -+ ElfW(Off) symbol_off; /* Offset to first symbol entry */ -+ int symbol_size; /* Size of one symbol entry */ -+ ElfW(Off) string_off; /* Offset to first string */ -+ ElfW(Addr) start; /* Start address of first section */ -+ ElfW(Addr) end; /* End address of last section */ -+}; -+ -+struct kallsyms_section { -+ ElfW(Addr) start; /* Start address of section */ -+ ElfW(Word) size; /* Size of this section */ -+ ElfW(Off) name_off; /* Offset to section name */ -+ ElfW(Word) flags; /* Flags from section */ -+}; -+ -+struct kallsyms_symbol { -+ ElfW(Off) section_off; /* Offset to section that owns this symbol */ -+ ElfW(Addr) symbol_addr; /* Address of symbol */ -+ ElfW(Off) name_off; /* Offset to symbol name */ -+}; -+ -+#define KALLSYMS_SEC_NAME "__kallsyms" -+#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */ -+ -+#define kallsyms_next_sec(h,s) \ -+ ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size)) -+#define kallsyms_next_sym(h,s) \ -+ ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size)) -+ -+int kallsyms_symbol_to_address( -+ const char *name, /* Name to lookup */ -+ unsigned long *token, /* Which module to start with */ -+ const char **mod_name, /* Set to module name or "kernel" */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* 
Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ); -+ -+int kallsyms_address_to_symbol( -+ unsigned long address, /* Address to lookup */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ); -+ -+int kallsyms_sections(void *token, -+ int (*callback)(void *, /* token */ -+ const char *, /* module name */ -+ const char *, /* section name */ -+ ElfW(Addr), /* Section start */ -+ ElfW(Addr), /* Section end */ -+ ElfW(Word) /* Section flags */ -+ ) -+ ); -+ -+#endif /* kallsyms.h */ diff --git a/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch b/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch deleted file mode 100644 index 9d33973..0000000 --- a/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch +++ /dev/null @@ -1,678 +0,0 @@ -Index: linux-bgl/arch/i386/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/i386/kernel/traps.c 2003-07-02 08:43:23.000000000 -0700 -+++ linux-bgl/arch/i386/kernel/traps.c 2004-10-26 23:25:17.950442396 -0700 -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - - #ifdef CONFIG_MCA - #include -@@ -135,6 +136,8 @@ - { - int i; - unsigned long addr; -+ /* static to not take up stackspace; if we race here too bad */ -+ static char buffer[512]; - - if (!stack) - stack = (unsigned long*)&stack; -@@ -144,9 +147,8 @@ - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack++; - if (kernel_text_address(addr)) { -- if (i && ((i 
% 6) == 0)) -- printk("\n "); -- printk(" [<%08lx>]", addr); -+ lookup_symbol(addr, buffer, 512); -+ printk("[<%08lx>] %s (0x%p)\n", addr,buffer,stack-1); - i++; - } - } -@@ -186,12 +188,19 @@ - show_trace(esp); - } - -+#ifdef CONFIG_MK7 -+#define ARCHIT "/athlon" -+#else -+#define ARCHIT "/i686" -+#endif -+ - void show_registers(struct pt_regs *regs) - { - int i; - int in_kernel = 1; - unsigned long esp; - unsigned short ss; -+ static char buffer[512]; - - esp = (unsigned long) (®s->esp); - ss = __KERNEL_DS; -@@ -200,8 +209,12 @@ - esp = regs->esp; - ss = regs->xss & 0xffff; - } -+ -+ print_modules(); -+ lookup_symbol(regs->eip, buffer, 512); - printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags); -+ printk("\nEIP is at %s (" UTS_RELEASE ARCHIT ")\n",buffer); - printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", -@@ -261,7 +274,7 @@ - if (__get_user(file, (char **)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = ""; -- -+ printk("------------[ cut here ]------------\n"); - printk("kernel BUG at %s:%d!\n", file, line); - - no_bug: -Index: linux-bgl/arch/i386/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/i386/kernel/process.c 2003-07-02 08:44:07.000000000 -0700 -+++ linux-bgl/arch/i386/kernel/process.c 2004-10-26 23:28:53.017015082 -0700 -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -437,10 +438,14 @@ - void show_regs(struct pt_regs * regs) - { - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -+ static char buffer[512]; -+ -+ lookup_symbol(regs->eip, buffer, 512); - - printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, 
smp_processor_id()); -+ printk("\nEIP is at %s (" UTS_RELEASE ")\n", buffer); - if (regs->xcs & 3) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted()); -Index: linux-bgl/arch/ia64/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/ia64/kernel/process.c 2003-07-02 08:43:26.000000000 -0700 -+++ linux-bgl/arch/ia64/kernel/process.c 2004-10-26 23:29:56.340005959 -0700 -@@ -18,6 +18,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -33,9 +34,10 @@ - #include - #endif - --static void --do_show_stack (struct unw_frame_info *info, void *arg) -+void -+ia64_do_show_stack (struct unw_frame_info *info, void *arg) - { -+ static char buffer[512]; - unsigned long ip, sp, bsp; - - printk("\nCall Trace: "); -@@ -46,7 +48,8 @@ - - unw_get_sp(info, &sp); - unw_get_bsp(info, &bsp); -- printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx\n", ip, sp, bsp); -+ lookup_symbol(ip, buffer, 512); -+ printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx %s\n", ip, sp, bsp, buffer); - } while (unw_unwind(info) >= 0); - } - -@@ -56,19 +59,19 @@ - struct unw_frame_info info; - - unw_init_from_blocked_task(&info, task); -- do_show_stack(&info, 0); -+ ia64_do_show_stack(&info, 0); - } - - void - show_stack (struct task_struct *task) - { - if (!task) -- unw_init_running(do_show_stack, 0); -+ unw_init_running(ia64_do_show_stack, 0); - else { - struct unw_frame_info info; - - unw_init_from_blocked_task(&info, task); -- do_show_stack(&info, 0); -+ ia64_do_show_stack(&info, 0); - } - } - -@@ -76,8 +79,11 @@ - show_regs (struct pt_regs *regs) - { - unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; -+ static char buffer[512]; - - printk("\nPid: %d, comm: %20s\n", current->pid, current->comm); -+ lookup_symbol(ip, buffer, 512); -+ printk("EIP is at %s (" UTS_RELEASE ")\n", buffer); - printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", - regs->cr_ipsr, 
regs->cr_ifs, ip, print_tainted()); - printk("unat: %016lx pfs : %016lx rsc : %016lx\n", -Index: linux-bgl/arch/s390/config.in -=================================================================== ---- linux-bgl.orig/arch/s390/config.in 2003-07-02 08:43:27.000000000 -0700 -+++ linux-bgl/arch/s390/config.in 2004-10-26 23:25:17.961440685 -0700 -@@ -73,5 +73,6 @@ - # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG - #fi - bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - endmenu - -Index: linux-bgl/arch/s390/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/s390/kernel/traps.c 2003-07-02 08:44:02.000000000 -0700 -+++ linux-bgl/arch/s390/kernel/traps.c 2004-10-26 23:25:17.964440218 -0700 -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -108,27 +109,26 @@ - - void show_trace(unsigned long * stack) - { -+ static char buffer[512]; - unsigned long backchain, low_addr, high_addr, ret_addr; - int i; - - if (!stack) - stack = (unsigned long*)&stack; - -- printk("Call Trace: "); - low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; - high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; - /* Skip the first frame (biased stack) */ - backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; -- /* Print up to 8 lines */ -- for (i = 0; i < 8; i++) { -+ /* Print up to 20 lines */ -+ for (i = 0; i < 20; i++) { - if (backchain < low_addr || backchain >= high_addr) - break; - ret_addr = *((unsigned long *) (backchain+56)) & PSW_ADDR_MASK; - if (!kernel_text_address(ret_addr)) - break; -- if (i && ((i % 6) == 0)) -- printk("\n "); -- printk("[<%08lx>] ", ret_addr); -+ lookup_symbol(ret_addr, buffer, 512); -+ printk("[<%08lx>] %s (0x%lx)\n", ret_addr,buffer,backchain+56); - low_addr = backchain; - backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; - } -@@ -171,6 +171,7 @@ - - void show_registers(struct pt_regs 
*regs) - { -+ static char buffer[512]; - mm_segment_t old_fs; - char *mode; - int i; -@@ -179,6 +180,10 @@ - printk("%s PSW : %08lx %08lx\n", - mode, (unsigned long) regs->psw.mask, - (unsigned long) regs->psw.addr); -+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { -+ lookup_symbol(regs->psw.addr & 0x7FFFFFFF, buffer, 512); -+ printk(" %s (" UTS_RELEASE ")\n", buffer); -+ } - printk("%s GPRS: %08x %08x %08x %08x\n", mode, - regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); - printk(" %08x %08x %08x %08x\n", -Index: linux-bgl/arch/s390x/config.in -=================================================================== ---- linux-bgl.orig/arch/s390x/config.in 2003-07-02 08:43:07.000000000 -0700 -+++ linux-bgl/arch/s390x/config.in 2004-10-26 23:25:17.964440218 -0700 -@@ -75,5 +75,6 @@ - # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG - #fi - bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - endmenu - -Index: linux-bgl/arch/s390x/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/s390x/kernel/traps.c 2003-07-02 08:43:25.000000000 -0700 -+++ linux-bgl/arch/s390x/kernel/traps.c 2004-10-26 23:25:17.966439907 -0700 -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -112,25 +113,25 @@ - { - unsigned long backchain, low_addr, high_addr, ret_addr; - int i; -+ /* static to not take up stackspace; if we race here too bad */ -+ static char buffer[512]; - - if (!stack) - stack = (unsigned long*)&stack; - -- printk("Call Trace: "); - low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; - high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; - /* Skip the first frame (biased stack) */ - backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; -- /* Print up to 8 lines */ -- for (i = 0; i < 8; i++) { -+ /* Print up to 20 lines */ -+ for (i = 0; i < 20; i++) { - if (backchain < low_addr || backchain >= 
high_addr) - break; - ret_addr = *((unsigned long *) (backchain+112)) & PSW_ADDR_MASK; - if (!kernel_text_address(ret_addr)) - break; -- if (i && ((i % 3) == 0)) -- printk("\n "); -- printk("[<%016lx>] ", ret_addr); -+ lookup_symbol(ret_addr, buffer, 512); -+ printk("[<%016lx>] %s (0x%lx)\n", ret_addr, buffer, backchain+112); - low_addr = backchain; - backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; - } -@@ -173,6 +174,7 @@ - - void show_registers(struct pt_regs *regs) - { -+ static char buffer[512]; - mm_segment_t old_fs; - char *mode; - int i; -@@ -181,6 +183,10 @@ - printk("%s PSW : %016lx %016lx\n", - mode, (unsigned long) regs->psw.mask, - (unsigned long) regs->psw.addr); -+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { -+ lookup_symbol(regs->psw.addr, buffer, 512); -+ printk(" %s (" UTS_RELEASE ")\n", buffer); -+ } - printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, - regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); - printk(" %016lx %016lx %016lx %016lx\n", -Index: linux-bgl/arch/ppc64/mm/fault.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/mm/fault.c 2003-07-02 08:43:12.000000000 -0700 -+++ linux-bgl/arch/ppc64/mm/fault.c 2004-10-26 23:30:24.467942247 -0700 -@@ -224,7 +224,6 @@ - if (debugger_kernel_faults) - debugger(regs); - #endif -- print_backtrace( (unsigned long *)regs->gpr[1] ); - panic("kernel access of bad area pc %lx lr %lx address %lX tsk %s/%d", - regs->nip,regs->link,address,current->comm,current->pid); - } -Index: linux-bgl/arch/ppc64/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/kernel/traps.c 2003-07-02 08:44:03.000000000 -0700 -+++ linux-bgl/arch/ppc64/kernel/traps.c 2004-10-26 23:33:45.297572484 -0700 -@@ -89,7 +89,6 @@ - #if defined(CONFIG_KDB) - kdb(KDB_REASON_OOPS, 0, (kdb_eframe_t) regs); - #endif -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("Exception in kernel pc %lx 
signal %d",regs->nip,signr); - #if defined(CONFIG_PPCDBG) && (defined(CONFIG_XMON) || defined(CONFIG_KGDB)) - /* Allow us to catch SIGILLs for 64-bit app/glibc debugging. -Peter */ -@@ -187,7 +186,6 @@ - if (kdb(KDB_REASON_FAULT, 0, regs)) - return ; - #endif -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("machine check"); - } - _exception(SIGSEGV, regs); -@@ -209,7 +207,6 @@ - } - #endif - show_regs(regs); -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("System Management Interrupt"); - } - -Index: linux-bgl/arch/ppc64/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/kernel/process.c 2003-07-02 08:44:31.000000000 -0700 -+++ linux-bgl/arch/ppc64/kernel/process.c 2004-10-26 23:33:01.060713583 -0700 -@@ -30,6 +30,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -130,12 +132,61 @@ - __restore_flags(s); - } - -+/* -+ * If the address is either in the .text section of the -+ * kernel, or in the vmalloc'ed module regions, it *may* -+ * be the address of a calling routine -+ */ -+ -+#ifdef CONFIG_MODULES -+ -+extern struct module *module_list; -+extern struct module kernel_module; -+extern char _stext[], _etext[]; -+ -+static inline int kernel_text_address(unsigned long addr) -+{ -+ int retval = 0; -+ struct module *mod; -+ -+ if (addr >= (unsigned long) &_stext && -+ addr <= (unsigned long) &_etext) -+ return 1; -+ -+ for (mod = module_list; mod != &kernel_module; mod = mod->next) { -+ /* mod_bound tests for addr being inside the vmalloc'ed -+ * module area. Of course it'd be better to test only -+ * for the .text subset... 
*/ -+ if (mod_bound(addr, 0, mod)) { -+ retval = 1; -+ break; -+ } -+ } -+ -+ return retval; -+} -+ -+#else -+ -+static inline int kernel_text_address(unsigned long addr) -+{ -+ return (addr >= (unsigned long) &_stext && -+ addr <= (unsigned long) &_etext); -+} -+ -+#endif -+ -+ - void show_regs(struct pt_regs * regs) - { - int i; -+ static char buffer[512]; - -- printk("NIP: %016lX XER: %016lX LR: %016lX REGS: %p TRAP: %04lx %s\n", -+ print_modules(); -+ printk("NIP: %016lx XER: %016lx LR: %016lx REGS: %p TRAP: %04lx %s\n", - regs->nip, regs->xer, regs->link, regs,regs->trap, print_tainted()); -+ lookup_symbol(regs->nip, buffer, 512); -+ printk("NIP is at %s (" UTS_RELEASE ")\n", buffer); - printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x IR/DR: %01x%01x\n", - regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0, - regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 1 : 0, -@@ -147,27 +198,22 @@ - printk("\nlast math %p ", last_task_used_math); - - #ifdef CONFIG_SMP -- /* printk(" CPU: %d last CPU: %d", current->processor,current->last_processor); */ -+ printk("CPU: %d", smp_processor_id()); - #endif /* CONFIG_SMP */ - -- printk("\n"); - for (i = 0; i < 32; i++) - { - long r; - if ((i % 4) == 0) -- { -- printk("GPR%02d: ", i); -- } -+ printk("\nGPR%02d: ", i); - - if ( __get_user(r, &(regs->gpr[i])) ) - return; - -- printk("%016lX ", r); -- if ((i % 4) == 3) -- { -- printk("\n"); -- } -+ printk("%016lx ", r); - } -+ printk("\n"); -+ print_backtrace((unsigned long *)regs->gpr[1]); - } - - void exit_thread(void) -@@ -415,67 +461,24 @@ - } - } - --extern char _stext[], _etext[]; -- --char * ppc_find_proc_name( unsigned * p, char * buf, unsigned buflen ) --{ -- unsigned long tb_flags; -- unsigned short name_len; -- unsigned long tb_start, code_start, code_ptr, code_offset; -- unsigned code_len; -- strcpy( buf, "Unknown" ); -- code_ptr = (unsigned long)p; -- code_offset = 0; -- if ( ( (unsigned long)p >= (unsigned long)_stext ) && ( (unsigned long)p <= 
(unsigned long)_etext ) ) { -- while ( (unsigned long)p <= (unsigned long)_etext ) { -- if ( *p == 0 ) { -- tb_start = (unsigned long)p; -- ++p; /* Point to traceback flags */ -- tb_flags = *((unsigned long *)p); -- p += 2; /* Skip over traceback flags */ -- if ( tb_flags & TB_NAME_PRESENT ) { -- if ( tb_flags & TB_PARMINFO ) -- ++p; /* skip over parminfo data */ -- if ( tb_flags & TB_HAS_TBOFF ) { -- code_len = *p; /* get code length */ -- code_start = tb_start - code_len; -- code_offset = code_ptr - code_start + 1; -- if ( code_offset > 0x100000 ) -- break; -- ++p; /* skip over code size */ -- } -- name_len = *((unsigned short *)p); -- if ( name_len > (buflen-20) ) -- name_len = buflen-20; -- memcpy( buf, ((char *)p)+2, name_len ); -- buf[name_len] = 0; -- if ( code_offset ) -- sprintf( buf+name_len, "+0x%lx", code_offset-1 ); -- } -- break; -- } -- ++p; -- } -- } -- return buf; --} -- - void - print_backtrace(unsigned long *sp) - { - int cnt = 0; - unsigned long i; -- char name_buf[256]; -+ char buffer[512]; - -- printk("Call backtrace: \n"); -+ printk("Call Trace: \n"); - while (sp) { - if (__get_user( i, &sp[2] )) - break; -- printk("%016lX ", i); -- printk("%s\n", ppc_find_proc_name( (unsigned *)i, name_buf, 256 )); -+ if (kernel_text_address(i)) { -+ if (__get_user(sp, (unsigned long **)sp)) -+ break; -+ lookup_symbol(i, buffer, 512); -+ printk("[<%016lx>] %s\n", i, buffer); -+ } - if (cnt > 32) break; -- if (__get_user(sp, (unsigned long **)sp)) -- break; - } - printk("\n"); - } -@@ -515,6 +518,7 @@ - unsigned long ip, sp; - unsigned long stack_page = (unsigned long)p; - int count = 0; -+ static char buffer[512]; - - if (!p) - return; -@@ -528,7 +532,8 @@ - break; - if (count > 0) { - ip = *(unsigned long *)(sp + 16); -- printk("[%016lx] ", ip); -+ lookup_symbol(ip, buffer, 512); -+ printk("[<%016lx>] %s\n", ip, buffer); - } - } while (count++ < 16); - printk("\n"); -Index: linux-bgl/kernel/Makefile 
-=================================================================== ---- linux-bgl.orig/kernel/Makefile 2004-10-26 23:23:00.516655289 -0700 -+++ linux-bgl/kernel/Makefile 2004-10-26 23:35:04.930451186 -0700 -@@ -14,7 +14,7 @@ - obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \ - module.o exit.o itimer.o info.o time.o softirq.o resource.o \ - sysctl.o acct.o capability.o ptrace.o timer.o user.o \ -- signal.o sys.o kmod.o context.o -+ signal.o sys.o kmod.o context.o kksymoops.o - - obj-$(CONFIG_UID16) += uid16.o - obj-$(CONFIG_MODULES) += ksyms.o -Index: linux-bgl/kernel/kksymoops.c -=================================================================== ---- linux-bgl.orig/kernel/kksymoops.c 2004-10-26 17:10:51.404753448 -0700 -+++ linux-bgl/kernel/kksymoops.c 2004-10-26 23:25:17.971439129 -0700 -@@ -0,0 +1,82 @@ -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_KALLSYMS -+#include -+#endif -+ -+ -+ -+int lookup_symbol(unsigned long address, char *buffer, int buflen) -+{ -+ struct module *this_mod; -+ unsigned long bestsofar; -+ -+ const char *mod_name = NULL, *sec_name = NULL, *sym_name = NULL; -+ unsigned long mod_start,mod_end,sec_start,sec_end,sym_start,sym_end; -+ -+ if (!buffer) -+ return -EFAULT; -+ -+ if (buflen<256) -+ return -ENOMEM; -+ -+ memset(buffer,0,buflen); -+ -+#ifdef CONFIG_KALLSYMS -+ if (!kallsyms_address_to_symbol(address,&mod_name,&mod_start,&mod_end,&sec_name, -+ &sec_start, &sec_end, &sym_name, &sym_start, &sym_end)) { -+ /* kallsyms doesn't have a clue; lets try harder */ -+ bestsofar = 0; -+ snprintf(buffer,buflen-1,"[unresolved]"); -+ -+ this_mod = module_list; -+ -+ while (this_mod != NULL) { -+ int i; -+ /* walk the symbol list of this module. 
Only symbols -+ who's address is smaller than the searched for address -+ are relevant; and only if it's better than the best so far */ -+ for (i=0; i< this_mod->nsyms; i++) -+ if ((this_mod->syms[i].value<=address) && -+ (bestsofarsyms[i].value)) { -+ snprintf(buffer,buflen-1,"%s [%s] 0x%x", -+ this_mod->syms[i].name, -+ this_mod->name, -+ (unsigned int)(address - this_mod->syms[i].value)); -+ bestsofar = this_mod->syms[i].value; -+ } -+ this_mod = this_mod->next; -+ } -+ -+ } else { /* kallsyms success */ -+ snprintf(buffer,buflen-1,"%s [%s] 0x%x",sym_name,mod_name,(unsigned int)(address-sym_start)); -+ } -+#endif -+ return strlen(buffer); -+} -+ -+static char modlist[4096]; -+/* this function isn't smp safe but that's not really a problem; it's called from -+ * oops context only and any locking could actually prevent the oops from going out; -+ * the line that is generated is informational only and should NEVER prevent the real oops -+ * from going out. -+ */ -+void print_modules(void) -+{ -+ struct module *this_mod; -+ int pos = 0, i; -+ memset(modlist,0,4096); -+ -+#ifdef CONFIG_KALLSYMS -+ this_mod = module_list; -+ while (this_mod != NULL) { -+ if (this_mod->name != NULL) -+ pos +=snprintf(modlist+pos,160-pos-1,"%s ",this_mod->name); -+ this_mod = this_mod->next; -+ } -+ printk("%s\n",modlist); -+#endif -+} -Index: linux-bgl/include/linux/kernel.h -=================================================================== ---- linux-bgl.orig/include/linux/kernel.h 2003-07-02 08:44:16.000000000 -0700 -+++ linux-bgl/include/linux/kernel.h 2004-10-26 23:25:17.968439596 -0700 -@@ -107,6 +107,9 @@ - extern int tainted; - extern const char *print_tainted(void); - -+extern int lookup_symbol(unsigned long address, char *buffer, int buflen); -+extern void print_modules(void); -+ - #if DEBUG - #define pr_debug(fmt,arg...) 
\ - printk(KERN_DEBUG fmt,##arg) diff --git a/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch b/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch deleted file mode 100644 index f8db708..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch +++ /dev/null @@ -1,1842 +0,0 @@ -Index: linux-2.4.24/arch/i386/kernel/i386_ksyms.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/i386_ksyms.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/i386/kernel/i386_ksyms.c 2004-05-07 16:58:39.000000000 -0400 -@@ -186,3 +186,8 @@ - EXPORT_SYMBOL(edd); - EXPORT_SYMBOL(eddnr); - #endif -+ -+EXPORT_SYMBOL_GPL(show_mem); -+EXPORT_SYMBOL_GPL(show_state); -+EXPORT_SYMBOL_GPL(show_regs); -+ -Index: linux-2.4.24/arch/i386/kernel/process.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/process.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/i386/kernel/process.c 2004-05-07 17:08:18.000000000 -0400 -@@ -400,7 +400,8 @@ - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. 
- */ -- smp_send_stop(); -+ if (!netdump_func) -+ smp_send_stop(); - #elif CONFIG_X86_LOCAL_APIC - if (cpu_has_apic) { - __cli(); -Index: linux-2.4.24/arch/i386/kernel/traps.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/traps.c 2004-05-07 16:57:00.000000000 -0400 -+++ linux-2.4.24/arch/i386/kernel/traps.c 2004-05-07 17:09:17.000000000 -0400 -@@ -280,6 +280,9 @@ - printk("Kernel BUG\n"); - } - -+void (*netdump_func) (struct pt_regs *regs) = NULL; -+int netdump_mode = 0; -+ - spinlock_t die_lock = SPIN_LOCK_UNLOCKED; - - void die(const char * str, struct pt_regs * regs, long err) -@@ -290,6 +293,8 @@ - handle_BUG(regs); - printk("%s: %04lx\n", str, err & 0xffff); - show_registers(regs); -+ if (netdump_func) -+ netdump_func(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); - do_exit(SIGSEGV); -@@ -1041,3 +1046,9 @@ - - EXPORT_SYMBOL_GPL(is_kernel_text_address); - EXPORT_SYMBOL_GPL(lookup_symbol); -+ -+EXPORT_SYMBOL_GPL(netdump_func); -+EXPORT_SYMBOL_GPL(netdump_mode); -+#if CONFIG_X86_LOCAL_APIC -+EXPORT_SYMBOL_GPL(nmi_watchdog); -+#endif -Index: linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c -=================================================================== ---- linux-2.4.24.orig/arch/x86_64/kernel/x8664_ksyms.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c 2004-05-07 17:01:51.000000000 -0400 -@@ -41,6 +41,9 @@ - EXPORT_SYMBOL(drive_info); - #endif - -+//extern void (*netdump_func) (struct pt_regs *regs) = NULL; -+int netdump_mode = 0; -+ - /* platform dependent support */ - EXPORT_SYMBOL(boot_cpu_data); - EXPORT_SYMBOL(dump_fpu); -@@ -229,3 +232,6 @@ - EXPORT_SYMBOL(touch_nmi_watchdog); - - EXPORT_SYMBOL(do_fork); -+ -+EXPORT_SYMBOL_GPL(netdump_func); -+EXPORT_SYMBOL_GPL(netdump_mode); -Index: linux-2.4.24/drivers/net/3c59x.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/3c59x.c 2003-11-28 
13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/3c59x.c 2004-05-07 17:01:00.000000000 -0400 -@@ -874,6 +874,7 @@ - static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); - static void vortex_tx_timeout(struct net_device *dev); - static void acpi_set_WOL(struct net_device *dev); -+static void vorboom_poll(struct net_device *dev); - static struct ethtool_ops vortex_ethtool_ops; - - /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */ -@@ -1343,6 +1344,9 @@ - dev->set_multicast_list = set_rx_mode; - dev->tx_timeout = vortex_tx_timeout; - dev->watchdog_timeo = (watchdog * HZ) / 1000; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &vorboom_poll; -+#endif - if (pdev && vp->enable_wol) { - vp->pm_state_valid = 1; - pci_save_state(vp->pdev, vp->power_state); -@@ -2322,6 +2326,29 @@ - spin_unlock(&vp->lock); - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void vorboom_poll (struct net_device *dev) -+{ -+ struct vortex_private *vp = (struct vortex_private *)dev->priv; -+ -+ if (!netdump_mode) disable_irq(dev->irq); -+ if (vp->full_bus_master_tx) -+ boomerang_interrupt(dev->irq, dev, 0); -+ else -+ vortex_interrupt(dev->irq, dev, 0); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - static int vortex_rx(struct net_device *dev) - { - struct vortex_private *vp = (struct vortex_private *)dev->priv; -Index: linux-2.4.24/drivers/net/Config.in -=================================================================== ---- linux-2.4.24.orig/drivers/net/Config.in 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/Config.in 2004-05-07 16:58:39.000000000 -0400 -@@ -295,6 +295,8 @@ - dep_tristate ' SysKonnect FDDI PCI support' CONFIG_SKFP $CONFIG_PCI - fi - -+tristate 'Network logging support' CONFIG_NETCONSOLE -+ - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - if [ "$CONFIG_INET" = "y" ]; then - bool 'HIPPI driver support (EXPERIMENTAL)' CONFIG_HIPPI -Index: linux-2.4.24/drivers/net/eepro100.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/eepro100.c 2003-08-25 07:44:42.000000000 -0400 -+++ linux-2.4.24/drivers/net/eepro100.c 2004-05-07 16:58:39.000000000 -0400 -@@ -543,6 +543,7 @@ - static int speedo_rx(struct net_device *dev); - static void speedo_tx_buffer_gc(struct net_device *dev); - static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); -+static void poll_speedo (struct net_device *dev); - static int speedo_close(struct net_device *dev); - static struct net_device_stats *speedo_get_stats(struct net_device *dev); - static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -@@ -879,6 +880,9 @@ - dev->get_stats = &speedo_get_stats; - dev->set_multicast_list = &set_rx_mode; - dev->do_ioctl = &speedo_ioctl; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_speedo; -+#endif 
- - return 0; - } -@@ -1176,10 +1180,8 @@ - - - /* Media monitoring and control. */ --static void speedo_timer(unsigned long data) -+static void speedo_timeout(struct net_device *dev, struct speedo_private *sp) - { -- struct net_device *dev = (struct net_device *)data; -- struct speedo_private *sp = (struct speedo_private *)dev->priv; - long ioaddr = dev->base_addr; - int phy_num = sp->phy[0] & 0x1f; - -@@ -1217,6 +1219,15 @@ - dev->name, sp->rx_mode, jiffies, sp->last_rx_time); - set_rx_mode(dev); - } -+} -+ -+static void speedo_timer(unsigned long data) -+{ -+ struct net_device *dev = (struct net_device *)data; -+ struct speedo_private *sp = (struct speedo_private *)dev->priv; -+ -+ speedo_timeout(dev, sp); -+ - /* We must continue to monitor the media. */ - sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */ - add_timer(&sp->timer); -@@ -1661,6 +1672,29 @@ - return; - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_speedo (struct net_device *dev) -+{ -+ struct speedo_private *sp = (struct speedo_private *)dev->priv; -+ -+ if (!netdump_mode) disable_irq(dev->irq); -+ if (sp->timer.expires == jiffies) { -+ sp->timer.expires = RUN_AT(2*HZ); -+ speedo_timeout(dev, sp); -+ } -+ speedo_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ - static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) - { - struct speedo_private *sp = (struct speedo_private *)dev->priv; -Index: linux-2.4.24/drivers/net/Makefile -=================================================================== ---- linux-2.4.24.orig/drivers/net/Makefile 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/Makefile 2004-05-07 16:58:39.000000000 -0400 -@@ -250,6 +250,8 @@ - obj-y += ../acorn/net/acorn-net.o - endif - -+obj-$(CONFIG_NETCONSOLE) += netconsole.o -+ - # - # HIPPI adapters - # -Index: linux-2.4.24/drivers/net/netconsole.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/netconsole.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.4.24/drivers/net/netconsole.c 2004-05-07 16:58:39.000000000 -0400 -@@ -0,0 +1,1246 @@ -+/* -+ * linux/drivers/net/netconsole.c -+ * -+ * Copyright (C) 2001 Ingo Molnar -+ * Copyright (C) 2002 Red Hat, Inc. -+ * -+ * This file contains the implementation of an IRQ-safe, crash-safe -+ * kernel console implementation that outputs kernel messages to the -+ * network. -+ * -+ * Modification history: -+ * -+ * 2001-09-17 started by Ingo Molnar. -+ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson -+ */ -+ -+/**************************************************************** -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2, or (at your option) -+ * any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -+ * -+ ****************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#if CONFIG_X86_LOCAL_APIC -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct net_device *netconsole_dev; -+static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port; -+static u32 source_ip, netdump_target_ip, netlog_target_ip, syslog_target_ip; -+static unsigned char netdump_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+static unsigned char netlog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+static unsigned char syslog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+ -+static unsigned int mhz = 500, idle_timeout; -+static unsigned long long mhz_cycles, jiffy_cycles; -+ -+#include "netconsole.h" -+ -+#define MAX_UDP_CHUNK 1460 -+#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) -+ -+#define DEBUG 0 -+#if DEBUG -+# define Dprintk(x...) printk(KERN_INFO x) -+#else -+# define Dprintk(x...) -+#endif -+/* -+ * We maintain a small pool of fully-sized skbs, -+ * to make sure the message gets out even in -+ * extreme OOM situations. 
-+ */ -+#define MAX_NETCONSOLE_SKBS 128 -+ -+static spinlock_t netconsole_lock = SPIN_LOCK_UNLOCKED; -+static int nr_netconsole_skbs; -+static struct sk_buff *netconsole_skbs; -+ -+#define MAX_SKB_SIZE \ -+ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ -+ sizeof(struct iphdr) + sizeof(struct ethhdr)) -+ -+static int new_arp = 0; -+static unsigned char arp_sha[ETH_ALEN], arp_tha[ETH_ALEN]; -+static u32 arp_sip, arp_tip; -+ -+static void send_netconsole_arp(struct net_device *dev); -+ -+static void __refill_netconsole_skbs(void) -+{ -+ struct sk_buff *skb; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&netconsole_lock, flags); -+ while (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) { -+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); -+ if (!skb) -+ break; -+ if (netconsole_skbs) -+ skb->next = netconsole_skbs; -+ else -+ skb->next = NULL; -+ netconsole_skbs = skb; -+ nr_netconsole_skbs++; -+ } -+ spin_unlock_irqrestore(&netconsole_lock, flags); -+} -+ -+static struct sk_buff * get_netconsole_skb(void) -+{ -+ struct sk_buff *skb; -+ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&netconsole_lock, flags); -+ skb = netconsole_skbs; -+ if (skb) { -+ netconsole_skbs = skb->next; -+ skb->next = NULL; -+ nr_netconsole_skbs--; -+ } -+ spin_unlock_irqrestore(&netconsole_lock, flags); -+ -+ return skb; -+} -+ -+static unsigned long long t0; -+ -+/* -+ * Do cleanups: -+ * - zap completed output skbs. -+ * - send ARPs if requested -+ * - reboot the box if inactive for more than N seconds. 
-+ */ -+static void zap_completion_queue(void) -+{ -+ unsigned long long t1; -+ int cpu = smp_processor_id(); -+ -+ if (softnet_data[cpu].completion_queue) { -+ struct sk_buff *clist; -+ -+ local_irq_disable(); -+ clist = softnet_data[cpu].completion_queue; -+ softnet_data[cpu].completion_queue = NULL; -+ local_irq_enable(); -+ -+ while (clist != NULL) { -+ struct sk_buff *skb = clist; -+ clist = clist->next; -+ __kfree_skb(skb); -+ } -+ } -+ -+ if (new_arp) { -+ Dprintk("got ARP req - sending reply.\n"); -+ new_arp = 0; -+ send_netconsole_arp(netconsole_dev); -+ } -+ -+ rdtscll(t1); -+ if (idle_timeout) { -+ if (t0) { -+ if (((t1 - t0) >> 20) > mhz_cycles * (unsigned long long)idle_timeout) { -+ t0 = t1; -+ printk("netdump idle timeout - rebooting in 3 seconds.\n"); -+ mdelay(3000); -+ machine_restart(NULL); -+ } -+ } -+ } -+ /* maintain jiffies in a polling fashion, based on rdtsc. */ -+ { -+ static unsigned long long prev_tick; -+ -+ if (t1 - prev_tick >= jiffy_cycles) { -+ prev_tick += jiffy_cycles; -+ jiffies++; -+ } -+ } -+} -+ -+static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve) -+{ -+ int once = 1; -+ int count = 0; -+ struct sk_buff *skb = NULL; -+ -+repeat: -+ zap_completion_queue(); -+ if (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) -+ __refill_netconsole_skbs(); -+ -+ skb = alloc_skb(len, GFP_ATOMIC); -+ if (!skb) { -+ skb = get_netconsole_skb(); -+ if (!skb) { -+ count++; -+ if (once && (count == 1000000)) { -+ printk("possibly FATAL: out of netconsole skbs!!! 
will keep retrying.\n"); -+ once = 0; -+ } -+ Dprintk("alloc skb: polling controller ...\n"); -+ dev->poll_controller(dev); -+ goto repeat; -+ } -+ } -+ -+ atomic_set(&skb->users, 1); -+ skb_reserve(skb, reserve); -+ return skb; -+} -+ -+static void transmit_raw_skb(struct sk_buff *skb, struct net_device *dev) -+{ -+ -+repeat_poll: -+ spin_lock(&dev->xmit_lock); -+ dev->xmit_lock_owner = smp_processor_id(); -+ -+ if (netif_queue_stopped(dev)) { -+ dev->xmit_lock_owner = -1; -+ spin_unlock(&dev->xmit_lock); -+ -+ Dprintk("xmit skb: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+ goto repeat_poll; -+ } -+ -+ dev->hard_start_xmit(skb, dev); -+ -+ dev->xmit_lock_owner = -1; -+ spin_unlock(&dev->xmit_lock); -+} -+ -+static void transmit_netconsole_skb(struct sk_buff *skb, struct net_device *dev, -+ int ip_len, int udp_len, -+ u16 source_port, u16 target_port, u32 source_ip, u32 target_ip, -+ unsigned char * macdaddr) -+{ -+ struct udphdr *udph; -+ struct iphdr *iph; -+ struct ethhdr *eth; -+ -+ udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); -+ udph->source = source_port; -+ udph->dest = target_port; -+ udph->len = htons(udp_len); -+ udph->check = 0; -+ -+ iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); -+ -+ iph->version = 4; -+ iph->ihl = 5; -+ iph->tos = 0; -+ iph->tot_len = htons(ip_len); -+ iph->id = 0; -+ iph->frag_off = 0; -+ iph->ttl = 64; -+ iph->protocol = IPPROTO_UDP; -+ iph->check = 0; -+ iph->saddr = source_ip; -+ iph->daddr = target_ip; -+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -+ -+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -+ -+ eth->h_proto = htons(ETH_P_IP); -+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); -+ memcpy(eth->h_dest, macdaddr, dev->addr_len); -+ -+ transmit_raw_skb(skb, dev); -+} -+ -+static void send_netconsole_arp(struct net_device *dev) -+{ -+ int total_len, arp_len, arp_data_len; -+ struct sk_buff *skb; -+ unsigned char *arp; -+ struct arphdr *arph; -+ 
struct ethhdr *eth; -+ -+ arp_data_len = 2*4 + 2*ETH_ALEN; -+ arp_len = arp_data_len + sizeof(struct arphdr); -+ total_len = arp_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - arp_data_len); -+ -+ arp = skb->data; -+ -+ memcpy(arp, dev->dev_addr, ETH_ALEN); -+ arp += ETH_ALEN; -+ -+ memcpy(arp, &source_ip, 4); -+ arp += 4; -+ -+ memcpy(arp, arp_sha, ETH_ALEN); -+ arp += ETH_ALEN; -+ -+ memcpy(arp, &arp_sip, 4); -+ arp += 4; -+ -+ skb->len += 2*4 + 2*ETH_ALEN; -+ -+ arph = (struct arphdr *)skb_push(skb, sizeof(*arph)); -+ -+ arph->ar_hrd = htons(dev->type); -+ arph->ar_pro = __constant_htons(ETH_P_IP); -+ arph->ar_hln = ETH_ALEN; -+ arph->ar_pln = 4; -+ arph->ar_op = __constant_htons(ARPOP_REPLY); -+ -+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -+ -+ eth->h_proto = htons(ETH_P_ARP); -+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); -+ memcpy(eth->h_dest, arp_sha, dev->addr_len); -+ -+ transmit_raw_skb(skb, dev); -+} -+ -+static void send_netdump_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) -+{ -+ int total_len, ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); -+ -+ skb->data[0] = NETCONSOLE_VERSION; -+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); -+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); -+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); -+ -+ memcpy(skb->data + HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, -+ source_port, netdump_target_port, source_ip, netdump_target_ip, netdump_daddr); -+} -+ -+#define SYSLOG_HEADER_LEN 4 -+ -+static void send_netlog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) -+{ -+ int total_len, 
ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); -+ -+ skb->data[0] = NETCONSOLE_VERSION; -+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); -+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); -+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); -+ -+ memcpy(skb->data + HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, -+ source_port, netlog_target_port, source_ip, netlog_target_ip, netlog_daddr); -+} -+ -+#define SYSLOG_HEADER_LEN 4 -+ -+static void send_syslog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, int pri) -+{ -+ int total_len, ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + SYSLOG_HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - SYSLOG_HEADER_LEN); -+ -+ skb->data[0] = '<'; -+ skb->data[1] = pri + '0'; -+ skb->data[2]= '>'; -+ skb->data[3]= ' '; -+ -+ memcpy(skb->data + SYSLOG_HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + SYSLOG_HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, source_port, -+ syslog_target_port, source_ip, syslog_target_ip, syslog_daddr); -+} -+ -+#define MAX_SYSLOG_CHARS 1000 -+ -+static spinlock_t syslog_lock = SPIN_LOCK_UNLOCKED; -+static int syslog_chars; -+static unsigned char syslog_line [MAX_SYSLOG_CHARS + 10]; -+ -+/* -+ * We feed kernel messages char by char, and send the UDP packet -+ * one linefeed. We buffer all characters received. 
-+ */ -+static inline void feed_syslog_char(struct net_device *dev, const unsigned char c) -+{ -+ if (syslog_chars == MAX_SYSLOG_CHARS) -+ syslog_chars--; -+ syslog_line[syslog_chars] = c; -+ syslog_chars++; -+ if (c == '\n') { -+ send_syslog_skb(dev, syslog_line, syslog_chars, 5); -+ syslog_chars = 0; -+ } -+} -+ -+static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED; -+static unsigned int log_offset; -+ -+static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len) -+{ -+ int len, left, i; -+ struct net_device *dev; -+ const char *msg = msg0; -+ reply_t reply; -+ -+ dev = netconsole_dev; -+ if (!dev || netdump_mode) -+ return; -+ -+ if (dev->poll_controller && netif_running(dev)) { -+ unsigned long flags; -+ -+ __save_flags(flags); -+ __cli(); -+ left = msg_len; -+ if (netlog_target_ip) { -+ while (left) { -+ if (left > MAX_PRINT_CHUNK) -+ len = MAX_PRINT_CHUNK; -+ else -+ len = left; -+ reply.code = REPLY_LOG; -+ reply.nr = 0; -+ spin_lock(&sequence_lock); -+ reply.info = log_offset; -+ log_offset += len; -+ spin_unlock(&sequence_lock); -+ send_netlog_skb(dev, msg, len, &reply); -+ msg += len; -+ left -= len; -+ } -+ } -+ if (syslog_target_ip) { -+ spin_lock(&syslog_lock); -+ for (i = 0; i < msg_len; i++) -+ feed_syslog_char(dev, msg0[i]); -+ spin_unlock(&syslog_lock); -+ } -+ -+ __restore_flags(flags); -+ } -+} -+ -+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) -+{ -+ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); -+} -+ -+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, -+ unsigned short ulen, u32 saddr, u32 daddr) -+{ -+ if (uh->check == 0) { -+ skb->ip_summed = CHECKSUM_UNNECESSARY; -+ } else if (skb->ip_summed == CHECKSUM_HW) { -+ skb->ip_summed = CHECKSUM_UNNECESSARY; -+ if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) -+ return 0; -+ skb->ip_summed = CHECKSUM_NONE; -+ } -+ if (skb->ip_summed != 
CHECKSUM_UNNECESSARY) -+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, -+0); -+ /* Probably, we should checksum udp header (it should be in cache -+ * in any case) and data in tiny packets (< rx copybreak). -+ */ -+ return 0; -+} -+ -+static __inline__ int __udp_checksum_complete(struct sk_buff *skb) -+{ -+ return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); -+} -+ -+static __inline__ int udp_checksum_complete(struct sk_buff *skb) -+{ -+ return skb->ip_summed != CHECKSUM_UNNECESSARY && -+ __udp_checksum_complete(skb); -+} -+ -+/* -+ * NOTE: security depends on the trusted path between the netconsole -+ * server and netconsole client, since none of the packets are -+ * encrypted. The random magic number protects the protocol -+ * against spoofing. -+ */ -+static u64 netconsole_magic; -+static u32 magic1, magic2; -+ -+static spinlock_t req_lock = SPIN_LOCK_UNLOCKED; -+static int nr_req = 0; -+static LIST_HEAD(request_list); -+ -+static void add_new_req(req_t *req) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&req_lock, flags); -+ list_add_tail(&req->list, &request_list); -+ nr_req++; -+ Dprintk("pending requests: %d.\n", nr_req); -+ spin_unlock_irqrestore(&req_lock, flags); -+ -+ rdtscll(t0); -+} -+ -+static req_t *get_new_req(void) -+{ -+ req_t *req = NULL; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&req_lock, flags); -+ if (nr_req) { -+ req = list_entry(request_list.next, req_t, list); -+ list_del(&req->list); -+ nr_req--; -+ } -+ spin_unlock_irqrestore(&req_lock, flags); -+ -+ return req; -+} -+ -+static req_t *alloc_req(void) -+{ -+ req_t *req; -+ -+ req = (req_t *) kmalloc(sizeof(*req), GFP_ATOMIC); -+ return req; -+} -+ -+static int netconsole_rx_hook(struct sk_buff *skb) -+{ -+ int proto; -+ struct iphdr *iph; -+ struct udphdr *uh; -+ __u32 len, saddr, daddr, ulen; -+ req_t *__req; -+ req_t *req; -+ struct net_device *dev; -+ -+ if (!netdump_mode) -+ return NET_RX_SUCCESS; -+#if DEBUG -+ { -+ static int 
packet_count; -+ Dprintk(" %d\r", ++packet_count); -+ } -+#endif -+ dev = skb->dev; -+ if (dev->type != ARPHRD_ETHER) -+ goto out; -+ proto = ntohs(skb->mac.ethernet->h_proto); -+ Dprintk("rx got skb %p (len: %d, users: %d), dev %s, h_proto: %04x.\n", skb, skb->len, atomic_read(&skb->users), dev->name, proto); -+ #define D(x) skb->mac.ethernet->h_dest[x] -+ Dprintk("... h_dest: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) skb->mac.ethernet->h_source[x] -+ Dprintk("... h_source: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ if (skb->pkt_type == PACKET_OTHERHOST) -+ goto out; -+ if (skb_shared(skb)) -+ goto out; -+ if (proto == ETH_P_ARP) { -+ struct arphdr *arp; -+ unsigned char *arp_ptr; -+ -+ Dprintk("got arp skb.\n"); -+ arp = (struct arphdr *)skb->data; -+ if (!pskb_may_pull(skb, sizeof(struct arphdr) + 2*4 + 2*ETH_ALEN)) -+ goto out; -+ if (htons(dev->type) != arp->ar_hrd) -+ goto out; -+ if (arp->ar_pro != __constant_htons(ETH_P_IP)) -+ goto out; -+ if (arp->ar_hln != ETH_ALEN) -+ goto out; -+ if (arp->ar_pln != 4) -+ goto out; -+ if (arp->ar_op != __constant_htons(ARPOP_REQUEST)) -+ goto out; -+ /* -+ * ARP header looks ok so far, extract fields: -+ */ -+ arp_ptr = (unsigned char *)(arp + 1); -+ -+ memcpy(arp_sha, arp_ptr, ETH_ALEN); -+ arp_ptr += ETH_ALEN; -+ -+ memcpy(&arp_sip, arp_ptr, 4); -+ arp_ptr += 4; -+ -+ memcpy(arp_tha, arp_ptr, ETH_ALEN); -+ arp_ptr += ETH_ALEN; -+ -+ memcpy(&arp_tip, arp_ptr, 4); -+ -+ #define D(x) arp_sha[x] -+ Dprintk("... arp_sha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) ((unsigned char *)&arp_sip)[x] -+ Dprintk("... arp_sip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ #define D(x) arp_tha[x] -+ Dprintk("... arp_tha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) ((unsigned char *)&arp_tip)[x] -+ Dprintk("... 
arp_tip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ #define D(x) ((unsigned char *)&source_ip)[x] -+ Dprintk("... (source_ip): %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ -+ if (LOOPBACK(arp_tip) || MULTICAST(arp_tip)) -+ goto out; -+ -+ if (arp_tip != source_ip) -+ goto out; -+ new_arp = 1; -+ goto out; -+ } -+ if (proto != ETH_P_IP) -+ goto out; -+ /* -+ * IP header correctness testing: -+ */ -+ iph = (struct iphdr *)skb->data; -+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) -+ goto out; -+ Dprintk("... IP ihl*4: %d, version: %d.\n", iph->ihl*4, iph->version); -+ if (iph->ihl < 5 || iph->version != 4) -+ goto out; -+ if (!pskb_may_pull(skb, iph->ihl*4)) -+ goto out; -+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) -+ goto out; -+ len = ntohs(iph->tot_len); -+ Dprintk("... IP len: %d.\n", len); -+ if (skb->len < len || len < iph->ihl*4) -+ goto out; -+ saddr = iph->saddr; -+ daddr = iph->daddr; -+ Dprintk("... IP src: %08x, dst: %08x.\n", saddr, daddr); -+ Dprintk("... IP protocol: %d.\n", iph->protocol); -+ if (iph->protocol != IPPROTO_UDP) -+ goto out; -+ Dprintk("... netdump src: %08x, dst: %08x.\n", source_ip, netlog_target_ip); -+ if (source_ip != daddr) -+ goto out; -+ if (netlog_target_ip != saddr) -+ goto out; -+ len -= iph->ihl*4; -+ uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); -+ ulen = ntohs(uh->len); -+ Dprintk("... UDP len: %d (left %d).\n", ulen, len); -+ -+#define MIN_COMM_SIZE (sizeof(*uh) + NETDUMP_REQ_SIZE) -+ if (ulen != len || ulen < MIN_COMM_SIZE) { -+ Dprintk("... UDP, hm, len not ok.\n"); -+ goto out; -+ } -+ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) { -+ Dprintk("... UDP, hm, checksum init not ok.\n"); -+ goto out; -+ } -+ if (udp_checksum_complete(skb)) { -+ Dprintk("... UDP, hm, checksum complete not ok.\n"); -+ goto out; -+ } -+ Dprintk("... UDP packet OK!\n"); -+ Dprintk("... 
UDP src port: %d, dst port: %d.\n", uh->source, uh->dest); -+ if (source_port != uh->source) -+ goto out; -+ if (netlog_target_port != uh->dest) -+ goto out; -+ __req = (req_t *)(uh + 1); -+ Dprintk("... UDP netdump packet OK!\n"); -+ -+ req = alloc_req(); -+ if (!req) { -+ printk("no more RAM to allocate request - dropping it.\n"); -+ goto out; -+ } -+ -+ req->magic = ntohl(__req->magic); -+ req->command = ntohl(__req->command); -+ req->from = ntohl(__req->from); -+ req->to = ntohl(__req->to); -+ req->nr = ntohl(__req->nr); -+ -+ Dprintk("... netdump magic: %08Lx.\n", req->magic); -+ Dprintk("... netdump command: %08x.\n", req->command); -+ Dprintk("... netdump from: %08x.\n", req->from); -+ Dprintk("... netdump to: %08x.\n", req->to); -+ -+ add_new_req(req); -+out: -+ return NET_RX_DROP; -+} -+ -+#define INVALID_PAGE "page is not valid!\n" -+ -+static void send_netdump_mem (struct net_device *dev, req_t *req) -+{ -+ int i; -+ char *kaddr; -+ char str[1024]; -+ struct page *page; -+ unsigned long nr = req->from; -+ int nr_chunks = PAGE_SIZE/1024; -+ reply_t reply; -+ -+ reply.nr = req->nr; -+ reply.info = 0; -+ if (req->from >= max_mapnr) { -+ sprintf(str, "page %08lx is bigger than max page # %08lx!\n", nr, max_mapnr); -+ reply.code = REPLY_ERROR; -+ send_netdump_skb(dev, str, strlen(str), &reply); -+ return; -+ } -+ page = mem_map + nr; -+ if (PageReserved(page)) -+ page = ZERO_PAGE(0); -+ -+ kaddr = (char *)kmap_atomic(page, KM_NETDUMP); -+ -+ for (i = 0; i < nr_chunks; i++) { -+ unsigned int offset = i*1024; -+ reply.code = REPLY_MEM; -+ reply.info = offset; -+ send_netdump_skb(dev, kaddr + offset, 1024, &reply); -+ } -+ -+ kunmap_atomic(kaddr, KM_NETDUMP); -+} -+ -+/* -+ * This function waits for the client to acknowledge the receipt -+ * of the netdump startup reply, with the possibility of packets -+ * getting lost. We resend the startup packet if no ACK is received, -+ * after a 1 second delay. 
-+ * -+ * (The client can test the success of the handshake via the HELLO -+ * command, and send ACKs until we enter netdump mode.) -+ */ -+static void netdump_startup_handshake(struct net_device *dev) -+{ -+ char tmp[200]; -+ reply_t reply; -+ req_t *req = NULL; -+ int i; -+ -+ netdump_mode = 1; -+ -+repeat: -+ sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); -+ reply.code = REPLY_START_NETDUMP; -+ reply.nr = 0; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ -+ for (i = 0; i < 10000; i++) { -+ // wait 1 sec. -+ udelay(100); -+ Dprintk("handshake: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+ req = get_new_req(); -+ if (req) -+ break; -+ } -+ if (!req) -+ goto repeat; -+ if (req->command != COMM_START_NETDUMP_ACK) { -+ kfree(req); -+ goto repeat; -+ } -+ kfree(req); -+ -+ printk("NETDUMP START!\n"); -+} -+ -+#if 0 -+ -+static inline void print_status (req_t *req) -+{ -+ static int count = 0; -+ -+ switch (++count & 3) { -+ case 0: printk("/\r"); break; -+ case 1: printk("|\r"); break; -+ case 2: printk("\\\r"); break; -+ case 3: printk("-\r"); break; -+ } -+} -+ -+#else -+ -+static inline void print_status (req_t *req) -+{ -+ static int count = 0; -+ static int prev_jiffies = 0; -+ -+ if (jiffies/HZ != prev_jiffies/HZ) { -+ prev_jiffies = jiffies; -+ count++; -+ switch (count & 3) { -+ case 0: printk("%d(%ld)/\r", nr_req, jiffies); break; -+ case 1: printk("%d(%ld)|\r", nr_req, jiffies); break; -+ case 2: printk("%d(%ld)\\\r", nr_req, jiffies); break; -+ case 3: printk("%d(%ld)-\r", nr_req, jiffies); break; -+ } -+ } -+} -+ -+#endif -+ -+#define CLI 1 -+ -+#if CONFIG_SMP -+static void freeze_cpu (void * dummy) -+{ -+ printk("CPU#%d is frozen.\n", smp_processor_id()); -+#if CLI -+ for (;;) __cli(); -+#else -+ for (;;) __sti(); -+#endif -+} -+#endif -+ -+static void netconsole_netdump (struct pt_regs *regs) -+{ -+ reply_t reply; -+ char tmp[200]; -+ unsigned long flags; -+ struct net_device 
*dev = netconsole_dev; -+ unsigned long esp; -+ unsigned short ss; -+ struct pt_regs myregs; -+ req_t *req; -+ -+ __save_flags(flags); -+ __cli(); -+#if CONFIG_X86_LOCAL_APIC -+ nmi_watchdog = 0; -+#endif -+#if CONFIG_SMP -+ smp_call_function(freeze_cpu, NULL, 1, 0); -+#endif -+ mdelay(1000); -+ /* -+ * Just in case we are crashing within the networking code -+ * ... attempt to fix up. -+ */ -+ spin_lock_init(&dev->xmit_lock); -+ -+ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs)); -+ ss = __KERNEL_DS; -+ if (regs->xcs & 3) { -+ esp = regs->esp; -+ ss = regs->xss & 0xffff; -+ } -+ myregs = *regs; -+ myregs.esp = esp; -+ myregs.xss = (myregs.xss & 0xffff0000) | ss; -+ -+ rdtscll(t0); -+ -+ printk("< netdump activated - performing handshake with the client. >\n"); -+ netdump_startup_handshake(dev); -+ -+ printk("< handshake completed - listening for dump requests. >\n"); -+ -+ while (netdump_mode) { -+ __cli(); -+ Dprintk("main netdump loop: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+#if !CLI -+ __sti(); -+#endif -+ req = get_new_req(); -+ if (!req) -+ continue; -+ Dprintk("got new req, command %d.\n", req->command); -+ print_status(req); -+ switch (req->command) { -+ case COMM_NONE: -+ Dprintk("got NO command.\n"); -+ break; -+ -+ case COMM_SEND_MEM: -+ Dprintk("got MEM command.\n"); -+ // send ->from ->to. 
-+ send_netdump_mem(dev, req); -+ break; -+ -+ case COMM_EXIT: -+ Dprintk("got EXIT command.\n"); -+ netdump_mode = 0; -+ break; -+ -+ case COMM_REBOOT: -+ Dprintk("got REBOOT command.\n"); -+ printk("netdump: rebooting in 3 seconds.\n"); -+ mdelay(3000); -+ machine_restart(NULL); -+ break; -+ -+ case COMM_HELLO: -+ sprintf(tmp, "Hello, this is netdump version 0.%02d\n", NETCONSOLE_VERSION); -+ reply.code = REPLY_HELLO; -+ reply.nr = req->nr; -+ reply.info = NETCONSOLE_VERSION; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_GET_PAGE_SIZE: -+ sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); -+ reply.code = REPLY_PAGE_SIZE; -+ reply.nr = req->nr; -+ reply.info = PAGE_SIZE; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_GET_REGS: -+ { -+ char *tmp2 = tmp; -+ elf_gregset_t elf_regs; -+ -+ reply.code = REPLY_REGS; -+ reply.nr = req->nr; -+ reply.info = max_mapnr; -+ tmp2 = tmp + sprintf(tmp, "Sending register info.\n"); -+ ELF_CORE_COPY_REGS(elf_regs, regs); -+ memcpy(tmp2, &elf_regs, sizeof(elf_regs)); -+ send_netdump_skb(dev, tmp, strlen(tmp) + sizeof(elf_regs), &reply); -+ break; -+ } -+ -+ case COMM_GET_NR_PAGES: -+ reply.code = REPLY_NR_PAGES; -+ reply.nr = req->nr; -+ reply.info = max_mapnr; -+ sprintf(tmp, "Number of pages: %ld\n", max_mapnr); -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_SHOW_STATE: -+ netdump_mode = 0; -+ if (regs) -+ show_regs(regs); -+ show_state(); -+ show_mem(); -+ netdump_mode = 1; -+ reply.code = REPLY_SHOW_STATE; -+ reply.nr = req->nr; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ default: -+ reply.code = REPLY_ERROR; -+ reply.nr = req->nr; -+ reply.info = req->command; -+ Dprintk("got UNKNOWN command!\n"); -+ sprintf(tmp, "Got unknown command code %d!\n", req->command); -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ } -+ kfree(req); -+ req = NULL; -+ } -+ sprintf(tmp, "NETDUMP end.\n"); -+ 
reply.code = REPLY_END_NETDUMP; -+ reply.nr = 0; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ printk("NETDUMP END!\n"); -+ __restore_flags(flags); -+} -+ -+static char *dev; -+static int netdump_target_eth_byte0 = 255; -+static int netdump_target_eth_byte1 = 255; -+static int netdump_target_eth_byte2 = 255; -+static int netdump_target_eth_byte3 = 255; -+static int netdump_target_eth_byte4 = 255; -+static int netdump_target_eth_byte5 = 255; -+ -+static int netlog_target_eth_byte0 = 255; -+static int netlog_target_eth_byte1 = 255; -+static int netlog_target_eth_byte2 = 255; -+static int netlog_target_eth_byte3 = 255; -+static int netlog_target_eth_byte4 = 255; -+static int netlog_target_eth_byte5 = 255; -+ -+static int syslog_target_eth_byte0 = 255; -+static int syslog_target_eth_byte1 = 255; -+static int syslog_target_eth_byte2 = 255; -+static int syslog_target_eth_byte3 = 255; -+static int syslog_target_eth_byte4 = 255; -+static int syslog_target_eth_byte5 = 255; -+ -+MODULE_PARM(netdump_target_ip, "i"); -+MODULE_PARM_DESC(netdump_target_ip, -+ "remote netdump IP address as a native (not network) endian integer"); -+MODULE_PARM(netlog_target_ip, "i"); -+MODULE_PARM_DESC(netlog_target_ip, -+ "remote netlog IP address as a native (not network) endian integer"); -+MODULE_PARM(syslog_target_ip, "i"); -+MODULE_PARM_DESC(syslog_target_ip, -+ "remote syslog IP address as a native (not network) endian integer"); -+ -+MODULE_PARM(source_port, "h"); -+MODULE_PARM_DESC(source_port, -+ "local port from which to send netdump packets"); -+ -+MODULE_PARM(netdump_target_port, "h"); -+MODULE_PARM_DESC(netdump_target_port, -+ "remote port to which to send netdump packets"); -+MODULE_PARM(netlog_target_port, "h"); -+MODULE_PARM_DESC(netlog_target_port, -+ "remote port to which to send netlog packets"); -+MODULE_PARM(syslog_target_port, "h"); -+MODULE_PARM_DESC(syslog_target_port, -+ "remote port to which to send syslog packets"); -+ -+#define 
ETH_BYTE(name,nr) \ -+ MODULE_PARM(name##_target_eth_byte##nr, "i"); \ -+ MODULE_PARM_DESC(name##_target_eth_byte##nr, \ -+ "byte "#nr" of the netdump server MAC address") -+ -+#define ETH_BYTES(name) \ -+ ETH_BYTE(name, 0); ETH_BYTE(name, 1); ETH_BYTE(name, 2); \ -+ ETH_BYTE(name, 3); ETH_BYTE(name, 4); ETH_BYTE(name, 5); -+ -+ETH_BYTES(netdump); -+ETH_BYTES(netlog); -+ETH_BYTES(syslog); -+ -+MODULE_PARM(magic1, "i"); -+MODULE_PARM_DESC(magic1, -+ "lower 32 bits of magic cookie shared between client and server"); -+MODULE_PARM(magic2, "i"); -+MODULE_PARM_DESC(magic2, -+ "upper 32 bits of magic cookie shared between client and server"); -+MODULE_PARM(dev, "s"); -+MODULE_PARM_DESC(dev, -+ "name of the device from which to send netdump and syslog packets"); -+MODULE_PARM(mhz, "i"); -+MODULE_PARM_DESC(mhz, -+ "one second wall clock time takes this many million CPU cycles"); -+MODULE_PARM(idle_timeout, "i"); -+MODULE_PARM_DESC(idle_timeout, -+ "reboot system after this many idle seconds"); -+ -+static struct console netconsole = -+ { flags: CON_ENABLED, write: write_netconsole_msg }; -+ -+static int init_netconsole(void) -+{ -+ struct net_device *ndev = NULL; -+ struct in_device *in_dev; -+ -+ printk(KERN_INFO "netlog: using network device <%s>\n", dev); -+ // this will be valid once the device goes up. 
-+ if (dev) -+ ndev = dev_get_by_name(dev); -+ if (!ndev) { -+ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev); -+ return -1; -+ } -+ if (!ndev->poll_controller) { -+ printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev); -+ return -1; -+ } -+ in_dev = in_dev_get(ndev); -+ if (!in_dev) { -+ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev); -+ return -1; -+ } -+ -+ if (!magic1 || !magic2) { -+ printk(KERN_ERR "netlog: magic cookie (magic1,magic2) not specified.\n"); -+ return -1; -+ } -+ netconsole_magic = magic1 + (((u64)magic2)<<32); -+ -+ source_ip = ntohl(in_dev->ifa_list->ifa_local); -+ if (!source_ip) { -+ printk(KERN_ERR "netlog: network device %s has no local address, aborting.\n", dev); -+ return -1; -+ } -+#define IP(x) ((unsigned char *)&source_ip)[x] -+ printk(KERN_INFO "netlog: using source IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ source_ip = htonl(source_ip); -+ if (!source_port) { -+ printk(KERN_ERR "netlog: source_port parameter not specified, aborting.\n"); -+ return -1; -+ } -+ printk(KERN_INFO "netlog: using source UDP port: %u\n", source_port); -+ source_port = htons(source_port); -+ -+ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) { -+ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n"); -+ return -1; -+ } -+ if (netdump_target_ip) { -+#define IP(x) ((unsigned char *)&netdump_target_ip)[x] -+ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ netdump_target_ip = htonl(netdump_target_ip); -+ } -+ if (netlog_target_ip) { -+#define IP(x) ((unsigned char *)&netlog_target_ip)[x] -+ printk(KERN_INFO "netlog: using netlog target IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ netlog_target_ip = htonl(netlog_target_ip); -+ } -+ if (syslog_target_ip) { -+ if (!syslog_target_port) -+ 
syslog_target_port = 514; -+#define IP(x) ((unsigned char *)&syslog_target_ip)[x] -+ printk("netlog: using syslog target IP %u.%u.%u.%u, port: %d\n", IP(3), IP(2), IP(1), IP(0), syslog_target_port); -+#undef IP -+ syslog_target_ip = htonl(syslog_target_ip); -+ syslog_target_port = htons(syslog_target_port); -+ } -+ if (!netdump_target_port && !netlog_target_port && !syslog_target_port) { -+ printk(KERN_ERR "netlog: target_port parameter not specified, aborting.\n"); -+ return -1; -+ } -+ if (netdump_target_port) { -+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netdump_target_port); -+ netdump_target_port = htons(netdump_target_port); -+ } -+ if (netlog_target_port) { -+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netlog_target_port); -+ netlog_target_port = htons(netlog_target_port); -+ } -+ -+ netdump_daddr[0] = netdump_target_eth_byte0; -+ netdump_daddr[1] = netdump_target_eth_byte1; -+ netdump_daddr[2] = netdump_target_eth_byte2; -+ netdump_daddr[3] = netdump_target_eth_byte3; -+ netdump_daddr[4] = netdump_target_eth_byte4; -+ netdump_daddr[5] = netdump_target_eth_byte5; -+ -+ if ((netdump_daddr[0] & netdump_daddr[1] & netdump_daddr[2] & netdump_daddr[3] & netdump_daddr[4] & netdump_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ netdump_daddr[0], netdump_daddr[1], netdump_daddr[2], netdump_daddr[3], netdump_daddr[4], netdump_daddr[5]); -+ -+ netlog_daddr[0] = netlog_target_eth_byte0; -+ netlog_daddr[1] = netlog_target_eth_byte1; -+ netlog_daddr[2] = netlog_target_eth_byte2; -+ netlog_daddr[3] = netlog_target_eth_byte3; -+ netlog_daddr[4] = netlog_target_eth_byte4; -+ netlog_daddr[5] = netlog_target_eth_byte5; -+ -+ if ((netlog_daddr[0] & netlog_daddr[1] & netlog_daddr[2] & netlog_daddr[3] & netlog_daddr[4] & netlog_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using 
broadcast ethernet frames to send netdump packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ netlog_daddr[0], netlog_daddr[1], netlog_daddr[2], netlog_daddr[3], netlog_daddr[4], netlog_daddr[5]); -+ syslog_daddr[0] = syslog_target_eth_byte0; -+ syslog_daddr[1] = syslog_target_eth_byte1; -+ syslog_daddr[2] = syslog_target_eth_byte2; -+ syslog_daddr[3] = syslog_target_eth_byte3; -+ syslog_daddr[4] = syslog_target_eth_byte4; -+ syslog_daddr[5] = syslog_target_eth_byte5; -+ -+ if ((syslog_daddr[0] & syslog_daddr[1] & syslog_daddr[2] & syslog_daddr[3] & syslog_daddr[4] & syslog_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send syslog packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using syslog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ syslog_daddr[0], syslog_daddr[1], syslog_daddr[2], syslog_daddr[3], syslog_daddr[4], syslog_daddr[5]); -+ -+ mhz_cycles = (unsigned long long)mhz * 1000000ULL; -+ jiffy_cycles = (unsigned long long)mhz * (1000000/HZ); -+ -+ INIT_LIST_HEAD(&request_list); -+ -+ ndev->rx_hook = netconsole_rx_hook; -+ netdump_func = netconsole_netdump; -+ netconsole_dev = ndev; -+#define STARTUP_MSG "[...network console startup...]\n" -+ write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG)); -+ -+ register_console(&netconsole); -+ printk(KERN_INFO "netlog: network logging started up successfully!\n"); -+ return 0; -+} -+ -+static void cleanup_netconsole(void) -+{ -+ printk(KERN_INFO "netlog: network logging shut down.\n"); -+ unregister_console(&netconsole); -+ -+#define SHUTDOWN_MSG "[...network console shutdown...]\n" -+ write_netconsole_msg(NULL, SHUTDOWN_MSG, strlen(SHUTDOWN_MSG)); -+ netconsole_dev->rx_hook = NULL; -+ netconsole_dev = NULL; -+} -+ -+module_init(init_netconsole); -+module_exit(cleanup_netconsole); -+ -+MODULE_LICENSE("GPL"); -+ -Index: linux-2.4.24/drivers/net/netconsole.h 
-=================================================================== ---- linux-2.4.24.orig/drivers/net/netconsole.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.4.24/drivers/net/netconsole.h 2004-05-07 16:58:39.000000000 -0400 -@@ -0,0 +1,81 @@ -+/* -+ * linux/drivers/net/netconsole.h -+ * -+ * Copyright (C) 2001 Ingo Molnar -+ * -+ * This file contains the implementation of an IRQ-safe, crash-safe -+ * kernel console implementation that outputs kernel messages to the -+ * network. -+ * -+ * Modification history: -+ * -+ * 2001-09-17 started by Ingo Molnar. -+ */ -+ -+/**************************************************************** -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2, or (at your option) -+ * any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-+ * -+ ****************************************************************/ -+ -+#define NETCONSOLE_VERSION 0x04 -+ -+enum netdump_commands { -+ COMM_NONE = 0, -+ COMM_SEND_MEM = 1, -+ COMM_EXIT = 2, -+ COMM_REBOOT = 3, -+ COMM_HELLO = 4, -+ COMM_GET_NR_PAGES = 5, -+ COMM_GET_PAGE_SIZE = 6, -+ COMM_START_NETDUMP_ACK = 7, -+ COMM_GET_REGS = 8, -+ COMM_SHOW_STATE = 9, -+}; -+ -+#define NETDUMP_REQ_SIZE (8+4*4) -+ -+typedef struct netdump_req_s { -+ u64 magic; -+ u32 nr; -+ u32 command; -+ u32 from; -+ u32 to; -+ struct list_head list; -+} req_t; -+ -+enum netdump_replies { -+ REPLY_NONE = 0, -+ REPLY_ERROR = 1, -+ REPLY_LOG = 2, -+ REPLY_MEM = 3, -+ REPLY_RESERVED = 4, -+ REPLY_HELLO = 5, -+ REPLY_NR_PAGES = 6, -+ REPLY_PAGE_SIZE = 7, -+ REPLY_START_NETDUMP = 8, -+ REPLY_END_NETDUMP = 9, -+ REPLY_REGS = 10, -+ REPLY_MAGIC = 11, -+ REPLY_SHOW_STATE = 12, -+}; -+ -+typedef struct netdump_reply_s { -+ u32 nr; -+ u32 code; -+ u32 info; -+} reply_t; -+ -+#define HEADER_LEN (1 + sizeof(reply_t)) -+ -Index: linux-2.4.24/drivers/net/tlan.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tlan.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tlan.c 2004-05-07 16:58:39.000000000 -0400 -@@ -345,6 +345,8 @@ - static void TLan_EeReceiveByte( u16, u8 *, int ); - static int TLan_EeReadByte( struct net_device *, u8, u8 * ); - -+static void TLan_Poll(struct net_device *); -+ - - static void - TLan_StoreSKB( struct tlan_list_tag *tag, struct sk_buff *skb) -@@ -891,6 +893,9 @@ - dev->get_stats = &TLan_GetStats; - dev->set_multicast_list = &TLan_SetMulticastList; - dev->do_ioctl = &TLan_ioctl; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &TLan_Poll; -+#endif - dev->tx_timeout = &TLan_tx_timeout; - dev->watchdog_timeo = TX_TIMEOUT; - -@@ -1176,7 +1181,14 @@ - - } /* TLan_HandleInterrupts */ - -- -+#ifdef HAVE_POLL_CONTROLLER -+static void TLan_Poll(struct net_device *dev) -+{ -+ if (!netdump_mode) 
disable_irq(dev->irq); -+ TLan_HandleInterrupt(dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+#endif - - - /*************************************************************** -Index: linux-2.4.24/drivers/net/tulip/tulip_core.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tulip/tulip_core.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tulip/tulip_core.c 2004-05-07 16:58:39.000000000 -0400 -@@ -266,6 +266,7 @@ - static struct net_device_stats *tulip_get_stats(struct net_device *dev); - static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); - static void set_rx_mode(struct net_device *dev); -+static void poll_tulip(struct net_device *dev); - - - -@@ -1728,6 +1729,9 @@ - dev->get_stats = tulip_get_stats; - dev->do_ioctl = private_ioctl; - dev->set_multicast_list = set_rx_mode; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_tulip; -+#endif - - if (register_netdev(dev)) - goto err_out_free_ring; -@@ -1902,6 +1906,24 @@ - } - - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_tulip (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ tulip_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - static struct pci_driver tulip_driver = { - name: DRV_NAME, - id_table: tulip_pci_tbl, -Index: linux-2.4.24/drivers/net/e100/e100_main.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/e100/e100_main.c 2004-05-07 16:58:39.000000000 -0400 -+++ linux-2.4.24/drivers/net/e100/e100_main.c 2004-05-07 17:00:21.000000000 -0400 -@@ -664,6 +664,10 @@ - goto err_unregister_netdev; - } - -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = e100_netpoll; -+#endif -+ - e100nics++; - - e100_get_speed_duplex_caps(bdp); -Index: linux-2.4.24/drivers/net/e1000/e1000_main.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/e1000/e1000_main.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/e1000/e1000_main.c 2004-05-07 16:58:39.000000000 -0400 -@@ -182,6 +182,9 @@ - static int e1000_resume(struct pci_dev *pdev); - #endif - -+/* for netdump / net console */ -+static void e1000_netpoll (struct net_device *dev); -+ - struct notifier_block e1000_notifier_reboot = { - .notifier_call = e1000_notify_reboot, - .next = NULL, -@@ -434,6 +437,10 @@ - netdev->vlan_rx_add_vid = e1000_vlan_rx_add_vid; - netdev->vlan_rx_kill_vid = e1000_vlan_rx_kill_vid; - -+#ifdef HAVE_POLL_CONTROLLER -+ netdev->poll_controller = e1000_netpoll; -+#endif -+ - netdev->irq = pdev->irq; - netdev->mem_start = mmio_start; - netdev->mem_end = mmio_start + mmio_len; -@@ -2899,4 +2906,20 @@ - } - #endif - -+#ifdef HAVE_POLL_CONTROLLER -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void e1000_netpoll (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ e1000_intr (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ - /* e1000_main.c */ -Index: linux-2.4.24/drivers/net/tg3.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tg3.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tg3.c 2004-05-07 16:58:39.000000000 -0400 -@@ -216,6 +216,9 @@ - #define tr16(reg) readw(tp->regs + (reg)) - #define tr8(reg) readb(tp->regs + (reg)) - -+/* Added by mark.fasheh@oracle.com to help enable netdump on these cards */ -+static void poll_tg3 (struct net_device *dev); -+ - static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) - { - unsigned long flags; -@@ -7630,6 +7633,9 @@ - dev->watchdog_timeo = TG3_TX_TIMEOUT; - dev->change_mtu = tg3_change_mtu; - dev->irq = pdev->irq; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_tg3; -+#endif - - err = tg3_get_invariants(tp); - if (err) { -@@ -7862,5 +7868,23 @@ - pci_unregister_driver(&tg3_driver); - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_tg3 (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ tg3_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - module_init(tg3_init); - module_exit(tg3_cleanup); -Index: linux-2.4.24/include/asm-i386/kmap_types.h -=================================================================== ---- linux-2.4.24.orig/include/asm-i386/kmap_types.h 2003-08-25 07:44:43.000000000 -0400 -+++ linux-2.4.24/include/asm-i386/kmap_types.h 2004-05-07 16:59:12.000000000 -0400 -@@ -10,6 +10,7 @@ - KM_BH_IRQ, - KM_SOFTIRQ0, - KM_SOFTIRQ1, -+ KM_NETDUMP, - KM_TYPE_NR - }; - -Index: linux-2.4.24/include/linux/kernel.h -=================================================================== ---- linux-2.4.24.orig/include/linux/kernel.h 2004-05-07 16:56:55.000000000 -0400 -+++ linux-2.4.24/include/linux/kernel.h 2004-05-07 16:58:39.000000000 -0400 -@@ -104,6 +104,9 @@ - - extern void bust_spinlocks(int yes); - extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ -+struct pt_regs; -+extern void (*netdump_func) (struct pt_regs *regs); -+extern int netdump_mode; - - extern int tainted; - extern const char *print_tainted(void); -Index: linux-2.4.24/include/linux/netdevice.h -=================================================================== ---- linux-2.4.24.orig/include/linux/netdevice.h 2003-11-28 13:26:21.000000000 -0500 -+++ linux-2.4.24/include/linux/netdevice.h 2004-05-07 16:58:39.000000000 -0400 -@@ -435,6 +435,9 @@ - unsigned char *haddr); - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); - int (*accept_fastpath)(struct net_device *, struct dst_entry*); -+#define HAVE_POLL_CONTROLLER -+ void (*poll_controller)(struct net_device *dev); -+ int (*rx_hook)(struct sk_buff *skb); - - /* open/release and usage marking */ - struct module *owner; -Index: linux-2.4.24/kernel/panic.c 
-=================================================================== ---- linux-2.4.24.orig/kernel/panic.c 2004-05-07 16:56:56.000000000 -0400 -+++ linux-2.4.24/kernel/panic.c 2004-05-07 16:58:39.000000000 -0400 -@@ -62,6 +62,8 @@ - vsprintf(buf, fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic: %s\n",buf); -+ if (netdump_func) -+ BUG(); - if (in_interrupt()) - printk(KERN_EMERG "In interrupt handler - not syncing\n"); - else if (!current->pid) -Index: linux-2.4.24/net/core/dev.c -=================================================================== ---- linux-2.4.24.orig/net/core/dev.c 2003-11-28 13:26:21.000000000 -0500 -+++ linux-2.4.24/net/core/dev.c 2004-05-07 16:58:39.000000000 -0400 -@@ -1288,6 +1288,13 @@ - - local_irq_save(flags); - -+ if (unlikely(skb->dev->rx_hook != NULL)) { -+ int ret; -+ -+ ret = skb->dev->rx_hook(skb); -+ if (ret == NET_RX_DROP) -+ goto drop; -+ } - netdev_rx_stat[this_cpu].total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { diff --git a/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch deleted file mode 100644 index a6a7e12..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch +++ /dev/null @@ -1,5242 +0,0 @@ - Documentation/Configure.help | 66 ++ - arch/alpha/defconfig | 7 - arch/alpha/kernel/entry.S | 12 - arch/arm/defconfig | 7 - arch/arm/kernel/calls.S | 24 - arch/i386/defconfig | 7 - arch/ia64/defconfig | 7 - arch/ia64/kernel/entry.S | 24 - arch/m68k/defconfig | 7 - arch/mips/defconfig | 7 - arch/mips64/defconfig | 7 - arch/ppc/defconfig | 14 - arch/ppc64/kernel/misc.S | 2 - arch/s390/defconfig | 7 - arch/s390/kernel/entry.S | 24 - arch/s390x/defconfig | 7 - arch/s390x/kernel/entry.S | 24 - arch/s390x/kernel/wrapper32.S | 92 +++ - arch/sparc/defconfig | 7 - arch/sparc/kernel/systbls.S | 10 - arch/sparc64/defconfig | 7 - arch/sparc64/kernel/systbls.S | 20 - 
fs/Config.in | 14 - fs/Makefile | 3 - fs/ext2/Makefile | 4 - fs/ext2/file.c | 5 - fs/ext2/ialloc.c | 2 - fs/ext2/inode.c | 34 - - fs/ext2/namei.c | 14 - fs/ext2/super.c | 29 - fs/ext2/symlink.c | 14 - fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ - fs/ext2/xattr_user.c | 103 +++ - fs/ext3/Makefile | 10 - fs/ext3/file.c | 5 - fs/ext3/ialloc.c | 2 - fs/ext3/inode.c | 35 - - fs/ext3/namei.c | 21 - fs/ext3/super.c | 36 + - fs/ext3/symlink.c | 14 - fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/xattr_user.c | 111 +++ - fs/jfs/jfs_xattr.h | 6 - fs/jfs/xattr.c | 6 - fs/mbcache.c | 648 ++++++++++++++++++++++ - include/asm-arm/unistd.h | 2 - include/asm-ia64/unistd.h | 13 - include/asm-ppc64/unistd.h | 2 - include/asm-s390/unistd.h | 15 - include/asm-s390x/unistd.h | 15 - include/asm-sparc/unistd.h | 24 - include/asm-sparc64/unistd.h | 24 - include/linux/cache_def.h | 15 - include/linux/errno.h | 4 - include/linux/ext2_fs.h | 31 - - include/linux/ext2_xattr.h | 157 +++++ - include/linux/ext3_fs.h | 31 - - include/linux/ext3_jbd.h | 8 - include/linux/ext3_xattr.h | 157 +++++ - include/linux/fs.h | 2 - include/linux/mbcache.h | 69 ++ - kernel/ksyms.c | 4 - mm/vmscan.c | 35 + - fs/ext3/ext3-exports.c | 14 + - 64 files changed, 4355 insertions(+), 195 deletions(-) - -Index: linux-DRV401/arch/ppc/defconfig -=================================================================== ---- linux-DRV401.orig/arch/ppc/defconfig 2004-10-15 10:24:32.000000000 -0700 -+++ linux-DRV401/arch/ppc/defconfig 2004-10-15 11:03:51.000000000 -0700 -@@ -1,6 +1,13 @@ - # - # Automatically generated by make menuconfig: don't edit - # -+CONFIG_EXT3_FS_XATTR=y -+# CONFIG_EXT3_FS_XATTR_SHARING is not set -+# CONFIG_EXT3_FS_XATTR_USER is not set -+# CONFIG_EXT2_FS_XATTR is not set -+# CONFIG_EXT2_FS_XATTR_SHARING is not set -+# CONFIG_EXT2_FS_XATTR_USER is not set -+# CONFIG_FS_MBCACHE is not set - # CONFIG_UID16 is not set - # CONFIG_RWSEM_GENERIC_SPINLOCK is not 
set - CONFIG_RWSEM_XCHGADD_ALGORITHM=y -Index: linux-DRV401/fs/Config.in -=================================================================== ---- linux-DRV401.orig/fs/Config.in 2004-10-15 10:24:06.000000000 -0700 -+++ linux-DRV401/fs/Config.in 2004-10-15 11:03:51.000000000 -0700 -@@ -22,6 +22,11 @@ - dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL - - tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS -+dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS -+dep_bool ' Ext3 extended attribute block sharing' \ -+ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR -+dep_bool ' Ext3 extended user attributes' \ -+ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR - # CONFIG_JBD could be its own option (even modular), but until there are - # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS - # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS -@@ -77,6 +82,11 @@ - tristate 'ROM file system support' CONFIG_ROMFS_FS - - tristate 'Second extended fs support' CONFIG_EXT2_FS -+dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS -+dep_bool ' Ext2 extended attribute block sharing' \ -+ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR -+dep_bool ' Ext2 extended user attributes' \ -+ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR - - tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS - -@@ -156,6 +166,10 @@ - fi - fi - -+# Meta block cache for Extended Attributes (ext2/ext3) -+#tristate 'Meta block cache' CONFIG_FS_MBCACHE -+define_tristate CONFIG_FS_MBCACHE y -+ - mainmenu_option next_comment - comment 'Partition Types' - source fs/partitions/Config.in -Index: linux-DRV401/fs/Makefile -=================================================================== ---- linux-DRV401.orig/fs/Makefile 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/fs/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -14,7 
+14,7 @@ - super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ - dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ -- filesystems.o namespace.o seq_file.o quota.o -+ filesystems.o namespace.o seq_file.o quota.o xattr.o - - ifeq ($(CONFIG_QUOTA),y) - obj-y += dquot.o -@@ -76,6 +76,9 @@ - - obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o - -+export-objs += mbcache.o -+obj-$(CONFIG_FS_MBCACHE) += mbcache.o -+ - # persistent filesystems - obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) - -Index: linux-DRV401/fs/ext2/Makefile -=================================================================== ---- linux-DRV401.orig/fs/ext2/Makefile 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -13,4 +13,8 @@ - ioctl.o namei.o super.o symlink.o - obj-m := $(O_TARGET) - -+export-objs += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-DRV401/fs/ext2/file.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/file.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/file.c 2004-10-15 11:03:51.000000000 -0700 -@@ -20,6 +20,7 @@ - - #include - #include -+#include - #include - - /* -@@ -51,4 +52,8 @@ - - struct inode_operations ext2_file_inode_operations = { - truncate: ext2_truncate, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/ialloc.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/ialloc.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/ialloc.c 2004-10-15 11:03:51.000000000 -0700 -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -167,6 +168,7 @@ - */ - if (!is_bad_inode(inode)) { - /* 
Quota is already initialized in iput() */ -+ ext2_xattr_delete_inode(inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); - } -Index: linux-DRV401/fs/ext2/inode.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/inode.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext2/inode.c 2004-10-15 11:03:51.000000000 -0700 -@@ -39,6 +39,18 @@ - static int ext2_update_inode(struct inode * inode, int do_sync); - - /* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext2_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext2_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ -+/* - * Called at each iput() - */ - void ext2_put_inode (struct inode * inode) -@@ -53,9 +65,7 @@ - { - lock_kernel(); - -- if (is_bad_inode(inode) || -- inode->i_ino == EXT2_ACL_IDX_INO || -- inode->i_ino == EXT2_ACL_DATA_INO) -+ if (is_bad_inode(inode)) - goto no_delete; - inode->u.ext2_i.i_dtime = CURRENT_TIME; - mark_inode_dirty(inode); -@@ -792,6 +802,8 @@ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; -+ if (ext2_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -@@ -879,8 +891,7 @@ - unsigned long offset; - struct ext2_group_desc * gdp; - -- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && -- inode->i_ino != EXT2_ACL_DATA_INO && -+ if ((inode->i_ino != EXT2_ROOT_INO && - inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { - ext2_error (inode->i_sb, "ext2_read_inode", -@@ -965,10 +976,7 @@ - for (block = 0; block < EXT2_N_BLOCKS; block++) - inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; - -- if (inode->i_ino == EXT2_ACL_IDX_INO || -- inode->i_ino == EXT2_ACL_DATA_INO) -- /* Nothing to do */ ; -- 
else if (S_ISREG(inode->i_mode)) { -+ if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext2_file_inode_operations; - inode->i_fop = &ext2_file_operations; - inode->i_mapping->a_ops = &ext2_aops; -@@ -977,15 +985,17 @@ - inode->i_fop = &ext2_dir_operations; - inode->i_mapping->a_ops = &ext2_aops; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext2_inode_is_fast_symlink(inode)) - inode->i_op = &ext2_fast_symlink_inode_operations; - else { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; - } -- } else -+ } else { -+ inode->i_op = &ext2_special_inode_operations; - init_special_inode(inode, inode->i_mode, - le32_to_cpu(raw_inode->i_block[0])); -+ } - brelse (bh); - inode->i_attr_flags = 0; - if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { -Index: linux-DRV401/fs/ext2/namei.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/namei.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/namei.c 2004-10-15 11:03:51.000000000 -0700 -@@ -31,6 +31,7 @@ - - #include - #include -+#include - #include - - /* -@@ -136,7 +137,7 @@ - - if (l > sizeof (inode->u.ext2_i.i_data)) { - /* slow symlink */ -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; - err = block_symlink(inode, symname, l); - if (err) -@@ -345,4 +346,15 @@ - rmdir: ext2_rmdir, - mknod: ext2_mknod, - rename: ext2_rename, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, -+}; -+ -+struct inode_operations ext2_special_inode_operations = { -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/super.c -=================================================================== ---- 
linux-DRV401.orig/fs/ext2/super.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/super.c 2004-10-15 11:03:51.000000000 -0700 -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -125,6 +126,7 @@ - int db_count; - int i; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - struct ext2_super_block *es = EXT2_SB(sb)->s_es; - -@@ -175,6 +177,13 @@ - this_char = strtok (NULL, ",")) { - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; -+#ifdef CONFIG_EXT2_FS_XATTR_USER -+ if (!strcmp (this_char, "user_xattr")) -+ set_opt (*mount_options, XATTR_USER); -+ else if (!strcmp (this_char, "nouser_xattr")) -+ clear_opt (*mount_options, XATTR_USER); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -424,6 +433,9 @@ - blocksize = BLOCK_SIZE; - - sb->u.ext2_sb.s_mount_opt = 0; -+#ifdef CONFIG_EXT2_FS_XATTR_USER -+ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ -+#endif - if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, - &sb->u.ext2_sb.s_mount_opt)) { - return NULL; -@@ -810,12 +822,27 @@ - - static int __init init_ext2_fs(void) - { -- return register_filesystem(&ext2_fs_type); -+ int error = init_ext2_xattr(); -+ if (error) -+ return error; -+ error = init_ext2_xattr_user(); -+ if (error) -+ goto fail; -+ error = register_filesystem(&ext2_fs_type); -+ if (!error) -+ return 0; -+ -+ exit_ext2_xattr_user(); -+fail: -+ exit_ext2_xattr(); -+ return error; - } - - static void __exit exit_ext2_fs(void) - { - unregister_filesystem(&ext2_fs_type); -+ exit_ext2_xattr_user(); -+ exit_ext2_xattr(); - } - - EXPORT_NO_SYMBOLS; -Index: linux-DRV401/fs/ext2/symlink.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/symlink.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/symlink.c 2004-10-15 11:03:51.000000000 -0700 -@@ -19,6 
+19,7 @@ - - #include - #include -+#include - - static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) - { -@@ -32,7 +33,20 @@ - return vfs_follow_link(nd, s); - } - -+struct inode_operations ext2_symlink_inode_operations = { -+ readlink: page_readlink, -+ follow_link: page_follow_link, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, -+}; -+ - struct inode_operations ext2_fast_symlink_inode_operations = { - readlink: ext2_readlink, - follow_link: ext2_follow_link, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext2/xattr.c 2004-10-15 11:03:51.000000000 -0700 -@@ -0,0 +1,1212 @@ -+/* -+ * linux/fs/ext2/xattr.c -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ */ -+ -+/* -+ * Extended attributes are stored on disk blocks allocated outside of -+ * any inode. The i_file_acl field is then made to point to this allocated -+ * block. If all extended attributes of an inode are identical, these -+ * inodes may share the same extended attribute block. Such situations -+ * are automatically detected by keeping a cache of recent attribute block -+ * numbers and hashes over the block's contents in memory. -+ * -+ * -+ * Extended attribute block layout: -+ * -+ * +------------------+ -+ * | header | -+ * | entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . | -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The block header is followed by multiple entry descriptors. 
These entry -+ * descriptors are variable in size, and alligned to EXT2_XATTR_PAD -+ * byte boundaries. The entry descriptors are sorted by attribute name, -+ * so that two extended attribute blocks can be compared efficiently. -+ * -+ * Attribute values are aligned to the end of the block, stored in -+ * no specific order. They are also padded to EXT2_XATTR_PAD byte -+ * boundaries. No additional gaps are left between them. -+ * -+ * Locking strategy -+ * ---------------- -+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of -+ * the xattr inode operations are called, so we are guaranteed that only one -+ * processes accesses extended attributes of an inode at any time. -+ * -+ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that -+ * only a single process is modifying an extended attribute block, even -+ * if the block is shared among inodes. -+ * -+ * Note for porting to 2.5 -+ * ----------------------- -+ * The BKL will no longer be held in the xattr inode operations. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* These symbols may be needed by a module. */ -+EXPORT_SYMBOL(ext2_xattr_register); -+EXPORT_SYMBOL(ext2_xattr_unregister); -+EXPORT_SYMBOL(ext2_xattr_get); -+EXPORT_SYMBOL(ext2_xattr_list); -+EXPORT_SYMBOL(ext2_xattr_set); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) -+#endif -+ -+#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) -+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#ifdef EXT2_XATTR_DEBUG -+# define ea_idebug(inode, f...) do { \ -+ printk(KERN_DEBUG "inode %s:%ld: ", \ -+ kdevname(inode->i_dev), inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) 
do { \ -+ printk(KERN_DEBUG "block %s:%ld: ", \ -+ kdevname(bh->b_dev), bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) -+#endif -+ -+static int ext2_xattr_set2(struct inode *, struct buffer_head *, -+ struct ext2_xattr_header *); -+ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ -+static int ext2_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext2_xattr_cache_find(struct inode *, -+ struct ext2_xattr_header *); -+static void ext2_xattr_cache_remove(struct buffer_head *); -+static void ext2_xattr_rehash(struct ext2_xattr_header *, -+ struct ext2_xattr_entry *); -+ -+static struct mb_cache *ext2_xattr_cache; -+ -+#else -+# define ext2_xattr_cache_insert(bh) 0 -+# define ext2_xattr_cache_find(inode, header) NULL -+# define ext2_xattr_cache_remove(bh) while(0) {} -+# define ext2_xattr_rehash(header, entry) while(0) {} -+#endif -+ -+/* -+ * If a file system does not share extended attributes among inodes, -+ * we should not need the ext2_xattr_sem semaphore. However, the -+ * filesystem may still contain shared blocks, so we always take -+ * the lock. -+ */ -+ -+DECLARE_MUTEX(ext2_xattr_sem); -+ -+static inline int -+ext2_xattr_new_block(struct inode *inode, int * errp, int force) -+{ -+ struct super_block *sb = inode->i_sb; -+ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) + -+ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb); -+ -+ /* How can we enforce the allocation? */ -+ int block = ext2_new_block(inode, goal, 0, 0, errp); -+#ifdef OLD_QUOTAS -+ if (!*errp) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#endif -+ return block; -+} -+ -+static inline int -+ext2_xattr_quota_alloc(struct inode *inode, int force) -+{ -+ /* How can we enforce the allocation? 
*/ -+#ifdef OLD_QUOTAS -+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); -+ if (!error) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#else -+ int error = DQUOT_ALLOC_BLOCK(inode, 1); -+#endif -+ return error; -+} -+ -+#ifdef OLD_QUOTAS -+ -+static inline void -+ext2_xattr_quota_free(struct inode *inode) -+{ -+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+static inline void -+ext2_xattr_free_block(struct inode * inode, unsigned long block) -+{ -+ ext2_free_blocks(inode, block, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+#else -+# define ext2_xattr_quota_free(inode) \ -+ DQUOT_FREE_BLOCK(inode, 1) -+# define ext2_xattr_free_block(inode, block) \ -+ ext2_free_blocks(inode, block, 1) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -+ -+static inline struct buffer_head * -+sb_bread(struct super_block *sb, int block) -+{ -+ return bread(sb->s_dev, block, sb->s_blocksize); -+} -+ -+static inline struct buffer_head * -+sb_getblk(struct super_block *sb, int block) -+{ -+ return getblk(sb->s_dev, block, sb->s_blocksize); -+} -+ -+#endif -+ -+struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; -+rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; -+ -+int -+ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler) -+{ -+ int error = -EINVAL; -+ -+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { -+ write_lock(&ext2_handler_lock); -+ if (!ext2_xattr_handlers[name_index-1]) { -+ ext2_xattr_handlers[name_index-1] = handler; -+ error = 0; -+ } -+ write_unlock(&ext2_handler_lock); -+ } -+ return error; -+} -+ -+void -+ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler) -+{ -+ if (name_index > 0 || name_index <= EXT2_XATTR_INDEX_MAX) { -+ write_lock(&ext2_handler_lock); -+ ext2_xattr_handlers[name_index-1] = NULL; -+ write_unlock(&ext2_handler_lock); -+ } -+} -+ -+static inline const char * -+strcmp_prefix(const 
char *a, const char *a_prefix) -+{ -+ while (*a_prefix && *a == *a_prefix) { -+ a++; -+ a_prefix++; -+ } -+ return *a_prefix ? NULL : a; -+} -+ -+/* -+ * Decode the extended attribute name, and translate it into -+ * the name_index and name suffix. -+ */ -+static struct ext2_xattr_handler * -+ext2_xattr_resolve_name(const char **name) -+{ -+ struct ext2_xattr_handler *handler = NULL; -+ int i; -+ -+ if (!*name) -+ return NULL; -+ read_lock(&ext2_handler_lock); -+ for (i=0; iprefix); -+ if (n) { -+ handler = ext2_xattr_handlers[i]; -+ *name = n; -+ break; -+ } -+ } -+ } -+ read_unlock(&ext2_handler_lock); -+ return handler; -+} -+ -+static inline struct ext2_xattr_handler * -+ext2_xattr_handler(int name_index) -+{ -+ struct ext2_xattr_handler *handler = NULL; -+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { -+ read_lock(&ext2_handler_lock); -+ handler = ext2_xattr_handlers[name_index-1]; -+ read_unlock(&ext2_handler_lock); -+ } -+ return handler; -+} -+ -+/* -+ * Inode operation getxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext2_getxattr(struct dentry *dentry, const char *name, -+ void *buffer, size_t size) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->get(inode, name, buffer, size); -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext2_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+/* -+ * Inode operation setxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext2_setxattr(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ if (size == 0) 
-+ value = ""; /* empty EA, do not remove */ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, value, size, flags); -+} -+ -+/* -+ * Inode operation removexattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext2_removexattr(struct dentry *dentry, const char *name) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); -+} -+ -+/* -+ * ext2_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. -+ */ -+int -+ext2_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_entry *entry; -+ unsigned int block, size; -+ char *end; -+ int name_len, error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ if (name == NULL) -+ return -EINVAL; -+ if (!EXT2_I(inode)->i_file_acl) -+ return -ENOATTR; -+ block = EXT2_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ -+ 
error = -ERANGE; -+ if (name_len > 255) -+ goto cleanup; -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) -+ goto found; -+ entry = next; -+ } -+ /* Check the remaining name entries */ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ entry = next; -+ } -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ error = -ENOATTR; -+ goto cleanup; -+found: -+ /* check the buffer size */ -+ if (entry->e_value_block != 0) -+ goto bad_block; -+ size = le32_to_cpu(entry->e_value_size); -+ if (size > inode->i_sb->s_blocksize || -+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) -+ goto bad_block; -+ -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ /* return value of attribute */ -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext2_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_entry *entry; -+ unsigned int block, size = 0; -+ char *buf, *end; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ if (!EXT2_I(inode)->i_file_acl) -+ return 0; -+ block = EXT2_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* compute the size required for the list of attribute names */ -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT2_XATTR_NEXT(entry)) { -+ struct ext2_xattr_handler *handler; -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ -+ handler = ext2_xattr_handler(entry->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (!buffer) { -+ error = size; -+ goto cleanup; -+ } else { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ /* list the attribute names */ -+ buf = buffer; -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT2_XATTR_NEXT(entry)) { -+ struct ext2_xattr_handler *handler; -+ -+ handler = ext2_xattr_handler(entry->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* 
-+ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext2_xattr_update_super_block(struct super_block *sb) -+{ -+ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ lock_super(sb); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; -+#endif -+ EXT2_SB(sb)->s_es->s_feature_compat |= -+ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); -+ unlock_super(sb); -+} -+ -+/* -+ * ext2_xattr_set() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext2_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t value_len, int flags) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_header *header = NULL; -+ struct ext2_xattr_entry *here, *last; -+ unsigned int name_len; -+ int block = EXT2_I(inode)->i_file_acl; -+ int min_offs = sb->s_blocksize, not_found = 1, free, error; -+ char *end; -+ -+ /* -+ * header -- Points either into bh, or to a temporarily -+ * allocated buffer. -+ * here -- The named entry found, or the place for inserting, within -+ * the block pointed to by header. -+ * last -- Points right after the last named entry within the block -+ * pointed to by header. -+ * min_offs -- The offset of the first value (values are aligned -+ * towards the end of the block). -+ * end -- Points right after the block pointed to by header. 
-+ */ -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > sb->s_blocksize) -+ return -ERANGE; -+ down(&ext2_xattr_sem); -+ -+ if (block) { -+ /* The inode already has an extended attribute block. */ -+ -+ bh = sb_bread(sb, block); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), -+ le32_to_cpu(HDR(bh)->h_refcount)); -+ header = HDR(bh); -+ end = bh->b_data + bh->b_size; -+ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ header->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(sb, "ext2_xattr_set", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. */ -+ here = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(here)) { -+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!here->e_value_block && here->e_value_size) { -+ int offs = le16_to_cpu(here->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ not_found = name_index - here->e_name_index; -+ if (!not_found) -+ not_found = name_len - here->e_name_len; -+ if (!not_found) -+ not_found = memcmp(name, here->e_name,name_len); -+ if (not_found <= 0) -+ break; -+ here = next; -+ } -+ last = here; -+ /* We still need to compute min_offs and last. 
*/ -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!last->e_value_block && last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ last = next; -+ } -+ -+ /* Check whether we have enough space left. */ -+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); -+ } else { -+ /* We will use a new extended attribute block. */ -+ free = sb->s_blocksize - -+ sizeof(struct ext2_xattr_header) - sizeof(__u32); -+ here = last = NULL; /* avoid gcc uninitialized warning. */ -+ } -+ -+ if (not_found) { -+ /* Request to remove a nonexistent attribute? */ -+ error = -ENOATTR; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (value == NULL) -+ goto cleanup; -+ else -+ free -= EXT2_XATTR_LEN(name_len); -+ } else { -+ /* Request to create an existing attribute? */ -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ if (!here->e_value_block && here->e_value_size) { -+ unsigned int size = le32_to_cpu(here->e_value_size); -+ -+ if (le16_to_cpu(here->e_value_offs) + size > -+ sb->s_blocksize || size > sb->s_blocksize) -+ goto bad_block; -+ free += EXT2_XATTR_SIZE(size); -+ } -+ } -+ free -= EXT2_XATTR_SIZE(value_len); -+ error = -ENOSPC; -+ if (free < 0) -+ goto cleanup; -+ -+ /* Here we know that we can set the new attribute. 
*/ -+ -+ if (header) { -+ if (header->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "modifying in-place"); -+ ext2_xattr_cache_remove(bh); -+ } else { -+ int offset; -+ -+ ea_bdebug(bh, "cloning"); -+ header = kmalloc(bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memcpy(header, HDR(bh), bh->b_size); -+ header->h_refcount = cpu_to_le32(1); -+ offset = (char *)header - bh->b_data; -+ here = ENTRY((char *)here + offset); -+ last = ENTRY((char *)last + offset); -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ header = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memset(header, 0, sb->s_blocksize); -+ end = (char *)header + sb->s_blocksize; -+ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); -+ header->h_blocks = header->h_refcount = cpu_to_le32(1); -+ last = here = ENTRY(header+1); -+ } -+ -+ if (not_found) { -+ /* Insert the new name. */ -+ int size = EXT2_XATTR_LEN(name_len); -+ int rest = (char *)last - (char *)here; -+ memmove((char *)here + size, here, rest); -+ memset(here, 0, size); -+ here->e_name_index = name_index; -+ here->e_name_len = name_len; -+ memcpy(here->e_name, name, name_len); -+ } else { -+ /* Remove the old value. */ -+ if (!here->e_value_block && here->e_value_size) { -+ char *first_val = (char *)header + min_offs; -+ int offs = le16_to_cpu(here->e_value_offs); -+ char *val = (char *)header + offs; -+ size_t size = EXT2_XATTR_SIZE( -+ le32_to_cpu(here->e_value_size)); -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. 
*/ -+ last = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(last)) { -+ int o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT2_XATTR_NEXT(last); -+ } -+ } -+ if (value == NULL) { -+ /* Remove this attribute. */ -+ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { -+ /* This block is now empty. */ -+ error = ext2_xattr_set2(inode, bh, NULL); -+ goto cleanup; -+ } else { -+ /* Remove the old name. */ -+ int size = EXT2_XATTR_LEN(name_len); -+ last = ENTRY((char *)last - size); -+ memmove(here, (char*)here + size, -+ (char*)last - (char*)here); -+ memset(last, 0, size); -+ } -+ } -+ } -+ -+ if (value != NULL) { -+ /* Insert the new value. */ -+ here->e_value_size = cpu_to_le32(value_len); -+ if (value_len) { -+ size_t size = EXT2_XATTR_SIZE(value_len); -+ char *val = (char *)header + min_offs - size; -+ here->e_value_offs = -+ cpu_to_le16((char *)val - (char *)header); -+ memset(val + size - EXT2_XATTR_PAD, 0, -+ EXT2_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, value, value_len); -+ } -+ } -+ ext2_xattr_rehash(header, here); -+ -+ error = ext2_xattr_set2(inode, bh, header); -+ -+cleanup: -+ brelse(bh); -+ if (!(bh && header == HDR(bh))) -+ kfree(header); -+ up(&ext2_xattr_sem); -+ -+ return error; -+} -+ -+/* -+ * Second half of ext2_xattr_set(): Update the file system. -+ */ -+static int -+ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, -+ struct ext2_xattr_header *header) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ int error; -+ -+ if (header) { -+ new_bh = ext2_xattr_cache_find(inode, header); -+ if (new_bh) { -+ /* -+ * We found an identical block in the cache. -+ * The old block will be released after updating -+ * the inode. 
-+ */ -+ ea_bdebug(old_bh, "reusing block %ld", -+ new_bh->b_blocknr); -+ -+ error = -EDQUOT; -+ if (ext2_xattr_quota_alloc(inode, 1)) -+ goto cleanup; -+ -+ HDR(new_bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); -+ ea_bdebug(new_bh, "refcount now=%d", -+ le32_to_cpu(HDR(new_bh)->h_refcount)); -+ } else if (old_bh && header == HDR(old_bh)) { -+ /* Keep this block. */ -+ new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ int force = EXT2_I(inode)->i_file_acl != 0; -+ int block = ext2_xattr_new_block(inode, &error, force); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+ ext2_xattr_free_block(inode, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ memcpy(new_bh->b_data, header, new_bh->b_size); -+ mark_buffer_uptodate(new_bh, 1); -+ unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); -+ -+ ext2_xattr_update_super_block(sb); -+ } -+ mark_buffer_dirty(new_bh); -+ if (IS_SYNC(inode)) { -+ ll_rw_block(WRITE, 1, &new_bh); -+ wait_on_buffer(new_bh); -+ error = -EIO; -+ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) -+ goto cleanup; -+ } -+ } -+ -+ /* Update the inode. */ -+ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ inode->i_ctime = CURRENT_TIME; -+ if (IS_SYNC(inode)) { -+ error = ext2_sync_inode (inode); -+ if (error) -+ goto cleanup; -+ } else -+ mark_inode_dirty(inode); -+ -+ error = 0; -+ if (old_bh && old_bh != new_bh) { -+ /* -+ * If there was an old block, and we are not still using it, -+ * we now release the old block. -+ */ -+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); -+ -+ if (refcount == 1) { -+ /* Free the old block. */ -+ ea_bdebug(old_bh, "freeing"); -+ ext2_xattr_free_block(inode, old_bh->b_blocknr); -+ mark_buffer_clean(old_bh); -+ } else { -+ /* Decrement the refcount only. 
*/ -+ refcount--; -+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); -+ ext2_xattr_quota_free(inode); -+ mark_buffer_dirty(old_bh); -+ ea_bdebug(old_bh, "refcount now=%d", refcount); -+ } -+ } -+ -+cleanup: -+ if (old_bh != new_bh) -+ brelse(new_bh); -+ -+ return error; -+} -+ -+/* -+ * ext2_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. -+ */ -+void -+ext2_xattr_delete_inode(struct inode *inode) -+{ -+ struct buffer_head *bh; -+ unsigned int block = EXT2_I(inode)->i_file_acl; -+ -+ if (!block) -+ return; -+ down(&ext2_xattr_sem); -+ -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) { -+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", -+ "inode %ld: block %d read error", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ext2_xattr_cache_remove(bh); -+ ext2_xattr_free_block(inode, block); -+ bforget(bh); -+ bh = NULL; -+ } else { -+ HDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ mark_buffer_dirty(bh); -+ if (IS_SYNC(inode)) { -+ ll_rw_block(WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ } -+ ext2_xattr_quota_free(inode); -+ } -+ EXT2_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+ up(&ext2_xattr_sem); -+} -+ -+/* -+ * ext2_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. 
-+ */ -+void -+ext2_xattr_put_super(struct super_block *sb) -+{ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); -+#endif -+} -+ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ -+/* -+ * ext2_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+static int -+ext2_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext2_xattr_cache); -+ if (!ce) -+ return -ENOMEM; -+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache (%d cache entries)", -+ atomic_read(&ext2_xattr_cache->c_entry_count)); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, -+ atomic_read(&ext2_xattr_cache->c_entry_count)); -+ mb_cache_entry_release(ce); -+ } -+ return error; -+} -+ -+/* -+ * ext2_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. -+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. 
-+ */ -+static int -+ext2_xattr_cmp(struct ext2_xattr_header *header1, -+ struct ext2_xattr_header *header2) -+{ -+ struct ext2_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT2_XATTR_NEXT(entry1); -+ entry2 = EXT2_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext2_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. -+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. 
-+ */ -+static struct buffer_head * -+ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); -+ while (ce) { -+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); -+ -+ if (!bh) { -+ ext2_error(inode->i_sb, "ext2_xattr_cache_find", -+ "inode %ld: block %ld read error", -+ inode->i_ino, ce->e_block); -+ } else if (le32_to_cpu(HDR(bh)->h_refcount) > -+ EXT2_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, -+ le32_to_cpu(HDR(bh)->h_refcount), -+ EXT2_XATTR_REFCOUNT_MAX); -+ } else if (!ext2_xattr_cmp(header, HDR(bh))) { -+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); -+ mb_cache_entry_release(ce); -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); -+ } -+ return NULL; -+} -+ -+/* -+ * ext2_xattr_cache_remove() -+ * -+ * Remove the cache entry of a block from the cache. Called when a -+ * block becomes invalid. -+ */ -+static void -+ext2_xattr_cache_remove(struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce; -+ -+ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); -+ if (ce) { -+ ea_bdebug(bh, "removing (%d cache entries remaining)", -+ atomic_read(&ext2_xattr_cache->c_entry_count)-1); -+ mb_cache_entry_free(ce); -+ } else -+ ea_bdebug(bh, "no cache entry"); -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext2_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, -+ struct ext2_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __u32 *value = (__u32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext2_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext2_xattr_rehash(struct ext2_xattr_header *header, -+ struct ext2_xattr_entry *entry) -+{ -+ struct ext2_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext2_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT2_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext2_xattr(void) -+{ -+ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(struct mb_cache_entry_index), 1, 61); -+ if (!ext2_xattr_cache) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+exit_ext2_xattr(void) -+{ -+ mb_cache_destroy(ext2_xattr_cache); -+} -+ -+#else /* CONFIG_EXT2_FS_XATTR_SHARING */ -+ -+int __init -+init_ext2_xattr(void) -+{ -+ return 
0; -+} -+ -+void -+exit_ext2_xattr(void) -+{ -+} -+ -+#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ -Index: linux-DRV401/fs/ext2/xattr_user.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext2/xattr_user.c 2004-10-15 11:03:51.000000000 -0700 -@@ -0,0 +1,103 @@ -+/* -+ * linux/fs/ext2/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_EXT2_FS_POSIX_ACL -+# include -+#endif -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext2_xattr_user_list(char *list, struct inode *inode, -+ const char *name, int name_len) -+{ -+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return prefix_len + name_len + 1; -+} -+ -+static int -+ext2_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+#ifdef CONFIG_EXT2_FS_POSIX_ACL -+ error = ext2_permission_locked(inode, MAY_READ); -+#else -+ error = permission(inode, MAY_READ); -+#endif -+ if (error) -+ return error; -+ -+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, -+ buffer, size); -+} -+ -+static int -+ext2_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+ if ( !S_ISREG(inode->i_mode) && -+ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) -+ return -EPERM; -+#ifdef CONFIG_EXT2_FS_POSIX_ACL 
-+ error = ext2_permission_locked(inode, MAY_WRITE); -+#else -+ error = permission(inode, MAY_WRITE); -+#endif -+ if (error) -+ return error; -+ -+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, -+ value, size, flags); -+} -+ -+struct ext2_xattr_handler ext2_xattr_user_handler = { -+ prefix: XATTR_USER_PREFIX, -+ list: ext2_xattr_user_list, -+ get: ext2_xattr_user_get, -+ set: ext2_xattr_user_set, -+}; -+ -+int __init -+init_ext2_xattr_user(void) -+{ -+ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, -+ &ext2_xattr_user_handler); -+} -+ -+void -+exit_ext2_xattr_user(void) -+{ -+ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, -+ &ext2_xattr_user_handler); -+} -Index: linux-DRV401/fs/ext3/Makefile -=================================================================== ---- linux-DRV401.orig/fs/ext3/Makefile 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -1,5 +1,5 @@ - # --# Makefile for the linux ext2-filesystem routines. -+# Makefile for the linux ext3-filesystem routines. - # - # Note! Dependencies are done automagically by 'make dep', which also - # removes any old dependencies. 
DON'T put your own dependencies here -@@ -9,8 +9,14 @@ - - O_TARGET := ext3.o - -+export-objs := ext3-exports.o -+ - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -+export-objs += xattr.o -+obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-DRV401/fs/ext3/file.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/file.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/file.c 2004-10-15 11:03:51.000000000 -0700 -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -93,5 +94,9 @@ - struct inode_operations ext3_file_inode_operations = { - truncate: ext3_truncate, /* BKL held */ - setattr: ext3_setattr, /* BKL held */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; - -Index: linux-DRV401/fs/ext3/ialloc.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/ialloc.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/ialloc.c 2004-10-15 11:03:52.000000000 -0700 -@@ -17,6 +17,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -216,6 +217,7 @@ - * as writing the quota to disk may need the lock as well. 
- */ - DQUOT_INIT(inode); -+ ext3_xattr_delete_inode(handle, inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); - -Index: linux-DRV401/fs/ext3/inode.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/inode.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/inode.c 2004-10-15 11:03:52.000000000 -0700 -@@ -39,6 +39,18 @@ - */ - #undef SEARCH_FROM_ZERO - -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext3_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext3_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ - /* The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. -@@ -48,7 +60,7 @@ - * still needs to be revoked. - */ - --static int ext3_forget(handle_t *handle, int is_metadata, -+int ext3_forget(handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - int blocknr) - { -@@ -164,9 +176,7 @@ - { - handle_t *handle; - -- if (is_bad_inode(inode) || -- inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == EXT3_ACL_DATA_INO) -+ if (is_bad_inode(inode)) - goto no_delete; - - lock_kernel(); -@@ -1843,6 +1853,8 @@ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; -+ if (ext3_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -@@ -1990,8 +2002,6 @@ - struct ext3_group_desc * gdp; - - if ((inode->i_ino != EXT3_ROOT_INO && -- inode->i_ino != EXT3_ACL_IDX_INO && -- inode->i_ino != EXT3_ACL_DATA_INO && - inode->i_ino != EXT3_JOURNAL_INO && - inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu( -@@ -2118,10 +2128,7 @@ - - brelse (iloc.bh); - -- if (inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == 
EXT3_ACL_DATA_INO) -- /* Nothing to do */ ; -- else if (S_ISREG(inode->i_mode)) { -+ if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -@@ -2129,15 +2136,17 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext3_inode_is_fast_symlink(inode)) - inode->i_op = &ext3_fast_symlink_inode_operations; - else { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - } -- } else -+ } else { -+ inode->i_op = &ext3_special_inode_operations; - init_special_inode(inode, inode->i_mode, - le32_to_cpu(iloc.raw_inode->i_block[0])); -+ } - /* inode->i_attr_flags = 0; unused */ - if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { - /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ -Index: linux-DRV401/fs/ext3/namei.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/namei.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/namei.c 2004-10-15 11:03:52.000000000 -0700 -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1612,7 +1613,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR); -+ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -1620,7 +1621,6 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; -- inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - inode->i_nlink--; /* is this nlink == 0? 
*/ -@@ -1647,9 +1647,6 @@ - BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); -- inode->i_mode = S_IFDIR | mode; -- if (dir->i_mode & S_ISGID) -- inode->i_mode |= S_ISGID; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); - if (err) { -@@ -2018,7 +2015,7 @@ - goto out_stop; - - if (l > sizeof (EXT3_I(inode)->i_data)) { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* - * block_symlink() calls back into ext3_prepare/commit_write. -@@ -2245,4 +2242,16 @@ - rmdir: ext3_rmdir, /* BKL held */ - mknod: ext3_mknod, /* BKL held */ - rename: ext3_rename, /* BKL held */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; -+ -+struct inode_operations ext3_special_inode_operations = { -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ -+}; -+ -Index: linux-DRV401/fs/ext3/super.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/super.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/super.c 2004-10-15 11:03:52.000000000 -0700 -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -404,6 +405,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -@@ -499,6 +501,7 @@ - int is_remount) - { - unsigned long *mount_options = &sbi->s_mount_opt; -+ - uid_t *resuid = &sbi->s_resuid; - gid_t *resgid = &sbi->s_resgid; - char * this_char; -@@ -511,6 
+514,13 @@ - this_char = strtok (NULL, ",")) { - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; -+#ifdef CONFIG_EXT3_FS_XATTR_USER -+ if (!strcmp (this_char, "user_xattr")) -+ set_opt (*mount_options, XATTR_USER); -+ else if (!strcmp (this_char, "nouser_xattr")) -+ clear_opt (*mount_options, XATTR_USER); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -924,6 +934,12 @@ - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; -+ -+ /* Default extended attribute flags */ -+#ifdef CONFIG_EXT3_FS_XATTR_USER -+ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ -+#endif -+ - if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { - sb->s_dev = 0; - goto out_fail; -@@ -1742,12 +1758,27 @@ - - static int __init init_ext3_fs(void) - { -- return register_filesystem(&ext3_fs_type); -+ int error = init_ext3_xattr(); -+ if (error) -+ return error; -+ error = init_ext3_xattr_user(); -+ if (error) -+ goto fail; -+ error = register_filesystem(&ext3_fs_type); -+ if (!error) -+ return 0; -+ -+ exit_ext3_xattr_user(); -+fail: -+ exit_ext3_xattr(); -+ return error; - } - - static void __exit exit_ext3_fs(void) - { - unregister_filesystem(&ext3_fs_type); -+ exit_ext3_xattr_user(); -+ exit_ext3_xattr(); - } - - EXPORT_SYMBOL(ext3_force_commit); -Index: linux-DRV401/fs/ext3/symlink.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/symlink.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/symlink.c 2004-10-15 11:03:52.000000000 -0700 -@@ -20,6 +20,7 @@ - #include - #include - #include -+#include - - static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) - { -@@ -33,7 +34,20 @@ - return vfs_follow_link(nd, s); - } - -+struct inode_operations ext3_symlink_inode_operations = { -+ readlink: page_readlink, /* BKL not held. 
Don't need */ -+ follow_link: page_follow_link, /* BKL not held. Don't need */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ -+}; -+ - struct inode_operations ext3_fast_symlink_inode_operations = { - readlink: ext3_readlink, /* BKL not held. Don't need */ - follow_link: ext3_follow_link, /* BKL not held. Don't need */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; -Index: linux-DRV401/fs/ext3/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/xattr.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,1225 @@ -+/* -+ * linux/fs/ext3/xattr.c -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Ext3 code with a lot of help from Eric Jarman . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ */ -+ -+/* -+ * Extended attributes are stored on disk blocks allocated outside of -+ * any inode. The i_file_acl field is then made to point to this allocated -+ * block. If all extended attributes of an inode are identical, these -+ * inodes may share the same extended attribute block. Such situations -+ * are automatically detected by keeping a cache of recent attribute block -+ * numbers and hashes over the block's contents in memory. -+ * -+ * -+ * Extended attribute block layout: -+ * -+ * +------------------+ -+ * | header | -+ * | entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . 
| -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The block header is followed by multiple entry descriptors. These entry -+ * descriptors are variable in size, and alligned to EXT3_XATTR_PAD -+ * byte boundaries. The entry descriptors are sorted by attribute name, -+ * so that two extended attribute blocks can be compared efficiently. -+ * -+ * Attribute values are aligned to the end of the block, stored in -+ * no specific order. They are also padded to EXT3_XATTR_PAD byte -+ * boundaries. No additional gaps are left between them. -+ * -+ * Locking strategy -+ * ---------------- -+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of -+ * the xattr inode operations are called, so we are guaranteed that only one -+ * processes accesses extended attributes of an inode at any time. -+ * -+ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that -+ * only a single process is modifying an extended attribute block, even -+ * if the block is shared among inodes. -+ * -+ * Note for porting to 2.5 -+ * ----------------------- -+ * The BKL will no longer be held in the xattr inode operations. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define EXT3_EA_USER "user." -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) -+#endif -+ -+#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) -+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#ifdef EXT3_XATTR_DEBUG -+# define ea_idebug(inode, f...) do { \ -+ printk(KERN_DEBUG "inode %s:%ld: ", \ -+ kdevname(inode->i_dev), inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) 
do { \ -+ printk(KERN_DEBUG "block %s:%ld: ", \ -+ kdevname(bh->b_dev), bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) -+#endif -+ -+static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, -+ struct ext3_xattr_header *); -+ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ -+static int ext3_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext3_xattr_cache_find(struct inode *, -+ struct ext3_xattr_header *); -+static void ext3_xattr_cache_remove(struct buffer_head *); -+static void ext3_xattr_rehash(struct ext3_xattr_header *, -+ struct ext3_xattr_entry *); -+ -+static struct mb_cache *ext3_xattr_cache; -+ -+#else -+# define ext3_xattr_cache_insert(bh) 0 -+# define ext3_xattr_cache_find(inode, header) NULL -+# define ext3_xattr_cache_remove(bh) while(0) {} -+# define ext3_xattr_rehash(header, entry) while(0) {} -+#endif -+ -+/* -+ * If a file system does not share extended attributes among inodes, -+ * we should not need the ext3_xattr_sem semaphore. However, the -+ * filesystem may still contain shared blocks, so we always take -+ * the lock. -+ */ -+ -+DECLARE_MUTEX(ext3_xattr_sem); -+ -+static inline int -+ext3_xattr_new_block(handle_t *handle, struct inode *inode, -+ int * errp, int force) -+{ -+ struct super_block *sb = inode->i_sb; -+ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + -+ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); -+ -+ /* How can we enforce the allocation? */ -+ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); -+#ifdef OLD_QUOTAS -+ if (!*errp) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#endif -+ return block; -+} -+ -+static inline int -+ext3_xattr_quota_alloc(struct inode *inode, int force) -+{ -+ /* How can we enforce the allocation? 
*/ -+#ifdef OLD_QUOTAS -+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); -+ if (!error) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#else -+ int error = DQUOT_ALLOC_BLOCK(inode, 1); -+#endif -+ return error; -+} -+ -+#ifdef OLD_QUOTAS -+ -+static inline void -+ext3_xattr_quota_free(struct inode *inode) -+{ -+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+static inline void -+ext3_xattr_free_block(handle_t *handle, struct inode * inode, -+ unsigned long block) -+{ -+ ext3_free_blocks(handle, inode, block, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+#else -+# define ext3_xattr_quota_free(inode) \ -+ DQUOT_FREE_BLOCK(inode, 1) -+# define ext3_xattr_free_block(handle, inode, block) \ -+ ext3_free_blocks(handle, inode, block, 1) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -+ -+static inline struct buffer_head * -+sb_bread(struct super_block *sb, int block) -+{ -+ return bread(sb->s_dev, block, sb->s_blocksize); -+} -+ -+static inline struct buffer_head * -+sb_getblk(struct super_block *sb, int block) -+{ -+ return getblk(sb->s_dev, block, sb->s_blocksize); -+} -+ -+#endif -+ -+struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; -+rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; -+ -+int -+ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) -+{ -+ int error = -EINVAL; -+ -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ if (!ext3_xattr_handlers[name_index-1]) { -+ ext3_xattr_handlers[name_index-1] = handler; -+ error = 0; -+ } -+ write_unlock(&ext3_handler_lock); -+ } -+ return error; -+} -+ -+void -+ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) -+{ -+ if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ ext3_xattr_handlers[name_index-1] = NULL; -+ write_unlock(&ext3_handler_lock); -+ } -+} -+ 
-+static inline const char * -+strcmp_prefix(const char *a, const char *a_prefix) -+{ -+ while (*a_prefix && *a == *a_prefix) { -+ a++; -+ a_prefix++; -+ } -+ return *a_prefix ? NULL : a; -+} -+ -+/* -+ * Decode the extended attribute name, and translate it into -+ * the name_index and name suffix. -+ */ -+static inline struct ext3_xattr_handler * -+ext3_xattr_resolve_name(const char **name) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ int i; -+ -+ if (!*name) -+ return NULL; -+ read_lock(&ext3_handler_lock); -+ for (i=0; iprefix); -+ if (n) { -+ handler = ext3_xattr_handlers[i]; -+ *name = n; -+ break; -+ } -+ } -+ } -+ read_unlock(&ext3_handler_lock); -+ return handler; -+} -+ -+static inline struct ext3_xattr_handler * -+ext3_xattr_handler(int name_index) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ read_lock(&ext3_handler_lock); -+ handler = ext3_xattr_handlers[name_index-1]; -+ read_unlock(&ext3_handler_lock); -+ } -+ return handler; -+} -+ -+/* -+ * Inode operation getxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext3_getxattr(struct dentry *dentry, const char *name, -+ void *buffer, size_t size) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->get(inode, name, buffer, size); -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext3_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+/* -+ * Inode operation setxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext3_setxattr(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ struct ext3_xattr_handler *handler; -+ 
struct inode *inode = dentry->d_inode; -+ -+ if (size == 0) -+ value = ""; /* empty EA, do not remove */ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, value, size, flags); -+} -+ -+/* -+ * Inode operation removexattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext3_removexattr(struct dentry *dentry, const char *name) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); -+} -+ -+/* -+ * ext3_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. -+ */ -+int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size; -+ char *end; -+ int name_len, error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ if (name == NULL) -+ return -EINVAL; -+ if (!EXT3_I(inode)->i_file_acl) -+ return -ENOATTR; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* 
find named attribute */ -+ name_len = strlen(name); -+ -+ error = -ERANGE; -+ if (name_len > 255) -+ goto cleanup; -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) -+ goto found; -+ entry = next; -+ } -+ /* Check the remaining name entries */ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ entry = next; -+ } -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ error = -ENOATTR; -+ goto cleanup; -+found: -+ /* check the buffer size */ -+ if (entry->e_value_block != 0) -+ goto bad_block; -+ size = le32_to_cpu(entry->e_value_size); -+ if (size > inode->i_sb->s_blocksize || -+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) -+ goto bad_block; -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ /* return value of attribute */ -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size = 0; -+ char *buf, *end; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ if (!EXT3_I(inode)->i_file_acl) -+ return 0; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* compute the size required for the list of attribute names */ -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (!buffer) { -+ error = size; -+ goto cleanup; -+ } else { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ /* list the attribute names */ -+ buf = buffer; -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* 
-+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext3_xattr_update_super_block(handle_t *handle, -+ struct super_block *sb) -+{ -+ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ lock_super(sb); -+ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; -+#endif -+ EXT3_SB(sb)->s_es->s_feature_compat |= -+ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ unlock_super(sb); -+} -+ -+/* -+ * ext3_xattr_set() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, int flags) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_header *header = NULL; -+ struct ext3_xattr_entry *here, *last; -+ unsigned int name_len; -+ int block = EXT3_I(inode)->i_file_acl; -+ int min_offs = sb->s_blocksize, not_found = 1, free, error; -+ char *end; -+ -+ /* -+ * header -- Points either into bh, or to a temporarily -+ * allocated buffer. -+ * here -- The named entry found, or the place for inserting, within -+ * the block pointed to by header. -+ * last -- Points right after the last named entry within the block -+ * pointed to by header. 
-+ * min_offs -- The offset of the first value (values are aligned -+ * towards the end of the block). -+ * end -- Points right after the block pointed to by header. -+ */ -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > sb->s_blocksize) -+ return -ERANGE; -+ down(&ext3_xattr_sem); -+ -+ if (block) { -+ /* The inode already has an extended attribute block. */ -+ bh = sb_bread(sb, block); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), -+ le32_to_cpu(HDR(bh)->h_refcount)); -+ header = HDR(bh); -+ end = bh->b_data + bh->b_size; -+ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ header->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(sb, "ext3_xattr_set", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. */ -+ here = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(here)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!here->e_value_block && here->e_value_size) { -+ int offs = le16_to_cpu(here->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ not_found = name_index - here->e_name_index; -+ if (!not_found) -+ not_found = name_len - here->e_name_len; -+ if (!not_found) -+ not_found = memcmp(name, here->e_name,name_len); -+ if (not_found <= 0) -+ break; -+ here = next; -+ } -+ last = here; -+ /* We still need to compute min_offs and last. 
*/ -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!last->e_value_block && last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ last = next; -+ } -+ -+ /* Check whether we have enough space left. */ -+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); -+ } else { -+ /* We will use a new extended attribute block. */ -+ free = sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - sizeof(__u32); -+ here = last = NULL; /* avoid gcc uninitialized warning. */ -+ } -+ -+ if (not_found) { -+ /* Request to remove a nonexistent attribute? */ -+ error = -ENOATTR; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (value == NULL) -+ goto cleanup; -+ else -+ free -= EXT3_XATTR_LEN(name_len); -+ } else { -+ /* Request to create an existing attribute? */ -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ if (!here->e_value_block && here->e_value_size) { -+ unsigned int size = le32_to_cpu(here->e_value_size); -+ -+ if (le16_to_cpu(here->e_value_offs) + size > -+ sb->s_blocksize || size > sb->s_blocksize) -+ goto bad_block; -+ free += EXT3_XATTR_SIZE(size); -+ } -+ } -+ free -= EXT3_XATTR_SIZE(value_len); -+ error = -ENOSPC; -+ if (free < 0) -+ goto cleanup; -+ -+ /* Here we know that we can set the new attribute. 
*/ -+ -+ if (header) { -+ if (header->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "modifying in-place"); -+ ext3_xattr_cache_remove(bh); -+ error = ext3_journal_get_write_access(handle, bh); -+ if (error) -+ goto cleanup; -+ } else { -+ int offset; -+ -+ ea_bdebug(bh, "cloning"); -+ header = kmalloc(bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memcpy(header, HDR(bh), bh->b_size); -+ header->h_refcount = cpu_to_le32(1); -+ offset = (char *)header - bh->b_data; -+ here = ENTRY((char *)here + offset); -+ last = ENTRY((char *)last + offset); -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ header = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memset(header, 0, sb->s_blocksize); -+ end = (char *)header + sb->s_blocksize; -+ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); -+ header->h_blocks = header->h_refcount = cpu_to_le32(1); -+ last = here = ENTRY(header+1); -+ } -+ -+ if (not_found) { -+ /* Insert the new name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ int rest = (char *)last - (char *)here; -+ memmove((char *)here + size, here, rest); -+ memset(here, 0, size); -+ here->e_name_index = name_index; -+ here->e_name_len = name_len; -+ memcpy(here->e_name, name, name_len); -+ } else { -+ /* Remove the old value. */ -+ if (!here->e_value_block && here->e_value_size) { -+ char *first_val = (char *)header + min_offs; -+ int offs = le16_to_cpu(here->e_value_offs); -+ char *val = (char *)header + offs; -+ size_t size = EXT3_XATTR_SIZE( -+ le32_to_cpu(here->e_value_size)); -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. 
*/ -+ last = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(last)) { -+ int o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT3_XATTR_NEXT(last); -+ } -+ } -+ if (value == NULL) { -+ /* Remove this attribute. */ -+ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { -+ /* This block is now empty. */ -+ error = ext3_xattr_set2(handle, inode, bh,NULL); -+ goto cleanup; -+ } else { -+ /* Remove the old name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ last = ENTRY((char *)last - size); -+ memmove(here, (char*)here + size, -+ (char*)last - (char*)here); -+ memset(last, 0, size); -+ } -+ } -+ } -+ -+ if (value != NULL) { -+ /* Insert the new value. */ -+ here->e_value_size = cpu_to_le32(value_len); -+ if (value_len) { -+ size_t size = EXT3_XATTR_SIZE(value_len); -+ char *val = (char *)header + min_offs - size; -+ here->e_value_offs = -+ cpu_to_le16((char *)val - (char *)header); -+ memset(val + size - EXT3_XATTR_PAD, 0, -+ EXT3_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, value, value_len); -+ } -+ } -+ ext3_xattr_rehash(header, here); -+ -+ error = ext3_xattr_set2(handle, inode, bh, header); -+ -+cleanup: -+ brelse(bh); -+ if (!(bh && header == HDR(bh))) -+ kfree(header); -+ up(&ext3_xattr_sem); -+ -+ return error; -+} -+ -+/* -+ * Second half of ext3_xattr_set(): Update the file system. -+ */ -+static int -+ext3_xattr_set2(handle_t *handle, struct inode *inode, -+ struct buffer_head *old_bh, struct ext3_xattr_header *header) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ int error; -+ -+ if (header) { -+ new_bh = ext3_xattr_cache_find(inode, header); -+ if (new_bh) { -+ /* -+ * We found an identical block in the cache. -+ * The old block will be released after updating -+ * the inode. 
-+ */ -+ ea_bdebug(old_bh, "reusing block %ld", -+ new_bh->b_blocknr); -+ -+ error = -EDQUOT; -+ if (ext3_xattr_quota_alloc(inode, 1)) -+ goto cleanup; -+ -+ error = ext3_journal_get_write_access(handle, new_bh); -+ if (error) -+ goto cleanup; -+ HDR(new_bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); -+ ea_bdebug(new_bh, "refcount now=%d", -+ le32_to_cpu(HDR(new_bh)->h_refcount)); -+ } else if (old_bh && header == HDR(old_bh)) { -+ /* Keep this block. */ -+ new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ int force = EXT3_I(inode)->i_file_acl != 0; -+ int block = ext3_xattr_new_block(handle, inode, -+ &error, force); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+getblk_failed: ext3_xattr_free_block(handle, inode, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ error = ext3_journal_get_create_access(handle, new_bh); -+ if (error) { -+ unlock_buffer(new_bh); -+ goto getblk_failed; -+ } -+ memcpy(new_bh->b_data, header, new_bh->b_size); -+ mark_buffer_uptodate(new_bh, 1); -+ unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); -+ -+ ext3_xattr_update_super_block(handle, sb); -+ } -+ error = ext3_journal_dirty_metadata(handle, new_bh); -+ if (error) -+ goto cleanup; -+ } -+ -+ /* Update the inode. */ -+ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+ error = 0; -+ if (old_bh && old_bh != new_bh) { -+ /* -+ * If there was an old block, and we are not still using it, -+ * we now release the old block. -+ */ -+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); -+ -+ error = ext3_journal_get_write_access(handle, old_bh); -+ if (error) -+ goto cleanup; -+ if (refcount == 1) { -+ /* Free the old block. 
*/ -+ ea_bdebug(old_bh, "freeing"); -+ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); -+ -+ /* ext3_forget() calls bforget() for us, but we -+ let our caller release old_bh, so we need to -+ duplicate the handle before. */ -+ get_bh(old_bh); -+ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); -+ } else { -+ /* Decrement the refcount only. */ -+ refcount--; -+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); -+ ext3_xattr_quota_free(inode); -+ ext3_journal_dirty_metadata(handle, old_bh); -+ ea_bdebug(old_bh, "refcount now=%d", refcount); -+ } -+ } -+ -+cleanup: -+ if (old_bh != new_bh) -+ brelse(new_bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. -+ */ -+void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+ struct buffer_head *bh; -+ unsigned int block = EXT3_I(inode)->i_file_acl; -+ -+ if (!block) -+ return; -+ down(&ext3_xattr_sem); -+ -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: block %d read error", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ goto cleanup; -+ } -+ ext3_journal_get_write_access(handle, bh); -+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ext3_xattr_cache_remove(bh); -+ ext3_xattr_free_block(handle, inode, block); -+ ext3_forget(handle, 1, inode, bh, block); -+ bh = NULL; -+ } else { -+ HDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ ext3_journal_dirty_metadata(handle, bh); -+ if (IS_SYNC(inode)) -+ 
handle->h_sync = 1; -+ ext3_xattr_quota_free(inode); -+ } -+ EXT3_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+ up(&ext3_xattr_sem); -+} -+ -+/* -+ * ext3_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. -+ */ -+void -+ext3_xattr_put_super(struct super_block *sb) -+{ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); -+#endif -+} -+ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ -+/* -+ * ext3_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+static int -+ext3_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext3_xattr_cache); -+ if (!ce) -+ return -ENOMEM; -+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache (%d cache entries)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ mb_cache_entry_release(ce); -+ } -+ return error; -+} -+ -+/* -+ * ext3_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. -+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. 
-+ */ -+static int -+ext3_xattr_cmp(struct ext3_xattr_header *header1, -+ struct ext3_xattr_header *header2) -+{ -+ struct ext3_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT3_XATTR_NEXT(entry1); -+ entry2 = EXT3_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext3_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. -+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. 
-+ */ -+static struct buffer_head * -+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash); -+ while (ce) { -+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); -+ -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_cache_find", -+ "inode %ld: block %ld read error", -+ inode->i_ino, ce->e_block); -+ } else if (le32_to_cpu(HDR(bh)->h_refcount) > -+ EXT3_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, -+ le32_to_cpu(HDR(bh)->h_refcount), -+ EXT3_XATTR_REFCOUNT_MAX); -+ } else if (!ext3_xattr_cmp(header, HDR(bh))) { -+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); -+ mb_cache_entry_release(ce); -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); -+ } -+ return NULL; -+} -+ -+/* -+ * ext3_xattr_cache_remove() -+ * -+ * Remove the cache entry of a block from the cache. Called when a -+ * block becomes invalid. -+ */ -+static void -+ext3_xattr_cache_remove(struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce; -+ -+ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr); -+ if (ce) { -+ ea_bdebug(bh, "removing (%d cache entries remaining)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)-1); -+ mb_cache_entry_free(ce); -+ } else -+ ea_bdebug(bh, "no cache entry"); -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __u32 *value = (__u32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext3_xattr_rehash(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ struct ext3_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext3_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT3_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext3_xattr(void) -+{ -+ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(struct mb_cache_entry_index), 1, 61); -+ if (!ext3_xattr_cache) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+exit_ext3_xattr(void) -+{ -+ if (ext3_xattr_cache) -+ mb_cache_destroy(ext3_xattr_cache); -+ ext3_xattr_cache = NULL; -+} -+ -+#else /* CONFIG_EXT3_FS_XATTR_SHARING */ -+ 
-+int __init -+init_ext3_xattr(void) -+{ -+ return 0; -+} -+ -+void -+exit_ext3_xattr(void) -+{ -+} -+ -+#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ -Index: linux-DRV401/fs/ext3/xattr_user.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/xattr_user.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,111 @@ -+/* -+ * linux/fs/ext3/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+# include -+#endif -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext3_xattr_user_list(char *list, struct inode *inode, -+ const char *name, int name_len) -+{ -+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return prefix_len + name_len + 1; -+} -+ -+static int -+ext3_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_READ); -+#else -+ error = permission(inode, MAY_READ); -+#endif -+ if (error) -+ return error; -+ -+ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, -+ buffer, size); -+} -+ -+static int -+ext3_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ handle_t *handle; -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+ if ( !S_ISREG(inode->i_mode) && -+ (!S_ISDIR(inode->i_mode) 
|| inode->i_mode & S_ISVTX)) -+ return -EPERM; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_WRITE); -+#else -+ error = permission(inode, MAY_WRITE); -+#endif -+ if (error) -+ return error; -+ -+ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, -+ value, size, flags); -+ ext3_journal_stop(handle, inode); -+ -+ return error; -+} -+ -+struct ext3_xattr_handler ext3_xattr_user_handler = { -+ prefix: XATTR_USER_PREFIX, -+ list: ext3_xattr_user_list, -+ get: ext3_xattr_user_get, -+ set: ext3_xattr_user_set, -+}; -+ -+int __init -+init_ext3_xattr_user(void) -+{ -+ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, -+ &ext3_xattr_user_handler); -+} -+ -+void -+exit_ext3_xattr_user(void) -+{ -+ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, -+ &ext3_xattr_user_handler); -+} -Index: linux-DRV401/fs/ext3/ext3-exports.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/ext3-exports.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/ext3-exports.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); -Index: linux-DRV401/fs/mbcache.c -=================================================================== ---- linux-DRV401.orig/fs/mbcache.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/mbcache.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,648 @@ -+/* -+ * linux/fs/mbcache.c -+ * (C) 2001-2002 Andreas Gruenbacher, -+ */ -+ -+/* -+ * Filesystem Meta Information Block Cache (mbcache) -+ * -+ * The mbcache caches blocks of block devices that need to be 
located -+ * by their device/block number, as well as by other criteria (such -+ * as the block's contents). -+ * -+ * There can only be one cache entry in a cache per device and block number. -+ * Additional indexes need not be unique in this sense. The number of -+ * additional indexes (=other criteria) can be hardwired at compile time -+ * or specified at cache create time. -+ * -+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' -+ * in the cache. A valid entry is in the main hash tables of the cache, -+ * and may also be in the lru list. An invalid entry is not in any hashes -+ * or lists. -+ * -+ * A valid cache entry is only in the lru list if no handles refer to it. -+ * Invalid cache entries will be freed when the last handle to the cache -+ * entry is released. Entries that cannot be freed immediately are put -+ * back on the lru list. -+ */ -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+#ifdef MB_CACHE_DEBUG -+# define mb_debug(f...) do { \ -+ printk(KERN_DEBUG f); \ -+ printk("\n"); \ -+ } while (0) -+#define mb_assert(c) do { if (!(c)) \ -+ printk(KERN_ERR "assertion " #c " failed\n"); \ -+ } while(0) -+#else -+# define mb_debug(f...) do { } while(0) -+# define mb_assert(c) do { } while(0) -+#endif -+#define mb_error(f...) 
do { \ -+ printk(KERN_ERR f); \ -+ printk("\n"); \ -+ } while(0) -+ -+MODULE_AUTHOR("Andreas Gruenbacher "); -+MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) -+MODULE_LICENSE("GPL"); -+#endif -+ -+EXPORT_SYMBOL(mb_cache_create); -+EXPORT_SYMBOL(mb_cache_shrink); -+EXPORT_SYMBOL(mb_cache_destroy); -+EXPORT_SYMBOL(mb_cache_entry_alloc); -+EXPORT_SYMBOL(mb_cache_entry_insert); -+EXPORT_SYMBOL(mb_cache_entry_release); -+EXPORT_SYMBOL(mb_cache_entry_takeout); -+EXPORT_SYMBOL(mb_cache_entry_free); -+EXPORT_SYMBOL(mb_cache_entry_dup); -+EXPORT_SYMBOL(mb_cache_entry_get); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+EXPORT_SYMBOL(mb_cache_entry_find_first); -+EXPORT_SYMBOL(mb_cache_entry_find_next); -+#endif -+ -+ -+/* -+ * Global data: list of all mbcache's, lru list, and a spinlock for -+ * accessing cache data structures on SMP machines. The lru list is -+ * global across all mbcaches. -+ */ -+ -+static LIST_HEAD(mb_cache_list); -+static LIST_HEAD(mb_cache_lru_list); -+static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; -+ -+static inline int -+mb_cache_indexes(struct mb_cache *cache) -+{ -+#ifdef MB_CACHE_INDEXES_COUNT -+ return MB_CACHE_INDEXES_COUNT; -+#else -+ return cache->c_indexes_count; -+#endif -+} -+ -+/* -+ * What the mbcache registers as to get shrunk dynamically. 
-+ */ -+ -+static void -+mb_cache_memory_pressure(int priority, unsigned int gfp_mask); -+ -+static struct cache_definition mb_cache_definition = { -+ "mb_cache", -+ mb_cache_memory_pressure -+}; -+ -+ -+static inline int -+__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) -+{ -+ return !list_empty(&ce->e_block_list); -+} -+ -+ -+static inline void -+__mb_cache_entry_unhash(struct mb_cache_entry *ce) -+{ -+ int n; -+ -+ if (__mb_cache_entry_is_hashed(ce)) { -+ list_del_init(&ce->e_block_list); -+ for (n=0; ne_cache); n++) -+ list_del(&ce->e_indexes[n].o_list); -+ } -+} -+ -+ -+static inline void -+__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ -+ mb_assert(atomic_read(&ce->e_used) == 0); -+ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { -+ /* free failed -- put back on the lru list -+ for freeing later. */ -+ spin_lock(&mb_cache_spinlock); -+ list_add(&ce->e_lru_list, &mb_cache_lru_list); -+ spin_unlock(&mb_cache_spinlock); -+ } else { -+ kmem_cache_free(cache->c_entry_cache, ce); -+ atomic_dec(&cache->c_entry_count); -+ } -+} -+ -+ -+static inline void -+__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) -+{ -+ if (atomic_dec_and_test(&ce->e_used)) { -+ if (__mb_cache_entry_is_hashed(ce)) -+ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); -+ else { -+ spin_unlock(&mb_cache_spinlock); -+ __mb_cache_entry_forget(ce, GFP_KERNEL); -+ return; -+ } -+ } -+ spin_unlock(&mb_cache_spinlock); -+} -+ -+ -+/* -+ * mb_cache_memory_pressure() memory pressure callback -+ * -+ * This function is called by the kernel memory management when memory -+ * gets low. 
-+ * -+ * @priority: Amount by which to shrink the cache (0 = highes priority) -+ * @gfp_mask: (ignored) -+ */ -+static void -+mb_cache_memory_pressure(int priority, unsigned int gfp_mask) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ int count = 0; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &mb_cache_list) { -+ struct mb_cache *cache = -+ list_entry(l, struct mb_cache, c_cache_list); -+ mb_debug("cache %s (%d)", cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ count += atomic_read(&cache->c_entry_count); -+ } -+ mb_debug("trying to free %d of %d entries", -+ count / (priority ? priority : 1), count); -+ if (priority) -+ count /= priority; -+ while (count-- && !list_empty(&mb_cache_lru_list)) { -+ struct mb_cache_entry *ce = -+ list_entry(mb_cache_lru_list.next, -+ struct mb_cache_entry, e_lru_list); -+ list_del(&ce->e_lru_list); -+ __mb_cache_entry_unhash(ce); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ } -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), gfp_mask); -+ } -+} -+ -+ -+/* -+ * mb_cache_create() create a new cache -+ * -+ * All entries in one cache are equal size. Cache entries may be from -+ * multiple devices. If this is the first mbcache created, registers -+ * the cache with kernel memory management. Returns NULL if no more -+ * memory was available. -+ * -+ * @name: name of the cache (informal) -+ * @cache_op: contains the callback called when freeing a cache entry -+ * @entry_size: The size of a cache entry, including -+ * struct mb_cache_entry -+ * @indexes_count: number of additional indexes in the cache. Must equal -+ * MB_CACHE_INDEXES_COUNT if the number of indexes is -+ * hardwired. 
-+ * @bucket_count: number of hash buckets -+ */ -+struct mb_cache * -+mb_cache_create(const char *name, struct mb_cache_op *cache_op, -+ size_t entry_size, int indexes_count, int bucket_count) -+{ -+ int m=0, n; -+ struct mb_cache *cache = NULL; -+ -+ if(entry_size < sizeof(struct mb_cache_entry) + -+ indexes_count * sizeof(struct mb_cache_entry_index)) -+ return NULL; -+ -+ MOD_INC_USE_COUNT; -+ cache = kmalloc(sizeof(struct mb_cache) + -+ indexes_count * sizeof(struct list_head), GFP_KERNEL); -+ if (!cache) -+ goto fail; -+ cache->c_name = name; -+ cache->c_op.free = NULL; -+ if (cache_op) -+ cache->c_op.free = cache_op->free; -+ atomic_set(&cache->c_entry_count, 0); -+ cache->c_bucket_count = bucket_count; -+#ifdef MB_CACHE_INDEXES_COUNT -+ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); -+#else -+ cache->c_indexes_count = indexes_count; -+#endif -+ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_block_hash) -+ goto fail; -+ for (n=0; nc_block_hash[n]); -+ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * -+ sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_indexes_hash[m]) -+ goto fail; -+ for (n=0; nc_indexes_hash[m][n]); -+ } -+ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, -+ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); -+ if (!cache->c_entry_cache) -+ goto fail; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_add(&cache->c_cache_list, &mb_cache_list); -+ spin_unlock(&mb_cache_spinlock); -+ return cache; -+ -+fail: -+ if (cache) { -+ while (--m >= 0) -+ kfree(cache->c_indexes_hash[m]); -+ if (cache->c_block_hash) -+ kfree(cache->c_block_hash); -+ kfree(cache); -+ } -+ MOD_DEC_USE_COUNT; -+ return NULL; -+} -+ -+ -+/* -+ * mb_cache_shrink() -+ * -+ * Removes all cache entires of a device from the cache. All cache entries -+ * currently in use cannot be freed, and thus remain in the cache. 
-+ * -+ * @cache: which cache to shrink -+ * @dev: which device's cache entries to shrink -+ */ -+void -+mb_cache_shrink(struct mb_cache *cache, kdev_t dev) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ if (ce->e_dev == dev) { -+ list_del(&ce->e_lru_list); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ __mb_cache_entry_unhash(ce); -+ } -+ } -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), GFP_KERNEL); -+ } -+} -+ -+ -+/* -+ * mb_cache_destroy() -+ * -+ * Shrinks the cache to its minimum possible size (hopefully 0 entries), -+ * and then destroys it. If this was the last mbcache, un-registers the -+ * mbcache from kernel memory management. -+ */ -+void -+mb_cache_destroy(struct mb_cache *cache) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ int n; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ if (ce->e_cache == cache) { -+ list_del(&ce->e_lru_list); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ __mb_cache_entry_unhash(ce); -+ } -+ } -+ list_del(&cache->c_cache_list); -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), GFP_KERNEL); -+ } -+ -+ if (atomic_read(&cache->c_entry_count) > 0) { -+ mb_error("cache %s: %d orphaned entries", -+ cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ } -+ -+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0)) -+ /* We don't have kmem_cache_destroy() in 2.2.x */ -+ kmem_cache_shrink(cache->c_entry_cache); -+#else -+ kmem_cache_destroy(cache->c_entry_cache); -+#endif -+ for 
(n=0; n < mb_cache_indexes(cache); n++) -+ kfree(cache->c_indexes_hash[n]); -+ kfree(cache->c_block_hash); -+ kfree(cache); -+ -+ MOD_DEC_USE_COUNT; -+} -+ -+ -+/* -+ * mb_cache_entry_alloc() -+ * -+ * Allocates a new cache entry. The new entry will not be valid initially, -+ * and thus cannot be looked up yet. It should be filled with data, and -+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL -+ * if no more memory was available. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_alloc(struct mb_cache *cache) -+{ -+ struct mb_cache_entry *ce; -+ -+ atomic_inc(&cache->c_entry_count); -+ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); -+ if (ce) { -+ INIT_LIST_HEAD(&ce->e_lru_list); -+ INIT_LIST_HEAD(&ce->e_block_list); -+ ce->e_cache = cache; -+ atomic_set(&ce->e_used, 1); -+ } -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_insert() -+ * -+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into -+ * the cache. After this, the cache entry can be looked up, but is not yet -+ * in the lru list as the caller still holds a handle to it. Returns 0 on -+ * success, or -EBUSY if a cache entry for that device + inode exists -+ * already (this may happen after a failed lookup, if another process has -+ * inserted the same cache entry in the meantime). -+ * -+ * @dev: device the cache entry belongs to -+ * @block: block number -+ * @keys: array of additional keys. There must be indexes_count entries -+ * in the array (as specified when creating the cache). 
-+ */ -+int -+mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev, -+ unsigned long block, unsigned int keys[]) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; -+ struct list_head *l; -+ int error = -EBUSY, n; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &cache->c_block_hash[bucket]) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_dev == dev && ce->e_block == block) -+ goto out; -+ } -+ __mb_cache_entry_unhash(ce); -+ ce->e_dev = dev; -+ ce->e_block = block; -+ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); -+ for (n=0; ne_indexes[n].o_key = keys[n]; -+ bucket = keys[n] % cache->c_bucket_count; -+ list_add(&ce->e_indexes[n].o_list, -+ &cache->c_indexes_hash[n][bucket]); -+ } -+out: -+ spin_unlock(&mb_cache_spinlock); -+ return error; -+} -+ -+ -+/* -+ * mb_cache_entry_release() -+ * -+ * Release a handle to a cache entry. When the last handle to a cache entry -+ * is released it is either freed (if it is invalid) or otherwise inserted -+ * in to the lru list. -+ */ -+void -+mb_cache_entry_release(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_takeout() -+ * -+ * Take a cache entry out of the cache, making it invalid. The entry can later -+ * be re-inserted using mb_cache_entry_insert(), or released using -+ * mb_cache_entry_release(). -+ */ -+void -+mb_cache_entry_takeout(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(list_empty(&ce->e_lru_list)); -+ __mb_cache_entry_unhash(ce); -+ spin_unlock(&mb_cache_spinlock); -+} -+ -+ -+/* -+ * mb_cache_entry_free() -+ * -+ * This is equivalent to the sequence mb_cache_entry_takeout() -- -+ * mb_cache_entry_release(). 
-+ */ -+void -+mb_cache_entry_free(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(list_empty(&ce->e_lru_list)); -+ __mb_cache_entry_unhash(ce); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_dup() -+ * -+ * Duplicate a handle to a cache entry (does not duplicate the cache entry -+ * itself). After the call, both the old and the new handle must be released. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_dup(struct mb_cache_entry *ce) -+{ -+ atomic_inc(&ce->e_used); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_get() -+ * -+ * Get a cache entry by device / block number. (There can only be one entry -+ * in the cache per device and block.) Returns NULL if no such cache entry -+ * exists. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) -+{ -+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &cache->c_block_hash[bucket]) { -+ ce = list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_dev == dev && ce->e_block == block) { -+ if (!list_empty(&ce->e_lru_list)) -+ list_del_init(&ce->e_lru_list); -+ atomic_inc(&ce->e_used); -+ goto cleanup; -+ } -+ } -+ ce = NULL; -+ -+cleanup: -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+ -+static struct mb_cache_entry * -+__mb_cache_entry_find(struct list_head *l, struct list_head *head, -+ int index, kdev_t dev, unsigned int key) -+{ -+ while (l != head) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, -+ e_indexes[index].o_list); -+ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { -+ if (!list_empty(&ce->e_lru_list)) -+ list_del_init(&ce->e_lru_list); -+ atomic_inc(&ce->e_used); -+ return ce; -+ } -+ l = l->next; -+ } -+ return NULL; -+} -+ -+ -+/* -+ * 
mb_cache_entry_find_first() -+ * -+ * Find the first cache entry on a given device with a certain key in -+ * an additional index. Additonal matches can be found with -+ * mb_cache_entry_find_next(). Returns NULL if no match was found. -+ * -+ * @cache: the cache to search -+ * @index: the number of the additonal index to search (0<=indexc_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = cache->c_indexes_hash[index][bucket].next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, dev, key); -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_find_next() -+ * -+ * Find the next cache entry on a given device with a certain key in an -+ * additional index. Returns NULL if no match could be found. The previous -+ * entry is atomatically released, so that mb_cache_entry_find_next() can -+ * be called like this: -+ * -+ * entry = mb_cache_entry_find_first(); -+ * while (entry) { -+ * ... 
-+ * entry = mb_cache_entry_find_next(entry, ...); -+ * } -+ * -+ * @prev: The previous match -+ * @index: the number of the additonal index to search (0<=indexe_cache; -+ unsigned int bucket = key % cache->c_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = prev->e_indexes[index].o_list.next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, dev, key); -+ __mb_cache_entry_release_unlock(prev); -+ return ce; -+} -+ -+#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ -+ -+static int __init init_mbcache(void) -+{ -+ register_cache(&mb_cache_definition); -+ return 0; -+} -+ -+static void __exit exit_mbcache(void) -+{ -+ unregister_cache(&mb_cache_definition); -+} -+ -+module_init(init_mbcache) -+module_exit(exit_mbcache) -+ -Index: linux-DRV401/fs/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/xattr.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,355 @@ -+/* -+ File: fs/xattr.c -+ -+ Extended attribute handling. -+ -+ Copyright (C) 2001 by Andreas Gruenbacher -+ Copyright (C) 2001 SGI - Silicon Graphics, Inc -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * Extended attribute memory allocation wrappers, originally -+ * based on the Intermezzo PRESTO_ALLOC/PRESTO_FREE macros. -+ * The vmalloc use here is very uncommon - extended attributes -+ * are supposed to be small chunks of metadata, and it is quite -+ * unusual to have very many extended attributes, so lists tend -+ * to be quite short as well. The 64K upper limit is derived -+ * from the extended attribute size limit used by XFS. -+ * Intentionally allow zero @size for value/list size requests. 
-+ */ -+static void * -+xattr_alloc(size_t size, size_t limit) -+{ -+ void *ptr; -+ -+ if (size > limit) -+ return ERR_PTR(-E2BIG); -+ -+ if (!size) /* size request, no buffer is needed */ -+ return NULL; -+ else if (size <= PAGE_SIZE) -+ ptr = kmalloc((unsigned long) size, GFP_KERNEL); -+ else -+ ptr = vmalloc((unsigned long) size); -+ if (!ptr) -+ return ERR_PTR(-ENOMEM); -+ return ptr; -+} -+ -+static void -+xattr_free(void *ptr, size_t size) -+{ -+ if (!size) /* size request, no buffer was needed */ -+ return; -+ else if (size <= PAGE_SIZE) -+ kfree(ptr); -+ else -+ vfree(ptr); -+} -+ -+/* -+ * Extended attribute SET operations -+ */ -+static long -+setxattr(struct dentry *d, char *name, void *value, size_t size, int flags) -+{ -+ int error; -+ void *kvalue; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) -+ return -EINVAL; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ return error; -+ -+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX); -+ if (IS_ERR(kvalue)) -+ return PTR_ERR(kvalue); -+ -+ if (size > 0 && copy_from_user(kvalue, value, size)) { -+ xattr_free(kvalue, size); -+ return -EFAULT; -+ } -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ xattr_free(kvalue, size); -+ return error; -+} -+ -+asmlinkage long -+sys_setxattr(char *path, char *name, void *value, size_t size, int flags) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = setxattr(nd.dentry, name, value, size, flags); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_lsetxattr(char *path, char *name, void *value, size_t size, int flags) -+{ -+ struct nameidata nd; -+ 
int error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = setxattr(nd.dentry, name, value, size, flags); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_fsetxattr(int fd, char *name, void *value, size_t size, int flags) -+{ -+ struct file *f; -+ int error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = setxattr(f->f_dentry, name, value, size, flags); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute GET operations -+ */ -+static ssize_t -+getxattr(struct dentry *d, char *name, void *value, size_t size) -+{ -+ ssize_t error; -+ void *kvalue; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ return error; -+ -+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX); -+ if (IS_ERR(kvalue)) -+ return PTR_ERR(kvalue); -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->getxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ if (kvalue && error > 0) -+ if (copy_to_user(value, kvalue, error)) -+ error = -EFAULT; -+ xattr_free(kvalue, size); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_getxattr(char *path, char *name, void *value, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = getxattr(nd.dentry, name, value, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_lgetxattr(char *path, char *name, void *value, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = getxattr(nd.dentry, name, value, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_fgetxattr(int fd, char *name, 
void *value, size_t size) -+{ -+ struct file *f; -+ ssize_t error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = getxattr(f->f_dentry, name, value, size); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute LIST operations -+ */ -+static ssize_t -+listxattr(struct dentry *d, char *list, size_t size) -+{ -+ ssize_t error; -+ char *klist; -+ -+ klist = (char *)xattr_alloc(size, XATTR_LIST_MAX); -+ if (IS_ERR(klist)) -+ return PTR_ERR(klist); -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->listxattr(d, klist, size); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ if (klist && error > 0) -+ if (copy_to_user(list, klist, error)) -+ error = -EFAULT; -+ xattr_free(klist, size); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_listxattr(char *path, char *list, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = listxattr(nd.dentry, list, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_llistxattr(char *path, char *list, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = listxattr(nd.dentry, list, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_flistxattr(int fd, char *list, size_t size) -+{ -+ struct file *f; -+ ssize_t error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = listxattr(f->f_dentry, list, size); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute REMOVE operations -+ */ -+static long -+removexattr(struct dentry *d, char *name) -+{ -+ int error; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ 
return error; -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->removexattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->removexattr(d, kname); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ return error; -+} -+ -+asmlinkage long -+sys_removexattr(char *path, char *name) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = removexattr(nd.dentry, name); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_lremovexattr(char *path, char *name) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = removexattr(nd.dentry, name); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_fremovexattr(int fd, char *name) -+{ -+ struct file *f; -+ int error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = removexattr(f->f_dentry, name); -+ fput(f); -+ return error; -+} -Index: linux-DRV401/include/linux/cache_def.h -=================================================================== ---- linux-DRV401.orig/include/linux/cache_def.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/cache_def.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ * linux/cache_def.h -+ * Handling of caches defined in drivers, filesystems, ... 
-+ * -+ * Copyright (C) 2002 by Andreas Gruenbacher, -+ */ -+ -+struct cache_definition { -+ const char *name; -+ void (*shrink)(int, unsigned int); -+ struct list_head link; -+}; -+ -+extern void register_cache(struct cache_definition *); -+extern void unregister_cache(struct cache_definition *); -Index: linux-DRV401/include/linux/errno.h -=================================================================== ---- linux-DRV401.orig/include/linux/errno.h 2004-10-15 10:26:15.000000000 -0700 -+++ linux-DRV401/include/linux/errno.h 2004-10-15 11:03:52.000000000 -0700 -@@ -23,4 +23,8 @@ - - #endif - -+/* Defined for extended attributes */ -+#define ENOATTR ENODATA /* No such attribute */ -+#define ENOTSUP EOPNOTSUPP /* Operation not supported */ -+ - #endif -Index: linux-DRV401/include/linux/ext2_fs.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext2_fs.h 2004-10-15 10:26:11.000000000 -0700 -+++ linux-DRV401/include/linux/ext2_fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -57,8 +57,6 @@ - */ - #define EXT2_BAD_INO 1 /* Bad blocks inode */ - #define EXT2_ROOT_INO 2 /* Root inode */ --#define EXT2_ACL_IDX_INO 3 /* ACL inode */ --#define EXT2_ACL_DATA_INO 4 /* ACL inode */ - #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ - #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ - -@@ -86,7 +84,6 @@ - #else - # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) - #endif --#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) - #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) - #ifdef __KERNEL__ - # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -121,28 +118,6 @@ - #endif - - /* -- * ACL structures -- */ --struct ext2_acl_header /* Header of Access Control Lists */ --{ -- __u32 aclh_size; -- __u32 aclh_file_count; -- __u32 aclh_acle_count; -- __u32 aclh_first_acle; --}; -- --struct ext2_acl_entry /* Access Control List 
Entry */ --{ -- __u32 acle_size; -- __u16 acle_perms; /* Access permissions */ -- __u16 acle_type; /* Type of entry */ -- __u16 acle_tag; /* User or group identity */ -- __u16 acle_pad1; -- __u32 acle_next; /* Pointer on next entry for the */ -- /* same inode or on next free entry */ --}; -- --/* - * Structure of a blocks group descriptor - */ - struct ext2_group_desc -@@ -314,6 +289,7 @@ - #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ - #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ - #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ -+#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - - #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt - #define set_opt(o, opt) o |= EXT2_MOUNT_##opt -@@ -397,6 +373,7 @@ - - #ifdef __KERNEL__ - #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) -+#define EXT2_I(inode) (&((inode)->u.ext2_i)) - #else - /* Assume that user mode programs are passing in an ext2fs superblock, not - * a kernel struct super_block. 
This will allow us to call the feature-test -@@ -466,7 +443,7 @@ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 - #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff - --#define EXT2_FEATURE_COMPAT_SUPP 0 -+#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE - #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ -@@ -623,8 +600,10 @@ - - /* namei.c */ - extern struct inode_operations ext2_dir_inode_operations; -+extern struct inode_operations ext2_special_inode_operations; - - /* symlink.c */ -+extern struct inode_operations ext2_symlink_inode_operations; - extern struct inode_operations ext2_fast_symlink_inode_operations; - - #endif /* __KERNEL__ */ -Index: linux-DRV401/include/linux/ext2_xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext2_xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/ext2_xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,157 @@ -+/* -+ File: linux/ext2_xattr.h -+ -+ On-disk format of extended attributes for the ext2 filesystem. 
-+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+#include -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT2_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT2_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT2_XATTR_INDEX_MAX 10 -+#define EXT2_XATTR_INDEX_USER 1 -+#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+ -+struct ext2_xattr_header { -+ __u32 h_magic; /* magic number for identification */ -+ __u32 h_refcount; /* reference count */ -+ __u32 h_blocks; /* number of disk blocks used */ -+ __u32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext2_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __u16 e_value_offs; /* offset in disk block of value */ -+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __u32 e_value_size; /* size of attribute value */ -+ __u32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+#define EXT2_XATTR_PAD_BITS 2 -+#define EXT2_XATTR_PAD (1<e_name_len)) ) -+#define EXT2_XATTR_SIZE(size) \ -+ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) -+ -+#ifdef __KERNEL__ -+ -+# ifdef CONFIG_EXT2_FS_XATTR -+ -+struct ext2_xattr_handler { -+ char *prefix; -+ size_t (*list)(char *list, struct inode *inode, const char *name, -+ int name_len); -+ int (*get)(struct inode *inode, const char *name, void *buffer, -+ size_t size); -+ int (*set)(struct inode *inode, const char *name, const void *buffer, -+ size_t size, int flags); -+}; -+ -+extern int ext2_xattr_register(int, struct ext2_xattr_handler *); -+extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); -+ -+extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int); -+extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); -+extern 
ssize_t ext2_listxattr(struct dentry *, char *, size_t); -+extern int ext2_removexattr(struct dentry *, const char *); -+ -+extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext2_xattr_list(struct inode *, char *, size_t); -+extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext2_xattr_delete_inode(struct inode *); -+extern void ext2_xattr_put_super(struct super_block *); -+ -+extern int init_ext2_xattr(void) __init; -+extern void exit_ext2_xattr(void); -+ -+# else /* CONFIG_EXT2_FS_XATTR */ -+# define ext2_setxattr NULL -+# define ext2_getxattr NULL -+# define ext2_listxattr NULL -+# define ext2_removexattr NULL -+ -+static inline int -+ext2_xattr_get(struct inode *inode, int name_index, -+ const char *name, void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext2_xattr_list(struct inode *inode, char *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext2_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return -ENOTSUP; -+} -+ -+static inline void -+ext2_xattr_delete_inode(struct inode *inode) -+{ -+} -+ -+static inline void -+ext2_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext2_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext2_xattr(void) -+{ -+} -+ -+# endif /* CONFIG_EXT2_FS_XATTR */ -+ -+# ifdef CONFIG_EXT2_FS_XATTR_USER -+ -+extern int init_ext2_xattr_user(void) __init; -+extern void exit_ext2_xattr_user(void); -+ -+# else /* CONFIG_EXT2_FS_XATTR_USER */ -+ -+static inline int -+init_ext2_xattr_user(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext2_xattr_user(void) -+{ -+} -+ -+# endif /* CONFIG_EXT2_FS_XATTR_USER */ -+ -+#endif /* __KERNEL__ */ -+ -Index: linux-DRV401/include/linux/ext3_fs.h -=================================================================== ---- 
linux-DRV401.orig/include/linux/ext3_fs.h 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/include/linux/ext3_fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -63,8 +63,6 @@ - */ - #define EXT3_BAD_INO 1 /* Bad blocks inode */ - #define EXT3_ROOT_INO 2 /* Root inode */ --#define EXT3_ACL_IDX_INO 3 /* ACL inode */ --#define EXT3_ACL_DATA_INO 4 /* ACL inode */ - #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ - #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ - #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ -@@ -94,7 +92,6 @@ - #else - # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) - #endif --#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) - #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) - #ifdef __KERNEL__ - # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -129,28 +126,6 @@ - #endif - - /* -- * ACL structures -- */ --struct ext3_acl_header /* Header of Access Control Lists */ --{ -- __u32 aclh_size; -- __u32 aclh_file_count; -- __u32 aclh_acle_count; -- __u32 aclh_first_acle; --}; -- --struct ext3_acl_entry /* Access Control List Entry */ --{ -- __u32 acle_size; -- __u16 acle_perms; /* Access permissions */ -- __u16 acle_type; /* Type of entry */ -- __u16 acle_tag; /* User or group identity */ -- __u16 acle_pad1; -- __u32 acle_next; /* Pointer on next entry for the */ -- /* same inode or on next free entry */ --}; -- --/* - * Structure of a blocks group descriptor - */ - struct ext3_group_desc -@@ -344,6 +319,7 @@ - #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ - #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ -+#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -520,7 +496,7 @@ - #define 
EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - --#define EXT3_FEATURE_COMPAT_SUPP 0 -+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ -@@ -703,6 +679,7 @@ - extern unsigned long ext3_count_free (struct buffer_head *, unsigned); - - /* inode.c */ -+extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); - -@@ -771,8 +748,10 @@ - - /* namei.c */ - extern struct inode_operations ext3_dir_inode_operations; -+extern struct inode_operations ext3_special_inode_operations; - - /* symlink.c */ -+extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - - -Index: linux-DRV401/include/linux/ext3_jbd.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext3_jbd.h 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/include/linux/ext3_jbd.h 2004-10-15 11:03:52.000000000 -0700 -@@ -30,13 +30,19 @@ - - #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 - -+/* Extended attributes may touch two data buffers, two bitmap buffers, -+ * and two group and summaries. */ -+ -+#define EXT3_XATTR_TRANS_BLOCKS 8 -+ - /* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. 
*/ - --#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) -+#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ -+ EXT3_XATTR_TRANS_BLOCKS - 2) - - extern int ext3_writepage_trans_blocks(struct inode *inode); - -Index: linux-DRV401/include/linux/ext3_xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext3_xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/ext3_xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,157 @@ -+/* -+ File: linux/ext3_xattr.h -+ -+ On-disk format of extended attributes for the ext3 filesystem. -+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+#include -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT3_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT3_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT3_XATTR_INDEX_MAX 10 -+#define EXT3_XATTR_INDEX_USER 1 -+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+ -+struct ext3_xattr_header { -+ __u32 h_magic; /* magic number for identification */ -+ __u32 h_refcount; /* reference count */ -+ __u32 h_blocks; /* number of disk blocks used */ -+ __u32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext3_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __u16 e_value_offs; /* offset in disk block of value */ -+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __u32 e_value_size; /* size of attribute value */ -+ __u32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+#define EXT3_XATTR_PAD_BITS 2 -+#define EXT3_XATTR_PAD (1<e_name_len)) ) -+#define EXT3_XATTR_SIZE(size) \ -+ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) -+ -+#ifdef __KERNEL__ -+ -+# ifdef 
CONFIG_EXT3_FS_XATTR -+ -+struct ext3_xattr_handler { -+ char *prefix; -+ size_t (*list)(char *list, struct inode *inode, const char *name, -+ int name_len); -+ int (*get)(struct inode *inode, const char *name, void *buffer, -+ size_t size); -+ int (*set)(struct inode *inode, const char *name, const void *buffer, -+ size_t size, int flags); -+}; -+ -+extern int ext3_xattr_register(int, struct ext3_xattr_handler *); -+extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); -+ -+extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int); -+extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); -+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); -+extern int ext3_removexattr(struct dentry *, const char *); -+ -+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext3_xattr_list(struct inode *, char *, size_t); -+extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext3_xattr_delete_inode(handle_t *, struct inode *); -+extern void ext3_xattr_put_super(struct super_block *); -+ -+extern int init_ext3_xattr(void) __init; -+extern void exit_ext3_xattr(void); -+ -+# else /* CONFIG_EXT3_FS_XATTR */ -+# define ext3_setxattr NULL -+# define ext3_getxattr NULL -+# define ext3_listxattr NULL -+# define ext3_removexattr NULL -+ -+static inline int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext3_xattr_list(struct inode *inode, void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t size, int flags) -+{ -+ return -ENOTSUP; -+} -+ -+static inline void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+} -+ -+static inline void 
-+ext3_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext3_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext3_xattr(void) -+{ -+} -+ -+# endif /* CONFIG_EXT3_FS_XATTR */ -+ -+# ifdef CONFIG_EXT3_FS_XATTR_USER -+ -+extern int init_ext3_xattr_user(void) __init; -+extern void exit_ext3_xattr_user(void); -+ -+# else /* CONFIG_EXT3_FS_XATTR_USER */ -+ -+static inline int -+init_ext3_xattr_user(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext3_xattr_user(void) -+{ -+} -+ -+#endif /* CONFIG_EXT3_FS_XATTR_USER */ -+ -+#endif /* __KERNEL__ */ -+ -Index: linux-DRV401/include/linux/fs.h -=================================================================== ---- linux-DRV401.orig/include/linux/fs.h 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/include/linux/fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -936,6 +936,10 @@ - int (*setattr) (struct dentry *, struct iattr *); - int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); -+ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); -+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -+ ssize_t (*listxattr) (struct dentry *, char *, size_t); -+ int (*removexattr) (struct dentry *, const char *); - }; - - struct seq_file; -Index: linux-DRV401/include/linux/mbcache.h -=================================================================== ---- linux-DRV401.orig/include/linux/mbcache.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/mbcache.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,69 @@ -+/* -+ File: linux/mbcache.h -+ -+ (C) 2001 by Andreas Gruenbacher, -+*/ -+ -+/* Hardwire the number of additional indexes */ -+#define MB_CACHE_INDEXES_COUNT 1 -+ -+struct mb_cache_entry; -+ -+struct mb_cache_op { -+ int (*free)(struct mb_cache_entry *, int); -+}; -+ -+struct mb_cache { -+ struct list_head c_cache_list; -+ const char *c_name; -+ struct mb_cache_op 
c_op; -+ atomic_t c_entry_count; -+ int c_bucket_count; -+#ifndef MB_CACHE_INDEXES_COUNT -+ int c_indexes_count; -+#endif -+ kmem_cache_t *c_entry_cache; -+ struct list_head *c_block_hash; -+ struct list_head *c_indexes_hash[0]; -+}; -+ -+struct mb_cache_entry_index { -+ struct list_head o_list; -+ unsigned int o_key; -+}; -+ -+struct mb_cache_entry { -+ struct list_head e_lru_list; -+ struct mb_cache *e_cache; -+ atomic_t e_used; -+ kdev_t e_dev; -+ unsigned long e_block; -+ struct list_head e_block_list; -+ struct mb_cache_entry_index e_indexes[0]; -+}; -+ -+/* Functions on caches */ -+ -+struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, -+ int, int); -+void mb_cache_shrink(struct mb_cache *, kdev_t); -+void mb_cache_destroy(struct mb_cache *); -+ -+/* Functions on cache entries */ -+ -+struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); -+int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, -+ unsigned int[]); -+void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); -+void mb_cache_entry_release(struct mb_cache_entry *); -+void mb_cache_entry_takeout(struct mb_cache_entry *); -+void mb_cache_entry_free(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, -+ unsigned long); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, -+ kdev_t, unsigned int); -+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, -+ kdev_t, unsigned int); -+#endif -Index: linux-DRV401/include/linux/xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ File: linux/xattr.h -+ -+ 
Extended attributes handling. -+ -+ Copyright (C) 2001 by Andreas Gruenbacher -+ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. -+*/ -+#ifndef _LINUX_XATTR_H -+#define _LINUX_XATTR_H -+ -+#define XATTR_CREATE 0x1 /* set the value, fail if attr already exists */ -+#define XATTR_REPLACE 0x2 /* set the value, fail if attr does not exist */ -+ -+#endif /* _LINUX_XATTR_H */ -Index: linux-DRV401/include/linux/limits.h -=================================================================== ---- linux-DRV401.orig/include/linux/limits.h 2004-10-15 10:26:20.000000000 -0700 -+++ linux-DRV401/include/linux/limits.h 2004-10-15 11:03:52.000000000 -0700 -@@ -13,6 +13,9 @@ - #define NAME_MAX 255 /* # chars in a file name */ - #define PATH_MAX 4096 /* # chars in a path name including nul */ - #define PIPE_BUF 4096 /* # bytes in atomic write to a pipe */ -+#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */ -+#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */ -+#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */ - - #define RTSIG_MAX 32 - -Index: linux-DRV401/kernel/ksyms.c -=================================================================== ---- linux-DRV401.orig/kernel/ksyms.c 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/kernel/ksyms.c 2004-10-15 11:03:52.000000000 -0700 -@@ -11,6 +11,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -88,6 +89,7 @@ - EXPORT_SYMBOL(exit_files); - EXPORT_SYMBOL(exit_fs); - EXPORT_SYMBOL(exit_sighand); -+EXPORT_SYMBOL(copy_fs_struct); - EXPORT_SYMBOL(unshare_files); - - /* internal kernel memory management */ -@@ -105,6 +107,8 @@ - EXPORT_SYMBOL(kmem_cache_shrink); - EXPORT_SYMBOL(kmem_cache_alloc); - EXPORT_SYMBOL(kmem_cache_free); -+EXPORT_SYMBOL(register_cache); -+EXPORT_SYMBOL(unregister_cache); - EXPORT_SYMBOL(kmalloc); - EXPORT_SYMBOL(kfree); - EXPORT_SYMBOL(vfree); -Index: linux-DRV401/mm/vmscan.c 
-=================================================================== ---- linux-DRV401.orig/mm/vmscan.c 2004-10-15 10:24:07.000000000 -0700 -+++ linux-DRV401/mm/vmscan.c 2004-10-15 11:08:53.000000000 -0700 -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -31,6 +32,39 @@ - */ - #define DEF_PRIORITY (6) - -+static DECLARE_MUTEX(other_caches_sem); -+static LIST_HEAD(cache_definitions); -+ -+void register_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_add(&cache->link, &cache_definitions); -+ up(&other_caches_sem); -+} -+ -+void unregister_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_del(&cache->link); -+ up(&other_caches_sem); -+} -+ -+static void shrink_other_caches(unsigned int priority, int gfp_mask) -+{ -+ struct list_head *p; -+ -+ if (down_trylock(&other_caches_sem)) -+ return; -+ -+ list_for_each_prev(p, &cache_definitions) { -+ struct cache_definition *cache = -+ list_entry(p, struct cache_definition, link); -+ -+ cache->shrink(priority, gfp_mask); -+ } -+ up(&other_caches_sem); -+} -+ - /* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). 
-@@ -584,6 +618,7 @@ - - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -+ shrink_other_caches(priority, gfp_mask); - #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); - #endif diff --git a/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch deleted file mode 100644 index 1becfbc..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch +++ /dev/null @@ -1,346 +0,0 @@ - Documentation/Configure.help | 66 ++ - arch/ia64/defconfig | 7 - fs/Config.in | 14 - fs/Makefile | 3 - fs/ext2/Makefile | 4 - fs/ext2/file.c | 5 - fs/ext2/ialloc.c | 2 - fs/ext2/inode.c | 34 - - fs/ext2/namei.c | 14 - fs/ext2/super.c | 29 - fs/ext2/symlink.c | 14 - fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ - fs/ext2/xattr_user.c | 103 +++ - fs/ext3/Makefile | 9 - fs/ext3/ext3-exports.c | 13 - fs/ext3/file.c | 5 - fs/ext3/ialloc.c | 2 - fs/ext3/inode.c | 35 - - fs/ext3/namei.c | 21 - fs/ext3/super.c | 36 + - fs/ext3/symlink.c | 14 - fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/xattr_user.c | 111 +++ - fs/jfs/jfs_xattr.h | 6 - fs/jfs/xattr.c | 6 - fs/mbcache.c | 648 ++++++++++++++++++++++ - include/linux/cache_def.h | 15 - include/linux/errno.h | 4 - include/linux/ext2_fs.h | 31 - - include/linux/ext2_xattr.h | 157 +++++ - include/linux/ext3_fs.h | 31 - - include/linux/ext3_jbd.h | 8 - include/linux/ext3_xattr.h | 157 +++++ - include/linux/fs.h | 2 - include/linux/mbcache.h | 69 ++ - kernel/ksyms.c | 4 - mm/vmscan.c | 35 + - 62 files changed, 4343 insertions(+), 182 deletions(-) - -Index: linux-2.4.19.SuSE/Documentation/Configure.help -=================================================================== ---- linux-2.4.19.SuSE.orig/Documentation/Configure.help 2004-05-03 11:20:17.000000000 -0700 -+++ linux-2.4.19.SuSE/Documentation/Configure.help 2004-05-03 11:50:22.000000000 
-0700 -@@ -15296,6 +15296,39 @@ - - If unsure, say N. - -+Ext2 extended attributes -+CONFIG_EXT2_FS_XATTR -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ -+Ext2 extended attribute block sharing -+CONFIG_EXT2_FS_XATTR_SHARING -+ This options enables code for sharing identical extended attribute -+ blocks among multiple inodes. -+ -+ Usually, say Y. -+ -+Ext2 extended user attributes -+CONFIG_EXT2_FS_XATTR_USER -+ This option enables extended user attributes on ext2. Processes can -+ associate extended user attributes with inodes to store additional -+ information such as the character encoding of files, etc. (see the -+ attr(5) manual page, or visit for details). -+ -+ If unsure, say N. -+ -+Ext2 trusted extended attributes -+CONFIG_EXT2_FS_XATTR_TRUSTED -+ This option enables extended attributes on ext2 that are accessible -+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this -+ is only the super user. Trusted extended attributes are meant for -+ implementing system/security services. -+ -+ If unsure, say N. -+ - Ext3 journalling file system support (EXPERIMENTAL) - CONFIG_EXT3_FS - This is the journalling version of the Second extended file system -@@ -15354,6 +15387,39 @@ - - If unsure, say N. - -+Ext3 extended attributes -+CONFIG_EXT3_FS_XATTR -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ -+Ext3 extended attribute block sharing -+CONFIG_EXT3_FS_XATTR_SHARING -+ This options enables code for sharing identical extended attribute -+ blocks among multiple inodes. -+ -+ Usually, say Y. -+ -+Ext3 extended user attributes -+CONFIG_EXT3_FS_XATTR_USER -+ This option enables extended user attributes on ext3. 
Processes can -+ associate extended user attributes with inodes to store additional -+ information such as the character encoding of files, etc. (see the -+ attr(5) manual page, or visit for details). -+ -+ If unsure, say N. -+ -+Ext3 trusted extended attributes -+CONFIG_EXT3_FS_XATTR_TRUSTED -+ This option enables extended attributes on ext3 that are accessible -+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this -+ is only the super user. Trusted extended attributes are meant for -+ implementing system/security services. -+ -+ If unsure, say N. -+ - Journal Block Device support (JBD for ext3) (EXPERIMENTAL) - CONFIG_JBD - This is a generic journalling layer for block devices. It is -Index: linux-2.4.19.SuSE/arch/ia64/defconfig -=================================================================== ---- linux-2.4.19.SuSE.orig/arch/ia64/defconfig 2004-05-03 11:19:10.000000000 -0700 -+++ linux-2.4.19.SuSE/arch/ia64/defconfig 2004-05-03 11:50:22.000000000 -0700 -@@ -1,6 +1,13 @@ - # - # Automatically generated make config: don't edit - # -+CONFIG_EXT3_FS_XATTR=y -+# CONFIG_EXT3_FS_XATTR_SHARING is not set -+# CONFIG_EXT3_FS_XATTR_USER is not set -+# CONFIG_EXT2_FS_XATTR is not set -+# CONFIG_EXT2_FS_XATTR_SHARING is not set -+# CONFIG_EXT2_FS_XATTR_USER is not set -+# CONFIG_FS_MBCACHE is not set - - # - # Code maturity level options -Index: linux-2.4.19.SuSE/fs/Config.in -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Config.in 2004-05-03 11:18:52.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/Config.in 2004-05-03 11:50:22.000000000 -0700 -@@ -203,6 +203,10 @@ - #tristate 'Meta block cache' CONFIG_FS_MBCACHE - define_tristate CONFIG_FS_MBCACHE y - -+# Meta block cache for Extended Attributes (ext2/ext3) -+#tristate 'Meta block cache' CONFIG_FS_MBCACHE -+define_tristate CONFIG_FS_MBCACHE y -+ - mainmenu_option next_comment - comment 'Partition Types' - source fs/partitions/Config.in -Index: 
linux-2.4.19.SuSE/fs/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Makefile 2004-05-03 11:22:49.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/Makefile 2004-05-03 11:50:22.000000000 -0700 -@@ -104,6 +104,9 @@ - obj-$(CONFIG_FS_MBCACHE) += mbcache.o - obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o - -+export-objs += mbcache.o -+obj-$(CONFIG_FS_MBCACHE) += mbcache.o -+ - # persistent filesystems - obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) - -Index: linux-2.4.19.SuSE/fs/ext2/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/Makefile 2004-05-03 11:18:46.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/Makefile 2004-05-03 11:50:22.000000000 -0700 -@@ -18,4 +18,8 @@ - obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o - obj-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o - -+export-objs += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-2.4.19.SuSE/fs/ext2/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/inode.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/inode.c 2004-05-03 11:50:22.000000000 -0700 -@@ -52,6 +52,18 @@ - } - - /* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext2_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
-+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ -+/* - * Called at each iput() - */ - void ext2_put_inode (struct inode * inode) -@@ -806,6 +818,8 @@ - return; - if (ext2_inode_is_fast_symlink(inode)) - return; -+ if (ext2_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -Index: linux-2.4.19.SuSE/fs/ext2/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/super.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/super.c 2004-05-03 11:50:22.000000000 -0700 -@@ -70,6 +70,7 @@ - { - va_list args; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; - sb->u.ext2_sb.s_es->s_state = -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c 2004-05-03 11:50:22.000000000 -0700 -@@ -54,6 +54,18 @@ - inode->i_blocks - ea_blocks == 0); - } - -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext3_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext3_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ - /* The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. 
-@@ -1968,6 +1980,8 @@ - return; - if (ext3_inode_is_fast_symlink(inode)) - return; -+ if (ext3_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c 2004-02-18 07:26:44.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c 2004-05-03 11:50:22.000000000 -0700 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); -Index: linux-2.4.19.SuSE/include/linux/errno.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/errno.h 2004-05-03 11:20:21.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/errno.h 2004-05-03 11:50:22.000000000 -0700 -@@ -30,4 +30,8 @@ - - #endif - -+/* Defined for extended attributes */ -+#define ENOATTR ENODATA /* No such attribute */ -+#define ENOTSUP EOPNOTSUPP /* Operation not supported */ -+ - #endif -Index: linux-2.4.19.SuSE/kernel/ksyms.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-05-03 11:22:48.000000000 -0700 -+++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-05-03 11:50:22.000000000 -0700 -@@ -12,6 +12,7 @@ - #define __KERNEL_SYSCALLS__ - #include - #include -+#include - #include - #include - #include -Index: linux-2.4.19.SuSE/mm/vmscan.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/vmscan.c 2004-05-03 11:18:53.000000000 -0700 -+++ linux-2.4.19.SuSE/mm/vmscan.c 2004-05-03 11:50:22.000000000 -0700 -@@ -32,6 +32,39 @@ - */ - int vm_passes = 60; - -+static 
DECLARE_MUTEX(other_caches_sem); -+static LIST_HEAD(cache_definitions); -+ -+void register_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_add(&cache->link, &cache_definitions); -+ up(&other_caches_sem); -+} -+ -+void unregister_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_del(&cache->link); -+ up(&other_caches_sem); -+} -+ -+static void shrink_other_caches(unsigned int priority, int gfp_mask) -+{ -+ struct list_head *p; -+ -+ if (down_trylock(&other_caches_sem)) -+ return; -+ -+ list_for_each_prev(p, &cache_definitions) { -+ struct cache_definition *cache = -+ list_entry(p, struct cache_definition, link); -+ -+ cache->shrink(priority, gfp_mask); -+ } -+ up(&other_caches_sem); -+} -+ - /* - * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan - * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll diff --git a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch b/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch deleted file mode 100644 index 26d3af9..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch +++ /dev/null @@ -1,47 +0,0 @@ - ext2/super.c | 3 +-- - ext3/ext3-exports.c | 13 +++++++++++++ - 2 files changed, 14 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext2/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext2/super.c Sun Nov 16 00:40:59 2003 -@@ -70,6 +70,7 @@ - { - va_list args; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; - sb->u.ext2_sb.s_es->s_state = -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 00:40:59 
2003 -@@ -1822,8 +1828,6 @@ - exit_ext3_xattr(); - } - --EXPORT_SYMBOL(ext3_force_commit); --EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c Sun Nov 16 00:40:58 2003 -+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c Sun Nov 16 00:40:59 2003 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); diff --git a/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch b/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch deleted file mode 100644 index 19ad959..0000000 --- a/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch +++ /dev/null @@ -1,72 +0,0 @@ -Index: linux-2.4.18-chaos/include/linux/list.h -=================================================================== ---- linux-2.4.18-chaos.orig/include/linux/list.h 2003-11-23 00:07:05.000000000 +0300 -+++ linux-2.4.18-chaos/include/linux/list.h 2003-12-11 00:25:15.000000000 +0300 -@@ -173,6 +173,67 @@ - for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ - pos = pos->prev, prefetch(pos->prev)) - -+/** -+ * list_for_each_entry - iterate over list of given type -+ * @pos: the type * to use as a loop counter. -+ * @head: the head for your list. -+ * @member: the name of the list_struct within the struct. 
-+ */ -+#define list_for_each_entry(pos, head, member) \ -+ for (pos = list_entry((head)->next, typeof(*pos), member), \ -+ prefetch(pos->member.next); \ -+ &pos->member != (head); \ -+ pos = list_entry(pos->member.next, typeof(*pos), member), \ -+ prefetch(pos->member.next)) -+ -+#ifndef list_for_each_entry_safe -+/** -+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry -+ * @pos: the type * to use as a loop counter. -+ * @n: another type * to use as temporary storage -+ * @head: the head for your list. -+ * @member: the name of the list_struct within the struct. -+ */ -+#define list_for_each_entry_safe(pos, n, head, member) \ -+ for (pos = list_entry((head)->next, typeof(*pos), member), \ -+ n = list_entry(pos->member.next, typeof(*pos), member); \ -+ &pos->member != (head); \ -+ pos = n, n = list_entry(n->member.next, typeof(*n), member)) -+#endif -+ -+/** -+ * list_move - delete from one list and add as another's head -+ * @list: the entry to move -+ * @head: the head that will precede our entry -+ */ -+static inline void list_move(struct list_head *list, struct list_head *head) -+{ -+ __list_del(list->prev, list->next); -+ list_add(list, head); -+} -+ -+/** -+ * list_move_tail - delete from one list and add as another's tail -+ * @list: the entry to move -+ * @head: the head that will follow our entry -+ */ -+static inline void list_move_tail(struct list_head *list, -+ struct list_head *head) -+{ -+ __list_del(list->prev, list->next); -+ list_add_tail(list, head); -+} -+ -+/* 2.5 uses hlists for some things, like the d_hash. we'll treat them -+ * as 2.5 and let macros drop back.. 
*/ -+#define hlist_entry list_entry -+#define hlist_head list_head -+#define hlist_node list_head -+#define HLIST_HEAD LIST_HEAD -+#define INIT_HLIST_HEAD INIT_LIST_HEAD -+#define hlist_del_init list_del_init -+#define hlist_add_head list_add -+#define hlist_for_each_safe list_for_each_safe - - #endif /* __KERNEL__ || _LVM_H_INCLUDE */ - diff --git a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch b/lustre/kernel_patches/patches/mcore-2.4.20-8.patch deleted file mode 100644 index c8b80eb..0000000 --- a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch +++ /dev/null @@ -1,2738 +0,0 @@ -? linux/.config -? linux/include/linux/autoconf.h -? linux/include/linux/modules -Index: linux/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/Makefile,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/Makefile 12 Mar 2003 19:48:52 -0000 1.3.2.1 -+++ linux/Makefile 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -99,6 +99,10 @@ - CFLAGS += -fomit-frame-pointer - endif - AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) -+ifeq ($(CONFIG_MCL_COREDUMP),y) -+ CFLAGS += -g -+endif -+ - - # - # ROOT_DEV specifies the default root-device when making the image. -Index: linux/Documentation/Configure.help -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/Documentation/Configure.help,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/Documentation/Configure.help 12 Mar 2003 19:48:52 -0000 1.3.2.1 -+++ linux/Documentation/Configure.help 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -21660,6 +21660,35 @@ - This option allows you to run the kernel with data cache disabled. - Say Y if you experience CPM lock-ups. - -+Boot kernel image support -+CONFIG_BOOTIMG -+ Add support for booting a new Linux kernel from a running Linux -+ system. 
You need to download the bootimg(8) utility from -+ ftp://icaftp.epfl.ch/pub/people/almesber/misc/bootimg-current.tar.gz -+ in order to use this functionality. -+ -+Protect SMP configuration tables -+CONFIG_BOOTIMG_SMP -+ On SMP systems, the BIOS stores tables with configuration data in -+ memory and an SMP-enabled kernel reads these tables. However, a -+ kernel without SMP support will overwrite such tables. If a kernel -+ without SMP support used bootimg to boot an SMP-enabled kernel, the -+ latter will probably crash when trying to read the SMP tables. The -+ CONFIG_BOOTIMG_SMP option enables minimal support for scanning and -+ protecting of SMP configuration tables also for kernels without SMP -+ support. -+ -+In-memory kernel core dump facility -+CONFIG_MCL_COREDUMP -+ In conjunction with bootimg, this allows you to get kernel core dumps -+ of your system at panic() time. The panic call is modified so that it -+ calls the core dump facility and reboots the system. On the way back -+ up, the kernel dump image is written out to disk by the accompanying -+ init script. You can use the crash analysis tool to analyze the core -+ dump. This tool can be found at : -+ -+ http://www.missioncriticallinux.com/download -+ - # - # m68k-specific kernel options - # Documented by Chris Lawrence et al. 
-Index: linux/arch/i386/config.in -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/config.in,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.2 -diff -u -r1.3.2.1 -r1.3.2.1.2.2 ---- linux/arch/i386/config.in 12 Mar 2003 19:49:05 -0000 1.3.2.1 -+++ linux/arch/i386/config.in 1 Apr 2003 19:35:12 -0000 1.3.2.1.2.2 -@@ -502,6 +502,12 @@ - bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ - bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK - bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER -+ if [ "$CONFIG_FRAME_POINTER " != "n" ]; then -+ bool ' Kernel Core Dump Facility' CONFIG_MCL_COREDUMP -+ if [ "$CONFIG_MCL_COREDUMP" = "y" ]; then -+ bool ' Reboot using bootimg' CONFIG_BOOTIMG -+ fi -+ fi - fi - - endmenu -Index: linux/arch/i386/vmlinux.lds -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/vmlinux.lds,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/arch/i386/vmlinux.lds 12 Mar 2003 19:49:05 -0000 1.1.1.1.4.1 -+++ linux/arch/i386/vmlinux.lds 1 Apr 2003 12:17:40 -0000 1.1.1.1.4.1.2.1 -@@ -19,6 +19,13 @@ - .rodata : { *(.rodata) *(.rodata.*) } - .kstrtab : { *(.kstrtab) } - -+ . = ALIGN(16); /* Relocatable bootimage code */ -+ __bootimg_start = .; -+ .bootimg : { -+ *(.bootimg) -+ } -+ __bootimg_end = .; -+ - . 
= ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } -Index: linux/arch/i386/boot/setup.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/setup.S,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/boot/setup.S 12 Mar 2003 19:49:05 -0000 1.2.2.1 -+++ linux/arch/i386/boot/setup.S 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -105,16 +105,22 @@ - # flags, unused bits must be zero (RFU) bit within loadflags - loadflags: - LOADED_HIGH = 1 # If set, the kernel is loaded high -+RELOADS_GDT = 2 # if set, kernel reloads GDT, such that -+ # boot loader does not have to provide -+ # GDT in a "safe" memory location - CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free --#ifndef __BIG_KERNEL__ -- .byte 0 --#else -- .byte LOADED_HIGH -+_FLAGS = 0 -+#ifdef __BIG_KERNEL__ -+ _FLAGS = _FLAGS | LOADED_HIGH - #endif -+#ifdef CONFIG_BOOTIMG -+ _FLAGS = _FLAGS | RELOADS_GDT -+#endif -+ .byte _FLAGS - - setup_move_size: .word 0x8000 # size to move, when setup is not - # loaded at 0x90000. 
We will move setup -Index: linux/arch/i386/kernel/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/Makefile,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/kernel/Makefile 12 Mar 2003 19:49:05 -0000 1.2.2.1 -+++ linux/arch/i386/kernel/Makefile 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -49,6 +49,7 @@ - obj-$(CONFIG_X86_LONGRUN) += longrun.o - obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o - obj-$(CONFIG_PROFILING) += profile.o -+obj-$(CONFIG_MCL_COREDUMP) += crash.o - - - include $(TOPDIR)/Rules.make -Index: linux/arch/i386/kernel/crash.c -=================================================================== -RCS file: linux/arch/i386/kernel/crash.c -diff -N linux/arch/i386/kernel/crash.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/arch/i386/kernel/crash.c 1 Apr 2003 12:17:40 -0000 1.1.6.1 -@@ -0,0 +1,82 @@ -+/* -+ * linux/arch/i386/crash.c -+ * -+ * Architecture dependant code for MCL in-memory core dump. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+inline void crash_save_regs(void) { -+ static unsigned long regs[8]; -+ -+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs[0])); -+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs[1])); -+ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs[2])); -+ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs[3])); -+ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs[4])); -+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs[5])); -+ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs[6])); -+ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs[7])); -+ -+ panic_regs = regs; -+} -+ -+/* -+ * Save the current stack pointer and EIP. 
-+ */ -+void crash_save_current_state(struct task_struct *tp) -+{ -+ /* -+ * Here we save ebp instead of esp just in case the compiler -+ * decides to put an extra push in before we execute this -+ * instruction (thus invalidating our frame pointer). -+ */ -+ asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp)); -+ tp->thread.eip = (u_long)crash_save_current_state; -+ panic_ksp[smp_processor_id()] = tp->thread.esp; -+ mb(); -+ -+ save_core(); -+ -+ crash_halt_or_reboot(1); -+} -+ -+/* -+ * If we are not the panicking thread, we simply halt. Otherwise, -+ * we take care of calling the reboot code. -+ */ -+void crash_halt_or_reboot(int boot_cpu) -+{ -+#ifdef CONFIG_SMP -+ if (!boot_cpu) { -+ stop_this_cpu(NULL); -+ /* NOTREACHED */ -+ } -+#endif -+ machine_restart(NULL); -+} -+ -+void crash_cleanup_smp_state(void) -+{ -+ /* -+ * Here we duplicate smp_send_stop. Crash_halt_or_reboot() calls -+ * stop_this_cpu. We now know that we are the only one running, -+ * so we finish off the smp_send_stop function. 
-+ */ -+ __cli(); -+#ifdef CONFIG_SMP -+ disable_local_APIC(); -+#endif -+} -+ -+/* -+ * Core dump IPI -+ */ -+void smp_crash_funnel_cpu(void) -+{ -+ crash_save_current_state(current); -+} -Index: linux/arch/i386/kernel/nmi.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/nmi.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/kernel/nmi.c 12 Mar 2003 19:49:06 -0000 1.2.2.1 -+++ linux/arch/i386/kernel/nmi.c 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -374,11 +374,18 @@ - bust_spinlocks(1); - printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); - show_registers(regs); -+#ifdef CONFIG_MCL_COREDUMP -+ spin_unlock(&nmi_print_lock); -+ bust_spinlocks(0); -+ panic("die"); -+ /* NOTREACHED */ -+#else - printk("console shuts up ...\n"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - do_exit(SIGSEGV); -+#endif - } - } else { - last_irq_sums[cpu] = sum; -Index: linux/arch/i386/kernel/process.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/process.c,v -retrieving revision 1.2.2.2 -retrieving revision 1.2.2.2.2.1 -diff -u -r1.2.2.2 -r1.2.2.2.2.1 ---- linux/arch/i386/kernel/process.c 1 Apr 2003 02:11:17 -0000 1.2.2.2 -+++ linux/arch/i386/kernel/process.c 1 Apr 2003 12:17:40 -0000 1.2.2.2.2.1 -@@ -50,6 +50,9 @@ - #ifdef CONFIG_MATH_EMULATION - #include - #endif -+#ifdef CONFIG_BOOTIMG -+#include -+#endif - - #include - -@@ -377,7 +380,21 @@ - - void machine_restart(char * __unused) - { -+#ifdef CONFIG_MCL_COREDUMP -+ extern char *panicmsg; -+ /* -+ * Only call bootimg if we have a valid descriptor and -+ * we are in a panic() context. 
-+ */ -+ if (panicmsg) -+#endif -+#ifdef CONFIG_BOOTIMG -+ if (bootimg_dsc.page_dir) -+ boot_image(); -+#endif -+ - #if CONFIG_SMP -+{ - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); -@@ -413,6 +430,7 @@ - if (!netdump_func) - smp_send_stop(); - disable_IO_APIC(); -+} - #endif - - if(!reboot_thru_bios) { -Index: linux/arch/i386/kernel/setup.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/setup.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.2 -diff -u -r1.3.2.1 -r1.3.2.1.2.2 ---- linux/arch/i386/kernel/setup.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/setup.c 1 Apr 2003 17:55:35 -0000 1.3.2.1.2.2 -@@ -116,6 +116,9 @@ - #include - #include - #include -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif - /* - * Machine setup.. - */ -@@ -973,6 +976,7 @@ - static unsigned long __init setup_memory(void) - { - unsigned long bootmap_size, start_pfn, max_low_pfn; -+ unsigned long bootmap_pages = 0UL, crash_pages = 0UL; - - /* - * partially used pages are not usable - thus -@@ -992,6 +996,21 @@ - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - #endif -+ -+#ifdef CONFIG_MCL_COREDUMP -+ bootmap_pages = bootmem_bootmap_pages(max_low_pfn); -+ crash_pages = crash_pages_needed(); -+ -+ printk("start_pfn: %d, bootmap_pages: %d\n", start_pfn, bootmap_pages); -+ -+ crash_init((u_long)phys_to_virt(PFN_PHYS(start_pfn)), -+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn)), -+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn + -+ crash_pages))); -+ -+ printk("new start_pfn: %08lx\n", PFN_PHYS(start_pfn)); -+ printk("crash map starts at %lx\n",(start_pfn+bootmap_pages)*PAGE_SIZE); -+#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - /* -@@ -1007,8 +1026,8 @@ - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. 
- */ -- reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + -- bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); -+ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + -+ ((1+crash_pages)*PAGE_SIZE) + PAGE_SIZE-1) - (HIGH_MEMORY)); - - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, -@@ -1016,6 +1035,16 @@ - */ - reserve_bootmem(0, PAGE_SIZE); - -+#ifdef CONFIG_BOOTIMG -+ /* -+ * bootimg(8) reads the old parameter block. Note that the copy in -+ * empty_zero_page will vanish when mem_init runs. (Should we -+ * memcpy(phys_to_virt(0x90000), PARAM, PAGE_SIZE); -+ * now ?) -+ */ -+ reserve_bootmem(0x90000, PAGE_SIZE); -+#endif -+ - #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff -@@ -1032,6 +1061,7 @@ - find_smp_config(); - #endif - #ifdef CONFIG_BLK_DEV_INITRD -+ printk("caution: initrd may overwrite dump\n"); /* phro */ - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { - reserve_bootmem(INITRD_START, INITRD_SIZE); -@@ -1172,6 +1202,12 @@ - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ - #endif - paging_init(); -+#ifdef CONFIG_MCL_COREDUMP -+ /* -+ * Reserve crash pages -+ */ -+ crash_mark_dump_reserved(); -+#endif - #ifdef CONFIG_X86_LOCAL_APIC - /* - * get boot-time SMP configuration: -Index: linux/arch/i386/kernel/smp.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/smp.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/arch/i386/kernel/smp.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/smp.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -23,6 +23,9 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif - /* - * Some notes on x86 processor bugs affecting SMP operation: - * -@@ -579,7 +582,7 @@ - return 0; - } - --static void stop_this_cpu (void * dummy) 
-+void stop_this_cpu (void * dummy) - { - /* - * Remove this CPU: -Index: linux/arch/i386/kernel/traps.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/traps.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/arch/i386/kernel/traps.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/traps.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -52,6 +52,10 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - asmlinkage int system_call(void); - asmlinkage void lcall7(void); - asmlinkage void lcall27(void); -@@ -309,7 +313,11 @@ - netdump_func(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); -- do_exit(SIGSEGV); -+#ifdef CONFIG_MCL_COREDUMP -+ if(panic_on_oops) -+ panic("die"); -+#endif -+ do_exit(SIGSEGV);/* NOTREACHED */ - } - - static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -Index: linux/drivers/char/misc.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/misc.c,v -retrieving revision 1.2 -retrieving revision 1.2.4.1 -diff -u -r1.2 -r1.2.4.1 ---- linux/drivers/char/misc.c 25 Sep 2002 17:11:05 -0000 1.2 -+++ linux/drivers/char/misc.c 1 Apr 2003 12:17:41 -0000 1.2.4.1 -@@ -78,6 +78,8 @@ - extern int i8k_init(void); - extern int lcd_init(void); - -+extern int crash_init_chrdev(void); -+ - static int misc_read_proc(char *buf, char **start, off_t offset, - int len, int *eof, void *private) - { -@@ -255,6 +257,9 @@ - int __init misc_init(void) - { - create_proc_read_entry("misc", 0, 0, misc_read_proc, NULL); -+#ifdef CONFIG_MCL_COREDUMP -+ crash_init_chrdev(); -+#endif - #ifdef CONFIG_MVME16x - rtc_MK48T08_init(); - #endif -Index: linux/drivers/char/sysrq.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/sysrq.c,v 
-retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.2 -diff -u -r1.2.2.1 -r1.2.2.1.2.2 ---- linux/drivers/char/sysrq.c 12 Mar 2003 19:49:47 -0000 1.2.2.1 -+++ linux/drivers/char/sysrq.c 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 -@@ -97,7 +97,18 @@ - action_msg: "Resetting", - }; - -- -+#ifdef CONFIG_MCL_COREDUMP -+/* kernel core dump sysrq */ -+static void sysrq_handle_coredump(int key, struct pt_regs *pt_regs, -+ struct kbd_struct *kbd, struct tty_struct *ttty) { -+ panic("sysrq"); -+} -+static struct sysrq_key_op sysrq_coredump_op = { -+ handler: sysrq_handle_coredump, -+ help_msg: "Crash", -+ action_msg: "Dumping core", -+}; -+#endif - - /* SYNC SYSRQ HANDLERS BLOCK */ - -@@ -334,7 +345,11 @@ - it is handled specially on the spark - and will never arive */ - /* b */ &sysrq_reboot_op, -+#ifdef CONFIG_MCL_COREDUMP -+/* c */ &sysrq_coredump_op, -+#else - /* c */ NULL, -+#endif - /* d */ NULL, - /* e */ &sysrq_term_op, - /* f */ NULL, -Index: linux/include/asm-i386/bootimg.h -=================================================================== -RCS file: linux/include/asm-i386/bootimg.h -diff -N linux/include/asm-i386/bootimg.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/asm-i386/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,141 @@ -+/* asm-i386/bootimg.h - Boot image, i386-specific code */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+/* -+ * When porting bootimg(2) to a new architcture, you need to adapt the -+ * functions and definitions in this file. -+ */ -+ -+ -+#ifndef _ASM_I386_BOOTIMG_H -+#define _ASM_I386_BOOTIMG_H -+ -+#include -+#include -+ -+#ifdef CONFIG_SMP -+#include -+#include -+#endif -+ -+ -+/* -+ * The memory page with the code currently executing has been copied from -+ * old_page to new_page. Jump there. -+ * -+ * Note: flush_icache_range has already been called on the new page. 
-+ */ -+ -+static inline void jump_relocated(unsigned long old_page,unsigned long new_page) -+{ -+ int tmp; -+ -+ __asm__ __volatile__( -+ "stc\n\t" -+ "call 1f\n" -+ "1:\tjnc 2f\n\t" -+ "popl %0\n\t" -+ "addl %1,%0\n\t" -+ "addl %1,%%esp\n\t" -+ "clc\n\t" -+ "jmp *%0\n" -+ "2:" -+ : "=&r" (tmp) : "r" (new_page-old_page)); -+} -+ -+ -+/* -+ * Stop paging, such that -+ * - page tables can be overwritten -+ * - all physical memory can be accessed -+ * - all physical memory is identity-mapped -+ * -+ * (Other rules are possible, but need to be encoded in bootimg(8).) -+ */ -+ -+static inline void stop_paging(void) -+{ -+ unsigned long msw; -+ -+ __asm__ __volatile__( -+ "movl %%cr0,%0\n\t" -+ "andl $0x7fffffff,%0\n\t" -+ "movl %0,%%cr0\n\t" -+ "jmp 1f\n\t" /* i486 and such */ -+ "1:" -+ -+/* Clear the PAE bit in register %cr4 if we were in PAE mode. The initial -+ * page table set up by the new kernel's bootstrap code is non-PAE regardless -+ * of whether the new kernel is a PAE kernel. By clearing the PAE bit here, -+ * we make sure the bootstrap code doesn't accidentally enable PAE mode when -+ * it turns on address translation. -+ */ -+#ifdef CONFIG_X86_PAE -+ "movl %%cr4,%0\n\t" -+ "andl $0xffffffdf,%0\n\t" -+ "movl %0,%%cr4\n\t" -+#endif -+ -+ : "=&r" (msw) : : "memory"); -+} -+ -+ -+/* -+ * Stop any remaining concurrency in the system. If become_only_thread fails -+ * but the system is still usable, become_only_thread should return an error -+ * code. If no recovery is possible, it may as well panic. -+ */ -+ -+static inline int become_only_thread(void) -+{ -+#ifdef CONFIG_SMP -+ smp_send_stop(); -+ disable_IO_APIC(); -+#endif -+ cli(); -+ return 0; -+} -+ -+ -+/* -+ * A conservative estimate of the number of bytes relocate_and_jump allocated -+ * on the stack. This is only used for sanity checking before running code, -+ * because we can't recover from failure in relocate_and_jump. 
-+ */ -+ -+#define RESERVE_MIN_RELOC_STACK 256 -+ -+ -+/* -+ * Change the stack pointer such that stack is at the end of the specified -+ * page. No data on the old stack will be accessed anymore, so no copying is -+ * required. -+ */ -+ -+static inline void stack_on_page(void *page) -+{ -+ __asm__ __volatile__( -+ "push %%ds\n\t" -+ "pop %%ss\n\t" -+ "movl %0,%%esp\n\t" -+ "addl $0x1000,%%esp\n\t" -+ : : "r" (page)); -+} -+ -+/* -+ * Set up things such that the kernel will be comfortable (e.g. some -+ * architectures expect the boot loader to set registers in certain ways), -+ * and then jump to the kernel's entry address. -+ */ -+ -+static inline void jump_to_kernel(void (*kernel_entry)(void)) -+{ -+ __asm__ __volatile__( -+ "mov $0x90000,%%esi\n\t" -+ : : ); -+ -+ kernel_entry(); -+} -+ -+#endif -Index: linux/include/asm-i386/crash.h -=================================================================== -RCS file: linux/include/asm-i386/crash.h -diff -N linux/include/asm-i386/crash.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/asm-i386/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,15 @@ -+#ifndef __ASM_CRASH_H -+#define __ASM_CRASH_H -+ -+#define UPPER_MEM_BACKUP 0 -+#define LOWER_MEM_FORWARD 0 -+#define LOW_OFFSET 100 -+ -+/* -+ * These two functions are inlined on alpha. That's why they appear -+ * in the arch dependent include file. 
-+ */ -+void crash_save_current_state(struct task_struct *); -+void crash_halt_or_reboot(int); -+ -+#endif -Index: linux/include/linux/bootimg.h -=================================================================== -RCS file: linux/include/linux/bootimg.h -diff -N linux/include/linux/bootimg.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/linux/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,84 @@ -+/* linux/bootimg.h - Boot image, general definitions */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+ -+#ifndef _LINUX_BOOTIMG_H -+#define _LINUX_BOOTIMG_H -+ -+ -+/* -+ * Constraints on image_map: -+ * - each image_map[n] is the virtual address of a page-sized memory region -+ * readable by the user -+ * - currently, image_map[n] is not required to be page-aligned, but this may -+ * change in the future if we want to map pages directly to lower memory -+ * pressure (NB: mapping works for ELF and plain binary images, but usually -+ * not for (b)zImages, because the prepended boot and setup sectors -+ * mis-align them) -+ * -+ * Constraints on load_map: -+ * - each load_map[] is the physical address of a page in RAM -+ */ -+ -+struct boot_image { -+ void **image_map; /* pointers to image pages in user memory */ -+ int pages; /* length in pages */ -+ unsigned long *load_map;/* list of destination pages (physical addr) */ -+ unsigned long start; /* jump to this physical address */ -+ int flags; /* for future use, must be zero for now */ -+}; -+ -+ -+#ifdef __KERNEL__ -+ -+#define __bootimg __attribute__ ((__section__ (".bootimg"))) -+ -+ -+struct bootimg_dsc { -+ unsigned long self; /* code page ALL ADDRESSES */ -+ unsigned long scratch; /* scratch page ARE PHYSICAL !*/ -+ unsigned long **page_dir; /* src & dst page tables */ -+ void (*jump_to)(void); /* start address */ -+ int pages; /* number of pages */ -+ unsigned long csum; /* Kernel Image checksum */ -+}; -+ -+/* -+ * page_dir contains pointers to pages containing pointers to pages. 
We call -+ * page_dir a "directory" and the page page_dir[n] points to a "table". The -+ * first PAGES_PER_TABLE/2 entries of page_dir are for source pages, and other -+ * half are for destination pages. -+ */ -+ -+/* -+ * Note that the definitions used here do not necessarily correspond to the -+ * architecture-specific PTRS_PER_PTE, __pte_offset, etc. -+ */ -+ -+#define PAGES_PER_TABLE (PAGE_SIZE/sizeof(void *)) -+#define FROM_TABLE(i) ((i)/PAGES_PER_TABLE) -+#define TO_TABLE(i) ((i)/PAGES_PER_TABLE+PAGES_PER_TABLE/2) -+#define PAGE_NR(i) ((i) % PAGES_PER_TABLE) -+ -+ -+extern char __bootimg_start,__bootimg_end; /* linker segment boundaries */ -+extern unsigned long *unity_page; /* unity-mapped page for i386 */ -+ -+/* -+ * relocate_and_jump runs in its own page with its own stack. This makes it -+ * difficult to pass parameters. The solution chosen here is to use the global -+ * variable bootimg_dsc, which is copied into an "auto" variable by -+ * relocate_and_jump before any copying or relocation takes place. 
-+ */ -+ -+extern struct bootimg_dsc bootimg_dsc; -+ -+typedef void (*relocate_and_jump_t)(void); -+ -+void relocate_and_jump(void); -+int boot_image(void); -+ -+#endif /* __KERNEL__ */ -+ -+#endif -Index: linux/include/linux/crash.h -=================================================================== -RCS file: linux/include/linux/crash.h -diff -N linux/include/linux/crash.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/linux/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,119 @@ -+#ifndef __LINUX_CRASH_H -+#define __LINUX_CRASH_H -+ -+/* defines for interfacing with user-space (ioctls, etc) */ -+struct ioctl_getdump { -+ unsigned long kva; -+ unsigned long buf; -+}; -+ -+#define CRASH_IOC_MAGIC 'C' -+ -+#define CRASH_IOCFREEDUMP _IO(CRASH_IOC_MAGIC, 0) -+#define CRASH_IOCGETDUMP _IOWR(CRASH_IOC_MAGIC, 1, struct ioctl_getdump) -+#define CRASH_IOCBOOTIMG _IOWR(CRASH_IOC_MAGIC, 2, struct boot_image) -+#define CRASH_IOCVERSION _IO(CRASH_IOC_MAGIC, 3) -+ -+/* kernel-only part of crash.h */ -+#ifdef __KERNEL__ -+#include -+ -+#define CRASH_K_MINOR (1) -+#define CRASH_K_MAJOR (0) -+ -+/* -+ * Crash prototypes. 
-+ */ -+void save_core(void); -+void crash_mark_dump_reserved(void); -+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va); -+u_long crash_pages_needed(void); -+void smp_crash_funnel_cpu(void); -+void crash_cleanup_smp_state(void); -+ -+/* -+ * Arch dependant crash.c funcs -+ */ -+void crash_save_current_state(struct task_struct *); -+void crash_halt_or_reboot(int); -+inline void crash_save_regs(void); -+ -+/* -+ * Crash globals -+ */ -+extern u_long crash_dump_header; -+extern volatile u_long panic_ksp[]; -+extern volatile int crash_release; -+extern int panic_on_oops; -+extern char *panicmsg; -+extern int panic_processor; -+extern int crash_perform_sync; -+extern unsigned long *panic_regs; -+ -+/* -+ * symbols not exported by linux header files -+ */ -+extern void stop_this_cpu(void *); -+ -+/* struct crash_map_hdr located at byte offset 0 */ -+/* on-disk formats */ -+ -+#define trunc_page(x) ((void *)(((unsigned long)(x)) & ~((unsigned long)(PAGE_SIZE - 1)))) -+#define round_page(x) trunc_page(((unsigned long)(x)) + ((unsigned long)(PAGE_SIZE - 1))) -+ -+#define CRASH_MAGIC 0x9a8bccdd -+#define CRASH_SOURCE_PAGES 128 -+#define CRASH_SUB_MAP_BYTES ((u_long)round_page((CRASH_SOURCE_PAGES+1)*sizeof(u_long))) -+#define CRASH_SUB_MAP_PAGES (CRASH_SUB_MAP_BYTES / PAGE_SIZE) -+#define CRASH_UNCOMPR_BUF_PAGES (CRASH_SOURCE_PAGES + CRASH_SUB_MAP_PAGES) -+#define CRASH_COMPR_BUF_PAGES (CRASH_UNCOMPR_BUF_PAGES + (CRASH_UNCOMPR_BUF_PAGES/4)) -+#define CRASH_COMPESS_PRIME_PAGES (2*CRASH_COMPR_BUF_PAGES) -+#define CRASH_ZALLOC_PAGES 16*5*2 /* 2 to handle crash in crash */ -+#define CRASH_LOW_WATER_PAGES 100 -+ -+#define CRASH_CPU_TIMEOUT 5000 /* 5 sec wait for other cpus to stop */ -+ -+#define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) -+#define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) -+#define CRASH_MARK_BOOT_RESERVED(addr) reserve_bootmem(virt_to_phys((void *)addr), PAGE_SIZE); 
-+ -+typedef int boolean_t; -+ -+#define TRUE 1 -+#define FALSE 0 -+ -+/* mem structure */ -+struct mem_crash_map_hdr { -+ long magic[4]; /* identify crash dump */ -+ u_long map; /* location of map */ -+ u_long map_pages; -+ u_long data_pages; -+ u_long compr_units; -+ u_long boot_reserved_start; -+ u_long boot_reserved_end; -+}; -+struct mem_crash_map_entry { -+ u_long src_va; /* source start of larger non-contig -+ * block. a src_va of -1 means that -+ * the dest_page_va is the location of -+ * the next map page */ -+ u_long dest_page_va; /* dest of this sub block */ -+ u_long check_sum; /* check_sum for dest data */ -+}; -+ -+/* file structure */ -+struct crash_map_hdr { -+ long magic[4]; /* identify crash dump */ -+ int blk_size; /* block size for this device */ -+ int map_block; /* location of map */ -+ int map_blocks; /* number of blocks for map */ -+}; -+struct crash_map_entry { -+ u_long start_va; /* virtual address */ -+ char *exp_data; /* expanded data in memory */ -+ int start_blk; /* device location */ -+ int num_blks; -+}; -+ -+#endif /* __KERNEL__ */ -+#endif /* __LINUX_CRASH_H */ -Index: linux/include/linux/mm.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/mm.h,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.2 -diff -u -r1.2.2.1 -r1.2.2.1.2.2 ---- linux/include/linux/mm.h 12 Mar 2003 19:51:27 -0000 1.2.2.1 -+++ linux/include/linux/mm.h 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 -@@ -331,6 +331,11 @@ - #define PG_lru 18 - #define PG_active_cache 19 - #define PG_fs_1 20 /* Filesystem specific */ -+#ifdef CONFIG_MCL_COREDUMP -+#define PG_free 21 -+#define PG_shm 22 -+#define PG_anon 23 -+#endif - - /* Make it prettier to test the above... 
*/ - #define UnlockPage(page) unlock_page(page) -@@ -452,6 +457,11 @@ - #define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) - #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) - #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) -+#ifdef CONFIG_MCL_COREDUMP -+#define PageFree(page) (test_bit(PG_free, &(page)->flags)) -+#define PageAnon(page) (test_bit(PG_anon, &(page)->flags)) -+#define PageShm(page) (test_bit(PG_shm, &(page)->flags)) -+#endif - - #define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags) - #define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags) -Index: linux/include/linux/reboot.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/reboot.h,v -retrieving revision 1.1.1.1 -retrieving revision 1.1.1.1.10.2 -diff -u -r1.1.1.1 -r1.1.1.1.10.2 ---- linux/include/linux/reboot.h 7 May 2002 21:53:47 -0000 1.1.1.1 -+++ linux/include/linux/reboot.h 1 Apr 2003 17:55:35 -0000 1.1.1.1.10.2 -@@ -20,6 +20,7 @@ - * CAD_OFF Ctrl-Alt-Del sequence sends SIGINT to init task. - * POWER_OFF Stop OS and remove all power from system, if possible. - * RESTART2 Restart system using given command string. -+ * COREDUMP We're taking a core dump, secondary cpus already stopped. 
- */ - - #define LINUX_REBOOT_CMD_RESTART 0x01234567 -@@ -28,7 +29,9 @@ - #define LINUX_REBOOT_CMD_CAD_OFF 0x00000000 - #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC - #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 -- -+#ifdef CONFIG_MCL_COREDUMP -+#define LINUX_REBOOT_CMD_COREDUMP 0x9A8BCCDD -+#endif - - #ifdef __KERNEL__ - -Index: linux/include/linux/sysctl.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/sysctl.h,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/include/linux/sysctl.h 12 Mar 2003 19:51:30 -0000 1.3.2.1 -+++ linux/include/linux/sysctl.h 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -126,6 +126,7 @@ - KERN_CADPID=54, /* int: PID of the process to notify on CAD */ - KERN_CORE_PATTERN=56, /* string: pattern for core-files */ - KERN_PID_MAX=55, /* int: max PID value of processes */ -+ KERN_PANIC_ON_OOPS /* int: panic on oops enabled */ - }; - - -Index: linux/init/main.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/init/main.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/init/main.c 12 Mar 2003 19:51:35 -0000 1.2.2.1 -+++ linux/init/main.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 -@@ -70,6 +70,10 @@ - #include - #endif - -+#ifdef CONFIG_BOOTIMG -+#include -+#endif -+ - /* - * Versions of gcc older than that listed below may actually compile - * and link okay, but the end product can have subtle run time bugs. -@@ -352,10 +356,14 @@ - { - char * command_line; - extern char saved_command_line[]; -+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) -+ unsigned long value; -+#endif - /* - * Interrupts are still disabled. Do necessary setups, then - * enable them - */ -+ printk("start_kernel\n"); - lock_kernel(); - printk(linux_banner); - setup_arch(&command_line); -@@ -373,12 +381,26 @@ - * this. 
But we do want output early, in case something goes wrong. - */ - console_init(); -+ -+#ifdef CONFIG_BOOTIMG -+ unity_page = alloc_bootmem_pages(PAGE_SIZE); -+ printk("unity_page addr: %p\n",unity_page); -+#endif - #ifdef CONFIG_MODULES - init_modules(); - #endif - profile_init(); - kmem_cache_init(); - sti(); -+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) -+ /* If we don't make sure the APIC is enabled, AND the LVT0 -+ * register is programmed properly, we won't get timer interrupts -+ */ -+ setup_local_APIC(); -+ -+ value = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, value & ~APIC_LVT_MASKED); -+#endif - calibrate_delay(); - #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start && !initrd_below_start_ok && -Index: linux/kernel/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/Makefile,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/kernel/Makefile 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 -+++ linux/kernel/Makefile 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 -@@ -22,7 +22,8 @@ - obj-$(CONFIG_PM) += pm.o - obj-$(CONFIG_KALLSYMS) += kallsyms.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o -- -+obj-$(CONFIG_BOOTIMG) += bootimg.o bootimg_pic.o -+obj-$(CONFIG_MCL_COREDUMP) += crash.o - - ifneq ($(CONFIG_IA64),y) - # According to Alan Modra , the -fno-omit-frame-pointer is -Index: linux/kernel/bootimg.c -=================================================================== -RCS file: linux/kernel/bootimg.c -diff -N linux/kernel/bootimg.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/bootimg.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,301 @@ -+/* bootimg.c - Boot another (kernel) image */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if 0 -+#define 
DPRINTK_CONT(format,args...) printk(format,##args) -+#else -+#define DPRINTK_CONT(format,args...) -+#endif -+#define DPRINTK(format,args...) DPRINTK_CONT(KERN_DEBUG format,##args) -+ -+unsigned long **bootimg_page_dir; -+ -+struct bootimg_dsc bootimg_dsc; /* communication with PIC */ -+unsigned long *unity_page; /* unity-mapped page for i386 */ -+ -+static unsigned long bootimg_checksum(unsigned long **page_dir, int num_pages) -+{ -+ unsigned long checksum, *page; -+ int i, j; -+ -+ checksum = 0; -+ -+ for (i = 0; i < num_pages; i++) { -+ page = __va((unsigned long *) -+ page_dir[FROM_TABLE(i)][PAGE_NR(i)]); -+ -+ for (j = 0; j < PAGES_PER_TABLE; j++) -+ checksum ^= page[j]; -+ -+ checksum ^= page_dir[TO_TABLE(i)][PAGE_NR(i)]; -+ } -+ -+ return checksum; -+} -+ -+#ifdef CONFIG_X86_PAE -+ -+static unsigned long get_identity_mapped_page(void) -+{ -+ pgd_t *pgd; -+ pmd_t *pmd; -+ unsigned long phys_addr, page_base; -+ -+ /* Set up a 2 Mb identity-mapped page. */ -+ -+ phys_addr = virt_to_phys(unity_page); -+ pgd = pgd_offset(current->active_mm, phys_addr); -+ pmd = pmd_offset(pgd, phys_addr); -+ -+ /* We hardcode this rather than using PMD_MASK just in case the PAE -+ * mode setup ever changes so that 2 Mb pages are no longer used. -+ */ -+ page_base = phys_addr & ~((1 << 21) - 1); -+ -+ set_pmd(pmd, __pmd(page_base | _PAGE_PSE | _KERNPG_TABLE)); -+ __flush_tlb_one(phys_addr); -+ -+ return (unsigned long) unity_page; -+} -+ -+#else -+ -+static unsigned long get_identity_mapped_page(void) -+{ -+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)), -+ __pgd((_KERNPG_TABLE + _PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK)))); -+ __flush_tlb_one(virt_to_phys(unity_page)); -+ return (unsigned long)unity_page; -+} -+ -+#endif -+ -+#if 0 /* Perhaps we'll need this in the future? 
*/ -+static void unmap_identity_mapped_page(void) -+{ -+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),__pgd(0)); -+ __flush_tlb(); -+} -+#endif -+ -+static int fill_page_dir(unsigned long **page_dir,struct boot_image *image) -+{ -+ int i, count=0; -+ -+ memset(page_dir,0,PAGE_SIZE); -+ for (i = 0; i < image->pages; i += PAGES_PER_TABLE) { -+ unsigned long **table; -+ int bytes_left; -+ -+ table = page_dir+FROM_TABLE(i); -+ *table = (unsigned long *) get_free_page(GFP_KERNEL); -+ if (!*table) return -ENOMEM; -+ -+ memset(*table,0,PAGE_SIZE); -+ DPRINTK("page %d: from table %p @ %p\n",i,*table,table); -+ table = page_dir+TO_TABLE(i); -+ *table = (unsigned long *) get_free_page(GFP_KERNEL); -+ if (!*table) return -ENOMEM; -+ -+ bytes_left = (image->pages-i)*sizeof(unsigned long); -+ if (copy_from_user(*table,image->load_map+i, -+ bytes_left > PAGE_SIZE ? PAGE_SIZE : bytes_left)) -+ return -EFAULT; -+ DPRINTK("page %d: to table %p @ %p\n",i,*table,table); -+ count+=2; /* 2 pages per loop */ -+ } -+ -+ for (i = 0; i < image->pages; i++) { -+ unsigned long page = get_free_page(GFP_KERNEL); -+ void *src; -+ -+ if (!page) return -ENOMEM; -+ count++; -+ -+ page_dir[FROM_TABLE(i)][PAGE_NR(i)] = -+ virt_to_phys((void *) page); -+ if (get_user(src,image->image_map+i) || -+ copy_from_user((void *) page,src,PAGE_SIZE)) -+ return -EFAULT; -+ -+ DPRINTK("page %d: %p->%p->%p @ %p\n",i,src,(void *) page, -+ (void *) page_dir[FROM_TABLE(i)][PAGE_NR(i)], -+ &page_dir[FROM_TABLE(i)][PAGE_NR(i)]); -+ } -+ -+ DPRINTK("fill_page_dir: %d pages allocated\n", count); -+ -+ return 0; -+} -+ -+ -+static void free_page_dir(unsigned long **page_dir) -+{ -+ int i,j,count=0; -+ -+ for (i = 0; i < PAGES_PER_TABLE/2; i++) -+ if (page_dir[i]) -+ for (j = 0; j < PAGES_PER_TABLE; j++) -+ if (page_dir[i][j]) { -+ free_page((unsigned long) -+ phys_to_virt(page_dir[i][j])); -+ count++; -+ } -+ for (i = 0; i < PAGES_PER_TABLE; i++) -+ if (page_dir[i]) { -+ free_page((unsigned long) 
*page_dir[i]); -+ count++; -+ } -+ DPRINTK("free_page_dir: %d pages freed\n", count); -+} -+ -+ -+static void convert_table_refs_to_phys(unsigned long **page_dir) -+{ -+ int i; -+ -+ DPRINTK("PAGES_PER_TABLE: %d\n",PAGES_PER_TABLE); -+ for (i = 0; i < PAGES_PER_TABLE; i++) -+ if (page_dir[i]) { -+ DPRINTK("table %i: mapped %p -> ",i,page_dir[i]); -+ page_dir[i] = (unsigned long *) -+ virt_to_phys(page_dir[i]); -+ DPRINTK_CONT("%p\n",page_dir[i]); -+ } -+} -+ -+ -+ -+static int fill_bootimg_dsc(struct boot_image *image) -+{ -+ unsigned long scratch; -+ int error = -ENOMEM; -+ -+ if(bootimg_page_dir) { -+ /* free previously allocated memory */ -+ free_page_dir(bootimg_page_dir); -+ free_page((unsigned long) bootimg_page_dir); -+ DPRINTK("free_page (bootimg_page_dir)\n"); -+ } -+ -+ bootimg_page_dir = (unsigned long **) get_free_page(GFP_KERNEL); -+ if (!bootimg_page_dir) goto out0; -+ DPRINTK("get_free_page (bootimg_page_dir)\n"); -+ -+ error = fill_page_dir(bootimg_page_dir,image); -+ if (error) goto out1; -+ -+ if(!bootimg_dsc.scratch) { -+ scratch = get_free_page(GFP_KERNEL); -+ DPRINTK("get_free_page (scratch)\n"); -+ } else -+ scratch = 1; /* already allocated */ -+ -+ if (!scratch) goto out1; -+ /* -+ * Not all architectures need the code to be identity-mapped, but it -+ * can't hurt ... 
-+ */ -+ DPRINTK("bootimg_page_dir: mapped %p -> ",bootimg_page_dir); -+ bootimg_dsc.page_dir = (unsigned long **) virt_to_phys(bootimg_page_dir); -+ DPRINTK_CONT("%p\n",bootimg_dsc.page_dir); -+ if(!bootimg_dsc.scratch) -+ bootimg_dsc.scratch = virt_to_phys((void *) scratch); -+ bootimg_dsc.jump_to = (void (*)(void)) image->start; -+ bootimg_dsc.pages = image->pages; -+ bootimg_dsc.csum = bootimg_checksum(bootimg_page_dir, image->pages); -+ -+ return 0; -+ -+out1: -+ free_page_dir(bootimg_page_dir); -+ free_page((unsigned long) bootimg_page_dir); -+ DPRINTK("free_page (bootimg_page_dir)\n"); -+ bootimg_page_dir = 0; -+out0: -+ return error; -+} -+ -+extern char *panicmsg; -+int boot_image() -+{ -+ relocate_and_jump_t code; -+ unsigned long code_page; -+ int error = -ENOMEM; -+ -+ if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) -+ != bootimg_dsc.csum) -+ printk("Checksum of kernel image failed. Rebooting via BIOS\n"); -+ -+ code_page = get_identity_mapped_page(); -+ if (!code_page) goto out3; -+ code = (relocate_and_jump_t) virt_to_phys((void *) code_page); -+ memcpy(code,&__bootimg_start,&__bootimg_end-&__bootimg_start); -+ flush_icache_range(&__bootimg_start, &__bootimg_end-&__bootimg_start); -+ -+ bootimg_dsc.self = (unsigned long) code; -+ printk(KERN_INFO "Running boot code at 0x%p\n",code); -+ -+ /* -+ * The point of no return. Not even printk may work after a successful -+ * return from become_only_thread. 
-+ */ -+ -+ if (!panicmsg) { -+ error = become_only_thread(); -+ if (error) goto out3; -+ } else { -+#ifdef CONFIG_SMP -+ disable_IO_APIC(); -+#endif -+ __cli(); -+ } -+ -+ convert_table_refs_to_phys((unsigned long **)__va(bootimg_dsc.page_dir)); -+ stack_on_page(code); -+ -+ code(); -+ -+ panic("PIC code exec failed"); -+out3: -+ printk("boot_image() failed!\n"); -+ for(;;); -+} -+ -+/* changed from asmlinkage because we're called via an IOCTL on /dev/crash now */ -+int sys_bootimg(struct boot_image *user_dsc) -+{ -+ struct boot_image dsc; -+ -+ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_MODULE)) return -EPERM; -+ if (&__bootimg_end-&__bootimg_start > PAGE_SIZE-RESERVE_MIN_RELOC_STACK) -+ { -+ printk(KERN_ERR "boot_image: PIC too large (%d bytes)\n", -+ &__bootimg_end-&__bootimg_start); -+ return -EIO; -+ } -+ if ((void *) relocate_and_jump != (void *) &__bootimg_start) { -+ printk(KERN_ERR "boot_image: relocate_and_jump is mis-placed" -+ "(0x%p != 0x%p)\n",relocate_and_jump,&__bootimg_start); -+ return -EIO; -+ } -+ -+ if (copy_from_user(&dsc,user_dsc,sizeof(dsc))) return -EFAULT; -+ if (dsc.pages >= PAGES_PER_TABLE*PAGES_PER_TABLE/2) return -EFBIG; -+ if (dsc.flags) return -EINVAL; /* for future use */ -+ return fill_bootimg_dsc(&dsc); -+} -Index: linux/kernel/bootimg_pic.c -=================================================================== -RCS file: linux/kernel/bootimg_pic.c -diff -N linux/kernel/bootimg_pic.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/bootimg_pic.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,91 @@ -+/* bootimg_pic.c - Boot image, position-independent code */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+/* -+ * Strongly inspired by FiPaBoL designed mainly by Otfried Cheong and Roger -+ * Gammans, and written by the latter. -+ */ -+ -+/* -+ * This code is position-independent and must fit in a single page ! -+ * Furthermore, everything (text+data+stack) has to go into the -+ * .bootimg segment. 
-+ */ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define copy_and_swap(from,to) \ -+ ( { my_copy_page(from,to); \ -+ tmp = from; \ -+ from = to; \ -+ to = tmp; } ) -+ -+ -+static inline void my_copy_page(unsigned long from,unsigned long to) -+{ -+ unsigned long end = from+PAGE_SIZE; -+ -+ do *((unsigned long *) to)++ = *((unsigned long *) from)++; -+ while (from != end); -+} -+ -+ -+void __bootimg relocate_and_jump(void) -+{ -+ struct bootimg_dsc dsc = bootimg_dsc; -+ int i; -+ -+ stop_paging(); -+ for (i = 0; i < dsc.pages; i++) { -+ unsigned long from,to,tmp; -+ -+ from = dsc.page_dir[FROM_TABLE(i)][PAGE_NR(i)]; -+ to = dsc.page_dir[TO_TABLE(i)][PAGE_NR(i)]; -+ if (from == to) continue; -+ if (to == dsc.self) { -+ copy_and_swap(dsc.self,dsc.scratch); -+ /* WARNING: flush_icache_range MUST BE INLINED !!! */ -+ flush_icache_range(dsc.self,dsc.self+PAGE_SIZE-1); -+ jump_relocated(dsc.scratch,dsc.self); -+ } -+ else if (to == (unsigned long) dsc.page_dir) -+ copy_and_swap((unsigned long) dsc.page_dir,dsc.scratch); -+ else { -+ /* -+ * O((n^2-n)/2), sigh ... 
-+ */ -+ unsigned long **table; -+ int j; -+ -+ for (j = i+1; j < dsc.pages; j++) { -+ table = dsc.page_dir+FROM_TABLE(j); -+ if (((unsigned long) *table) == to) { -+ copy_and_swap(*table,dsc.scratch); -+ break; -+ } -+ if ((*table)[PAGE_NR(j)] == to) { -+ copy_and_swap((*table)[PAGE_NR(j)], -+ dsc.scratch); -+ break; -+ } -+ table = dsc.page_dir+TO_TABLE(j); -+ if (((unsigned long) *table) == to) { -+ copy_and_swap(*table,dsc.scratch); -+ break; -+ } -+ } -+ } -+ my_copy_page(from,to); -+ dsc.scratch = from; -+ } -+ jump_to_kernel(dsc.jump_to); -+} -Index: linux/kernel/crash.c -=================================================================== -RCS file: linux/kernel/crash.c -diff -N linux/kernel/crash.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/crash.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,886 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_BOOTIMG -+#include -+#endif -+ -+static void crash_print_data_around(u_long p); -+static void crash_free_page(u_long addr); -+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr); -+static void *czalloc(void *arg, unsigned int items, unsigned int size); -+static void czfree(void *arg, void *ptr); -+static u_long crash_alloc_dest_page(void); -+static void crash_free_dest_page(u_long dest); -+static void init_dest_page_alloc(void); -+static int crash_audit_maps(void); -+static u_long crash_get_source_page(void); -+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages); -+static int crash_reset_stream(z_stream * stream); -+static boolean_t crash_is_kseg(u_long addr); -+static u_long *crash_link(u_long p); -+static int crash_chksum(u_long limit, u_long * sum_addr); -+static int crash_audit_map_page(u_long map); -+static void crash_wait_cpus(void); -+static int crash_is_dir_page(struct page *page); -+ -+/* for the /dev/crash 
interface */ -+int crash_init_chrdev(void); -+static int crashdev_ioctl(struct inode *, struct file *, unsigned int, unsigned long); -+ -+#define CRASH_DEBUG 1 -+ -+#ifdef CONFIG_BOOTIMG -+extern int sys_bootimg(struct boot_image *); -+#endif -+ -+static u_long crash_compr_buf; -+static u_long crash_uncompr_buf; -+static u_long crash_dump_header = 0; -+static u_long crash_dest_free_list = 0; -+static u_long crash_debug = 0; -+ -+static u_long crash_cur_pfn; -+ -+static u_long src_pages_skipped = 0; -+static u_long src_pages_saved = 0; -+static u_long dest_pages_free = 0; -+ -+/* this information is saved from within panic() */ -+char *panicmsg = (char *)0; -+int panic_processor = 0; -+int crash_perform_sync = 0; -+ -+u_int console_crash = 0; /* should be moved to alpha branch */ -+ -+// typedef struct task_struct *task_t; -+ -+/* -+ * Threads active at time of panic: -+ */ -+volatile task_t *panic_threads[NR_CPUS]; -+volatile unsigned long panic_ksp[NR_CPUS]; -+unsigned long *panic_regs = NULL; -+ -+int panic_on_oops; /* for /proc/sys/kernel/panic_on_oops */ -+ -+extern unsigned long max_low_pfn; -+ -+u_long crash_zalloc_start; // , crash_zalloc_end, crash_zalloc_cur; -+ -+/* -+ * Crash Kernel API functions below -+ * crash_pages_needed, computes pages needed for header and compression temp -+ * crash_init, partitions out the allocated pages, sets defaults and -+ * initializes the character device. -+ * crash_mark_dump_reserved, marks pages reserved from a previous dump. -+ * save_core, called at panic time to save a dump to memory. 
-+ */ -+u_long crash_pages_needed(void) -+{ -+ /* one for the header */ -+ return (1 + CRASH_ZALLOC_PAGES + CRASH_UNCOMPR_BUF_PAGES + CRASH_COMPR_BUF_PAGES); -+} -+ -+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va) -+{ -+ struct mem_crash_map_hdr *header; -+ int i; -+ -+ /* the default behavior is not NOT panic on a kernel OOPS */ -+ panic_on_oops = 0; -+ -+ printk("crash_init (crash_va: %08lx)\n", crash_va); -+ for (i = 0; i < NR_CPUS; i++) -+ panic_threads[i] = 0; -+ crash_dump_header = crash_va; -+ crash_va += PAGE_SIZE; -+ crash_zalloc_start = crash_va; -+ crash_va += CRASH_ZALLOC_PAGES * PAGE_SIZE; -+ crash_uncompr_buf = crash_va; -+ crash_va += CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE; -+ crash_compr_buf = crash_va; -+ crash_va += CRASH_COMPR_BUF_PAGES * PAGE_SIZE; -+#if 0 -+ if (crash_va != end_alloc_va) -+ panic("crash_init inconsistency-1\n"); -+#endif -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+#ifdef CRASH_DEBUG -+ printk("crash_dump_header %p {\n", header); -+ printk(" magic[0] = %lx\n", header->magic[0]); -+ printk(" map = %lx\n", header->map); -+ printk(" map_pages = %lx\n", header->map_pages); -+ printk(" data_pages = %lx\n", header->data_pages); -+ printk(" compr_units = %lx\n", header->compr_units); -+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); -+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); -+#endif -+ -+ if (header->magic[0] == CRASH_MAGIC) { -+ printk("crash found\n"); -+ if ((header->boot_reserved_start != bootmap_va) || -+ (header->boot_reserved_end != end_alloc_va)) { -+ /* crash audit will catch the corruption */ -+ printk("crash_init inconsistency, dump may be corrupted\n"); -+ } -+ } else { -+printk("memset..."); -+ memset(header, 0, sizeof(*header)); -+printk("done\n"); -+ } -+ -+ header->boot_reserved_start = bootmap_va; -+ header->boot_reserved_end = end_alloc_va; -+ -+} -+ -+void crash_mark_dump_reserved(void) -+{ -+ struct mem_crash_map_hdr *header; 
-+ struct mem_crash_map_entry *m; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return; -+ m = (struct mem_crash_map_entry *)header->map; -+#ifdef CRASH_DEBUG -+ printk("\n\n\ncrash_mark_dump_reserved\n\n"); -+ printk("crash_dump_header %p {\n", header); -+ printk(" magic[0] = %lx\n", header->magic[0]); -+ printk(" map = %lx\n", header->map); -+ printk(" map_pages = %lx\n", header->map_pages); -+ printk(" data_pages = %lx\n", header->data_pages); -+ printk(" compr_units = %lx\n", header->compr_units); -+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); -+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); -+ printk("mem_crash_map_entry %p {\n", m); -+ printk(" src_va = %lx\n", m->src_va); -+ printk(" dest_page_va = %lx\n", m->dest_page_va); -+ printk(" check_sum = %lx\n", m->check_sum); -+#endif -+ -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return; -+ } -+ -+ m = (struct mem_crash_map_entry *)header->map; -+ again: -+ CRASH_MARK_BOOT_RESERVED(m); -+ for (; m->src_va; m++) { -+ if (m->src_va == -1) { -+ m = (struct mem_crash_map_entry *)m->dest_page_va; -+ goto again; -+ } -+ CRASH_MARK_BOOT_RESERVED(m->dest_page_va); -+ } -+ return; -+} -+ -+void save_core(void) -+{ -+ int i, j, k; -+ z_stream stream; -+ int err; -+ struct task_struct *tp; -+ struct mem_crash_map_hdr *header; -+ u_long *sub_map; -+ u_long map; -+ u_long src, dest, unc, cp, src_base, comp_pages; -+ -+ k = 0; -+ dest = 0; -+ __cli(); -+ tp = current; -+ mb(); -+ if (smp_processor_id() != 0) { /* boot_cpu_id is always 0, i think */ -+ panic_threads[smp_processor_id()] = tp; -+ crash_halt_or_reboot(0); -+ } else { -+ if (console_crash) -+ panic_threads[smp_processor_id()] = &init_task_union.task; -+ else -+ panic_threads[smp_processor_id()] = tp; -+ -+ crash_wait_cpus(); -+ } -+ -+ printk("save_core: started on CPU%d\n", smp_processor_id()); -+ if (!crash_dump_header) { -+ printk("save_core: not 
initialized\n"); -+ return; -+ } -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ header->magic[0] = 0; -+ header->map_pages = 0; -+ header->data_pages = 0; -+ header->compr_units = 0; -+ header->map = 0; -+ -+ stream.workspace=(void*)crash_zalloc_start; -+ // stream.zalloc = czalloc; -+ // stream.zfree = czfree; -+ // stream.opaque = (voidpf) 0; -+ stream.next_out = (Bytef *) crash_compr_buf; -+ stream.avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); -+ stream.next_in = (Bytef *) crash_uncompr_buf; -+ stream.avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); -+ err = zlib_deflateInit(&stream, Z_BEST_SPEED); -+ if (err != Z_OK) { -+ printk("save_core: bad return %d from deflateInit\n", err); -+ return; -+ } -+ -+ init_dest_page_alloc(); -+ header->map = map = crash_update_map(0, 0, 0, &header->map_pages); -+ if (!map) { -+ printk("save_core: no dest pages\n"); -+ return; -+ } -+ crash_cur_pfn = 0; -+ src_base = 0; -+ src = 0; -+ for (;;) { -+ sub_map = (u_long *) crash_uncompr_buf; -+ unc = crash_uncompr_buf + CRASH_SUB_MAP_PAGES * PAGE_SIZE; -+ for (i = 0; i < CRASH_SOURCE_PAGES; i++) { -+ src = crash_get_source_page(); -+ if (!src) -+ break; -+ if (!i) -+ src_base = src; -+ if (!crash_is_kseg(unc) || !crash_is_kseg(src)) { -+ printk("unc = 0x%lx, src = 0x%lx, i = %d\n", unc, src, i); -+ i = src = 0; -+ break; -+ } -+ memcpy((void *)unc, (void *)src, PAGE_SIZE); -+ unc += PAGE_SIZE; -+ *sub_map++ = src; -+ } -+ *sub_map = 0; -+ if (!i && !src) -+ break; -+ err = zlib_deflate(&stream, Z_FINISH); -+ if (!(err == Z_STREAM_END)) { -+ zlib_deflateEnd(&stream); -+ printk("save_core: bad return %d from deflate, src_base = 0x%lx\n", err, -+ src_base); -+ return; -+ } -+ comp_pages = (u_long) round_page(stream.total_out) / PAGE_SIZE; -+ if (crash_debug) -+ printk("src_base = 0x%lx compressed data in 0x%lx pages\n", src_base, -+ comp_pages); -+ -+ cp = crash_compr_buf; -+ j = 0; -+ if (crash_debug) -+ printk("\nsrc = %lx\n", src_base); -+ else { 
-+ printk("."); -+ if (!(k++ % 64)) -+ printk("\n"); -+ } -+ for (i = 0; i < comp_pages; i++) { -+ dest = crash_alloc_dest_page(); -+ if (crash_debug) { -+ printk("%lx ", dest); -+ if (!(j++ % 8)) -+ printk("\n"); -+ } -+ header->data_pages++; -+ if (!dest) { -+ printk("save_core: no dest pages\n"); -+ return; -+ } -+ if (!crash_is_kseg(dest) || !crash_is_kseg(cp)) { -+ printk("dest = 0x%lx, cp = 0x%lx, i = %d, comp_pages = 0x%lx\n", -+ dest, cp, i, comp_pages); -+ src = 0; -+ break; -+ } -+ memcpy((void *)dest, (void *)cp, PAGE_SIZE); -+ cp += PAGE_SIZE; -+ map = crash_update_map(map, src_base, dest, &header->map_pages); /* links a new map page, if necessary */ -+ if (!map) { -+ printk("save_core: no map\n"); -+ return; -+ } -+ } -+ header->compr_units++; -+ if (!src) -+ break; -+ if (crash_reset_stream(&stream)) -+ return; -+ } -+ -+ map = crash_update_map(map, 0, 0, &header->map_pages); -+ header->magic[0] = CRASH_MAGIC; -+ -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return; -+ } -+ -+ printk("\nsave_core: src pages skipped = 0x%lx src pages saved = 0x%lx\n", -+ src_pages_skipped, src_pages_saved); -+ printk("save_core: data_pages = 0x%lx map_pages = 0x%lx\n", header->data_pages, -+ header->map_pages); -+ printk("save_core: completed, crash_dump_header = 0x%lx\n", crash_dump_header); -+} -+ -+/* helper functions private to this file */ -+static int crash_reset_stream(z_stream * stream) -+{ -+ int err; -+ -+ stream->workspace=(void*)crash_zalloc_start; -+ // stream->zalloc = czalloc; -+ // stream->zfree = czfree; -+ // stream->opaque = (voidpf) 0; -+ stream->next_out = (Bytef *) crash_compr_buf; -+ stream->avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); -+ stream->next_in = (Bytef *) crash_uncompr_buf; -+ stream->avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); -+ err = zlib_deflateReset(stream); -+ if (err != Z_OK) { -+ printk("crash_reset_stream: bad return %d from deflateReset\n", err); -+ return 1; -+ } -+ return 0; -+} -+ 
-+static u_long crash_alloc_dest_page(void) -+{ -+ u_long addr; -+ -+ addr = crash_dest_free_list; -+ if (addr) { -+ crash_dest_free_list = *(u_long *) addr; -+ dest_pages_free--; -+ } else -+ printk("crash_alloc_dest_page: free list empty\n"); -+ return addr; -+} -+ -+static void crash_free_dest_page(u_long dest) -+{ -+ if (!dest) { -+ printk("crash_free_dest_page: freeing addr 0\n"); -+ return; -+ } -+ dest_pages_free++; -+ dest = (u_long) trunc_page(dest); -+ *(u_long *) dest = crash_dest_free_list; -+ crash_dest_free_list = dest; -+} -+ -+/* -+ * Stolen from setup.c -+ */ -+#define PFN_PHYS(x) ((x) << PAGE_SHIFT) -+ -+static void init_dest_page_alloc(void) -+{ -+ u_long va; -+ long i; -+ struct page *page; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ for (i = ((1 << 24) >> PAGE_SHIFT) + LOWER_MEM_FORWARD; -+ i < (max_low_pfn - UPPER_MEM_BACKUP); i++) { -+ va = (u_long) phys_to_virt(PFN_PHYS(i)); -+ if ((va >= header->boot_reserved_start) && (va < header->boot_reserved_end)) -+ continue; -+ page = mem_map + i; -+ if (PageLocked(page) || PageReserved(page)) -+ continue; -+ if (PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers) -+ crash_free_dest_page(va); -+ } -+ if (crash_debug) -+ printk("init_dest_page_alloc: dest_pages_free = 0x%lx\n", dest_pages_free); -+} -+ -+static int crash_is_dir_page(struct page *page) { -+ struct inode *tmp_inode; -+ -+ if(page->mapping && page->mapping->host) { -+ tmp_inode = (struct inode *)page->mapping->host; -+ if((tmp_inode->i_sb->s_magic == EXT2_SUPER_MAGIC) && -+ (S_ISDIR(tmp_inode->i_mode))) -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static u_long crash_get_source_page(void) -+{ -+ struct page *page; -+ u_long va; -+ -+ while (crash_cur_pfn < max_low_pfn) { -+ page = mem_map + crash_cur_pfn; -+ if (!(PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers)) -+ break; -+ src_pages_skipped++; -+ crash_cur_pfn++; -+ } -+ if (crash_cur_pfn == 
max_low_pfn) -+ return 0; -+ -+ va = (u_long) phys_to_virt(PFN_PHYS(crash_cur_pfn)); -+ src_pages_saved++; -+ crash_cur_pfn++; -+ return va; -+} -+ -+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages) -+{ -+ struct mem_crash_map_entry *m; -+ -+ -+ if (!map) { -+ (*pages)++; -+ return crash_alloc_dest_page(); -+ } -+ m = (struct mem_crash_map_entry *)map; -+ m->src_va = src_base; -+ m->dest_page_va = dest; -+ if (dest) -+ if (crash_chksum_page(dest, &m->check_sum)) -+ return 0; -+ -+ map += sizeof(struct mem_crash_map_entry); -+ -+ m = (struct mem_crash_map_entry *)map; -+ if (!src_base) { /* end of list */ -+ if (crash_chksum((u_long) m, &m->src_va)) -+ return 0; -+ } else if ((map + 3 * sizeof(struct mem_crash_map_entry)) > (u_long) round_page(map)) { -+ m->src_va = -1; -+ map = m->dest_page_va = crash_alloc_dest_page(); -+ if (crash_debug) -+ printk("\nm = 0x%lx m->src_va = 0x%lx m->dest_page_va = 0x%lx\n", -+ (u_long) trunc_page(m), m->src_va, m->dest_page_va); -+ m++; -+ if (crash_chksum((u_long) m, &m->src_va)) -+ return 0; -+ if (crash_debug) -+ printk("m = 0x%lx chksum = m->src_va = 0x%lx\n", (u_long) trunc_page(m), -+ m->src_va); -+ if (crash_audit_map_page((u_long) m)) -+ return 0; -+ (*pages)++; -+ } -+ return map; -+} -+ -+static int crash_chksum(u_long limit, u_long * sum_addr) -+{ -+ u_long sum; -+ u_long *addr; -+ -+ if (!crash_is_kseg(limit)) { -+ printk("bad addr = 0x%lx to crash_chksum\n", limit); -+ return 1; -+ } -+ sum = 0; -+ addr = (u_long *) trunc_page(limit); -+ for (; (u_long) addr < limit; addr++) -+ sum += *addr; -+ *sum_addr = sum; -+ return 0; -+} -+ -+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr) -+{ -+ u_long sum, limit; -+ u_long *addr; -+ -+ if (!crash_is_kseg(pg_addr)) { -+ printk("bad addr = 0x%lx to crash_chksum_page\n", pg_addr); -+ return 1; -+ } -+ -+ sum = 0; -+ addr = (u_long *) trunc_page(pg_addr); -+ limit = (u_long) addr + PAGE_SIZE; -+ for (; (u_long) addr < limit; 
addr++) -+ sum += *addr; -+ *sum_addr = sum; -+ return 0; -+} -+ -+static int crash_audit_maps(void) -+{ -+ u_long m, count; -+ u_long *link_addr; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return 1; -+ -+ link_addr = &header->map; -+ m = header->map; -+ -+ count = 0; -+ for (;;) { -+ if (!crash_is_kseg(m)) { -+ printk("crash_audit_maps: bad link 0x%lx at 0x%lx\n", m, -+ (u_long) link_addr); -+ return 1; -+ } -+ if (crash_audit_map_page(m)) { -+ printk("audit failed while on map page %ld\n", count); -+ return 1; -+ } -+ if (!crash_link(m)) -+ break; -+ link_addr = crash_link(m); -+ m = *link_addr; -+ -+ count++; -+ } -+ return 0; -+} -+ -+static int crash_audit_map_page(u_long map) -+{ -+ struct mem_crash_map_entry *m; -+ u_long sum; -+ -+ if (!map || !crash_is_kseg(map)) { -+ printk("crash_audit_map_page: bad map = 0x%lx\n", map); -+ return 1; -+ } -+ map = (u_long) trunc_page((u_long) map); -+ m = (struct mem_crash_map_entry *)map; -+ for (;;) { -+ if ((m->src_va == -1) || (m->src_va == 0)) { -+ m++; -+ if (crash_chksum((u_long) m, &sum)) -+ return 1; -+ if (m->src_va != sum) { -+ printk("crash_audit_map_page: checksum failure1\n"); -+ printk("m = 0x%lx, sum = 0x%lx, m->src_va = 0x%lx\n", -+ (u_long) m, (u_long) sum, (u_long) m->src_va); -+ crash_print_data_around((u_long) & m->src_va); -+ return 1; -+ } else { -+ return 0; -+ } -+ } else { -+ if (crash_chksum_page((u_long) m->dest_page_va, &sum) -+ || (m->check_sum != sum)) { -+ printk("crash_audit_map_page: checksum failure2\n"); -+ printk -+ ("dest_page_va = 0x%lx, &dest_page_va = 0x%lx, sum = 0x%lx, m->check_sum = 0x%lx\n", -+ (u_long) m->dest_page_va, (u_long) (&m->check_sum), -+ (u_long) sum, (u_long) m->check_sum); -+ crash_print_data_around((u_long) & m->check_sum); -+ return 1; -+ } -+ } -+ m++; -+ } -+} -+ -+static void crash_print_data_around(u_long p) -+{ -+ u_long *a; -+ int i; -+ -+ if 
(!crash_is_kseg(p)) { -+ printk("crash_print_data_around: p = 0x%lx not kseg\n", p); -+ return; -+ } -+ a = (u_long *) p; -+ a -= 20; -+ for (i = 0; i < 40; i++) -+ printk("%lx\n", *a++); -+} -+ -+#ifdef CRASH_DEBUG -+static void crash_print_map_page(u_long map) -+{ -+ struct mem_crash_map_entry *m; -+ int j = 0; -+ u_long sum; -+ -+ map = (u_long) trunc_page((u_long) map); -+ m = (struct mem_crash_map_entry *)map; -+ for (;;) { -+ printk("%lx %lx %lx ", m->src_va, m->dest_page_va, m->check_sum); -+ if (!(j++ % 4)) -+ printk("\n"); -+ if ((m->src_va == -1) || (m->src_va == 0)) { -+ m++; -+ printk("%lx %lx ", m->src_va, m->dest_page_va); -+ if (crash_chksum((u_long) m, &sum)); -+ else -+ printk("\nchksum = 0x%lx\n", sum); -+ return; -+ } -+ m++; -+ } -+} -+#endif /* CRASH_DEBUG */ -+ -+static void crash_wait_cpus(void) -+{ -+ int i; -+ int msecs = 0; -+ -+ for (i = 0; i < smp_num_cpus; i++) { -+ if (i != smp_processor_id()) { -+ while (!panic_threads[i]) { -+ msecs++; -+ mdelay(1); -+ if (msecs > CRASH_CPU_TIMEOUT) { -+ /* if other cpus are still running -+ * we have to halt, otherwise we could -+ * risk using buffer cache pages which -+ * could subsequently get flushed to disk. 
-+ */ -+ printk("Unable to halt other CPUs, halting system.\n"); -+ crash_halt_or_reboot(0); -+ } -+ } -+ } -+ } -+ -+ crash_cleanup_smp_state(); -+} -+ -+ -+#if 0 -+static void *czalloc(void *arg, unsigned int items, unsigned int size) -+{ -+ u_long nbytes; -+ u_long addr; -+ -+ nbytes = (u_long) (items * size); -+ nbytes = (u_long) round_page(nbytes); -+ if ((crash_zalloc_cur + nbytes) > crash_zalloc_end) -+ return 0; -+ addr = crash_zalloc_cur; -+ crash_zalloc_cur += nbytes; -+ return ((void *)addr); -+} -+ -+static void czfree(void *arg, void *ptr) -+{ -+ printk("zfree: ptr = 0x%lx\n", (u_long) ptr); -+} -+#endif -+ -+static boolean_t crash_is_kseg(u_long addr) -+{ -+ u_long phys; -+ -+ phys = virt_to_phys((void *)addr); -+ if (phys < PFN_PHYS(max_low_pfn)) -+ return TRUE; -+ else -+ return FALSE; -+} -+ -+static u_long *crash_link(u_long p) -+{ -+ struct mem_crash_map_entry *m; -+ -+ p = (u_long) trunc_page(p); -+ m = (struct mem_crash_map_entry *)p; -+ for (; m->src_va; m++) -+ if (m->src_va == -1) -+ return &m->dest_page_va; -+ -+ return 0; -+} -+ -+/* Call this after data written to disk. 
*/ -+static int crash_free_crashmem(void) -+{ -+ struct mem_crash_map_hdr *header; -+ struct mem_crash_map_entry *m, *last_m; -+ -+ if (crash_debug) -+ printk("crash_free_crashmem: \n"); -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return 1; -+ } -+ m = (struct mem_crash_map_entry *)header->map; -+ again: -+ for (; m->src_va; m++) { -+ if (m->src_va == -1) { -+ last_m = m; -+ m = (struct mem_crash_map_entry *)m->dest_page_va; -+ crash_free_page((unsigned long)last_m); -+ goto again; -+ } -+ crash_free_page(m->dest_page_va); -+ } -+ if (crash_debug) -+ printk("crash_free_crashmem: 0x%lx freed\n", -+ (header->data_pages + header->map_pages) * PAGE_SIZE); -+ header->magic[0] = 0; -+ return 0; -+} -+ -+static void crash_free_page(u_long addr) -+{ -+ struct page *page; -+ -+ page = virt_to_page(addr); -+ ClearPageReserved(page); -+ set_page_count(page, 1); -+ __free_page(page); -+} -+ -+static int get_dump_helper(u_long kva, u_long buf) -+{ -+ struct page *page; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return 1; -+ -+ if (!kva) { -+ if (crash_audit_maps()) { -+ printk("get_dump_helper: audit failure\n"); -+ header->magic[0] = 0; -+ return 1; -+ } -+ page = virt_to_page((u_long) crash_dump_header); -+ if (!PageReserved(page)) { -+ printk("not reserved: crash_dump_header = 0x%lx\n", crash_dump_header); -+ return 1; -+ } -+ if (copy_to_user((char *)buf, (char *)crash_dump_header, -+ sizeof(struct mem_crash_map_hdr))) { -+ printk("get_dump_helper: copy_to_user failed1\n"); -+ return 1; -+ } -+ } else { -+ page = virt_to_page(kva); -+ if (!PageReserved(page)) { -+ printk("not reserved: kva = 0x%lx\n", kva); -+ return 1; -+ } -+ if (copy_to_user((char *)buf, (char *)trunc_page(kva), PAGE_SIZE)) { -+ printk("get_dump_helper: copy_to_user failed2\n"); -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+static 
void free_dump_helper(void) -+{ -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return; -+ if (crash_debug) -+ printk("free_dump_helper\n"); -+ crash_free_crashmem(); -+} -+ -+static int crashdev_open(struct inode *inode, struct file *file) -+{ -+ /* always return success -- nothing to do here */ -+ return 0; -+} -+ -+/* character device implementation */ -+static struct file_operations crashdev_fops = { -+ ioctl:crashdev_ioctl, -+ open:crashdev_open, -+}; -+ -+static struct miscdevice crash_miscdev = { -+ 190, "crash", &crashdev_fops -+}; -+ -+int crash_init_chrdev(void) -+{ -+ int result; -+ -+ result = misc_register(&crash_miscdev); -+ -+ if (result < 0) -+ printk(KERN_WARNING "crash: can't register crash device (c 10 190)\n"); -+ -+ return result; -+} -+ -+/* call the original syscalls, just to get things going */ -+static int crashdev_ioctl(struct inode *inode, struct file *file, -+ unsigned int cmd, unsigned long arg) -+{ -+ int retval = 0; -+ -+ switch (cmd) { -+ case CRASH_IOCFREEDUMP: -+ free_dump_helper(); -+ break; -+ -+ case CRASH_IOCGETDUMP: -+ if (crash_debug) { -+ printk("crashdev_ioctl: get dump\n"); -+ printk("vals: %08lx %08lx\n", -+ ((struct ioctl_getdump *)arg)->kva, -+ ((struct ioctl_getdump *)arg)->buf); -+ } -+ -+ retval = get_dump_helper((u_long) ((struct ioctl_getdump *)arg)->kva, -+ (u_long) ((struct ioctl_getdump *)arg)->buf); -+ break; -+ -+#ifdef CONFIG_BOOTIMG -+ case CRASH_IOCBOOTIMG: -+ if (crash_debug) -+ printk("crashdev_ioctl: bootimg\n"); -+ -+ retval = sys_bootimg((struct boot_image *)arg); -+ break; -+#endif -+ -+ case CRASH_IOCVERSION: -+ if (crash_debug) -+ printk("crashdev_ioctl: version\n"); -+ retval = CRASH_K_MINOR | (CRASH_K_MAJOR << 16); -+ break; -+ -+ default: -+ return -EINVAL; -+ } -+ -+ return retval; -+} -Index: linux/kernel/module.c -=================================================================== -RCS file: 
/chaos/cvs/kernel-rh/linux/kernel/module.c,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/kernel/module.c 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 -+++ linux/kernel/module.c 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 -@@ -311,7 +311,14 @@ - error = -EEXIST; - goto err1; - } -+#if defined(CONFIG_MCL_COREDUMP) -+ /* Call vmalloc_32 instead of module_map (vmalloc for i386) -+ * to avoid being mapped in highmem where mcore can't see us. -+ */ -+ if ((mod = (struct module *)vmalloc_32(size)) == NULL) { -+#else - if ((mod = (struct module *)module_map(size)) == NULL) { -+#endif - error = -ENOMEM; - goto err1; - } -Index: linux/kernel/panic.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/panic.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/kernel/panic.c 12 Mar 2003 19:51:36 -0000 1.3.2.1 -+++ linux/kernel/panic.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -19,6 +19,10 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - asmlinkage void sys_sync(void); /* it's really int */ - - int panic_timeout; -@@ -197,20 +201,43 @@ - unsigned long caller = (unsigned long) __builtin_return_address(0); - #endif - -+#ifdef CONFIG_MCL_COREDUMP -+ crash_save_regs(); -+#endif -+ - bust_spinlocks(1); - va_start(args, fmt); - vsprintf(buf, fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic: %s\n",buf); -+ -+#ifdef CONFIG_MCL_COREDUMP -+ if (!panicmsg) { -+ panicmsg = buf; -+ panic_processor = smp_processor_id(); -+ mb(); -+ } -+#endif -+ - if (netdump_func) - BUG(); - if (in_interrupt()) - printk(KERN_EMERG "In interrupt handler - not syncing\n"); - else if (!current->pid) - printk(KERN_EMERG "In idle task - not syncing\n"); -+#ifdef CONFIG_MCL_COREDUMP -+ else if (crash_perform_sync) -+#else - else -+#endif - sys_sync(); -+ - bust_spinlocks(0); -+ 
-+#ifdef CONFIG_MCL_COREDUMP -+ smp_call_function((void *)smp_crash_funnel_cpu,0,0,0); -+ crash_save_current_state(current); -+#endif - - #ifdef CONFIG_SMP - smp_send_stop(); -Index: linux/kernel/sysctl.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/sysctl.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/kernel/sysctl.c 12 Mar 2003 19:51:36 -0000 1.2.2.1 -+++ linux/kernel/sysctl.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 -@@ -37,6 +37,10 @@ - #include - #endif - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - #if defined(CONFIG_SYSCTL) - - /* External variables not in a header file. */ -@@ -247,6 +251,10 @@ - {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int), - 0644, NULL, &proc_dointvec}, - #endif -+#ifdef CONFIG_MCL_COREDUMP -+ {KERN_PANIC_ON_OOPS, "panic_on_oops", &panic_on_oops, sizeof(int), -+ 0644, NULL, &proc_dointvec}, -+#endif - {KERN_CADPID, "cad_pid", &cad_pid, sizeof (int), - 0600, NULL, &proc_dointvec}, - {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), -Index: linux/lib/Config.in -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/lib/Config.in,v -retrieving revision 1.2 -retrieving revision 1.2.4.1 -diff -u -r1.2 -r1.2.4.1 ---- linux/lib/Config.in 14 Feb 2003 22:59:23 -0000 1.2 -+++ linux/lib/Config.in 1 Apr 2003 12:17:41 -0000 1.2.4.1 -@@ -23,12 +23,14 @@ - fi - fi - --if [ "$CONFIG_PPP_DEFLATE" = "y" -o \ -+if [ "$CONFIG_MCL_COREDUMP" = "y" -o \ -+ "$CONFIG_PPP_DEFLATE" = "y" -o \ - "$CONFIG_JFFS2_FS" = "y" ]; then - define_tristate CONFIG_ZLIB_DEFLATE y - else - if [ "$CONFIG_PPP_DEFLATE" = "m" -o \ -- "$CONFIG_JFFS2_FS" = "m" ]; then -+ "$CONFIG_JFFS2_FS" = "m" -o \ -+ "$CONFIG_MCL_COREDUMP" = "m" ]; then - define_tristate CONFIG_ZLIB_DEFLATE m - else - tristate 'zlib compression support' CONFIG_ZLIB_DEFLATE -Index: linux/mm/memory.c 
-=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/mm/memory.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/mm/memory.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 -+++ linux/mm/memory.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -1381,6 +1381,10 @@ - } - lock_page(page); - -+#ifdef CONFIG_MCL_COREDUMP -+ set_bit(PG_anon, &page->flags); -+#endif -+ - /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. -@@ -1470,6 +1474,9 @@ - mm->rss++; - flush_page_to_ram(page); - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); -+#ifdef CONFIG_MCL_COREDUMP -+ set_bit(PG_anon, &page->flags); -+#endif - lru_cache_add(page); - } - -Index: linux/mm/page_alloc.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/mm/page_alloc.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/mm/page_alloc.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 -+++ linux/mm/page_alloc.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -95,6 +95,10 @@ - struct page *base; - per_cpu_t *per_cpu; - zone_t *zone; -+#ifdef CONFIG_MCL_COREDUMP -+ struct page *pagemap; -+ int count = 1<lock); - -+#ifdef CONFIG_MCL_COREDUMP -+ pagemap = page; -+ do { -+ pagemap->flags |= (1<flags &= ~((1<free_pages -= mask; - - while (mask + (1 << (MAX_ORDER-1))) { -@@ -268,6 +281,16 @@ - zone->free_pages -= 1UL << order; - - page = expand(zone, page, index, order, curr_order, area); -+#ifdef CONFIG_MCL_COREDUMP -+ { -+ struct page *pagemap = page; -+ int count = 1<flags &= ~(1<lock, flags); - - set_page_count(page, 1); -Index: linux/arch/i386//boot/compressed/head.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/compressed/head.S,v -retrieving revision 1.1.1.1 -retrieving revision 1.1.1.1.12.6 
-diff -u -r1.1.1.1 -r1.1.1.1.12.6 ---- linux/arch/i386//boot/compressed/head.S 7 May 2002 21:53:54 -0000 1.1.1.1 -+++ linux/arch/i386//boot/compressed/head.S 5 Apr 2003 05:51:27 -0000 1.1.1.1.12.6 -@@ -23,6 +23,7 @@ - */ - .text - -+#include - #include - #include - -@@ -31,6 +32,55 @@ - startup_32: - cld - cli -+ -+#ifdef CONFIG_BOOTIMG -+/* -+ * GDT is invalid if we're booted by bootimg, so reload it now -+ */ -+ lgdt %cs:gdt_descr -+ ljmp $(__KERNEL_CS),$1f -+ -+gdt_table_limit = gdt_table_end - gdt_table - 1 -+gdt_descr: -+ .word gdt_table_limit -+ .long gdt_table -+ -+gdt_table: /* stolen from arch/i386/kernel/head.S */ -+ .quad 0x0000000000000000 /* NULL descriptor */ -+ .quad 0x0000000000000000 /* 0x0b reserved */ -+ .quad 0x0000000000000000 /* 0x13 reserved */ -+ .quad 0x0000000000000000 /* 0x1b reserved */ -+ .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ -+ .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ -+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ -+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ -+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ -+ .quad 0x0000000000000000 /* 0x4b reserved */ -+ .quad 0x0000000000000000 /* 0x53 reserved */ -+ .quad 0x0000000000000000 /* 0x5b reserved */ -+ -+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ -+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ -+ .quad 0x0000000000000000 /* 0x70 TSS descriptor */ -+ .quad 0x0000000000000000 /* 0x78 LDT descriptor */ -+ -+ /* Segments used for calling PnP BIOS */ -+ .quad 0x00c09a0000000000 /* 0x80 32-bit code */ -+ .quad 0x00809a0000000000 /* 0x88 16-bit code */ -+ .quad 0x0080920000000000 /* 0x90 16-bit data */ -+ .quad 0x0080920000000000 /* 0x98 16-bit data */ -+ .quad 0x0080920000000000 /* 0xa0 16-bit data */ -+ /* -+ * The APM segments have byte granularity and their bases -+ * and limits are set at run time. 
-+ */ -+ .quad 0x00409a0000000000 /* 0xa8 APM CS code */ -+ .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ -+ .quad 0x0040920000000000 /* 0xb8 APM DS data */ -+gdt_table_end: -+ -+1: -+#endif - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es -@@ -92,7 +142,6 @@ - cld - rep - movsl -- - popl %esi # discard the address - popl %ebx # real mode pointer - popl %esi # low_buffer_start -@@ -124,5 +173,10 @@ - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx -+#ifdef CONFIG_BOOTIMG -+ movl $0x100000,%eax -+ jmpl *%eax -+#else - ljmp $(__KERNEL_CS), $0x100000 -+#endif - move_routine_end: -Index: linux/arch/i386//kernel/head.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/head.S,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.5 -diff -u -r1.2.2.1 -r1.2.2.1.2.5 ---- linux/arch/i386//kernel/head.S 12 Mar 2003 19:49:06 -0000 1.2.2.1 -+++ linux/arch/i386//kernel/head.S 5 Apr 2003 05:51:27 -0000 1.2.2.1.2.5 -@@ -42,6 +42,21 @@ - * On entry, %esi points to the real-mode code as a 32-bit pointer. 
- */ - startup_32: -+#ifdef CONFIG_BOOTIMG -+/* -+ * GDT is invalid if we're booted by bootimg, so reload it now -+ */ -+ lgdt %cs:_gdt_descr-__PAGE_OFFSET -+ ljmp $(__KERNEL_CS),$1f-__PAGE_OFFSET -+ -+gdt_limit = SYMBOL_NAME(cpu_gdt_table_end) - SYMBOL_NAME(cpu_gdt_table) - 1 -+ -+_gdt_descr: -+ .word gdt_limit -+ .long SYMBOL_NAME(cpu_gdt_table)-__PAGE_OFFSET -+ -+1: -+#endif - /* - * Set segments to known values - */ -@@ -452,6 +467,7 @@ - .quad 0x00409a0000000000 /* 0xa8 APM CS code */ - .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ - .quad 0x0040920000000000 /* 0xb8 APM DS data */ -+ENTRY(cpu_gdt_table_end) - - #if CONFIG_SMP - .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ diff --git a/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch b/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch deleted file mode 100644 index 5cc34b8..0000000 --- a/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch +++ /dev/null @@ -1,50 +0,0 @@ -Index: linux-2.4.20-30.9/scripts/mkdep.c -=================================================================== ---- linux-2.4.20-30.9.orig/scripts/mkdep.c 2004-02-19 19:40:51.000000000 -0500 -+++ linux-2.4.20-30.9/scripts/mkdep.c 2004-04-28 17:24:54.000000000 -0400 -@@ -48,8 +48,6 @@ - char __depname[512] = "\n\t@touch "; - #define depname (__depname+9) - int hasdep; --char cwd[PATH_MAX]; --int lcwd; - - struct path_struct { - int len; -@@ -204,22 +202,8 @@ - memcpy(path->buffer+path->len, name, len); - path->buffer[path->len+len] = '\0'; - if (access(path->buffer, F_OK) == 0) { -- int l = lcwd + strlen(path->buffer); -- char name2[l+2], *p; -- if (path->buffer[0] == '/') { -- memcpy(name2, path->buffer, l+1); -- } -- else { -- memcpy(name2, cwd, lcwd); -- name2[lcwd] = '/'; -- memcpy(name2+lcwd+1, path->buffer, path->len+len+1); -- } -- while ((p = strstr(name2, "/../"))) { -- *p = '\0'; -- strcpy(strrchr(name2, '/'), p+3); -- } - do_depname(); -- printf(" \\\n %s", name2); -+ printf(" \\\n %s", 
path->buffer); - return; - } - } -@@ -601,12 +585,6 @@ - return 1; - } - -- if (!getcwd(cwd, sizeof(cwd))) { -- fprintf(stderr, "mkdep: getcwd() failed %m\n"); -- return 1; -- } -- lcwd = strlen(cwd); -- - add_path("."); /* for #include "..." */ - - while (++argv, --argc > 0) { diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch deleted file mode 100644 index aa6276f..0000000 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch +++ /dev/null @@ -1,742 +0,0 @@ - fs/Makefile | 3 - fs/file_table.c | 11 ++ - fs/inode.c | 23 ++++- - fs/namei.c | 12 ++ - fs/nfsd/export.c | 5 + - fs/nfsd/nfsfh.c | 65 +++++++++++++- - fs/nfsd/vfs.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++----- - include/linux/fs.h | 10 ++ - kernel/ksyms.c | 2 - 9 files changed, 337 insertions(+), 34 deletions(-) - -Index: linux-bgl/fs/nfsd/vfs.c -=================================================================== ---- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700 -+++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800 -@@ -77,6 +77,129 @@ - static struct raparms * raparml; - static struct raparms * raparm_cache; - -+static int link_raw(struct dentry *dold, struct dentry *ddir, -+ struct dentry *dnew) -+{ -+ int err; -+ -+ struct nameidata old_nd = { .dentry = dold }; -+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->link_raw(&old_nd, &nd); -+ igrab(dold->d_inode); -+ d_instantiate(dnew, dold->d_inode); -+ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) -+ dold->d_inode->i_op->revalidate_it(dnew, NULL); -+ -+ return err; -+} -+ -+static int unlink_raw(struct dentry *dentry, char *fname, int flen, -+ struct dentry *rdentry) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ 
struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->unlink_raw(&nd); -+ if (!err) -+ d_delete(rdentry); -+ -+ return err; -+} -+ -+static int rmdir_raw(struct dentry *dentry, char *fname, int flen, -+ struct dentry *rdentry) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->rmdir_raw(&nd); -+ if(!err) { -+ rdentry->d_inode->i_flags |= S_DEAD; -+ d_delete(rdentry); -+ } -+ -+ return err; -+} -+ -+static int symlink_raw(struct dentry *dentry, char *fname, int flen, -+ char *path) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->symlink_raw(&nd, path); -+ -+ return err; -+} -+ -+static int mkdir_raw(struct dentry *dentry, char *fname, int flen, int mode) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->mkdir_raw(&nd, mode); -+ -+ return err; -+} -+ -+static int mknod_raw(struct dentry *dentry, char *fname, int flen, int mode, -+ dev_t dev) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->mknod_raw(&nd, mode, dev); -+ -+ return err; -+} -+ -+static int rename_raw(struct dentry *fdentry, struct dentry *tdentry, -+ struct dentry *odentry, struct dentry *ndentry) -+{ -+ int err; -+ -+ struct nameidata old_nd = { .dentry = fdentry, .last = odentry->d_name}; -+ struct nameidata new_nd = { .dentry = tdentry, .last = ndentry->d_name}; -+ struct inode_operations *op = old_nd.dentry->d_inode->i_op; -+ err = op->rename_raw(&old_nd, &new_nd); -+ 
d_move(odentry, ndentry); -+ -+ return err; -+} -+ -+static int setattr_raw(struct inode *inode, struct iattr *iap) -+{ -+ int err; -+ -+ iap->ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, iap); -+ -+ return err; -+} -+ -+int revalidate_it(struct dentry *dentry, struct lookup_intent *it) -+{ -+ int err = 0; -+ -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ err = -EINVAL; -+ dentry = NULL; -+ return err; -+ } -+ } -+ -+ return err; -+} -+ - /* - * Look up one component of a pathname. - * N.B. After this call _both_ fhp and resfh need an fh_put -@@ -304,7 +426,10 @@ - } - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime) { -- err = notify_change(dentry, iap); -+ if ( dentry->d_inode->i_op && dentry->d_inode->i_op->setattr_raw) -+ err = setattr_raw(dentry->d_inode, iap); -+ else -+ err = notify_change(dentry, iap); - err = nfserrno(err); - } - if (size_change) { -@@ -431,6 +556,7 @@ - { - struct dentry *dentry; - struct inode *inode; -+ struct lookup_intent it; - int err; - - /* If we get here, then the client has already done an "open", and (hopefully) -@@ -477,6 +603,14 @@ - filp->f_mode = FMODE_READ; - } - -+ intent_init(&it, IT_OPEN, (filp->f_flags & ~O_ACCMODE) | filp->f_mode); -+ -+ err = revalidate_it(dentry, &it); -+ if (err) -+ goto out_nfserr; -+ -+ filp->f_it = ⁢ -+ - err = 0; - if (filp->f_op && filp->f_op->open) { - err = filp->f_op->open(inode, filp); -@@ -491,7 +625,11 @@ - atomic_dec(&filp->f_count); - } - } -+ - out_nfserr: -+ if (it.it_op_release) -+ intent_release(&it); -+ - if (err) - err = nfserrno(err); - out: -@@ -822,7 +960,7 @@ - { - struct dentry *dentry, *dchild; - struct inode *dirp; -- int err; -+ int err, error = -EOPNOTSUPP; - - err = nfserr_perm; - if (!flen) -@@ -838,20 +976,44 @@ - dentry = fhp->fh_dentry; - dirp = dentry->d_inode; - -+ switch (type) { -+ case S_IFDIR: -+ if 
(dirp->i_op->mkdir_raw) -+ error = mkdir_raw(dentry, fname, flen, iap->ia_mode); -+ break; -+ case S_IFCHR: -+ case S_IFBLK: -+ case S_IFIFO: -+ case S_IFSOCK: -+ case S_IFREG: -+ if (dirp->i_op->mknod_raw) { -+ if (type == S_IFREG) -+ rdev = 0; -+ error = mknod_raw(dentry, fname, flen, iap->ia_mode, rdev); -+ } -+ break; -+ default: -+ printk("nfsd: bad file type %o in nfsd_create\n", type); -+ } -+ - err = nfserr_notdir; -- if(!dirp->i_op || !dirp->i_op->lookup) -+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) - goto out; - /* - * Check whether the response file handle has been verified yet. - * If it has, the parent directory should already be locked. - */ -- if (!resfhp->fh_dentry) { -- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ -- fh_lock(fhp); -+ if (!resfhp->fh_dentry || dirp->i_op->lookup_it) { -+ /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create -+ and nfsd_proc_create in case of lustre -+ */ -+ if (!resfhp->fh_dentry) -+ fh_lock(fhp); - dchild = lookup_one_len(fname, dentry, flen); - err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; -+ resfhp->fh_dentry = NULL; - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; -@@ -872,10 +1034,12 @@ - * Make sure the child dentry is still negative ... 
- */ - err = nfserr_exist; -- if (dchild->d_inode) { -- dprintk("nfsd_create: dentry %s/%s not negative!\n", -- dentry->d_name.name, dchild->d_name.name); -- goto out; -+ if ( error == -EOPNOTSUPP) { -+ if (dchild->d_inode) { -+ dprintk("nfsd_create: dentry %s/%s not negative!\n", -+ dentry->d_name.name, dchild->d_name.name); -+ goto out; -+ } - } - - if (!(iap->ia_valid & ATTR_MODE)) -@@ -888,16 +1052,19 @@ - err = nfserr_perm; - switch (type) { - case S_IFREG: -- err = vfs_create(dirp, dchild, iap->ia_mode); -+ if (error == -EOPNOTSUPP) -+ err = vfs_create(dirp, dchild, iap->ia_mode); - break; - case S_IFDIR: -- err = vfs_mkdir(dirp, dchild, iap->ia_mode); -+ if (error == -EOPNOTSUPP) -+ err = vfs_mkdir(dirp, dchild, iap->ia_mode); - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: -- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); -+ if (error == -EOPNOTSUPP) -+ err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); - break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); -@@ -966,7 +1133,13 @@ - /* Get all the sanity checks out of the way before - * we lock the parent. 
*/ - err = nfserr_notdir; -- if(!dirp->i_op || !dirp->i_op->lookup) -+ if (dirp->i_op->mknod_raw) { -+ err = mknod_raw(dentry, fname, flen, iap->ia_mode, 0); -+ if (err && err != -EOPNOTSUPP) -+ goto out; -+ } -+ -+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) - goto out; - fh_lock(fhp); - -@@ -1017,6 +1190,8 @@ - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } -+ if(dirp->i_op->mknod_raw) -+ err = 0; - goto out; - } - -@@ -1123,7 +1298,7 @@ - struct iattr *iap) - { - struct dentry *dentry, *dnew; -- int err, cerr; -+ int err, cerr, error = -EOPNOTSUPP; - - err = nfserr_noent; - if (!flen || !plen) -@@ -1137,12 +1312,18 @@ - goto out; - fh_lock(fhp); - dentry = fhp->fh_dentry; -+ -+ if (dentry->d_inode->i_op->symlink_raw) -+ error = symlink_raw(dentry, fname, flen, path); -+ - dnew = lookup_one_len(fname, dentry, flen); - err = PTR_ERR(dnew); - if (IS_ERR(dnew)) - goto out_nfserr; - -- err = vfs_symlink(dentry->d_inode, dnew, path); -+ err = error; -+ if (err == -EOPNOTSUPP || !dentry->d_inode->i_op->symlink_raw) -+ err = vfs_symlink(dentry->d_inode, dnew, path); - if (!err) { - if (EX_ISSYNC(fhp->fh_export)) - nfsd_sync_dir(dentry); -@@ -1152,7 +1333,10 @@ - iap->ia_valid |= ATTR_CTIME; - iap->ia_mode = (iap->ia_mode&S_IALLUGO) - | S_IFLNK; -- err = notify_change(dnew, iap); -+ if (dnew->d_inode->i_op && dnew->d_inode->i_op->setattr_raw) -+ err = setattr_raw(dnew->d_inode, iap); -+ else -+ err = notify_change(dnew, iap); - if (!err && EX_ISSYNC(fhp->fh_export)) - write_inode_now(dentry->d_inode, 1); - } -@@ -1210,7 +1394,10 @@ - dold = tfhp->fh_dentry; - dest = dold->d_inode; - -- err = vfs_link(dold, dirp, dnew); -+ if (dirp->i_op->link_raw) -+ err = link_raw(dold, ddir, dnew); -+ else -+ err = vfs_link(dold, dirp, dnew); - if (!err) { - if (EX_ISSYNC(ffhp->fh_export)) { - nfsd_sync_dir(ddir); -@@ -1295,7 +1482,10 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ if(fdir->i_op->rename_raw) -+ 
err = rename_raw(fdentry, tdentry, odentry, ndentry); -+ else -+ err = vfs_rename(fdir, odentry, tdir, ndentry); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); -@@ -1316,7 +1506,7 @@ - fill_post_wcc(tfhp); - double_up(&tdir->i_sem, &fdir->i_sem); - ffhp->fh_locked = tfhp->fh_locked = 0; -- -+ - out: - return err; - } -@@ -1362,9 +1552,15 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_unlink(dirp, rdentry); -+ if (dirp->i_op->unlink_raw) -+ err = unlink_raw(dentry, fname, flen, rdentry); -+ else -+ err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ -- err = vfs_rmdir(dirp, rdentry); -+ if (dirp->i_op->rmdir_raw) -+ err = rmdir_raw(dentry, fname, flen, rdentry); -+ else -+ err = vfs_rmdir(dirp, rdentry); - } - - dput(rdentry); -Index: linux-bgl/fs/nfsd/nfsfh.c -=================================================================== ---- linux-bgl.orig/fs/nfsd/nfsfh.c 2003-07-02 08:44:08.000000000 -0700 -+++ linux-bgl/fs/nfsd/nfsfh.c 2004-12-28 17:13:59.942919514 -0800 -@@ -36,6 +36,15 @@ - int sequence; /* sequence counter */ - }; - -+static struct dentry *lookup_it(struct inode *inode, struct dentry * dentry) -+{ -+ if (inode->i_op->lookup_it) -+ return inode->i_op->lookup_it(inode, dentry, NULL, 0); -+ else -+ return inode->i_op->lookup(inode, dentry); -+ -+} -+ - /* - * A rather strange filldir function to capture - * the name matching the specified inode number. -@@ -75,6 +84,8 @@ - int error; - struct file file; - struct nfsd_getdents_callback buffer; -+ struct lookup_intent it; -+ struct file *filp = NULL; - - error = -ENOTDIR; - if (!dir || !S_ISDIR(dir->i_mode)) -@@ -85,9 +96,37 @@ - /* - * Open the directory ... 
- */ -- error = init_private_file(&file, dentry, FMODE_READ); -- if (error) -+ if (dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if ((dentry->d_flags & DCACHE_NFSD_DISCONNECTED) && -+ (dentry->d_parent == dentry) ) { -+ it.it_op_release = NULL; -+ /* -+ * XXX Temporary Hack: Simulating init_private_file without -+ * f_op->open for disconnected dentry Since we don't have actual -+ * dentry->d_name to revalidate in revalidate_it() -+ */ -+ filp = &file; -+ memset(filp, 0, sizeof(*filp)); -+ filp->f_mode = FMODE_READ; -+ atomic_set(&filp->f_count, 1); -+ filp->f_dentry = dentry; -+ filp->f_uid = current->fsuid; -+ filp->f_gid = current->fsgid; -+ filp->f_op = dentry->d_inode->i_fop; -+ error = 0; -+ } else { -+ intent_init(&it, IT_OPEN, 0); -+ error = revalidate_it(dentry, &it); -+ if (error) -+ goto out; -+ error = init_private_file_it(&file, dentry, FMODE_READ, &it); -+ } -+ } else { -+ error = init_private_file_it(&file, dentry, FMODE_READ, NULL); -+ } -+ if (error) - goto out; -+ - error = -EINVAL; - if (!file.f_op->readdir) - goto out_close; -@@ -113,9 +152,13 @@ - } - - out_close: -- if (file.f_op->release) -+ if (file.f_op->release && !filp) - file.f_op->release(dir, &file); - out: -+ if (dentry->d_op && -+ dentry->d_op->d_revalidate_it && -+ it.it_op_release && !filp) -+ intent_release(&it); - return error; - } - -@@ -273,7 +316,7 @@ - /* I'm going to assume that if the returned dentry is different, then - * it is well connected. But nobody returns different dentrys do they? - */ -- pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry); -+ pdentry = lookup_it(child->d_inode, tdentry); - d_drop(tdentry); /* we never want ".." hashed */ - if (!pdentry && tdentry->d_inode == NULL) { - /* File system cannot find ".." ... 
sad but possible */ -@@ -304,6 +347,8 @@ - igrab(tdentry->d_inode); - pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED; - } -+ if (child->d_op && child->d_op->d_revalidate_it) -+ pdentry->d_op = child->d_op; - } - if (pdentry == NULL) - pdentry = ERR_PTR(-ENOMEM); -@@ -461,6 +506,8 @@ - struct dentry *pdentry; - struct inode *parent; - -+ if (result->d_op && result->d_op->d_revalidate_it) -+ dentry->d_op = result->d_op; - pdentry = nfsd_findparent(dentry); - err = PTR_ERR(pdentry); - if (IS_ERR(pdentry)) -@@ -648,6 +695,11 @@ - - inode = dentry->d_inode; - -+ /* cache coherency for non-device filesystems */ -+ if (inode->i_op && inode->i_op->revalidate_it) { -+ inode->i_op->revalidate_it(dentry, NULL); -+ } -+ - /* Type check. The correct error return for type mismatches - * does not seem to be generally agreed upon. SunOS seems to - * use EISDIR if file isn't S_IFREG; a comment in the NFSv3 -@@ -878,8 +930,9 @@ - dentry->d_parent->d_name.name, dentry->d_name.name); - goto out; - out_uptodate: -- printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", -- dentry->d_parent->d_name.name, dentry->d_name.name); -+ if(!dentry->d_parent->d_inode->i_op->mkdir_raw) -+ printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", -+ dentry->d_parent->d_name.name, dentry->d_name.name); - goto out; - } - -Index: linux-bgl/fs/Makefile -=================================================================== ---- linux-bgl.orig/fs/Makefile 2004-12-28 17:13:56.898868625 -0800 -+++ linux-bgl/fs/Makefile 2004-12-28 17:13:59.943919356 -0800 -@@ -7,7 +7,8 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o inode.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o namei.o \ -+ file_table.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-bgl/fs/namei.c -=================================================================== ---- linux-bgl.orig/fs/namei.c 2004-12-28 17:13:56.265835195 -0800 -+++ 
linux-bgl/fs/namei.c 2004-12-28 17:13:59.947918720 -0800 -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -100,6 +101,7 @@ - it->it_op_release(it); - - } -+EXPORT_SYMBOL(intent_release); - - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the -@@ -889,7 +891,8 @@ - - - /* SMP-safe */ --struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) -+struct dentry * lookup_one_len_it(const char * name, struct dentry * base, -+ int len, struct lookup_intent *it) - { - unsigned long hash; - struct qstr this; -@@ -909,11 +912,16 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash_it(&this, base, NULL); -+ return lookup_hash_it(&this, base, it); - access: - return ERR_PTR(-EACCES); - } - -+struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) -+{ -+ return lookup_one_len_it(name, base, len, NULL); -+} -+ - /* - * namei() - * -Index: linux-bgl/fs/file_table.c -=================================================================== ---- linux-bgl.orig/fs/file_table.c 2003-07-02 08:44:42.000000000 -0700 -+++ linux-bgl/fs/file_table.c 2004-12-28 17:13:59.948918562 -0800 -@@ -82,7 +82,8 @@ - * and call the open function (if any). The caller must verify that - * inode->i_fop is not NULL. 
- */ --int init_private_file(struct file *filp, struct dentry *dentry, int mode) -+int init_private_file_it(struct file *filp, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - memset(filp, 0, sizeof(*filp)); - filp->f_mode = mode; -@@ -90,12 +91,20 @@ - filp->f_dentry = dentry; - filp->f_uid = current->fsuid; - filp->f_gid = current->fsgid; -+ if (it) -+ filp->f_it = it; - filp->f_op = dentry->d_inode->i_fop; - if (filp->f_op->open) - return filp->f_op->open(dentry->d_inode, filp); - else - return 0; - } -+EXPORT_SYMBOL(init_private_file_it); -+ -+int init_private_file(struct file *filp, struct dentry *dentry, int mode) -+{ -+ return init_private_file_it(filp, dentry, mode, NULL); -+} - - void fput(struct file * file) - { -Index: linux-bgl/fs/inode.c -=================================================================== ---- linux-bgl.orig/fs/inode.c 2004-12-28 17:13:56.635910389 -0800 -+++ linux-bgl/fs/inode.c 2004-12-28 17:13:59.950918244 -0800 -@@ -971,9 +971,10 @@ - } - - --struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque) -+static inline struct inode *ifind(struct super_block *sb, unsigned long ino, -+ struct list_head *head, -+ find_inode_t find_actor, void *opaque) - { -- struct list_head * head = inode_hashtable + hash(sb,ino); - struct inode * inode; - - spin_lock(&inode_lock); -@@ -986,6 +987,24 @@ - } - spin_unlock(&inode_lock); - -+ return NULL; -+} -+ -+struct inode *ilookup4(struct super_block *sb, unsigned long ino, -+ find_inode_t find_actor, void *opaque) -+{ -+ struct list_head * head = inode_hashtable + hash(sb,ino); -+ return ifind(sb, ino, head, find_actor, opaque); -+} -+ -+struct inode *iget4(struct super_block *sb, unsigned long ino, -+ find_inode_t find_actor, void *opaque) -+{ -+ struct list_head * head = inode_hashtable + hash(sb,ino); -+ struct inode *inode = ifind(sb, ino, head, find_actor, opaque); -+ if (inode) -+ return inode; -+ - /* - * get_new_inode() will do the 
right thing, re-trying the search - * in case it had to block at any point. -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-12-28 17:13:56.978855920 -0800 -+++ linux-bgl/kernel/ksyms.c 2004-12-28 17:13:59.951918085 -0800 -@@ -142,6 +142,7 @@ - EXPORT_SYMBOL(igrab); - EXPORT_SYMBOL(iunique); - EXPORT_SYMBOL(iget4); -+EXPORT_SYMBOL(ilookup4); - EXPORT_SYMBOL(iput); - EXPORT_SYMBOL(force_delete); - EXPORT_SYMBOL(follow_up); -@@ -152,6 +153,7 @@ - EXPORT_SYMBOL(path_release); - EXPORT_SYMBOL(__user_walk); - EXPORT_SYMBOL(lookup_one_len); -+EXPORT_SYMBOL(lookup_one_len_it); - EXPORT_SYMBOL(lookup_hash); - EXPORT_SYMBOL(sys_close); - EXPORT_SYMBOL(dcache_lock); -Index: linux-bgl/include/linux/fs.h -=================================================================== ---- linux-bgl.orig/include/linux/fs.h 2004-12-28 17:13:59.471860200 -0800 -+++ linux-bgl/include/linux/fs.h 2004-12-28 17:13:59.955917450 -0800 -@@ -93,6 +93,9 @@ - #define FS_SINGLE 8 /* Filesystem that can have only one superblock */ - #define FS_NOMOUNT 16 /* Never mount from userland */ - #define FS_LITTER 32 /* Keeps the tree in dcache */ -+#define FS_NFSEXP_FSID 64 /* Use file system specific fsid for -+ * exporting non device filesystems. 
-+ */ - #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon - * as nfs_rename() will be cleaned up - */ -@@ -1149,6 +1152,9 @@ - struct nameidata *nd, struct lookup_intent *it); - extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct lookup_intent *it); -+extern int revalidate_it(struct dentry *dentry, struct lookup_intent *it); -+extern int init_private_file_it(struct file *, struct dentry *dentry, int mode, -+ struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1418,6 +1424,8 @@ - extern int follow_down(struct vfsmount **, struct dentry **); - extern int follow_up(struct vfsmount **, struct dentry **); - extern struct dentry * lookup_one_len(const char *, struct dentry *, int); -+extern struct dentry * lookup_one_len_it(const char *, struct dentry *, int, -+ struct lookup_intent *); - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -@@ -1431,6 +1439,8 @@ - - typedef int (*find_inode_t)(struct inode *, unsigned long, void *); - extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *); -+extern struct inode * ilookup4(struct super_block *, unsigned long, -+ find_inode_t, void *); - static inline struct inode *iget(struct super_block *sb, unsigned long ino) - { - return iget4(sb, ino, NULL, NULL); diff --git a/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch b/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch deleted file mode 100644 index 4602f96..0000000 --- a/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch +++ /dev/null @@ -1,30 +0,0 @@ - include/linux/fs.h | 1 + - mm/filemap.c | 3 +++ - 2 files changed, 4 insertions(+) - -Index: linux-2.4.19.SuSE/include/linux/fs.h 
-=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/include/linux/fs.h Sun Nov 16 01:38:06 2003 -@@ -428,6 +428,7 @@ - int (*releasepage) (struct page *, int); - #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); -+ void (*removepage)(struct page *); /* called when page gets removed from the inode */ - }; - - struct address_space { -Index: linux-2.4.19.SuSE/mm/filemap.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/filemap.c Sat Nov 15 18:02:15 2003 -+++ linux-2.4.19.SuSE/mm/filemap.c Sun Nov 16 01:37:11 2003 -@@ -97,6 +97,9 @@ - { - struct address_space * mapping = page->mapping; - -+ if (mapping->a_ops->removepage) -+ mapping->a_ops->removepage(page); -+ - mapping->nrpages--; - list_del(&page->list); - page->mapping = NULL; diff --git a/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch b/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch deleted file mode 100644 index 567e1e8..0000000 --- a/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch +++ /dev/null @@ -1,16 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/sched.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/sched.h 2003-11-21 04:05:05.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/sched.h 2003-11-21 04:10:29.000000000 +0300 -@@ -927,6 +927,11 @@ - return res; - } - -+static inline int need_resched(void) -+{ -+ return (unlikely(current->need_resched)); -+} -+ - #endif /* __KERNEL__ */ - - #endif diff --git a/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch deleted file mode 100644 index e60f473..0000000 --- a/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch +++ /dev/null @@ 
-1,46 +0,0 @@ - include/linux/socket.h | 4 ++++ - net/netsyms.c | 2 ++ - net/socket.c | 2 +- - 3 files changed, 7 insertions(+), 1 deletion(-) - -Index: linux-DRV401/include/linux/socket.h -=================================================================== ---- linux-DRV401.orig/include/linux/socket.h 2004-10-15 10:26:20.000000000 -0700 -+++ linux-DRV401/include/linux/socket.h 2004-10-15 11:11:09.000000000 -0700 -@@ -260,6 +260,10 @@ - extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen); - extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); - extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); -+struct socket; -+extern int sock_map_fd(struct socket *sock); -+extern struct socket *sockfd_lookup(int fd, int *err); -+ - #endif - #endif /* not kernel and not glibc */ - #endif /* _LINUX_SOCKET_H */ -Index: linux-DRV401/net/netsyms.c -=================================================================== ---- linux-DRV401.orig/net/netsyms.c 2004-10-15 11:10:52.000000000 -0700 -+++ linux-DRV401/net/netsyms.c 2004-10-15 11:11:09.000000000 -0700 -@@ -159,6 +159,8 @@ - EXPORT_SYMBOL(put_cmsg); - EXPORT_SYMBOL(sock_kmalloc); - EXPORT_SYMBOL(sock_kfree_s); -+EXPORT_SYMBOL(sockfd_lookup); -+EXPORT_SYMBOL(sock_map_fd); - - #ifdef CONFIG_FILTER - EXPORT_SYMBOL(sk_run_filter); -Index: linux-DRV401/net/socket.c -=================================================================== ---- linux-DRV401.orig/net/socket.c 2004-10-15 10:24:16.000000000 -0700 -+++ linux-DRV401/net/socket.c 2004-10-15 11:11:09.000000000 -0700 -@@ -326,7 +326,7 @@ - * but we take care of internal coherence yet. 
- */ - --static int sock_map_fd(struct socket *sock) -+int sock_map_fd(struct socket *sock) - { - int fd; - struct qstr this; diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch deleted file mode 100644 index bcd3f73..0000000 --- a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch +++ /dev/null @@ -1,461 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/skbuff.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/skbuff.h 2001-11-22 22:46:26.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/skbuff.h 2004-01-14 01:15:13.000000000 +0300 -@@ -116,6 +116,30 @@ - __u16 size; - }; - -+/* Support for callback when skb data has been released */ -+typedef struct zccd /* Zero Copy Callback Descriptor */ -+{ /* (embed as first member of custom struct) */ -+ atomic_t zccd_count; /* reference count */ -+ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ -+} zccd_t; -+ -+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) -+{ -+ atomic_set (&d->zccd_count, 1); -+ d->zccd_destructor = callback; -+} -+ -+static inline void zccd_get (zccd_t *d) /* take a reference */ -+{ -+ atomic_inc (&d->zccd_count); -+} -+ -+static inline void zccd_put (zccd_t *d) /* release a reference */ -+{ -+ if (atomic_dec_and_test (&d->zccd_count)) -+ (d->zccd_destructor)(d); -+} -+ - /* This data is invariant across clones and lives at - * the end of the header data, ie. at skb->end. - */ -@@ -123,6 +147,12 @@ - atomic_t dataref; - unsigned int nr_frags; - struct sk_buff *frag_list; -+ zccd_t *zccd; /* zero copy descriptor */ -+ zccd_t *zccd2; /* 2nd zero copy descriptor */ -+ /* NB we expect zero-copy data to be at least 1 packet, so -+ * having 2 zccds means we don't unneccessarily split the packet -+ * where consecutive zero-copy sends abutt. 
-+ */ - skb_frag_t frags[MAX_SKB_FRAGS]; - }; - -Index: linux-2.4.19-pre1/include/net/tcp.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/net/tcp.h 2001-11-22 22:47:22.000000000 +0300 -+++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300 -@@ -640,6 +640,8 @@ - - extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); - extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); -+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd); - - extern int tcp_ioctl(struct sock *sk, - int cmd, -@@ -733,6 +735,9 @@ - struct msghdr *msg, - int len, int nonblock, - int flags, int *addr_len); -+extern int tcp_recvpackets(struct sock *sk, -+ struct sk_buff_head *packets, -+ int len, int nonblock); - - extern int tcp_listen_start(struct sock *sk); - -Index: linux-2.4.19-pre1/net/netsyms.c -=================================================================== ---- linux-2.4.19-pre1.orig/net/netsyms.c 2004-01-14 01:10:37.000000000 +0300 -+++ linux-2.4.19-pre1/net/netsyms.c 2004-01-14 01:15:54.000000000 +0300 -@@ -409,6 +409,9 @@ - - #endif - -+EXPORT_SYMBOL(tcp_sendpage_zccd); -+EXPORT_SYMBOL(tcp_recvpackets); -+ - EXPORT_SYMBOL(netlink_set_err); - EXPORT_SYMBOL(netlink_broadcast); - EXPORT_SYMBOL(netlink_unicast); -Index: linux-2.4.19-pre1/net/core/skbuff.c -=================================================================== ---- linux-2.4.19-pre1.orig/net/core/skbuff.c 2001-12-21 20:42:05.000000000 +0300 -+++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300 -@@ -208,6 +208,8 @@ - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; -+ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ -+ skb_shinfo(skb)->zccd2 = NULL; - return skb; - - nodata: -@@ 
-276,6 +278,10 @@ - { - if (!skb->cloned || - atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { -+ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -@@ -532,6 +538,8 @@ - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; -+ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ -+ skb_shinfo(skb)->zccd2 = NULL; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; -@@ -578,6 +586,14 @@ - n->data_len = skb->data_len; - n->len = skb->len; - -+ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; -+ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ -+ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; -+ - if (skb_shinfo(skb)->nr_frags) { - int i; - -@@ -620,6 +636,8 @@ - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; -+ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ -+ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ - - if (skb_shared(skb)) - BUG(); -@@ -641,6 +659,11 @@ - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - -+ if (zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (zccd); /* extra ref (pages are shared) */ -+ if (zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ -+ zccd_get (zccd2); /* extra ref (pages are shared) */ -+ - skb_release_data(skb); - - off = (data+nhead) - skb->head; -@@ -655,6 +678,8 @@ - skb->nh.raw += off; - skb->cloned = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); -+ skb_shinfo(skb)->zccd = zccd; -+ skb_shinfo(skb)->zccd2 = zccd2; - return 0; - - nodata: -Index: linux-2.4.19-pre1/net/ipv4/tcp.c -=================================================================== ---- linux-2.4.19-pre1.orig/net/ipv4/tcp.c 2001-12-21 20:42:05.000000000 +0300 -+++ linux-2.4.19-pre1/net/ipv4/tcp.c 2004-01-14 01:15:13.000000000 +0300 -@@ -744,7 +744,7 @@ - goto out; - } - --ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); -+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); - - static inline int - can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) -@@ -823,7 +823,8 @@ - return err; - } - --ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) -+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ -+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) - { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now; -@@ -871,6 +872,17 @@ - copy = size; - - i = skb_shinfo(skb)->nr_frags; -+ -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ -+ skb_shinfo(skb)->zccd2 != NULL && -+ skb_shinfo(skb)->zccd != zccd && /* not the same one */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ tcp_mark_push (tp, skb); -+ goto new_segment; -+ } -+ - if (can_coalesce(skb, i, page, offset)) { - skb_shinfo(skb)->frags[i-1].size += copy; - } else if (i < MAX_SKB_FRAGS) { -@@ -881,6 +893,20 @@ - goto new_segment; - } - -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != zccd && /* not already 
referencing this zccd */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ zccd_get (zccd); /* bump ref count */ -+ -+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); -+ -+ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ -+ skb_shinfo(skb)->zccd = zccd; -+ else -+ skb_shinfo(skb)->zccd2 = zccd; -+ } -+ - skb->len += copy; - skb->data_len += copy; - skb->ip_summed = CHECKSUM_HW; -@@ -944,7 +970,31 @@ - - lock_sock(sk); - TCP_CHECK_TIMER(sk); -- res = do_tcp_sendpages(sk, &page, offset, size, flags); -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return res; -+} -+ -+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd) -+{ -+ ssize_t res; -+ struct sock *sk = sock->sk; -+ -+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) -+ -+ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ -+ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ -+ BUG (); -+ -+#undef TCP_ZC_CSUM_FLAGS -+ -+ lock_sock(sk); -+ TCP_CHECK_TIMER(sk); -+ -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); -+ - TCP_CHECK_TIMER(sk); - release_sock(sk); - return res; -@@ -1683,6 +1733,202 @@ - goto out; - } - -+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, -+ int len, int nonblock) -+{ -+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); -+ int copied; -+ long timeo; -+ -+ BUG_TRAP (len > 0); -+ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ -+ -+ lock_sock(sk); -+ -+ TCP_CHECK_TIMER(sk); -+ -+ copied = -ENOTCONN; -+ if (sk->state == TCP_LISTEN) -+ goto out; -+ -+ copied = 0; -+ timeo = sock_rcvtimeo(sk, nonblock); -+ -+ do { -+ struct sk_buff * skb; -+ u32 offset; -+ unsigned long used; -+ int exhausted; -+ int eaten; -+ -+ /* Are we at urgent data? Stop if we have read anything. 
*/ -+ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) -+ break; -+ -+ /* We need to check signals first, to get correct SIGURG -+ * handling. FIXME: Need to check this doesnt impact 1003.1g -+ * and move it down to the bottom of the loop -+ */ -+ if (signal_pending(current)) { -+ if (copied) -+ break; -+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; -+ break; -+ } -+ -+ /* Next get a buffer. */ -+ -+ skb = skb_peek(&sk->receive_queue); -+ -+ if (skb == NULL) /* nothing ready */ -+ { -+ if (copied) { -+ if (sk->err || -+ sk->state == TCP_CLOSE || -+ (sk->shutdown & RCV_SHUTDOWN) || -+ !timeo || -+ (0)) -+ break; -+ } else { -+ if (sk->done) -+ break; -+ -+ if (sk->err) { -+ copied = sock_error(sk); -+ break; -+ } -+ -+ if (sk->shutdown & RCV_SHUTDOWN) -+ break; -+ -+ if (sk->state == TCP_CLOSE) { -+ if (!sk->done) { -+ /* This occurs when user tries to read -+ * from never connected socket. -+ */ -+ copied = -ENOTCONN; -+ break; -+ } -+ break; -+ } -+ -+ if (!timeo) { -+ copied = -EAGAIN; -+ break; -+ } -+ } -+ -+ cleanup_rbuf(sk, copied); -+ timeo = tcp_data_wait(sk, timeo); -+ continue; -+ } -+ -+ BUG_TRAP (atomic_read (&skb->users) == 1); -+ -+ exhausted = eaten = 0; -+ -+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; -+ if (skb->h.th->syn) -+ offset--; -+ -+ used = skb->len - offset; -+ -+ if (tp->urg_data) { -+ u32 urg_offset = tp->urg_seq - tp->copied_seq; -+ if (urg_offset < used) { -+ if (!urg_offset) { /* at urgent date */ -+ if (!sk->urginline) { -+ tp->copied_seq++; /* discard the single byte of urgent data */ -+ offset++; -+ used--; -+ } -+ } else /* truncate read */ -+ used = urg_offset; -+ } -+ } -+ -+ BUG_TRAP (used >= 0); -+ if (len < used) -+ used = len; -+ -+ if (used == 0) -+ exhausted = 1; -+ else -+ { -+ if (skb_is_nonlinear (skb)) -+ { -+ int rc = skb_linearize (skb, GFP_KERNEL); -+ -+ printk ("tcp_recvpackets(): linearising: %d\n", rc); -+ -+ if (rc) -+ { -+ if (!copied) -+ copied = rc; -+ break; -+ } -+ } -+ -+ if ((offset 
+ used) == skb->len) /* consuming the whole packet */ -+ { -+ __skb_unlink (skb, &sk->receive_queue); -+ dst_release (skb->dst); -+ skb_orphan (skb); -+ __skb_pull (skb, offset); -+ __skb_queue_tail (packets, skb); -+ exhausted = eaten = 1; -+ } -+ else /* consuming only part of the packet */ -+ { -+ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); -+ -+ if (skb2 == NULL) -+ { -+ if (!copied) -+ copied = -ENOMEM; -+ break; -+ } -+ -+ dst_release (skb2->dst); -+ __skb_pull (skb2, offset); -+ __skb_trim (skb2, used); -+ __skb_queue_tail (packets, skb2); -+ } -+ -+ tp->copied_seq += used; -+ copied += used; -+ len -= used; -+ } -+ -+ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { -+ tp->urg_data = 0; -+ tcp_fast_path_check(sk, tp); -+ } -+ -+ if (!exhausted) -+ continue; -+ -+ if (skb->h.th->fin) -+ { -+ tp->copied_seq++; -+ if (!eaten) -+ tcp_eat_skb (sk, skb); -+ break; -+ } -+ -+ if (!eaten) -+ tcp_eat_skb (sk, skb); -+ -+ } while (len > 0); -+ -+ out: -+ /* Clean up data we have read: This will do ACK frames. */ -+ cleanup_rbuf(sk, copied); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return copied; -+} -+ - /* - * State processing on a close. This implements the state shift for - * sending our FIN frame. 
Note that we only send a FIN for some diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch deleted file mode 100644 index eec0362..0000000 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch +++ /dev/null @@ -1,1849 +0,0 @@ - fs/dcache.c | 19 ++ - fs/exec.c | 17 +- - fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++------- - fs/namespace.c | 28 +++- - fs/open.c | 172 +++++++++++++++++++------- - fs/stat.c | 52 +++++--- - include/linux/dcache.h | 60 +++++++++ - include/linux/fs.h | 32 ++++ - include/linux/fs_struct.h | 4 - kernel/exit.c | 3 - kernel/fork.c | 3 - kernel/ksyms.c | 1 - 12 files changed, 558 insertions(+), 128 deletions(-) - -Index: linux.mcp2/fs/dcache.c -=================================================================== ---- linux.mcp2.orig/fs/dcache.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/dcache.c 2004-05-05 14:19:59.000000000 -0700 -@@ -181,6 +181,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -830,13 +837,19 @@ - * Adds a dentry to the hash according to its name. 
- */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux.mcp2/fs/exec.c -=================================================================== ---- linux.mcp2.orig/fs/exec.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/exec.c 2004-05-05 14:19:59.000000000 -0700 -@@ -107,8 +107,10 @@ - struct file * file; - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - -- error = user_path_walk(library, &nd); -+ error = user_path_walk_it(library, &nd, &it); - if (error) - goto out; - -@@ -120,7 +122,8 @@ - if (error) - goto exit; - -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; -@@ -342,9 +345,11 @@ - struct inode *inode; - struct file *file; - int err = 0; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - - if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- err = path_walk(name, &nd); -+ err = path_walk_it(name, &nd, &it); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; -@@ -356,7 +361,8 @@ - err = -EACCES; - file = ERR_PTR(err); - if (!err) { -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { -@@ -368,6 +374,7 @@ - return file; - } - } -+ intent_release(&it); 
- path_release(&nd); - } - goto out; -@@ -969,7 +976,7 @@ - goto close_fail; - if (!file->f_op->write) - goto close_fail; -- if (do_truncate(file->f_dentry, 0) != 0) -+ if (do_truncate(file->f_dentry, 0, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); -Index: linux.mcp2/fs/namei.c -=================================================================== ---- linux.mcp2.orig/fs/namei.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/namei.c 2004-05-05 14:28:26.000000000 -0700 -@@ -94,6 +94,13 @@ - * XEmacs seems to be relying on it... - */ - -+void intent_release(struct lookup_intent *it) -+{ -+ if (it && it->it_op_release) -+ it->it_op_release(it); -+ -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +267,19 @@ - * Internal lookup() using the new generic dcache. - * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,11 +297,15 @@ - * make sure that nobody added the entry to the dcache in the meantime.. 
- * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -+ int counter = 0; - -+again: -+ counter++; - down(&dir->i_sem); - /* - * First re-do the cached lookup just in case it was created -@@ -300,6 +320,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup_it) -+ result = dir->i_op->lookup_it(dir, dentry, it, flags); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +344,15 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate_it) { -+ if (!result->d_op->d_revalidate_it(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ if (counter > 10) -+ result = ERR_PTR(-ESTALE); -+ if (!IS_ERR(result)) -+ goto again; -+ } - } - return result; - } -@@ -332,7 +364,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 5) -@@ -346,10 +379,12 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(it); - path_release(nd); - return -ELOOP; - } -@@ -447,7 +482,8 @@ - * - * We expect 'base' to be positive and a directory. 
- */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -520,9 +556,10 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, -+ NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -540,7 +577,7 @@ - goto out_dput; - - if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -556,7 +593,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup_it) - break; - continue; - /* here ends the main loop */ -@@ -583,9 +620,9 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -595,7 +632,7 @@ - inode = dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -609,7 +646,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup_it)) - break; - } - goto return_base; -@@ -633,6 +671,34 @@ - * Check the cached dentry for staleness. 
- */ - dentry = nd->dentry; -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ err = -ESTALE; -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { -+ struct dentry *new; -+ err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); -+ if (err) -+ break; -+ new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); -+ if (IS_ERR(new)) { -+ err = PTR_ERR(new); -+ break; -+ } -+ d_invalidate(dentry); -+ dput(dentry); -+ nd->dentry = new; -+ } -+ if (!nd->dentry->d_inode) -+ goto no_inode; -+ if (lookup_flags & LOOKUP_DIRECTORY) { -+ err = -ENOTDIR; -+ if (!nd->dentry->d_inode->i_op || -+ (!nd->dentry->d_inode->i_op->lookup && -+ !nd->dentry->d_inode->i_op->lookup_it)) -+ break; -+ } -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - err = -ESTALE; - if (!dentry->d_op->d_revalidate(dentry, 0)) { -@@ -646,15 +703,28 @@ - dput(dentry); - break; - } -+ if (err) -+ intent_release(it); - path_release(nd); - return_err: - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -743,6 +813,7 @@ - { - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; -+ nd->intent = NULL; - if (*name=='/') - return walk_init_root(name,nd); - read_lock(¤t->fs->lock); -@@ -757,7 +828,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. 
- */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -780,13 +852,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup_it) -+ dentry = inode->i_op->lookup_it(inode, new, it, 0); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -798,6 +873,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -819,7 +900,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -851,6 +932,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. -@@ -946,7 +1044,8 @@ - return retval; - } - --int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - int error; - -@@ -959,12 +1058,15 @@ - goto exit_lock; - - error = -EACCES; /* shouldn't it be ENOSYS? 
*/ -- if (!dir->i_op || !dir->i_op->create) -+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) - goto exit_lock; - - DQUOT_INIT(dir); - lock_kernel(); -- error = dir->i_op->create(dir, dentry, mode); -+ if (dir->i_op->create_it) -+ error = dir->i_op->create_it(dir, dentry, mode, it); -+ else -+ error = dir->i_op->create(dir, dentry, mode); - unlock_kernel(); - exit_lock: - up(&dir->i_zombie); -@@ -973,6 +1075,11 @@ - return error; - } - -+int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ return vfs_create_it(dir, dentry, mode, NULL); -+} -+ - /* - * open_namei() - * -@@ -987,7 +1094,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -997,12 +1105,14 @@ - - acc_mode = ACC_MODE(flag); - -+ if (it) -+ it->it_flags = flag; - /* - * The simplest case - just a plain lookup. - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -1012,6 +1122,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_create_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1028,7 +1142,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1037,10 +1151,11 @@ - goto exit; - } - -+ it->it_create_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { -- error = vfs_create(dir->d_inode, dentry, -- mode & ~current->fs->umask); -+ error = vfs_create_it(dir->d_inode, dentry, -+ mode & ~current->fs->umask, it); - up(&dir->d_inode->i_sem); - dput(nd->dentry); - nd->dentry = dentry; -@@ -1144,7 +1259,7 @@ - if (!error) { - DQUOT_INIT(inode); - -- error = do_truncate(dentry, 0); -+ error = do_truncate(dentry, 0, 1); - } - put_write_access(inode); - if (error) -@@ -1156,8 +1271,10 @@ - return 0; - - exit_dput: -+ intent_release(it); - dput(dentry); - exit: -+ intent_release(it); - path_release(nd); - return error; - -@@ -1176,7 +1293,10 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(it); - dput(dentry); - if (error) - return error; -@@ -1198,13 +1318,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1212,7 +1339,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1269,7 +1396,20 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->mknod_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod_raw(&nd, mode, dev); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - - mode &= ~current->fs->umask; -@@ -1290,6 +1426,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1338,7 +1475,18 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if 
(nd.dentry->d_inode->i_op->mkdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir_raw(&nd, mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 1, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_mkdir(nd.dentry->d_inode, dentry, -@@ -1346,6 +1490,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1447,8 +1592,16 @@ - error = -EBUSY; - goto exit1; - } -+ if (nd.dentry->d_inode->i_op->rmdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ error = op->rmdir_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1507,8 +1660,15 @@ - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1576,15 +1736,27 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->symlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink_raw(&nd, from); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); --out: -+ out: - putname(to); - } - putname(from); -@@ -1667,7 +1835,18 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out_release; -+ } -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } -+ new_dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1713,7 +1888,7 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - struct inode *target; -@@ -1792,7 +1967,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - -@@ -1883,9 +2058,18 @@ - if (newnd.last_type != LAST_NORM) - goto exit2; - -+ if (old_dir->d_inode->i_op->rename_raw) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); -+ unlock_kernel(); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1901,16 +2085,16 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - -+ - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); - unlock_kernel(); -- - dput(new_dentry); - exit4: - dput(old_dentry); -@@ -1961,20 +2145,26 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; - if (IS_ERR(link)) - goto fail; - -+ if (it == NULL) -+ it = nd->intent; -+ else if (it != nd->intent) -+ printk("it != nd->intent: tell phil@clusterfs.com\n"); -+ - if (*link == '/') { - path_release(nd); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = 
link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -1996,7 +2186,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2038,7 +2234,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); -Index: linux.mcp2/fs/namespace.c -=================================================================== ---- linux.mcp2.orig/fs/namespace.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/namespace.c 2004-05-05 14:22:06.000000000 -0700 -@@ -97,6 +97,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -108,6 +109,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -491,15 +493,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - 
down_write(¤t->namespace->sem); - err = -EINVAL; -@@ -522,6 +527,7 @@ - } - - up_write(¤t->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -706,6 +712,7 @@ - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -731,9 +738,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -745,6 +754,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -910,6 +921,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -924,7 +937,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -938,7 +951,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -994,8 +1007,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(¤t->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); -Index: linux.mcp2/fs/open.c 
-=================================================================== ---- linux.mcp2.orig/fs/open.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/open.c 2004-05-05 14:30:34.000000000 -0700 -@@ -19,6 +19,8 @@ - #include - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -95,9 +97,10 @@ - write_unlock(&files->file_lock); - } - --int do_truncate(struct dentry *dentry, loff_t length) -+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) - { - struct inode *inode = dentry->d_inode; -+ struct inode_operations *op = dentry->d_inode->i_op; - int error; - struct iattr newattrs; - -@@ -108,7 +111,13 @@ - down(&inode->i_sem); - newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; -- error = notify_change(dentry, &newattrs); -+ if (called_from_open) -+ newattrs.ia_valid |= ATTR_FROM_OPEN; -+ if (op->setattr_raw) { -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ } else -+ error = notify_change(dentry, &newattrs); - up(&inode->i_sem); - return error; - } -@@ -118,12 +127,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -163,11 +173,13 @@ - error = locks_verify_truncate(inode, NULL, length); - if (!error) { - DQUOT_INIT(inode); -- error = do_truncate(nd.dentry, length); -+ intent_release(&it); -+ error = do_truncate(nd.dentry, length, 0); - } - put_write_access(inode); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -215,7 +227,7 @@ - - error = locks_verify_truncate(inode, file, length); - if (!error) -- error = do_truncate(dentry, length); -+ error = do_truncate(dentry, length, 0); - out_putf: - fput(file); - out: -@@ -260,11 +272,13 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -279,11 +293,25 @@ - goto dput_and_out; - - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!times) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; - } -+ - error = notify_change(nd.dentry, &newattrs); - dput_and_out: - path_release(&nd); -@@ -304,12 +332,14 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre 
lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -324,7 +354,20 @@ - newattrs.ia_atime = times[0].tv_sec; - newattrs.ia_mtime = times[1].tv_sec; - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!utimes) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; -@@ -347,6 +390,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; -@@ -364,13 +408,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(&it); - path_release(&nd); - } - -@@ -386,6 +431,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +440,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -406,6 +452,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -446,6 +493,7 @@ - int 
error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +502,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -471,39 +519,56 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; - } - --asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+int chmod_common(struct dentry *dentry, mode_t mode) - { -- struct inode * inode; -- struct dentry * dentry; -- struct file * file; -- int err = -EBADF; -+ struct inode *inode = dentry->d_inode; - struct iattr newattrs; -+ int err = -EROFS; - -- file = fget(fd); -- if (!file) -+ if (IS_RDONLY(inode)) - goto out; - -- dentry = file->f_dentry; -- inode = dentry->d_inode; -+ if (inode->i_op->setattr_raw) { -+ newattrs.ia_mode = mode; -+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (err != -EOPNOTSUPP) -+ goto out; -+ } - -- err = -EROFS; -- if (IS_RDONLY(inode)) -- goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto out_putf; -+ goto out; -+ - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); - --out_putf: -+out: -+ return err; -+} -+ -+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+{ -+ struct file * file; -+ int err = -EBADF; -+ -+ file = fget(fd); -+ if (!file) -+ goto out; -+ -+ err = chmod_common(file->f_dentry, mode); -+ - fput(file); - out: - return err; -@@ -512,30 +577,14 @@ - asmlinkage long sys_chmod(const char * filename, 
mode_t mode) - { - struct nameidata nd; -- struct inode * inode; - int error; -- struct iattr newattrs; - - error = user_path_walk(filename, &nd); - if (error) - goto out; -- inode = nd.dentry->d_inode; -- -- error = -EROFS; -- if (IS_RDONLY(inode)) -- goto dput_and_out; - -- error = -EPERM; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto dput_and_out; -+ error = chmod_common(nd.dentry, mode); - -- if (mode == (mode_t) -1) -- mode = inode->i_mode; -- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); -- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -- error = notify_change(nd.dentry, &newattrs); -- --dput_and_out: - path_release(&nd); - out: - return error; -@@ -555,6 +604,20 @@ - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = dentry->d_inode->i_op; -+ -+ newattrs.ia_uid = user; -+ newattrs.ia_gid = group; -+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ return error; -+ } -+ - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; -@@ -659,6 +722,7 @@ - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +730,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct 
lookup_intent *it) - { - struct file * f; - struct inode *inode; -@@ -710,12 +775,15 @@ - } - - if (f->f_op && f->f_op->open) { -+ f->f_it = it; - error = f->f_op->open(inode,f); -+ f->f_it = NULL; - if (error) - goto cleanup_all; - } - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - -+ intent_release(it); - return f; - - cleanup_all: -@@ -730,11 +798,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ -Index: linux.mcp2/fs/stat.c -=================================================================== ---- linux.mcp2.orig/fs/stat.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/stat.c 2004-05-05 14:19:59.000000000 -0700 -@@ -17,10 +17,12 @@ - * Revalidate the inode. This is required for proper NFS attribute caching. 
- */ - static __inline__ int --do_revalidate(struct dentry *dentry) -+do_revalidate(struct dentry *dentry, struct lookup_intent *it) - { - struct inode * inode = dentry->d_inode; -- if (inode->i_op && inode->i_op->revalidate) -+ if (inode->i_op && inode->i_op->revalidate_it) -+ return inode->i_op->revalidate_it(dentry, it); -+ else if (inode->i_op && inode->i_op->revalidate) - return inode->i_op->revalidate(dentry); - return 0; - } -@@ -135,13 +139,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -151,13 +157,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -172,13 +180,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -189,13 
+199,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -216,7 +228,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_old_stat(dentry->d_inode, statbuf); - fput(f); -@@ -235,7 +247,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat(dentry->d_inode, statbuf); - fput(f); -@@ -257,7 +269,7 @@ - - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && -- !(error = do_revalidate(nd.dentry))) { -+ !(error = do_revalidate(nd.dentry, NULL))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -@@ -333,12 +345,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -348,12 +362,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -368,7 +384,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat64(dentry->d_inode, statbuf); - fput(f); -Index: linux.mcp2/include/linux/dcache.h -=================================================================== ---- linux.mcp2.orig/include/linux/dcache.h 2004-04-23 16:52:28.000000000 -0700 -+++ linux.mcp2/include/linux/dcache.h 2004-05-05 14:19:59.000000000 -0700 -@@ -5,6 +5,51 @@ - - #include - #include -+#include -+ -+#define IT_OPEN 0x0001 -+#define IT_CREAT 0x0002 -+#define IT_READDIR 0x0004 -+#define IT_GETATTR 0x0008 -+#define IT_LOOKUP 0x0010 -+#define IT_UNLINK 0x0020 -+#define IT_GETXATTR 0x0040 -+#define IT_EXEC 0x0080 -+#define IT_PIN 0x0100 -+ -+#define IT_FL_LOCKED 0x0001 -+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ -+ -+#define INTENT_MAGIC 0x19620323 -+ -+ -+struct lustre_intent_data { -+ int it_disposition; -+ int it_status; -+ __u64 it_lock_handle; -+ void *it_data; -+ int it_lock_mode; -+ int it_int_flags; -+}; -+struct lookup_intent { -+ int it_magic; -+ void (*it_op_release)(struct lookup_intent *); -+ int it_op; -+ int it_flags; -+ int it_create_mode; -+ union { -+ struct lustre_intent_data lustre; -+ } d; -+}; -+ -+static inline void intent_init(struct lookup_intent *it, int op, int flags) -+{ -+ memset(it, 0, sizeof(*it)); -+ it->it_magic = INTENT_MAGIC; -+ it->it_op = op; -+ it->it_flags = flags; -+} -+ - - /* - * linux/include/linux/dcache.h -@@ -90,8 +135,22 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); -+ void (*d_pin)(struct dentry *, struct vfsmount * , int); -+ void (*d_unpin)(struct dentry *, struct vfsmount *, int); - }; - -+#define 
PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ -+ de->d_op->d_pin(de, mnt, flag); -+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ -+ de->d_op->d_unpin(de, mnt, flag); -+ -+ -+/* defined in fs/namei.c */ -+extern void intent_release(struct lookup_intent *it); -+/* defined in fs/dcache.c */ -+extern void __d_rehash(struct dentry * entry, int lock); -+ - /* the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining -@@ -123,6 +182,7 @@ - * s_nfsd_free_path semaphore will be down - */ - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ -+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ - - extern spinlock_t dcache_lock; - -Index: linux.mcp2/include/linux/fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:12:28.000000000 -0700 -+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:19:59.000000000 -0700 -@@ -73,6 +73,7 @@ - - #define FMODE_READ 1 - #define FMODE_WRITE 2 -+#define FMODE_EXEC 4 - - #define READ 0 - #define WRITE 1 -@@ -335,6 +336,9 @@ - #define ATTR_MTIME_SET 256 - #define ATTR_FORCE 512 /* Not a change, but a change it */ - #define ATTR_ATTR_FLAG 1024 -+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ -+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 - - /* - * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -470,6 +474,7 @@ - struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; - struct char_device *i_cdev; -+ void *i_filterdata; - - unsigned long i_dnotify_mask; /* Directory notify events */ - struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -574,6 +579,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_it; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -692,6 +698,7 @@ - struct qstr last; - unsigned int flags; - int last_type; -+ struct lookup_intent *intent; - }; - - #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -@@ -840,7 +847,8 @@ - extern int vfs_link(struct dentry *, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry); - - /* - * File types -@@ -900,21 +908,32 @@ - - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); -+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link_raw) (struct nameidata *,struct nameidata *); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink_raw) (struct nameidata *); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink_raw) (struct nameidata *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir_raw) (struct nameidata *,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir_raw) (struct nameidata *); - int 
(*mknod) (struct inode *,struct dentry *,int,int); -+ int (*mknod_raw) (struct nameidata *,int,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename_raw) (struct nameidata *, struct nameidata *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -+ int (*revalidate_it) (struct dentry *, struct lookup_intent *); - int (*setattr) (struct dentry *, struct iattr *); -+ int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); - }; - -@@ -1115,10 +1134,14 @@ - - asmlinkage long sys_open(const char *, int, int); - asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ --extern int do_truncate(struct dentry *, loff_t start); -+extern int do_truncate(struct dentry *, loff_t start, int called_from_open); - - extern struct file *filp_open(const char *, int, int); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1380,6 +1403,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); 
-@@ -1390,6 +1414,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1499,6 +1525,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; -Index: linux.mcp2/include/linux/fs_struct.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs_struct.h 2004-01-19 07:49:42.000000000 -0800 -+++ linux.mcp2/include/linux/fs_struct.h 2004-05-05 14:19:59.000000000 -0700 -@@ -34,10 +34,12 @@ - write_lock(&fs->lock); - old_root = fs->root; - old_rootmnt = fs->rootmnt; -+ PIN(dentry, mnt, 1); - fs->rootmnt = mntget(mnt); - fs->root = dget(dentry); - write_unlock(&fs->lock); - if (old_root) { -+ UNPIN(old_root, old_rootmnt, 1); - dput(old_root); - mntput(old_rootmnt); - } -@@ -57,10 +59,12 @@ - write_lock(&fs->lock); - old_pwd = fs->pwd; - old_pwdmnt = fs->pwdmnt; -+ PIN(dentry, mnt, 0); - fs->pwdmnt = mntget(mnt); - fs->pwd = dget(dentry); - write_unlock(&fs->lock); - if (old_pwd) { -+ UNPIN(old_pwd, old_pwdmnt, 0); - dput(old_pwd); - mntput(old_pwdmnt); - } -Index: linux.mcp2/kernel/exit.c -=================================================================== ---- linux.mcp2.orig/kernel/exit.c 2004-01-19 
07:49:44.000000000 -0800 -+++ linux.mcp2/kernel/exit.c 2004-05-05 14:19:59.000000000 -0700 -@@ -252,11 +252,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } -Index: linux.mcp2/kernel/fork.c -=================================================================== ---- linux.mcp2.orig/kernel/fork.c 2004-01-19 07:49:44.000000000 -0800 -+++ linux.mcp2/kernel/fork.c 2004-05-05 14:19:59.000000000 -0700 -@@ -384,10 +384,13 @@ - fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); -+ PIN(old->pwd, old->pwdmnt, 0); -+ PIN(old->root, old->rootmnt, 1); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { -+ PIN(old->altroot, old->altrootmnt, 1); - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { -Index: linux.mcp2/kernel/ksyms.c -=================================================================== ---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:12:28.000000000 -0700 -+++ linux.mcp2/kernel/ksyms.c 2004-05-05 14:19:59.000000000 -0700 -@@ -264,6 +264,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch deleted file mode 100644 index 340ce7c..0000000 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch +++ /dev/null @@ -1,1858 +0,0 @@ - fs/dcache.c | 19 ++ - fs/exec.c | 17 +- - fs/namei.c | 295 
+++++++++++++++++++++++++++++++++++++++------- - fs/namespace.c | 28 +++- - fs/open.c | 172 +++++++++++++++++++------- - fs/stat.c | 52 +++++--- - include/linux/dcache.h | 60 +++++++++ - include/linux/fs.h | 32 ++++ - include/linux/fs_struct.h | 4 - kernel/exit.c | 3 - kernel/fork.c | 3 - kernel/ksyms.c | 1 - 12 files changed, 558 insertions(+), 128 deletions(-) - -Index: linux-2.4.19.SuSE/fs/dcache.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/dcache.c Mon Jan 27 05:08:04 2003 -+++ linux-2.4.19.SuSE/fs/dcache.c Sat Nov 15 17:29:03 2003 -@@ -186,6 +186,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -838,13 +845,19 @@ - * Adds a dentry to the hash according to its name. - */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.19.SuSE/fs/exec.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/exec.c Mon Jan 27 05:08:35 2003 -+++ linux-2.4.19.SuSE/fs/exec.c Sat Nov 15 17:34:06 2003 -@@ -107,8 +107,10 @@ - struct file * file; - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - -- error = user_path_walk(library, &nd); -+ error = user_path_walk_it(library, &nd, &it); - if 
(error) - goto out; - -@@ -120,7 +122,8 @@ - if (error) - goto exit; - -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; -@@ -346,9 +349,11 @@ - struct inode *inode; - struct file *file; - int err = 0; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - - if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- err = path_walk(name, &nd); -+ err = path_walk_it(name, &nd, &it); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; -@@ -360,7 +365,8 @@ - err = -EACCES; - file = ERR_PTR(err); - if (!err) { -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { -@@ -372,6 +378,7 @@ - return file; - } - } -+ intent_release(&it); - path_release(&nd); - } - goto out; -@@ -981,7 +988,7 @@ - goto close_fail; - if (!file->f_op->write) - goto close_fail; -- if (do_truncate(file->f_dentry, 0) != 0) -+ if (do_truncate(file->f_dentry, 0, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.19.SuSE/fs/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/namei.c Mon Jan 27 05:08:07 2003 -+++ linux-2.4.19.SuSE/fs/namei.c Sat Nov 15 17:52:03 2003 -@@ -94,6 +94,13 @@ - * XEmacs seems to be relying on it... - */ - -+void intent_release(struct lookup_intent *it) -+{ -+ if (it && it->it_op_release) -+ it->it_op_release(it); -+ -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +267,19 @@ - * Internal lookup() using the new generic dcache. 
- * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,11 +297,15 @@ - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -+ int counter = 0; - -+again: -+ counter++; - down(&dir->i_sem); - /* - * First re-do the cached lookup just in case it was created -@@ -300,6 +320,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup_it) -+ result = dir->i_op->lookup_it(dir, dentry, it, flags); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +344,15 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate_it) { -+ if (!result->d_op->d_revalidate_it(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ if (counter > 10) -+ result = ERR_PTR(-ESTALE); -+ if (!IS_ERR(result)) -+ goto again; -+ } - } - return result; - } -@@ -332,7 +364,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. 
- */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 8) -@@ -346,10 +379,12 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(it); - path_release(nd); - return -ELOOP; - } -@@ -447,7 +482,8 @@ - * - * We expect 'base' to be positive and a directory. - */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -524,12 +560,13 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { - err = -EWOULDBLOCKIO; - if (atomic) - break; -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, -+ NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -547,7 +584,7 @@ - goto out_dput; - - if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -563,7 +600,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup_it) - break; - continue; - /* here ends the main loop */ -@@ -590,12 +627,12 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { - err = -EWOULDBLOCKIO; - if (atomic) - break; -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 
0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -605,7 +642,7 @@ - inode = dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -619,7 +656,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup_it)) - break; - } - goto return_base; -@@ -643,6 +681,32 @@ - * Check the cached dentry for staleness. - */ - dentry = nd->dentry; -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ err = -ESTALE; -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { -+ struct dentry *new; -+ err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); -+ if (err) -+ break; -+ new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, it); -+ if (IS_ERR(new)) { -+ err = PTR_ERR(new); -+ break; -+ } -+ d_invalidate(dentry); -+ dput(dentry); -+ nd->dentry = new; -+ } -+ if (!nd->dentry->d_inode) -+ goto no_inode; -+ if (lookup_flags & LOOKUP_DIRECTORY) { -+ err = -ENOTDIR; -+ if (!nd->dentry->d_inode->i_op || -+ (!nd->dentry->d_inode->i_op->lookup && -+ !nd->dentry->d_inode->i_op->lookup_it)) -+ break; -+ } -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - err = -ESTALE; - if (!dentry->d_op->d_revalidate(dentry, lookup_flags & LOOKUP_PARENT)) { -@@ -656,15 +713,28 @@ - dput(dentry); - break; - } -+ if (err) -+ intent_release(it); - path_release(nd); - return_err: - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct 
nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -753,6 +823,7 @@ - { - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; -+ nd->intent = NULL; - if (*name=='/') - return walk_init_root(name,nd); - read_lock(&current->fs->lock); -@@ -767,7 +838,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -790,13 +862,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup_it) -+ dentry = inode->i_op->lookup_it(inode, new, it, 0); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -808,6 +883,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -829,7 +910,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -861,6 +942,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for 
filesystems that don't use sticky bit is - * minimal. -@@ -958,7 +1056,8 @@ - return retval; - } - --int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - int error; - -@@ -971,12 +1070,15 @@ - goto exit_lock; - - error = -EACCES; /* shouldn't it be ENOSYS? */ -- if (!dir->i_op || !dir->i_op->create) -+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) - goto exit_lock; - - DQUOT_INIT(dir); - lock_kernel(); -- error = dir->i_op->create(dir, dentry, mode); -+ if (dir->i_op->create_it) -+ error = dir->i_op->create_it(dir, dentry, mode, it); -+ else -+ error = dir->i_op->create(dir, dentry, mode); - unlock_kernel(); - exit_lock: - up(&dir->i_zombie); -@@ -985,6 +1087,11 @@ - return error; - } - -+int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ return vfs_create_it(dir, dentry, mode, NULL); -+} -+ - /* - * open_namei() - * -@@ -999,7 +1106,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -1009,12 +1117,14 @@ - - acc_mode = ACC_MODE(flag); - -+ if (it) -+ it->it_flags = flag; - /* - * The simplest case - just a plain lookup. - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -1024,6 +1134,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_create_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1040,7 +1154,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1049,11 +1163,13 @@ - goto exit; - } - -+ it->it_create_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; -- error = vfs_create(dir->d_inode, dentry, mode); -+ error = vfs_create_it(dir->d_inode, dentry, -+ mode & ~current->fs->umask, it); - up(&dir->d_inode->i_sem); - #ifndef DENTRY_WASTE_RAM - if (error) -@@ -1161,7 +1277,7 @@ - if (!error) { - DQUOT_INIT(inode); - -- error = do_truncate(dentry, 0); -+ error = do_truncate(dentry, 0, 1); - } - put_write_access(inode); - if (error) -@@ -1173,8 +1289,10 @@ - return 0; - - exit_dput: -+ intent_release(it); - dput(dentry); - exit: -+ intent_release(it); - path_release(nd); - return error; - -@@ -1193,7 +1311,10 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(it); - dput(dentry); - if (error) - return error; -@@ -1215,13 +1336,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1229,7 +1357,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1286,7 +1414,20 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->mknod_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod_raw(&nd, mode, dev); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - - if (!IS_POSIXACL(nd.dentry->d_inode)) -@@ -1308,6 +1445,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1356,7 +1494,18 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if 
(nd.dentry->d_inode->i_op->mkdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir_raw(&nd, mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 1, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - if (!IS_POSIXACL(nd.dentry->d_inode)) -@@ -1365,6 +1510,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1466,8 +1612,16 @@ - error = -EBUSY; - goto exit1; - } -+ if (nd.dentry->d_inode->i_op->rmdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ error = op->rmdir_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1526,8 +1680,15 @@ - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1595,15 +1756,27 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->symlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink_raw(&nd, from); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); --out: -+ out: - putname(to); - } - putname(from); -@@ -1686,7 +1855,14 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out_release; -+ } -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } -+ new_dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1732,7 +1908,7 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - struct inode *target; -@@ -1811,7 +1987,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - -@@ -1902,9 +2078,18 @@ - if (newnd.last_type != LAST_NORM) - goto exit2; - -+ if (old_dir->d_inode->i_op->rename_raw) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); -+ unlock_kernel(); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1920,16 +2105,16 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - -+ - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); - unlock_kernel(); -- - dput(new_dentry); - exit4: - dput(old_dentry); -@@ -1980,20 +2165,26 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; - if (IS_ERR(link)) - goto fail; - -+ if (it == NULL) -+ it = nd->intent; -+ else if (it != nd->intent) -+ printk("it != nd->intent: tell phil@clusterfs.com\n"); -+ - if (*link == '/') { - path_release(nd); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = 
link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -2015,7 +2206,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2057,7 +2254,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); -Index: linux-2.4.19.SuSE/fs/namespace.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/namespace.c Mon Jan 27 05:08:07 2003 -+++ linux-2.4.19.SuSE/fs/namespace.c Sat Nov 15 17:56:42 2003 -@@ -97,6 +97,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -108,6 +109,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -491,15 +493,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - 
down_write(&current->namespace->sem); - err = -EINVAL; -@@ -522,6 +527,7 @@ - } - - up_write(&current->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -725,6 +731,7 @@ - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -750,9 +757,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -764,6 +773,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -929,6 +940,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -943,7 +956,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -957,7 +970,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -1013,8 +1026,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(&current->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); -Index: linux-2.4.19.SuSE/fs/open.c 
-=================================================================== ---- linux-2.4.19.SuSE.orig/fs/open.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/open.c Sat Nov 15 17:43:27 2003 -@@ -19,6 +19,8 @@ - #include - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -95,9 +97,10 @@ - write_unlock(&files->file_lock); - } - --int do_truncate(struct dentry *dentry, loff_t length) -+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) - { - struct inode *inode = dentry->d_inode; -+ struct inode_operations *op = dentry->d_inode->i_op; - int error; - struct iattr newattrs; - -@@ -108,7 +111,13 @@ - down(&inode->i_sem); - newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; -- error = notify_change(dentry, &newattrs); -+ if (called_from_open) -+ newattrs.ia_valid |= ATTR_FROM_OPEN; -+ if (op->setattr_raw) { -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ } else -+ error = notify_change(dentry, &newattrs); - up(&inode->i_sem); - return error; - } -@@ -118,12 +127,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -163,11 +173,13 @@ - error = locks_verify_truncate(inode, NULL, length); - if (!error) { - DQUOT_INIT(inode); -- error = do_truncate(nd.dentry, length); -+ intent_release(&it); -+ error = do_truncate(nd.dentry, length, 0); - } - put_write_access(inode); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -215,7 +227,7 @@ - - error = locks_verify_truncate(inode, file, length); - if (!error) -- error = do_truncate(dentry, length); -+ error = do_truncate(dentry, length, 0); - out_putf: - fput(file); - out: -@@ -260,11 +272,13 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -279,11 +293,25 @@ - goto dput_and_out; - - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!times) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; - } -+ - error = notify_change(nd.dentry, &newattrs); - dput_and_out: - path_release(&nd); -@@ -304,12 +332,14 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre 
lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -324,7 +354,20 @@ - newattrs.ia_atime = times[0].tv_sec; - newattrs.ia_mtime = times[1].tv_sec; - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!utimes) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; -@@ -347,6 +390,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; -@@ -364,13 +408,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(&it); - path_release(&nd); - } - -@@ -386,6 +431,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +440,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -406,6 +452,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -446,6 +493,7 @@ - int 
error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +502,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -471,39 +519,56 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; - } - --asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+int chmod_common(struct dentry *dentry, mode_t mode) - { -- struct inode * inode; -- struct dentry * dentry; -- struct file * file; -- int err = -EBADF; -+ struct inode *inode = dentry->d_inode; - struct iattr newattrs; -+ int err = -EROFS; - -- file = fget(fd); -- if (!file) -+ if (IS_RDONLY(inode)) - goto out; - -- dentry = file->f_dentry; -- inode = dentry->d_inode; -+ if (inode->i_op->setattr_raw) { -+ newattrs.ia_mode = mode; -+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (err != -EOPNOTSUPP) -+ goto out; -+ } - -- err = -EROFS; -- if (IS_RDONLY(inode)) -- goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto out_putf; -+ goto out; -+ - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); - --out_putf: -+out: -+ return err; -+} -+ -+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+{ -+ struct file * file; -+ int err = -EBADF; -+ -+ file = fget(fd); -+ if (!file) -+ goto out; -+ -+ err = chmod_common(file->f_dentry, mode); -+ - fput(file); - out: - return err; -@@ -512,30 +577,14 @@ - asmlinkage long sys_chmod(const char * filename, 
mode_t mode) - { - struct nameidata nd; -- struct inode * inode; - int error; -- struct iattr newattrs; - - error = user_path_walk(filename, &nd); - if (error) - goto out; -- inode = nd.dentry->d_inode; -- -- error = -EROFS; -- if (IS_RDONLY(inode)) -- goto dput_and_out; - -- error = -EPERM; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto dput_and_out; -+ error = chmod_common(nd.dentry, mode); - -- if (mode == (mode_t) -1) -- mode = inode->i_mode; -- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); -- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -- error = notify_change(nd.dentry, &newattrs); -- --dput_and_out: - path_release(&nd); - out: - return error; -@@ -555,6 +604,20 @@ - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = dentry->d_inode->i_op; -+ -+ newattrs.ia_uid = user; -+ newattrs.ia_gid = group; -+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ return error; -+ } -+ - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; -@@ -659,6 +722,7 @@ - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +730,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct 
lookup_intent *it) - { - struct file * f; - struct inode *inode; -@@ -710,7 +775,9 @@ - } - - if (f->f_op && f->f_op->open) { -+ f->f_it = it; - error = f->f_op->open(inode,f); -+ f->f_it = NULL; - if (error) - goto cleanup_all; - } -@@ -722,6 +789,7 @@ - !inode->i_mapping->a_ops->direct_IO)) - goto cleanup_all; - -+ intent_release(it); - return f; - - cleanup_all: -@@ -736,11 +804,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ -Index: linux-2.4.19.SuSE/fs/stat.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/stat.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/stat.c Sat Nov 15 17:29:03 2003 -@@ -17,10 +17,16 @@ - * Revalidate the inode. This is required for proper NFS attribute caching. 
- */ - static __inline__ int --do_revalidate(struct dentry *dentry) -+do_revalidate(struct dentry *dentry, struct lookup_intent *it) - { - struct inode * inode = dentry->d_inode; -- if (inode->i_op && inode->i_op->revalidate) -+ if (inode->i_op && inode->i_op->revalidate_it) -+ return inode->i_op->revalidate_it(dentry, it); -+ else if (inode->i_op && inode->i_op->revalidate) - return inode->i_op->revalidate(dentry); - return 0; - } -@@ -141,13 +145,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -157,13 +163,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -178,13 +186,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -195,13 
+205,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -222,7 +234,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_old_stat(dentry->d_inode, statbuf); - fput(f); -@@ -241,7 +253,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat(dentry->d_inode, statbuf); - fput(f); -@@ -263,7 +275,7 @@ - - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && -- !(error = do_revalidate(nd.dentry))) { -+ !(error = do_revalidate(nd.dentry, NULL))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -@@ -339,12 +351,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -354,12 +368,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -374,7 +390,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat64(dentry->d_inode, statbuf); - fput(f); -Index: linux-2.4.19.SuSE/include/linux/dcache.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Mon Jan 27 05:13:15 2003 -+++ linux-2.4.19.SuSE/include/linux/dcache.h Sat Nov 15 17:35:46 2003 -@@ -5,6 +5,51 @@ - - #include - #include -+#include -+ -+#define IT_OPEN 0x0001 -+#define IT_CREAT 0x0002 -+#define IT_READDIR 0x0004 -+#define IT_GETATTR 0x0008 -+#define IT_LOOKUP 0x0010 -+#define IT_UNLINK 0x0020 -+#define IT_GETXATTR 0x0040 -+#define IT_EXEC 0x0080 -+#define IT_PIN 0x0100 -+ -+#define IT_FL_LOCKED 0x0001 -+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ -+ -+#define INTENT_MAGIC 0x19620323 -+ -+ -+struct lustre_intent_data { -+ int it_disposition; -+ int it_status; -+ __u64 it_lock_handle; -+ void *it_data; -+ int it_lock_mode; -+ int it_int_flags; -+}; -+struct lookup_intent { -+ int it_magic; -+ void (*it_op_release)(struct lookup_intent *); -+ int it_op; -+ int it_flags; -+ int it_create_mode; -+ union { -+ struct lustre_intent_data lustre; -+ } d; -+}; -+ -+static inline void intent_init(struct lookup_intent *it, int op, int flags) -+{ -+ memset(it, 0, sizeof(*it)); -+ it->it_magic = INTENT_MAGIC; -+ it->it_op = op; -+ it->it_flags = flags; -+} -+ - - /* - * linux/include/linux/dcache.h -@@ -92,8 +137,22 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); -+ void (*d_pin)(struct dentry *, struct vfsmount * , int); -+ void (*d_unpin)(struct dentry *, struct vfsmount *, int); - }; - -+#define 
PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ -+ de->d_op->d_pin(de, mnt, flag); -+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ -+ de->d_op->d_unpin(de, mnt, flag); -+ -+ -+/* defined in fs/namei.c */ -+extern void intent_release(struct lookup_intent *it); -+/* defined in fs/dcache.c */ -+extern void __d_rehash(struct dentry * entry, int lock); -+ - /* the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining -@@ -125,6 +184,7 @@ - * s_nfsd_free_path semaphore will be down - */ - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ -+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ - - extern spinlock_t dcache_lock; - -Index: linux-2.4.19.SuSE/include/linux/fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sat Nov 15 17:25:06 2003 -+++ linux-2.4.19.SuSE/include/linux/fs.h Sat Nov 15 17:29:03 2003 -@@ -73,6 +73,7 @@ - - #define FMODE_READ 1 - #define FMODE_WRITE 2 -+#define FMODE_EXEC 4 - - #define READ 0 - #define WRITE 1 -@@ -363,6 +364,9 @@ - #define ATTR_MTIME_SET 256 - #define ATTR_FORCE 512 /* Not a change, but a change it */ - #define ATTR_ATTR_FLAG 1024 -+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ -+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 - - /* - * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -507,6 +511,7 @@ - struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; - struct char_device *i_cdev; -+ void *i_filterdata; - - unsigned long i_dnotify_mask; /* Directory notify events */ - struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -669,6 +674,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_it; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -799,6 +805,7 @@ - struct qstr last; - unsigned int flags; - int last_type; -+ struct lookup_intent *intent; - }; - - #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -@@ -947,7 +954,8 @@ - extern int __vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry); - - /* - * File types -@@ -1020,21 +1028,32 @@ - - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); -+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link_raw) (struct nameidata *,struct nameidata *); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink_raw) (struct nameidata *); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink_raw) (struct nameidata *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir_raw) (struct nameidata *,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir_raw) (struct nameidata *); - int (*mknod) (struct 
inode *,struct dentry *,int,int); -+ int (*mknod_raw) (struct nameidata *,int,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename_raw) (struct nameidata *, struct nameidata *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -+ int (*revalidate_it) (struct dentry *, struct lookup_intent *); - int (*setattr) (struct dentry *, struct iattr *); -+ int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); - int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); - ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -@@ -1244,10 +1263,14 @@ - - asmlinkage long sys_open(const char *, int, int); - asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ --extern int do_truncate(struct dentry *, loff_t start); -+extern int do_truncate(struct dentry *, loff_t start, int called_from_open); - - extern struct file *filp_open(const char *, int, int); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1515,6 +1538,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int 
FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1526,6 +1550,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1646,6 +1672,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; -Index: linux-2.4.19.SuSE/include/linux/fs_struct.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs_struct.h Fri Jul 13 15:10:44 2001 -+++ linux-2.4.19.SuSE/include/linux/fs_struct.h Sat Nov 15 17:29:03 2003 -@@ -34,10 +34,12 @@ - write_lock(&fs->lock); - old_root = fs->root; - old_rootmnt = fs->rootmnt; -+ PIN(dentry, mnt, 1); - fs->rootmnt = mntget(mnt); - fs->root = dget(dentry); - write_unlock(&fs->lock); - if (old_root) { -+ UNPIN(old_root, old_rootmnt, 1); - dput(old_root); - mntput(old_rootmnt); - } -@@ -57,10 +59,12 @@ - write_lock(&fs->lock); - old_pwd = fs->pwd; - old_pwdmnt = fs->pwdmnt; -+ PIN(dentry, mnt, 0); - fs->pwdmnt = mntget(mnt); - fs->pwd = dget(dentry); - write_unlock(&fs->lock); - if (old_pwd) { -+ UNPIN(old_pwd, old_pwdmnt, 0); - dput(old_pwd); - mntput(old_pwdmnt); - } -Index: 
linux-2.4.19.SuSE/kernel/exit.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/exit.c Mon Jan 27 05:08:16 2003 -+++ linux-2.4.19.SuSE/kernel/exit.c Sat Nov 15 17:29:03 2003 -@@ -288,11 +288,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } -Index: linux-2.4.19.SuSE/kernel/fork.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/fork.c Mon Jan 27 05:08:56 2003 -+++ linux-2.4.19.SuSE/kernel/fork.c Sat Nov 15 17:29:03 2003 -@@ -454,10 +454,13 @@ - fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); -+ PIN(old->pwd, old->pwdmnt, 0); -+ PIN(old->root, old->rootmnt, 1); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { -+ PIN(old->altroot, old->altrootmnt, 1); - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { -Index: linux-2.4.19.SuSE/kernel/ksyms.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/ksyms.c Sat Nov 15 17:24:46 2003 -+++ linux-2.4.19.SuSE/kernel/ksyms.c Sat Nov 15 17:29:03 2003 -@@ -315,6 +315,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch deleted file mode 100644 index 1ff2f5d..0000000 --- 
a/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch +++ /dev/null @@ -1,12 +0,0 @@ -Index: linux.mcp2/kernel/ksyms.c -=================================================================== ---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:57:48.000000000 -0700 -+++ linux.mcp2/kernel/ksyms.c 2004-05-05 15:32:44.000000000 -0700 -@@ -108,6 +108,7 @@ - EXPORT_SYMBOL(kfree); - EXPORT_SYMBOL(vfree); - EXPORT_SYMBOL(__vmalloc); -+extern struct page * vmalloc_to_page(void *addr); - EXPORT_SYMBOL(vmalloc_to_page); - EXPORT_SYMBOL(mem_map); - EXPORT_SYMBOL(remap_page_range); diff --git a/lustre/kernel_patches/series/bgl-2.4.19 b/lustre/kernel_patches/series/bgl-2.4.19 deleted file mode 100644 index bd67a30..0000000 --- a/lustre/kernel_patches/series/bgl-2.4.19 +++ /dev/null @@ -1,47 +0,0 @@ -dev_read_only_2.4.20-rh.patch -exports_2.4.19-bgl.patch -lustre_version.patch -vfs_intent-2.4.19-bgl.patch -invalidate_show-2.4.19-bgl.patch -export-truncate-bgl.patch -iod-stock-24-exports-2.4.19-bgl.patch -ext3-htree-2.4.19-bgl.patch -linux-2.4.19-bgl-xattr-0.8.54.patch -ext3-2.4.20-fixes.patch -ext3-2.4-ino_t.patch -ext3-largefile.patch -ext3-truncate_blocks.patch -ext3-unmount_sync.patch -ext3-use-after-free-2.4.19-pre1.patch -ext3-orphan_lock.patch -ext3-noread-2.4.20.patch -ext3-delete_thread-2.4.20.patch -extN-wantedi.patch -ext3-san-2.4.20.patch -ext3-map_inode_page.patch -ext3-error-export.patch -iopen-2.4.19-bgl.patch -tcp-zero-copy-2.4.19-pre1.patch -jbd-dont-account-blocks-twice.patch -jbd-commit-tricks.patch -ext3-no-write-super.patch -add_page_private-2.4.19-bgl.patch -socket-exports-2.4.19-bgl.patch -removepage-2.4.20.patch -jbd-ctx_switch.patch -jbd-flushtime-2.4.19-suse.patch -jbd-get_write_access.patch -nfs_export_kernel-2.4.19-bgl.patch -ext3-raw-lookup.patch -ext3-ea-in-inode-2.4.20.patch -listman-2.4.19-bgl.patch -ext3-trusted_ea-2.4.20.patch -jbd-2.4.19-pre1-jcberr.patch -resched-2.4.19-pre1.patch -ext3-xattr-ptr-arith-fix.patch 
-vmalloc_to_page-2.4.19-bgl.patch -procfs-ndynamic-2.4.patch -ext3-truncate-buffer-head.patch -kallsyms-2.4-bgl.patch -kksymoops-2.4-bgl.patch -export-show_task-2.4-bgl.patch -- 1.8.3.1