LLNL has updated the BG/L clients to 2.6.9 and has stopped using 2.4.19.
+++ /dev/null
-diff -rup --new-file linux.mcp2/fs/ext3/Makefile linux_tmp/fs/ext3/Makefile
---- linux.mcp2/fs/ext3/Makefile 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/Makefile 2001-12-21 09:41:55.000000000 -0800
-@@ -0,0 +1,16 @@
-+#
-+# Makefile for the linux ext2-filesystem routines.
-+#
-+# Note! Dependencies are done automagically by 'make dep', which also
-+# removes any old dependencies. DON'T put your own dependencies here
-+# unless it's something special (ie not a .c file).
-+#
-+# Note 2! The CFLAGS definitions are now in the main makefile...
-+
-+O_TARGET := ext3.o
-+
-+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+ ioctl.o namei.o super.o symlink.o
-+obj-m := $(O_TARGET)
-+
-+include $(TOPDIR)/Rules.make
-diff -rup --new-file linux.mcp2/fs/ext3/balloc.c linux_tmp/fs/ext3/balloc.c
---- linux.mcp2/fs/ext3/balloc.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/balloc.c 2002-08-02 17:39:45.000000000 -0700
-@@ -0,0 +1,999 @@
-+/*
-+ * linux/fs/ext3/balloc.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ */
-+
-+#include <linux/config.h>
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/locks.h>
-+#include <linux/quotaops.h>
-+
-+/*
-+ * balloc.c contains the blocks allocation and deallocation routines
-+ */
-+
-+/*
-+ * The free blocks are managed by bitmaps. A file system contains several
-+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
-+ * block for inodes, N blocks for the inode table and data blocks.
-+ *
-+ * The file system contains group descriptors which are located after the
-+ * super block. Each descriptor contains the number of the bitmap block and
-+ * the free blocks count in the block. The descriptors are loaded in memory
-+ * when a file system is mounted (see ext3_read_super).
-+ */
-+
-+
-+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-+
-+struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-+ unsigned int block_group,
-+ struct buffer_head ** bh)
-+{
-+ unsigned long group_desc;
-+ unsigned long desc;
-+ struct ext3_group_desc * gdp;
-+
-+ if (block_group >= sb->u.ext3_sb.s_groups_count) {
-+ ext3_error (sb, "ext3_get_group_desc",
-+ "block_group >= groups_count - "
-+ "block_group = %d, groups_count = %lu",
-+ block_group, sb->u.ext3_sb.s_groups_count);
-+
-+ return NULL;
-+ }
-+
-+ group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
-+ desc = block_group % EXT3_DESC_PER_BLOCK(sb);
-+ if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
-+ ext3_error (sb, "ext3_get_group_desc",
-+ "Group descriptor not loaded - "
-+ "block_group = %d, group_desc = %lu, desc = %lu",
-+ block_group, group_desc, desc);
-+ return NULL;
-+ }
-+
-+ gdp = (struct ext3_group_desc *)
-+ sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
-+ if (bh)
-+ *bh = sb->u.ext3_sb.s_group_desc[group_desc];
-+ return gdp + desc;
-+}
-+
-+/*
-+ * Read the bitmap for a given block_group, reading into the specified
-+ * slot in the superblock's bitmap cache.
-+ *
-+ * Return >=0 on success or a -ve error code.
-+ */
-+
-+static int read_block_bitmap (struct super_block * sb,
-+ unsigned int block_group,
-+ unsigned long bitmap_nr)
-+{
-+ struct ext3_group_desc * gdp;
-+ struct buffer_head * bh = NULL;
-+ int retval = -EIO;
-+
-+ gdp = ext3_get_group_desc (sb, block_group, NULL);
-+ if (!gdp)
-+ goto error_out;
-+ retval = 0;
-+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_block_bitmap));
-+ if (!bh) {
-+ ext3_error (sb, "read_block_bitmap",
-+ "Cannot read block bitmap - "
-+ "block_group = %d, block_bitmap = %lu",
-+ block_group, (unsigned long) gdp->bg_block_bitmap);
-+ retval = -EIO;
-+ }
-+ /*
-+ * On IO error, just leave a zero in the superblock's block pointer for
-+ * this group. The IO will be retried next time.
-+ */
-+error_out:
-+ sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
-+ sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
-+ return retval;
-+}
-+
-+/*
-+ * load_block_bitmap loads the block bitmap for a blocks group
-+ *
-+ * It maintains a cache for the last bitmaps loaded. This cache is managed
-+ * with a LRU algorithm.
-+ *
-+ * Notes:
-+ * 1/ There is one cache per mounted file system.
-+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
-+ * this function reads the bitmap without maintaining a LRU cache.
-+ *
-+ * Return the slot used to store the bitmap, or a -ve error code.
-+ */
-+static int __load_block_bitmap (struct super_block * sb,
-+ unsigned int block_group)
-+{
-+ int i, j, retval = 0;
-+ unsigned long block_bitmap_number;
-+ struct buffer_head * block_bitmap;
-+
-+ if (block_group >= sb->u.ext3_sb.s_groups_count)
-+ ext3_panic (sb, "load_block_bitmap",
-+ "block_group >= groups_count - "
-+ "block_group = %d, groups_count = %lu",
-+ block_group, sb->u.ext3_sb.s_groups_count);
-+
-+ if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
-+ if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
-+ if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
-+ block_group)
-+ return block_group;
-+ ext3_error (sb, "__load_block_bitmap",
-+ "block_group != block_bitmap_number");
-+ }
-+ retval = read_block_bitmap (sb, block_group, block_group);
-+ if (retval < 0)
-+ return retval;
-+ return block_group;
-+ }
-+
-+ for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
-+ sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
-+ ;
-+ if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
-+ sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
-+ block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
-+ block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
-+ for (j = i; j > 0; j--) {
-+ sb->u.ext3_sb.s_block_bitmap_number[j] =
-+ sb->u.ext3_sb.s_block_bitmap_number[j - 1];
-+ sb->u.ext3_sb.s_block_bitmap[j] =
-+ sb->u.ext3_sb.s_block_bitmap[j - 1];
-+ }
-+ sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
-+ sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
-+
-+ /*
-+ * There's still one special case here --- if block_bitmap == 0
-+ * then our last attempt to read the bitmap failed and we have
-+ * just ended up caching that failure. Try again to read it.
-+ */
-+ if (!block_bitmap)
-+ retval = read_block_bitmap (sb, block_group, 0);
-+ } else {
-+ if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
-+ sb->u.ext3_sb.s_loaded_block_bitmaps++;
-+ else
-+ brelse (sb->u.ext3_sb.s_block_bitmap
-+ [EXT3_MAX_GROUP_LOADED - 1]);
-+ for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
-+ j > 0; j--) {
-+ sb->u.ext3_sb.s_block_bitmap_number[j] =
-+ sb->u.ext3_sb.s_block_bitmap_number[j - 1];
-+ sb->u.ext3_sb.s_block_bitmap[j] =
-+ sb->u.ext3_sb.s_block_bitmap[j - 1];
-+ }
-+ retval = read_block_bitmap (sb, block_group, 0);
-+ }
-+ return retval;
-+}
-+
-+/*
-+ * Load the block bitmap for a given block group. First of all do a couple
-+ * of fast lookups for common cases and then pass the request onto the guts
-+ * of the bitmap loader.
-+ *
-+ * Return the slot number of the group in the superblock bitmap cache's on
-+ * success, or a -ve error code.
-+ *
-+ * There is still one inconsistency here --- if the number of groups in this
-+ * filesystems is <= EXT3_MAX_GROUP_LOADED, then we have no way of
-+ * differentiating between a group for which we have never performed a bitmap
-+ * IO request, and a group for which the last bitmap read request failed.
-+ */
-+static inline int load_block_bitmap (struct super_block * sb,
-+ unsigned int block_group)
-+{
-+ int slot;
-+
-+ /*
-+ * Do the lookup for the slot. First of all, check if we're asking
-+ * for the same slot as last time, and did we succeed that last time?
-+ */
-+ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
-+ sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
-+ sb->u.ext3_sb.s_block_bitmap[0]) {
-+ return 0;
-+ }
-+ /*
-+ * Or can we do a fast lookup based on a loaded group on a filesystem
-+ * small enough to be mapped directly into the superblock?
-+ */
-+ else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED &&
-+ sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group
-+ && sb->u.ext3_sb.s_block_bitmap[block_group]) {
-+ slot = block_group;
-+ }
-+ /*
-+ * If not, then do a full lookup for this block group.
-+ */
-+ else {
-+ slot = __load_block_bitmap (sb, block_group);
-+ }
-+
-+ /*
-+ * <0 means we just got an error
-+ */
-+ if (slot < 0)
-+ return slot;
-+
-+ /*
-+ * If it's a valid slot, we may still have cached a previous IO error,
-+ * in which case the bh in the superblock cache will be zero.
-+ */
-+ if (!sb->u.ext3_sb.s_block_bitmap[slot])
-+ return -EIO;
-+
-+ /*
-+ * Must have been read in OK to get this far.
-+ */
-+ return slot;
-+}
-+
-+/* Free given blocks, update quota and i_blocks field */
-+void ext3_free_blocks (handle_t *handle, struct inode * inode,
-+ unsigned long block, unsigned long count)
-+{
-+ struct buffer_head *bitmap_bh;
-+ struct buffer_head *gd_bh;
-+ unsigned long block_group;
-+ unsigned long bit;
-+ unsigned long i;
-+ int bitmap_nr;
-+ unsigned long overflow;
-+ struct super_block * sb;
-+ struct ext3_group_desc * gdp;
-+ struct ext3_super_block * es;
-+ int err = 0, ret;
-+ int dquot_freed_blocks = 0;
-+
-+ sb = inode->i_sb;
-+ if (!sb) {
-+ printk ("ext3_free_blocks: nonexistent device");
-+ return;
-+ }
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+ if (block < le32_to_cpu(es->s_first_data_block) ||
-+ (block + count) > le32_to_cpu(es->s_blocks_count)) {
-+ ext3_error (sb, "ext3_free_blocks",
-+ "Freeing blocks not in datazone - "
-+ "block = %lu, count = %lu", block, count);
-+ goto error_return;
-+ }
-+
-+ ext3_debug ("freeing block %lu\n", block);
-+
-+do_more:
-+ overflow = 0;
-+ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
-+ EXT3_BLOCKS_PER_GROUP(sb);
-+ bit = (block - le32_to_cpu(es->s_first_data_block)) %
-+ EXT3_BLOCKS_PER_GROUP(sb);
-+ /*
-+ * Check to see if we are freeing blocks across a group
-+ * boundary.
-+ */
-+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
-+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
-+ count -= overflow;
-+ }
-+ bitmap_nr = load_block_bitmap (sb, block_group);
-+ if (bitmap_nr < 0)
-+ goto error_return;
-+
-+ bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
-+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
-+ if (!gdp)
-+ goto error_return;
-+
-+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
-+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
-+ in_range (block, le32_to_cpu(gdp->bg_inode_table),
-+ sb->u.ext3_sb.s_itb_per_group) ||
-+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
-+ sb->u.ext3_sb.s_itb_per_group))
-+ ext3_error (sb, "ext3_free_blocks",
-+ "Freeing blocks in system zones - "
-+ "Block = %lu, count = %lu",
-+ block, count);
-+
-+ /*
-+ * We are about to start releasing blocks in the bitmap,
-+ * so we need undo access.
-+ */
-+ /* @@@ check errors */
-+ BUFFER_TRACE(bitmap_bh, "getting undo access");
-+ err = ext3_journal_get_undo_access(handle, bitmap_bh);
-+ if (err)
-+ goto error_return;
-+
-+ /*
-+ * We are about to modify some metadata. Call the journal APIs
-+ * to unshare ->b_data if a currently-committing transaction is
-+ * using it
-+ */
-+ BUFFER_TRACE(gd_bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, gd_bh);
-+ if (err)
-+ goto error_return;
-+
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
-+ if (err)
-+ goto error_return;
-+
-+ for (i = 0; i < count; i++) {
-+ /*
-+ * An HJ special. This is expensive...
-+ */
-+#ifdef CONFIG_JBD_DEBUG
-+ {
-+ struct buffer_head *debug_bh;
-+ debug_bh = sb_get_hash_table(sb, block + i);
-+ if (debug_bh) {
-+ BUFFER_TRACE(debug_bh, "Deleted!");
-+ if (!bh2jh(bitmap_bh)->b_committed_data)
-+ BUFFER_TRACE(debug_bh,
-+ "No commited data in bitmap");
-+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-+ __brelse(debug_bh);
-+ }
-+ }
-+#endif
-+ BUFFER_TRACE(bitmap_bh, "clear bit");
-+ if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
-+ ext3_error (sb, __FUNCTION__,
-+ "bit already cleared for block %lu",
-+ block + i);
-+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
-+ } else {
-+ dquot_freed_blocks++;
-+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
-+ es->s_free_blocks_count =
-+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
-+ }
-+ /* @@@ This prevents newly-allocated data from being
-+ * freed and then reallocated within the same
-+ * transaction.
-+ *
-+ * Ideally we would want to allow that to happen, but to
-+ * do so requires making journal_forget() capable of
-+ * revoking the queued write of a data block, which
-+ * implies blocking on the journal lock. *forget()
-+ * cannot block due to truncate races.
-+ *
-+ * Eventually we can fix this by making journal_forget()
-+ * return a status indicating whether or not it was able
-+ * to revoke the buffer. On successful revoke, it is
-+ * safe not to set the allocation bit in the committed
-+ * bitmap, because we know that there is no outstanding
-+ * activity on the buffer any more and so it is safe to
-+ * reallocate it.
-+ */
-+ BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
-+ J_ASSERT_BH(bitmap_bh,
-+ bh2jh(bitmap_bh)->b_committed_data != NULL);
-+ ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
-+ }
-+
-+ /* We dirtied the bitmap block */
-+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-+ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-+
-+ /* And the group descriptor block */
-+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-+ ret = ext3_journal_dirty_metadata(handle, gd_bh);
-+ if (!err) err = ret;
-+
-+ /* And the superblock */
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
-+ ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
-+ if (!err) err = ret;
-+
-+ if (overflow && !err) {
-+ block += count;
-+ count = overflow;
-+ goto do_more;
-+ }
-+ sb->s_dirt = 1;
-+error_return:
-+ ext3_std_error(sb, err);
-+ unlock_super(sb);
-+ if (dquot_freed_blocks)
-+ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-+ return;
-+}
-+
-+/* For ext3 allocations, we must not reuse any blocks which are
-+ * allocated in the bitmap buffer's "last committed data" copy. This
-+ * prevents deletes from freeing up the page for reuse until we have
-+ * committed the delete transaction.
-+ *
-+ * If we didn't do this, then deleting something and reallocating it as
-+ * data would allow the old block to be overwritten before the
-+ * transaction committed (because we force data to disk before commit).
-+ * This would lead to corruption if we crashed between overwriting the
-+ * data and committing the delete.
-+ *
-+ * @@@ We may want to make this allocation behaviour conditional on
-+ * data-writes at some point, and disable it for metadata allocations or
-+ * sync-data inodes.
-+ */
-+static int ext3_test_allocatable(int nr, struct buffer_head *bh)
-+{
-+ if (ext3_test_bit(nr, bh->b_data))
-+ return 0;
-+ if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
-+ return 1;
-+ return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
-+}
-+
-+/*
-+ * Find an allocatable block in a bitmap. We honour both the bitmap and
-+ * its last-committed copy (if that exists), and perform the "most
-+ * appropriate allocation" algorithm of looking for a free block near
-+ * the initial goal; then for a free byte somewhere in the bitmap; then
-+ * for any free bit in the bitmap.
-+ */
-+static int find_next_usable_block(int start,
-+ struct buffer_head *bh, int maxblocks)
-+{
-+ int here, next;
-+ char *p, *r;
-+
-+ if (start > 0) {
-+ /*
-+ * The goal was occupied; search forward for a free
-+ * block within the next XX blocks.
-+ *
-+ * end_goal is more or less random, but it has to be
-+ * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
-+ * next 64-bit boundary is simple..
-+ */
-+ int end_goal = (start + 63) & ~63;
-+ here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-+ if (here < end_goal && ext3_test_allocatable(here, bh))
-+ return here;
-+
-+ ext3_debug ("Bit not found near goal\n");
-+
-+ }
-+
-+ here = start;
-+ if (here < 0)
-+ here = 0;
-+
-+ /*
-+ * There has been no free block found in the near vicinity of
-+ * the goal: do a search forward through the block groups,
-+ * searching in each group first for an entire free byte in the
-+ * bitmap and then for any free bit.
-+ *
-+ * Search first in the remainder of the current group
-+ */
-+ p = ((char *) bh->b_data) + (here >> 3);
-+ r = memscan(p, 0, (maxblocks - here + 7) >> 3);
-+ next = (r - ((char *) bh->b_data)) << 3;
-+
-+ if (next < maxblocks && ext3_test_allocatable(next, bh))
-+ return next;
-+
-+ /* The bitmap search --- search forward alternately
-+ * through the actual bitmap and the last-committed copy
-+ * until we find a bit free in both. */
-+
-+ while (here < maxblocks) {
-+ next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data,
-+ maxblocks, here);
-+ if (next >= maxblocks)
-+ return -1;
-+ if (ext3_test_allocatable(next, bh))
-+ return next;
-+
-+ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
-+ here = ext3_find_next_zero_bit
-+ ((unsigned long *) bh2jh(bh)->b_committed_data,
-+ maxblocks, next);
-+ }
-+ return -1;
-+}
-+
-+/*
-+ * ext3_new_block uses a goal block to assist allocation. If the goal is
-+ * free, or there is a free block within 32 blocks of the goal, that block
-+ * is allocated. Otherwise a forward search is made for a free block; within
-+ * each block group the search first looks for an entire free byte in the block
-+ * bitmap, and then for any free bit if that fails.
-+ * This function also updates quota and i_blocks field.
-+ */
-+int ext3_new_block (handle_t *handle, struct inode * inode,
-+ unsigned long goal, u32 * prealloc_count,
-+ u32 * prealloc_block, int * errp)
-+{
-+ struct buffer_head * bh, *bhtmp;
-+ struct buffer_head * bh2;
-+#if 0
-+ char * p, * r;
-+#endif
-+ int i, j, k, tmp, alloctmp;
-+ int bitmap_nr;
-+ int fatal = 0, err;
-+ int performed_allocation = 0;
-+ struct super_block * sb;
-+ struct ext3_group_desc * gdp;
-+ struct ext3_super_block * es;
-+#ifdef EXT3FS_DEBUG
-+ static int goal_hits = 0, goal_attempts = 0;
-+#endif
-+ *errp = -ENOSPC;
-+ sb = inode->i_sb;
-+ if (!sb) {
-+ printk ("ext3_new_block: nonexistent device");
-+ return 0;
-+ }
-+
-+ /*
-+ * Check quota for allocation of this block.
-+ */
-+ if (DQUOT_ALLOC_BLOCK(inode, 1)) {
-+ *errp = -EDQUOT;
-+ return 0;
-+ }
-+
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+ if (le32_to_cpu(es->s_free_blocks_count) <=
-+ le32_to_cpu(es->s_r_blocks_count) &&
-+ ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
-+ (sb->u.ext3_sb.s_resgid == 0 ||
-+ !in_group_p (sb->u.ext3_sb.s_resgid)) &&
-+ !capable(CAP_SYS_RESOURCE)))
-+ goto out;
-+
-+ ext3_debug ("goal=%lu.\n", goal);
-+
-+ /*
-+ * First, test whether the goal block is free.
-+ */
-+ if (goal < le32_to_cpu(es->s_first_data_block) ||
-+ goal >= le32_to_cpu(es->s_blocks_count))
-+ goal = le32_to_cpu(es->s_first_data_block);
-+ i = (goal - le32_to_cpu(es->s_first_data_block)) /
-+ EXT3_BLOCKS_PER_GROUP(sb);
-+ gdp = ext3_get_group_desc (sb, i, &bh2);
-+ if (!gdp)
-+ goto io_error;
-+
-+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
-+ j = ((goal - le32_to_cpu(es->s_first_data_block)) %
-+ EXT3_BLOCKS_PER_GROUP(sb));
-+#ifdef EXT3FS_DEBUG
-+ if (j)
-+ goal_attempts++;
-+#endif
-+ bitmap_nr = load_block_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ goto io_error;
-+
-+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
-+
-+ ext3_debug ("goal is at %d:%d.\n", i, j);
-+
-+ if (ext3_test_allocatable(j, bh)) {
-+#ifdef EXT3FS_DEBUG
-+ goal_hits++;
-+ ext3_debug ("goal bit allocated.\n");
-+#endif
-+ goto got_block;
-+ }
-+
-+ j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb));
-+ if (j >= 0)
-+ goto search_back;
-+ }
-+
-+ ext3_debug ("Bit not found in block group %d.\n", i);
-+
-+ /*
-+ * Now search the rest of the groups. We assume that
-+ * i and gdp correctly point to the last group visited.
-+ */
-+ for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
-+ i++;
-+ if (i >= sb->u.ext3_sb.s_groups_count)
-+ i = 0;
-+ gdp = ext3_get_group_desc (sb, i, &bh2);
-+ if (!gdp) {
-+ *errp = -EIO;
-+ goto out;
-+ }
-+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
-+ bitmap_nr = load_block_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ goto io_error;
-+
-+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
-+ j = find_next_usable_block(-1, bh,
-+ EXT3_BLOCKS_PER_GROUP(sb));
-+ if (j >= 0)
-+ goto search_back;
-+ }
-+ }
-+
-+ /* No space left on the device */
-+ goto out;
-+
-+search_back:
-+ /*
-+ * We have succeeded in finding a free byte in the block
-+ * bitmap. Now search backwards up to 7 bits to find the
-+ * start of this group of free blocks.
-+ */
-+ for ( k = 0;
-+ k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh);
-+ k++, j--)
-+ ;
-+
-+got_block:
-+
-+ ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count);
-+
-+ /* Make sure we use undo access for the bitmap, because it is
-+ critical that we do the frozen_data COW on bitmap buffers in
-+ all cases even if the buffer is in BJ_Forget state in the
-+ committing transaction. */
-+ BUFFER_TRACE(bh, "get undo access for marking new block");
-+ fatal = ext3_journal_get_undo_access(handle, bh);
-+ if (fatal) goto out;
-+
-+ BUFFER_TRACE(bh2, "get_write_access");
-+ fatal = ext3_journal_get_write_access(handle, bh2);
-+ if (fatal) goto out;
-+
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
-+ if (fatal) goto out;
-+
-+ tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
-+ + le32_to_cpu(es->s_first_data_block);
-+
-+ if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
-+ tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
-+ in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
-+ sb->u.ext3_sb.s_itb_per_group))
-+ ext3_error (sb, "ext3_new_block",
-+ "Allocating block in system zone - "
-+ "block = %u", tmp);
-+
-+ /* The superblock lock should guard against anybody else beating
-+ * us to this point! */
-+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
-+ BUFFER_TRACE(bh, "setting bitmap bit");
-+ ext3_set_bit(j, bh->b_data);
-+ performed_allocation = 1;
-+
-+#ifdef CONFIG_JBD_DEBUG
-+ {
-+ struct buffer_head *debug_bh;
-+
-+ /* Record bitmap buffer state in the newly allocated block */
-+ debug_bh = sb_get_hash_table(sb, tmp);
-+ if (debug_bh) {
-+ BUFFER_TRACE(debug_bh, "state when allocated");
-+ BUFFER_TRACE2(debug_bh, bh, "bitmap state");
-+ brelse(debug_bh);
-+ }
-+ }
-+#endif
-+ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
-+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data));
-+ bhtmp = bh;
-+ alloctmp = j;
-+
-+ ext3_debug ("found bit %d\n", j);
-+
-+ /*
-+ * Do block preallocation now if required.
-+ */
-+#ifdef EXT3_PREALLOCATE
-+ /*
-+ * akpm: this is not enabled for ext3. Need to use
-+ * ext3_test_allocatable()
-+ */
-+ /* Writer: ->i_prealloc* */
-+ if (prealloc_count && !*prealloc_count) {
-+ int prealloc_goal;
-+ unsigned long next_block = tmp + 1;
-+
-+ prealloc_goal = es->s_prealloc_blocks ?
-+ es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS;
-+
-+ *prealloc_block = next_block;
-+ /* Writer: end */
-+ for (k = 1;
-+ k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb);
-+ k++, next_block++) {
-+ if (DQUOT_PREALLOC_BLOCK(inode, 1))
-+ break;
-+ /* Writer: ->i_prealloc* */
-+ if (*prealloc_block + *prealloc_count != next_block ||
-+ ext3_set_bit (j + k, bh->b_data)) {
-+ /* Writer: end */
-+ DQUOT_FREE_BLOCK(inode, 1);
-+ break;
-+ }
-+ (*prealloc_count)++;
-+ /* Writer: end */
-+ }
-+ /*
-+ * As soon as we go for per-group spinlocks we'll need these
-+ * done inside the loop above.
-+ */
-+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
-+ (k - 1));
-+ es->s_free_blocks_count =
-+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) -
-+ (k - 1));
-+ ext3_debug ("Preallocated a further %lu bits.\n",
-+ (k - 1));
-+ }
-+#endif
-+
-+ j = tmp;
-+
-+ BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (!fatal) fatal = err;
-+
-+ if (j >= le32_to_cpu(es->s_blocks_count)) {
-+ ext3_error (sb, "ext3_new_block",
-+ "block(%d) >= blocks count(%d) - "
-+ "block_group = %d, es == %p ",j,
-+ le32_to_cpu(es->s_blocks_count), i, es);
-+ goto out;
-+ }
-+
-+ /*
-+ * It is up to the caller to add the new buffer to a journal
-+ * list of some description. We don't know in advance whether
-+ * the caller wants to use it as metadata or data.
-+ */
-+
-+ ext3_debug ("allocating block %d. "
-+ "Goal hits %d of %d.\n", j, goal_hits, goal_attempts);
-+
-+ gdp->bg_free_blocks_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
-+ es->s_free_blocks_count =
-+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
-+
-+ BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor");
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (!fatal) fatal = err;
-+
-+ BUFFER_TRACE(bh, "journal_dirty_metadata for superblock");
-+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
-+ if (!fatal) fatal = err;
-+
-+ sb->s_dirt = 1;
-+ if (fatal)
-+ goto out;
-+
-+ unlock_super (sb);
-+ *errp = 0;
-+ return j;
-+
-+io_error:
-+ *errp = -EIO;
-+out:
-+ if (fatal) {
-+ *errp = fatal;
-+ ext3_std_error(sb, fatal);
-+ }
-+ unlock_super (sb);
-+ /*
-+ * Undo the block allocation
-+ */
-+ if (!performed_allocation)
-+ DQUOT_FREE_BLOCK(inode, 1);
-+ return 0;
-+
-+}
-+
-+unsigned long ext3_count_free_blocks (struct super_block * sb)
-+{
-+#ifdef EXT3FS_DEBUG
-+ struct ext3_super_block * es;
-+ unsigned long desc_count, bitmap_count, x;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ int i;
-+
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+ desc_count = 0;
-+ bitmap_count = 0;
-+ gdp = NULL;
-+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
-+ gdp = ext3_get_group_desc (sb, i, NULL);
-+ if (!gdp)
-+ continue;
-+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
-+ bitmap_nr = load_block_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ continue;
-+
-+ x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
-+ sb->s_blocksize);
-+ printk ("group %d: stored = %d, counted = %lu\n",
-+ i, le16_to_cpu(gdp->bg_free_blocks_count), x);
-+ bitmap_count += x;
-+ }
-+ printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
-+ le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
-+ unlock_super (sb);
-+ return bitmap_count;
-+#else
-+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
-+#endif
-+}
-+
-+static inline int block_in_use (unsigned long block,
-+ struct super_block * sb,
-+ unsigned char * map)
-+{
-+ return ext3_test_bit ((block -
-+ le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
-+ EXT3_BLOCKS_PER_GROUP(sb), map);
-+}
-+
-+static inline int test_root(int a, int b)
-+{
-+ if (a == 0)
-+ return 1;
-+ while (1) {
-+ if (a == 1)
-+ return 1;
-+ if (a % b)
-+ return 0;
-+ a = a / b;
-+ }
-+}
-+
-+int ext3_group_sparse(int group)
-+{
-+ return (test_root(group, 3) || test_root(group, 5) ||
-+ test_root(group, 7));
-+}
-+
-+/**
-+ * ext3_bg_has_super - number of blocks used by the superblock in group
-+ * @sb: superblock for filesystem
-+ * @group: group number to check
-+ *
-+ * Return the number of blocks used by the superblock (primary or backup)
-+ * in this group. Currently this will be only 0 or 1.
-+ */
-+int ext3_bg_has_super(struct super_block *sb, int group)
-+{
-+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
-+ !ext3_group_sparse(group))
-+ return 0;
-+ return 1;
-+}
-+
-+/**
-+ * ext3_bg_num_gdb - number of blocks used by the group table in group
-+ * @sb: superblock for filesystem
-+ * @group: group number to check
-+ *
-+ * Return the number of blocks used by the group descriptor table
-+ * (primary or backup) in this group. In the future there may be a
-+ * different number of descriptor blocks in each group.
-+ */
-+unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
-+{
-+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
-+ !ext3_group_sparse(group))
-+ return 0;
-+ return EXT3_SB(sb)->s_gdb_count;
-+}
-+
-+#ifdef CONFIG_EXT3_CHECK
-+/* Called at mount-time, super-block is locked */
-+void ext3_check_blocks_bitmap (struct super_block * sb)
-+{
-+ struct buffer_head * bh;
-+ struct ext3_super_block * es;
-+ unsigned long desc_count, bitmap_count, x, j;
-+ unsigned long desc_blocks;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ int i;
-+
-+ es = sb->u.ext3_sb.s_es;
-+ desc_count = 0;
-+ bitmap_count = 0;
-+ gdp = NULL;
-+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
-+ gdp = ext3_get_group_desc (sb, i, NULL);
-+ if (!gdp)
-+ continue;
-+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
-+ bitmap_nr = load_block_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ continue;
-+
-+ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
-+
-+ if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data))
-+ ext3_error(sb, __FUNCTION__,
-+ "Superblock in group %d is marked free", i);
-+
-+ desc_blocks = ext3_bg_num_gdb(sb, i);
-+ for (j = 0; j < desc_blocks; j++)
-+ if (!ext3_test_bit(j + 1, bh->b_data))
-+ ext3_error(sb, __FUNCTION__,
-+ "Descriptor block #%ld in group "
-+ "%d is marked free", j, i);
-+
-+ if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
-+ sb, bh->b_data))
-+ ext3_error (sb, "ext3_check_blocks_bitmap",
-+ "Block bitmap for group %d is marked free",
-+ i);
-+
-+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
-+ sb, bh->b_data))
-+ ext3_error (sb, "ext3_check_blocks_bitmap",
-+ "Inode bitmap for group %d is marked free",
-+ i);
-+
-+ for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
-+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
-+ sb, bh->b_data))
-+ ext3_error (sb, "ext3_check_blocks_bitmap",
-+ "Block #%d of the inode table in "
-+ "group %d is marked free", j, i);
-+
-+ x = ext3_count_free (bh, sb->s_blocksize);
-+ if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
-+ ext3_error (sb, "ext3_check_blocks_bitmap",
-+ "Wrong free blocks count for group %d, "
-+ "stored = %d, counted = %lu", i,
-+ le16_to_cpu(gdp->bg_free_blocks_count), x);
-+ bitmap_count += x;
-+ }
-+ if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
-+ ext3_error (sb, "ext3_check_blocks_bitmap",
-+ "Wrong free blocks count in super block, "
-+ "stored = %lu, counted = %lu",
-+ (unsigned long)le32_to_cpu(es->s_free_blocks_count),
-+ bitmap_count);
-+}
-+#endif
-diff -rup --new-file linux.mcp2/fs/ext3/bitmap.c linux_tmp/fs/ext3/bitmap.c
---- linux.mcp2/fs/ext3/bitmap.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/bitmap.c 2001-11-09 14:25:04.000000000 -0800
-@@ -0,0 +1,26 @@
-+/*
-+ * linux/fs/ext3/bitmap.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ */
-+
-+#include <linux/fs.h>
-+
-+
-+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-+
-+unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
-+{
-+ unsigned int i;
-+ unsigned long sum = 0;
-+
-+ if (!map)
-+ return (0);
-+ for (i = 0; i < numchars; i++)
-+ sum += nibblemap[map->b_data[i] & 0xf] +
-+ nibblemap[(map->b_data[i] >> 4) & 0xf];
-+ return (sum);
-+}
-diff -rup --new-file linux.mcp2/fs/ext3/dir.c linux_tmp/fs/ext3/dir.c
---- linux.mcp2/fs/ext3/dir.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800
-@@ -0,0 +1,190 @@
-+/*
-+ * linux/fs/ext3/dir.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/dir.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * ext3 directory handling functions
-+ *
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+
-+static unsigned char ext3_filetype_table[] = {
-+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-+};
-+
-+static int ext3_readdir(struct file *, void *, filldir_t);
-+
-+struct file_operations ext3_dir_operations = {
-+ read: generic_read_dir,
-+ readdir: ext3_readdir, /* BKL held */
-+ ioctl: ext3_ioctl, /* BKL held */
-+ fsync: ext3_sync_file, /* BKL held */
-+};
-+
-+int ext3_check_dir_entry (const char * function, struct inode * dir,
-+ struct ext3_dir_entry_2 * de,
-+ struct buffer_head * bh,
-+ unsigned long offset)
-+{
-+ const char * error_msg = NULL;
-+ const int rlen = le16_to_cpu(de->rec_len);
-+
-+ if (rlen < EXT3_DIR_REC_LEN(1))
-+ error_msg = "rec_len is smaller than minimal";
-+ else if (rlen % 4 != 0)
-+ error_msg = "rec_len % 4 != 0";
-+ else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
-+ error_msg = "rec_len is too small for name_len";
-+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-+ error_msg = "directory entry across blocks";
-+ else if (le32_to_cpu(de->inode) >
-+ le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
-+ error_msg = "inode out of bounds";
-+
-+ if (error_msg != NULL)
-+ ext3_error (dir->i_sb, function,
-+ "bad entry in directory #%lu: %s - "
-+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
-+ dir->i_ino, error_msg, offset,
-+ (unsigned long) le32_to_cpu(de->inode),
-+ rlen, de->name_len);
-+ return error_msg == NULL ? 1 : 0;
-+}
-+
-+static int ext3_readdir(struct file * filp,
-+ void * dirent, filldir_t filldir)
-+{
-+ int error = 0;
-+ unsigned long offset, blk;
-+ int i, num, stored;
-+ struct buffer_head * bh, * tmp, * bha[16];
-+ struct ext3_dir_entry_2 * de;
-+ struct super_block * sb;
-+ int err;
-+ struct inode *inode = filp->f_dentry->d_inode;
-+
-+ sb = inode->i_sb;
-+
-+ stored = 0;
-+ bh = NULL;
-+ offset = filp->f_pos & (sb->s_blocksize - 1);
-+
-+ while (!error && !stored && filp->f_pos < inode->i_size) {
-+ blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
-+ bh = ext3_bread (0, inode, blk, 0, &err);
-+ if (!bh) {
-+ ext3_error (sb, "ext3_readdir",
-+ "directory #%lu contains a hole at offset %lu",
-+ inode->i_ino, (unsigned long)filp->f_pos);
-+ filp->f_pos += sb->s_blocksize - offset;
-+ continue;
-+ }
-+
-+ /*
-+ * Do the readahead
-+ */
-+ if (!offset) {
-+ for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
-+ i > 0; i--) {
-+ tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
-+ if (tmp && !buffer_uptodate(tmp) &&
-+ !buffer_locked(tmp))
-+ bha[num++] = tmp;
-+ else
-+ brelse (tmp);
-+ }
-+ if (num) {
-+ ll_rw_block (READA, num, bha);
-+ for (i = 0; i < num; i++)
-+ brelse (bha[i]);
-+ }
-+ }
-+
-+revalidate:
-+ /* If the dir block has changed since the last call to
-+ * readdir(2), then we might be pointing to an invalid
-+ * dirent right now. Scan from the start of the block
-+ * to make sure. */
-+ if (filp->f_version != inode->i_version) {
-+ for (i = 0; i < sb->s_blocksize && i < offset; ) {
-+ de = (struct ext3_dir_entry_2 *)
-+ (bh->b_data + i);
-+ /* It's too expensive to do a full
-+ * dirent test each time round this
-+ * loop, but we do have to test at
-+ * least that it is non-zero. A
-+ * failure will be detected in the
-+ * dirent test below. */
-+ if (le16_to_cpu(de->rec_len) <
-+ EXT3_DIR_REC_LEN(1))
-+ break;
-+ i += le16_to_cpu(de->rec_len);
-+ }
-+ offset = i;
-+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
-+ | offset;
-+ filp->f_version = inode->i_version;
-+ }
-+
-+ while (!error && filp->f_pos < inode->i_size
-+ && offset < sb->s_blocksize) {
-+ de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
-+ if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
-+ bh, offset)) {
-+ /* On error, skip the f_pos to the
-+ next block. */
-+ filp->f_pos = (filp->f_pos |
-+ (sb->s_blocksize - 1)) + 1;
-+ brelse (bh);
-+ return stored;
-+ }
-+ offset += le16_to_cpu(de->rec_len);
-+ if (le32_to_cpu(de->inode)) {
-+ /* We might block in the next section
-+ * if the data destination is
-+ * currently swapped out. So, use a
-+ * version stamp to detect whether or
-+ * not the directory has been modified
-+ * during the copy operation.
-+ */
-+ unsigned long version = filp->f_version;
-+ unsigned char d_type = DT_UNKNOWN;
-+
-+ if (EXT3_HAS_INCOMPAT_FEATURE(sb,
-+ EXT3_FEATURE_INCOMPAT_FILETYPE)
-+ && de->file_type < EXT3_FT_MAX)
-+ d_type =
-+ ext3_filetype_table[de->file_type];
-+ error = filldir(dirent, de->name,
-+ de->name_len,
-+ filp->f_pos,
-+ le32_to_cpu(de->inode),
-+ d_type);
-+ if (error)
-+ break;
-+ if (version != filp->f_version)
-+ goto revalidate;
-+ stored ++;
-+ }
-+ filp->f_pos += le16_to_cpu(de->rec_len);
-+ }
-+ offset = 0;
-+ brelse (bh);
-+ }
-+ UPDATE_ATIME(inode);
-+ return 0;
-+}
-diff -rup --new-file linux.mcp2/fs/ext3/file.c linux_tmp/fs/ext3/file.c
---- linux.mcp2/fs/ext3/file.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/file.c 2001-11-15 13:37:55.000000000 -0800
-@@ -0,0 +1,94 @@
-+/*
-+ * linux/fs/ext3/file.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/file.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * ext3 fs regular file handling primitives
-+ *
-+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
-+ * (jj@sunsite.ms.mff.cuni.cz)
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/locks.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/smp_lock.h>
-+
-+/*
-+ * Called when an inode is released. Note that this is different
-+ * from ext3_file_open: open gets called at every open, but release
-+ * gets called only when /all/ the files are closed.
-+ */
-+static int ext3_release_file (struct inode * inode, struct file * filp)
-+{
-+ if (filp->f_mode & FMODE_WRITE)
-+ ext3_discard_prealloc (inode);
-+ return 0;
-+}
-+
-+/*
-+ * Called when an inode is about to be opened.
-+ * We use this to disallow opening RW large files on 32bit systems if
-+ * the caller didn't specify O_LARGEFILE. On 64bit systems we force
-+ * on this flag in sys_open.
-+ */
-+static int ext3_open_file (struct inode * inode, struct file * filp)
-+{
-+ if (!(filp->f_flags & O_LARGEFILE) &&
-+ inode->i_size > 0x7FFFFFFFLL)
-+ return -EFBIG;
-+ return 0;
-+}
-+
-+/*
-+ * ext3_file_write().
-+ *
-+ * Most things are done in ext3_prepare_write() and ext3_commit_write().
-+ */
-+
-+static ssize_t
-+ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
-+{
-+ struct inode *inode = file->f_dentry->d_inode;
-+
-+ /*
-+ * Nasty: if the file is subject to synchronous writes then we need
-+ * to force generic_osync_inode() to call ext3_write_inode().
-+ * We do that by marking the inode dirty. This adds much more
-+ * computational expense than we need, but we're going to sync
-+ * anyway.
-+ */
-+ if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
-+ mark_inode_dirty(inode);
-+
-+ return generic_file_write(file, buf, count, ppos);
-+}
-+
-+struct file_operations ext3_file_operations = {
-+ llseek: generic_file_llseek, /* BKL held */
-+ read: generic_file_read, /* BKL not held. Don't need */
-+ write: ext3_file_write, /* BKL not held. Don't need */
-+ ioctl: ext3_ioctl, /* BKL held */
-+ mmap: generic_file_mmap,
-+ open: ext3_open_file, /* BKL not held. Don't need */
-+ release: ext3_release_file, /* BKL not held. Don't need */
-+ fsync: ext3_sync_file, /* BKL held */
-+};
-+
-+struct inode_operations ext3_file_inode_operations = {
-+ truncate: ext3_truncate, /* BKL held */
-+ setattr: ext3_setattr, /* BKL held */
-+};
-+
-diff -rup --new-file linux.mcp2/fs/ext3/fsync.c linux_tmp/fs/ext3/fsync.c
---- linux.mcp2/fs/ext3/fsync.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/fsync.c 2001-11-20 21:34:13.000000000 -0800
-@@ -0,0 +1,70 @@
-+/*
-+ * linux/fs/ext3/fsync.c
-+ *
-+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
-+ * from
-+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ * from
-+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * ext3fs fsync primitive
-+ *
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ *
-+ * Removed unnecessary code duplication for little endian machines
-+ * and excessive __inline__s.
-+ * Andi Kleen, 1997
-+ *
-+ * Major simplications and cleanup - we only need to do the metadata, because
-+ * we can depend on generic_block_fdatasync() to sync the data blocks.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/jbd.h>
-+#include <linux/smp_lock.h>
-+
-+/*
-+ * akpm: A new design for ext3_sync_file().
-+ *
-+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
-+ * There cannot be a transaction open by this task. (AKPM: quotas?)
-+ * Another task could have dirtied this inode. Its data can be in any
-+ * state in the journalling system.
-+ *
-+ * What we do is just kick off a commit and wait on it. This will snapshot the
-+ * inode to disk.
-+ *
-+ * Note that there is a serious optimisation we can make here: if the current
-+ * inode is not part of j_running_transaction or j_committing_transaction
-+ * then we have nothing to do. That would require implementation of t_ilist,
-+ * which isn't too hard.
-+ */
-+
-+int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
-+{
-+ struct inode *inode = dentry->d_inode;
-+ int ret;
-+
-+ J_ASSERT(ext3_journal_current_handle() == 0);
-+
-+ /*
-+ * fsync_inode_buffers() just walks i_dirty_buffers and waits
-+ * on them. It's a no-op for full data journalling because
-+ * i_dirty_buffers will be ampty.
-+ * Really, we only need to start I/O on the dirty buffers -
-+ * we'll end up waiting on them in commit.
-+ */
-+ ret = fsync_inode_buffers(inode);
-+ ret |= fsync_inode_data_buffers(inode);
-+
-+ ext3_force_commit(inode->i_sb);
-+
-+ return ret;
-+}
-diff -rup --new-file linux.mcp2/fs/ext3/ialloc.c linux_tmp/fs/ext3/ialloc.c
---- linux.mcp2/fs/ext3/ialloc.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/ialloc.c 2002-02-25 11:38:08.000000000 -0800
-@@ -0,0 +1,663 @@
-+/*
-+ * linux/fs/ext3/ialloc.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * BSD ufs-inspired inode and directory allocation by
-+ * Stephen Tweedie (sct@redhat.com), 1993
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/stat.h>
-+#include <linux/string.h>
-+#include <linux/locks.h>
-+#include <linux/quotaops.h>
-+
-+#include <asm/bitops.h>
-+#include <asm/byteorder.h>
-+
-+/*
-+ * ialloc.c contains the inodes allocation and deallocation routines
-+ */
-+
-+/*
-+ * The free inodes are managed by bitmaps. A file system contains several
-+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
-+ * block for inodes, N blocks for the inode table and data blocks.
-+ *
-+ * The file system contains group descriptors which are located after the
-+ * super block. Each descriptor contains the number of the bitmap block and
-+ * the free blocks count in the block. The descriptors are loaded in memory
-+ * when a file system is mounted (see ext3_read_super).
-+ */
-+
-+
-+/*
-+ * Read the inode allocation bitmap for a given block_group, reading
-+ * into the specified slot in the superblock's bitmap cache.
-+ *
-+ * Return >=0 on success or a -ve error code.
-+ */
-+static int read_inode_bitmap (struct super_block * sb,
-+ unsigned long block_group,
-+ unsigned int bitmap_nr)
-+{
-+ struct ext3_group_desc * gdp;
-+ struct buffer_head * bh = NULL;
-+ int retval = 0;
-+
-+ gdp = ext3_get_group_desc (sb, block_group, NULL);
-+ if (!gdp) {
-+ retval = -EIO;
-+ goto error_out;
-+ }
-+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_inode_bitmap));
-+ if (!bh) {
-+ ext3_error (sb, "read_inode_bitmap",
-+ "Cannot read inode bitmap - "
-+ "block_group = %lu, inode_bitmap = %lu",
-+ block_group, (unsigned long) gdp->bg_inode_bitmap);
-+ retval = -EIO;
-+ }
-+ /*
-+ * On IO error, just leave a zero in the superblock's block pointer for
-+ * this group. The IO will be retried next time.
-+ */
-+error_out:
-+ sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
-+ sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
-+ return retval;
-+}
-+
-+/*
-+ * load_inode_bitmap loads the inode bitmap for a blocks group
-+ *
-+ * It maintains a cache for the last bitmaps loaded. This cache is managed
-+ * with a LRU algorithm.
-+ *
-+ * Notes:
-+ * 1/ There is one cache per mounted file system.
-+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
-+ * this function reads the bitmap without maintaining a LRU cache.
-+ *
-+ * Return the slot used to store the bitmap, or a -ve error code.
-+ */
-+static int load_inode_bitmap (struct super_block * sb,
-+ unsigned int block_group)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ unsigned long inode_bitmap_number;
-+ struct buffer_head * inode_bitmap;
-+ int i, j, retval = 0;
-+
-+ if (block_group >= sbi->s_groups_count)
-+ ext3_panic (sb, "load_inode_bitmap",
-+ "block_group >= groups_count - "
-+ "block_group = %d, groups_count = %lu",
-+ block_group, sbi->s_groups_count);
-+ if (sbi->s_loaded_inode_bitmaps > 0 &&
-+ sbi->s_inode_bitmap_number[0] == block_group &&
-+ sbi->s_inode_bitmap[0] != NULL)
-+ return 0;
-+ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {
-+ if (sbi->s_inode_bitmap[block_group]) {
-+ if (sbi->s_inode_bitmap_number[block_group] !=
-+ block_group)
-+ ext3_panic(sb, "load_inode_bitmap",
-+ "block_group != inode_bitmap_number");
-+ return block_group;
-+ }
-+ retval = read_inode_bitmap(sb, block_group, block_group);
-+ if (retval < 0)
-+ return retval;
-+ return block_group;
-+ }
-+
-+ for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
-+ sbi->s_inode_bitmap_number[i] != block_group; i++)
-+ /* do nothing */;
-+ if (i < sbi->s_loaded_inode_bitmaps &&
-+ sbi->s_inode_bitmap_number[i] == block_group) {
-+ inode_bitmap_number = sbi->s_inode_bitmap_number[i];
-+ inode_bitmap = sbi->s_inode_bitmap[i];
-+ for (j = i; j > 0; j--) {
-+ sbi->s_inode_bitmap_number[j] =
-+ sbi->s_inode_bitmap_number[j - 1];
-+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
-+ }
-+ sbi->s_inode_bitmap_number[0] = inode_bitmap_number;
-+ sbi->s_inode_bitmap[0] = inode_bitmap;
-+
-+ /*
-+ * There's still one special case here --- if inode_bitmap == 0
-+ * then our last attempt to read the bitmap failed and we have
-+ * just ended up caching that failure. Try again to read it.
-+ */
-+ if (!inode_bitmap)
-+ retval = read_inode_bitmap (sb, block_group, 0);
-+ } else {
-+ if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED)
-+ sbi->s_loaded_inode_bitmaps++;
-+ else
-+ brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
-+ for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) {
-+ sbi->s_inode_bitmap_number[j] =
-+ sbi->s_inode_bitmap_number[j - 1];
-+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
-+ }
-+ retval = read_inode_bitmap (sb, block_group, 0);
-+ }
-+ return retval;
-+}
-+
-+/*
-+ * NOTE! When we get the inode, we're the only people
-+ * that have access to it, and as such there are no
-+ * race conditions we have to worry about. The inode
-+ * is not on the hash-lists, and it cannot be reached
-+ * through the filesystem because the directory entry
-+ * has been deleted earlier.
-+ *
-+ * HOWEVER: we must make sure that we get no aliases,
-+ * which means that we have to call "clear_inode()"
-+ * _before_ we mark the inode not in use in the inode
-+ * bitmaps. Otherwise a newly created file might use
-+ * the same inode number (not actually the same pointer
-+ * though), and then we'd have two inodes sharing the
-+ * same inode number and space on the harddisk.
-+ */
-+void ext3_free_inode (handle_t *handle, struct inode * inode)
-+{
-+ struct super_block * sb = inode->i_sb;
-+ int is_directory;
-+ unsigned long ino;
-+ struct buffer_head * bh;
-+ struct buffer_head * bh2;
-+ unsigned long block_group;
-+ unsigned long bit;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ struct ext3_super_block * es;
-+ int fatal = 0, err;
-+
-+ if (!inode->i_dev) {
-+ printk ("ext3_free_inode: inode has no device\n");
-+ return;
-+ }
-+ if (atomic_read(&inode->i_count) > 1) {
-+ printk ("ext3_free_inode: inode has count=%d\n",
-+ atomic_read(&inode->i_count));
-+ return;
-+ }
-+ if (inode->i_nlink) {
-+ printk ("ext3_free_inode: inode has nlink=%d\n",
-+ inode->i_nlink);
-+ return;
-+ }
-+ if (!sb) {
-+ printk("ext3_free_inode: inode on nonexistent device\n");
-+ return;
-+ }
-+
-+ ino = inode->i_ino;
-+ ext3_debug ("freeing inode %lu\n", ino);
-+
-+ /*
-+ * Note: we must free any quota before locking the superblock,
-+ * as writing the quota to disk may need the lock as well.
-+ */
-+ DQUOT_INIT(inode);
-+ DQUOT_FREE_INODE(inode);
-+ DQUOT_DROP(inode);
-+
-+ is_directory = S_ISDIR(inode->i_mode);
-+
-+ /* Do this BEFORE marking the inode not in use or returning an error */
-+ clear_inode (inode);
-+
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-+ ext3_error (sb, "ext3_free_inode",
-+ "reserved or nonexistent inode %lu", ino);
-+ goto error_return;
-+ }
-+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
-+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
-+ bitmap_nr = load_inode_bitmap (sb, block_group);
-+ if (bitmap_nr < 0)
-+ goto error_return;
-+
-+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
-+
-+ BUFFER_TRACE(bh, "get_write_access");
-+ fatal = ext3_journal_get_write_access(handle, bh);
-+ if (fatal)
-+ goto error_return;
-+
-+ /* Ok, now we can actually update the inode bitmaps.. */
-+ if (!ext3_clear_bit (bit, bh->b_data))
-+ ext3_error (sb, "ext3_free_inode",
-+ "bit already cleared for inode %lu", ino);
-+ else {
-+ gdp = ext3_get_group_desc (sb, block_group, &bh2);
-+
-+ BUFFER_TRACE(bh2, "get_write_access");
-+ fatal = ext3_journal_get_write_access(handle, bh2);
-+ if (fatal) goto error_return;
-+
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
-+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
-+ if (fatal) goto error_return;
-+
-+ if (gdp) {
-+ gdp->bg_free_inodes_count = cpu_to_le16(
-+ le16_to_cpu(gdp->bg_free_inodes_count) + 1);
-+ if (is_directory)
-+ gdp->bg_used_dirs_count = cpu_to_le16(
-+ le16_to_cpu(gdp->bg_used_dirs_count) - 1);
-+ }
-+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (!fatal) fatal = err;
-+ es->s_free_inodes_count =
-+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
-+ "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
-+ if (!fatal) fatal = err;
-+ }
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (!fatal)
-+ fatal = err;
-+ sb->s_dirt = 1;
-+error_return:
-+ ext3_std_error(sb, fatal);
-+ unlock_super(sb);
-+}
-+
-+/*
-+ * There are two policies for allocating an inode. If the new inode is
-+ * a directory, then a forward search is made for a block group with both
-+ * free space and a low directory-to-inode ratio; if that fails, then of
-+ * the groups with above-average free space, that group with the fewest
-+ * directories already is chosen.
-+ *
-+ * For other inodes, search forward from the parent directory's block
-+ * group to find a free inode.
-+ */
-+struct inode * ext3_new_inode (handle_t *handle,
-+ const struct inode * dir, int mode)
-+{
-+ struct super_block * sb;
-+ struct buffer_head * bh;
-+ struct buffer_head * bh2;
-+ int i, j, avefreei;
-+ struct inode * inode;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ struct ext3_group_desc * tmp;
-+ struct ext3_super_block * es;
-+ int err = 0;
-+
-+ /* Cannot create files in a deleted directory */
-+ if (!dir || !dir->i_nlink)
-+ return ERR_PTR(-EPERM);
-+
-+ sb = dir->i_sb;
-+ inode = new_inode(sb);
-+ if (!inode)
-+ return ERR_PTR(-ENOMEM);
-+ init_rwsem(&inode->u.ext3_i.truncate_sem);
-+
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+repeat:
-+ gdp = NULL;
-+ i = 0;
-+
-+ if (S_ISDIR(mode)) {
-+ avefreei = le32_to_cpu(es->s_free_inodes_count) /
-+ sb->u.ext3_sb.s_groups_count;
-+ if (!gdp) {
-+ for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
-+ struct buffer_head *temp_buffer;
-+ tmp = ext3_get_group_desc (sb, j, &temp_buffer);
-+ if (tmp &&
-+ le16_to_cpu(tmp->bg_free_inodes_count) &&
-+ le16_to_cpu(tmp->bg_free_inodes_count) >=
-+ avefreei) {
-+ if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
-+ le16_to_cpu(gdp->bg_free_blocks_count))) {
-+ i = j;
-+ gdp = tmp;
-+ bh2 = temp_buffer;
-+ }
-+ }
-+ }
-+ }
-+ } else {
-+ /*
-+ * Try to place the inode in its parent directory
-+ */
-+ i = dir->u.ext3_i.i_block_group;
-+ tmp = ext3_get_group_desc (sb, i, &bh2);
-+ if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
-+ gdp = tmp;
-+ else
-+ {
-+ /*
-+ * Use a quadratic hash to find a group with a
-+ * free inode
-+ */
-+ for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
-+ i += j;
-+ if (i >= sb->u.ext3_sb.s_groups_count)
-+ i -= sb->u.ext3_sb.s_groups_count;
-+ tmp = ext3_get_group_desc (sb, i, &bh2);
-+ if (tmp &&
-+ le16_to_cpu(tmp->bg_free_inodes_count)) {
-+ gdp = tmp;
-+ break;
-+ }
-+ }
-+ }
-+ if (!gdp) {
-+ /*
-+ * That failed: try linear search for a free inode
-+ */
-+ i = dir->u.ext3_i.i_block_group + 1;
-+ for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
-+ if (++i >= sb->u.ext3_sb.s_groups_count)
-+ i = 0;
-+ tmp = ext3_get_group_desc (sb, i, &bh2);
-+ if (tmp &&
-+ le16_to_cpu(tmp->bg_free_inodes_count)) {
-+ gdp = tmp;
-+ break;
-+ }
-+ }
-+ }
-+ }
-+
-+ err = -ENOSPC;
-+ if (!gdp)
-+ goto fail;
-+
-+ err = -EIO;
-+ bitmap_nr = load_inode_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ goto fail;
-+
-+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
-+
-+ if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
-+ EXT3_INODES_PER_GROUP(sb))) <
-+ EXT3_INODES_PER_GROUP(sb)) {
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err) goto fail;
-+
-+ if (ext3_set_bit (j, bh->b_data)) {
-+ ext3_error (sb, "ext3_new_inode",
-+ "bit already set for inode %d", j);
-+ goto repeat;
-+ }
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err) goto fail;
-+ } else {
-+ if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
-+ ext3_error (sb, "ext3_new_inode",
-+ "Free inodes count corrupted in group %d",
-+ i);
-+ /* Is it really ENOSPC? */
-+ err = -ENOSPC;
-+ if (sb->s_flags & MS_RDONLY)
-+ goto fail;
-+
-+ BUFFER_TRACE(bh2, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh2);
-+ if (err) goto fail;
-+ gdp->bg_free_inodes_count = 0;
-+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (err) goto fail;
-+ }
-+ goto repeat;
-+ }
-+ j += i * EXT3_INODES_PER_GROUP(sb) + 1;
-+ if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
-+ ext3_error (sb, "ext3_new_inode",
-+ "reserved inode or inode > inodes count - "
-+ "block_group = %d,inode=%d", i, j);
-+ err = -EIO;
-+ goto fail;
-+ }
-+
-+ BUFFER_TRACE(bh2, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh2);
-+ if (err) goto fail;
-+ gdp->bg_free_inodes_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
-+ if (S_ISDIR(mode))
-+ gdp->bg_used_dirs_count =
-+ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
-+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (err) goto fail;
-+
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
-+ if (err) goto fail;
-+ es->s_free_inodes_count =
-+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
-+ sb->s_dirt = 1;
-+ if (err) goto fail;
-+
-+ inode->i_uid = current->fsuid;
-+ if (test_opt (sb, GRPID))
-+ inode->i_gid = dir->i_gid;
-+ else if (dir->i_mode & S_ISGID) {
-+ inode->i_gid = dir->i_gid;
-+ if (S_ISDIR(mode))
-+ mode |= S_ISGID;
-+ } else
-+ inode->i_gid = current->fsgid;
-+ inode->i_mode = mode;
-+
-+ inode->i_ino = j;
-+ /* This is the optimal IO size (for stat), not the fs block size */
-+ inode->i_blksize = PAGE_SIZE;
-+ inode->i_blocks = 0;
-+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-+ inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
-+ if (S_ISLNK(mode))
-+ inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
-+#ifdef EXT3_FRAGMENTS
-+ inode->u.ext3_i.i_faddr = 0;
-+ inode->u.ext3_i.i_frag_no = 0;
-+ inode->u.ext3_i.i_frag_size = 0;
-+#endif
-+ inode->u.ext3_i.i_file_acl = 0;
-+ inode->u.ext3_i.i_dir_acl = 0;
-+ inode->u.ext3_i.i_dtime = 0;
-+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-+#ifdef EXT3_PREALLOCATE
-+ inode->u.ext3_i.i_prealloc_count = 0;
-+#endif
-+ inode->u.ext3_i.i_block_group = i;
-+
-+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
-+ inode->i_flags |= S_SYNC;
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+ insert_inode_hash(inode);
-+ inode->i_generation = sb->u.ext3_sb.s_next_generation++;
-+
-+ inode->u.ext3_i.i_state = EXT3_STATE_NEW;
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ if (err) goto fail;
-+
-+ unlock_super (sb);
-+ if(DQUOT_ALLOC_INODE(inode)) {
-+ DQUOT_DROP(inode);
-+ inode->i_flags |= S_NOQUOTA;
-+ inode->i_nlink = 0;
-+ iput(inode);
-+ return ERR_PTR(-EDQUOT);
-+ }
-+ ext3_debug ("allocating inode %lu\n", inode->i_ino);
-+ return inode;
-+
-+fail:
-+ unlock_super(sb);
-+ iput(inode);
-+ ext3_std_error(sb, err);
-+ return ERR_PTR(err);
-+}
-+
-+/* Verify that we are loading a valid orphan from disk */
-+struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
-+{
-+ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
-+ unsigned long block_group;
-+ int bit;
-+ int bitmap_nr;
-+ struct buffer_head *bh;
-+ struct inode *inode = NULL;
-+
-+ /* Error cases - e2fsck has already cleaned up for us */
-+ if (ino > max_ino) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "bad orphan ino %ld! e2fsck was run?\n", ino);
-+ return NULL;
-+ }
-+
-+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
-+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
-+ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
-+ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "inode bitmap error for orphan %ld\n", ino);
-+ return NULL;
-+ }
-+
-+ /* Having the inode bit set should be a 100% indicator that this
-+ * is a valid orphan (no e2fsck run on fs). Orphans also include
-+ * inodes that were being truncated, so we can't check i_nlink==0.
-+ */
-+ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
-+ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "bad orphan inode %ld! e2fsck was run?\n", ino);
-+ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
-+ bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
-+ printk(KERN_NOTICE "inode=%p\n", inode);
-+ if (inode) {
-+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
-+ is_bad_inode(inode));
-+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
-+ NEXT_ORPHAN(inode));
-+ printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
-+ }
-+ /* Avoid freeing blocks if we got a bad deleted inode */
-+ if (inode && inode->i_nlink == 0)
-+ inode->i_blocks = 0;
-+ iput(inode);
-+ return NULL;
-+ }
-+
-+ return inode;
-+}
-+
-+unsigned long ext3_count_free_inodes (struct super_block * sb)
-+{
-+#ifdef EXT3FS_DEBUG
-+ struct ext3_super_block * es;
-+ unsigned long desc_count, bitmap_count, x;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ int i;
-+
-+ lock_super (sb);
-+ es = sb->u.ext3_sb.s_es;
-+ desc_count = 0;
-+ bitmap_count = 0;
-+ gdp = NULL;
-+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
-+ gdp = ext3_get_group_desc (sb, i, NULL);
-+ if (!gdp)
-+ continue;
-+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
-+ bitmap_nr = load_inode_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ continue;
-+
-+ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
-+ EXT3_INODES_PER_GROUP(sb) / 8);
-+ printk ("group %d: stored = %d, counted = %lu\n",
-+ i, le16_to_cpu(gdp->bg_free_inodes_count), x);
-+ bitmap_count += x;
-+ }
-+ printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
-+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
-+ unlock_super (sb);
-+ return desc_count;
-+#else
-+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
-+#endif
-+}
-+
-+#ifdef CONFIG_EXT3_CHECK
-+/* Called at mount-time, super-block is locked */
-+void ext3_check_inodes_bitmap (struct super_block * sb)
-+{
-+ struct ext3_super_block * es;
-+ unsigned long desc_count, bitmap_count, x;
-+ int bitmap_nr;
-+ struct ext3_group_desc * gdp;
-+ int i;
-+
-+ es = sb->u.ext3_sb.s_es;
-+ desc_count = 0;
-+ bitmap_count = 0;
-+ gdp = NULL;
-+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
-+ gdp = ext3_get_group_desc (sb, i, NULL);
-+ if (!gdp)
-+ continue;
-+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
-+ bitmap_nr = load_inode_bitmap (sb, i);
-+ if (bitmap_nr < 0)
-+ continue;
-+
-+ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
-+ EXT3_INODES_PER_GROUP(sb) / 8);
-+ if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
-+ ext3_error (sb, "ext3_check_inodes_bitmap",
-+ "Wrong free inodes count in group %d, "
-+ "stored = %d, counted = %lu", i,
-+ le16_to_cpu(gdp->bg_free_inodes_count), x);
-+ bitmap_count += x;
-+ }
-+ if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
-+ ext3_error (sb, "ext3_check_inodes_bitmap",
-+ "Wrong free inodes count in super block, "
-+ "stored = %lu, counted = %lu",
-+ (unsigned long)le32_to_cpu(es->s_free_inodes_count),
-+ bitmap_count);
-+}
-+#endif
-diff -rup --new-file linux.mcp2/fs/ext3/inode.c linux_tmp/fs/ext3/inode.c
---- linux.mcp2/fs/ext3/inode.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/inode.c 2002-08-02 17:39:45.000000000 -0700
-@@ -0,0 +1,2699 @@
-+/*
-+ * linux/fs/ext3/inode.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/inode.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * Goal-directed block allocation by Stephen Tweedie
-+ * (sct@redhat.com), 1993, 1998
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
-+ * (jj@sunsite.ms.mff.cuni.cz)
-+ *
-+ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/sched.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/jbd.h>
-+#include <linux/locks.h>
-+#include <linux/smp_lock.h>
-+#include <linux/highuid.h>
-+#include <linux/quotaops.h>
-+#include <linux/module.h>
-+
-+/*
-+ * SEARCH_FROM_ZERO forces each block allocation to search from the start
-+ * of the filesystem. This is to force rapid reallocation of recently-freed
-+ * blocks. The file fragmentation is horrendous.
-+ */
-+#undef SEARCH_FROM_ZERO
-+
-+/* The ext3 forget function must perform a revoke if we are freeing data
-+ * which has been journaled. Metadata (eg. indirect blocks) must be
-+ * revoked in all cases.
-+ *
-+ * "bh" may be NULL: a metadata block may have been freed from memory
-+ * but there may still be a record of it in the journal, and that record
-+ * still needs to be revoked.
-+ */
-+
-+static int ext3_forget(handle_t *handle, int is_metadata,
-+ struct inode *inode, struct buffer_head *bh,
-+ int blocknr)
-+{
-+ int err;
-+
-+ BUFFER_TRACE(bh, "enter");
-+
-+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-+ "data mode %lx\n",
-+ bh, is_metadata, inode->i_mode,
-+ test_opt(inode->i_sb, DATA_FLAGS));
-+
-+ /* Never use the revoke function if we are doing full data
-+ * journaling: there is no need to, and a V1 superblock won't
-+ * support it. Otherwise, only skip the revoke on un-journaled
-+ * data blocks. */
-+
-+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
-+ (!is_metadata && !ext3_should_journal_data(inode))) {
-+ if (bh) {
-+ BUFFER_TRACE(bh, "call journal_forget");
-+ ext3_journal_forget(handle, bh);
-+ }
-+ return 0;
-+ }
-+
-+ /*
-+ * data!=journal && (is_metadata || should_journal_data(inode))
-+ */
-+ BUFFER_TRACE(bh, "call ext3_journal_revoke");
-+ err = ext3_journal_revoke(handle, blocknr, bh);
-+ if (err)
-+ ext3_abort(inode->i_sb, __FUNCTION__,
-+ "error %d when attempting revoke", err);
-+ BUFFER_TRACE(bh, "exit");
-+ return err;
-+}
-+
-+/*
-+ * Truncate transactions can be complex and absolutely huge. So we need to
-+ * be able to restart the transaction at a conventient checkpoint to make
-+ * sure we don't overflow the journal.
-+ *
-+ * start_transaction gets us a new handle for a truncate transaction,
-+ * and extend_transaction tries to extend the existing one a bit. If
-+ * extend fails, we need to propagate the failure up and restart the
-+ * transaction in the top-level truncate loop. --sct
-+ */
-+
-+static handle_t *start_transaction(struct inode *inode)
-+{
-+ long needed;
-+ handle_t *result;
-+
-+ needed = inode->i_blocks;
-+ if (needed > EXT3_MAX_TRANS_DATA)
-+ needed = EXT3_MAX_TRANS_DATA;
-+
-+ result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
-+ if (!IS_ERR(result))
-+ return result;
-+
-+ ext3_std_error(inode->i_sb, PTR_ERR(result));
-+ return result;
-+}
-+
-+/*
-+ * Try to extend this transaction for the purposes of truncation.
-+ *
-+ * Returns 0 if we managed to create more room. If we can't create more
-+ * room, and the transaction must be restarted we return 1.
-+ */
-+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
-+{
-+ long needed;
-+
-+ if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
-+ return 0;
-+ needed = inode->i_blocks;
-+ if (needed > EXT3_MAX_TRANS_DATA)
-+ needed = EXT3_MAX_TRANS_DATA;
-+ if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
-+ return 0;
-+ return 1;
-+}
-+
-+/*
-+ * Restart the transaction associated with *handle. This does a commit,
-+ * so before we call here everything must be consistently dirtied against
-+ * this transaction.
-+ */
-+static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
-+{
-+ long needed = inode->i_blocks;
-+ if (needed > EXT3_MAX_TRANS_DATA)
-+ needed = EXT3_MAX_TRANS_DATA;
-+ jbd_debug(2, "restarting handle %p\n", handle);
-+ return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
-+}
-+
-+/*
-+ * Called at each iput()
-+ */
-+void ext3_put_inode (struct inode * inode)
-+{
-+ ext3_discard_prealloc (inode);
-+}
-+
-+/*
-+ * Called at the last iput() if i_nlink is zero.
-+ */
-+void ext3_delete_inode (struct inode * inode)
-+{
-+ handle_t *handle;
-+
-+ if (is_bad_inode(inode) ||
-+ inode->i_ino == EXT3_ACL_IDX_INO ||
-+ inode->i_ino == EXT3_ACL_DATA_INO)
-+ goto no_delete;
-+
-+ lock_kernel();
-+ handle = start_transaction(inode);
-+ if (IS_ERR(handle)) {
-+ /* If we're going to skip the normal cleanup, we still
-+ * need to make sure that the in-core orphan linked list
-+ * is properly cleaned up. */
-+ ext3_orphan_del(NULL, inode);
-+
-+ ext3_std_error(inode->i_sb, PTR_ERR(handle));
-+ unlock_kernel();
-+ goto no_delete;
-+ }
-+
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+ inode->i_size = 0;
-+ if (inode->i_blocks)
-+ ext3_truncate(inode);
-+ /*
-+ * Kill off the orphan record which ext3_truncate created.
-+ * AKPM: I think this can be inside the above `if'.
-+ * Note that ext3_orphan_del() has to be able to cope with the
-+ * deletion of a non-existent orphan - this is because we don't
-+ * know if ext3_truncate() actually created an orphan record.
-+ * (Well, we could do this if we need to, but heck - it works)
-+ */
-+ ext3_orphan_del(handle, inode);
-+ inode->u.ext3_i.i_dtime = CURRENT_TIME;
-+
-+ /*
-+ * One subtle ordering requirement: if anything has gone wrong
-+ * (transaction abort, IO errors, whatever), then we can still
-+ * do these next steps (the fs will already have been marked as
-+ * having errors), but we can't free the inode if the mark_dirty
-+ * fails.
-+ */
-+ if (ext3_mark_inode_dirty(handle, inode))
-+ /* If that failed, just do the required in-core inode clear. */
-+ clear_inode(inode);
-+ else
-+ ext3_free_inode(handle, inode);
-+ ext3_journal_stop(handle, inode);
-+ unlock_kernel();
-+ return;
-+no_delete:
-+ clear_inode(inode); /* We must guarantee clearing of inode... */
-+}
-+
-+void ext3_discard_prealloc (struct inode * inode)
-+{
-+#ifdef EXT3_PREALLOCATE
-+ lock_kernel();
-+ /* Writer: ->i_prealloc* */
-+ if (inode->u.ext3_i.i_prealloc_count) {
-+ unsigned short total = inode->u.ext3_i.i_prealloc_count;
-+ unsigned long block = inode->u.ext3_i.i_prealloc_block;
-+ inode->u.ext3_i.i_prealloc_count = 0;
-+ inode->u.ext3_i.i_prealloc_block = 0;
-+ /* Writer: end */
-+ ext3_free_blocks (inode, block, total);
-+ }
-+ unlock_kernel();
-+#endif
-+}
-+
-+static int ext3_alloc_block (handle_t *handle,
-+ struct inode * inode, unsigned long goal, int *err)
-+{
-+#ifdef EXT3FS_DEBUG
-+ static unsigned long alloc_hits = 0, alloc_attempts = 0;
-+#endif
-+ unsigned long result;
-+
-+#ifdef EXT3_PREALLOCATE
-+ /* Writer: ->i_prealloc* */
-+ if (inode->u.ext3_i.i_prealloc_count &&
-+ (goal == inode->u.ext3_i.i_prealloc_block ||
-+ goal + 1 == inode->u.ext3_i.i_prealloc_block))
-+ {
-+ result = inode->u.ext3_i.i_prealloc_block++;
-+ inode->u.ext3_i.i_prealloc_count--;
-+ /* Writer: end */
-+ ext3_debug ("preallocation hit (%lu/%lu).\n",
-+ ++alloc_hits, ++alloc_attempts);
-+ } else {
-+ ext3_discard_prealloc (inode);
-+ ext3_debug ("preallocation miss (%lu/%lu).\n",
-+ alloc_hits, ++alloc_attempts);
-+ if (S_ISREG(inode->i_mode))
-+ result = ext3_new_block (inode, goal,
-+ &inode->u.ext3_i.i_prealloc_count,
-+ &inode->u.ext3_i.i_prealloc_block, err);
-+ else
-+ result = ext3_new_block (inode, goal, 0, 0, err);
-+ /*
-+ * AKPM: this is somewhat sticky. I'm not surprised it was
-+ * disabled in 2.2's ext3. Need to integrate b_committed_data
-+ * guarding with preallocation, if indeed preallocation is
-+ * effective.
-+ */
-+ }
-+#else
-+ result = ext3_new_block (handle, inode, goal, 0, 0, err);
-+#endif
-+ return result;
-+}
-+
-+
-+typedef struct {
-+ u32 *p;
-+ u32 key;
-+ struct buffer_head *bh;
-+} Indirect;
-+
-+static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
-+{
-+ p->key = *(p->p = v);
-+ p->bh = bh;
-+}
-+
-+static inline int verify_chain(Indirect *from, Indirect *to)
-+{
-+ while (from <= to && from->key == *from->p)
-+ from++;
-+ return (from > to);
-+}
-+
-+/**
-+ * ext3_block_to_path - parse the block number into array of offsets
-+ * @inode: inode in question (we are only interested in its superblock)
-+ * @i_block: block number to be parsed
-+ * @offsets: array to store the offsets in
-+ *
-+ * To store the locations of file's data ext3 uses a data structure common
-+ * for UNIX filesystems - tree of pointers anchored in the inode, with
-+ * data blocks at leaves and indirect blocks in intermediate nodes.
-+ * This function translates the block number into path in that tree -
-+ * return value is the path length and @offsets[n] is the offset of
-+ * pointer to (n+1)th node in the nth one. If @block is out of range
-+ * (negative or too large) warning is printed and zero returned.
-+ *
-+ * Note: function doesn't find node addresses, so no IO is needed. All
-+ * we need to know is the capacity of indirect blocks (taken from the
-+ * inode->i_sb).
-+ */
-+
-+/*
-+ * Portability note: the last comparison (check that we fit into triple
-+ * indirect block) is spelled differently, because otherwise on an
-+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
-+ * if our filesystem had 8Kb blocks. We might use long long, but that would
-+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
-+ * i_block would have to be negative in the very beginning, so we would not
-+ * get there at all.
-+ */
-+
-+static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
-+{
-+ int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-+ int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
-+ const long direct_blocks = EXT3_NDIR_BLOCKS,
-+ indirect_blocks = ptrs,
-+ double_blocks = (1 << (ptrs_bits * 2));
-+ int n = 0;
-+
-+ if (i_block < 0) {
-+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
-+ } else if (i_block < direct_blocks) {
-+ offsets[n++] = i_block;
-+ } else if ( (i_block -= direct_blocks) < indirect_blocks) {
-+ offsets[n++] = EXT3_IND_BLOCK;
-+ offsets[n++] = i_block;
-+ } else if ((i_block -= indirect_blocks) < double_blocks) {
-+ offsets[n++] = EXT3_DIND_BLOCK;
-+ offsets[n++] = i_block >> ptrs_bits;
-+ offsets[n++] = i_block & (ptrs - 1);
-+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
-+ offsets[n++] = EXT3_TIND_BLOCK;
-+ offsets[n++] = i_block >> (ptrs_bits * 2);
-+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
-+ offsets[n++] = i_block & (ptrs - 1);
-+ } else {
-+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
-+ }
-+ return n;
-+}
-+
-+/**
-+ * ext3_get_branch - read the chain of indirect blocks leading to data
-+ * @inode: inode in question
-+ * @depth: depth of the chain (1 - direct pointer, etc.)
-+ * @offsets: offsets of pointers in inode/indirect blocks
-+ * @chain: place to store the result
-+ * @err: here we store the error value
-+ *
-+ * Function fills the array of triples <key, p, bh> and returns %NULL
-+ * if everything went OK or the pointer to the last filled triple
-+ * (incomplete one) otherwise. Upon the return chain[i].key contains
-+ * the number of (i+1)-th block in the chain (as it is stored in memory,
-+ * i.e. little-endian 32-bit), chain[i].p contains the address of that
-+ * number (it points into struct inode for i==0 and into the bh->b_data
-+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
-+ * block for i>0 and NULL for i==0. In other words, it holds the block
-+ * numbers of the chain, addresses they were taken from (and where we can
-+ * verify that chain did not change) and buffer_heads hosting these
-+ * numbers.
-+ *
-+ * Function stops when it stumbles upon zero pointer (absent block)
-+ * (pointer to last triple returned, *@err == 0)
-+ * or when it gets an IO error reading an indirect block
-+ * (ditto, *@err == -EIO)
-+ * or when it notices that chain had been changed while it was reading
-+ * (ditto, *@err == -EAGAIN)
-+ * or when it reads all @depth-1 indirect blocks successfully and finds
-+ * the whole chain, all way to the data (returns %NULL, *err == 0).
-+ */
-+static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
-+ Indirect chain[4], int *err)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ Indirect *p = chain;
-+ struct buffer_head *bh;
-+
-+ *err = 0;
-+ /* i_data is not going away, no lock needed */
-+ add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
-+ if (!p->key)
-+ goto no_block;
-+ while (--depth) {
-+ bh = sb_bread(sb, le32_to_cpu(p->key));
-+ if (!bh)
-+ goto failure;
-+ /* Reader: pointers */
-+ if (!verify_chain(chain, p))
-+ goto changed;
-+ add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
-+ /* Reader: end */
-+ if (!p->key)
-+ goto no_block;
-+ }
-+ return NULL;
-+
-+changed:
-+ *err = -EAGAIN;
-+ goto no_block;
-+failure:
-+ *err = -EIO;
-+no_block:
-+ return p;
-+}
-+
-+/**
-+ * ext3_find_near - find a place for allocation with sufficient locality
-+ * @inode: owner
-+ * @ind: descriptor of indirect block.
-+ *
-+ * This function returns the prefered place for block allocation.
-+ * It is used when heuristic for sequential allocation fails.
-+ * Rules are:
-+ * + if there is a block to the left of our position - allocate near it.
-+ * + if pointer will live in indirect block - allocate near that block.
-+ * + if pointer will live in inode - allocate in the same
-+ * cylinder group.
-+ * Caller must make sure that @ind is valid and will stay that way.
-+ */
-+
-+static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
-+{
-+ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
-+ u32 *p;
-+
-+ /* Try to find previous block */
-+ for (p = ind->p - 1; p >= start; p--)
-+ if (*p)
-+ return le32_to_cpu(*p);
-+
-+ /* No such thing, so let's try location of indirect block */
-+ if (ind->bh)
-+ return ind->bh->b_blocknr;
-+
-+ /*
-+ * It is going to be refered from inode itself? OK, just put it into
-+ * the same cylinder group then.
-+ */
-+ return (inode->u.ext3_i.i_block_group *
-+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-+ le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
-+}
-+
-+/**
-+ * ext3_find_goal - find a prefered place for allocation.
-+ * @inode: owner
-+ * @block: block we want
-+ * @chain: chain of indirect blocks
-+ * @partial: pointer to the last triple within a chain
-+ * @goal: place to store the result.
-+ *
-+ * Normally this function find the prefered place for block allocation,
-+ * stores it in *@goal and returns zero. If the branch had been changed
-+ * under us we return -EAGAIN.
-+ */
-+
-+static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
-+ Indirect *partial, unsigned long *goal)
-+{
-+ /* Writer: ->i_next_alloc* */
-+ if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
-+ inode->u.ext3_i.i_next_alloc_block++;
-+ inode->u.ext3_i.i_next_alloc_goal++;
-+ }
-+#ifdef SEARCH_FROM_ZERO
-+ inode->u.ext3_i.i_next_alloc_block = 0;
-+ inode->u.ext3_i.i_next_alloc_goal = 0;
-+#endif
-+ /* Writer: end */
-+ /* Reader: pointers, ->i_next_alloc* */
-+ if (verify_chain(chain, partial)) {
-+ /*
-+ * try the heuristic for sequential allocation,
-+ * failing that at least try to get decent locality.
-+ */
-+ if (block == inode->u.ext3_i.i_next_alloc_block)
-+ *goal = inode->u.ext3_i.i_next_alloc_goal;
-+ if (!*goal)
-+ *goal = ext3_find_near(inode, partial);
-+#ifdef SEARCH_FROM_ZERO
-+ *goal = 0;
-+#endif
-+ return 0;
-+ }
-+ /* Reader: end */
-+ return -EAGAIN;
-+}
-+
-+/**
-+ * ext3_alloc_branch - allocate and set up a chain of blocks.
-+ * @inode: owner
-+ * @num: depth of the chain (number of blocks to allocate)
-+ * @offsets: offsets (in the blocks) to store the pointers to next.
-+ * @branch: place to store the chain in.
-+ *
-+ * This function allocates @num blocks, zeroes out all but the last one,
-+ * links them into chain and (if we are synchronous) writes them to disk.
-+ * In other words, it prepares a branch that can be spliced onto the
-+ * inode. It stores the information about that chain in the branch[], in
-+ * the same format as ext3_get_branch() would do. We are calling it after
-+ * we had read the existing part of chain and partial points to the last
-+ * triple of that (one with zero ->key). Upon the exit we have the same
-+ * picture as after the successful ext3_get_block(), excpet that in one
-+ * place chain is disconnected - *branch->p is still zero (we did not
-+ * set the last link), but branch->key contains the number that should
-+ * be placed into *branch->p to fill that gap.
-+ *
-+ * If allocation fails we free all blocks we've allocated (and forget
-+ * their buffer_heads) and return the error value the from failed
-+ * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
-+ * as described above and return 0.
-+ */
-+
-+static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-+ int num,
-+ unsigned long goal,
-+ int *offsets,
-+ Indirect *branch)
-+{
-+ int blocksize = inode->i_sb->s_blocksize;
-+ int n = 0, keys = 0;
-+ int err = 0;
-+ int i;
-+ int parent = ext3_alloc_block(handle, inode, goal, &err);
-+
-+ branch[0].key = cpu_to_le32(parent);
-+ if (parent) {
-+ for (n = 1; n < num; n++) {
-+ struct buffer_head *bh;
-+ /* Allocate the next block */
-+ int nr = ext3_alloc_block(handle, inode, parent, &err);
-+ if (!nr)
-+ break;
-+ branch[n].key = cpu_to_le32(nr);
-+ keys = n+1;
-+
-+ /*
-+ * Get buffer_head for parent block, zero it out
-+ * and set the pointer to new one, then send
-+ * parent to disk.
-+ */
-+ bh = sb_getblk(inode->i_sb, parent);
-+ branch[n].bh = bh;
-+ lock_buffer(bh);
-+ BUFFER_TRACE(bh, "call get_create_access");
-+ err = ext3_journal_get_create_access(handle, bh);
-+ if (err) {
-+ unlock_buffer(bh);
-+ brelse(bh);
-+ break;
-+ }
-+
-+ memset(bh->b_data, 0, blocksize);
-+ branch[n].p = (u32*) bh->b_data + offsets[n];
-+ *branch[n].p = branch[n].key;
-+ BUFFER_TRACE(bh, "marking uptodate");
-+ mark_buffer_uptodate(bh, 1);
-+ unlock_buffer(bh);
-+
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ break;
-+
-+ parent = nr;
-+ }
-+ }
-+ if (n == num)
-+ return 0;
-+
-+ /* Allocation failed, free what we already allocated */
-+ for (i = 1; i < keys; i++) {
-+ BUFFER_TRACE(branch[i].bh, "call journal_forget");
-+ ext3_journal_forget(handle, branch[i].bh);
-+ }
-+ for (i = 0; i < keys; i++)
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ return err;
-+}
-+
-+/**
-+ * ext3_splice_branch - splice the allocated branch onto inode.
-+ * @inode: owner
-+ * @block: (logical) number of block we are adding
-+ * @chain: chain of indirect blocks (with a missing link - see
-+ * ext3_alloc_branch)
-+ * @where: location of missing link
-+ * @num: number of blocks we are adding
-+ *
-+ * This function verifies that chain (up to the missing link) had not
-+ * changed, fills the missing link and does all housekeeping needed in
-+ * inode (->i_blocks, etc.). In case of success we end up with the full
-+ * chain to new block and return 0. Otherwise (== chain had been changed)
-+ * we free the new blocks (forgetting their buffer_heads, indeed) and
-+ * return -EAGAIN.
-+ */
-+
-+static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
-+ Indirect chain[4], Indirect *where, int num)
-+{
-+ int i;
-+ int err = 0;
-+
-+ /*
-+ * If we're splicing into a [td]indirect block (as opposed to the
-+ * inode) then we need to get write access to the [td]indirect block
-+ * before the splice.
-+ */
-+ if (where->bh) {
-+ BUFFER_TRACE(where->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, where->bh);
-+ if (err)
-+ goto err_out;
-+ }
-+ /* Verify that place we are splicing to is still there and vacant */
-+
-+ /* Writer: pointers, ->i_next_alloc* */
-+ if (!verify_chain(chain, where-1) || *where->p)
-+ /* Writer: end */
-+ goto changed;
-+
-+ /* That's it */
-+
-+ *where->p = where->key;
-+ inode->u.ext3_i.i_next_alloc_block = block;
-+ inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
-+#ifdef SEARCH_FROM_ZERO
-+ inode->u.ext3_i.i_next_alloc_block = 0;
-+ inode->u.ext3_i.i_next_alloc_goal = 0;
-+#endif
-+ /* Writer: end */
-+
-+ /* We are done with atomic stuff, now do the rest of housekeeping */
-+
-+ inode->i_ctime = CURRENT_TIME;
-+ ext3_mark_inode_dirty(handle, inode);
-+
-+ /* had we spliced it onto indirect block? */
-+ if (where->bh) {
-+ /*
-+ * akpm: If we spliced it onto an indirect block, we haven't
-+ * altered the inode. Note however that if it is being spliced
-+ * onto an indirect block at the very end of the file (the
-+ * file is growing) then we *will* alter the inode to reflect
-+ * the new i_size. But that is not done here - it is done in
-+ * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
-+ */
-+ jbd_debug(5, "splicing indirect only\n");
-+ BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, where->bh);
-+ if (err)
-+ goto err_out;
-+ } else {
-+ /*
-+ * OK, we spliced it into the inode itself on a direct block.
-+ * Inode was dirtied above.
-+ */
-+ jbd_debug(5, "splicing direct\n");
-+ }
-+ return err;
-+
-+changed:
-+ /*
-+ * AKPM: if where[i].bh isn't part of the current updating
-+ * transaction then we explode nastily. Test this code path.
-+ */
-+ jbd_debug(1, "the chain changed: try again\n");
-+ err = -EAGAIN;
-+
-+err_out:
-+ for (i = 1; i < num; i++) {
-+ BUFFER_TRACE(where[i].bh, "call journal_forget");
-+ ext3_journal_forget(handle, where[i].bh);
-+ }
-+ /* For the normal collision cleanup case, we free up the blocks.
-+ * On genuine filesystem errors we don't even think about doing
-+ * that. */
-+ if (err == -EAGAIN)
-+ for (i = 0; i < num; i++)
-+ ext3_free_blocks(handle, inode,
-+ le32_to_cpu(where[i].key), 1);
-+ return err;
-+}
-+
-+/*
-+ * Allocation strategy is simple: if we have to allocate something, we will
-+ * have to go the whole way to leaf. So let's do it before attaching anything
-+ * to tree, set linkage between the newborn blocks, write them if sync is
-+ * required, recheck the path, free and repeat if check fails, otherwise
-+ * set the last missing link (that will protect us from any truncate-generated
-+ * removals - all blocks on the path are immune now) and possibly force the
-+ * write on the parent block.
-+ * That has a nice additional property: no special recovery from the failed
-+ * allocations is needed - we simply release blocks and do not touch anything
-+ * reachable from inode.
-+ *
-+ * akpm: `handle' can be NULL if create == 0.
-+ *
-+ * The BKL may not be held on entry here. Be sure to take it early.
-+ */
-+
-+static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
-+ long iblock,
-+ struct buffer_head *bh_result, int create)
-+{
-+ int err = -EIO;
-+ int offsets[4];
-+ Indirect chain[4];
-+ Indirect *partial;
-+ unsigned long goal;
-+ int left;
-+ int depth = ext3_block_to_path(inode, iblock, offsets);
-+ loff_t new_size;
-+
-+ J_ASSERT(handle != NULL || create == 0);
-+
-+ if (depth == 0)
-+ goto out;
-+
-+ lock_kernel();
-+reread:
-+ partial = ext3_get_branch(inode, depth, offsets, chain, &err);
-+
-+ /* Simplest case - block found, no allocation needed */
-+ if (!partial) {
-+ bh_result->b_state &= ~(1UL << BH_New);
-+got_it:
-+ bh_result->b_dev = inode->i_dev;
-+ bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
-+ bh_result->b_state |= (1UL << BH_Mapped);
-+ /* Clean up and exit */
-+ partial = chain+depth-1; /* the whole chain */
-+ goto cleanup;
-+ }
-+
-+ /* Next simple case - plain lookup or failed read of indirect block */
-+ if (!create || err == -EIO) {
-+cleanup:
-+ while (partial > chain) {
-+ BUFFER_TRACE(partial->bh, "call brelse");
-+ brelse(partial->bh);
-+ partial--;
-+ }
-+ BUFFER_TRACE(bh_result, "returned");
-+ unlock_kernel();
-+out:
-+ return err;
-+ }
-+
-+ /*
-+ * Indirect block might be removed by truncate while we were
-+ * reading it. Handling of that case (forget what we've got and
-+ * reread) is taken out of the main path.
-+ */
-+ if (err == -EAGAIN)
-+ goto changed;
-+
-+ if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
-+ goto changed;
-+
-+ left = (chain + depth) - partial;
-+
-+ /*
-+ * Block out ext3_truncate while we alter the tree
-+ */
-+ down_read(&inode->u.ext3_i.truncate_sem);
-+ err = ext3_alloc_branch(handle, inode, left, goal,
-+ offsets+(partial-chain), partial);
-+
-+ /* The ext3_splice_branch call will free and forget any buffers
-+ * on the new chain if there is a failure, but that risks using
-+ * up transaction credits, especially for bitmaps where the
-+ * credits cannot be returned. Can we handle this somehow? We
-+ * may need to return -EAGAIN upwards in the worst case. --sct */
-+ if (!err)
-+ err = ext3_splice_branch(handle, inode, iblock, chain,
-+ partial, left);
-+ up_read(&inode->u.ext3_i.truncate_sem);
-+ if (err == -EAGAIN)
-+ goto changed;
-+ if (err)
-+ goto cleanup;
-+
-+ new_size = inode->i_size;
-+ /*
-+ * This is not racy against ext3_truncate's modification of i_disksize
-+ * because VM/VFS ensures that the file cannot be extended while
-+ * truncate is in progress. It is racy between multiple parallel
-+ * instances of get_block, but we have the BKL.
-+ */
-+ if (new_size > inode->u.ext3_i.i_disksize)
-+ inode->u.ext3_i.i_disksize = new_size;
-+
-+ bh_result->b_state |= (1UL << BH_New);
-+ goto got_it;
-+
-+changed:
-+ while (partial > chain) {
-+ jbd_debug(1, "buffer chain changed, retrying\n");
-+ BUFFER_TRACE(partial->bh, "brelsing");
-+ brelse(partial->bh);
-+ partial--;
-+ }
-+ goto reread;
-+}
-+
-+/*
-+ * The BKL is not held on entry here.
-+ */
-+static int ext3_get_block(struct inode *inode, long iblock,
-+ struct buffer_head *bh_result, int create)
-+{
-+ handle_t *handle = 0;
-+ int ret;
-+
-+ if (create) {
-+ handle = ext3_journal_current_handle();
-+ J_ASSERT(handle != 0);
-+ }
-+ ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
-+ return ret;
-+}
-+
-+/*
-+ * `handle' can be NULL if create is zero
-+ */
-+struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
-+ long block, int create, int * errp)
-+{
-+ struct buffer_head dummy;
-+ int fatal = 0, err;
-+
-+ J_ASSERT(handle != NULL || create == 0);
-+
-+ dummy.b_state = 0;
-+ dummy.b_blocknr = -1000;
-+ buffer_trace_init(&dummy.b_history);
-+ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
-+ if (!*errp && buffer_mapped(&dummy)) {
-+ struct buffer_head *bh;
-+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-+ if (buffer_new(&dummy)) {
-+ J_ASSERT(create != 0);
-+ J_ASSERT(handle != 0);
-+
-+ /* Now that we do not always journal data, we
-+ should keep in mind whether this should
-+ always journal the new buffer as metadata.
-+ For now, regular file writes use
-+ ext3_get_block instead, so it's not a
-+ problem. */
-+ lock_kernel();
-+ lock_buffer(bh);
-+ BUFFER_TRACE(bh, "call get_create_access");
-+ fatal = ext3_journal_get_create_access(handle, bh);
-+ if (!fatal) {
-+ memset(bh->b_data, 0,
-+ inode->i_sb->s_blocksize);
-+ mark_buffer_uptodate(bh, 1);
-+ }
-+ unlock_buffer(bh);
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (!fatal) fatal = err;
-+ unlock_kernel();
-+ } else {
-+ BUFFER_TRACE(bh, "not a new buffer");
-+ }
-+ if (fatal) {
-+ *errp = fatal;
-+ brelse(bh);
-+ bh = NULL;
-+ }
-+ return bh;
-+ }
-+ return NULL;
-+}
-+
-+struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
-+ int block, int create, int *err)
-+{
-+ struct buffer_head * bh;
-+ int prev_blocks;
-+
-+ prev_blocks = inode->i_blocks;
-+
-+ bh = ext3_getblk (handle, inode, block, create, err);
-+ if (!bh)
-+ return bh;
-+#ifdef EXT3_PREALLOCATE
-+ /*
-+ * If the inode has grown, and this is a directory, then use a few
-+ * more of the preallocated blocks to keep directory fragmentation
-+ * down. The preallocated blocks are guaranteed to be contiguous.
-+ */
-+ if (create &&
-+ S_ISDIR(inode->i_mode) &&
-+ inode->i_blocks > prev_blocks &&
-+ EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-+ EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
-+ int i;
-+ struct buffer_head *tmp_bh;
-+
-+ for (i = 1;
-+ inode->u.ext3_i.i_prealloc_count &&
-+ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
-+ i++) {
-+ /*
-+ * ext3_getblk will zero out the contents of the
-+ * directory for us
-+ */
-+ tmp_bh = ext3_getblk(handle, inode,
-+ block+i, create, err);
-+ if (!tmp_bh) {
-+ brelse (bh);
-+ return 0;
-+ }
-+ brelse (tmp_bh);
-+ }
-+ }
-+#endif
-+ if (buffer_uptodate(bh))
-+ return bh;
-+ ll_rw_block (READ, 1, &bh);
-+ wait_on_buffer (bh);
-+ if (buffer_uptodate(bh))
-+ return bh;
-+ brelse (bh);
-+ *err = -EIO;
-+ return NULL;
-+}
-+
-+static int walk_page_buffers( handle_t *handle,
-+ struct buffer_head *head,
-+ unsigned from,
-+ unsigned to,
-+ int *partial,
-+ int (*fn)( handle_t *handle,
-+ struct buffer_head *bh))
-+{
-+ struct buffer_head *bh;
-+ unsigned block_start, block_end;
-+ unsigned blocksize = head->b_size;
-+ int err, ret = 0;
-+
-+ for ( bh = head, block_start = 0;
-+ ret == 0 && (bh != head || !block_start);
-+ block_start = block_end, bh = bh->b_this_page)
-+ {
-+ block_end = block_start + blocksize;
-+ if (block_end <= from || block_start >= to) {
-+ if (partial && !buffer_uptodate(bh))
-+ *partial = 1;
-+ continue;
-+ }
-+ err = (*fn)(handle, bh);
-+ if (!ret)
-+ ret = err;
-+ }
-+ return ret;
-+}
-+
-+/*
-+ * To preserve ordering, it is essential that the hole instantiation and
-+ * the data write be encapsulated in a single transaction. We cannot
-+ * close off a transaction and start a new one between the ext3_get_block()
-+ * and the commit_write(). So doing the journal_start at the start of
-+ * prepare_write() is the right place.
-+ *
-+ * Also, this function can nest inside ext3_writepage() ->
-+ * block_write_full_page(). In that case, we *know* that ext3_writepage()
-+ * has generated enough buffer credits to do the whole page. So we won't
-+ * block on the journal in that case, which is good, because the caller may
-+ * be PF_MEMALLOC.
-+ *
-+ * By accident, ext3 can be reentered when a transaction is open via
-+ * quota file writes. If we were to commit the transaction while thus
-+ * reentered, there can be a deadlock - we would be holding a quota
-+ * lock, and the commit would never complete if another thread had a
-+ * transaction open and was blocking on the quota lock - a ranking
-+ * violation.
-+ *
-+ * So what we do is to rely on the fact that journal_stop/journal_start
-+ * will _not_ run commit under these circumstances because handle->h_ref
-+ * is elevated. We'll still have enough credits for the tiny quotafile
-+ * write.
-+ */
-+
-+static int do_journal_get_write_access(handle_t *handle,
-+ struct buffer_head *bh)
-+{
-+ return ext3_journal_get_write_access(handle, bh);
-+}
-+
-+static int ext3_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to)
-+{
-+ struct inode *inode = page->mapping->host;
-+ int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
-+ handle_t *handle;
-+
-+ lock_kernel();
-+ handle = ext3_journal_start(inode, needed_blocks);
-+ if (IS_ERR(handle)) {
-+ ret = PTR_ERR(handle);
-+ goto out;
-+ }
-+ unlock_kernel();
-+ ret = block_prepare_write(page, from, to, ext3_get_block);
-+ lock_kernel();
-+ if (ret != 0)
-+ goto prepare_write_failed;
-+
-+ if (ext3_should_journal_data(inode)) {
-+ ret = walk_page_buffers(handle, page->buffers,
-+ from, to, NULL, do_journal_get_write_access);
-+ if (ret) {
-+ /*
-+ * We're going to fail this prepare_write(),
-+ * so commit_write() will not be called.
-+ * We need to undo block_prepare_write()'s kmap().
-+ * AKPM: Do we need to clear PageUptodate? I don't
-+ * think so.
-+ */
-+ kunmap(page);
-+ }
-+ }
-+prepare_write_failed:
-+ if (ret)
-+ ext3_journal_stop(handle, inode);
-+out:
-+ unlock_kernel();
-+ return ret;
-+}
-+
-+static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
-+{
-+ return ext3_journal_dirty_data(handle, bh, 0);
-+}
-+
-+/*
-+ * For ext3_writepage(). We also brelse() the buffer to account for
-+ * the bget() which ext3_writepage() performs.
-+ */
-+static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
-+{
-+ int ret = ext3_journal_dirty_data(handle, bh, 1);
-+ __brelse(bh);
-+ return ret;
-+}
-+
-+/* For commit_write() in data=journal mode */
-+static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
-+{
-+ set_bit(BH_Uptodate, &bh->b_state);
-+ return ext3_journal_dirty_metadata(handle, bh);
-+}
-+
-+/*
-+ * We need to pick up the new inode size which generic_commit_write gave us
-+ * `file' can be NULL - eg, when called from block_symlink().
-+ *
-+ * ext3 inode->i_dirty_buffers policy: If we're journalling data we
-+ * definitely don't want them to appear on the inode at all - instead
-+ * we need to manage them at the JBD layer and we need to intercept
-+ * the relevant sync operations and translate them into journal operations.
-+ *
-+ * If we're not journalling data then we can just leave the buffers
-+ * on ->i_dirty_buffers. If someone writes them out for us then thanks.
-+ * Otherwise we'll do it in commit, if we're using ordered data.
-+ */
-+
-+static int ext3_commit_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to)
-+{
-+ handle_t *handle = ext3_journal_current_handle();
-+ struct inode *inode = page->mapping->host;
-+ int ret = 0, ret2;
-+
-+ lock_kernel();
-+ if (ext3_should_journal_data(inode)) {
-+ /*
-+ * Here we duplicate the generic_commit_write() functionality
-+ */
-+ int partial = 0;
-+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-+
-+ ret = walk_page_buffers(handle, page->buffers,
-+ from, to, &partial, commit_write_fn);
-+ if (!partial)
-+ SetPageUptodate(page);
-+ kunmap(page);
-+ if (pos > inode->i_size)
-+ inode->i_size = pos;
-+ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
-+ } else {
-+ if (ext3_should_order_data(inode)) {
-+ ret = walk_page_buffers(handle, page->buffers,
-+ from, to, NULL, journal_dirty_sync_data);
-+ }
-+ /* Be careful here if generic_commit_write becomes a
-+ * required invocation after block_prepare_write. */
-+ if (ret == 0) {
-+ ret = generic_commit_write(file, page, from, to);
-+ } else {
-+ /*
-+ * block_prepare_write() was called, but we're not
-+ * going to call generic_commit_write(). So we
-+ * need to perform generic_commit_write()'s kunmap
-+ * by hand.
-+ */
-+ kunmap(page);
-+ }
-+ }
-+ if (inode->i_size > inode->u.ext3_i.i_disksize) {
-+ inode->u.ext3_i.i_disksize = inode->i_size;
-+ ret2 = ext3_mark_inode_dirty(handle, inode);
-+ if (!ret)
-+ ret = ret2;
-+ }
-+ ret2 = ext3_journal_stop(handle, inode);
-+ unlock_kernel();
-+ if (!ret)
-+ ret = ret2;
-+ return ret;
-+}
-+
-+/*
-+ * bmap() is special. It gets used by applications such as lilo and by
-+ * the swapper to find the on-disk block of a specific piece of data.
-+ *
-+ * Naturally, this is dangerous if the block concerned is still in the
-+ * journal. If somebody makes a swapfile on an ext3 data-journaling
-+ * filesystem and enables swap, then they may get a nasty shock when the
-+ * data getting swapped to that swapfile suddenly gets overwritten by
-+ * the original zero's written out previously to the journal and
-+ * awaiting writeback in the kernel's buffer cache.
-+ *
-+ * So, if we see any bmap calls here on a modified, data-journaled file,
-+ * take extra steps to flush any blocks which might be in the cache.
-+ */
-+static int ext3_bmap(struct address_space *mapping, long block)
-+{
-+ struct inode *inode = mapping->host;
-+ journal_t *journal;
-+ int err;
-+
-+ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
-+ /*
-+ * This is a REALLY heavyweight approach, but the use of
-+ * bmap on dirty files is expected to be extremely rare:
-+ * only if we run lilo or swapon on a freshly made file
-+ * do we expect this to happen.
-+ *
-+ * (bmap requires CAP_SYS_RAWIO so this does not
-+ * represent an unprivileged user DOS attack --- we'd be
-+ * in trouble if mortal users could trigger this path at
-+ * will.)
-+ *
-+ * NB. EXT3_STATE_JDATA is not set on files other than
-+ * regular files. If somebody wants to bmap a directory
-+ * or symlink and gets confused because the buffer
-+ * hasn't yet been flushed to disk, they deserve
-+ * everything they get.
-+ */
-+
-+ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
-+ journal = EXT3_JOURNAL(inode);
-+ journal_lock_updates(journal);
-+ err = journal_flush(journal);
-+ journal_unlock_updates(journal);
-+
-+ if (err)
-+ return 0;
-+ }
-+
-+ return generic_block_bmap(mapping,block,ext3_get_block);
-+}
-+
-+static int bget_one(handle_t *handle, struct buffer_head *bh)
-+{
-+ atomic_inc(&bh->b_count);
-+ return 0;
-+}
-+
-+/*
-+ * Note that we always start a transaction even if we're not journalling
-+ * data. This is to preserve ordering: any hole instantiation within
-+ * __block_write_full_page -> ext3_get_block() should be journalled
-+ * along with the data so we don't crash and then get metadata which
-+ * refers to old data.
-+ *
-+ * In all journalling modes block_write_full_page() will start the I/O.
-+ *
-+ * Problem:
-+ *
-+ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
-+ * ext3_writepage()
-+ *
-+ * Similar for:
-+ *
-+ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
-+ *
-+ * Same applies to ext3_get_block(). We will deadlock on various things like
-+ * lock_journal and i_truncate_sem.
-+ *
-+ * Setting PF_MEMALLOC here doesn't work - too many internal memory
-+ * allocations fail.
-+ *
-+ * 16May01: If we're reentered then journal_current_handle() will be
-+ * non-zero. We simply *return*.
-+ *
-+ * 1 July 2001: @@@ FIXME:
-+ * In journalled data mode, a data buffer may be metadata against the
-+ * current transaction. But the same file is part of a shared mapping
-+ * and someone does a writepage() on it.
-+ *
-+ * We will move the buffer onto the async_data list, but *after* it has
-+ * been dirtied. So there's a small window where we have dirty data on
-+ * BJ_Metadata.
-+ *
-+ * Note that this only applies to the last partial page in the file. The
-+ * bit which block_write_full_page() uses prepare/commit for. (That's
-+ * broken code anyway: it's wrong for msync()).
-+ *
-+ * It's a rare case: affects the final partial page, for journalled data
-+ * where the file is subject to bith write() and writepage() in the same
-+ * transction. To fix it we'll need a custom block_write_full_page().
-+ * We'll probably need that anyway for journalling writepage() output.
-+ *
-+ * We don't honour synchronous mounts for writepage(). That would be
-+ * disastrous. Any write() or metadata operation will sync the fs for
-+ * us.
-+ */
-+static int ext3_writepage(struct page *page)
-+{
-+ struct inode *inode = page->mapping->host;
-+ struct buffer_head *page_buffers;
-+ handle_t *handle = NULL;
-+ int ret = 0, err;
-+ int needed;
-+ int order_data;
-+
-+ J_ASSERT(PageLocked(page));
-+
-+ /*
-+ * We give up here if we're reentered, because it might be
-+ * for a different filesystem. One *could* look for a
-+ * nested transaction opportunity.
-+ */
-+ lock_kernel();
-+ if (ext3_journal_current_handle())
-+ goto out_fail;
-+
-+ needed = ext3_writepage_trans_blocks(inode);
-+ if (current->flags & PF_MEMALLOC)
-+ handle = ext3_journal_try_start(inode, needed);
-+ else
-+ handle = ext3_journal_start(inode, needed);
-+
-+ if (IS_ERR(handle)) {
-+ ret = PTR_ERR(handle);
-+ goto out_fail;
-+ }
-+
-+ order_data = ext3_should_order_data(inode) ||
-+ ext3_should_journal_data(inode);
-+
-+ unlock_kernel();
-+
-+ page_buffers = NULL; /* Purely to prevent compiler warning */
-+
-+ /* bget() all the buffers */
-+ if (order_data) {
-+ if (!page->buffers)
-+ create_empty_buffers(page,
-+ inode->i_dev, inode->i_sb->s_blocksize);
-+ page_buffers = page->buffers;
-+ walk_page_buffers(handle, page_buffers, 0,
-+ PAGE_CACHE_SIZE, NULL, bget_one);
-+ }
-+
-+ ret = block_write_full_page(page, ext3_get_block);
-+
-+ /*
-+ * The page can become unlocked at any point now, and
-+ * truncate can then come in and change things. So we
-+ * can't touch *page from now on. But *page_buffers is
-+ * safe due to elevated refcount.
-+ */
-+
-+ handle = ext3_journal_current_handle();
-+ lock_kernel();
-+
-+ /* And attach them to the current transaction */
-+ if (order_data) {
-+ err = walk_page_buffers(handle, page_buffers,
-+ 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
-+ if (!ret)
-+ ret = err;
-+ }
-+
-+ err = ext3_journal_stop(handle, inode);
-+ if (!ret)
-+ ret = err;
-+ unlock_kernel();
-+ return ret;
-+
-+out_fail:
-+
-+ unlock_kernel();
-+ SetPageDirty(page);
-+ UnlockPage(page);
-+ return ret;
-+}
-+
-+static int ext3_readpage(struct file *file, struct page *page)
-+{
-+ return block_read_full_page(page,ext3_get_block);
-+}
-+
-+
-+static int ext3_flushpage(struct page *page, unsigned long offset)
-+{
-+ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-+ return journal_flushpage(journal, page, offset);
-+}
-+
-+static int ext3_releasepage(struct page *page, int wait)
-+{
-+ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-+ return journal_try_to_free_buffers(journal, page, wait);
-+}
-+
-+
-+struct address_space_operations ext3_aops = {
-+ readpage: ext3_readpage, /* BKL not held. Don't need */
-+ writepage: ext3_writepage, /* BKL not held. We take it */
-+ sync_page: block_sync_page,
-+ prepare_write: ext3_prepare_write, /* BKL not held. We take it */
-+ commit_write: ext3_commit_write, /* BKL not held. We take it */
-+ bmap: ext3_bmap, /* BKL held */
-+ flushpage: ext3_flushpage, /* BKL not held. Don't need */
-+ releasepage: ext3_releasepage, /* BKL not held. Don't need */
-+};
-+
-+/*
-+ * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
-+ * up to the end of the block which corresponds to `from'.
-+ * This required during truncate. We need to physically zero the tail end
-+ * of that block so it doesn't yield old data if the file is later grown.
-+ */
-+static int ext3_block_truncate_page(handle_t *handle,
-+ struct address_space *mapping, loff_t from)
-+{
-+ unsigned long index = from >> PAGE_CACHE_SHIFT;
-+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
-+ unsigned blocksize, iblock, length, pos;
-+ struct inode *inode = mapping->host;
-+ struct page *page;
-+ struct buffer_head *bh;
-+ int err;
-+
-+ blocksize = inode->i_sb->s_blocksize;
-+ length = offset & (blocksize - 1);
-+
-+ /* Block boundary? Nothing to do */
-+ if (!length)
-+ return 0;
-+
-+ length = blocksize - length;
-+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-+
-+ page = grab_cache_page(mapping, index);
-+ err = -ENOMEM;
-+ if (!page)
-+ goto out;
-+
-+ if (!page->buffers)
-+ create_empty_buffers(page, inode->i_dev, blocksize);
-+
-+ /* Find the buffer that contains "offset" */
-+ bh = page->buffers;
-+ pos = blocksize;
-+ while (offset >= pos) {
-+ bh = bh->b_this_page;
-+ iblock++;
-+ pos += blocksize;
-+ }
-+
-+ err = 0;
-+ if (!buffer_mapped(bh)) {
-+ /* Hole? Nothing to do */
-+ if (buffer_uptodate(bh))
-+ goto unlock;
-+ ext3_get_block(inode, iblock, bh, 0);
-+ /* Still unmapped? Nothing to do */
-+ if (!buffer_mapped(bh))
-+ goto unlock;
-+ }
-+
-+ /* Ok, it's mapped. Make sure it's up-to-date */
-+ if (Page_Uptodate(page))
-+ set_bit(BH_Uptodate, &bh->b_state);
-+
-+ if (!buffer_uptodate(bh)) {
-+ err = -EIO;
-+ ll_rw_block(READ, 1, &bh);
-+ wait_on_buffer(bh);
-+ /* Uhhuh. Read error. Complain and punt. */
-+ if (!buffer_uptodate(bh))
-+ goto unlock;
-+ }
-+
-+ if (ext3_should_journal_data(inode)) {
-+ BUFFER_TRACE(bh, "get write access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto unlock;
-+ }
-+
-+ memset(kmap(page) + offset, 0, length);
-+ flush_dcache_page(page);
-+ kunmap(page);
-+
-+ BUFFER_TRACE(bh, "zeroed end of block");
-+
-+ err = 0;
-+ if (ext3_should_journal_data(inode)) {
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ } else {
-+ if (ext3_should_order_data(inode))
-+ err = ext3_journal_dirty_data(handle, bh, 0);
-+ __mark_buffer_dirty(bh);
-+ }
-+
-+unlock:
-+ UnlockPage(page);
-+ page_cache_release(page);
-+out:
-+ return err;
-+}
-+
-+/*
-+ * Probably it should be a library function... search for first non-zero word
-+ * or memcmp with zero_page, whatever is better for particular architecture.
-+ * Linus?
-+ */
-+static inline int all_zeroes(u32 *p, u32 *q)
-+{
-+ while (p < q)
-+ if (*p++)
-+ return 0;
-+ return 1;
-+}
-+
-+/**
-+ * ext3_find_shared - find the indirect blocks for partial truncation.
-+ * @inode: inode in question
-+ * @depth: depth of the affected branch
-+ * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
-+ * @chain: place to store the pointers to partial indirect blocks
-+ * @top: place to the (detached) top of branch
-+ *
-+ * This is a helper function used by ext3_truncate().
-+ *
-+ * When we do truncate() we may have to clean the ends of several
-+ * indirect blocks but leave the blocks themselves alive. Block is
-+ * partially truncated if some data below the new i_size is refered
-+ * from it (and it is on the path to the first completely truncated
-+ * data block, indeed). We have to free the top of that path along
-+ * with everything to the right of the path. Since no allocation
-+ * past the truncation point is possible until ext3_truncate()
-+ * finishes, we may safely do the latter, but top of branch may
-+ * require special attention - pageout below the truncation point
-+ * might try to populate it.
-+ *
-+ * We atomically detach the top of branch from the tree, store the
-+ * block number of its root in *@top, pointers to buffer_heads of
-+ * partially truncated blocks - in @chain[].bh and pointers to
-+ * their last elements that should not be removed - in
-+ * @chain[].p. Return value is the pointer to last filled element
-+ * of @chain.
-+ *
-+ * The work left to caller to do the actual freeing of subtrees:
-+ * a) free the subtree starting from *@top
-+ * b) free the subtrees whose roots are stored in
-+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
-+ * c) free the subtrees growing from the inode past the @chain[0].
-+ * (no partially truncated stuff there). */
-+
-+static Indirect *ext3_find_shared(struct inode *inode,
-+ int depth,
-+ int offsets[4],
-+ Indirect chain[4],
-+ u32 *top)
-+{
-+ Indirect *partial, *p;
-+ int k, err;
-+
-+ *top = 0;
-+ /* Make k index the deepest non-null offest + 1 */
-+ for (k = depth; k > 1 && !offsets[k-1]; k--)
-+ ;
-+ partial = ext3_get_branch(inode, k, offsets, chain, &err);
-+ /* Writer: pointers */
-+ if (!partial)
-+ partial = chain + k-1;
-+ /*
-+ * If the branch acquired continuation since we've looked at it -
-+ * fine, it should all survive and (new) top doesn't belong to us.
-+ */
-+ if (!partial->key && *partial->p)
-+ /* Writer: end */
-+ goto no_top;
-+ for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
-+ ;
-+ /*
-+ * OK, we've found the last block that must survive. The rest of our
-+ * branch should be detached before unlocking. However, if that rest
-+ * of branch is all ours and does not grow immediately from the inode
-+ * it's easier to cheat and just decrement partial->p.
-+ */
-+ if (p == chain + k - 1 && p > chain) {
-+ p->p--;
-+ } else {
-+ *top = *p->p;
-+ /* Nope, don't do this in ext3. Must leave the tree intact */
-+#if 0
-+ *p->p = 0;
-+#endif
-+ }
-+ /* Writer: end */
-+
-+ while(partial > p)
-+ {
-+ brelse(partial->bh);
-+ partial--;
-+ }
-+no_top:
-+ return partial;
-+}
-+
-+/*
-+ * Zero a number of block pointers in either an inode or an indirect block.
-+ * If we restart the transaction we must again get write access to the
-+ * indirect block for further modification.
-+ *
-+ * We release `count' blocks on disk, but (last - first) may be greater
-+ * than `count' because there can be holes in there.
-+ */
-+static void
-+ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
-+ unsigned long block_to_free, unsigned long count,
-+ u32 *first, u32 *last)
-+{
-+ u32 *p;
-+ if (try_to_extend_transaction(handle, inode)) {
-+ if (bh) {
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, bh);
-+ }
-+ ext3_mark_inode_dirty(handle, inode);
-+ ext3_journal_test_restart(handle, inode);
-+ BUFFER_TRACE(bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, bh);
-+ }
-+
-+ /*
-+ * Any buffers which are on the journal will be in memory. We find
-+ * them on the hash table so journal_revoke() will run journal_forget()
-+ * on them. We've already detached each block from the file, so
-+ * bforget() in journal_forget() should be safe.
-+ *
-+ * AKPM: turn on bforget in journal_forget()!!!
-+ */
-+ for (p = first; p < last; p++) {
-+ u32 nr = le32_to_cpu(*p);
-+ if (nr) {
-+ struct buffer_head *bh;
-+
-+ *p = 0;
-+ bh = sb_get_hash_table(inode->i_sb, nr);
-+ ext3_forget(handle, 0, inode, bh, nr);
-+ }
-+ }
-+
-+ ext3_free_blocks(handle, inode, block_to_free, count);
-+}
-+
-+/**
-+ * ext3_free_data - free a list of data blocks
-+ * @handle: handle for this transaction
-+ * @inode: inode we are dealing with
-+ * @this_bh: indirect buffer_head which contains *@first and *@last
-+ * @first: array of block numbers
-+ * @last: points immediately past the end of array
-+ *
-+ * We are freeing all blocks refered from that array (numbers are stored as
-+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
-+ *
-+ * We accumulate contiguous runs of blocks to free. Conveniently, if these
-+ * blocks are contiguous then releasing them at one time will only affect one
-+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
-+ * actually use a lot of journal space.
-+ *
-+ * @this_bh will be %NULL if @first and @last point into the inode's direct
-+ * block pointers.
-+ */
-+static void ext3_free_data(handle_t *handle, struct inode *inode,
-+ struct buffer_head *this_bh, u32 *first, u32 *last)
-+{
-+ unsigned long block_to_free = 0; /* Starting block # of a run */
-+ unsigned long count = 0; /* Number of blocks in the run */
-+ u32 *block_to_free_p = NULL; /* Pointer into inode/ind
-+ corresponding to
-+ block_to_free */
-+ unsigned long nr; /* Current block # */
-+ u32 *p; /* Pointer into inode/ind
-+ for current block */
-+ int err;
-+
-+ if (this_bh) { /* For indirect block */
-+ BUFFER_TRACE(this_bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, this_bh);
-+ /* Important: if we can't update the indirect pointers
-+ * to the blocks, we can't free them. */
-+ if (err)
-+ return;
-+ }
-+
-+ for (p = first; p < last; p++) {
-+ nr = le32_to_cpu(*p);
-+ if (nr) {
-+ /* accumulate blocks to free if they're contiguous */
-+ if (count == 0) {
-+ block_to_free = nr;
-+ block_to_free_p = p;
-+ count = 1;
-+ } else if (nr == block_to_free + count) {
-+ count++;
-+ } else {
-+ ext3_clear_blocks(handle, inode, this_bh,
-+ block_to_free,
-+ count, block_to_free_p, p);
-+ block_to_free = nr;
-+ block_to_free_p = p;
-+ count = 1;
-+ }
-+ }
-+ }
-+
-+ if (count > 0)
-+ ext3_clear_blocks(handle, inode, this_bh, block_to_free,
-+ count, block_to_free_p, p);
-+
-+ if (this_bh) {
-+ BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, this_bh);
-+ }
-+}
-+
-+/**
-+ * ext3_free_branches - free an array of branches
-+ * @handle: JBD handle for this transaction
-+ * @inode: inode we are dealing with
-+ * @parent_bh: the buffer_head which contains *@first and *@last
-+ * @first: array of block numbers
-+ * @last: pointer immediately past the end of array
-+ * @depth: depth of the branches to free
-+ *
-+ * We are freeing all blocks refered from these branches (numbers are
-+ * stored as little-endian 32-bit) and updating @inode->i_blocks
-+ * appropriately.
-+ */
-+static void ext3_free_branches(handle_t *handle, struct inode *inode,
-+ struct buffer_head *parent_bh,
-+ u32 *first, u32 *last, int depth)
-+{
-+ unsigned long nr;
-+ u32 *p;
-+
-+ if (is_handle_aborted(handle))
-+ return;
-+
-+ if (depth--) {
-+ struct buffer_head *bh;
-+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-+ p = last;
-+ while (--p >= first) {
-+ nr = le32_to_cpu(*p);
-+ if (!nr)
-+ continue; /* A hole */
-+
-+ /* Go read the buffer for the next level down */
-+ bh = sb_bread(inode->i_sb, nr);
-+
-+ /*
-+ * A read failure? Report error and clear slot
-+ * (should be rare).
-+ */
-+ if (!bh) {
-+ ext3_error(inode->i_sb, "ext3_free_branches",
-+ "Read failure, inode=%ld, block=%ld",
-+ inode->i_ino, nr);
-+ continue;
-+ }
-+
-+ /* This zaps the entire block. Bottom up. */
-+ BUFFER_TRACE(bh, "free child branches");
-+ ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
-+ (u32*)bh->b_data + addr_per_block,
-+ depth);
-+
-+ /*
-+ * We've probably journalled the indirect block several
-+ * times during the truncate. But it's no longer
-+ * needed and we now drop it from the transaction via
-+ * journal_revoke().
-+ *
-+ * That's easy if it's exclusively part of this
-+ * transaction. But if it's part of the committing
-+ * transaction then journal_forget() will simply
-+ * brelse() it. That means that if the underlying
-+ * block is reallocated in ext3_get_block(),
-+ * unmap_underlying_metadata() will find this block
-+ * and will try to get rid of it. damn, damn.
-+ *
-+ * If this block has already been committed to the
-+ * journal, a revoke record will be written. And
-+ * revoke records must be emitted *before* clearing
-+ * this block's bit in the bitmaps.
-+ */
-+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-+
-+ /*
-+ * Everything below this this pointer has been
-+ * released. Now let this top-of-subtree go.
-+ *
-+ * We want the freeing of this indirect block to be
-+ * atomic in the journal with the updating of the
-+ * bitmap block which owns it. So make some room in
-+ * the journal.
-+ *
-+ * We zero the parent pointer *after* freeing its
-+ * pointee in the bitmaps, so if extend_transaction()
-+ * for some reason fails to put the bitmap changes and
-+ * the release into the same transaction, recovery
-+ * will merely complain about releasing a free block,
-+ * rather than leaking blocks.
-+ */
-+ if (is_handle_aborted(handle))
-+ return;
-+ if (try_to_extend_transaction(handle, inode)) {
-+ ext3_mark_inode_dirty(handle, inode);
-+ ext3_journal_test_restart(handle, inode);
-+ }
-+
-+ ext3_free_blocks(handle, inode, nr, 1);
-+
-+ if (parent_bh) {
-+ /*
-+ * The block which we have just freed is
-+ * pointed to by an indirect block: journal it
-+ */
-+ BUFFER_TRACE(parent_bh, "get_write_access");
-+ if (!ext3_journal_get_write_access(handle,
-+ parent_bh)){
-+ *p = 0;
-+ BUFFER_TRACE(parent_bh,
-+ "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle,
-+ parent_bh);
-+ }
-+ }
-+ }
-+ } else {
-+ /* We have reached the bottom of the tree. */
-+ BUFFER_TRACE(parent_bh, "free data blocks");
-+ ext3_free_data(handle, inode, parent_bh, first, last);
-+ }
-+}
-+
-+/*
-+ * ext3_truncate()
-+ *
-+ * We block out ext3_get_block() block instantiations across the entire
-+ * transaction, and VFS/VM ensures that ext3_truncate() cannot run
-+ * simultaneously on behalf of the same inode.
-+ *
-+ * As we work through the truncate and commmit bits of it to the journal there
-+ * is one core, guiding principle: the file's tree must always be consistent on
-+ * disk. We must be able to restart the truncate after a crash.
-+ *
-+ * The file's tree may be transiently inconsistent in memory (although it
-+ * probably isn't), but whenever we close off and commit a journal transaction,
-+ * the contents of (the filesystem + the journal) must be consistent and
-+ * restartable. It's pretty simple, really: bottom up, right to left (although
-+ * left-to-right works OK too).
-+ *
-+ * Note that at recovery time, journal replay occurs *before* the restart of
-+ * truncate against the orphan inode list.
-+ *
-+ * The committed inode has the new, desired i_size (which is the same as
-+ * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
-+ * that this inode's truncate did not complete and it will again call
-+ * ext3_truncate() to have another go. So there will be instantiated blocks
-+ * to the right of the truncation point in a crashed ext3 filesystem. But
-+ * that's fine - as long as they are linked from the inode, the post-crash
-+ * ext3_truncate() run will find them and release them.
-+ */
-+
-+void ext3_truncate(struct inode * inode)
-+{
-+ handle_t *handle;
-+ u32 *i_data = inode->u.ext3_i.i_data;
-+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-+ int offsets[4];
-+ Indirect chain[4];
-+ Indirect *partial;
-+ int nr = 0;
-+ int n;
-+ long last_block;
-+ unsigned blocksize;
-+
-+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-+ S_ISLNK(inode->i_mode)))
-+ return;
-+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-+ return;
-+
-+ ext3_discard_prealloc(inode);
-+
-+ handle = start_transaction(inode);
-+ if (IS_ERR(handle))
-+ return; /* AKPM: return what? */
-+
-+ blocksize = inode->i_sb->s_blocksize;
-+ last_block = (inode->i_size + blocksize-1)
-+ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-+
-+ ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
-+
-+
-+ n = ext3_block_to_path(inode, last_block, offsets);
-+ if (n == 0)
-+ goto out_stop; /* error */
-+
-+ /*
-+ * OK. This truncate is going to happen. We add the inode to the
-+ * orphan list, so that if this truncate spans multiple transactions,
-+ * and we crash, we will resume the truncate when the filesystem
-+ * recovers. It also marks the inode dirty, to catch the new size.
-+ *
-+ * Implication: the file must always be in a sane, consistent
-+ * truncatable state while each transaction commits.
-+ */
-+ if (ext3_orphan_add(handle, inode))
-+ goto out_stop;
-+
-+ /*
-+ * The orphan list entry will now protect us from any crash which
-+ * occurs before the truncate completes, so it is now safe to propagate
-+ * the new, shorter inode size (held for now in i_size) into the
-+ * on-disk inode. We do this via i_disksize, which is the value which
-+ * ext3 *really* writes onto the disk inode.
-+ */
-+ inode->u.ext3_i.i_disksize = inode->i_size;
-+
-+ /*
-+ * From here we block out all ext3_get_block() callers who want to
-+ * modify the block allocation tree.
-+ */
-+ down_write(&inode->u.ext3_i.truncate_sem);
-+
-+ if (n == 1) { /* direct blocks */
-+ ext3_free_data(handle, inode, NULL, i_data+offsets[0],
-+ i_data + EXT3_NDIR_BLOCKS);
-+ goto do_indirects;
-+ }
-+
-+ partial = ext3_find_shared(inode, n, offsets, chain, &nr);
-+ /* Kill the top of shared branch (not detached) */
-+ if (nr) {
-+ if (partial == chain) {
-+ /* Shared branch grows from the inode */
-+ ext3_free_branches(handle, inode, NULL,
-+ &nr, &nr+1, (chain+n-1) - partial);
-+ *partial->p = 0;
-+ /*
-+ * We mark the inode dirty prior to restart,
-+ * and prior to stop. No need for it here.
-+ */
-+ } else {
-+ /* Shared branch grows from an indirect block */
-+ BUFFER_TRACE(partial->bh, "get_write_access");
-+ ext3_free_branches(handle, inode, partial->bh,
-+ partial->p,
-+ partial->p+1, (chain+n-1) - partial);
-+ }
-+ }
-+ /* Clear the ends of indirect blocks on the shared branch */
-+ while (partial > chain) {
-+ ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
-+ (u32*)partial->bh->b_data + addr_per_block,
-+ (chain+n-1) - partial);
-+ BUFFER_TRACE(partial->bh, "call brelse");
-+ brelse (partial->bh);
-+ partial--;
-+ }
-+do_indirects:
-+ /* Kill the remaining (whole) subtrees */
-+ switch (offsets[0]) {
-+ default:
-+ nr = i_data[EXT3_IND_BLOCK];
-+ if (nr) {
-+ ext3_free_branches(handle, inode, NULL,
-+ &nr, &nr+1, 1);
-+ i_data[EXT3_IND_BLOCK] = 0;
-+ }
-+ case EXT3_IND_BLOCK:
-+ nr = i_data[EXT3_DIND_BLOCK];
-+ if (nr) {
-+ ext3_free_branches(handle, inode, NULL,
-+ &nr, &nr+1, 2);
-+ i_data[EXT3_DIND_BLOCK] = 0;
-+ }
-+ case EXT3_DIND_BLOCK:
-+ nr = i_data[EXT3_TIND_BLOCK];
-+ if (nr) {
-+ ext3_free_branches(handle, inode, NULL,
-+ &nr, &nr+1, 3);
-+ i_data[EXT3_TIND_BLOCK] = 0;
-+ }
-+ case EXT3_TIND_BLOCK:
-+ ;
-+ }
-+ up_write(&inode->u.ext3_i.truncate_sem);
-+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-+ ext3_mark_inode_dirty(handle, inode);
-+
-+ /* In a multi-transaction truncate, we only make the final
-+ * transaction synchronous */
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+out_stop:
-+ /*
-+ * If this was a simple ftruncate(), and the file will remain alive
-+ * then we need to clear up the orphan record which we created above.
-+ * However, if this was a real unlink then we were called by
-+ * ext3_delete_inode(), and we allow that function to clean up the
-+ * orphan info for us.
-+ */
-+ if (inode->i_nlink)
-+ ext3_orphan_del(handle, inode);
-+
-+ ext3_journal_stop(handle, inode);
-+}
-+
-+/*
-+ * ext3_get_inode_loc returns with an extra refcount against the
-+ * inode's underlying buffer_head on success.
-+ */
-+
-+int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
-+{
-+ struct buffer_head *bh = 0;
-+ unsigned long block;
-+ unsigned long block_group;
-+ unsigned long group_desc;
-+ unsigned long desc;
-+ unsigned long offset;
-+ struct ext3_group_desc * gdp;
-+
-+ if ((inode->i_ino != EXT3_ROOT_INO &&
-+ inode->i_ino != EXT3_ACL_IDX_INO &&
-+ inode->i_ino != EXT3_ACL_DATA_INO &&
-+ inode->i_ino != EXT3_JOURNAL_INO &&
-+ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
-+ inode->i_ino > le32_to_cpu(
-+ inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
-+ ext3_error (inode->i_sb, "ext3_get_inode_loc",
-+ "bad inode number: %lu", inode->i_ino);
-+ goto bad_inode;
-+ }
-+ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
-+ if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
-+ ext3_error (inode->i_sb, "ext3_get_inode_loc",
-+ "group >= groups count");
-+ goto bad_inode;
-+ }
-+ group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
-+ desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
-+ bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
-+ if (!bh) {
-+ ext3_error (inode->i_sb, "ext3_get_inode_loc",
-+ "Descriptor not loaded");
-+ goto bad_inode;
-+ }
-+
-+ gdp = (struct ext3_group_desc *) bh->b_data;
-+ /*
-+ * Figure out the offset within the block group inode table
-+ */
-+ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
-+ EXT3_INODE_SIZE(inode->i_sb);
-+ block = le32_to_cpu(gdp[desc].bg_inode_table) +
-+ (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
-+ if (!(bh = sb_bread(inode->i_sb, block))) {
-+ ext3_error (inode->i_sb, "ext3_get_inode_loc",
-+ "unable to read inode block - "
-+ "inode=%lu, block=%lu", inode->i_ino, block);
-+ goto bad_inode;
-+ }
-+ offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
-+
-+ iloc->bh = bh;
-+ iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
-+ iloc->block_group = block_group;
-+
-+ return 0;
-+
-+ bad_inode:
-+ return -EIO;
-+}
-+
-+void ext3_read_inode(struct inode * inode)
-+{
-+ struct ext3_iloc iloc;
-+ struct ext3_inode *raw_inode;
-+ struct buffer_head *bh;
-+ int block;
-+
-+ if(ext3_get_inode_loc(inode, &iloc))
-+ goto bad_inode;
-+ bh = iloc.bh;
-+ raw_inode = iloc.raw_inode;
-+ init_rwsem(&inode->u.ext3_i.truncate_sem);
-+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
-+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
-+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
-+ if(!(test_opt (inode->i_sb, NO_UID32))) {
-+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
-+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
-+ }
-+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-+ inode->i_size = le32_to_cpu(raw_inode->i_size);
-+ inode->i_atime = le32_to_cpu(raw_inode->i_atime);
-+ inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
-+ inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
-+ inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
-+ /* We now have enough fields to check if the inode was active or not.
-+ * This is needed because nfsd might try to access dead inodes
-+ * the test is that same one that e2fsck uses
-+ * NeilBrown 1999oct15
-+ */
-+ if (inode->i_nlink == 0) {
-+ if (inode->i_mode == 0 ||
-+ !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
-+ /* this inode is deleted */
-+ brelse (bh);
-+ goto bad_inode;
-+ }
-+ /* The only unlinked inodes we let through here have
-+ * valid i_mode and are being read by the orphan
-+ * recovery code: that's fine, we're about to complete
-+ * the process of deleting those. */
-+ }
-+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
-+ * (for stat), not the fs block
-+ * size */
-+ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
-+ inode->i_version = ++event;
-+ inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
-+#ifdef EXT3_FRAGMENTS
-+ inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
-+ inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
-+ inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
-+#endif
-+ inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
-+ if (!S_ISREG(inode->i_mode)) {
-+ inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
-+ } else {
-+ inode->i_size |=
-+ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
-+ }
-+ inode->u.ext3_i.i_disksize = inode->i_size;
-+ inode->i_generation = le32_to_cpu(raw_inode->i_generation);
-+#ifdef EXT3_PREALLOCATE
-+ inode->u.ext3_i.i_prealloc_count = 0;
-+#endif
-+ inode->u.ext3_i.i_block_group = iloc.block_group;
-+
-+ /*
-+ * NOTE! The in-memory inode i_data array is in little-endian order
-+ * even on big-endian machines: we do NOT byteswap the block numbers!
-+ */
-+ for (block = 0; block < EXT3_N_BLOCKS; block++)
-+ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
-+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-+
-+ brelse (iloc.bh);
-+
-+ if (inode->i_ino == EXT3_ACL_IDX_INO ||
-+ inode->i_ino == EXT3_ACL_DATA_INO)
-+ /* Nothing to do */ ;
-+ else if (S_ISREG(inode->i_mode)) {
-+ inode->i_op = &ext3_file_inode_operations;
-+ inode->i_fop = &ext3_file_operations;
-+ inode->i_mapping->a_ops = &ext3_aops;
-+ } else if (S_ISDIR(inode->i_mode)) {
-+ inode->i_op = &ext3_dir_inode_operations;
-+ inode->i_fop = &ext3_dir_operations;
-+ } else if (S_ISLNK(inode->i_mode)) {
-+ if (!inode->i_blocks)
-+ inode->i_op = &ext3_fast_symlink_inode_operations;
-+ else {
-+ inode->i_op = &page_symlink_inode_operations;
-+ inode->i_mapping->a_ops = &ext3_aops;
-+ }
-+ } else
-+ init_special_inode(inode, inode->i_mode,
-+ le32_to_cpu(iloc.raw_inode->i_block[0]));
-+ /* inode->i_attr_flags = 0; unused */
-+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
-+ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
-+ inode->i_flags |= S_SYNC;
-+ }
-+ if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
-+ /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */
-+ inode->i_flags |= S_APPEND;
-+ }
-+ if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) {
-+ /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */
-+ inode->i_flags |= S_IMMUTABLE;
-+ }
-+ if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
-+ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */
-+ inode->i_flags |= S_NOATIME;
-+ }
-+ return;
-+
-+bad_inode:
-+ make_bad_inode(inode);
-+ return;
-+}
-+
-+/*
-+ * Post the struct inode info into an on-disk inode location in the
-+ * buffer-cache. This gobbles the caller's reference to the
-+ * buffer_head in the inode location struct.
-+ */
-+
-+static int ext3_do_update_inode(handle_t *handle,
-+ struct inode *inode,
-+ struct ext3_iloc *iloc)
-+{
-+ struct ext3_inode *raw_inode = iloc->raw_inode;
-+ struct buffer_head *bh = iloc->bh;
-+ int err = 0, rc, block;
-+
-+ if (handle) {
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto out_brelse;
-+ }
-+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
-+ if(!(test_opt(inode->i_sb, NO_UID32))) {
-+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
-+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
-+/*
-+ * Fix up interoperability with old kernels. Otherwise, old inodes get
-+ * re-used with the upper 16 bits of the uid/gid intact
-+ */
-+ if(!inode->u.ext3_i.i_dtime) {
-+ raw_inode->i_uid_high =
-+ cpu_to_le16(high_16_bits(inode->i_uid));
-+ raw_inode->i_gid_high =
-+ cpu_to_le16(high_16_bits(inode->i_gid));
-+ } else {
-+ raw_inode->i_uid_high = 0;
-+ raw_inode->i_gid_high = 0;
-+ }
-+ } else {
-+ raw_inode->i_uid_low =
-+ cpu_to_le16(fs_high2lowuid(inode->i_uid));
-+ raw_inode->i_gid_low =
-+ cpu_to_le16(fs_high2lowgid(inode->i_gid));
-+ raw_inode->i_uid_high = 0;
-+ raw_inode->i_gid_high = 0;
-+ }
-+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-+ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
-+ raw_inode->i_atime = cpu_to_le32(inode->i_atime);
-+ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
-+ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
-+ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
-+ raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
-+ raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
-+#ifdef EXT3_FRAGMENTS
-+ raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
-+ raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
-+ raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
-+#else
-+ /* If we are not tracking these fields in the in-memory inode,
-+ * then preserve them on disk, but still initialise them to zero
-+ * for new inodes. */
-+ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
-+ raw_inode->i_faddr = 0;
-+ raw_inode->i_frag = 0;
-+ raw_inode->i_fsize = 0;
-+ }
-+#endif
-+ raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
-+ if (!S_ISREG(inode->i_mode)) {
-+ raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
-+ } else {
-+ raw_inode->i_size_high =
-+ cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
-+ if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
-+ struct super_block *sb = inode->i_sb;
-+ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
-+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
-+ EXT3_SB(sb)->s_es->s_rev_level ==
-+ cpu_to_le32(EXT3_GOOD_OLD_REV)) {
-+ /* If this is the first large file
-+ * created, add a flag to the superblock.
-+ */
-+ err = ext3_journal_get_write_access(handle,
-+ sb->u.ext3_sb.s_sbh);
-+ if (err)
-+ goto out_brelse;
-+ ext3_update_dynamic_rev(sb);
-+ EXT3_SET_RO_COMPAT_FEATURE(sb,
-+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-+ sb->s_dirt = 1;
-+ handle->h_sync = 1;
-+ err = ext3_journal_dirty_metadata(handle,
-+ sb->u.ext3_sb.s_sbh);
-+ }
-+ }
-+ }
-+ raw_inode->i_generation = le32_to_cpu(inode->i_generation);
-+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-+ raw_inode->i_block[0] =
-+ cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
-+ else for (block = 0; block < EXT3_N_BLOCKS; block++)
-+ raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
-+
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ rc = ext3_journal_dirty_metadata(handle, bh);
-+ if (!err)
-+ err = rc;
-+ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
-+
-+out_brelse:
-+ brelse (bh);
-+ ext3_std_error(inode->i_sb, err);
-+ return err;
-+}
-+
-+/*
-+ * ext3_write_inode()
-+ *
-+ * We are called from a few places:
-+ *
-+ * - Within generic_file_write() for O_SYNC files.
-+ * Here, there will be no transaction running. We wait for any running
-+ * trasnaction to commit.
-+ *
-+ * - Within sys_sync(), kupdate and such.
-+ * We wait on commit, if tol to.
-+ *
-+ * - Within prune_icache() (PF_MEMALLOC == true)
-+ * Here we simply return. We can't afford to block kswapd on the
-+ * journal commit.
-+ *
-+ * In all cases it is actually safe for us to return without doing anything,
-+ * because the inode has been copied into a raw inode buffer in
-+ * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
-+ * knfsd.
-+ *
-+ * Note that we are absolutely dependent upon all inode dirtiers doing the
-+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
-+ * which we are interested.
-+ *
-+ * It would be a bug for them to not do this. The code:
-+ *
-+ * mark_inode_dirty(inode)
-+ * stuff();
-+ * inode->i_size = expr;
-+ *
-+ * is in error because a kswapd-driven write_inode() could occur while
-+ * `stuff()' is running, and the new i_size will be lost. Plus the inode
-+ * will no longer be on the superblock's dirty inode list.
-+ */
-+void ext3_write_inode(struct inode *inode, int wait)
-+{
-+ if (current->flags & PF_MEMALLOC)
-+ return;
-+
-+ if (ext3_journal_current_handle()) {
-+ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
-+ return;
-+ }
-+
-+ if (!wait)
-+ return;
-+
-+ ext3_force_commit(inode->i_sb);
-+}
-+
-+/*
-+ * ext3_setattr()
-+ *
-+ * Called from notify_change.
-+ *
-+ * We want to trap VFS attempts to truncate the file as soon as
-+ * possible. In particular, we want to make sure that when the VFS
-+ * shrinks i_size, we put the inode on the orphan list and modify
-+ * i_disksize immediately, so that during the subsequent flushing of
-+ * dirty pages and freeing of disk blocks, we can guarantee that any
-+ * commit will leave the blocks being flushed in an unused state on
-+ * disk. (On recovery, the inode will get truncated and the blocks will
-+ * be freed, so we have a strong guarantee that no future commit will
-+ * leave these blocks visible to the user.)
-+ *
-+ * This is only needed for regular files. rmdir() has its own path, and
-+ * we can never truncate a direcory except on final unlink (at which
-+ * point i_nlink is zero so recovery is easy.)
-+ *
-+ * Called with the BKL.
-+ */
-+
-+int ext3_setattr(struct dentry *dentry, struct iattr *attr)
-+{
-+ struct inode *inode = dentry->d_inode;
-+ int error, rc = 0;
-+ const unsigned int ia_valid = attr->ia_valid;
-+
-+ error = inode_change_ok(inode, attr);
-+ if (error)
-+ return error;
-+
-+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-+ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
-+ if (error)
-+ return error;
-+ }
-+
-+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
-+ handle_t *handle;
-+
-+ handle = ext3_journal_start(inode, 3);
-+ if (IS_ERR(handle)) {
-+ error = PTR_ERR(handle);
-+ goto err_out;
-+ }
-+
-+ error = ext3_orphan_add(handle, inode);
-+ inode->u.ext3_i.i_disksize = attr->ia_size;
-+ rc = ext3_mark_inode_dirty(handle, inode);
-+ if (!error)
-+ error = rc;
-+ ext3_journal_stop(handle, inode);
-+ }
-+
-+ rc = inode_setattr(inode, attr);
-+
-+ /* If inode_setattr's call to ext3_truncate failed to get a
-+ * transaction handle at all, we need to clean up the in-core
-+ * orphan list manually. */
-+ if (inode->i_nlink)
-+ ext3_orphan_del(NULL, inode);
-+
-+err_out:
-+ ext3_std_error(inode->i_sb, error);
-+ if (!error)
-+ error = rc;
-+ return error;
-+}
-+
-+
-+/*
-+ * akpm: how many blocks doth make a writepage()?
-+ *
-+ * With N blocks per page, it may be:
-+ * N data blocks
-+ * 2 indirect block
-+ * 2 dindirect
-+ * 1 tindirect
-+ * N+5 bitmap blocks (from the above)
-+ * N+5 group descriptor summary blocks
-+ * 1 inode block
-+ * 1 superblock.
-+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
-+ *
-+ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
-+ *
-+ * With ordered or writeback data it's the same, less the N data blocks.
-+ *
-+ * If the inode's direct blocks can hold an integral number of pages then a
-+ * page cannot straddle two indirect blocks, and we can only touch one indirect
-+ * and dindirect block, and the "5" above becomes "3".
-+ *
-+ * This still overestimates under most circumstances. If we were to pass the
-+ * start and end offsets in here as well we could do block_to_path() on each
-+ * block and work out the exact number of indirects which are touched. Pah.
-+ */
-+
-+int ext3_writepage_trans_blocks(struct inode *inode)
-+{
-+ int bpp = ext3_journal_blocks_per_page(inode);
-+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
-+ int ret;
-+
-+ if (ext3_should_journal_data(inode))
-+ ret = 3 * (bpp + indirects) + 2;
-+ else
-+ ret = 2 * (bpp + indirects) + 2;
-+
-+#ifdef CONFIG_QUOTA
-+ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
-+#endif
-+
-+ return ret;
-+}
-+
-+int
-+ext3_mark_iloc_dirty(handle_t *handle,
-+ struct inode *inode,
-+ struct ext3_iloc *iloc)
-+{
-+ int err = 0;
-+
-+ if (handle) {
-+ /* the do_update_inode consumes one bh->b_count */
-+ atomic_inc(&iloc->bh->b_count);
-+ err = ext3_do_update_inode(handle, inode, iloc);
-+ /* ext3_do_update_inode() does journal_dirty_metadata */
-+ brelse(iloc->bh);
-+ } else {
-+ printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
-+ }
-+ return err;
-+}
-+
-+/*
-+ * On success, We end up with an outstanding reference count against
-+ * iloc->bh. This _must_ be cleaned up later.
-+ */
-+
-+int
-+ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
-+ struct ext3_iloc *iloc)
-+{
-+ int err = 0;
-+ if (handle) {
-+ err = ext3_get_inode_loc(inode, iloc);
-+ if (!err) {
-+ BUFFER_TRACE(iloc->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, iloc->bh);
-+ if (err) {
-+ brelse(iloc->bh);
-+ iloc->bh = NULL;
-+ }
-+ }
-+ }
-+ ext3_std_error(inode->i_sb, err);
-+ return err;
-+}
-+
-+/*
-+ * akpm: What we do here is to mark the in-core inode as clean
-+ * with respect to inode dirtiness (it may still be data-dirty).
-+ * This means that the in-core inode may be reaped by prune_icache
-+ * without having to perform any I/O. This is a very good thing,
-+ * because *any* task may call prune_icache - even ones which
-+ * have a transaction open against a different journal.
-+ *
-+ * Is this cheating? Not really. Sure, we haven't written the
-+ * inode out, but prune_icache isn't a user-visible syncing function.
-+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
-+ * we start and wait on commits.
-+ *
-+ * Is this efficient/effective? Well, we're being nice to the system
-+ * by cleaning up our inodes proactively so they can be reaped
-+ * without I/O. But we are potentially leaving up to five seconds'
-+ * worth of inodes floating about which prune_icache wants us to
-+ * write out. One way to fix that would be to get prune_icache()
-+ * to do a write_super() to free up some memory. It has the desired
-+ * effect.
-+ */
-+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
-+{
-+ struct ext3_iloc iloc;
-+ int err;
-+
-+ err = ext3_reserve_inode_write(handle, inode, &iloc);
-+ if (!err)
-+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-+ return err;
-+}
-+
-+/*
-+ * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
-+ *
-+ * We're really interested in the case where a file is being extended.
-+ * i_size has been changed by generic_commit_write() and we thus need
-+ * to include the updated inode in the current transaction.
-+ *
-+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
-+ * are allocated to the file.
-+ *
-+ * If the inode is marked synchronous, we don't honour that here - doing
-+ * so would cause a commit on atime updates, which we don't bother doing.
-+ * We handle synchronous inodes at the highest possible level.
-+ */
-+void ext3_dirty_inode(struct inode *inode)
-+{
-+ handle_t *current_handle = ext3_journal_current_handle();
-+ handle_t *handle;
-+
-+ lock_kernel();
-+ handle = ext3_journal_start(inode, 1);
-+ if (IS_ERR(handle))
-+ goto out;
-+ if (current_handle &&
-+ current_handle->h_transaction != handle->h_transaction) {
-+ /* This task has a transaction open against a different fs */
-+ printk(KERN_EMERG __FUNCTION__": transactions do not match!\n");
-+ } else {
-+ jbd_debug(5, "marking dirty. outer handle=%p\n",
-+ current_handle);
-+ ext3_mark_inode_dirty(handle, inode);
-+ }
-+ ext3_journal_stop(handle, inode);
-+out:
-+ unlock_kernel();
-+}
-+
-+#ifdef AKPM
-+/*
-+ * Bind an inode's backing buffer_head into this transaction, to prevent
-+ * it from being flushed to disk early. Unlike
-+ * ext3_reserve_inode_write, this leaves behind no bh reference and
-+ * returns no iloc structure, so the caller needs to repeat the iloc
-+ * lookup to mark the inode dirty later.
-+ */
-+static inline int
-+ext3_pin_inode(handle_t *handle, struct inode *inode)
-+{
-+ struct ext3_iloc iloc;
-+
-+ int err = 0;
-+ if (handle) {
-+ err = ext3_get_inode_loc(inode, &iloc);
-+ if (!err) {
-+ BUFFER_TRACE(iloc.bh, "get_write_access");
-+ err = journal_get_write_access(handle, iloc.bh);
-+ if (!err)
-+ err = ext3_journal_dirty_metadata(handle,
-+ iloc.bh);
-+ brelse(iloc.bh);
-+ }
-+ }
-+ ext3_std_error(inode->i_sb, err);
-+ return err;
-+}
-+#endif
-+
-+int ext3_change_inode_journal_flag(struct inode *inode, int val)
-+{
-+ journal_t *journal;
-+ handle_t *handle;
-+ int err;
-+
-+ /*
-+ * We have to be very careful here: changing a data block's
-+ * journaling status dynamically is dangerous. If we write a
-+ * data block to the journal, change the status and then delete
-+ * that block, we risk forgetting to revoke the old log record
-+ * from the journal and so a subsequent replay can corrupt data.
-+ * So, first we make sure that the journal is empty and that
-+ * nobody is changing anything.
-+ */
-+
-+ journal = EXT3_JOURNAL(inode);
-+ if (is_journal_aborted(journal) || IS_RDONLY(inode))
-+ return -EROFS;
-+
-+ journal_lock_updates(journal);
-+ journal_flush(journal);
-+
-+ /*
-+ * OK, there are no updates running now, and all cached data is
-+ * synced to disk. We are now in a completely consistent state
-+ * which doesn't have anything in the journal, and we know that
-+ * no filesystem updates are running, so it is safe to modify
-+ * the inode's in-core data-journaling state flag now.
-+ */
-+
-+ if (val)
-+ inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
-+ else
-+ inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
-+
-+ journal_unlock_updates(journal);
-+
-+ /* Finally we can mark the inode as dirty. */
-+
-+ handle = ext3_journal_start(inode, 1);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ handle->h_sync = 1;
-+ ext3_journal_stop(handle, inode);
-+ ext3_std_error(inode->i_sb, err);
-+
-+ return err;
-+}
-+
-+
-+/*
-+ * ext3_aops_journal_start().
-+ *
-+ * <This function died, but the comment lives on>
-+ *
-+ * We need to take the inode semaphore *outside* the
-+ * journal_start/journal_stop. Otherwise, a different task could do a
-+ * wait_for_commit() while holding ->i_sem, which deadlocks. The rule
-+ * is: transaction open/closes are considered to be a locking operation
-+ * and they nest *inside* ->i_sem.
-+ * ----------------------------------------------------------------------------
-+ * Possible problem:
-+ * ext3_file_write()
-+ * -> generic_file_write()
-+ * -> __alloc_pages()
-+ * -> page_launder()
-+ * -> ext3_writepage()
-+ *
-+ * And the writepage can be on a different fs while we have a
-+ * transaction open against this one! Bad.
-+ *
-+ * I tried making the task PF_MEMALLOC here, but that simply results in
-+ * 0-order allocation failures passed back to generic_file_write().
-+ * Instead, we rely on the reentrancy protection in ext3_writepage().
-+ * ----------------------------------------------------------------------------
-+ * When we do the journal_start() here we don't really need to reserve
-+ * any blocks - we won't need any until we hit ext3_prepare_write(),
-+ * which does all the needed journal extending. However! There is a
-+ * problem with quotas:
-+ *
-+ * Thread 1:
-+ * sys_sync
-+ * ->sync_dquots
-+ * ->commit_dquot
-+ * ->lock_dquot
-+ * ->write_dquot
-+ * ->ext3_file_write
-+ * ->journal_start
-+ * ->ext3_prepare_write
-+ * ->journal_extend
-+ * ->journal_start
-+ * Thread 2:
-+ * ext3_create (for example)
-+ * ->ext3_new_inode
-+ * ->dquot_initialize
-+ * ->lock_dquot
-+ *
-+ * Deadlock. Thread 1's journal_start blocks because thread 2 has a
-+ * transaction open. Thread 2's transaction will never close because
-+ * thread 2 is stuck waiting for the dquot lock.
-+ *
-+ * So. We must ensure that thread 1 *never* needs to extend the journal
-+ * for quota writes. We do that by reserving enough journal blocks
-+ * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
-+ * need to extend" test in ext3_prepare_write() succeeds.
-+ */
-diff -rup --new-file linux.mcp2/fs/ext3/ioctl.c linux_tmp/fs/ext3/ioctl.c
---- linux.mcp2/fs/ext3/ioctl.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/ioctl.c 2001-11-09 14:25:04.000000000 -0800
-@@ -0,0 +1,170 @@
-+/*
-+ * linux/fs/ext3/ioctl.c
-+ *
-+ * Copyright (C) 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/sched.h>
-+#include <asm/uaccess.h>
-+
-+
-+int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-+ unsigned long arg)
-+{
-+ unsigned int flags;
-+
-+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
-+
-+ switch (cmd) {
-+ case EXT3_IOC_GETFLAGS:
-+ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
-+ return put_user(flags, (int *) arg);
-+ case EXT3_IOC_SETFLAGS: {
-+ handle_t *handle = NULL;
-+ int err;
-+ struct ext3_iloc iloc;
-+ unsigned int oldflags;
-+ unsigned int jflag;
-+
-+ if (IS_RDONLY(inode))
-+ return -EROFS;
-+
-+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
-+ return -EPERM;
-+
-+ if (get_user(flags, (int *) arg))
-+ return -EFAULT;
-+
-+ oldflags = inode->u.ext3_i.i_flags;
-+
-+ /* The JOURNAL_DATA flag is modifiable only by root */
-+ jflag = flags & EXT3_JOURNAL_DATA_FL;
-+
-+ /*
-+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-+ * the relevant capability.
-+ *
-+ * This test looks nicer. Thanks to Pauline Middelink
-+ */
-+ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-+ if (!capable(CAP_LINUX_IMMUTABLE))
-+ return -EPERM;
-+ }
-+
-+ /*
-+ * The JOURNAL_DATA flag can only be changed by
-+ * the relevant capability.
-+ */
-+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-+ if (!capable(CAP_SYS_RESOURCE))
-+ return -EPERM;
-+ }
-+
-+
-+ handle = ext3_journal_start(inode, 1);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+ err = ext3_reserve_inode_write(handle, inode, &iloc);
-+ if (err)
-+ goto flags_err;
-+
-+ flags = flags & EXT3_FL_USER_MODIFIABLE;
-+ flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
-+ inode->u.ext3_i.i_flags = flags;
-+
-+ if (flags & EXT3_SYNC_FL)
-+ inode->i_flags |= S_SYNC;
-+ else
-+ inode->i_flags &= ~S_SYNC;
-+ if (flags & EXT3_APPEND_FL)
-+ inode->i_flags |= S_APPEND;
-+ else
-+ inode->i_flags &= ~S_APPEND;
-+ if (flags & EXT3_IMMUTABLE_FL)
-+ inode->i_flags |= S_IMMUTABLE;
-+ else
-+ inode->i_flags &= ~S_IMMUTABLE;
-+ if (flags & EXT3_NOATIME_FL)
-+ inode->i_flags |= S_NOATIME;
-+ else
-+ inode->i_flags &= ~S_NOATIME;
-+ inode->i_ctime = CURRENT_TIME;
-+
-+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-+flags_err:
-+ ext3_journal_stop(handle, inode);
-+ if (err)
-+ return err;
-+
-+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
-+ err = ext3_change_inode_journal_flag(inode, jflag);
-+ return err;
-+ }
-+ case EXT3_IOC_GETVERSION:
-+ case EXT3_IOC_GETVERSION_OLD:
-+ return put_user(inode->i_generation, (int *) arg);
-+ case EXT3_IOC_SETVERSION:
-+ case EXT3_IOC_SETVERSION_OLD: {
-+ handle_t *handle;
-+ struct ext3_iloc iloc;
-+ __u32 generation;
-+ int err;
-+
-+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
-+ return -EPERM;
-+ if (IS_RDONLY(inode))
-+ return -EROFS;
-+ if (get_user(generation, (int *) arg))
-+ return -EFAULT;
-+
-+ handle = ext3_journal_start(inode, 1);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+ err = ext3_reserve_inode_write(handle, inode, &iloc);
-+ if (err)
-+ return err;
-+
-+ inode->i_ctime = CURRENT_TIME;
-+ inode->i_generation = generation;
-+
-+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-+ ext3_journal_stop(handle, inode);
-+ return err;
-+ }
-+#ifdef CONFIG_JBD_DEBUG
-+ case EXT3_IOC_WAIT_FOR_READONLY:
-+ /*
-+ * This is racy - by the time we're woken up and running,
-+ * the superblock could be released. And the module could
-+ * have been unloaded. So sue me.
-+ *
-+ * Returns 1 if it slept, else zero.
-+ */
-+ {
-+ struct super_block *sb = inode->i_sb;
-+ DECLARE_WAITQUEUE(wait, current);
-+ int ret = 0;
-+
-+ set_current_state(TASK_INTERRUPTIBLE);
-+ add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
-+ if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
-+ schedule();
-+ ret = 1;
-+ }
-+ remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
-+ return ret;
-+ }
-+#endif
-+ default:
-+ return -ENOTTY;
-+ }
-+}
-diff -rup --new-file linux.mcp2/fs/ext3/namei.c linux_tmp/fs/ext3/namei.c
---- linux.mcp2/fs/ext3/namei.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/namei.c 2001-11-09 14:25:04.000000000 -0800
-@@ -0,0 +1,1125 @@
-+/*
-+ * linux/fs/ext3/namei.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/namei.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ * Directory entry file type support and forward compatibility hooks
-+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/sched.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/fcntl.h>
-+#include <linux/stat.h>
-+#include <linux/string.h>
-+#include <linux/locks.h>
-+#include <linux/quotaops.h>
-+
-+
-+/*
-+ * define how far ahead to read directories while searching them.
-+ */
-+#define NAMEI_RA_CHUNKS 2
-+#define NAMEI_RA_BLOCKS 4
-+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-+
-+/*
-+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
-+ *
-+ * `len <= EXT3_NAME_LEN' is guaranteed by caller.
-+ * `de != NULL' is guaranteed by caller.
-+ */
-+static inline int ext3_match (int len, const char * const name,
-+ struct ext3_dir_entry_2 * de)
-+{
-+ if (len != de->name_len)
-+ return 0;
-+ if (!de->inode)
-+ return 0;
-+ return !memcmp(name, de->name, len);
-+}
-+
-+/*
-+ * Returns 0 if not found, -1 on failure, and 1 on success
-+ */
-+static int inline search_dirblock(struct buffer_head * bh,
-+ struct inode *dir,
-+ struct dentry *dentry,
-+ unsigned long offset,
-+ struct ext3_dir_entry_2 ** res_dir)
-+{
-+ struct ext3_dir_entry_2 * de;
-+ char * dlimit;
-+ int de_len;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ dlimit = bh->b_data + dir->i_sb->s_blocksize;
-+ while ((char *) de < dlimit) {
-+ /* this code is executed quadratically often */
-+ /* do minimal checking `by hand' */
-+
-+ if ((char *) de + namelen <= dlimit &&
-+ ext3_match (namelen, name, de)) {
-+ /* found a match - just to be sure, do a full check */
-+ if (!ext3_check_dir_entry("ext3_find_entry",
-+ dir, de, bh, offset))
-+ return -1;
-+ *res_dir = de;
-+ return 1;
-+ }
-+ /* prevent looping on a bad block */
-+ de_len = le16_to_cpu(de->rec_len);
-+ if (de_len <= 0)
-+ return -1;
-+ offset += de_len;
-+ de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
-+ }
-+ return 0;
-+}
-+
-+/*
-+ * ext3_find_entry()
-+ *
-+ * finds an entry in the specified directory with the wanted name. It
-+ * returns the cache buffer in which the entry was found, and the entry
-+ * itself (as a parameter - res_dir). It does NOT read the inode of the
-+ * entry - you'll have to do that yourself if you want to.
-+ *
-+ * The returned buffer_head has ->b_count elevated. The caller is expected
-+ * to brelse() it when appropriate.
-+ */
-+static struct buffer_head * ext3_find_entry (struct dentry *dentry,
-+ struct ext3_dir_entry_2 ** res_dir)
-+{
-+ struct super_block * sb;
-+ struct buffer_head * bh_use[NAMEI_RA_SIZE];
-+ struct buffer_head * bh, *ret = NULL;
-+ unsigned long start, block, b;
-+ int ra_max = 0; /* Number of bh's in the readahead
-+ buffer, bh_use[] */
-+ int ra_ptr = 0; /* Current index into readahead
-+ buffer */
-+ int num = 0;
-+ int nblocks, i, err;
-+ struct inode *dir = dentry->d_parent->d_inode;
-+
-+ *res_dir = NULL;
-+ sb = dir->i_sb;
-+
-+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-+ start = dir->u.ext3_i.i_dir_start_lookup;
-+ if (start >= nblocks)
-+ start = 0;
-+ block = start;
-+restart:
-+ do {
-+ /*
-+ * We deal with the read-ahead logic here.
-+ */
-+ if (ra_ptr >= ra_max) {
-+ /* Refill the readahead buffer */
-+ ra_ptr = 0;
-+ b = block;
-+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
-+ /*
-+ * Terminate if we reach the end of the
-+ * directory and must wrap, or if our
-+ * search has finished at this block.
-+ */
-+ if (b >= nblocks || (num && block == start)) {
-+ bh_use[ra_max] = NULL;
-+ break;
-+ }
-+ num++;
-+ bh = ext3_getblk(NULL, dir, b++, 0, &err);
-+ bh_use[ra_max] = bh;
-+ if (bh)
-+ ll_rw_block(READ, 1, &bh);
-+ }
-+ }
-+ if ((bh = bh_use[ra_ptr++]) == NULL)
-+ goto next;
-+ wait_on_buffer(bh);
-+ if (!buffer_uptodate(bh)) {
-+ /* read error, skip block & hope for the best */
-+ brelse(bh);
-+ goto next;
-+ }
-+ i = search_dirblock(bh, dir, dentry,
-+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
-+ if (i == 1) {
-+ dir->u.ext3_i.i_dir_start_lookup = block;
-+ ret = bh;
-+ goto cleanup_and_exit;
-+ } else {
-+ brelse(bh);
-+ if (i < 0)
-+ goto cleanup_and_exit;
-+ }
-+ next:
-+ if (++block >= nblocks)
-+ block = 0;
-+ } while (block != start);
-+
-+ /*
-+ * If the directory has grown while we were searching, then
-+ * search the last part of the directory before giving up.
-+ */
-+ block = nblocks;
-+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-+ if (block < nblocks) {
-+ start = 0;
-+ goto restart;
-+ }
-+
-+cleanup_and_exit:
-+ /* Clean up the read-ahead blocks */
-+ for (; ra_ptr < ra_max; ra_ptr++)
-+ brelse (bh_use[ra_ptr]);
-+ return ret;
-+}
-+
-+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
-+{
-+ struct inode * inode;
-+ struct ext3_dir_entry_2 * de;
-+ struct buffer_head * bh;
-+
-+ if (dentry->d_name.len > EXT3_NAME_LEN)
-+ return ERR_PTR(-ENAMETOOLONG);
-+
-+ bh = ext3_find_entry(dentry, &de);
-+ inode = NULL;
-+ if (bh) {
-+ unsigned long ino = le32_to_cpu(de->inode);
-+ brelse (bh);
-+ inode = iget(dir->i_sb, ino);
-+
-+ if (!inode)
-+ return ERR_PTR(-EACCES);
-+ }
-+ d_add(dentry, inode);
-+ return NULL;
-+}
-+
-+#define S_SHIFT 12
-+static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
-+ [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE,
-+ [S_IFDIR >> S_SHIFT] EXT3_FT_DIR,
-+ [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV,
-+ [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV,
-+ [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO,
-+ [S_IFSOCK >> S_SHIFT] EXT3_FT_SOCK,
-+ [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK,
-+};
-+
-+static inline void ext3_set_de_type(struct super_block *sb,
-+ struct ext3_dir_entry_2 *de,
-+ umode_t mode) {
-+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
-+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-+}
-+
-+/*
-+ * ext3_add_entry()
-+ *
-+ * adds a file entry to the specified directory, using the same
-+ * semantics as ext3_find_entry(). It returns NULL if it failed.
-+ *
-+ * NOTE!! The inode part of 'de' is left at 0 - which means you
-+ * may not sleep between calling this and putting something into
-+ * the entry, as someone else might have used it while you slept.
-+ */
-+
-+/*
-+ * AKPM: the journalling code here looks wrong on the error paths
-+ */
-+static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
-+{
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+ unsigned long offset;
-+ unsigned short rec_len;
-+ struct buffer_head * bh;
-+ struct ext3_dir_entry_2 * de, * de1;
-+ struct super_block * sb;
-+ int retval;
-+
-+ sb = dir->i_sb;
-+
-+ if (!namelen)
-+ return -EINVAL;
-+ bh = ext3_bread (handle, dir, 0, 0, &retval);
-+ if (!bh)
-+ return retval;
-+ rec_len = EXT3_DIR_REC_LEN(namelen);
-+ offset = 0;
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ while (1) {
-+ if ((char *)de >= sb->s_blocksize + bh->b_data) {
-+ brelse (bh);
-+ bh = NULL;
-+ bh = ext3_bread (handle, dir,
-+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
-+ if (!bh)
-+ return retval;
-+ if (dir->i_size <= offset) {
-+ if (dir->i_size == 0) {
-+ brelse(bh);
-+ return -ENOENT;
-+ }
-+
-+ ext3_debug ("creating next block\n");
-+
-+ BUFFER_TRACE(bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, bh);
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ de->inode = 0;
-+ de->rec_len = le16_to_cpu(sb->s_blocksize);
-+ dir->u.ext3_i.i_disksize =
-+ dir->i_size = offset + sb->s_blocksize;
-+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, dir);
-+ } else {
-+
-+ ext3_debug ("skipping to next block\n");
-+
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ }
-+ }
-+ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
-+ offset)) {
-+ brelse (bh);
-+ return -ENOENT;
-+ }
-+ if (ext3_match (namelen, name, de)) {
-+ brelse (bh);
-+ return -EEXIST;
-+ }
-+ if ((le32_to_cpu(de->inode) == 0 &&
-+ le16_to_cpu(de->rec_len) >= rec_len) ||
-+ (le16_to_cpu(de->rec_len) >=
-+ EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
-+ BUFFER_TRACE(bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, bh);
-+ /* By now the buffer is marked for journaling */
-+ offset += le16_to_cpu(de->rec_len);
-+ if (le32_to_cpu(de->inode)) {
-+ de1 = (struct ext3_dir_entry_2 *) ((char *) de +
-+ EXT3_DIR_REC_LEN(de->name_len));
-+ de1->rec_len =
-+ cpu_to_le16(le16_to_cpu(de->rec_len) -
-+ EXT3_DIR_REC_LEN(de->name_len));
-+ de->rec_len = cpu_to_le16(
-+ EXT3_DIR_REC_LEN(de->name_len));
-+ de = de1;
-+ }
-+ de->file_type = EXT3_FT_UNKNOWN;
-+ if (inode) {
-+ de->inode = cpu_to_le32(inode->i_ino);
-+ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-+ } else
-+ de->inode = 0;
-+ de->name_len = namelen;
-+ memcpy (de->name, name, namelen);
-+ /*
-+ * XXX shouldn't update any times until successful
-+ * completion of syscall, but too many callers depend
-+ * on this.
-+ *
-+ * XXX similarly, too many callers depend on
-+ * ext3_new_inode() setting the times, but error
-+ * recovery deletes the inode, so the worst that can
-+ * happen is that the times are slightly out of date
-+ * and/or different from the directory change time.
-+ */
-+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, dir);
-+ dir->i_version = ++event;
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, bh);
-+ brelse(bh);
-+ return 0;
-+ }
-+ offset += le16_to_cpu(de->rec_len);
-+ de = (struct ext3_dir_entry_2 *)
-+ ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ brelse (bh);
-+ return -ENOSPC;
-+}
-+
-+/*
-+ * ext3_delete_entry deletes a directory entry by merging it with the
-+ * previous entry
-+ */
-+static int ext3_delete_entry (handle_t *handle,
-+ struct inode * dir,
-+ struct ext3_dir_entry_2 * de_del,
-+ struct buffer_head * bh)
-+{
-+ struct ext3_dir_entry_2 * de, * pde;
-+ int i;
-+
-+ i = 0;
-+ pde = NULL;
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ while (i < bh->b_size) {
-+ if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
-+ return -EIO;
-+ if (de == de_del) {
-+ BUFFER_TRACE(bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, bh);
-+ if (pde)
-+ pde->rec_len =
-+ cpu_to_le16(le16_to_cpu(pde->rec_len) +
-+ le16_to_cpu(de->rec_len));
-+ else
-+ de->inode = 0;
-+ dir->i_version = ++event;
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, bh);
-+ return 0;
-+ }
-+ i += le16_to_cpu(de->rec_len);
-+ pde = de;
-+ de = (struct ext3_dir_entry_2 *)
-+ ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ return -ENOENT;
-+}
-+
-+/*
-+ * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
-+ * do not perform it in these functions. We perform it at the call site,
-+ * if it is needed.
-+ */
-+static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
-+{
-+ inode->i_nlink++;
-+}
-+
-+static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
-+{
-+ inode->i_nlink--;
-+}
-+
-+static int ext3_add_nondir(handle_t *handle,
-+ struct dentry *dentry, struct inode *inode)
-+{
-+ int err = ext3_add_entry(handle, dentry, inode);
-+ if (!err) {
-+ d_instantiate(dentry, inode);
-+ return 0;
-+ }
-+ ext3_dec_count(handle, inode);
-+ iput(inode);
-+ return err;
-+}
-+
-+/*
-+ * By the time this is called, we already have created
-+ * the directory cache entry for the new file, but it
-+ * is so far negative - it has no inode.
-+ *
-+ * If the create succeeds, we fill in the inode information
-+ * with d_instantiate().
-+ */
-+static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
-+{
-+ handle_t *handle;
-+ struct inode * inode;
-+ int err;
-+
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = ext3_new_inode (handle, dir, mode);
-+ err = PTR_ERR(inode);
-+ if (!IS_ERR(inode)) {
-+ inode->i_op = &ext3_file_inode_operations;
-+ inode->i_fop = &ext3_file_operations;
-+ inode->i_mapping->a_ops = &ext3_aops;
-+ ext3_mark_inode_dirty(handle, inode);
-+ err = ext3_add_nondir(handle, dentry, inode);
-+ }
-+ ext3_journal_stop(handle, dir);
-+ return err;
-+}
-+
-+static int ext3_mknod (struct inode * dir, struct dentry *dentry,
-+ int mode, int rdev)
-+{
-+ handle_t *handle;
-+ struct inode *inode;
-+ int err;
-+
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = ext3_new_inode (handle, dir, mode);
-+ err = PTR_ERR(inode);
-+ if (!IS_ERR(inode)) {
-+ init_special_inode(inode, mode, rdev);
-+ ext3_mark_inode_dirty(handle, inode);
-+ err = ext3_add_nondir(handle, dentry, inode);
-+ }
-+ ext3_journal_stop(handle, dir);
-+ return err;
-+}
-+
-+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
-+{
-+ handle_t *handle;
-+ struct inode * inode;
-+ struct buffer_head * dir_block;
-+ struct ext3_dir_entry_2 * de;
-+ int err;
-+
-+ if (dir->i_nlink >= EXT3_LINK_MAX)
-+ return -EMLINK;
-+
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = ext3_new_inode (handle, dir, S_IFDIR);
-+ err = PTR_ERR(inode);
-+ if (IS_ERR(inode))
-+ goto out_stop;
-+
-+ inode->i_op = &ext3_dir_inode_operations;
-+ inode->i_fop = &ext3_dir_operations;
-+ inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
-+ inode->i_blocks = 0;
-+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
-+ if (!dir_block) {
-+ inode->i_nlink--; /* is this nlink == 0? */
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+ }
-+ BUFFER_TRACE(dir_block, "get_write_access");
-+ ext3_journal_get_write_access(handle, dir_block);
-+ de = (struct ext3_dir_entry_2 *) dir_block->b_data;
-+ de->inode = cpu_to_le32(inode->i_ino);
-+ de->name_len = 1;
-+ de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
-+ strcpy (de->name, ".");
-+ ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-+ de = (struct ext3_dir_entry_2 *)
-+ ((char *) de + le16_to_cpu(de->rec_len));
-+ de->inode = cpu_to_le32(dir->i_ino);
-+ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
-+ de->name_len = 2;
-+ strcpy (de->name, "..");
-+ ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-+ inode->i_nlink = 2;
-+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, dir_block);
-+ brelse (dir_block);
-+ inode->i_mode = S_IFDIR | mode;
-+ if (dir->i_mode & S_ISGID)
-+ inode->i_mode |= S_ISGID;
-+ ext3_mark_inode_dirty(handle, inode);
-+ err = ext3_add_entry (handle, dentry, inode);
-+ if (err)
-+ goto out_no_entry;
-+ dir->i_nlink++;
-+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, dir);
-+ d_instantiate(dentry, inode);
-+out_stop:
-+ ext3_journal_stop(handle, dir);
-+ return err;
-+
-+out_no_entry:
-+ inode->i_nlink = 0;
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+}
-+
-+/*
-+ * routine to check that the specified directory is empty (for rmdir)
-+ */
-+static int empty_dir (struct inode * inode)
-+{
-+ unsigned long offset;
-+ struct buffer_head * bh;
-+ struct ext3_dir_entry_2 * de, * de1;
-+ struct super_block * sb;
-+ int err;
-+
-+ sb = inode->i_sb;
-+ if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
-+ !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
-+ ext3_warning (inode->i_sb, "empty_dir",
-+ "bad directory (dir #%lu) - no data block",
-+ inode->i_ino);
-+ return 1;
-+ }
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ de1 = (struct ext3_dir_entry_2 *)
-+ ((char *) de + le16_to_cpu(de->rec_len));
-+ if (le32_to_cpu(de->inode) != inode->i_ino ||
-+ !le32_to_cpu(de1->inode) ||
-+ strcmp (".", de->name) ||
-+ strcmp ("..", de1->name)) {
-+ ext3_warning (inode->i_sb, "empty_dir",
-+ "bad directory (dir #%lu) - no `.' or `..'",
-+ inode->i_ino);
-+ brelse (bh);
-+ return 1;
-+ }
-+ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
-+ de = (struct ext3_dir_entry_2 *)
-+ ((char *) de1 + le16_to_cpu(de1->rec_len));
-+ while (offset < inode->i_size ) {
-+ if (!bh ||
-+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
-+ brelse (bh);
-+ bh = ext3_bread (NULL, inode,
-+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
-+ if (!bh) {
-+#if 0
-+ ext3_error (sb, "empty_dir",
-+ "directory #%lu contains a hole at offset %lu",
-+ inode->i_ino, offset);
-+#endif
-+ offset += sb->s_blocksize;
-+ continue;
-+ }
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ }
-+ if (!ext3_check_dir_entry ("empty_dir", inode, de, bh,
-+ offset)) {
-+ brelse (bh);
-+ return 1;
-+ }
-+ if (le32_to_cpu(de->inode)) {
-+ brelse (bh);
-+ return 0;
-+ }
-+ offset += le16_to_cpu(de->rec_len);
-+ de = (struct ext3_dir_entry_2 *)
-+ ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ brelse (bh);
-+ return 1;
-+}
-+
-+/* ext3_orphan_add() links an unlinked or truncated inode into a list of
-+ * such inodes, starting at the superblock, in case we crash before the
-+ * file is closed/deleted, or in case the inode truncate spans multiple
-+ * transactions and the last transaction is not recovered after a crash.
-+ *
-+ * At filesystem recovery time, we walk this list deleting unlinked
-+ * inodes and truncating linked inodes in ext3_orphan_cleanup().
-+ */
-+int ext3_orphan_add(handle_t *handle, struct inode *inode)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ struct ext3_iloc iloc;
-+ int err = 0, rc;
-+
-+ lock_super(sb);
-+ if (!list_empty(&inode->u.ext3_i.i_orphan))
-+ goto out_unlock;
-+
-+ /* Orphan handling is only valid for files with data blocks
-+ * being truncated, or files being unlinked. */
-+
-+ /* @@@ FIXME: Observation from aviro:
-+ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
-+ * here (on lock_super()), so race with ext3_link() which might bump
-+ * ->i_nlink. For, say it, character device. Not a regular file,
-+ * not a directory, not a symlink and ->i_nlink > 0.
-+ */
-+ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-+
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
-+ if (err)
-+ goto out_unlock;
-+
-+ err = ext3_reserve_inode_write(handle, inode, &iloc);
-+ if (err)
-+ goto out_unlock;
-+
-+ /* Insert this inode at the head of the on-disk orphan list... */
-+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
-+ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
-+ rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
-+ if (!err)
-+ err = rc;
-+
-+ /* Only add to the head of the in-memory list if all the
-+ * previous operations succeeded. If the orphan_add is going to
-+ * fail (possibly taking the journal offline), we can't risk
-+ * leaving the inode on the orphan list: stray orphan-list
-+ * entries can cause panics at unmount time.
-+ *
-+ * This is safe: on error we're going to ignore the orphan list
-+ * anyway on the next recovery. */
-+ if (!err)
-+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
-+
-+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
-+ jbd_debug(4, "orphan inode %ld will point to %d\n",
-+ inode->i_ino, NEXT_ORPHAN(inode));
-+out_unlock:
-+ unlock_super(sb);
-+ ext3_std_error(inode->i_sb, err);
-+ return err;
-+}
-+
-+/*
-+ * ext3_orphan_del() removes an unlinked or truncated inode from the list
-+ * of such inodes stored on disk, because it is finally being cleaned up.
-+ */
-+int ext3_orphan_del(handle_t *handle, struct inode *inode)
-+{
-+ struct list_head *prev;
-+ struct ext3_sb_info *sbi;
-+ ino_t ino_next;
-+ struct ext3_iloc iloc;
-+ int err = 0;
-+
-+ lock_super(inode->i_sb);
-+ if (list_empty(&inode->u.ext3_i.i_orphan)) {
-+ unlock_super(inode->i_sb);
-+ return 0;
-+ }
-+
-+ ino_next = NEXT_ORPHAN(inode);
-+ prev = inode->u.ext3_i.i_orphan.prev;
-+ sbi = EXT3_SB(inode->i_sb);
-+
-+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
-+
-+ list_del(&inode->u.ext3_i.i_orphan);
-+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-+
-+ /* If we're on an error path, we may not have a valid
-+ * transaction handle with which to update the orphan list on
-+ * disk, but we still need to remove the inode from the linked
-+ * list in memory. */
-+ if (!handle)
-+ goto out;
-+
-+ err = ext3_reserve_inode_write(handle, inode, &iloc);
-+ if (err)
-+ goto out_err;
-+
-+ if (prev == &sbi->s_orphan) {
-+ jbd_debug(4, "superblock will point to %ld\n", ino_next);
-+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, sbi->s_sbh);
-+ if (err)
-+ goto out_brelse;
-+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-+ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-+ } else {
-+ struct ext3_iloc iloc2;
-+ struct inode *i_prev =
-+ list_entry(prev, struct inode, u.ext3_i.i_orphan);
-+
-+ jbd_debug(4, "orphan inode %ld will point to %ld\n",
-+ i_prev->i_ino, ino_next);
-+ err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
-+ if (err)
-+ goto out_brelse;
-+ NEXT_ORPHAN(i_prev) = ino_next;
-+ err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
-+ }
-+ if (err)
-+ goto out_brelse;
-+ NEXT_ORPHAN(inode) = 0;
-+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-+ if (err)
-+ goto out_brelse;
-+
-+out_err:
-+ ext3_std_error(inode->i_sb, err);
-+out:
-+ unlock_super(inode->i_sb);
-+ return err;
-+
-+out_brelse:
-+ brelse(iloc.bh);
-+ goto out_err;
-+}
-+
-+static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
-+{
-+ int retval;
-+ struct inode * inode;
-+ struct buffer_head * bh;
-+ struct ext3_dir_entry_2 * de;
-+ handle_t *handle;
-+
-+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ retval = -ENOENT;
-+ bh = ext3_find_entry (dentry, &de);
-+ if (!bh)
-+ goto end_rmdir;
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = dentry->d_inode;
-+ DQUOT_INIT(inode);
-+
-+ retval = -EIO;
-+ if (le32_to_cpu(de->inode) != inode->i_ino)
-+ goto end_rmdir;
-+
-+ retval = -ENOTEMPTY;
-+ if (!empty_dir (inode))
-+ goto end_rmdir;
-+
-+ retval = ext3_delete_entry(handle, dir, de, bh);
-+ if (retval)
-+ goto end_rmdir;
-+ if (inode->i_nlink != 2)
-+ ext3_warning (inode->i_sb, "ext3_rmdir",
-+ "empty directory has nlink!=2 (%d)",
-+ inode->i_nlink);
-+ inode->i_version = ++event;
-+ inode->i_nlink = 0;
-+ /* There's no need to set i_disksize: the fact that i_nlink is
-+ * zero will ensure that the right thing happens during any
-+ * recovery. */
-+ inode->i_size = 0;
-+ ext3_orphan_add(handle, inode);
-+ ext3_mark_inode_dirty(handle, inode);
-+ dir->i_nlink--;
-+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, dir);
-+
-+end_rmdir:
-+ ext3_journal_stop(handle, dir);
-+ brelse (bh);
-+ return retval;
-+}
-+
-+static int ext3_unlink(struct inode * dir, struct dentry *dentry)
-+{
-+ int retval;
-+ struct inode * inode;
-+ struct buffer_head * bh;
-+ struct ext3_dir_entry_2 * de;
-+ handle_t *handle;
-+
-+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ retval = -ENOENT;
-+ bh = ext3_find_entry (dentry, &de);
-+ if (!bh)
-+ goto end_unlink;
-+
-+ inode = dentry->d_inode;
-+ DQUOT_INIT(inode);
-+
-+ retval = -EIO;
-+ if (le32_to_cpu(de->inode) != inode->i_ino)
-+ goto end_unlink;
-+
-+ if (!inode->i_nlink) {
-+ ext3_warning (inode->i_sb, "ext3_unlink",
-+ "Deleting nonexistent file (%lu), %d",
-+ inode->i_ino, inode->i_nlink);
-+ inode->i_nlink = 1;
-+ }
-+ retval = ext3_delete_entry(handle, dir, de, bh);
-+ if (retval)
-+ goto end_unlink;
-+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, dir);
-+ inode->i_nlink--;
-+ if (!inode->i_nlink)
-+ ext3_orphan_add(handle, inode);
-+ ext3_mark_inode_dirty(handle, inode);
-+ inode->i_ctime = dir->i_ctime;
-+ retval = 0;
-+
-+end_unlink:
-+ ext3_journal_stop(handle, dir);
-+ brelse (bh);
-+ return retval;
-+}
-+
-+static int ext3_symlink (struct inode * dir,
-+ struct dentry *dentry, const char * symname)
-+{
-+ handle_t *handle;
-+ struct inode * inode;
-+ int l, err;
-+
-+ l = strlen(symname)+1;
-+ if (l > dir->i_sb->s_blocksize)
-+ return -ENAMETOOLONG;
-+
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
-+ err = PTR_ERR(inode);
-+ if (IS_ERR(inode))
-+ goto out_stop;
-+
-+ if (l > sizeof (inode->u.ext3_i.i_data)) {
-+ inode->i_op = &page_symlink_inode_operations;
-+ inode->i_mapping->a_ops = &ext3_aops;
-+ /*
-+ * block_symlink() calls back into ext3_prepare/commit_write.
-+ * We have a transaction open. All is sweetness. It also sets
-+ * i_size in generic_commit_write().
-+ */
-+ err = block_symlink(inode, symname, l);
-+ if (err)
-+ goto out_no_entry;
-+ } else {
-+ inode->i_op = &ext3_fast_symlink_inode_operations;
-+ memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
-+ inode->i_size = l-1;
-+ }
-+ inode->u.ext3_i.i_disksize = inode->i_size;
-+ ext3_mark_inode_dirty(handle, inode);
-+ err = ext3_add_nondir(handle, dentry, inode);
-+out_stop:
-+ ext3_journal_stop(handle, dir);
-+ return err;
-+
-+out_no_entry:
-+ ext3_dec_count(handle, inode);
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+}
-+
-+static int ext3_link (struct dentry * old_dentry,
-+ struct inode * dir, struct dentry *dentry)
-+{
-+ handle_t *handle;
-+ struct inode *inode = old_dentry->d_inode;
-+ int err;
-+
-+ if (S_ISDIR(inode->i_mode))
-+ return -EPERM;
-+
-+ if (inode->i_nlink >= EXT3_LINK_MAX)
-+ return -EMLINK;
-+
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(dir))
-+ handle->h_sync = 1;
-+
-+ inode->i_ctime = CURRENT_TIME;
-+ ext3_inc_count(handle, inode);
-+ atomic_inc(&inode->i_count);
-+
-+ ext3_mark_inode_dirty(handle, inode);
-+ err = ext3_add_nondir(handle, dentry, inode);
-+ ext3_journal_stop(handle, dir);
-+ return err;
-+}
-+
-+#define PARENT_INO(buffer) \
-+ ((struct ext3_dir_entry_2 *) ((char *) buffer + \
-+ le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
-+
-+/*
-+ * Anybody can rename anything with this: the permission checks are left to the
-+ * higher-level routines.
-+ */
-+static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
-+ struct inode * new_dir,struct dentry *new_dentry)
-+{
-+ handle_t *handle;
-+ struct inode * old_inode, * new_inode;
-+ struct buffer_head * old_bh, * new_bh, * dir_bh;
-+ struct ext3_dir_entry_2 * old_de, * new_de;
-+ int retval;
-+
-+ old_bh = new_bh = dir_bh = NULL;
-+
-+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+
-+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
-+ handle->h_sync = 1;
-+
-+ old_bh = ext3_find_entry (old_dentry, &old_de);
-+ /*
-+ * Check for inode number is _not_ due to possible IO errors.
-+ * We might rmdir the source, keep it as pwd of some process
-+ * and merrily kill the link to whatever was created under the
-+ * same name. Goodbye sticky bit ;-<
-+ */
-+ old_inode = old_dentry->d_inode;
-+ retval = -ENOENT;
-+ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
-+ goto end_rename;
-+
-+ new_inode = new_dentry->d_inode;
-+ new_bh = ext3_find_entry (new_dentry, &new_de);
-+ if (new_bh) {
-+ if (!new_inode) {
-+ brelse (new_bh);
-+ new_bh = NULL;
-+ } else {
-+ DQUOT_INIT(new_inode);
-+ }
-+ }
-+ if (S_ISDIR(old_inode->i_mode)) {
-+ if (new_inode) {
-+ retval = -ENOTEMPTY;
-+ if (!empty_dir (new_inode))
-+ goto end_rename;
-+ }
-+ retval = -EIO;
-+ dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
-+ if (!dir_bh)
-+ goto end_rename;
-+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
-+ goto end_rename;
-+ retval = -EMLINK;
-+ if (!new_inode && new_dir!=old_dir &&
-+ new_dir->i_nlink >= EXT3_LINK_MAX)
-+ goto end_rename;
-+ }
-+ if (!new_bh) {
-+ retval = ext3_add_entry (handle, new_dentry, old_inode);
-+ if (retval)
-+ goto end_rename;
-+ } else {
-+ BUFFER_TRACE(new_bh, "get write access");
-+ BUFFER_TRACE(new_bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, new_bh);
-+ new_de->inode = le32_to_cpu(old_inode->i_ino);
-+ if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
-+ EXT3_FEATURE_INCOMPAT_FILETYPE))
-+ new_de->file_type = old_de->file_type;
-+ new_dir->i_version = ++event;
-+ BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, new_bh);
-+ brelse(new_bh);
-+ new_bh = NULL;
-+ }
-+
-+ /*
-+ * Like most other Unix systems, set the ctime for inodes on a
-+ * rename.
-+ */
-+ old_inode->i_ctime = CURRENT_TIME;
-+ ext3_mark_inode_dirty(handle, old_inode);
-+
-+ /*
-+ * ok, that's it
-+ */
-+ ext3_delete_entry(handle, old_dir, old_de, old_bh);
-+
-+ if (new_inode) {
-+ new_inode->i_nlink--;
-+ new_inode->i_ctime = CURRENT_TIME;
-+ }
-+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-+ old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ if (dir_bh) {
-+ BUFFER_TRACE(dir_bh, "get_write_access");
-+ ext3_journal_get_write_access(handle, dir_bh);
-+ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino);
-+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
-+ ext3_journal_dirty_metadata(handle, dir_bh);
-+ old_dir->i_nlink--;
-+ if (new_inode) {
-+ new_inode->i_nlink--;
-+ } else {
-+ new_dir->i_nlink++;
-+ new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_mark_inode_dirty(handle, new_dir);
-+ }
-+ }
-+ ext3_mark_inode_dirty(handle, old_dir);
-+ if (new_inode) {
-+ ext3_mark_inode_dirty(handle, new_inode);
-+ if (!new_inode->i_nlink)
-+ ext3_orphan_add(handle, new_inode);
-+ }
-+ retval = 0;
-+
-+end_rename:
-+ brelse (dir_bh);
-+ brelse (old_bh);
-+ brelse (new_bh);
-+ ext3_journal_stop(handle, old_dir);
-+ return retval;
-+}
-+
-+/*
-+ * directories can handle most operations...
-+ */
-+struct inode_operations ext3_dir_inode_operations = {
-+ create: ext3_create, /* BKL held */
-+ lookup: ext3_lookup, /* BKL held */
-+ link: ext3_link, /* BKL held */
-+ unlink: ext3_unlink, /* BKL held */
-+ symlink: ext3_symlink, /* BKL held */
-+ mkdir: ext3_mkdir, /* BKL held */
-+ rmdir: ext3_rmdir, /* BKL held */
-+ mknod: ext3_mknod, /* BKL held */
-+ rename: ext3_rename, /* BKL held */
-+};
-diff -rup --new-file linux.mcp2/fs/ext3/super.c linux_tmp/fs/ext3/super.c
---- linux.mcp2/fs/ext3/super.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/super.c 2002-02-25 11:38:08.000000000 -0800
-@@ -0,0 +1,1753 @@
-+/*
-+ * linux/fs/ext3/super.c
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/inode.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * Big-endian to little-endian byte-swapping/bitmaps by
-+ * David S. Miller (davem@caip.rutgers.edu), 1995
-+ */
-+
-+#include <linux/config.h>
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/fs.h>
-+#include <linux/sched.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/slab.h>
-+#include <linux/init.h>
-+#include <linux/locks.h>
-+#include <linux/blkdev.h>
-+#include <linux/smp_lock.h>
-+#include <linux/random.h>
-+#include <asm/uaccess.h>
-+
-+#ifdef CONFIG_JBD_DEBUG
-+static int ext3_ro_after; /* Make fs read-only after this many jiffies */
-+#endif
-+
-+static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
-+static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
-+ int);
-+static void ext3_commit_super (struct super_block * sb,
-+ struct ext3_super_block * es,
-+ int sync);
-+static void ext3_mark_recovery_complete(struct super_block * sb,
-+ struct ext3_super_block * es);
-+static void ext3_clear_journal_err(struct super_block * sb,
-+ struct ext3_super_block * es);
-+
-+#ifdef CONFIG_JBD_DEBUG
-+int journal_no_write[2];
-+
-+/*
-+ * Debug code for turning filesystems "read-only" after a specified
-+ * amount of time. This is for crash/recovery testing.
-+ */
-+
-+static void make_rdonly(kdev_t dev, int *no_write)
-+{
-+ if (dev) {
-+ printk(KERN_WARNING "Turning device %s read-only\n",
-+ bdevname(dev));
-+ *no_write = 0xdead0000 + dev;
-+ }
-+}
-+
-+static void turn_fs_readonly(unsigned long arg)
-+{
-+ struct super_block *sb = (struct super_block *)arg;
-+
-+ make_rdonly(sb->s_dev, &journal_no_write[0]);
-+ make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
-+ wake_up(&EXT3_SB(sb)->ro_wait_queue);
-+}
-+
-+static void setup_ro_after(struct super_block *sb)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ init_timer(&sbi->turn_ro_timer);
-+ if (ext3_ro_after) {
-+ printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
-+ ext3_ro_after);
-+ init_waitqueue_head(&sbi->ro_wait_queue);
-+ journal_no_write[0] = 0;
-+ journal_no_write[1] = 0;
-+ sbi->turn_ro_timer.function = turn_fs_readonly;
-+ sbi->turn_ro_timer.data = (unsigned long)sb;
-+ sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
-+ ext3_ro_after = 0;
-+ add_timer(&sbi->turn_ro_timer);
-+ }
-+}
-+
-+static void clear_ro_after(struct super_block *sb)
-+{
-+ del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);
-+ journal_no_write[0] = 0;
-+ journal_no_write[1] = 0;
-+ ext3_ro_after = 0;
-+}
-+#else
-+#define setup_ro_after(sb) do {} while (0)
-+#define clear_ro_after(sb) do {} while (0)
-+#endif
-+
-+
-+static char error_buf[1024];
-+
-+/* Determine the appropriate response to ext3_error on a given filesystem */
-+
-+static int ext3_error_behaviour(struct super_block *sb)
-+{
-+ /* First check for mount-time options */
-+ if (test_opt (sb, ERRORS_PANIC))
-+ return EXT3_ERRORS_PANIC;
-+ if (test_opt (sb, ERRORS_RO))
-+ return EXT3_ERRORS_RO;
-+ if (test_opt (sb, ERRORS_CONT))
-+ return EXT3_ERRORS_CONTINUE;
-+
-+ /* If no overrides were specified on the mount, then fall back
-+ * to the default behaviour set in the filesystem's superblock
-+ * on disk. */
-+ switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
-+ case EXT3_ERRORS_PANIC:
-+ return EXT3_ERRORS_PANIC;
-+ case EXT3_ERRORS_RO:
-+ return EXT3_ERRORS_RO;
-+ default:
-+ break;
-+ }
-+ return EXT3_ERRORS_CONTINUE;
-+}
-+
-+/* Deal with the reporting of failure conditions on a filesystem such as
-+ * inconsistencies detected or read IO failures.
-+ *
-+ * On ext2, we can store the error state of the filesystem in the
-+ * superblock. That is not possible on ext3, because we may have other
-+ * write ordering constraints on the superblock which prevent us from
-+ * writing it out straight away; and given that the journal is about to
-+ * be aborted, we can't rely on the current, or future, transactions to
-+ * write out the superblock safely.
-+ *
-+ * We'll just use the journal_abort() error code to record an error in
-+ * the journal instead. On recovery, the journal will compain about
-+ * that error until we've noted it down and cleared it.
-+ */
-+
-+static void ext3_handle_error(struct super_block *sb)
-+{
-+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-+
-+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-+ es->s_state |= cpu_to_le32(EXT3_ERROR_FS);
-+
-+ if (sb->s_flags & MS_RDONLY)
-+ return;
-+
-+ if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
-+ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
-+ journal_abort(EXT3_SB(sb)->s_journal, -EIO);
-+ }
-+
-+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
-+ panic ("EXT3-fs (device %s): panic forced after error\n",
-+ bdevname(sb->s_dev));
-+
-+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
-+ printk (KERN_CRIT "Remounting filesystem read-only\n");
-+ sb->s_flags |= MS_RDONLY;
-+ }
-+
-+ ext3_commit_super(sb, es, 1);
-+}
-+
-+void ext3_error (struct super_block * sb, const char * function,
-+ const char * fmt, ...)
-+{
-+ va_list args;
-+
-+ va_start (args, fmt);
-+ vsprintf (error_buf, fmt, args);
-+ va_end (args);
-+
-+ printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
-+ bdevname(sb->s_dev), function, error_buf);
-+
-+ ext3_handle_error(sb);
-+}
-+
-+const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
-+{
-+ char *errstr = NULL;
-+
-+ switch (errno) {
-+ case -EIO:
-+ errstr = "IO failure";
-+ break;
-+ case -ENOMEM:
-+ errstr = "Out of memory";
-+ break;
-+ case -EROFS:
-+ if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
-+ errstr = "Journal has aborted";
-+ else
-+ errstr = "Readonly filesystem";
-+ break;
-+ default:
-+ /* If the caller passed in an extra buffer for unknown
-+ * errors, textualise them now. Else we just return
-+ * NULL. */
-+ if (nbuf) {
-+ /* Check for truncated error codes... */
-+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
-+ errstr = nbuf;
-+ }
-+
-+ break;
-+ }
-+
-+ return errstr;
-+}
-+
-+/* __ext3_std_error decodes expected errors from journaling functions
-+ * automatically and invokes the appropriate error response. */
-+
-+void __ext3_std_error (struct super_block * sb, const char * function,
-+ int errno)
-+{
-+ char nbuf[16];
-+ const char *errstr = ext3_decode_error(sb, errno, nbuf);
-+
-+ printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
-+ bdevname(sb->s_dev), function, errstr);
-+
-+ ext3_handle_error(sb);
-+}
-+
-+/*
-+ * ext3_abort is a much stronger failure handler than ext3_error. The
-+ * abort function may be used to deal with unrecoverable failures such
-+ * as journal IO errors or ENOMEM at a critical moment in log management.
-+ *
-+ * We unconditionally force the filesystem into an ABORT|READONLY state,
-+ * unless the error response on the fs has been set to panic in which
-+ * case we take the easy way out and panic immediately.
-+ */
-+
-+void ext3_abort (struct super_block * sb, const char * function,
-+ const char * fmt, ...)
-+{
-+ va_list args;
-+
-+ printk (KERN_CRIT "ext3_abort called.\n");
-+
-+ va_start (args, fmt);
-+ vsprintf (error_buf, fmt, args);
-+ va_end (args);
-+
-+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
-+ panic ("EXT3-fs panic (device %s): %s: %s\n",
-+ bdevname(sb->s_dev), function, error_buf);
-+
-+ printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
-+ bdevname(sb->s_dev), function, error_buf);
-+
-+ if (sb->s_flags & MS_RDONLY)
-+ return;
-+
-+ printk (KERN_CRIT "Remounting filesystem read-only\n");
-+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
-+ sb->s_flags |= MS_RDONLY;
-+ sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
-+ journal_abort(EXT3_SB(sb)->s_journal, -EIO);
-+}
-+
-+/* Deal with the reporting of failure conditions while running, such as
-+ * inconsistencies in operation or invalid system states.
-+ *
-+ * Use ext3_error() for cases of invalid filesystem states, as that will
-+ * record an error on disk and force a filesystem check on the next boot.
-+ */
-+NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
-+ const char * fmt, ...)
-+{
-+ va_list args;
-+
-+ va_start (args, fmt);
-+ vsprintf (error_buf, fmt, args);
-+ va_end (args);
-+
-+ /* this is to prevent panic from syncing this filesystem */
-+ /* AKPM: is this sufficient? */
-+ sb->s_flags |= MS_RDONLY;
-+ panic ("EXT3-fs panic (device %s): %s: %s\n",
-+ bdevname(sb->s_dev), function, error_buf);
-+}
-+
-+void ext3_warning (struct super_block * sb, const char * function,
-+ const char * fmt, ...)
-+{
-+ va_list args;
-+
-+ va_start (args, fmt);
-+ vsprintf (error_buf, fmt, args);
-+ va_end (args);
-+ printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
-+ bdevname(sb->s_dev), function, error_buf);
-+}
-+
-+void ext3_update_dynamic_rev(struct super_block *sb)
-+{
-+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-+
-+ if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
-+ return;
-+
-+ ext3_warning(sb, __FUNCTION__,
-+ "updating to rev %d because of new feature flag, "
-+ "running e2fsck is recommended",
-+ EXT3_DYNAMIC_REV);
-+
-+ es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
-+ es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
-+ es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
-+ /* leave es->s_feature_*compat flags alone */
-+ /* es->s_uuid will be set by e2fsck if empty */
-+
-+ /*
-+ * The rest of the superblock fields should be zero, and if not it
-+ * means they are likely already in use, so leave them alone. We
-+ * can leave it up to e2fsck to clean up any inconsistencies there.
-+ */
-+}
-+
-+/*
-+ * Open the external journal device
-+ */
-+static struct block_device *ext3_blkdev_get(kdev_t dev)
-+{
-+ struct block_device *bdev;
-+ int err = -ENODEV;
-+
-+ bdev = bdget(kdev_t_to_nr(dev));
-+ if (bdev == NULL)
-+ goto fail;
-+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
-+ if (err < 0)
-+ goto fail;
-+ return bdev;
-+
-+fail:
-+ printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
-+ bdevname(dev), err);
-+ return NULL;
-+}
-+
-+/*
-+ * Release the journal device
-+ */
-+static int ext3_blkdev_put(struct block_device *bdev)
-+{
-+ return blkdev_put(bdev, BDEV_FS);
-+}
-+
-+static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
-+{
-+ struct block_device *bdev;
-+ int ret = -ENODEV;
-+
-+ bdev = sbi->journal_bdev;
-+ if (bdev) {
-+ ret = ext3_blkdev_put(bdev);
-+ sbi->journal_bdev = 0;
-+ }
-+ return ret;
-+}
-+
-+#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
-+
-+static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
-+{
-+ struct list_head *l;
-+
-+ printk(KERN_ERR "sb orphan head is %d\n",
-+ le32_to_cpu(sbi->s_es->s_last_orphan));
-+
-+ printk(KERN_ERR "sb_info orphan list:\n");
-+ list_for_each(l, &sbi->s_orphan) {
-+ struct inode *inode = orphan_list_entry(l);
-+ printk(KERN_ERR " "
-+ "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
-+ inode->i_dev, inode->i_ino, inode,
-+ inode->i_mode, inode->i_nlink,
-+ le32_to_cpu(NEXT_ORPHAN(inode)));
-+ }
-+}
-+
-+void ext3_put_super (struct super_block * sb)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_super_block *es = sbi->s_es;
-+ kdev_t j_dev = sbi->s_journal->j_dev;
-+ int i;
-+
-+ journal_destroy(sbi->s_journal);
-+ if (!(sb->s_flags & MS_RDONLY)) {
-+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ es->s_state = le16_to_cpu(sbi->s_mount_state);
-+ BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-+ mark_buffer_dirty(sbi->s_sbh);
-+ ext3_commit_super(sb, es, 1);
-+ }
-+
-+ for (i = 0; i < sbi->s_gdb_count; i++)
-+ brelse(sbi->s_group_desc[i]);
-+ kfree(sbi->s_group_desc);
-+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
-+ brelse(sbi->s_inode_bitmap[i]);
-+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
-+ brelse(sbi->s_block_bitmap[i]);
-+ brelse(sbi->s_sbh);
-+
-+ /* Debugging code just in case the in-memory inode orphan list
-+ * isn't empty. The on-disk one can be non-empty if we've
-+ * detected an error and taken the fs readonly, but the
-+ * in-memory list had better be clean by this point. */
-+ if (!list_empty(&sbi->s_orphan))
-+ dump_orphan_list(sb, sbi);
-+ J_ASSERT(list_empty(&sbi->s_orphan));
-+
-+ invalidate_buffers(sb->s_dev);
-+ if (j_dev != sb->s_dev) {
-+ /*
-+ * Invalidate the journal device's buffers. We don't want them
-+ * floating about in memory - the physical journal device may
-+ * hotswapped, and it breaks the `ro-after' testing code.
-+ */
-+ fsync_no_super(j_dev);
-+ invalidate_buffers(j_dev);
-+ ext3_blkdev_remove(sbi);
-+ }
-+ clear_ro_after(sb);
-+
-+ return;
-+}
-+
-+static struct super_operations ext3_sops = {
-+ read_inode: ext3_read_inode, /* BKL held */
-+ write_inode: ext3_write_inode, /* BKL not held. Don't need */
-+ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
-+ put_inode: ext3_put_inode, /* BKL not held. Don't need */
-+ delete_inode: ext3_delete_inode, /* BKL not held. We take it */
-+ put_super: ext3_put_super, /* BKL held */
-+ write_super: ext3_write_super, /* BKL held */
-+ write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
-+ unlockfs: ext3_unlockfs, /* BKL not held. We take it */
-+ statfs: ext3_statfs, /* BKL held */
-+ remount_fs: ext3_remount, /* BKL held */
-+};
-+
-+static int want_value(char *value, char *option)
-+{
-+ if (!value || !*value) {
-+ printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n",
-+ option);
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-+static int want_null_value(char *value, char *option)
-+{
-+ if (*value) {
-+ printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n",
-+ option, value);
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-+static int want_numeric(char *value, char *option, unsigned long *number)
-+{
-+ if (want_value(value, option))
-+ return -1;
-+ *number = simple_strtoul(value, &value, 0);
-+ if (want_null_value(value, option))
-+ return -1;
-+ return 0;
-+}
-+
-+/*
-+ * This function has been shamelessly adapted from the msdos fs
-+ */
-+static int parse_options (char * options, unsigned long * sb_block,
-+ struct ext3_sb_info *sbi,
-+ unsigned long * inum,
-+ int is_remount)
-+{
-+ unsigned long *mount_options = &sbi->s_mount_opt;
-+ uid_t *resuid = &sbi->s_resuid;
-+ gid_t *resgid = &sbi->s_resgid;
-+ char * this_char;
-+ char * value;
-+
-+ if (!options)
-+ return 1;
-+ for (this_char = strtok (options, ",");
-+ this_char != NULL;
-+ this_char = strtok (NULL, ",")) {
-+ if ((value = strchr (this_char, '=')) != NULL)
-+ *value++ = 0;
-+ if (!strcmp (this_char, "bsddf"))
-+ clear_opt (*mount_options, MINIX_DF);
-+ else if (!strcmp (this_char, "nouid32")) {
-+ set_opt (*mount_options, NO_UID32);
-+ }
-+ else if (!strcmp (this_char, "abort"))
-+ set_opt (*mount_options, ABORT);
-+ else if (!strcmp (this_char, "check")) {
-+ if (!value || !*value || !strcmp (value, "none"))
-+ clear_opt (*mount_options, CHECK);
-+ else
-+#ifdef CONFIG_EXT3_CHECK
-+ set_opt (*mount_options, CHECK);
-+#else
-+ printk(KERN_ERR
-+ "EXT3 Check option not supported\n");
-+#endif
-+ }
-+ else if (!strcmp (this_char, "debug"))
-+ set_opt (*mount_options, DEBUG);
-+ else if (!strcmp (this_char, "errors")) {
-+ if (want_value(value, "errors"))
-+ return 0;
-+ if (!strcmp (value, "continue")) {
-+ clear_opt (*mount_options, ERRORS_RO);
-+ clear_opt (*mount_options, ERRORS_PANIC);
-+ set_opt (*mount_options, ERRORS_CONT);
-+ }
-+ else if (!strcmp (value, "remount-ro")) {
-+ clear_opt (*mount_options, ERRORS_CONT);
-+ clear_opt (*mount_options, ERRORS_PANIC);
-+ set_opt (*mount_options, ERRORS_RO);
-+ }
-+ else if (!strcmp (value, "panic")) {
-+ clear_opt (*mount_options, ERRORS_CONT);
-+ clear_opt (*mount_options, ERRORS_RO);
-+ set_opt (*mount_options, ERRORS_PANIC);
-+ }
-+ else {
-+ printk (KERN_ERR
-+ "EXT3-fs: Invalid errors option: %s\n",
-+ value);
-+ return 0;
-+ }
-+ }
-+ else if (!strcmp (this_char, "grpid") ||
-+ !strcmp (this_char, "bsdgroups"))
-+ set_opt (*mount_options, GRPID);
-+ else if (!strcmp (this_char, "minixdf"))
-+ set_opt (*mount_options, MINIX_DF);
-+ else if (!strcmp (this_char, "nocheck"))
-+ clear_opt (*mount_options, CHECK);
-+ else if (!strcmp (this_char, "nogrpid") ||
-+ !strcmp (this_char, "sysvgroups"))
-+ clear_opt (*mount_options, GRPID);
-+ else if (!strcmp (this_char, "resgid")) {
-+ unsigned long v;
-+ if (want_numeric(value, "resgid", &v))
-+ return 0;
-+ *resgid = v;
-+ }
-+ else if (!strcmp (this_char, "resuid")) {
-+ unsigned long v;
-+ if (want_numeric(value, "resuid", &v))
-+ return 0;
-+ *resuid = v;
-+ }
-+ else if (!strcmp (this_char, "sb")) {
-+ if (want_numeric(value, "sb", sb_block))
-+ return 0;
-+ }
-+#ifdef CONFIG_JBD_DEBUG
-+ else if (!strcmp (this_char, "ro-after")) {
-+ unsigned long v;
-+ if (want_numeric(value, "ro-after", &v))
-+ return 0;
-+ ext3_ro_after = v;
-+ }
-+#endif
-+ /* Silently ignore the quota options */
-+ else if (!strcmp (this_char, "grpquota")
-+ || !strcmp (this_char, "noquota")
-+ || !strcmp (this_char, "quota")
-+ || !strcmp (this_char, "usrquota"))
-+ /* Don't do anything ;-) */ ;
-+ else if (!strcmp (this_char, "journal")) {
-+ /* @@@ FIXME */
-+ /* Eventually we will want to be able to create
-+ a journal file here. For now, only allow the
-+ user to specify an existing inode to be the
-+ journal file. */
-+ if (is_remount) {
-+ printk(KERN_ERR "EXT3-fs: cannot specify "
-+ "journal on remount\n");
-+ return 0;
-+ }
-+
-+ if (want_value(value, "journal"))
-+ return 0;
-+ if (!strcmp (value, "update"))
-+ set_opt (*mount_options, UPDATE_JOURNAL);
-+ else if (want_numeric(value, "journal", inum))
-+ return 0;
-+ }
-+ else if (!strcmp (this_char, "noload"))
-+ set_opt (*mount_options, NOLOAD);
-+ else if (!strcmp (this_char, "data")) {
-+ int data_opt = 0;
-+
-+ if (want_value(value, "data"))
-+ return 0;
-+ if (!strcmp (value, "journal"))
-+ data_opt = EXT3_MOUNT_JOURNAL_DATA;
-+ else if (!strcmp (value, "ordered"))
-+ data_opt = EXT3_MOUNT_ORDERED_DATA;
-+ else if (!strcmp (value, "writeback"))
-+ data_opt = EXT3_MOUNT_WRITEBACK_DATA;
-+ else {
-+ printk (KERN_ERR
-+ "EXT3-fs: Invalid data option: %s\n",
-+ value);
-+ return 0;
-+ }
-+ if (is_remount) {
-+ if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) !=
-+ data_opt) {
-+ printk(KERN_ERR
-+ "EXT3-fs: cannot change data "
-+ "mode on remount\n");
-+ return 0;
-+ }
-+ } else {
-+ *mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
-+ *mount_options |= data_opt;
-+ }
-+ } else {
-+ printk (KERN_ERR
-+ "EXT3-fs: Unrecognized mount option %s\n",
-+ this_char);
-+ return 0;
-+ }
-+ }
-+ return 1;
-+}
-+
-+static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
-+ int read_only)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int res = 0;
-+
-+ if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
-+ printk (KERN_ERR "EXT3-fs warning: revision level too high, "
-+ "forcing read-only mode\n");
-+ res = MS_RDONLY;
-+ }
-+ if (read_only)
-+ return res;
-+ if (!(sbi->s_mount_state & EXT3_VALID_FS))
-+ printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
-+ "running e2fsck is recommended\n");
-+ else if ((sbi->s_mount_state & EXT3_ERROR_FS))
-+ printk (KERN_WARNING
-+ "EXT3-fs warning: mounting fs with errors, "
-+ "running e2fsck is recommended\n");
-+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
-+ le16_to_cpu(es->s_mnt_count) >=
-+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-+ printk (KERN_WARNING
-+ "EXT3-fs warning: maximal mount count reached, "
-+ "running e2fsck is recommended\n");
-+ else if (le32_to_cpu(es->s_checkinterval) &&
-+ (le32_to_cpu(es->s_lastcheck) +
-+ le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME))
-+ printk (KERN_WARNING
-+ "EXT3-fs warning: checktime reached, "
-+ "running e2fsck is recommended\n");
-+#if 0
-+ /* @@@ We _will_ want to clear the valid bit if we find
-+ inconsistencies, to force a fsck at reboot. But for
-+ a plain journaled filesystem we can keep it set as
-+ valid forever! :) */
-+ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
-+#endif
-+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
-+ es->s_max_mnt_count =
-+ (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
-+ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
-+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
-+ ext3_update_dynamic_rev(sb);
-+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ ext3_commit_super (sb, es, 1);
-+ if (test_opt (sb, DEBUG))
-+ printk (KERN_INFO
-+ "[EXT3 FS %s, %s, bs=%lu, gc=%lu, "
-+ "bpg=%lu, ipg=%lu, mo=%04lx]\n",
-+ EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize,
-+ sbi->s_groups_count,
-+ EXT3_BLOCKS_PER_GROUP(sb),
-+ EXT3_INODES_PER_GROUP(sb),
-+ sbi->s_mount_opt);
-+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
-+ bdevname(sb->s_dev));
-+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
-+ printk("external journal on %s\n",
-+ bdevname(EXT3_SB(sb)->s_journal->j_dev));
-+ } else {
-+ printk("internal journal\n");
-+ }
-+#ifdef CONFIG_EXT3_CHECK
-+ if (test_opt (sb, CHECK)) {
-+ ext3_check_blocks_bitmap (sb);
-+ ext3_check_inodes_bitmap (sb);
-+ }
-+#endif
-+ setup_ro_after(sb);
-+ return res;
-+}
-+
-+static int ext3_check_descriptors (struct super_block * sb)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
-+ struct ext3_group_desc * gdp = NULL;
-+ int desc_block = 0;
-+ int i;
-+
-+ ext3_debug ("Checking group descriptors");
-+
-+ for (i = 0; i < sbi->s_groups_count; i++)
-+ {
-+ if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
-+ gdp = (struct ext3_group_desc *)
-+ sbi->s_group_desc[desc_block++]->b_data;
-+ if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
-+ le32_to_cpu(gdp->bg_block_bitmap) >=
-+ block + EXT3_BLOCKS_PER_GROUP(sb))
-+ {
-+ ext3_error (sb, "ext3_check_descriptors",
-+ "Block bitmap for group %d"
-+ " not in group (block %lu)!",
-+ i, (unsigned long)
-+ le32_to_cpu(gdp->bg_block_bitmap));
-+ return 0;
-+ }
-+ if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
-+ le32_to_cpu(gdp->bg_inode_bitmap) >=
-+ block + EXT3_BLOCKS_PER_GROUP(sb))
-+ {
-+ ext3_error (sb, "ext3_check_descriptors",
-+ "Inode bitmap for group %d"
-+ " not in group (block %lu)!",
-+ i, (unsigned long)
-+ le32_to_cpu(gdp->bg_inode_bitmap));
-+ return 0;
-+ }
-+ if (le32_to_cpu(gdp->bg_inode_table) < block ||
-+ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
-+ block + EXT3_BLOCKS_PER_GROUP(sb))
-+ {
-+ ext3_error (sb, "ext3_check_descriptors",
-+ "Inode table for group %d"
-+ " not in group (block %lu)!",
-+ i, (unsigned long)
-+ le32_to_cpu(gdp->bg_inode_table));
-+ return 0;
-+ }
-+ block += EXT3_BLOCKS_PER_GROUP(sb);
-+ gdp++;
-+ }
-+ return 1;
-+}
-+
-+
-+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
-+ * the superblock) which were deleted from all directories, but held open by
-+ * a process at the time of a crash. We walk the list and try to delete these
-+ * inodes at recovery time (only with a read-write filesystem).
-+ *
-+ * In order to keep the orphan inode chain consistent during traversal (in
-+ * case of crash during recovery), we link each inode into the superblock
-+ * orphan list_head and handle it the same way as an inode deletion during
-+ * normal operation (which journals the operations for us).
-+ *
-+ * We only do an iget() and an iput() on each inode, which is very safe if we
-+ * accidentally point at an in-use or already deleted inode. The worst that
-+ * can happen in this case is that we get a "bit already cleared" message from
-+ * ext3_free_inode(). The only reason we would point at a wrong inode is if
-+ * e2fsck was run on this filesystem, and it must have already done the orphan
-+ * inode cleanup for us, so we can safely abort without any further action.
-+ */
-+static void ext3_orphan_cleanup (struct super_block * sb,
-+ struct ext3_super_block * es)
-+{
-+ unsigned int s_flags = sb->s_flags;
-+ int nr_orphans = 0, nr_truncates = 0;
-+ if (!es->s_last_orphan) {
-+ jbd_debug(4, "no orphan inodes to clean up\n");
-+ return;
-+ }
-+
-+ if (s_flags & MS_RDONLY) {
-+ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
-+ bdevname(sb->s_dev));
-+ sb->s_flags &= ~MS_RDONLY;
-+ }
-+
-+ if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
-+ if (es->s_last_orphan)
-+ jbd_debug(1, "Errors on filesystem, "
-+ "clearing orphan list.\n");
-+ es->s_last_orphan = 0;
-+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
-+ return;
-+ }
-+
-+ while (es->s_last_orphan) {
-+ struct inode *inode;
-+
-+ if (!(inode =
-+ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
-+ es->s_last_orphan = 0;
-+ break;
-+ }
-+
-+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-+ if (inode->i_nlink) {
-+ printk(KERN_DEBUG __FUNCTION__
-+ ": truncating inode %ld to %Ld bytes\n",
-+ inode->i_ino, inode->i_size);
-+ jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
-+ inode->i_ino, inode->i_size);
-+ ext3_truncate(inode);
-+ nr_truncates++;
-+ } else {
-+ printk(KERN_DEBUG __FUNCTION__
-+ ": deleting unreferenced inode %ld\n",
-+ inode->i_ino);
-+ jbd_debug(2, "deleting unreferenced inode %ld\n",
-+ inode->i_ino);
-+ nr_orphans++;
-+ }
-+ iput(inode); /* The delete magic happens here! */
-+ }
-+
-+#define PLURAL(x) (x), ((x)==1) ? "" : "s"
-+
-+ if (nr_orphans)
-+ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
-+ bdevname(sb->s_dev), PLURAL(nr_orphans));
-+ if (nr_truncates)
-+ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
-+ bdevname(sb->s_dev), PLURAL(nr_truncates));
-+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
-+}
-+
-+#define log2(n) ffz(~(n))
-+
-+/*
-+ * Maximal file size. There is a direct, and {,double-,triple-}indirect
-+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
-+ * We need to be 1 filesystem block less than the 2^32 sector limit.
-+ */
-+static loff_t ext3_max_size(int bits)
-+{
-+ loff_t res = EXT3_NDIR_BLOCKS;
-+ res += 1LL << (bits-2);
-+ res += 1LL << (2*(bits-2));
-+ res += 1LL << (3*(bits-2));
-+ res <<= bits;
-+ if (res > (512LL << 32) - (1 << bits))
-+ res = (512LL << 32) - (1 << bits);
-+ return res;
-+}
-+
-+struct super_block * ext3_read_super (struct super_block * sb, void * data,
-+ int silent)
-+{
-+ struct buffer_head * bh;
-+ struct ext3_super_block *es = 0;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ unsigned long sb_block = 1;
-+ unsigned long logic_sb_block = 1;
-+ unsigned long offset = 0;
-+ unsigned long journal_inum = 0;
-+ kdev_t dev = sb->s_dev;
-+ int blocksize;
-+ int hblock;
-+ int db_count;
-+ int i;
-+ int needs_recovery;
-+
-+#ifdef CONFIG_JBD_DEBUG
-+ ext3_ro_after = 0;
-+#endif
-+ /*
-+ * See what the current blocksize for the device is, and
-+ * use that as the blocksize. Otherwise (or if the blocksize
-+ * is smaller than the default) use the default.
-+ * This is important for devices that have a hardware
-+ * sectorsize that is larger than the default.
-+ */
-+ blocksize = EXT3_MIN_BLOCK_SIZE;
-+ hblock = get_hardsect_size(dev);
-+ if (blocksize < hblock)
-+ blocksize = hblock;
-+
-+ sbi->s_mount_opt = 0;
-+ sbi->s_resuid = EXT3_DEF_RESUID;
-+ sbi->s_resgid = EXT3_DEF_RESGID;
-+ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
-+ sb->s_dev = 0;
-+ goto out_fail;
-+ }
-+
-+ sb->s_blocksize = blocksize;
-+ set_blocksize (dev, blocksize);
-+
-+ /*
-+ * The ext3 superblock will not be buffer aligned for other than 1kB
-+ * block sizes. We need to calculate the offset from buffer start.
-+ */
-+ if (blocksize != EXT3_MIN_BLOCK_SIZE) {
-+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
-+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
-+ }
-+
-+ if (!(bh = sb_bread(sb, logic_sb_block))) {
-+ printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
-+ goto out_fail;
-+ }
-+ /*
-+ * Note: s_es must be initialized as soon as possible because
-+ * some ext3 macro-instructions depend on its value
-+ */
-+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
-+ sbi->s_es = es;
-+ sb->s_magic = le16_to_cpu(es->s_magic);
-+ if (sb->s_magic != EXT3_SUPER_MAGIC) {
-+ if (!silent)
-+ printk(KERN_ERR
-+ "VFS: Can't find ext3 filesystem on dev %s.\n",
-+ bdevname(dev));
-+ goto failed_mount;
-+ }
-+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
-+ (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
-+ EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
-+ EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-+ printk(KERN_WARNING
-+ "EXT3-fs warning: feature flags set on rev 0 fs, "
-+ "running e2fsck is recommended\n");
-+ /*
-+ * Check feature flags regardless of the revision level, since we
-+ * previously didn't change the revision level when setting the flags,
-+ * so there is a chance incompat flags are set on a rev 0 filesystem.
-+ */
-+ if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) {
-+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
-+ "unsupported optional features (%x).\n",
-+ bdevname(dev), i);
-+ goto failed_mount;
-+ }
-+ if (!(sb->s_flags & MS_RDONLY) &&
-+ (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){
-+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
-+ "unsupported optional features (%x).\n",
-+ bdevname(dev), i);
-+ goto failed_mount;
-+ }
-+ sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10;
-+ sb->s_blocksize = 1 << sb->s_blocksize_bits;
-+
-+ if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE ||
-+ sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) {
-+ printk(KERN_ERR
-+ "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
-+ blocksize, bdevname(dev));
-+ goto failed_mount;
-+ }
-+
-+ sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
-+
-+ if (sb->s_blocksize != blocksize) {
-+ blocksize = sb->s_blocksize;
-+
-+ /*
-+ * Make sure the blocksize for the filesystem is larger
-+ * than the hardware sectorsize for the machine.
-+ */
-+ if (sb->s_blocksize < hblock) {
-+ printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
-+ "device blocksize %d.\n", blocksize, hblock);
-+ goto failed_mount;
-+ }
-+
-+ brelse (bh);
-+ set_blocksize (dev, sb->s_blocksize);
-+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
-+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
-+ bh = sb_bread(sb, logic_sb_block);
-+ if (!bh) {
-+ printk(KERN_ERR
-+ "EXT3-fs: Can't read superblock on 2nd try.\n");
-+ return NULL;
-+ }
-+ es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
-+ sbi->s_es = es;
-+ if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) {
-+ printk (KERN_ERR
-+ "EXT3-fs: Magic mismatch, very weird !\n");
-+ goto failed_mount;
-+ }
-+ }
-+
-+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
-+ sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
-+ sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
-+ } else {
-+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
-+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
-+ if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) {
-+ printk (KERN_ERR
-+ "EXT3-fs: unsupported inode size: %d\n",
-+ sbi->s_inode_size);
-+ goto failed_mount;
-+ }
-+ }
-+ sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
-+ le32_to_cpu(es->s_log_frag_size);
-+ if (blocksize != sbi->s_frag_size) {
-+ printk(KERN_ERR
-+ "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
-+ sbi->s_frag_size, blocksize);
-+ goto failed_mount;
-+ }
-+ sbi->s_frags_per_block = 1;
-+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
-+ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
-+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-+ sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
-+ sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block;
-+ sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
-+ sbi->s_sbh = bh;
-+ if (sbi->s_resuid == EXT3_DEF_RESUID)
-+ sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
-+ if (sbi->s_resgid == EXT3_DEF_RESGID)
-+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
-+ sbi->s_mount_state = le16_to_cpu(es->s_state);
-+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
-+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
-+
-+ if (sbi->s_blocks_per_group > blocksize * 8) {
-+ printk (KERN_ERR
-+ "EXT3-fs: #blocks per group too big: %lu\n",
-+ sbi->s_blocks_per_group);
-+ goto failed_mount;
-+ }
-+ if (sbi->s_frags_per_group > blocksize * 8) {
-+ printk (KERN_ERR
-+ "EXT3-fs: #fragments per group too big: %lu\n",
-+ sbi->s_frags_per_group);
-+ goto failed_mount;
-+ }
-+ if (sbi->s_inodes_per_group > blocksize * 8) {
-+ printk (KERN_ERR
-+ "EXT3-fs: #inodes per group too big: %lu\n",
-+ sbi->s_inodes_per_group);
-+ goto failed_mount;
-+ }
-+
-+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
-+ le32_to_cpu(es->s_first_data_block) +
-+ EXT3_BLOCKS_PER_GROUP(sb) - 1) /
-+ EXT3_BLOCKS_PER_GROUP(sb);
-+ db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
-+ EXT3_DESC_PER_BLOCK(sb);
-+ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
-+ GFP_KERNEL);
-+ if (sbi->s_group_desc == NULL) {
-+ printk (KERN_ERR "EXT3-fs: not enough memory\n");
-+ goto failed_mount;
-+ }
-+ for (i = 0; i < db_count; i++) {
-+ sbi->s_group_desc[i] = sb_bread(sb, logic_sb_block + i + 1);
-+ if (!sbi->s_group_desc[i]) {
-+ printk (KERN_ERR "EXT3-fs: "
-+ "can't read group descriptor %d\n", i);
-+ db_count = i;
-+ goto failed_mount2;
-+ }
-+ }
-+ if (!ext3_check_descriptors (sb)) {
-+ printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
-+ goto failed_mount2;
-+ }
-+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
-+ sbi->s_inode_bitmap_number[i] = 0;
-+ sbi->s_inode_bitmap[i] = NULL;
-+ sbi->s_block_bitmap_number[i] = 0;
-+ sbi->s_block_bitmap[i] = NULL;
-+ }
-+ sbi->s_loaded_inode_bitmaps = 0;
-+ sbi->s_loaded_block_bitmaps = 0;
-+ sbi->s_gdb_count = db_count;
-+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
-+ /*
-+ * set up enough so that it can read an inode
-+ */
-+ sb->s_op = &ext3_sops;
-+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
-+
-+ sb->s_root = 0;
-+
-+ needs_recovery = (es->s_last_orphan != 0 ||
-+ EXT3_HAS_INCOMPAT_FEATURE(sb,
-+ EXT3_FEATURE_INCOMPAT_RECOVER));
-+
-+ /*
-+ * The first inode we look at is the journal inode. Don't try
-+ * root first: it may be modified in the journal!
-+ */
-+ if (!test_opt(sb, NOLOAD) &&
-+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
-+ if (ext3_load_journal(sb, es))
-+ goto failed_mount2;
-+ } else if (journal_inum) {
-+ if (ext3_create_journal(sb, es, journal_inum))
-+ goto failed_mount2;
-+ } else {
-+ if (!silent)
-+ printk (KERN_ERR
-+ "ext3: No journal on filesystem on %s\n",
-+ bdevname(dev));
-+ goto failed_mount2;
-+ }
-+
-+ /* We have now updated the journal if required, so we can
-+ * validate the data journaling mode. */
-+ switch (test_opt(sb, DATA_FLAGS)) {
-+ case 0:
-+ /* No mode set, assume a default based on the journal
-+ capabilities: ORDERED_DATA if the journal can
-+ cope, else JOURNAL_DATA */
-+ if (journal_check_available_features
-+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
-+ else
-+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
-+ break;
-+
-+ case EXT3_MOUNT_ORDERED_DATA:
-+ case EXT3_MOUNT_WRITEBACK_DATA:
-+ if (!journal_check_available_features
-+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
-+ printk(KERN_ERR "EXT3-fs: Journal does not support "
-+ "requested data journaling mode\n");
-+ goto failed_mount3;
-+ }
-+ default:
-+ break;
-+ }
-+
-+ /*
-+ * The journal_load will have done any necessary log recovery,
-+ * so we can safely mount the rest of the filesystem now.
-+ */
-+
-+ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO));
-+ if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) ||
-+ !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) {
-+ if (sb->s_root) {
-+ dput(sb->s_root);
-+ sb->s_root = NULL;
-+ printk(KERN_ERR
-+ "EXT3-fs: corrupt root inode, run e2fsck\n");
-+ } else
-+ printk(KERN_ERR "EXT3-fs: get root inode failed\n");
-+ goto failed_mount3;
-+ }
-+
-+ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-+ /*
-+ * akpm: core read_super() calls in here with the superblock locked.
-+ * That deadlocks, because orphan cleanup needs to lock the superblock
-+ * in numerous places. Here we just pop the lock - it's relatively
-+ * harmless, because we are now ready to accept write_super() requests,
-+ * and aviro says that's the only reason for hanging onto the
-+ * superblock lock.
-+ */
-+ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
-+ unlock_super(sb); /* akpm: sigh */
-+ ext3_orphan_cleanup(sb, es);
-+ lock_super(sb);
-+ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
-+ if (needs_recovery)
-+ printk (KERN_INFO "EXT3-fs: recovery complete.\n");
-+ ext3_mark_recovery_complete(sb, es);
-+ printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
-+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
-+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
-+ "writeback");
-+
-+ return sb;
-+
-+failed_mount3:
-+ journal_destroy(sbi->s_journal);
-+failed_mount2:
-+ for (i = 0; i < db_count; i++)
-+ brelse(sbi->s_group_desc[i]);
-+ kfree(sbi->s_group_desc);
-+failed_mount:
-+ ext3_blkdev_remove(sbi);
-+ brelse(bh);
-+out_fail:
-+ return NULL;
-+}
-+
-+static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
-+{
-+ struct inode *journal_inode;
-+ journal_t *journal;
-+
-+ /* First, test for the existence of a valid inode on disk. Bad
-+ * things happen if we iget() an unused inode, as the subsequent
-+ * iput() will try to delete it. */
-+
-+ journal_inode = iget(sb, journal_inum);
-+ if (!journal_inode) {
-+ printk(KERN_ERR "EXT3-fs: no journal found.\n");
-+ return NULL;
-+ }
-+ if (!journal_inode->i_nlink) {
-+ make_bad_inode(journal_inode);
-+ iput(journal_inode);
-+ printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
-+ return NULL;
-+ }
-+
-+ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
-+ journal_inode, journal_inode->i_size);
-+ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
-+ printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
-+ iput(journal_inode);
-+ return NULL;
-+ }
-+
-+ journal = journal_init_inode(journal_inode);
-+ if (!journal) {
-+ printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
-+ iput(journal_inode);
-+ }
-+
-+ return journal;
-+}
-+
-+static journal_t *ext3_get_dev_journal(struct super_block *sb,
-+ int dev)
-+{
-+ struct buffer_head * bh;
-+ journal_t *journal;
-+ int start;
-+ int len;
-+ int hblock, blocksize;
-+ unsigned long sb_block;
-+ unsigned long offset;
-+ kdev_t journal_dev = to_kdev_t(dev);
-+ struct ext3_super_block * es;
-+ struct block_device *bdev;
-+
-+ bdev = ext3_blkdev_get(journal_dev);
-+ if (bdev == NULL)
-+ return NULL;
-+
-+ blocksize = sb->s_blocksize;
-+ hblock = get_hardsect_size(journal_dev);
-+ if (blocksize < hblock) {
-+ printk(KERN_ERR
-+ "EXT3-fs: blocksize too small for journal device.\n");
-+ goto out_bdev;
-+ }
-+
-+ sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
-+ offset = EXT3_MIN_BLOCK_SIZE % blocksize;
-+ set_blocksize(dev, blocksize);
-+ if (!(bh = bread(dev, sb_block, blocksize))) {
-+ printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
-+ "external journal\n");
-+ goto out_bdev;
-+ }
-+
-+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
-+ if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
-+ !(le32_to_cpu(es->s_feature_incompat) &
-+ EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-+ printk(KERN_ERR "EXT3-fs: external journal has "
-+ "bad superblock\n");
-+ brelse(bh);
-+ goto out_bdev;
-+ }
-+
-+ if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-+ printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
-+ brelse(bh);
-+ goto out_bdev;
-+ }
-+
-+ len = le32_to_cpu(es->s_blocks_count);
-+ start = sb_block + 1;
-+ brelse(bh); /* we're done with the superblock */
-+
-+ journal = journal_init_dev(journal_dev, sb->s_dev,
-+ start, len, blocksize);
-+ if (!journal) {
-+ printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
-+ goto out_bdev;
-+ }
-+ ll_rw_block(READ, 1, &journal->j_sb_buffer);
-+ wait_on_buffer(journal->j_sb_buffer);
-+ if (!buffer_uptodate(journal->j_sb_buffer)) {
-+ printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
-+ goto out_journal;
-+ }
-+ if (ntohl(journal->j_superblock->s_nr_users) != 1) {
-+ printk(KERN_ERR "EXT3-fs: External journal has more than one "
-+ "user (unsupported) - %d\n",
-+ ntohl(journal->j_superblock->s_nr_users));
-+ goto out_journal;
-+ }
-+ EXT3_SB(sb)->journal_bdev = bdev;
-+ return journal;
-+out_journal:
-+ journal_destroy(journal);
-+out_bdev:
-+ ext3_blkdev_put(bdev);
-+ return NULL;
-+}
-+
-+static int ext3_load_journal(struct super_block * sb,
-+ struct ext3_super_block * es)
-+{
-+ journal_t *journal;
-+ int journal_inum = le32_to_cpu(es->s_journal_inum);
-+ int journal_dev = le32_to_cpu(es->s_journal_dev);
-+ int err = 0;
-+ int really_read_only;
-+
-+ really_read_only = is_read_only(sb->s_dev);
-+
-+ /*
-+ * Are we loading a blank journal or performing recovery after a
-+ * crash? For recovery, we need to check in advance whether we
-+ * can get read-write access to the device.
-+ */
-+
-+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
-+ if (sb->s_flags & MS_RDONLY) {
-+ printk(KERN_INFO "EXT3-fs: INFO: recovery "
-+ "required on readonly filesystem.\n");
-+ if (really_read_only) {
-+ printk(KERN_ERR "EXT3-fs: write access "
-+ "unavailable, cannot proceed.\n");
-+ return -EROFS;
-+ }
-+ printk (KERN_INFO "EXT3-fs: write access will "
-+ "be enabled during recovery.\n");
-+ }
-+ }
-+
-+ if (journal_inum && journal_dev) {
-+ printk(KERN_ERR "EXT3-fs: filesystem has both journal "
-+ "and inode journals!\n");
-+ return -EINVAL;
-+ }
-+
-+ if (journal_inum) {
-+ if (!(journal = ext3_get_journal(sb, journal_inum)))
-+ return -EINVAL;
-+ } else {
-+ if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
-+ return -EINVAL;
-+ }
-+
-+
-+ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
-+ err = journal_update_format(journal);
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: error updating journal.\n");
-+ journal_destroy(journal);
-+ return err;
-+ }
-+ }
-+
-+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
-+ err = journal_wipe(journal, !really_read_only);
-+ if (!err)
-+ err = journal_load(journal);
-+
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: error loading journal.\n");
-+ journal_destroy(journal);
-+ return err;
-+ }
-+
-+ EXT3_SB(sb)->s_journal = journal;
-+ ext3_clear_journal_err(sb, es);
-+ return 0;
-+}
-+
-+static int ext3_create_journal(struct super_block * sb,
-+ struct ext3_super_block * es,
-+ int journal_inum)
-+{
-+ journal_t *journal;
-+
-+ if (sb->s_flags & MS_RDONLY) {
-+ printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
-+ "create journal.\n");
-+ return -EROFS;
-+ }
-+
-+ if (!(journal = ext3_get_journal(sb, journal_inum)))
-+ return -EINVAL;
-+
-+ printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
-+ journal_inum);
-+
-+ if (journal_create(journal)) {
-+ printk(KERN_ERR "EXT3-fs: error creating journal.\n");
-+ journal_destroy(journal);
-+ return -EIO;
-+ }
-+
-+ EXT3_SB(sb)->s_journal = journal;
-+
-+ ext3_update_dynamic_rev(sb);
-+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
-+
-+ es->s_journal_inum = cpu_to_le32(journal_inum);
-+ sb->s_dirt = 1;
-+
-+ /* Make sure we flush the recovery flag to disk. */
-+ ext3_commit_super(sb, es, 1);
-+
-+ return 0;
-+}
-+
-+static void ext3_commit_super (struct super_block * sb,
-+ struct ext3_super_block * es,
-+ int sync)
-+{
-+ es->s_wtime = cpu_to_le32(CURRENT_TIME);
-+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
-+ mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
-+ if (sync) {
-+ ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
-+ wait_on_buffer(sb->u.ext3_sb.s_sbh);
-+ }
-+}
-+
-+
-+/*
-+ * Have we just finished recovery? If so, and if we are mounting (or
-+ * remounting) the filesystem readonly, then we will end up with a
-+ * consistent fs on disk. Record that fact.
-+ */
-+static void ext3_mark_recovery_complete(struct super_block * sb,
-+ struct ext3_super_block * es)
-+{
-+ journal_flush(EXT3_SB(sb)->s_journal);
-+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
-+ sb->s_flags & MS_RDONLY) {
-+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ sb->s_dirt = 0;
-+ ext3_commit_super(sb, es, 1);
-+ }
-+}
-+
-+/*
-+ * If we are mounting (or read-write remounting) a filesystem whose journal
-+ * has recorded an error from a previous lifetime, move that error to the
-+ * main filesystem now.
-+ */
-+static void ext3_clear_journal_err(struct super_block * sb,
-+ struct ext3_super_block * es)
-+{
-+ journal_t *journal;
-+ int j_errno;
-+ const char *errstr;
-+
-+ journal = EXT3_SB(sb)->s_journal;
-+
-+ /*
-+ * Now check for any error status which may have been recorded in the
-+ * journal by a prior ext3_error() or ext3_abort()
-+ */
-+
-+ j_errno = journal_errno(journal);
-+ if (j_errno) {
-+ char nbuf[16];
-+
-+ errstr = ext3_decode_error(sb, j_errno, nbuf);
-+ ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
-+ "from previous mount: %s", errstr);
-+ ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
-+ "filesystem check.");
-+
-+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
-+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
-+ ext3_commit_super (sb, es, 1);
-+
-+ journal_clear_err(journal);
-+ }
-+}
-+
-+/*
-+ * Force the running and committing transactions to commit,
-+ * and wait on the commit.
-+ */
-+int ext3_force_commit(struct super_block *sb)
-+{
-+ journal_t *journal;
-+ int ret;
-+
-+ if (sb->s_flags & MS_RDONLY)
-+ return 0;
-+
-+ journal = EXT3_SB(sb)->s_journal;
-+ sb->s_dirt = 0;
-+ lock_kernel(); /* important: lock down j_running_transaction */
-+ ret = ext3_journal_force_commit(journal);
-+ unlock_kernel();
-+ return ret;
-+}
-+
-+/*
-+ * Ext3 always journals updates to the superblock itself, so we don't
-+ * have to propagate any other updates to the superblock on disk at this
-+ * point. Just start an async writeback to get the buffers on their way
-+ * to the disk.
-+ *
-+ * This implicitly triggers the writebehind on sync().
-+ */
-+
-+static int do_sync_supers = 0;
-+MODULE_PARM(do_sync_supers, "i");
-+MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
-+
-+void ext3_write_super (struct super_block * sb)
-+{
-+ tid_t target;
-+
-+ if (down_trylock(&sb->s_lock) == 0)
-+ BUG(); /* aviro detector */
-+ sb->s_dirt = 0;
-+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
-+
-+ if (do_sync_supers) {
-+ unlock_super(sb);
-+ log_wait_commit(EXT3_SB(sb)->s_journal, target);
-+ lock_super(sb);
-+ }
-+}
-+
-+/*
-+ * LVM calls this function before a (read-only) snapshot is created. This
-+ * gives us a chance to flush the journal completely and mark the fs clean.
-+ */
-+void ext3_write_super_lockfs(struct super_block *sb)
-+{
-+ sb->s_dirt = 0;
-+
-+ lock_kernel(); /* 2.4.5 forgot to do this for us */
-+ if (!(sb->s_flags & MS_RDONLY)) {
-+ journal_t *journal = EXT3_SB(sb)->s_journal;
-+
-+ /* Now we set up the journal barrier. */
-+ journal_lock_updates(journal);
-+ journal_flush(journal);
-+
-+ /* Journal blocked and flushed, clear needs_recovery flag. */
-+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
-+ }
-+ unlock_kernel();
-+}
-+
-+/*
-+ * Called by LVM after the snapshot is done. We need to reset the RECOVER
-+ * flag here, even though the filesystem is not technically dirty yet.
-+ */
-+void ext3_unlockfs(struct super_block *sb)
-+{
-+ if (!(sb->s_flags & MS_RDONLY)) {
-+ lock_kernel();
-+ lock_super(sb);
-+ /* Reser the needs_recovery flag before the fs is unlocked. */
-+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
-+ unlock_super(sb);
-+ journal_unlock_updates(EXT3_SB(sb)->s_journal);
-+ unlock_kernel();
-+ }
-+}
-+
-+int ext3_remount (struct super_block * sb, int * flags, char * data)
-+{
-+ struct ext3_super_block * es;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ unsigned long tmp;
-+
-+ clear_ro_after(sb);
-+
-+ /*
-+ * Allow the "check" option to be passed as a remount option.
-+ */
-+ if (!parse_options(data, &tmp, sbi, &tmp, 1))
-+ return -EINVAL;
-+
-+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
-+ ext3_abort(sb, __FUNCTION__, "Abort forced by user");
-+
-+ es = sbi->s_es;
-+
-+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
-+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
-+ return -EROFS;
-+
-+ if (*flags & MS_RDONLY) {
-+ /*
-+ * First of all, the unconditional stuff we have to do
-+ * to disable replay of the journal when we next remount
-+ */
-+ sb->s_flags |= MS_RDONLY;
-+
-+ /*
-+ * OK, test if we are remounting a valid rw partition
-+ * readonly, and if so set the rdonly flag and then
-+ * mark the partition as valid again.
-+ */
-+ if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
-+ (sbi->s_mount_state & EXT3_VALID_FS))
-+ es->s_state = cpu_to_le16(sbi->s_mount_state);
-+
-+ ext3_mark_recovery_complete(sb, es);
-+ } else {
-+ int ret;
-+ if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
-+ ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
-+ printk(KERN_WARNING "EXT3-fs: %s: couldn't "
-+ "remount RDWR because of unsupported "
-+ "optional features (%x).\n",
-+ bdevname(sb->s_dev), ret);
-+ return -EROFS;
-+ }
-+ /*
-+ * Mounting a RDONLY partition read-write, so reread
-+ * and store the current valid flag. (It may have
-+ * been changed by e2fsck since we originally mounted
-+ * the partition.)
-+ */
-+ ext3_clear_journal_err(sb, es);
-+ sbi->s_mount_state = le16_to_cpu(es->s_state);
-+ if (!ext3_setup_super (sb, es, 0))
-+ sb->s_flags &= ~MS_RDONLY;
-+ }
-+ }
-+ setup_ro_after(sb);
-+ return 0;
-+}
-+
-+int ext3_statfs (struct super_block * sb, struct statfs * buf)
-+{
-+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-+ unsigned long overhead;
-+ int i;
-+
-+ if (test_opt (sb, MINIX_DF))
-+ overhead = 0;
-+ else {
-+ /*
-+ * Compute the overhead (FS structures)
-+ */
-+
-+ /*
-+ * All of the blocks before first_data_block are
-+ * overhead
-+ */
-+ overhead = le32_to_cpu(es->s_first_data_block);
-+
-+ /*
-+ * Add the overhead attributed to the superblock and
-+ * block group descriptors. If the sparse superblocks
-+ * feature is turned on, then not all groups have this.
-+ */
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
-+ overhead += ext3_bg_has_super(sb, i) +
-+ ext3_bg_num_gdb(sb, i);
-+
-+ /*
-+ * Every block group has an inode bitmap, a block
-+ * bitmap, and an inode table.
-+ */
-+ overhead += (EXT3_SB(sb)->s_groups_count *
-+ (2 + EXT3_SB(sb)->s_itb_per_group));
-+ }
-+
-+ buf->f_type = EXT3_SUPER_MAGIC;
-+ buf->f_bsize = sb->s_blocksize;
-+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
-+ buf->f_bfree = ext3_count_free_blocks (sb);
-+ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
-+ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
-+ buf->f_bavail = 0;
-+ buf->f_files = le32_to_cpu(es->s_inodes_count);
-+ buf->f_ffree = ext3_count_free_inodes (sb);
-+ buf->f_namelen = EXT3_NAME_LEN;
-+ return 0;
-+}
-+
-+static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
-+
-+static int __init init_ext3_fs(void)
-+{
-+ return register_filesystem(&ext3_fs_type);
-+}
-+
-+static void __exit exit_ext3_fs(void)
-+{
-+ unregister_filesystem(&ext3_fs_type);
-+}
-+
-+EXPORT_NO_SYMBOLS;
-+
-+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-+MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
-+MODULE_LICENSE("GPL");
-+module_init(init_ext3_fs)
-+module_exit(exit_ext3_fs)
-diff -rup --new-file linux.mcp2/fs/ext3/symlink.c linux_tmp/fs/ext3/symlink.c
---- linux.mcp2/fs/ext3/symlink.c 1969-12-31 16:00:00.000000000 -0800
-+++ linux_tmp/fs/ext3/symlink.c 2001-11-09 14:25:04.000000000 -0800
-@@ -0,0 +1,39 @@
-+/*
-+ * linux/fs/ext3/symlink.c
-+ *
-+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
-+ *
-+ * Copyright (C) 1992, 1993, 1994, 1995
-+ * Remy Card (card@masi.ibp.fr)
-+ * Laboratoire MASI - Institut Blaise Pascal
-+ * Universite Pierre et Marie Curie (Paris VI)
-+ *
-+ * from
-+ *
-+ * linux/fs/minix/symlink.c
-+ *
-+ * Copyright (C) 1991, 1992 Linus Torvalds
-+ *
-+ * ext3 symlink handling code
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+
-+static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
-+{
-+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
-+ return vfs_readlink(dentry, buffer, buflen, s);
-+}
-+
-+static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
-+{
-+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
-+ return vfs_follow_link(nd, s);
-+}
-+
-+struct inode_operations ext3_fast_symlink_inode_operations = {
-+ readlink: ext3_readlink, /* BKL not held. Don't need */
-+ follow_link: ext3_follow_link, /* BKL not held. Don't need */
-+};
+++ /dev/null
-diff -ruP linux.mcp2/fs/jbd/Makefile linuxppc_2.4.19_final/fs/jbd/Makefile
---- linux.mcp2/fs/jbd/Makefile 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/Makefile 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,15 @@
-+#
-+# fs/jbd/Makefile
-+#
-+# Makefile for the linux journaling routines.
-+#
-+
-+export-objs := journal.o
-+O_TARGET := jbd.o
-+
-+obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
-+
-+obj-m := $(O_TARGET)
-+
-+include $(TOPDIR)/Rules.make
-+
-diff -ruP linux.mcp2/fs/jbd/checkpoint.c linuxppc_2.4.19_final/fs/jbd/checkpoint.c
---- linux.mcp2/fs/jbd/checkpoint.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/checkpoint.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,605 @@
-+/*
-+ * linux/fs/checkpoint.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
-+ *
-+ * Copyright 1999 Red Hat Software --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Checkpoint routines for the generic filesystem journaling code.
-+ * Part of the ext2fs journaling system.
-+ *
-+ * Checkpointing is the process of ensuring that a section of the log is
-+ * committed fully to disk, so that that portion of the log can be
-+ * reused.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+
-+extern spinlock_t journal_datalist_lock;
-+
-+/*
-+ * Unlink a buffer from a transaction.
-+ *
-+ * Called with journal_datalist_lock held.
-+ */
-+
-+static inline void __buffer_unlink(struct journal_head *jh)
-+{
-+ transaction_t *transaction;
-+
-+ transaction = jh->b_cp_transaction;
-+ jh->b_cp_transaction = NULL;
-+
-+ jh->b_cpnext->b_cpprev = jh->b_cpprev;
-+ jh->b_cpprev->b_cpnext = jh->b_cpnext;
-+ if (transaction->t_checkpoint_list == jh)
-+ transaction->t_checkpoint_list = jh->b_cpnext;
-+ if (transaction->t_checkpoint_list == jh)
-+ transaction->t_checkpoint_list = NULL;
-+}
-+
-+/*
-+ * Try to release a checkpointed buffer from its transaction.
-+ * Returns 1 if we released it.
-+ * Requires journal_datalist_lock
-+ */
-+static int __try_to_free_cp_buf(struct journal_head *jh)
-+{
-+ int ret = 0;
-+ struct buffer_head *bh = jh2bh(jh);
-+
-+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
-+ JBUFFER_TRACE(jh, "remove from checkpoint list");
-+ __journal_remove_checkpoint(jh);
-+ __journal_remove_journal_head(bh);
-+ BUFFER_TRACE(bh, "release");
-+ /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
-+ refile_buffer(bh);
-+ __brelse(bh);
-+ ret = 1;
-+ }
-+ return ret;
-+}
-+
-+/*
-+ * log_wait_for_space: wait until there is space in the journal.
-+ *
-+ * Called with the journal already locked, but it will be unlocked if we have
-+ * to wait for a checkpoint to free up some space in the log.
-+ */
-+
-+void log_wait_for_space(journal_t *journal, int nblocks)
-+{
-+ while (log_space_left(journal) < nblocks) {
-+ if (journal->j_flags & JFS_ABORT)
-+ return;
-+ unlock_journal(journal);
-+ down(&journal->j_checkpoint_sem);
-+ lock_journal(journal);
-+
-+ /* Test again, another process may have checkpointed
-+ * while we were waiting for the checkpoint lock */
-+ if (log_space_left(journal) < nblocks) {
-+ log_do_checkpoint(journal, nblocks);
-+ }
-+ up(&journal->j_checkpoint_sem);
-+ }
-+}
-+
-+/*
-+ * Clean up a transaction's checkpoint list.
-+ *
-+ * We wait for any pending IO to complete and make sure any clean
-+ * buffers are removed from the transaction.
-+ *
-+ * Return 1 if we performed any actions which might have destroyed the
-+ * checkpoint. (journal_remove_checkpoint() deletes the transaction when
-+ * the last checkpoint buffer is cleansed)
-+ *
-+ * Called with the journal locked.
-+ * Called with journal_datalist_lock held.
-+ */
-+static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
-+{
-+ struct journal_head *jh, *next_jh, *last_jh;
-+ struct buffer_head *bh;
-+ int ret = 0;
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+ jh = transaction->t_checkpoint_list;
-+ if (!jh)
-+ return 0;
-+
-+ last_jh = jh->b_cpprev;
-+ next_jh = jh;
-+ do {
-+ jh = next_jh;
-+ bh = jh2bh(jh);
-+ if (buffer_locked(bh)) {
-+ atomic_inc(&bh->b_count);
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ wait_on_buffer(bh);
-+ /* the journal_head may have gone by now */
-+ BUFFER_TRACE(bh, "brelse");
-+ __brelse(bh);
-+ goto out_return_1;
-+ }
-+
-+ if (jh->b_transaction != NULL) {
-+ transaction_t *transaction = jh->b_transaction;
-+ tid_t tid = transaction->t_tid;
-+
-+ spin_unlock(&journal_datalist_lock);
-+ log_start_commit(journal, transaction);
-+ unlock_journal(journal);
-+ log_wait_commit(journal, tid);
-+ goto out_return_1;
-+ }
-+
-+ /*
-+ * We used to test for (jh->b_list != BUF_CLEAN) here.
-+ * But unmap_underlying_metadata() can place buffer onto
-+ * BUF_CLEAN. Since refile_buffer() no longer takes buffers
-+ * off checkpoint lists, we cope with it here
-+ */
-+ /*
-+ * AKPM: I think the buffer_jdirty test is redundant - it
-+ * shouldn't have NULL b_transaction?
-+ */
-+ next_jh = jh->b_cpnext;
-+ if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
-+ BUFFER_TRACE(bh, "remove from checkpoint");
-+ __journal_remove_checkpoint(jh);
-+ __journal_remove_journal_head(bh);
-+ refile_buffer(bh);
-+ __brelse(bh);
-+ ret = 1;
-+ }
-+
-+ jh = next_jh;
-+ } while (jh != last_jh);
-+
-+ return ret;
-+out_return_1:
-+ lock_journal(journal);
-+ spin_lock(&journal_datalist_lock);
-+ return 1;
-+}
-+
-+#define NR_BATCH 64
-+
-+static void __flush_batch(struct buffer_head **bhs, int *batch_count)
-+{
-+ int i;
-+
-+ spin_unlock(&journal_datalist_lock);
-+ ll_rw_block(WRITE, *batch_count, bhs);
-+ run_task_queue(&tq_disk);
-+ spin_lock(&journal_datalist_lock);
-+ for (i = 0; i < *batch_count; i++) {
-+ struct buffer_head *bh = bhs[i];
-+ clear_bit(BH_JWrite, &bh->b_state);
-+ BUFFER_TRACE(bh, "brelse");
-+ __brelse(bh);
-+ }
-+ *batch_count = 0;
-+}
-+
-+/*
-+ * Try to flush one buffer from the checkpoint list to disk.
-+ *
-+ * Return 1 if something happened which requires us to abort the current
-+ * scan of the checkpoint list.
-+ *
-+ * Called with journal_datalist_lock held.
-+ */
-+static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-+ struct buffer_head **bhs, int *batch_count,
-+ int *drop_count)
-+{
-+ struct buffer_head *bh = jh2bh(jh);
-+ int ret = 0;
-+
-+ if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-+ J_ASSERT_JH(jh, jh->b_transaction == NULL);
-+
-+ /*
-+ * Important: we are about to write the buffer, and
-+ * possibly block, while still holding the journal lock.
-+ * We cannot afford to let the transaction logic start
-+ * messing around with this buffer before we write it to
-+ * disk, as that would break recoverability.
-+ */
-+ BUFFER_TRACE(bh, "queue");
-+ atomic_inc(&bh->b_count);
-+ J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
-+ set_bit(BH_JWrite, &bh->b_state);
-+ bhs[*batch_count] = bh;
-+ (*batch_count)++;
-+ if (*batch_count == NR_BATCH) {
-+ __flush_batch(bhs, batch_count);
-+ ret = 1;
-+ }
-+ } else {
-+ int last_buffer = 0;
-+ if (jh->b_cpnext == jh) {
-+ /* We may be about to drop the transaction. Tell the
-+ * caller that the lists have changed.
-+ */
-+ last_buffer = 1;
-+ }
-+ if (__try_to_free_cp_buf(jh)) {
-+ (*drop_count)++;
-+ ret = last_buffer;
-+ }
-+ }
-+ return ret;
-+}
-+
-+
-+/*
-+ * Perform an actual checkpoint. We don't write out only enough to
-+ * satisfy the current blocked requests: rather we submit a reasonably
-+ * sized chunk of the outstanding data to disk at once for
-+ * efficiency. log_wait_for_space() will retry if we didn't free enough.
-+ *
-+ * However, we _do_ take into account the amount requested so that once
-+ * the IO has been queued, we can return as soon as enough of it has
-+ * completed to disk.
-+ *
-+ * The journal should be locked before calling this function.
-+ */
-+
-+/* @@@ `nblocks' is unused. Should it be used? */
-+int log_do_checkpoint (journal_t *journal, int nblocks)
-+{
-+ transaction_t *transaction, *last_transaction, *next_transaction;
-+ int result;
-+ int target;
-+ int batch_count = 0;
-+ struct buffer_head *bhs[NR_BATCH];
-+
-+ jbd_debug(1, "Start checkpoint\n");
-+
-+ /*
-+ * First thing: if there are any transactions in the log which
-+ * don't need checkpointing, just eliminate them from the
-+ * journal straight away.
-+ */
-+ result = cleanup_journal_tail(journal);
-+ jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
-+ if (result <= 0)
-+ return result;
-+
-+ /*
-+ * OK, we need to start writing disk blocks. Try to free up a
-+ * quarter of the log in a single checkpoint if we can.
-+ */
-+ /*
-+ * AKPM: check this code. I had a feeling a while back that it
-+ * degenerates into a busy loop at unmount time.
-+ */
-+ target = (journal->j_last - journal->j_first) / 4;
-+
-+ spin_lock(&journal_datalist_lock);
-+repeat:
-+ transaction = journal->j_checkpoint_transactions;
-+ if (transaction == NULL)
-+ goto done;
-+ last_transaction = transaction->t_cpprev;
-+ next_transaction = transaction;
-+
-+ do {
-+ struct journal_head *jh, *last_jh, *next_jh;
-+ int drop_count = 0;
-+ int cleanup_ret, retry = 0;
-+
-+ transaction = next_transaction;
-+ next_transaction = transaction->t_cpnext;
-+ jh = transaction->t_checkpoint_list;
-+ last_jh = jh->b_cpprev;
-+ next_jh = jh;
-+ do {
-+ jh = next_jh;
-+ next_jh = jh->b_cpnext;
-+ retry = __flush_buffer(journal, jh, bhs, &batch_count,
-+ &drop_count);
-+ } while (jh != last_jh && !retry);
-+ if (batch_count) {
-+ __flush_batch(bhs, &batch_count);
-+ goto repeat;
-+ }
-+ if (retry)
-+ goto repeat;
-+ /*
-+ * We have walked the whole transaction list without
-+ * finding anything to write to disk. We had better be
-+ * able to make some progress or we are in trouble.
-+ */
-+ cleanup_ret = __cleanup_transaction(journal, transaction);
-+ J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-+ goto repeat; /* __cleanup may have dropped lock */
-+ } while (transaction != last_transaction);
-+
-+done:
-+ spin_unlock(&journal_datalist_lock);
-+ result = cleanup_journal_tail(journal);
-+ if (result < 0)
-+ return result;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Check the list of checkpoint transactions for the journal to see if
-+ * we have already got rid of any since the last update of the log tail
-+ * in the journal superblock. If so, we can instantly roll the
-+ * superblock forward to remove those transactions from the log.
-+ *
-+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
-+ *
-+ * Called with the journal lock held.
-+ *
-+ * This is the only part of the journaling code which really needs to be
-+ * aware of transaction aborts. Checkpointing involves writing to the
-+ * main filesystem area rather than to the journal, so it can proceed
-+ * even in abort state, but we must not update the journal superblock if
-+ * we have an abort error outstanding.
-+ */
-+
-+int cleanup_journal_tail(journal_t *journal)
-+{
-+ transaction_t * transaction;
-+ tid_t first_tid;
-+ unsigned long blocknr, freed;
-+
-+ /* OK, work out the oldest transaction remaining in the log, and
-+ * the log block it starts at.
-+ *
-+ * If the log is now empty, we need to work out which is the
-+ * next transaction ID we will write, and where it will
-+ * start. */
-+
-+ /* j_checkpoint_transactions needs locking */
-+ spin_lock(&journal_datalist_lock);
-+ transaction = journal->j_checkpoint_transactions;
-+ if (transaction) {
-+ first_tid = transaction->t_tid;
-+ blocknr = transaction->t_log_start;
-+ } else if ((transaction = journal->j_committing_transaction) != NULL) {
-+ first_tid = transaction->t_tid;
-+ blocknr = transaction->t_log_start;
-+ } else if ((transaction = journal->j_running_transaction) != NULL) {
-+ first_tid = transaction->t_tid;
-+ blocknr = journal->j_head;
-+ } else {
-+ first_tid = journal->j_transaction_sequence;
-+ blocknr = journal->j_head;
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+ J_ASSERT (blocknr != 0);
-+
-+ /* If the oldest pinned transaction is at the tail of the log
-+ already then there's not much we can do right now. */
-+ if (journal->j_tail_sequence == first_tid)
-+ return 1;
-+
-+ /* OK, update the superblock to recover the freed space.
-+ * Physical blocks come first: have we wrapped beyond the end of
-+ * the log? */
-+ freed = blocknr - journal->j_tail;
-+ if (blocknr < journal->j_tail)
-+ freed = freed + journal->j_last - journal->j_first;
-+
-+ jbd_debug(1,
-+ "Cleaning journal tail from %d to %d (offset %lu), "
-+ "freeing %lu\n",
-+ journal->j_tail_sequence, first_tid, blocknr, freed);
-+
-+ journal->j_free += freed;
-+ journal->j_tail_sequence = first_tid;
-+ journal->j_tail = blocknr;
-+ if (!(journal->j_flags & JFS_ABORT))
-+ journal_update_superblock(journal, 1);
-+ return 0;
-+}
-+
-+
-+/* Checkpoint list management */
-+
-+/*
-+ * journal_clean_checkpoint_list
-+ *
-+ * Find all the written-back checkpoint buffers in the journal and release them.
-+ *
-+ * Called with the journal locked.
-+ * Called with journal_datalist_lock held.
-+ * Returns number of bufers reaped (for debug)
-+ */
-+
-+int __journal_clean_checkpoint_list(journal_t *journal)
-+{
-+ transaction_t *transaction, *last_transaction, *next_transaction;
-+ int ret = 0;
-+
-+ transaction = journal->j_checkpoint_transactions;
-+ if (transaction == 0)
-+ goto out;
-+
-+ last_transaction = transaction->t_cpprev;
-+ next_transaction = transaction;
-+ do {
-+ struct journal_head *jh;
-+
-+ transaction = next_transaction;
-+ next_transaction = transaction->t_cpnext;
-+ jh = transaction->t_checkpoint_list;
-+ if (jh) {
-+ struct journal_head *last_jh = jh->b_cpprev;
-+ struct journal_head *next_jh = jh;
-+ do {
-+ jh = next_jh;
-+ next_jh = jh->b_cpnext;
-+ ret += __try_to_free_cp_buf(jh);
-+ } while (jh != last_jh);
-+ }
-+ } while (transaction != last_transaction);
-+out:
-+ return ret;
-+}
-+
-+/*
-+ * journal_remove_checkpoint: called after a buffer has been committed
-+ * to disk (either by being write-back flushed to disk, or being
-+ * committed to the log).
-+ *
-+ * We cannot safely clean a transaction out of the log until all of the
-+ * buffer updates committed in that transaction have safely been stored
-+ * elsewhere on disk. To achieve this, all of the buffers in a
-+ * transaction need to be maintained on the transaction's checkpoint
-+ * list until they have been rewritten, at which point this function is
-+ * called to remove the buffer from the existing transaction's
-+ * checkpoint list.
-+ *
-+ * This function is called with the journal locked.
-+ * This function is called with journal_datalist_lock held.
-+ */
-+
-+void __journal_remove_checkpoint(struct journal_head *jh)
-+{
-+ transaction_t *transaction;
-+ journal_t *journal;
-+
-+ JBUFFER_TRACE(jh, "entry");
-+
-+ if ((transaction = jh->b_cp_transaction) == NULL) {
-+ JBUFFER_TRACE(jh, "not on transaction");
-+ goto out;
-+ }
-+
-+ journal = transaction->t_journal;
-+
-+ __buffer_unlink(jh);
-+
-+ if (transaction->t_checkpoint_list != NULL)
-+ goto out;
-+ JBUFFER_TRACE(jh, "transaction has no more buffers");
-+
-+ /* There is one special case to worry about: if we have just
-+ pulled the buffer off a committing transaction's forget list,
-+ then even if the checkpoint list is empty, the transaction
-+ obviously cannot be dropped! */
-+
-+ if (transaction == journal->j_committing_transaction) {
-+ JBUFFER_TRACE(jh, "belongs to committing transaction");
-+ goto out;
-+ }
-+
-+ /* OK, that was the last buffer for the transaction: we can now
-+ safely remove this transaction from the log */
-+
-+ __journal_drop_transaction(journal, transaction);
-+
-+ /* Just in case anybody was waiting for more transactions to be
-+ checkpointed... */
-+ wake_up(&journal->j_wait_logspace);
-+out:
-+ JBUFFER_TRACE(jh, "exit");
-+}
-+
-+void journal_remove_checkpoint(struct journal_head *jh)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ __journal_remove_checkpoint(jh);
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+/*
-+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
-+ * list so that we know when it is safe to clean the transaction out of
-+ * the log.
-+ *
-+ * Called with the journal locked.
-+ * Called with journal_datalist_lock held.
-+ */
-+void __journal_insert_checkpoint(struct journal_head *jh,
-+ transaction_t *transaction)
-+{
-+ JBUFFER_TRACE(jh, "entry");
-+ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
-+ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+ jh->b_cp_transaction = transaction;
-+
-+ if (!transaction->t_checkpoint_list) {
-+ jh->b_cpnext = jh->b_cpprev = jh;
-+ } else {
-+ jh->b_cpnext = transaction->t_checkpoint_list;
-+ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
-+ jh->b_cpprev->b_cpnext = jh;
-+ jh->b_cpnext->b_cpprev = jh;
-+ }
-+ transaction->t_checkpoint_list = jh;
-+}
-+
-+void journal_insert_checkpoint(struct journal_head *jh,
-+ transaction_t *transaction)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ __journal_insert_checkpoint(jh, transaction);
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+/*
-+ * We've finished with this transaction structure: adios...
-+ *
-+ * The transaction must have no links except for the checkpoint by this
-+ * point.
-+ *
-+ * Called with the journal locked.
-+ * Called with journal_datalist_lock held.
-+ */
-+
-+void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
-+{
-+ assert_spin_locked(&journal_datalist_lock);
-+ if (transaction->t_cpnext) {
-+ transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
-+ transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
-+ if (journal->j_checkpoint_transactions == transaction)
-+ journal->j_checkpoint_transactions =
-+ transaction->t_cpnext;
-+ if (journal->j_checkpoint_transactions == transaction)
-+ journal->j_checkpoint_transactions = NULL;
-+ }
-+
-+ J_ASSERT (transaction->t_ilist == NULL);
-+ J_ASSERT (transaction->t_buffers == NULL);
-+ J_ASSERT (transaction->t_sync_datalist == NULL);
-+ J_ASSERT (transaction->t_async_datalist == NULL);
-+ J_ASSERT (transaction->t_forget == NULL);
-+ J_ASSERT (transaction->t_iobuf_list == NULL);
-+ J_ASSERT (transaction->t_shadow_list == NULL);
-+ J_ASSERT (transaction->t_log_list == NULL);
-+ J_ASSERT (transaction->t_checkpoint_list == NULL);
-+ J_ASSERT (transaction->t_updates == 0);
-+
-+ J_ASSERT (transaction->t_journal->j_committing_transaction !=
-+ transaction);
-+
-+ jbd_debug (1, "Dropping transaction %d, all done\n",
-+ transaction->t_tid);
-+ kfree (transaction);
-+}
-+
-diff -ruP linux.mcp2/fs/jbd/commit.c linuxppc_2.4.19_final/fs/jbd/commit.c
---- linux.mcp2/fs/jbd/commit.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/commit.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,719 @@
-+/*
-+ * linux/fs/commit.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
-+ *
-+ * Copyright 1998 Red Hat corp --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Journal commit routines for the generic filesystem journaling code;
-+ * part of the ext2fs journaling system.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+#include <linux/smp_lock.h>
-+
-+extern spinlock_t journal_datalist_lock;
-+
-+/*
-+ * Default IO end handler for temporary BJ_IO buffer_heads.
-+ */
-+void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-+{
-+ BUFFER_TRACE(bh, "");
-+ mark_buffer_uptodate(bh, uptodate);
-+ unlock_buffer(bh);
-+}
-+
-+/*
-+ * journal_commit_transaction
-+ *
-+ * The primary function for committing a transaction to the log. This
-+ * function is called by the journal thread to begin a complete commit.
-+ */
-+void journal_commit_transaction(journal_t *journal)
-+{
-+ transaction_t *commit_transaction;
-+ struct journal_head *jh, *new_jh, *descriptor;
-+ struct journal_head *next_jh, *last_jh;
-+ struct buffer_head *wbuf[64];
-+ int bufs;
-+ int flags;
-+ int err;
-+ unsigned long blocknr;
-+ char *tagp = NULL;
-+ journal_header_t *header;
-+ journal_block_tag_t *tag = NULL;
-+ int space_left = 0;
-+ int first_tag = 0;
-+ int tag_flag;
-+ int i;
-+
-+ /*
-+ * First job: lock down the current transaction and wait for
-+ * all outstanding updates to complete.
-+ */
-+
-+ lock_journal(journal); /* Protect journal->j_running_transaction */
-+
-+#ifdef COMMIT_STATS
-+ spin_lock(&journal_datalist_lock);
-+ summarise_journal_usage(journal);
-+ spin_unlock(&journal_datalist_lock);
-+#endif
-+
-+ lock_kernel();
-+
-+ J_ASSERT (journal->j_running_transaction != NULL);
-+ J_ASSERT (journal->j_committing_transaction == NULL);
-+
-+ commit_transaction = journal->j_running_transaction;
-+ J_ASSERT (commit_transaction->t_state == T_RUNNING);
-+
-+ jbd_debug (1, "JBD: starting commit of transaction %d\n",
-+ commit_transaction->t_tid);
-+
-+ commit_transaction->t_state = T_LOCKED;
-+ while (commit_transaction->t_updates != 0) {
-+ unlock_journal(journal);
-+ sleep_on(&journal->j_wait_updates);
-+ lock_journal(journal);
-+ }
-+
-+ J_ASSERT (commit_transaction->t_outstanding_credits <=
-+ journal->j_max_transaction_buffers);
-+
-+ /* Do we need to erase the effects of a prior journal_flush? */
-+ if (journal->j_flags & JFS_FLUSHED) {
-+ jbd_debug(3, "super block updated\n");
-+ journal_update_superblock(journal, 1);
-+ } else {
-+ jbd_debug(3, "superblock not updated\n");
-+ }
-+
-+ /*
-+ * First thing we are allowed to do is to discard any remaining
-+ * BJ_Reserved buffers. Note, it is _not_ permissible to assume
-+ * that there are no such buffers: if a large filesystem
-+ * operation like a truncate needs to split itself over multiple
-+ * transactions, then it may try to do a journal_restart() while
-+ * there are still BJ_Reserved buffers outstanding. These must
-+ * be released cleanly from the current transaction.
-+ *
-+ * In this case, the filesystem must still reserve write access
-+ * again before modifying the buffer in the new transaction, but
-+ * we do not require it to remember exactly which old buffers it
-+ * has reserved. This is consistent with the existing behaviour
-+ * that multiple journal_get_write_access() calls to the same
-+ * buffer are perfectly permissable.
-+ */
-+
-+ while (commit_transaction->t_reserved_list) {
-+ jh = commit_transaction->t_reserved_list;
-+ JBUFFER_TRACE(jh, "reserved, unused: refile");
-+ journal_refile_buffer(jh);
-+ }
-+
-+ /*
-+ * Now try to drop any written-back buffers from the journal's
-+ * checkpoint lists. We do this *before* commit because it potentially
-+ * frees some memory
-+ */
-+ spin_lock(&journal_datalist_lock);
-+ __journal_clean_checkpoint_list(journal);
-+ spin_unlock(&journal_datalist_lock);
-+
-+ /* First part of the commit: force the revoke list out to disk.
-+ * The revoke code generates its own metadata blocks on disk for this.
-+ *
-+ * It is important that we do this while the transaction is
-+ * still locked. Generating the revoke records should not
-+ * generate any IO stalls, so this should be quick; and doing
-+ * the work while we have the transaction locked means that we
-+ * only ever have to maintain the revoke list for one
-+ * transaction at a time.
-+ */
-+
-+ jbd_debug (3, "JBD: commit phase 1\n");
-+
-+ journal_write_revoke_records(journal, commit_transaction);
-+
-+ /*
-+ * Now that we have built the revoke records, we can start
-+ * reusing the revoke list for a new running transaction. We
-+ * can now safely start committing the old transaction: time to
-+ * get a new running transaction for incoming filesystem updates
-+ */
-+
-+ commit_transaction->t_state = T_FLUSH;
-+
-+ wake_up(&journal->j_wait_transaction_locked);
-+
-+ journal->j_committing_transaction = commit_transaction;
-+ journal->j_running_transaction = NULL;
-+
-+ commit_transaction->t_log_start = journal->j_head;
-+
-+ unlock_kernel();
-+
-+ jbd_debug (3, "JBD: commit phase 2\n");
-+
-+ /*
-+ * Now start flushing things to disk, in the order they appear
-+ * on the transaction lists. Data blocks go first.
-+ */
-+
-+ /*
-+ * Whenever we unlock the journal and sleep, things can get added
-+ * onto ->t_datalist, so we have to keep looping back to write_out_data
-+ * until we *know* that the list is empty.
-+ */
-+write_out_data:
-+
-+ /*
-+ * Cleanup any flushed data buffers from the data list. Even in
-+ * abort mode, we want to flush this out as soon as possible.
-+ *
-+ * We take journal_datalist_lock to protect the lists from
-+ * journal_try_to_free_buffers().
-+ */
-+ spin_lock(&journal_datalist_lock);
-+
-+write_out_data_locked:
-+ bufs = 0;
-+ next_jh = commit_transaction->t_sync_datalist;
-+ if (next_jh == NULL)
-+ goto sync_datalist_empty;
-+ last_jh = next_jh->b_tprev;
-+
-+ do {
-+ struct buffer_head *bh;
-+
-+ jh = next_jh;
-+ next_jh = jh->b_tnext;
-+ bh = jh2bh(jh);
-+ if (!buffer_locked(bh)) {
-+ if (buffer_dirty(bh)) {
-+ BUFFER_TRACE(bh, "start journal writeout");
-+ atomic_inc(&bh->b_count);
-+ wbuf[bufs++] = bh;
-+ } else {
-+ BUFFER_TRACE(bh, "writeout complete: unfile");
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ __journal_remove_journal_head(bh);
-+ refile_buffer(bh);
-+ __brelse(bh);
-+ }
-+ }
-+ if (bufs == ARRAY_SIZE(wbuf)) {
-+ /*
-+ * Major speedup: start here on the next scan
-+ */
-+ J_ASSERT(commit_transaction->t_sync_datalist != 0);
-+ commit_transaction->t_sync_datalist = jh;
-+ break;
-+ }
-+ } while (jh != last_jh);
-+
-+ if (bufs || current->need_resched) {
-+ jbd_debug(2, "submit %d writes\n", bufs);
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ if (bufs)
-+ ll_rw_block(WRITE, bufs, wbuf);
-+ if (current->need_resched)
-+ schedule();
-+ journal_brelse_array(wbuf, bufs);
-+ lock_journal(journal);
-+ spin_lock(&journal_datalist_lock);
-+ if (bufs)
-+ goto write_out_data_locked;
-+ }
-+
-+ /*
-+ * Wait for all previously submitted IO on the data list to complete.
-+ */
-+ jh = commit_transaction->t_sync_datalist;
-+ if (jh == NULL)
-+ goto sync_datalist_empty;
-+
-+ do {
-+ struct buffer_head *bh;
-+ jh = jh->b_tprev; /* Wait on the last written */
-+ bh = jh2bh(jh);
-+ if (buffer_locked(bh)) {
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ wait_on_buffer(bh);
-+ /* the journal_head may have been removed now */
-+ lock_journal(journal);
-+ goto write_out_data;
-+ } else if (buffer_dirty(bh)) {
-+ goto write_out_data_locked;
-+ }
-+ } while (jh != commit_transaction->t_sync_datalist);
-+ goto write_out_data_locked;
-+
-+sync_datalist_empty:
-+ /*
-+ * Wait for all the async writepage data. As they become unlocked
-+ * in end_buffer_io_async(), the only place where they can be
-+ * reaped is in try_to_free_buffers(), and we're locked against
-+ * that.
-+ */
-+ while ((jh = commit_transaction->t_async_datalist)) {
-+ struct buffer_head *bh = jh2bh(jh);
-+ if (buffer_locked(bh)) {
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ wait_on_buffer(bh);
-+ lock_journal(journal);
-+ spin_lock(&journal_datalist_lock);
-+ continue; /* List may have changed */
-+ }
-+ if (jh->b_next_transaction) {
-+ /*
-+ * For writepage() buffers in journalled data mode: a
-+ * later transaction may want the buffer for "metadata"
-+ */
-+ __journal_refile_buffer(jh);
-+ } else {
-+ BUFFER_TRACE(bh, "finished async writeout: unfile");
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ __journal_remove_journal_head(bh);
-+ BUFFER_TRACE(bh, "finished async writeout: refile");
-+ /* It can sometimes be on BUF_LOCKED due to migration
-+ * from syncdata to asyncdata */
-+ if (bh->b_list != BUF_CLEAN)
-+ refile_buffer(bh);
-+ __brelse(bh);
-+ }
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+
-+ /*
-+ * If we found any dirty or locked buffers, then we should have
-+ * looped back up to the write_out_data label. If there weren't
-+ * any then journal_clean_data_list should have wiped the list
-+ * clean by now, so check that it is in fact empty.
-+ */
-+ J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-+ J_ASSERT (commit_transaction->t_async_datalist == NULL);
-+
-+ jbd_debug (3, "JBD: commit phase 3\n");
-+
-+ /*
-+ * Way to go: we have now written out all of the data for a
-+ * transaction! Now comes the tricky part: we need to write out
-+ * metadata. Loop over the transaction's entire buffer list:
-+ */
-+ commit_transaction->t_state = T_COMMIT;
-+
-+ descriptor = 0;
-+ bufs = 0;
-+ while (commit_transaction->t_buffers) {
-+
-+ /* Find the next buffer to be journaled... */
-+
-+ jh = commit_transaction->t_buffers;
-+
-+ /* If we're in abort mode, we just un-journal the buffer and
-+ release it for background writing. */
-+
-+ if (is_journal_aborted(journal)) {
-+ JBUFFER_TRACE(jh, "journal is aborting: refile");
-+ journal_refile_buffer(jh);
-+ /* If that was the last one, we need to clean up
-+ * any descriptor buffers which may have been
-+ * already allocated, even if we are now
-+ * aborting. */
-+ if (!commit_transaction->t_buffers)
-+ goto start_journal_io;
-+ continue;
-+ }
-+
-+ /* Make sure we have a descriptor block in which to
-+ record the metadata buffer. */
-+
-+ if (!descriptor) {
-+ struct buffer_head *bh;
-+
-+ J_ASSERT (bufs == 0);
-+
-+ jbd_debug(4, "JBD: get descriptor\n");
-+
-+ descriptor = journal_get_descriptor_buffer(journal);
-+ if (!descriptor) {
-+ __journal_abort_hard(journal);
-+ continue;
-+ }
-+
-+ bh = jh2bh(descriptor);
-+ jbd_debug(4, "JBD: got buffer %ld (%p)\n",
-+ bh->b_blocknr, bh->b_data);
-+ header = (journal_header_t *)&bh->b_data[0];
-+ header->h_magic = htonl(JFS_MAGIC_NUMBER);
-+ header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
-+ header->h_sequence = htonl(commit_transaction->t_tid);
-+
-+ tagp = &bh->b_data[sizeof(journal_header_t)];
-+ space_left = bh->b_size - sizeof(journal_header_t);
-+ first_tag = 1;
-+ set_bit(BH_JWrite, &bh->b_state);
-+ wbuf[bufs++] = bh;
-+
-+ /* Record it so that we can wait for IO
-+ completion later */
-+ BUFFER_TRACE(bh, "ph3: file as descriptor");
-+ journal_file_buffer(descriptor, commit_transaction,
-+ BJ_LogCtl);
-+ }
-+
-+ /* Where is the buffer to be written? */
-+
-+ err = journal_next_log_block(journal, &blocknr);
-+ /* If the block mapping failed, just abandon the buffer
-+ and repeat this loop: we'll fall into the
-+ refile-on-abort condition above. */
-+ if (err) {
-+ __journal_abort_hard(journal);
-+ continue;
-+ }
-+
-+ /* Bump b_count to prevent truncate from stumbling over
-+ the shadowed buffer! @@@ This can go if we ever get
-+ rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-+ atomic_inc(&jh2bh(jh)->b_count);
-+
-+ /* Make a temporary IO buffer with which to write it out
-+ (this will requeue both the metadata buffer and the
-+ temporary IO buffer). new_bh goes on BJ_IO*/
-+
-+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
-+ /*
-+ * akpm: journal_write_metadata_buffer() sets
-+ * new_bh->b_transaction to commit_transaction.
-+ * We need to clean this up before we release new_bh
-+ * (which is of type BJ_IO)
-+ */
-+ JBUFFER_TRACE(jh, "ph3: write metadata");
-+ flags = journal_write_metadata_buffer(commit_transaction,
-+ jh, &new_jh, blocknr);
-+ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
-+ set_bit(BH_Lock, &jh2bh(new_jh)->b_state);
-+ wbuf[bufs++] = jh2bh(new_jh);
-+
-+ /* Record the new block's tag in the current descriptor
-+ buffer */
-+
-+ tag_flag = 0;
-+ if (flags & 1)
-+ tag_flag |= JFS_FLAG_ESCAPE;
-+ if (!first_tag)
-+ tag_flag |= JFS_FLAG_SAME_UUID;
-+
-+ tag = (journal_block_tag_t *) tagp;
-+ tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
-+ tag->t_flags = htonl(tag_flag);
-+ tagp += sizeof(journal_block_tag_t);
-+ space_left -= sizeof(journal_block_tag_t);
-+
-+ if (first_tag) {
-+ memcpy (tagp, journal->j_uuid, 16);
-+ tagp += 16;
-+ space_left -= 16;
-+ first_tag = 0;
-+ }
-+
-+ /* If there's no more to do, or if the descriptor is full,
-+ let the IO rip! */
-+
-+ if (bufs == ARRAY_SIZE(wbuf) ||
-+ commit_transaction->t_buffers == NULL ||
-+ space_left < sizeof(journal_block_tag_t) + 16) {
-+
-+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
-+
-+ /* Write an end-of-descriptor marker before
-+ submitting the IOs. "tag" still points to
-+ the last tag we set up. */
-+
-+ tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
-+
-+start_journal_io:
-+ unlock_journal(journal);
-+ for (i=0; i<bufs; i++) {
-+ struct buffer_head *bh = wbuf[i];
-+ clear_bit(BH_Dirty, &bh->b_state);
-+ bh->b_end_io = journal_end_buffer_io_sync;
-+ submit_bh(WRITE, bh);
-+ }
-+ if (current->need_resched)
-+ schedule();
-+ lock_journal(journal);
-+
-+ /* Force a new descriptor to be generated next
-+ time round the loop. */
-+ descriptor = NULL;
-+ bufs = 0;
-+ }
-+ }
-+
-+ /* Lo and behold: we have just managed to send a transaction to
-+ the log. Before we can commit it, wait for the IO so far to
-+ complete. Control buffers being written are on the
-+ transaction's t_log_list queue, and metadata buffers are on
-+ the t_iobuf_list queue.
-+
-+ Wait for the transactions in reverse order. That way we are
-+ less likely to be woken up until all IOs have completed, and
-+ so we incur less scheduling load.
-+ */
-+
-+ jbd_debug(3, "JBD: commit phase 4\n");
-+
-+ /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */
-+ wait_for_iobuf:
-+ while (commit_transaction->t_iobuf_list != NULL) {
-+ struct buffer_head *bh;
-+ jh = commit_transaction->t_iobuf_list->b_tprev;
-+ bh = jh2bh(jh);
-+ if (buffer_locked(bh)) {
-+ unlock_journal(journal);
-+ wait_on_buffer(bh);
-+ lock_journal(journal);
-+ goto wait_for_iobuf;
-+ }
-+
-+ clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
-+
-+ JBUFFER_TRACE(jh, "ph4: unfile after journal write");
-+ journal_unfile_buffer(jh);
-+
-+ /*
-+ * akpm: don't put back a buffer_head with stale pointers
-+ * dangling around.
-+ */
-+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
-+ jh->b_transaction = NULL;
-+
-+ /*
-+ * ->t_iobuf_list should contain only dummy buffer_heads
-+ * which were created by journal_write_metadata_buffer().
-+ */
-+ bh = jh2bh(jh);
-+ BUFFER_TRACE(bh, "dumping temporary bh");
-+ journal_unlock_journal_head(jh);
-+ __brelse(bh);
-+ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
-+ put_unused_buffer_head(bh);
-+
-+ /* We also have to unlock and free the corresponding
-+ shadowed buffer */
-+ jh = commit_transaction->t_shadow_list->b_tprev;
-+ bh = jh2bh(jh);
-+ clear_bit(BH_JWrite, &bh->b_state);
-+ J_ASSERT_BH(bh, buffer_jdirty(bh));
-+
-+ /* The metadata is now released for reuse, but we need
-+ to remember it against this transaction so that when
-+ we finally commit, we can do any checkpointing
-+ required. */
-+ JBUFFER_TRACE(jh, "file as BJ_Forget");
-+ journal_file_buffer(jh, commit_transaction, BJ_Forget);
-+ /* Wake up any transactions which were waiting for this
-+ IO to complete */
-+ wake_up(&bh->b_wait);
-+ JBUFFER_TRACE(jh, "brelse shadowed buffer");
-+ __brelse(bh);
-+ }
-+
-+ J_ASSERT (commit_transaction->t_shadow_list == NULL);
-+
-+ jbd_debug(3, "JBD: commit phase 5\n");
-+
-+ /* Here we wait for the revoke record and descriptor record buffers */
-+ wait_for_ctlbuf:
-+ while (commit_transaction->t_log_list != NULL) {
-+ struct buffer_head *bh;
-+
-+ jh = commit_transaction->t_log_list->b_tprev;
-+ bh = jh2bh(jh);
-+ if (buffer_locked(bh)) {
-+ unlock_journal(journal);
-+ wait_on_buffer(bh);
-+ lock_journal(journal);
-+ goto wait_for_ctlbuf;
-+ }
-+
-+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
-+ clear_bit(BH_JWrite, &bh->b_state);
-+ journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ journal_unlock_journal_head(jh);
-+ put_bh(bh); /* One for getblk */
-+ }
-+
-+ jbd_debug(3, "JBD: commit phase 6\n");
-+
-+ if (is_journal_aborted(journal))
-+ goto skip_commit;
-+
-+ /* Done it all: now write the commit record. We should have
-+ * cleaned up our previous buffers by now, so if we are in abort
-+ * mode we can now just skip the rest of the journal write
-+ * entirely. */
-+
-+ descriptor = journal_get_descriptor_buffer(journal);
-+ if (!descriptor) {
-+ __journal_abort_hard(journal);
-+ goto skip_commit;
-+ }
-+
-+ /* AKPM: buglet - add `i' to tmp! */
-+ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
-+ journal_header_t *tmp =
-+ (journal_header_t*)jh2bh(descriptor)->b_data;
-+ tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
-+ tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
-+ tmp->h_sequence = htonl(commit_transaction->t_tid);
-+ }
-+
-+ unlock_journal(journal);
-+ JBUFFER_TRACE(descriptor, "write commit block");
-+ {
-+ struct buffer_head *bh = jh2bh(descriptor);
-+ clear_bit(BH_Dirty, &bh->b_state);
-+ bh->b_end_io = journal_end_buffer_io_sync;
-+ submit_bh(WRITE, bh);
-+ wait_on_buffer(bh);
-+ put_bh(bh); /* One for getblk() */
-+ journal_unlock_journal_head(descriptor);
-+ }
-+ lock_journal(journal);
-+
-+ /* End of a transaction! Finally, we can do checkpoint
-+ processing: any buffers committed as a result of this
-+ transaction can be removed from any checkpoint list it was on
-+ before. */
-+
-+skip_commit:
-+
-+ jbd_debug(3, "JBD: commit phase 7\n");
-+
-+ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
-+ J_ASSERT(commit_transaction->t_async_datalist == NULL);
-+ J_ASSERT(commit_transaction->t_buffers == NULL);
-+ J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
-+ J_ASSERT(commit_transaction->t_iobuf_list == NULL);
-+ J_ASSERT(commit_transaction->t_shadow_list == NULL);
-+ J_ASSERT(commit_transaction->t_log_list == NULL);
-+
-+ while (commit_transaction->t_forget) {
-+ transaction_t *cp_transaction;
-+ struct buffer_head *bh;
-+
-+ jh = commit_transaction->t_forget;
-+ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
-+ jh->b_transaction == journal->j_running_transaction);
-+
-+ /*
-+ * If there is undo-protected committed data against
-+ * this buffer, then we can remove it now. If it is a
-+ * buffer needing such protection, the old frozen_data
-+ * field now points to a committed version of the
-+ * buffer, so rotate that field to the new committed
-+ * data.
-+ *
-+ * Otherwise, we can just throw away the frozen data now.
-+ */
-+ if (jh->b_committed_data) {
-+ kfree(jh->b_committed_data);
-+ jh->b_committed_data = NULL;
-+ if (jh->b_frozen_data) {
-+ jh->b_committed_data = jh->b_frozen_data;
-+ jh->b_frozen_data = NULL;
-+ }
-+ } else if (jh->b_frozen_data) {
-+ kfree(jh->b_frozen_data);
-+ jh->b_frozen_data = NULL;
-+ }
-+
-+ spin_lock(&journal_datalist_lock);
-+ cp_transaction = jh->b_cp_transaction;
-+ if (cp_transaction) {
-+ JBUFFER_TRACE(jh, "remove from old cp transaction");
-+ J_ASSERT_JH(jh, commit_transaction != cp_transaction);
-+ __journal_remove_checkpoint(jh);
-+ }
-+
-+ /* Only re-checkpoint the buffer_head if it is marked
-+ * dirty. If the buffer was added to the BJ_Forget list
-+ * by journal_forget, it may no longer be dirty and
-+ * there's no point in keeping a checkpoint record for
-+ * it. */
-+ bh = jh2bh(jh);
-+ if (buffer_jdirty(bh)) {
-+ JBUFFER_TRACE(jh, "add to new checkpointing trans");
-+ __journal_insert_checkpoint(jh, commit_transaction);
-+ JBUFFER_TRACE(jh, "refile for checkpoint writeback");
-+ __journal_refile_buffer(jh);
-+ } else {
-+ J_ASSERT_BH(bh, !buffer_dirty(bh));
-+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = 0;
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+ }
-+
-+ /* Done with this transaction! */
-+
-+ jbd_debug(3, "JBD: commit phase 8\n");
-+
-+ J_ASSERT (commit_transaction->t_state == T_COMMIT);
-+ commit_transaction->t_state = T_FINISHED;
-+
-+ J_ASSERT (commit_transaction == journal->j_committing_transaction);
-+ journal->j_commit_sequence = commit_transaction->t_tid;
-+ journal->j_committing_transaction = NULL;
-+
-+ spin_lock(&journal_datalist_lock);
-+ if (commit_transaction->t_checkpoint_list == NULL) {
-+ __journal_drop_transaction(journal, commit_transaction);
-+ } else {
-+ if (journal->j_checkpoint_transactions == NULL) {
-+ journal->j_checkpoint_transactions = commit_transaction;
-+ commit_transaction->t_cpnext = commit_transaction;
-+ commit_transaction->t_cpprev = commit_transaction;
-+ } else {
-+ commit_transaction->t_cpnext =
-+ journal->j_checkpoint_transactions;
-+ commit_transaction->t_cpprev =
-+ commit_transaction->t_cpnext->t_cpprev;
-+ commit_transaction->t_cpnext->t_cpprev =
-+ commit_transaction;
-+ commit_transaction->t_cpprev->t_cpnext =
-+ commit_transaction;
-+ }
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+
-+ jbd_debug(1, "JBD: commit %d complete, head %d\n",
-+ journal->j_commit_sequence, journal->j_tail_sequence);
-+
-+ unlock_journal(journal);
-+ wake_up(&journal->j_wait_done_commit);
-+}
-diff -ruP linux.mcp2/fs/jbd/journal.c linuxppc_2.4.19_final/fs/jbd/journal.c
---- linux.mcp2/fs/jbd/journal.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/journal.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,1877 @@
-+/*
-+ * linux/fs/journal.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
-+ *
-+ * Copyright 1998 Red Hat corp --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Generic filesystem journal-writing code; part of the ext2fs
-+ * journaling system.
-+ *
-+ * This file manages journals: areas of disk reserved for logging
-+ * transactional updates. This includes the kernel journaling thread
-+ * which is responsible for scheduling updates to the log.
-+ *
-+ * We do not actually manage the physical storage of the journal in this
-+ * file: that is left to a per-journal policy function, which allows us
-+ * to store the journal within a filesystem-specified area for ext2
-+ * journaling (ext2 can use a reserved inode for storing the log).
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+#include <linux/smp_lock.h>
-+#include <linux/sched.h>
-+#include <linux/init.h>
-+#include <linux/mm.h>
-+#include <linux/slab.h>
-+#include <asm/uaccess.h>
-+#include <linux/proc_fs.h>
-+
-+EXPORT_SYMBOL(journal_start);
-+EXPORT_SYMBOL(journal_try_start);
-+EXPORT_SYMBOL(journal_restart);
-+EXPORT_SYMBOL(journal_extend);
-+EXPORT_SYMBOL(journal_stop);
-+EXPORT_SYMBOL(journal_lock_updates);
-+EXPORT_SYMBOL(journal_unlock_updates);
-+EXPORT_SYMBOL(journal_get_write_access);
-+EXPORT_SYMBOL(journal_get_create_access);
-+EXPORT_SYMBOL(journal_get_undo_access);
-+EXPORT_SYMBOL(journal_dirty_data);
-+EXPORT_SYMBOL(journal_dirty_metadata);
-+#if 0
-+EXPORT_SYMBOL(journal_release_buffer);
-+#endif
-+EXPORT_SYMBOL(journal_forget);
-+#if 0
-+EXPORT_SYMBOL(journal_sync_buffer);
-+#endif
-+EXPORT_SYMBOL(journal_flush);
-+EXPORT_SYMBOL(journal_revoke);
-+
-+EXPORT_SYMBOL(journal_init_dev);
-+EXPORT_SYMBOL(journal_init_inode);
-+EXPORT_SYMBOL(journal_update_format);
-+EXPORT_SYMBOL(journal_check_used_features);
-+EXPORT_SYMBOL(journal_check_available_features);
-+EXPORT_SYMBOL(journal_set_features);
-+EXPORT_SYMBOL(journal_create);
-+EXPORT_SYMBOL(journal_load);
-+EXPORT_SYMBOL(journal_destroy);
-+EXPORT_SYMBOL(journal_recover);
-+EXPORT_SYMBOL(journal_update_superblock);
-+EXPORT_SYMBOL(journal_abort);
-+EXPORT_SYMBOL(journal_errno);
-+EXPORT_SYMBOL(journal_ack_err);
-+EXPORT_SYMBOL(journal_clear_err);
-+EXPORT_SYMBOL(log_wait_commit);
-+EXPORT_SYMBOL(log_start_commit);
-+EXPORT_SYMBOL(journal_wipe);
-+EXPORT_SYMBOL(journal_blocks_per_page);
-+EXPORT_SYMBOL(journal_flushpage);
-+EXPORT_SYMBOL(journal_try_to_free_buffers);
-+EXPORT_SYMBOL(journal_bmap);
-+EXPORT_SYMBOL(journal_force_commit);
-+
-+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
-+
-+/*
-+ * journal_datalist_lock is used to protect data buffers:
-+ *
-+ * bh->b_transaction
-+ * bh->b_tprev
-+ * bh->b_tnext
-+ *
-+ * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
-+ * async wrt everything else.
-+ *
-+ * It is also used for checkpoint data, also to protect against
-+ * journal_try_to_free_buffer():
-+ *
-+ * bh->b_cp_transaction
-+ * bh->b_cpnext
-+ * bh->b_cpprev
-+ * transaction->t_checkpoint_list
-+ * transaction->t_cpnext
-+ * transaction->t_cpprev
-+ * journal->j_checkpoint_transactions
-+ *
-+ * It is global at this time rather than per-journal because it's
-+ * impossible for __journal_free_buffer to go from a buffer_head
-+ * back to a journal_t unracily (well, not true. Fix later)
-+ *
-+ *
-+ * The `datalist' and `checkpoint list' functions are quite
-+ * separate and we could use two spinlocks here.
-+ *
-+ * lru_list_lock nests inside journal_datalist_lock.
-+ */
-+spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
-+
-+/*
-+ * jh_splice_lock needs explantion.
-+ *
-+ * In a number of places we want to do things like:
-+ *
-+ * if (buffer_jbd(bh) && bh2jh(bh)->foo)
-+ *
-+ * This is racy on SMP, because another CPU could remove the journal_head
-+ * in the middle of this expression. We need locking.
-+ *
-+ * But we can greatly optimise the locking cost by testing BH_JBD
-+ * outside the lock. So, effectively:
-+ *
-+ * ret = 0;
-+ * if (buffer_jbd(bh)) {
-+ * spin_lock(&jh_splice_lock);
-+ * if (buffer_jbd(bh)) { (* Still there? *)
-+ * ret = bh2jh(bh)->foo;
-+ * }
-+ * spin_unlock(&jh_splice_lock);
-+ * }
-+ * return ret;
-+ *
-+ * Now, that protects us from races where another CPU can remove the
-+ * journal_head. But it doesn't defend us from the situation where another
-+ * CPU can *add* a journal_head. This is a correctness issue. But it's not
-+ * a problem because a) the calling code was *already* racy and b) it often
-+ * can't happen at the call site and c) the places where we add journal_heads
-+ * tend to be under external locking.
-+ */
-+spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;
-+
-+/*
-+ * List of all journals in the system. Protected by the BKL.
-+ */
-+static LIST_HEAD(all_journals);
-+
-+/*
-+ * Helper function used to manage commit timeouts
-+ */
-+
-+static void commit_timeout(unsigned long __data)
-+{
-+ struct task_struct * p = (struct task_struct *) __data;
-+
-+ wake_up_process(p);
-+}
-+
-+/* Static check for data structure consistency. There's no code
-+ * invoked --- we'll just get a linker failure if things aren't right.
-+ */
-+void __journal_internal_check(void)
-+{
-+ extern void journal_bad_superblock_size(void);
-+ if (sizeof(struct journal_superblock_s) != 1024)
-+ journal_bad_superblock_size();
-+}
-+
-+/*
-+ * kjournald: The main thread function used to manage a logging device
-+ * journal.
-+ *
-+ * This kernel thread is responsible for two things:
-+ *
-+ * 1) COMMIT: Every so often we need to commit the current state of the
-+ * filesystem to disk. The journal thread is responsible for writing
-+ * all of the metadata buffers to disk.
-+ *
-+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
-+ * of the data in that part of the log has been rewritten elsewhere on
-+ * the disk. Flushing these old buffers to reclaim space in the log is
-+ * known as checkpointing, and this thread is responsible for that job.
-+ */
-+
-+journal_t *current_journal; // AKPM: debug
-+
-+int kjournald(void *arg)
-+{
-+ journal_t *journal = (journal_t *) arg;
-+ transaction_t *transaction;
-+ struct timer_list timer;
-+
-+ current_journal = journal;
-+
-+ lock_kernel();
-+ daemonize();
-+ reparent_to_init();
-+ spin_lock_irq(¤t->sigmask_lock);
-+ sigfillset(¤t->blocked);
-+ recalc_sigpending(current);
-+ spin_unlock_irq(¤t->sigmask_lock);
-+
-+ sprintf(current->comm, "kjournald");
-+
-+ /* Set up an interval timer which can be used to trigger a
-+ commit wakeup after the commit interval expires */
-+ init_timer(&timer);
-+ timer.data = (unsigned long) current;
-+ timer.function = commit_timeout;
-+ journal->j_commit_timer = &timer;
-+
-+ /* Record that the journal thread is running */
-+ journal->j_task = current;
-+ wake_up(&journal->j_wait_done_commit);
-+
-+ printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
-+ journal->j_commit_interval / HZ);
-+ list_add(&journal->j_all_journals, &all_journals);
-+
-+ /* And now, wait forever for commit wakeup events. */
-+ while (1) {
-+ if (journal->j_flags & JFS_UNMOUNT)
-+ break;
-+
-+ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
-+ journal->j_commit_sequence, journal->j_commit_request);
-+
-+ if (journal->j_commit_sequence != journal->j_commit_request) {
-+ jbd_debug(1, "OK, requests differ\n");
-+ if (journal->j_commit_timer_active) {
-+ journal->j_commit_timer_active = 0;
-+ del_timer(journal->j_commit_timer);
-+ }
-+
-+ journal_commit_transaction(journal);
-+ continue;
-+ }
-+
-+ wake_up(&journal->j_wait_done_commit);
-+ interruptible_sleep_on(&journal->j_wait_commit);
-+
-+ jbd_debug(1, "kjournald wakes\n");
-+
-+ /* Were we woken up by a commit wakeup event? */
-+ if ((transaction = journal->j_running_transaction) != NULL &&
-+ time_after_eq(jiffies, transaction->t_expires)) {
-+ journal->j_commit_request = transaction->t_tid;
-+ jbd_debug(1, "woke because of timeout\n");
-+ }
-+ }
-+
-+ if (journal->j_commit_timer_active) {
-+ journal->j_commit_timer_active = 0;
-+ del_timer_sync(journal->j_commit_timer);
-+ }
-+
-+ list_del(&journal->j_all_journals);
-+
-+ journal->j_task = NULL;
-+ wake_up(&journal->j_wait_done_commit);
-+ unlock_kernel();
-+ jbd_debug(1, "Journal thread exiting.\n");
-+ return 0;
-+}
-+
-+static void journal_start_thread(journal_t *journal)
-+{
-+ kernel_thread(kjournald, (void *) journal,
-+ CLONE_VM | CLONE_FS | CLONE_FILES);
-+ while (!journal->j_task)
-+ sleep_on(&journal->j_wait_done_commit);
-+}
-+
-+static void journal_kill_thread(journal_t *journal)
-+{
-+ journal->j_flags |= JFS_UNMOUNT;
-+
-+ while (journal->j_task) {
-+ wake_up(&journal->j_wait_commit);
-+ sleep_on(&journal->j_wait_done_commit);
-+ }
-+}
-+
-+#if 0
-+
-+This is no longer needed - we do it in commit quite efficiently.
-+Note that if this function is resurrected, the loop needs to
-+be reorganised into the next_jh/last_jh algorithm.
-+
-+/*
-+ * journal_clean_data_list: cleanup after data IO.
-+ *
-+ * Once the IO system has finished writing the buffers on the transaction's
-+ * data list, we can remove those buffers from the list. This function
-+ * scans the list for such buffers and removes them cleanly.
-+ *
-+ * We assume that the journal is already locked.
-+ * We are called with journal_datalist_lock held.
-+ *
-+ * AKPM: This function looks inefficient. Approximately O(n^2)
-+ * for potentially thousands of buffers. It no longer shows on profiles
-+ * because these buffers are mainly dropped in journal_commit_transaction().
-+ */
-+
-+void __journal_clean_data_list(transaction_t *transaction)
-+{
-+ struct journal_head *jh, *next;
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+
-+restart:
-+ jh = transaction->t_sync_datalist;
-+ if (!jh)
-+ goto out;
-+ do {
-+ next = jh->b_tnext;
-+ if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
-+ struct buffer_head *bh = jh2bh(jh);
-+ BUFFER_TRACE(bh, "data writeout complete: unfile");
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ __journal_remove_journal_head(bh);
-+ refile_buffer(bh);
-+ __brelse(bh);
-+ goto restart;
-+ }
-+ jh = next;
-+ } while (transaction->t_sync_datalist &&
-+ jh != transaction->t_sync_datalist);
-+out:
-+ return;
-+}
-+#endif
-+
-+/*
-+ * journal_write_metadata_buffer: write a metadata buffer to the journal.
-+ *
-+ * Writes a metadata buffer to a given disk block. The actual IO is not
-+ * performed but a new buffer_head is constructed which labels the data
-+ * to be written with the correct destination disk block.
-+ *
-+ * Any magic-number escaping which needs to be done will cause a
-+ * copy-out here. If the buffer happens to start with the
-+ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
-+ * magic number is only written to the log for descripter blocks. In
-+ * this case, we copy the data and replace the first word with 0, and we
-+ * return a result code which indicates that this buffer needs to be
-+ * marked as an escaped buffer in the corresponding log descriptor
-+ * block. The missing word can then be restored when the block is read
-+ * during recovery.
-+ *
-+ * If the source buffer has already been modified by a new transaction
-+ * since we took the last commit snapshot, we use the frozen copy of
-+ * that data for IO. If we end up using the existing buffer_head's data
-+ * for the write, then we *have* to lock the buffer to prevent anyone
-+ * else from using and possibly modifying it while the IO is in
-+ * progress.
-+ *
-+ * The function returns a pointer to the buffer_heads to be used for IO.
-+ *
-+ * We assume that the journal has already been locked in this function.
-+ *
-+ * Return value:
-+ * <0: Error
-+ * >=0: Finished OK
-+ *
-+ * On success:
-+ * Bit 0 set == escape performed on the data
-+ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
-+ */
-+
-+static inline unsigned long virt_to_offset(void *p)
-+{return ((unsigned long) p) & ~PAGE_MASK;}
-+
-+int journal_write_metadata_buffer(transaction_t *transaction,
-+ struct journal_head *jh_in,
-+ struct journal_head **jh_out,
-+ int blocknr)
-+{
-+ int need_copy_out = 0;
-+ int done_copy_out = 0;
-+ int do_escape = 0;
-+ char *mapped_data;
-+ struct buffer_head *new_bh;
-+ struct journal_head * new_jh;
-+ struct page *new_page;
-+ unsigned int new_offset;
-+
-+ /*
-+ * The buffer really shouldn't be locked: only the current committing
-+ * transaction is allowed to write it, so nobody else is allowed
-+ * to do any IO.
-+ *
-+ * akpm: except if we're journalling data, and write() output is
-+ * also part of a shared mapping, and another thread has
-+ * decided to launch a writepage() against this buffer.
-+ */
-+ J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
-+
-+ /*
-+ * If a new transaction has already done a buffer copy-out, then
-+ * we use that version of the data for the commit.
-+ */
-+
-+ if (jh_in->b_frozen_data) {
-+ done_copy_out = 1;
-+ new_page = virt_to_page(jh_in->b_frozen_data);
-+ new_offset = virt_to_offset(jh_in->b_frozen_data);
-+ } else {
-+ new_page = jh2bh(jh_in)->b_page;
-+ new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
-+ }
-+
-+ mapped_data = ((char *) kmap(new_page)) + new_offset;
-+
-+ /*
-+ * Check for escaping
-+ */
-+ if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
-+ need_copy_out = 1;
-+ do_escape = 1;
-+ }
-+
-+ /*
-+ * Do we need to do a data copy?
-+ */
-+
-+ if (need_copy_out && !done_copy_out) {
-+ char *tmp;
-+ tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
-+
-+ jh_in->b_frozen_data = tmp;
-+ memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
-+
-+ /* If we get to this path, we'll always need the new
-+ address kmapped so that we can clear the escaped
-+ magic number below. */
-+ kunmap(new_page);
-+ new_page = virt_to_page(tmp);
-+ new_offset = virt_to_offset(tmp);
-+ mapped_data = ((char *) kmap(new_page)) + new_offset;
-+
-+ done_copy_out = 1;
-+ }
-+
-+ /*
-+ * Right, time to make up the new buffer_head.
-+ */
-+ do {
-+ new_bh = get_unused_buffer_head(0);
-+ if (!new_bh) {
-+ printk (KERN_NOTICE __FUNCTION__
-+ ": ENOMEM at get_unused_buffer_head, "
-+ "trying again.\n");
-+ current->policy |= SCHED_YIELD;
-+ schedule();
-+ }
-+ } while (!new_bh);
-+ /* keep subsequent assertions sane */
-+ new_bh->b_prev_free = 0;
-+ new_bh->b_next_free = 0;
-+ new_bh->b_state = 0;
-+ init_buffer(new_bh, NULL, NULL);
-+ atomic_set(&new_bh->b_count, 1);
-+ new_jh = journal_add_journal_head(new_bh);
-+
-+ set_bh_page(new_bh, new_page, new_offset);
-+
-+ new_jh->b_transaction = NULL;
-+ new_bh->b_size = jh2bh(jh_in)->b_size;
-+ new_bh->b_dev = transaction->t_journal->j_dev;
-+ new_bh->b_blocknr = blocknr;
-+ new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
-+
-+ *jh_out = new_jh;
-+
-+ /*
-+ * Did we need to do an escaping? Now we've done all the
-+ * copying, we can finally do so.
-+ */
-+
-+ if (do_escape)
-+ * ((unsigned int *) mapped_data) = 0;
-+ kunmap(new_page);
-+
-+ /*
-+ * The to-be-written buffer needs to get moved to the io queue,
-+ * and the original buffer whose contents we are shadowing or
-+ * copying is moved to the transaction's shadow queue.
-+ */
-+ JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-+ journal_file_buffer(jh_in, transaction, BJ_Shadow);
-+ JBUFFER_TRACE(new_jh, "file as BJ_IO");
-+ journal_file_buffer(new_jh, transaction, BJ_IO);
-+
-+ return do_escape | (done_copy_out << 1);
-+}
-+
-+/*
-+ * Allocation code for the journal file. Manage the space left in the
-+ * journal, so that we can begin checkpointing when appropriate.
-+ */
-+
-+/*
-+ * log_space_left: Return the number of free blocks left in the journal.
-+ *
-+ * Called with the journal already locked.
-+ */
-+
-+int log_space_left (journal_t *journal)
-+{
-+ int left = journal->j_free;
-+
-+ /* Be pessimistic here about the number of those free blocks
-+ * which might be required for log descriptor control blocks. */
-+
-+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-+
-+ left -= MIN_LOG_RESERVED_BLOCKS;
-+
-+ if (left <= 0)
-+ return 0;
-+ left -= (left >> 3);
-+ return left;
-+}
-+
-+/*
-+ * This function must be non-allocating for PF_MEMALLOC tasks
-+ */
-+tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
-+{
-+ tid_t target = journal->j_commit_request;
-+
-+ lock_kernel(); /* Protect journal->j_running_transaction */
-+
-+ /*
-+ * A NULL transaction asks us to commit the currently running
-+ * transaction, if there is one.
-+ */
-+ if (transaction)
-+ target = transaction->t_tid;
-+ else {
-+ transaction = journal->j_running_transaction;
-+ if (!transaction)
-+ goto out;
-+ target = transaction->t_tid;
-+ }
-+
-+ /*
-+ * Are we already doing a recent enough commit?
-+ */
-+ if (tid_geq(journal->j_commit_request, target))
-+ goto out;
-+
-+ /*
-+ * We want a new commit: OK, mark the request and wakup the
-+ * commit thread. We do _not_ do the commit ourselves.
-+ */
-+
-+ journal->j_commit_request = target;
-+ jbd_debug(1, "JBD: requesting commit %d/%d\n",
-+ journal->j_commit_request,
-+ journal->j_commit_sequence);
-+ wake_up(&journal->j_wait_commit);
-+
-+out:
-+ unlock_kernel();
-+ return target;
-+}
-+
-+/*
-+ * Wait for a specified commit to complete.
-+ * The caller may not hold the journal lock.
-+ */
-+void log_wait_commit (journal_t *journal, tid_t tid)
-+{
-+ lock_kernel();
-+#ifdef CONFIG_JBD_DEBUG
-+ lock_journal(journal);
-+ if (!tid_geq(journal->j_commit_request, tid)) {
-+ printk(KERN_EMERG __FUNCTION__
-+ ": error: j_commit_request=%d, tid=%d\n",
-+ journal->j_commit_request, tid);
-+ }
-+ unlock_journal(journal);
-+#endif
-+ while (tid_gt(tid, journal->j_commit_sequence)) {
-+ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
-+ tid, journal->j_commit_sequence);
-+ wake_up(&journal->j_wait_commit);
-+ sleep_on(&journal->j_wait_done_commit);
-+ }
-+ unlock_kernel();
-+}
-+
-+/*
-+ * Log buffer allocation routines:
-+ */
-+
-+int journal_next_log_block(journal_t *journal, unsigned long *retp)
-+{
-+ unsigned long blocknr;
-+
-+ J_ASSERT(journal->j_free > 1);
-+
-+ blocknr = journal->j_head;
-+ journal->j_head++;
-+ journal->j_free--;
-+ if (journal->j_head == journal->j_last)
-+ journal->j_head = journal->j_first;
-+ return journal_bmap(journal, blocknr, retp);
-+}
-+
-+/*
-+ * Conversion of logical to physical block numbers for the journal
-+ *
-+ * On external journals the journal blocks are identity-mapped, so
-+ * this is a no-op. If needed, we can use j_blk_offset - everything is
-+ * ready.
-+ */
-+int journal_bmap(journal_t *journal, unsigned long blocknr,
-+ unsigned long *retp)
-+{
-+ int err = 0;
-+ unsigned long ret;
-+
-+ if (journal->j_inode) {
-+ ret = bmap(journal->j_inode, blocknr);
-+ if (ret)
-+ *retp = ret;
-+ else {
-+ printk (KERN_ALERT __FUNCTION__
-+ ": journal block not found "
-+ "at offset %lu on %s\n",
-+ blocknr, bdevname(journal->j_dev));
-+ err = -EIO;
-+ __journal_abort_soft(journal, err);
-+ }
-+ } else {
-+ *retp = blocknr; /* +journal->j_blk_offset */
-+ }
-+ return err;
-+}
-+
-+/*
-+ * We play buffer_head aliasing tricks to write data/metadata blocks to
-+ * the journal without copying their contents, but for journal
-+ * descriptor blocks we do need to generate bona fide buffers.
-+ *
-+ * We return a jh whose bh is locked and ready to be populated.
-+ */
-+
-+struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
-+{
-+ struct buffer_head *bh;
-+ unsigned long blocknr;
-+ int err;
-+
-+ err = journal_next_log_block(journal, &blocknr);
-+
-+ if (err)
-+ return NULL;
-+
-+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
-+ lock_buffer(bh);
-+ BUFFER_TRACE(bh, "return this buffer");
-+ return journal_add_journal_head(bh);
-+}
-+
-+/*
-+ * Management for journal control blocks: functions to create and
-+ * destroy journal_t structures, and to initialise and read existing
-+ * journal blocks from disk. */
-+
-+/* First: create and setup a journal_t object in memory. We initialise
-+ * very few fields yet: that has to wait until we have created the
-+ * journal structures from from scratch, or loaded them from disk. */
-+
-+static journal_t * journal_init_common (void)
-+{
-+ journal_t *journal;
-+ int err;
-+
-+ MOD_INC_USE_COUNT;
-+
-+ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
-+ if (!journal)
-+ goto fail;
-+ memset(journal, 0, sizeof(*journal));
-+
-+ init_waitqueue_head(&journal->j_wait_transaction_locked);
-+ init_waitqueue_head(&journal->j_wait_logspace);
-+ init_waitqueue_head(&journal->j_wait_done_commit);
-+ init_waitqueue_head(&journal->j_wait_checkpoint);
-+ init_waitqueue_head(&journal->j_wait_commit);
-+ init_waitqueue_head(&journal->j_wait_updates);
-+ init_MUTEX(&journal->j_barrier);
-+ init_MUTEX(&journal->j_checkpoint_sem);
-+ init_MUTEX(&journal->j_sem);
-+
-+ journal->j_commit_interval = (HZ * 5);
-+
-+ /* The journal is marked for error until we succeed with recovery! */
-+ journal->j_flags = JFS_ABORT;
-+
-+ /* Set up a default-sized revoke table for the new mount. */
-+ err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
-+ if (err) {
-+ kfree(journal);
-+ goto fail;
-+ }
-+ return journal;
-+fail:
-+ MOD_DEC_USE_COUNT;
-+ return NULL;
-+}
-+
-+/* journal_init_dev and journal_init_inode:
-+ *
-+ * Create a journal structure assigned some fixed set of disk blocks to
-+ * the journal. We don't actually touch those disk blocks yet, but we
-+ * need to set up all of the mapping information to tell the journaling
-+ * system where the journal blocks are.
-+ *
-+ * journal_init_dev creates a journal which maps a fixed contiguous
-+ * range of blocks on an arbitrary block device.
-+ *
-+ * journal_init_inode creates a journal which maps an on-disk inode as
-+ * the journal. The inode must exist already, must support bmap() and
-+ * must have all data blocks preallocated.
-+ */
-+
-+journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
-+ int start, int len, int blocksize)
-+{
-+ journal_t *journal = journal_init_common();
-+ struct buffer_head *bh;
-+
-+ if (!journal)
-+ return NULL;
-+
-+ journal->j_dev = dev;
-+ journal->j_fs_dev = fs_dev;
-+ journal->j_blk_offset = start;
-+ journal->j_maxlen = len;
-+ journal->j_blocksize = blocksize;
-+
-+ bh = getblk(journal->j_dev, start, journal->j_blocksize);
-+ J_ASSERT(bh != NULL);
-+ journal->j_sb_buffer = bh;
-+ journal->j_superblock = (journal_superblock_t *)bh->b_data;
-+
-+ return journal;
-+}
-+
-+journal_t * journal_init_inode (struct inode *inode)
-+{
-+ struct buffer_head *bh;
-+ journal_t *journal = journal_init_common();
-+ int err;
-+ unsigned long blocknr;
-+
-+ if (!journal)
-+ return NULL;
-+
-+ journal->j_dev = inode->i_dev;
-+ journal->j_fs_dev = inode->i_dev;
-+ journal->j_inode = inode;
-+ jbd_debug(1,
-+ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
-+ journal, bdevname(inode->i_dev), inode->i_ino,
-+ (long long) inode->i_size,
-+ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
-+
-+ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
-+ journal->j_blocksize = inode->i_sb->s_blocksize;
-+
-+ err = journal_bmap(journal, 0, &blocknr);
-+ /* If that failed, give up */
-+ if (err) {
-+ printk(KERN_ERR __FUNCTION__ ": Cannnot locate journal "
-+ "superblock\n");
-+ kfree(journal);
-+ return NULL;
-+ }
-+
-+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
-+ J_ASSERT(bh != NULL);
-+ journal->j_sb_buffer = bh;
-+ journal->j_superblock = (journal_superblock_t *)bh->b_data;
-+
-+ return journal;
-+}
-+
-+/*
-+ * If the journal init or create aborts, we need to mark the journal
-+ * superblock as being NULL to prevent the journal destroy from writing
-+ * back a bogus superblock.
-+ */
-+static void journal_fail_superblock (journal_t *journal)
-+{
-+ struct buffer_head *bh = journal->j_sb_buffer;
-+ brelse(bh);
-+ journal->j_sb_buffer = NULL;
-+}
-+
-+/*
-+ * Given a journal_t structure, initialise the various fields for
-+ * startup of a new journaling session. We use this both when creating
-+ * a journal, and after recovering an old journal to reset it for
-+ * subsequent use.
-+ */
-+
-+static int journal_reset (journal_t *journal)
-+{
-+ journal_superblock_t *sb = journal->j_superblock;
-+ unsigned int first, last;
-+
-+ first = ntohl(sb->s_first);
-+ last = ntohl(sb->s_maxlen);
-+
-+ journal->j_first = first;
-+ journal->j_last = last;
-+
-+ journal->j_head = first;
-+ journal->j_tail = first;
-+ journal->j_free = last - first;
-+
-+ journal->j_tail_sequence = journal->j_transaction_sequence;
-+ journal->j_commit_sequence = journal->j_transaction_sequence - 1;
-+ journal->j_commit_request = journal->j_commit_sequence;
-+
-+ journal->j_max_transaction_buffers = journal->j_maxlen / 4;
-+
-+ /* Add the dynamic fields and write it to disk. */
-+ journal_update_superblock(journal, 1);
-+
-+ lock_journal(journal);
-+ journal_start_thread(journal);
-+ unlock_journal(journal);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Given a journal_t structure which tells us which disk blocks we can
-+ * use, create a new journal superblock and initialise all of the
-+ * journal fields from scratch. */
-+
-+int journal_create (journal_t *journal)
-+{
-+ unsigned long blocknr;
-+ struct buffer_head *bh;
-+ journal_superblock_t *sb;
-+ int i, err;
-+
-+ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
-+ printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-+ journal->j_maxlen);
-+ journal_fail_superblock(journal);
-+ return -EINVAL;
-+ }
-+
-+ if (journal->j_inode == NULL) {
-+ /*
-+ * We don't know what block to start at!
-+ */
-+ printk(KERN_EMERG __FUNCTION__
-+ ": creation of journal on external device!\n");
-+ BUG();
-+ }
-+
-+ /* Zero out the entire journal on disk. We cannot afford to
-+ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
-+ jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-+ for (i = 0; i < journal->j_maxlen; i++) {
-+ err = journal_bmap(journal, i, &blocknr);
-+ if (err)
-+ return err;
-+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
-+ wait_on_buffer(bh);
-+ memset (bh->b_data, 0, journal->j_blocksize);
-+ BUFFER_TRACE(bh, "marking dirty");
-+ mark_buffer_dirty(bh);
-+ BUFFER_TRACE(bh, "marking uptodate");
-+ mark_buffer_uptodate(bh, 1);
-+ __brelse(bh);
-+ }
-+
-+ sync_dev(journal->j_dev);
-+ jbd_debug(1, "JBD: journal cleared.\n");
-+
-+ /* OK, fill in the initial static fields in the new superblock */
-+ sb = journal->j_superblock;
-+
-+ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER);
-+ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
-+
-+ sb->s_blocksize = htonl(journal->j_blocksize);
-+ sb->s_maxlen = htonl(journal->j_maxlen);
-+ sb->s_first = htonl(1);
-+
-+ journal->j_transaction_sequence = 1;
-+
-+ journal->j_flags &= ~JFS_ABORT;
-+ journal->j_format_version = 2;
-+
-+ return journal_reset(journal);
-+}
-+
-+/*
-+ * Update a journal's dynamic superblock fields and write it to disk,
-+ * optionally waiting for the IO to complete.
-+*/
-+
-+void journal_update_superblock(journal_t *journal, int wait)
-+{
-+ journal_superblock_t *sb = journal->j_superblock;
-+ struct buffer_head *bh = journal->j_sb_buffer;
-+
-+ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
-+ journal->j_tail, journal->j_tail_sequence, journal->j_errno);
-+
-+ sb->s_sequence = htonl(journal->j_tail_sequence);
-+ sb->s_start = htonl(journal->j_tail);
-+ sb->s_errno = htonl(journal->j_errno);
-+
-+ BUFFER_TRACE(bh, "marking dirty");
-+ mark_buffer_dirty(bh);
-+ ll_rw_block(WRITE, 1, &bh);
-+ if (wait)
-+ wait_on_buffer(bh);
-+
-+ /* If we have just flushed the log (by marking s_start==0), then
-+ * any future commit will have to be careful to update the
-+ * superblock again to re-record the true start of the log. */
-+
-+ if (sb->s_start)
-+ journal->j_flags &= ~JFS_FLUSHED;
-+ else
-+ journal->j_flags |= JFS_FLUSHED;
-+}
-+
-+
-+/*
-+ * Read the superblock for a given journal, performing initial
-+ * validation of the format.
-+ */
-+
-+static int journal_get_superblock(journal_t *journal)
-+{
-+ struct buffer_head *bh;
-+ journal_superblock_t *sb;
-+ int err = -EIO;
-+
-+ bh = journal->j_sb_buffer;
-+
-+ J_ASSERT(bh != NULL);
-+ if (!buffer_uptodate(bh)) {
-+ ll_rw_block(READ, 1, &bh);
-+ wait_on_buffer(bh);
-+ if (!buffer_uptodate(bh)) {
-+ printk (KERN_ERR
-+ "JBD: IO error reading journal superblock\n");
-+ goto out;
-+ }
-+ }
-+
-+ sb = journal->j_superblock;
-+
-+ err = -EINVAL;
-+
-+ if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
-+ sb->s_blocksize != htonl(journal->j_blocksize)) {
-+ printk(KERN_WARNING "JBD: no valid journal superblock found\n");
-+ goto out;
-+ }
-+
-+ switch(ntohl(sb->s_header.h_blocktype)) {
-+ case JFS_SUPERBLOCK_V1:
-+ journal->j_format_version = 1;
-+ break;
-+ case JFS_SUPERBLOCK_V2:
-+ journal->j_format_version = 2;
-+ break;
-+ default:
-+ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
-+ goto out;
-+ }
-+
-+ if (ntohl(sb->s_maxlen) < journal->j_maxlen)
-+ journal->j_maxlen = ntohl(sb->s_maxlen);
-+ else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
-+ printk (KERN_WARNING "JBD: journal file too short\n");
-+ goto out;
-+ }
-+
-+ return 0;
-+
-+out:
-+ journal_fail_superblock(journal);
-+ return err;
-+}
-+
-+/*
-+ * Load the on-disk journal superblock and read the key fields into the
-+ * journal_t.
-+ */
-+
-+static int load_superblock(journal_t *journal)
-+{
-+ int err;
-+ journal_superblock_t *sb;
-+
-+ err = journal_get_superblock(journal);
-+ if (err)
-+ return err;
-+
-+ sb = journal->j_superblock;
-+
-+ journal->j_tail_sequence = ntohl(sb->s_sequence);
-+ journal->j_tail = ntohl(sb->s_start);
-+ journal->j_first = ntohl(sb->s_first);
-+ journal->j_last = ntohl(sb->s_maxlen);
-+ journal->j_errno = ntohl(sb->s_errno);
-+
-+ return 0;
-+}
-+
-+
-+/*
-+ * Given a journal_t structure which tells us which disk blocks contain
-+ * a journal, read the journal from disk to initialise the in-memory
-+ * structures.
-+ */
-+
-+int journal_load(journal_t *journal)
-+{
-+ int err;
-+
-+ err = load_superblock(journal);
-+ if (err)
-+ return err;
-+
-+ /* If this is a V2 superblock, then we have to check the
-+ * features flags on it. */
-+
-+ if (journal->j_format_version >= 2) {
-+ journal_superblock_t *sb = journal->j_superblock;
-+
-+ if ((sb->s_feature_ro_compat &
-+ ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
-+ (sb->s_feature_incompat &
-+ ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
-+ printk (KERN_WARNING
-+ "JBD: Unrecognised features on journal\n");
-+ return -EINVAL;
-+ }
-+ }
-+
-+ /* Let the recovery code check whether it needs to recover any
-+ * data from the journal. */
-+ if (journal_recover(journal))
-+ goto recovery_error;
-+
-+ /* OK, we've finished with the dynamic journal bits:
-+ * reinitialise the dynamic contents of the superblock in memory
-+ * and reset them on disk. */
-+ if (journal_reset(journal))
-+ goto recovery_error;
-+
-+ journal->j_flags &= ~JFS_ABORT;
-+ journal->j_flags |= JFS_LOADED;
-+ return 0;
-+
-+recovery_error:
-+ printk (KERN_WARNING "JBD: recovery failed\n");
-+ return -EIO;
-+}
-+
-+/*
-+ * Release a journal_t structure once it is no longer in use by the
-+ * journaled object.
-+ */
-+
-+void journal_destroy (journal_t *journal)
-+{
-+ /* Wait for the commit thread to wake up and die. */
-+ journal_kill_thread(journal);
-+
-+ /* Force a final log commit */
-+ if (journal->j_running_transaction)
-+ journal_commit_transaction(journal);
-+
-+ /* Force any old transactions to disk */
-+ lock_journal(journal);
-+ while (journal->j_checkpoint_transactions != NULL)
-+ log_do_checkpoint(journal, 1);
-+
-+ J_ASSERT(journal->j_running_transaction == NULL);
-+ J_ASSERT(journal->j_committing_transaction == NULL);
-+ J_ASSERT(journal->j_checkpoint_transactions == NULL);
-+
-+ /* We can now mark the journal as empty. */
-+ journal->j_tail = 0;
-+ journal->j_tail_sequence = ++journal->j_transaction_sequence;
-+ if (journal->j_sb_buffer) {
-+ journal_update_superblock(journal, 1);
-+ brelse(journal->j_sb_buffer);
-+ }
-+
-+ if (journal->j_inode)
-+ iput(journal->j_inode);
-+ if (journal->j_revoke)
-+ journal_destroy_revoke(journal);
-+
-+ unlock_journal(journal);
-+ kfree(journal);
-+ MOD_DEC_USE_COUNT;
-+}
-+
-+
-+/* Published API: Check whether the journal uses all of a given set of
-+ * features. Return true (non-zero) if it does. */
-+
-+int journal_check_used_features (journal_t *journal, unsigned long compat,
-+ unsigned long ro, unsigned long incompat)
-+{
-+ journal_superblock_t *sb;
-+
-+ if (!compat && !ro && !incompat)
-+ return 1;
-+ if (journal->j_format_version == 1)
-+ return 0;
-+
-+ sb = journal->j_superblock;
-+
-+ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
-+ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
-+ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
-+ return 1;
-+
-+ return 0;
-+}
-+
-+/* Published API: Check whether the journaling code supports the use of
-+ * all of a given set of features on this journal. Return true
-+ * (non-zero) if it can. */
-+
-+int journal_check_available_features (journal_t *journal, unsigned long compat,
-+ unsigned long ro, unsigned long incompat)
-+{
-+ journal_superblock_t *sb;
-+
-+ if (!compat && !ro && !incompat)
-+ return 1;
-+
-+ sb = journal->j_superblock;
-+
-+ /* We can support any known requested features iff the
-+ * superblock is in version 2. Otherwise we fail to support any
-+ * extended sb features. */
-+
-+ if (journal->j_format_version != 2)
-+ return 0;
-+
-+ if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
-+ (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
-+ (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
-+ return 1;
-+
-+ return 0;
-+}
-+
-+/* Published API: Mark a given journal feature as present on the
-+ * superblock. Returns true if the requested features could be set. */
-+
-+int journal_set_features (journal_t *journal, unsigned long compat,
-+ unsigned long ro, unsigned long incompat)
-+{
-+ journal_superblock_t *sb;
-+
-+ if (journal_check_used_features(journal, compat, ro, incompat))
-+ return 1;
-+
-+ if (!journal_check_available_features(journal, compat, ro, incompat))
-+ return 0;
-+
-+ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
-+ compat, ro, incompat);
-+
-+ sb = journal->j_superblock;
-+
-+ sb->s_feature_compat |= cpu_to_be32(compat);
-+ sb->s_feature_ro_compat |= cpu_to_be32(ro);
-+ sb->s_feature_incompat |= cpu_to_be32(incompat);
-+
-+ return 1;
-+}
-+
-+
-+/*
-+ * Published API:
-+ * Given an initialised but unloaded journal struct, poke about in the
-+ * on-disk structure to update it to the most recent supported version.
-+ */
-+
-+int journal_update_format (journal_t *journal)
-+{
-+ journal_superblock_t *sb;
-+ int err;
-+
-+ err = journal_get_superblock(journal);
-+ if (err)
-+ return err;
-+
-+ sb = journal->j_superblock;
-+
-+ switch (ntohl(sb->s_header.h_blocktype)) {
-+ case JFS_SUPERBLOCK_V2:
-+ return 0;
-+ case JFS_SUPERBLOCK_V1:
-+ return journal_convert_superblock_v1(journal, sb);
-+ default:
-+ break;
-+ }
-+ return -EINVAL;
-+}
-+
-+static int journal_convert_superblock_v1(journal_t *journal,
-+ journal_superblock_t *sb)
-+{
-+ int offset, blocksize;
-+ struct buffer_head *bh;
-+
-+ printk(KERN_WARNING
-+ "JBD: Converting superblock from version 1 to 2.\n");
-+
-+ /* Pre-initialise new fields to zero */
-+ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
-+ blocksize = ntohl(sb->s_blocksize);
-+ memset(&sb->s_feature_compat, 0, blocksize-offset);
-+
-+ sb->s_nr_users = cpu_to_be32(1);
-+ sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
-+ journal->j_format_version = 2;
-+
-+ bh = journal->j_sb_buffer;
-+ BUFFER_TRACE(bh, "marking dirty");
-+ mark_buffer_dirty(bh);
-+ ll_rw_block(WRITE, 1, &bh);
-+ wait_on_buffer(bh);
-+ return 0;
-+}
-+
-+
-+/*
-+ * Flush all data for a given journal to disk and empty the journal.
-+ * Filesystems can use this when remounting readonly to ensure that
-+ * recovery does not need to happen on remount.
-+ */
-+
-+int journal_flush (journal_t *journal)
-+{
-+ int err = 0;
-+ transaction_t *transaction = NULL;
-+ unsigned long old_tail;
-+
-+ lock_kernel();
-+
-+ /* Force everything buffered to the log... */
-+ if (journal->j_running_transaction) {
-+ transaction = journal->j_running_transaction;
-+ log_start_commit(journal, transaction);
-+ } else if (journal->j_committing_transaction)
-+ transaction = journal->j_committing_transaction;
-+
-+ /* Wait for the log commit to complete... */
-+ if (transaction)
-+ log_wait_commit(journal, transaction->t_tid);
-+
-+ /* ...and flush everything in the log out to disk. */
-+ lock_journal(journal);
-+ while (!err && journal->j_checkpoint_transactions != NULL)
-+ err = log_do_checkpoint(journal, journal->j_maxlen);
-+ cleanup_journal_tail(journal);
-+
-+ /* Finally, mark the journal as really needing no recovery.
-+ * This sets s_start==0 in the underlying superblock, which is
-+ * the magic code for a fully-recovered superblock. Any future
-+ * commits of data to the journal will restore the current
-+ * s_start value. */
-+ old_tail = journal->j_tail;
-+ journal->j_tail = 0;
-+ journal_update_superblock(journal, 1);
-+ journal->j_tail = old_tail;
-+
-+ unlock_journal(journal);
-+
-+ J_ASSERT(!journal->j_running_transaction);
-+ J_ASSERT(!journal->j_committing_transaction);
-+ J_ASSERT(!journal->j_checkpoint_transactions);
-+ J_ASSERT(journal->j_head == journal->j_tail);
-+ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-+
-+ unlock_kernel();
-+
-+ return err;
-+}
-+
-+/*
-+ * Wipe out all of the contents of a journal, safely. This will produce
-+ * a warning if the journal contains any valid recovery information.
-+ * Must be called between journal_init_*() and journal_load().
-+ *
-+ * If (write) is non-zero, then we wipe out the journal on disk; otherwise
-+ * we merely suppress recovery.
-+ */
-+
-+int journal_wipe (journal_t *journal, int write)
-+{
-+ journal_superblock_t *sb;
-+ int err = 0;
-+
-+ J_ASSERT (!(journal->j_flags & JFS_LOADED));
-+
-+ err = load_superblock(journal);
-+ if (err)
-+ return err;
-+
-+ sb = journal->j_superblock;
-+
-+ if (!journal->j_tail)
-+ goto no_recovery;
-+
-+ printk (KERN_WARNING "JBD: %s recovery information on journal\n",
-+ write ? "Clearing" : "Ignoring");
-+
-+ err = journal_skip_recovery(journal);
-+ if (write)
-+ journal_update_superblock(journal, 1);
-+
-+ no_recovery:
-+ return err;
-+}
-+
-+/*
-+ * journal_dev_name: format a character string to describe on what
-+ * device this journal is present.
-+ */
-+
-+const char * journal_dev_name(journal_t *journal)
-+{
-+ kdev_t dev;
-+
-+ if (journal->j_inode)
-+ dev = journal->j_inode->i_dev;
-+ else
-+ dev = journal->j_dev;
-+
-+ return bdevname(dev);
-+}
-+
-+/*
-+ * journal_abort: perform a complete, immediate shutdown of the ENTIRE
-+ * journal (not of a single transaction). This operation cannot be
-+ * undone without closing and reopening the journal.
-+ *
-+ * The journal_abort function is intended to support higher level error
-+ * recovery mechanisms such as the ext2/ext3 remount-readonly error
-+ * mode.
-+ *
-+ * Journal abort has very specific semantics. Any existing dirty,
-+ * unjournaled buffers in the main filesystem will still be written to
-+ * disk by bdflush, but the journaling mechanism will be suspended
-+ * immediately and no further transaction commits will be honoured.
-+ *
-+ * Any dirty, journaled buffers will be written back to disk without
-+ * hitting the journal. Atomicity cannot be guaranteed on an aborted
-+ * filesystem, but we _do_ attempt to leave as much data as possible
-+ * behind for fsck to use for cleanup.
-+ *
-+ * Any attempt to get a new transaction handle on a journal which is in
-+ * ABORT state will just result in an -EROFS error return. A
-+ * journal_stop on an existing handle will return -EIO if we have
-+ * entered abort state during the update.
-+ *
-+ * Recursive transactions are not disturbed by journal abort until the
-+ * final journal_stop, which will receive the -EIO error.
-+ *
-+ * Finally, the journal_abort call allows the caller to supply an errno
-+ * which will be recored (if possible) in the journal superblock. This
-+ * allows a client to record failure conditions in the middle of a
-+ * transaction without having to complete the transaction to record the
-+ * failure to disk. ext3_error, for example, now uses this
-+ * functionality.
-+ *
-+ * Errors which originate from within the journaling layer will NOT
-+ * supply an errno; a null errno implies that absolutely no further
-+ * writes are done to the journal (unless there are any already in
-+ * progress).
-+ */
-+
-+/* Quick version for internal journal use (doesn't lock the journal).
-+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
-+ * and don't attempt to make any other journal updates. */
-+void __journal_abort_hard (journal_t *journal)
-+{
-+ transaction_t *transaction;
-+
-+ if (journal->j_flags & JFS_ABORT)
-+ return;
-+
-+ printk (KERN_ERR "Aborting journal on device %s.\n",
-+ journal_dev_name(journal));
-+
-+ journal->j_flags |= JFS_ABORT;
-+ transaction = journal->j_running_transaction;
-+ if (transaction)
-+ log_start_commit(journal, transaction);
-+}
-+
-+/* Soft abort: record the abort error status in the journal superblock,
-+ * but don't do any other IO. */
-+void __journal_abort_soft (journal_t *journal, int errno)
-+{
-+ if (journal->j_flags & JFS_ABORT)
-+ return;
-+
-+ if (!journal->j_errno)
-+ journal->j_errno = errno;
-+
-+ __journal_abort_hard(journal);
-+
-+ if (errno)
-+ journal_update_superblock(journal, 1);
-+}
-+
-+/* Full version for external use */
-+void journal_abort (journal_t *journal, int errno)
-+{
-+ lock_journal(journal);
-+ __journal_abort_soft(journal, errno);
-+ unlock_journal(journal);
-+}
-+
-+int journal_errno (journal_t *journal)
-+{
-+ int err;
-+
-+ lock_journal(journal);
-+ if (journal->j_flags & JFS_ABORT)
-+ err = -EROFS;
-+ else
-+ err = journal->j_errno;
-+ unlock_journal(journal);
-+ return err;
-+}
-+
-+int journal_clear_err (journal_t *journal)
-+{
-+ int err = 0;
-+
-+ lock_journal(journal);
-+ if (journal->j_flags & JFS_ABORT)
-+ err = -EROFS;
-+ else
-+ journal->j_errno = 0;
-+ unlock_journal(journal);
-+ return err;
-+}
-+
-+void journal_ack_err (journal_t *journal)
-+{
-+ lock_journal(journal);
-+ if (journal->j_errno)
-+ journal->j_flags |= JFS_ACK_ERR;
-+ unlock_journal(journal);
-+}
-+
-+int journal_blocks_per_page(struct inode *inode)
-+{
-+ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-+}
-+
-+/*
-+ * shrink_journal_memory().
-+ * Called when we're under memory pressure. Free up all the written-back
-+ * checkpointed metadata buffers.
-+ */
-+void shrink_journal_memory(void)
-+{
-+ struct list_head *list;
-+
-+ lock_kernel();
-+ list_for_each(list, &all_journals) {
-+ journal_t *journal =
-+ list_entry(list, journal_t, j_all_journals);
-+ spin_lock(&journal_datalist_lock);
-+ __journal_clean_checkpoint_list(journal);
-+ spin_unlock(&journal_datalist_lock);
-+ }
-+ unlock_kernel();
-+}
-+
-+/*
-+ * Simple support for retying memory allocations. Introduced to help to
-+ * debug different VM deadlock avoidance strategies.
-+ */
-+/*
-+ * Simple support for retying memory allocations. Introduced to help to
-+ * debug different VM deadlock avoidance strategies.
-+ */
-+void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
-+{
-+ void *p;
-+ static unsigned long last_warning;
-+
-+ while (1) {
-+ p = kmalloc(size, flags);
-+ if (p)
-+ return p;
-+ if (!retry)
-+ return NULL;
-+ /* Log every retry for debugging. Also log them to the
-+ * syslog, but do rate-limiting on the non-debugging
-+ * messages. */
-+ jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
-+
-+ if (time_after(jiffies, last_warning + 5*HZ)) {
-+ printk(KERN_NOTICE
-+ "ENOMEM in %s, retrying.\n", where);
-+ last_warning = jiffies;
-+ }
-+
-+ current->policy |= SCHED_YIELD;
-+ schedule();
-+ }
-+}
-+
-+/*
-+ * Journal_head storage management
-+ */
-+static kmem_cache_t *journal_head_cache;
-+#ifdef CONFIG_JBD_DEBUG
-+static atomic_t nr_journal_heads = ATOMIC_INIT(0);
-+#endif
-+
-+static int journal_init_journal_head_cache(void)
-+{
-+ int retval;
-+
-+ J_ASSERT(journal_head_cache == 0);
-+ journal_head_cache = kmem_cache_create("journal_head",
-+ sizeof(struct journal_head),
-+ 0, /* offset */
-+ 0, /* flags */
-+ NULL, /* ctor */
-+ NULL); /* dtor */
-+ retval = 0;
-+ if (journal_head_cache == 0) {
-+ retval = -ENOMEM;
-+ printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
-+ }
-+ return retval;
-+}
-+
-+static void journal_destroy_journal_head_cache(void)
-+{
-+ J_ASSERT(journal_head_cache != NULL);
-+ kmem_cache_destroy(journal_head_cache);
-+ journal_head_cache = 0;
-+}
-+
-+/*
-+ * journal_head splicing and dicing
-+ */
-+static struct journal_head *journal_alloc_journal_head(void)
-+{
-+ struct journal_head *ret;
-+ static unsigned long last_warning;
-+
-+#ifdef CONFIG_JBD_DEBUG
-+ atomic_inc(&nr_journal_heads);
-+#endif
-+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
-+ if (ret == 0) {
-+ jbd_debug(1, "out of memory for journal_head\n");
-+ if (time_after(jiffies, last_warning + 5*HZ)) {
-+ printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
-+ ", retrying.\n");
-+ last_warning = jiffies;
-+ }
-+ while (ret == 0) {
-+ current->policy |= SCHED_YIELD;
-+ schedule();
-+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
-+ }
-+ }
-+ return ret;
-+}
-+
-+static void journal_free_journal_head(struct journal_head *jh)
-+{
-+#ifdef CONFIG_JBD_DEBUG
-+ atomic_dec(&nr_journal_heads);
-+ memset(jh, 0x5b, sizeof(*jh));
-+#endif
-+ kmem_cache_free(journal_head_cache, jh);
-+}
-+
-+/*
-+ * A journal_head is attached to a buffer_head whenever JBD has an
-+ * interest in the buffer.
-+ *
-+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
-+ * is set. This bit is tested in core kernel code where we need to take
-+ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
-+ * there.
-+ *
-+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
-+ *
-+ * When a buffer has its BH_JBD bit set it is immune from being released by
-+ * core kernel code, mainly via ->b_count.
-+ *
-+ * A journal_head may be detached from its buffer_head when the journal_head's
-+ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
-+ * Various places in JBD call journal_remove_journal_head() to indicate that the
-+ * journal_head can be dropped if needed.
-+ *
-+ * Various places in the kernel want to attach a journal_head to a buffer_head
-+ * _before_ attaching the journal_head to a transaction. To protect the
-+ * journal_head in this situation, journal_add_journal_head elevates the
-+ * journal_head's b_jcount refcount by one. The caller must call
-+ * journal_unlock_journal_head() to undo this.
-+ *
-+ * So the typical usage would be:
-+ *
-+ * (Attach a journal_head if needed. Increments b_jcount)
-+ * struct journal_head *jh = journal_add_journal_head(bh);
-+ * ...
-+ * jh->b_transaction = xxx;
-+ * journal_unlock_journal_head(jh);
-+ *
-+ * Now, the journal_head's b_jcount is zero, but it is safe from being released
-+ * because it has a non-zero b_transaction.
-+ */
-+
-+/*
-+ * Give a buffer_head a journal_head.
-+ *
-+ * Doesn't need the journal lock.
-+ * May sleep.
-+ * Cannot be called with journal_datalist_lock held.
-+ */
-+struct journal_head *journal_add_journal_head(struct buffer_head *bh)
-+{
-+ struct journal_head *jh;
-+
-+ spin_lock(&journal_datalist_lock);
-+ if (buffer_jbd(bh)) {
-+ jh = bh2jh(bh);
-+ } else {
-+ J_ASSERT_BH(bh,
-+ (atomic_read(&bh->b_count) > 0) ||
-+ (bh->b_page && bh->b_page->mapping));
-+ spin_unlock(&journal_datalist_lock);
-+ jh = journal_alloc_journal_head();
-+ memset(jh, 0, sizeof(*jh));
-+ spin_lock(&journal_datalist_lock);
-+
-+ if (buffer_jbd(bh)) {
-+ /* Someone did it for us! */
-+ J_ASSERT_BH(bh, bh->b_private != NULL);
-+ journal_free_journal_head(jh);
-+ jh = bh->b_private;
-+ } else {
-+ /*
-+ * We actually don't need jh_splice_lock when
-+ * adding a journal_head - only on removal.
-+ */
-+ spin_lock(&jh_splice_lock);
-+ set_bit(BH_JBD, &bh->b_state);
-+ bh->b_private = jh;
-+ jh->b_bh = bh;
-+ atomic_inc(&bh->b_count);
-+ spin_unlock(&jh_splice_lock);
-+ BUFFER_TRACE(bh, "added journal_head");
-+ }
-+ }
-+ jh->b_jcount++;
-+ spin_unlock(&journal_datalist_lock);
-+ return bh->b_private;
-+}
-+
-+/*
-+ * journal_remove_journal_head(): if the buffer isn't attached to a transaction
-+ * and has a zero b_jcount then remove and release its journal_head. If we did
-+ * see that the buffer is not used by any transaction we also "logically"
-+ * decrement ->b_count.
-+ *
-+ * We in fact take an additional increment on ->b_count as a convenience,
-+ * because the caller usually wants to do additional things with the bh
-+ * after calling here.
-+ * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
-+ * time. Once the caller has run __brelse(), the buffer is eligible for
-+ * reaping by try_to_free_buffers().
-+ *
-+ * Requires journal_datalist_lock.
-+ */
-+void __journal_remove_journal_head(struct buffer_head *bh)
-+{
-+ struct journal_head *jh = bh2jh(bh);
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+ J_ASSERT_JH(jh, jh->b_jcount >= 0);
-+ atomic_inc(&bh->b_count);
-+ if (jh->b_jcount == 0) {
-+ if (jh->b_transaction == NULL &&
-+ jh->b_next_transaction == NULL &&
-+ jh->b_cp_transaction == NULL) {
-+ J_ASSERT_BH(bh, buffer_jbd(bh));
-+ J_ASSERT_BH(bh, jh2bh(jh) == bh);
-+ BUFFER_TRACE(bh, "remove journal_head");
-+ spin_lock(&jh_splice_lock);
-+ bh->b_private = NULL;
-+ jh->b_bh = NULL; /* debug, really */
-+ clear_bit(BH_JBD, &bh->b_state);
-+ __brelse(bh);
-+ spin_unlock(&jh_splice_lock);
-+ journal_free_journal_head(jh);
-+ } else {
-+ BUFFER_TRACE(bh, "journal_head was locked");
-+ }
-+ }
-+}
-+
-+void journal_unlock_journal_head(struct journal_head *jh)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ J_ASSERT_JH(jh, jh->b_jcount > 0);
-+ --jh->b_jcount;
-+ if (!jh->b_jcount && !jh->b_transaction) {
-+ struct buffer_head *bh;
-+ bh = jh2bh(jh);
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ }
-+
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+void journal_remove_journal_head(struct buffer_head *bh)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ __journal_remove_journal_head(bh);
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+/*
-+ * /proc tunables
-+ */
-+#if defined(CONFIG_JBD_DEBUG)
-+int journal_enable_debug;
-+EXPORT_SYMBOL(journal_enable_debug);
-+#endif
-+
-+#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
-+
-+static struct proc_dir_entry *proc_jbd_debug;
-+
-+int read_jbd_debug(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
-+{
-+ int ret;
-+
-+ ret = sprintf(page + off, "%d\n", journal_enable_debug);
-+ *eof = 1;
-+ return ret;
-+}
-+
-+int write_jbd_debug(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
-+{
-+ char buf[32];
-+
-+ if (count > ARRAY_SIZE(buf) - 1)
-+ count = ARRAY_SIZE(buf) - 1;
-+ if (copy_from_user(buf, buffer, count))
-+ return -EFAULT;
-+ buf[ARRAY_SIZE(buf) - 1] = '\0';
-+ journal_enable_debug = simple_strtoul(buf, NULL, 10);
-+ return count;
-+}
-+
-+#define JBD_PROC_NAME "sys/fs/jbd-debug"
-+
-+static void __init create_jbd_proc_entry(void)
-+{
-+ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
-+ if (proc_jbd_debug) {
-+ /* Why is this so hard? */
-+ proc_jbd_debug->read_proc = read_jbd_debug;
-+ proc_jbd_debug->write_proc = write_jbd_debug;
-+ }
-+}
-+
-+static void __exit remove_jbd_proc_entry(void)
-+{
-+ if (proc_jbd_debug)
-+ remove_proc_entry(JBD_PROC_NAME, NULL);
-+}
-+
-+#else
-+
-+#define create_jbd_proc_entry() do {} while (0)
-+#define remove_jbd_proc_entry() do {} while (0)
-+
-+#endif
-+
-+/*
-+ * Module startup and shutdown
-+ */
-+
-+static int __init journal_init_caches(void)
-+{
-+ int ret;
-+
-+ ret = journal_init_revoke_caches();
-+ if (ret == 0)
-+ ret = journal_init_journal_head_cache();
-+ return ret;
-+}
-+
-+static void journal_destroy_caches(void)
-+{
-+ journal_destroy_revoke_caches();
-+ journal_destroy_journal_head_cache();
-+}
-+
-+static int __init journal_init(void)
-+{
-+ int ret;
-+
-+ printk(KERN_INFO "Journalled Block Device driver loaded\n");
-+ ret = journal_init_caches();
-+ if (ret != 0)
-+ journal_destroy_caches();
-+ create_jbd_proc_entry();
-+ return ret;
-+}
-+
-+static void __exit journal_exit(void)
-+{
-+#ifdef CONFIG_JBD_DEBUG
-+ int n = atomic_read(&nr_journal_heads);
-+ if (n)
-+ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
-+#endif
-+ remove_jbd_proc_entry();
-+ journal_destroy_caches();
-+}
-+
-+MODULE_LICENSE("GPL");
-+module_init(journal_init);
-+module_exit(journal_exit);
-+
-diff -ruP linux.mcp2/fs/jbd/recovery.c linuxppc_2.4.19_final/fs/jbd/recovery.c
---- linux.mcp2/fs/jbd/recovery.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/recovery.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,589 @@
-+/*
-+ * linux/fs/recovery.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
-+ *
-+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Journal recovery routines for the generic filesystem journaling code;
-+ * part of the ext2fs journaling system.
-+ */
-+
-+#ifndef __KERNEL__
-+#include "jfs_user.h"
-+#else
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+#endif
-+
-+/*
-+ * Maintain information about the progress of the recovery job, so that
-+ * the different passes can carry information between them.
-+ */
-+struct recovery_info
-+{
-+ tid_t start_transaction;
-+ tid_t end_transaction;
-+
-+ int nr_replays;
-+ int nr_revokes;
-+ int nr_revoke_hits;
-+};
-+
-+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
-+static int do_one_pass(journal_t *journal,
-+ struct recovery_info *info, enum passtype pass);
-+static int scan_revoke_records(journal_t *, struct buffer_head *,
-+ tid_t, struct recovery_info *);
-+
-+#ifdef __KERNEL__
-+
-+/* Release readahead buffers after use */
-+void journal_brelse_array(struct buffer_head *b[], int n)
-+{
-+ while (--n >= 0)
-+ brelse (b[n]);
-+}
-+
-+
-+/*
-+ * When reading from the journal, we are going through the block device
-+ * layer directly and so there is no readahead being done for us. We
-+ * need to implement any readahead ourselves if we want it to happen at
-+ * all. Recovery is basically one long sequential read, so make sure we
-+ * do the IO in reasonably large chunks.
-+ *
-+ * This is not so critical that we need to be enormously clever about
-+ * the readahead size, though. 128K is a purely arbitrary, good-enough
-+ * fixed value.
-+ */
-+
-+#define MAXBUF 8
-+static int do_readahead(journal_t *journal, unsigned int start)
-+{
-+ int err;
-+ unsigned int max, nbufs, next;
-+ unsigned long blocknr;
-+ struct buffer_head *bh;
-+
-+ struct buffer_head * bufs[MAXBUF];
-+
-+ /* Do up to 128K of readahead */
-+ max = start + (128 * 1024 / journal->j_blocksize);
-+ if (max > journal->j_maxlen)
-+ max = journal->j_maxlen;
-+
-+ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
-+ * a time to the block device IO layer. */
-+
-+ nbufs = 0;
-+
-+ for (next = start; next < max; next++) {
-+ err = journal_bmap(journal, next, &blocknr);
-+
-+ if (err) {
-+ printk (KERN_ERR "JBD: bad block at offset %u\n",
-+ next);
-+ goto failed;
-+ }
-+
-+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
-+ if (!bh) {
-+ err = -ENOMEM;
-+ goto failed;
-+ }
-+
-+ if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
-+ bufs[nbufs++] = bh;
-+ if (nbufs == MAXBUF) {
-+ ll_rw_block(READ, nbufs, bufs);
-+ journal_brelse_array(bufs, nbufs);
-+ nbufs = 0;
-+ }
-+ } else
-+ brelse(bh);
-+ }
-+
-+ if (nbufs)
-+ ll_rw_block(READ, nbufs, bufs);
-+ err = 0;
-+
-+failed:
-+ if (nbufs)
-+ journal_brelse_array(bufs, nbufs);
-+ return err;
-+}
-+
-+#endif /* __KERNEL__ */
-+
-+
-+/*
-+ * Read a block from the journal
-+ */
-+
-+static int jread(struct buffer_head **bhp, journal_t *journal,
-+ unsigned int offset)
-+{
-+ int err;
-+ unsigned long blocknr;
-+ struct buffer_head *bh;
-+
-+ *bhp = NULL;
-+
-+ J_ASSERT (offset < journal->j_maxlen);
-+
-+ err = journal_bmap(journal, offset, &blocknr);
-+
-+ if (err) {
-+ printk (KERN_ERR "JBD: bad block at offset %u\n",
-+ offset);
-+ return err;
-+ }
-+
-+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
-+ if (!bh)
-+ return -ENOMEM;
-+
-+ if (!buffer_uptodate(bh)) {
-+ /* If this is a brand new buffer, start readahead.
-+ Otherwise, we assume we are already reading it. */
-+ if (!buffer_req(bh))
-+ do_readahead(journal, offset);
-+ wait_on_buffer(bh);
-+ }
-+
-+ if (!buffer_uptodate(bh)) {
-+ printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
-+ offset);
-+ brelse(bh);
-+ return -EIO;
-+ }
-+
-+ *bhp = bh;
-+ return 0;
-+}
-+
-+
-+/*
-+ * Count the number of in-use tags in a journal descriptor block.
-+ */
-+
-+static int count_tags(struct buffer_head *bh, int size)
-+{
-+ char * tagp;
-+ journal_block_tag_t * tag;
-+ int nr = 0;
-+
-+ tagp = &bh->b_data[sizeof(journal_header_t)];
-+
-+ while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
-+ tag = (journal_block_tag_t *) tagp;
-+
-+ nr++;
-+ tagp += sizeof(journal_block_tag_t);
-+ if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
-+ tagp += 16;
-+
-+ if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
-+ break;
-+ }
-+
-+ return nr;
-+}
-+
-+
-+/* Make sure we wrap around the log correctly! */
-+#define wrap(journal, var) \
-+do { \
-+ if (var >= (journal)->j_last) \
-+ var -= ((journal)->j_last - (journal)->j_first); \
-+} while (0)
-+
-+/*
-+ * journal_recover
-+ *
-+ * The primary function for recovering the log contents when mounting a
-+ * journaled device.
-+ *
-+ * Recovery is done in three passes. In the first pass, we look for the
-+ * end of the log. In the second, we assemble the list of revoke
-+ * blocks. In the third and final pass, we replay any un-revoked blocks
-+ * in the log.
-+ */
-+
-+int journal_recover(journal_t *journal)
-+{
-+ int err;
-+ journal_superblock_t * sb;
-+
-+ struct recovery_info info;
-+
-+ memset(&info, 0, sizeof(info));
-+ sb = journal->j_superblock;
-+
-+ /*
-+ * The journal superblock's s_start field (the current log head)
-+ * is always zero if, and only if, the journal was cleanly
-+ * unmounted.
-+ */
-+
-+ if (!sb->s_start) {
-+ jbd_debug(1, "No recovery required, last transaction %d\n",
-+ ntohl(sb->s_sequence));
-+ journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
-+ return 0;
-+ }
-+
-+
-+ err = do_one_pass(journal, &info, PASS_SCAN);
-+ if (!err)
-+ err = do_one_pass(journal, &info, PASS_REVOKE);
-+ if (!err)
-+ err = do_one_pass(journal, &info, PASS_REPLAY);
-+
-+ jbd_debug(0, "JBD: recovery, exit status %d, "
-+ "recovered transactions %u to %u\n",
-+ err, info.start_transaction, info.end_transaction);
-+ jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
-+ info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
-+
-+ /* Restart the log at the next transaction ID, thus invalidating
-+ * any existing commit records in the log. */
-+ journal->j_transaction_sequence = ++info.end_transaction;
-+
-+ journal_clear_revoke(journal);
-+ fsync_no_super(journal->j_fs_dev);
-+ return err;
-+}
-+
-+/*
-+ * journal_skip_recovery
-+ *
-+ * Locate any valid recovery information from the journal and set up the
-+ * journal structures in memory to ignore it (presumably because the
-+ * caller has evidence that it is out of date).
-+ *
-+ * We perform one pass over the journal to allow us to tell the user how
-+ * much recovery information is being erased, and to let us initialise
-+ * the journal transaction sequence numbers to the next unused ID.
-+ */
-+
-+int journal_skip_recovery(journal_t *journal)
-+{
-+ int err;
-+ journal_superblock_t * sb;
-+
-+ struct recovery_info info;
-+
-+ memset (&info, 0, sizeof(info));
-+ sb = journal->j_superblock;
-+
-+ err = do_one_pass(journal, &info, PASS_SCAN);
-+
-+ if (err) {
-+ printk(KERN_ERR "JBD: error %d scanning journal\n", err);
-+ ++journal->j_transaction_sequence;
-+ } else {
-+#ifdef CONFIG_JBD_DEBUG
-+ int dropped = info.end_transaction - ntohl(sb->s_sequence);
-+#endif
-+
-+ jbd_debug(0,
-+ "JBD: ignoring %d transaction%s from the journal.\n",
-+ dropped, (dropped == 1) ? "" : "s");
-+ journal->j_transaction_sequence = ++info.end_transaction;
-+ }
-+
-+ journal->j_tail = 0;
-+
-+ return err;
-+}
-+
-+static int do_one_pass(journal_t *journal,
-+ struct recovery_info *info, enum passtype pass)
-+{
-+
-+ unsigned int first_commit_ID, next_commit_ID;
-+ unsigned long next_log_block;
-+ int err, success = 0;
-+ journal_superblock_t * sb;
-+ journal_header_t * tmp;
-+ struct buffer_head * bh;
-+ unsigned int sequence;
-+ int blocktype;
-+
-+ /* Precompute the maximum metadata descriptors in a descriptor block */
-+ int MAX_BLOCKS_PER_DESC;
-+ MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-+ / sizeof(journal_block_tag_t));
-+
-+ /*
-+ * First thing is to establish what we expect to find in the log
-+ * (in terms of transaction IDs), and where (in terms of log
-+ * block offsets): query the superblock.
-+ */
-+
-+ sb = journal->j_superblock;
-+ next_commit_ID = ntohl(sb->s_sequence);
-+ next_log_block = ntohl(sb->s_start);
-+
-+ first_commit_ID = next_commit_ID;
-+ if (pass == PASS_SCAN)
-+ info->start_transaction = first_commit_ID;
-+
-+ jbd_debug(1, "Starting recovery pass %d\n", pass);
-+
-+ /*
-+ * Now we walk through the log, transaction by transaction,
-+ * making sure that each transaction has a commit block in the
-+ * expected place. Each complete transaction gets replayed back
-+ * into the main filesystem.
-+ */
-+
-+ while (1) {
-+ int flags;
-+ char * tagp;
-+ journal_block_tag_t * tag;
-+ struct buffer_head * obh;
-+ struct buffer_head * nbh;
-+
-+ /* If we already know where to stop the log traversal,
-+ * check right now that we haven't gone past the end of
-+ * the log. */
-+
-+ if (pass != PASS_SCAN)
-+ if (tid_geq(next_commit_ID, info->end_transaction))
-+ break;
-+
-+ jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
-+ next_commit_ID, next_log_block, journal->j_last);
-+
-+ /* Skip over each chunk of the transaction looking
-+ * either the next descriptor block or the final commit
-+ * record. */
-+
-+ jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
-+ err = jread(&bh, journal, next_log_block);
-+ if (err)
-+ goto failed;
-+
-+ next_log_block++;
-+ wrap(journal, next_log_block);
-+
-+ /* What kind of buffer is it?
-+ *
-+ * If it is a descriptor block, check that it has the
-+ * expected sequence number. Otherwise, we're all done
-+ * here. */
-+
-+ tmp = (journal_header_t *)bh->b_data;
-+
-+ if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
-+ brelse(bh);
-+ break;
-+ }
-+
-+ blocktype = ntohl(tmp->h_blocktype);
-+ sequence = ntohl(tmp->h_sequence);
-+ jbd_debug(3, "Found magic %d, sequence %d\n",
-+ blocktype, sequence);
-+
-+ if (sequence != next_commit_ID) {
-+ brelse(bh);
-+ break;
-+ }
-+
-+ /* OK, we have a valid descriptor block which matches
-+ * all of the sequence number checks. What are we going
-+ * to do with it? That depends on the pass... */
-+
-+ switch(blocktype) {
-+ case JFS_DESCRIPTOR_BLOCK:
-+ /* If it is a valid descriptor block, replay it
-+ * in pass REPLAY; otherwise, just skip over the
-+ * blocks it describes. */
-+ if (pass != PASS_REPLAY) {
-+ next_log_block +=
-+ count_tags(bh, journal->j_blocksize);
-+ wrap(journal, next_log_block);
-+ brelse(bh);
-+ continue;
-+ }
-+
-+ /* A descriptor block: we can now write all of
-+ * the data blocks. Yay, useful work is finally
-+ * getting done here! */
-+
-+ tagp = &bh->b_data[sizeof(journal_header_t)];
-+ while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
-+ <= journal->j_blocksize) {
-+ unsigned long io_block;
-+
-+ tag = (journal_block_tag_t *) tagp;
-+ flags = ntohl(tag->t_flags);
-+
-+ io_block = next_log_block++;
-+ wrap(journal, next_log_block);
-+ err = jread(&obh, journal, io_block);
-+ if (err) {
-+ /* Recover what we can, but
-+ * report failure at the end. */
-+ success = err;
-+ printk (KERN_ERR
-+ "JBD: IO error %d recovering "
-+ "block %ld in log\n",
-+ err, io_block);
-+ } else {
-+ unsigned long blocknr;
-+
-+ J_ASSERT(obh != NULL);
-+ blocknr = ntohl(tag->t_blocknr);
-+
-+ /* If the block has been
-+ * revoked, then we're all done
-+ * here. */
-+ if (journal_test_revoke
-+ (journal, blocknr,
-+ next_commit_ID)) {
-+ brelse(obh);
-+ ++info->nr_revoke_hits;
-+ goto skip_write;
-+ }
-+
-+ /* Find a buffer for the new
-+ * data being restored */
-+ nbh = getblk(journal->j_fs_dev, blocknr,
-+ journal->j_blocksize);
-+ if (nbh == NULL) {
-+ printk(KERN_ERR
-+ "JBD: Out of memory "
-+ "during recovery.\n");
-+ err = -ENOMEM;
-+ brelse(bh);
-+ brelse(obh);
-+ goto failed;
-+ }
-+
-+ lock_buffer(nbh);
-+ memcpy(nbh->b_data, obh->b_data,
-+ journal->j_blocksize);
-+ if (flags & JFS_FLAG_ESCAPE) {
-+ *((unsigned int *)bh->b_data) =
-+ htonl(JFS_MAGIC_NUMBER);
-+ }
-+
-+ BUFFER_TRACE(nbh, "marking dirty");
-+ mark_buffer_dirty(nbh);
-+ BUFFER_TRACE(nbh, "marking uptodate");
-+ mark_buffer_uptodate(nbh, 1);
-+ unlock_buffer(nbh);
-+ ++info->nr_replays;
-+ /* ll_rw_block(WRITE, 1, &nbh); */
-+ brelse(obh);
-+ brelse(nbh);
-+ }
-+
-+ skip_write:
-+ tagp += sizeof(journal_block_tag_t);
-+ if (!(flags & JFS_FLAG_SAME_UUID))
-+ tagp += 16;
-+
-+ if (flags & JFS_FLAG_LAST_TAG)
-+ break;
-+ }
-+
-+ brelse(bh);
-+ continue;
-+
-+ case JFS_COMMIT_BLOCK:
-+ /* Found an expected commit block: not much to
-+ * do other than move on to the next sequence
-+ * number. */
-+ brelse(bh);
-+ next_commit_ID++;
-+ continue;
-+
-+ case JFS_REVOKE_BLOCK:
-+ /* If we aren't in the REVOKE pass, then we can
-+ * just skip over this block. */
-+ if (pass != PASS_REVOKE) {
-+ brelse(bh);
-+ continue;
-+ }
-+
-+ err = scan_revoke_records(journal, bh,
-+ next_commit_ID, info);
-+ brelse(bh);
-+ if (err)
-+ goto failed;
-+ continue;
-+
-+ default:
-+ jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
-+ blocktype);
-+ goto done;
-+ }
-+ }
-+
-+ done:
-+ /*
-+ * We broke out of the log scan loop: either we came to the
-+ * known end of the log or we found an unexpected block in the
-+ * log. If the latter happened, then we know that the "current"
-+ * transaction marks the end of the valid log.
-+ */
-+
-+ if (pass == PASS_SCAN)
-+ info->end_transaction = next_commit_ID;
-+ else {
-+ /* It's really bad news if different passes end up at
-+ * different places (but possible due to IO errors). */
-+ if (info->end_transaction != next_commit_ID) {
-+ printk (KERN_ERR "JBD: recovery pass %d ended at "
-+ "transaction %u, expected %u\n",
-+ pass, next_commit_ID, info->end_transaction);
-+ if (!success)
-+ success = -EIO;
-+ }
-+ }
-+
-+ return success;
-+
-+ failed:
-+ return err;
-+}
-+
-+
-+/* Scan a revoke record, marking all blocks mentioned as revoked. */
-+
-+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
-+ tid_t sequence, struct recovery_info *info)
-+{
-+ journal_revoke_header_t *header;
-+ int offset, max;
-+
-+ header = (journal_revoke_header_t *) bh->b_data;
-+ offset = sizeof(journal_revoke_header_t);
-+ max = ntohl(header->r_count);
-+
-+ while (offset < max) {
-+ unsigned long blocknr;
-+ int err;
-+
-+ blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
-+ offset += 4;
-+ err = journal_set_revoke(journal, blocknr, sequence);
-+ if (err)
-+ return err;
-+ ++info->nr_revokes;
-+ }
-+ return 0;
-+}
-diff -ruP linux.mcp2/fs/jbd/revoke.c linuxppc_2.4.19_final/fs/jbd/revoke.c
---- linux.mcp2/fs/jbd/revoke.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/revoke.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,636 @@
-+/*
-+ * linux/fs/revoke.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
-+ *
-+ * Copyright 2000 Red Hat corp --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Journal revoke routines for the generic filesystem journaling code;
-+ * part of the ext2fs journaling system.
-+ *
-+ * Revoke is the mechanism used to prevent old log records for deleted
-+ * metadata from being replayed on top of newer data using the same
-+ * blocks. The revoke mechanism is used in two separate places:
-+ *
-+ * + Commit: during commit we write the entire list of the current
-+ * transaction's revoked blocks to the journal
-+ *
-+ * + Recovery: during recovery we record the transaction ID of all
-+ * revoked blocks. If there are multiple revoke records in the log
-+ * for a single block, only the last one counts, and if there is a log
-+ * entry for a block beyond the last revoke, then that log entry still
-+ * gets replayed.
-+ *
-+ * We can get interactions between revokes and new log data within a
-+ * single transaction:
-+ *
-+ * Block is revoked and then journaled:
-+ * The desired end result is the journaling of the new block, so we
-+ * cancel the revoke before the transaction commits.
-+ *
-+ * Block is journaled and then revoked:
-+ * The revoke must take precedence over the write of the block, so we
-+ * need either to cancel the journal entry or to write the revoke
-+ * later in the log than the log block. In this case, we choose the
-+ * latter: journaling a block cancels any revoke record for that block
-+ * in the current transaction, so any revoke for that block in the
-+ * transaction must have happened after the block was journaled and so
-+ * the revoke must take precedence.
-+ *
-+ * Block is revoked and then written as data:
-+ * The data write is allowed to succeed, but the revoke is _not_
-+ * cancelled. We still need to prevent old log records from
-+ * overwriting the new data. We don't even need to clear the revoke
-+ * bit here.
-+ *
-+ * Revoke information on buffers is a tri-state value:
-+ *
-+ * RevokeValid clear: no cached revoke status, need to look it up
-+ * RevokeValid set, Revoked clear:
-+ * buffer has not been revoked, and cancel_revoke
-+ * need do nothing.
-+ * RevokeValid set, Revoked set:
-+ * buffer has been revoked.
-+ */
-+
-+#ifndef __KERNEL__
-+#include "jfs_user.h"
-+#else
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+#include <linux/list.h>
-+#include <linux/smp_lock.h>
-+#include <linux/init.h>
-+#endif
-+
-+static kmem_cache_t *revoke_record_cache;
-+static kmem_cache_t *revoke_table_cache;
-+
-+/* Each revoke record represents one single revoked block. During
-+ journal replay, this involves recording the transaction ID of the
-+ last transaction to revoke this block. */
-+
-+struct jbd_revoke_record_s
-+{
-+ struct list_head hash;
-+ tid_t sequence; /* Used for recovery only */
-+ unsigned long blocknr;
-+};
-+
-+
-+/* The revoke table is just a simple hash table of revoke records. */
-+struct jbd_revoke_table_s
-+{
-+ /* It is conceivable that we might want a larger hash table
-+ * for recovery. Must be a power of two. */
-+ int hash_size;
-+ int hash_shift;
-+ struct list_head *hash_table;
-+};
-+
-+
-+#ifdef __KERNEL__
-+static void write_one_revoke_record(journal_t *, transaction_t *,
-+ struct journal_head **, int *,
-+ struct jbd_revoke_record_s *);
-+static void flush_descriptor(journal_t *, struct journal_head *, int);
-+#endif
-+
-+/* Utility functions to maintain the revoke table */
-+
-+/* Borrowed from buffer.c: this is a tried and tested block hash function */
-+static inline int hash(journal_t *journal, unsigned long block)
-+{
-+ struct jbd_revoke_table_s *table = journal->j_revoke;
-+ int hash_shift = table->hash_shift;
-+
-+ return ((block << (hash_shift - 6)) ^
-+ (block >> 13) ^
-+ (block << (hash_shift - 12))) & (table->hash_size - 1);
-+}
-+
-+int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
-+{
-+ struct list_head *hash_list;
-+ struct jbd_revoke_record_s *record;
-+
-+repeat:
-+ record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
-+ if (!record)
-+ goto oom;
-+
-+ record->sequence = seq;
-+ record->blocknr = blocknr;
-+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-+ list_add(&record->hash, hash_list);
-+ return 0;
-+
-+oom:
-+ if (!journal_oom_retry)
-+ return -ENOMEM;
-+ jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
-+ current->policy |= SCHED_YIELD;
-+ schedule();
-+ goto repeat;
-+}
-+
-+/* Find a revoke record in the journal's hash table. */
-+
-+static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-+ unsigned long blocknr)
-+{
-+ struct list_head *hash_list;
-+ struct jbd_revoke_record_s *record;
-+
-+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-+
-+ record = (struct jbd_revoke_record_s *) hash_list->next;
-+ while (&(record->hash) != hash_list) {
-+ if (record->blocknr == blocknr)
-+ return record;
-+ record = (struct jbd_revoke_record_s *) record->hash.next;
-+ }
-+ return NULL;
-+}
-+
-+int __init journal_init_revoke_caches(void)
-+{
-+ revoke_record_cache = kmem_cache_create("revoke_record",
-+ sizeof(struct jbd_revoke_record_s),
-+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-+ if (revoke_record_cache == 0)
-+ return -ENOMEM;
-+
-+ revoke_table_cache = kmem_cache_create("revoke_table",
-+ sizeof(struct jbd_revoke_table_s),
-+ 0, 0, NULL, NULL);
-+ if (revoke_table_cache == 0) {
-+ kmem_cache_destroy(revoke_record_cache);
-+ revoke_record_cache = NULL;
-+ return -ENOMEM;
-+ }
-+ return 0;
-+}
-+
-+void journal_destroy_revoke_caches(void)
-+{
-+ kmem_cache_destroy(revoke_record_cache);
-+ revoke_record_cache = 0;
-+ kmem_cache_destroy(revoke_table_cache);
-+ revoke_table_cache = 0;
-+}
-+
-+/* Initialise the revoke table for a given journal to a given size. */
-+
-+int journal_init_revoke(journal_t *journal, int hash_size)
-+{
-+ int shift, tmp;
-+
-+ J_ASSERT (journal->j_revoke == NULL);
-+
-+ journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-+ if (!journal->j_revoke)
-+ return -ENOMEM;
-+
-+ /* Check that the hash_size is a power of two */
-+ J_ASSERT ((hash_size & (hash_size-1)) == 0);
-+
-+ journal->j_revoke->hash_size = hash_size;
-+
-+ shift = 0;
-+ tmp = hash_size;
-+ while((tmp >>= 1UL) != 0UL)
-+ shift++;
-+ journal->j_revoke->hash_shift = shift;
-+
-+ journal->j_revoke->hash_table =
-+ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-+ if (!journal->j_revoke->hash_table) {
-+ kmem_cache_free(revoke_table_cache, journal->j_revoke);
-+ journal->j_revoke = NULL;
-+ return -ENOMEM;
-+ }
-+
-+ for (tmp = 0; tmp < hash_size; tmp++)
-+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
-+
-+ return 0;
-+}
-+
-+/* Destoy a journal's revoke table. The table must already be empty! */
-+
-+void journal_destroy_revoke(journal_t *journal)
-+{
-+ struct jbd_revoke_table_s *table;
-+ struct list_head *hash_list;
-+ int i;
-+
-+ table = journal->j_revoke;
-+ if (!table)
-+ return;
-+
-+ for (i=0; i<table->hash_size; i++) {
-+ hash_list = &table->hash_table[i];
-+ J_ASSERT (list_empty(hash_list));
-+ }
-+
-+ kfree(table->hash_table);
-+ kmem_cache_free(revoke_table_cache, table);
-+ journal->j_revoke = NULL;
-+}
-+
-+
-+#ifdef __KERNEL__
-+
-+/*
-+ * journal_revoke: revoke a given buffer_head from the journal. This
-+ * prevents the block from being replayed during recovery if we take a
-+ * crash after this current transaction commits. Any subsequent
-+ * metadata writes of the buffer in this transaction cancel the
-+ * revoke.
-+ *
-+ * Note that this call may block --- it is up to the caller to make
-+ * sure that there are no further calls to journal_write_metadata
-+ * before the revoke is complete. In ext3, this implies calling the
-+ * revoke before clearing the block bitmap when we are deleting
-+ * metadata.
-+ *
-+ * Revoke performs a journal_forget on any buffer_head passed in as a
-+ * parameter, but does _not_ forget the buffer_head if the bh was only
-+ * found implicitly.
-+ *
-+ * bh_in may not be a journalled buffer - it may have come off
-+ * the hash tables without an attached journal_head.
-+ *
-+ * If bh_in is non-zero, journal_revoke() will decrement its b_count
-+ * by one.
-+ */
-+
-+int journal_revoke(handle_t *handle, unsigned long blocknr,
-+ struct buffer_head *bh_in)
-+{
-+ struct buffer_head *bh = NULL;
-+ journal_t *journal;
-+ kdev_t dev;
-+ int err;
-+
-+ if (bh_in)
-+ BUFFER_TRACE(bh_in, "enter");
-+
-+ journal = handle->h_transaction->t_journal;
-+ if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
-+ J_ASSERT (!"Cannot set revoke feature!");
-+ return -EINVAL;
-+ }
-+
-+ dev = journal->j_fs_dev;
-+ bh = bh_in;
-+
-+ if (!bh) {
-+ bh = get_hash_table(dev, blocknr, journal->j_blocksize);
-+ if (bh)
-+ BUFFER_TRACE(bh, "found on hash");
-+ }
-+#ifdef JBD_EXPENSIVE_CHECKING
-+ else {
-+ struct buffer_head *bh2;
-+
-+ /* If there is a different buffer_head lying around in
-+ * memory anywhere... */
-+ bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
-+ if (bh2) {
-+ /* ... and it has RevokeValid status... */
-+ if ((bh2 != bh) &&
-+ test_bit(BH_RevokeValid, &bh2->b_state))
-+ /* ...then it better be revoked too,
-+ * since it's illegal to create a revoke
-+ * record against a buffer_head which is
-+ * not marked revoked --- that would
-+ * risk missing a subsequent revoke
-+ * cancel. */
-+ J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
-+ bh2->b_state));
-+ __brelse(bh2);
-+ }
-+ }
-+#endif
-+
-+ /* We really ought not ever to revoke twice in a row without
-+ first having the revoke cancelled: it's illegal to free a
-+ block twice without allocating it in between! */
-+ if (bh) {
-+ J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
-+ set_bit(BH_Revoked, &bh->b_state);
-+ set_bit(BH_RevokeValid, &bh->b_state);
-+ if (bh_in) {
-+ BUFFER_TRACE(bh_in, "call journal_forget");
-+ journal_forget(handle, bh_in);
-+ } else {
-+ BUFFER_TRACE(bh, "call brelse");
-+ __brelse(bh);
-+ }
-+ }
-+
-+ lock_journal(journal);
-+ jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
-+ err = insert_revoke_hash(journal, blocknr,
-+ handle->h_transaction->t_tid);
-+ unlock_journal(journal);
-+ BUFFER_TRACE(bh_in, "exit");
-+ return err;
-+}
-+
-+/*
-+ * Cancel an outstanding revoke. For use only internally by the
-+ * journaling code (called from journal_get_write_access).
-+ *
-+ * We trust the BH_Revoked bit on the buffer if the buffer is already
-+ * being journaled: if there is no revoke pending on the buffer, then we
-+ * don't do anything here.
-+ *
-+ * This would break if it were possible for a buffer to be revoked and
-+ * discarded, and then reallocated within the same transaction. In such
-+ * a case we would have lost the revoked bit, but when we arrived here
-+ * the second time we would still have a pending revoke to cancel. So,
-+ * do not trust the Revoked bit on buffers unless RevokeValid is also
-+ * set.
-+ *
-+ * The caller must have the journal locked.
-+ */
-+int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
-+{
-+ struct jbd_revoke_record_s *record;
-+ journal_t *journal = handle->h_transaction->t_journal;
-+ int need_cancel;
-+ int did_revoke = 0; /* akpm: debug */
-+ struct buffer_head *bh = jh2bh(jh);
-+
-+ jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
-+
-+ /* Is the existing Revoke bit valid? If so, we trust it, and
-+ * only perform the full cancel if the revoke bit is set. If
-+ * not, we can't trust the revoke bit, and we need to do the
-+ * full search for a revoke record. */
-+ if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
-+ need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
-+ else {
-+ need_cancel = 1;
-+ clear_bit(BH_Revoked, &bh->b_state);
-+ }
-+
-+ if (need_cancel) {
-+ record = find_revoke_record(journal, bh->b_blocknr);
-+ if (record) {
-+ jbd_debug(4, "cancelled existing revoke on "
-+ "blocknr %lu\n", bh->b_blocknr);
-+ list_del(&record->hash);
-+ kmem_cache_free(revoke_record_cache, record);
-+ did_revoke = 1;
-+ }
-+ }
-+
-+#ifdef JBD_EXPENSIVE_CHECKING
-+ /* There better not be one left behind by now! */
-+ record = find_revoke_record(journal, bh->b_blocknr);
-+ J_ASSERT_JH(jh, record == NULL);
-+#endif
-+
-+ /* Finally, have we just cleared revoke on an unhashed
-+ * buffer_head? If so, we'd better make sure we clear the
-+ * revoked status on any hashed alias too, otherwise the revoke
-+ * state machine will get very upset later on. */
-+ if (need_cancel && !bh->b_pprev) {
-+ struct buffer_head *bh2;
-+ bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
-+ if (bh2) {
-+ clear_bit(BH_Revoked, &bh2->b_state);
-+ __brelse(bh2);
-+ }
-+ }
-+
-+ return did_revoke;
-+}
-+
-+
-+/*
-+ * Write revoke records to the journal for all entries in the current
-+ * revoke hash, deleting the entries as we go.
-+ *
-+ * Called with the journal lock held.
-+ */
-+
-+void journal_write_revoke_records(journal_t *journal,
-+ transaction_t *transaction)
-+{
-+ struct journal_head *descriptor;
-+ struct jbd_revoke_record_s *record;
-+ struct jbd_revoke_table_s *revoke;
-+ struct list_head *hash_list;
-+ int i, offset, count;
-+
-+ descriptor = NULL;
-+ offset = 0;
-+ count = 0;
-+ revoke = journal->j_revoke;
-+
-+ for (i = 0; i < revoke->hash_size; i++) {
-+ hash_list = &revoke->hash_table[i];
-+
-+ while (!list_empty(hash_list)) {
-+ record = (struct jbd_revoke_record_s *)
-+ hash_list->next;
-+ write_one_revoke_record(journal, transaction,
-+ &descriptor, &offset,
-+ record);
-+ count++;
-+ list_del(&record->hash);
-+ kmem_cache_free(revoke_record_cache, record);
-+ }
-+ }
-+ if (descriptor)
-+ flush_descriptor(journal, descriptor, offset);
-+ jbd_debug(1, "Wrote %d revoke records\n", count);
-+}
-+
-+/*
-+ * Write out one revoke record. We need to create a new descriptor
-+ * block if the old one is full or if we have not already created one.
-+ */
-+
-+static void write_one_revoke_record(journal_t *journal,
-+ transaction_t *transaction,
-+ struct journal_head **descriptorp,
-+ int *offsetp,
-+ struct jbd_revoke_record_s *record)
-+{
-+ struct journal_head *descriptor;
-+ int offset;
-+ journal_header_t *header;
-+
-+ /* If we are already aborting, this all becomes a noop. We
-+ still need to go round the loop in
-+ journal_write_revoke_records in order to free all of the
-+ revoke records: only the IO to the journal is omitted. */
-+ if (is_journal_aborted(journal))
-+ return;
-+
-+ descriptor = *descriptorp;
-+ offset = *offsetp;
-+
-+ /* Make sure we have a descriptor with space left for the record */
-+ if (descriptor) {
-+ if (offset == journal->j_blocksize) {
-+ flush_descriptor(journal, descriptor, offset);
-+ descriptor = NULL;
-+ }
-+ }
-+
-+ if (!descriptor) {
-+ descriptor = journal_get_descriptor_buffer(journal);
-+ if (!descriptor)
-+ return;
-+ header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
-+ header->h_magic = htonl(JFS_MAGIC_NUMBER);
-+ header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
-+ header->h_sequence = htonl(transaction->t_tid);
-+
-+ /* Record it so that we can wait for IO completion later */
-+ JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
-+ journal_file_buffer(descriptor, transaction, BJ_LogCtl);
-+
-+ offset = sizeof(journal_revoke_header_t);
-+ *descriptorp = descriptor;
-+ }
-+
-+ * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) =
-+ htonl(record->blocknr);
-+ offset += 4;
-+ *offsetp = offset;
-+}
-+
-+/*
-+ * Flush a revoke descriptor out to the journal. If we are aborting,
-+ * this is a noop; otherwise we are generating a buffer which needs to
-+ * be waited for during commit, so it has to go onto the appropriate
-+ * journal buffer list.
-+ */
-+
-+static void flush_descriptor(journal_t *journal,
-+ struct journal_head *descriptor,
-+ int offset)
-+{
-+ journal_revoke_header_t *header;
-+
-+ if (is_journal_aborted(journal)) {
-+ JBUFFER_TRACE(descriptor, "brelse");
-+ unlock_buffer(jh2bh(descriptor));
-+ __brelse(jh2bh(descriptor));
-+ return;
-+ }
-+
-+ header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
-+ header->r_count = htonl(offset);
-+ set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
-+ {
-+ struct buffer_head *bh = jh2bh(descriptor);
-+ BUFFER_TRACE(bh, "write");
-+ clear_bit(BH_Dirty, &bh->b_state);
-+ bh->b_end_io = journal_end_buffer_io_sync;
-+ submit_bh(WRITE, bh);
-+ }
-+}
-+
-+#endif
-+
-+/*
-+ * Revoke support for recovery.
-+ *
-+ * Recovery needs to be able to:
-+ *
-+ * record all revoke records, including the tid of the latest instance
-+ * of each revoke in the journal
-+ *
-+ * check whether a given block in a given transaction should be replayed
-+ * (ie. has not been revoked by a revoke record in that or a subsequent
-+ * transaction)
-+ *
-+ * empty the revoke table after recovery.
-+ */
-+
-+/*
-+ * First, setting revoke records. We create a new revoke record for
-+ * every block ever revoked in the log as we scan it for recovery, and
-+ * we update the existing records if we find multiple revokes for a
-+ * single block.
-+ */
-+
-+int journal_set_revoke(journal_t *journal,
-+ unsigned long blocknr,
-+ tid_t sequence)
-+{
-+ struct jbd_revoke_record_s *record;
-+
-+ record = find_revoke_record(journal, blocknr);
-+ if (record) {
-+ /* If we have multiple occurences, only record the
-+ * latest sequence number in the hashed record */
-+ if (tid_gt(sequence, record->sequence))
-+ record->sequence = sequence;
-+ return 0;
-+ }
-+ return insert_revoke_hash(journal, blocknr, sequence);
-+}
-+
-+/*
-+ * Test revoke records. For a given block referenced in the log, has
-+ * that block been revoked? A revoke record with a given transaction
-+ * sequence number revokes all blocks in that transaction and earlier
-+ * ones, but later transactions still need replayed.
-+ */
-+
-+int journal_test_revoke(journal_t *journal,
-+ unsigned long blocknr,
-+ tid_t sequence)
-+{
-+ struct jbd_revoke_record_s *record;
-+
-+ record = find_revoke_record(journal, blocknr);
-+ if (!record)
-+ return 0;
-+ if (tid_gt(sequence, record->sequence))
-+ return 0;
-+ return 1;
-+}
-+
-+/*
-+ * Finally, once recovery is over, we need to clear the revoke table so
-+ * that it can be reused by the running filesystem.
-+ */
-+
-+void journal_clear_revoke(journal_t *journal)
-+{
-+ int i;
-+ struct list_head *hash_list;
-+ struct jbd_revoke_record_s *record;
-+ struct jbd_revoke_table_s *revoke;
-+
-+ revoke = journal->j_revoke;
-+
-+ for (i = 0; i < revoke->hash_size; i++) {
-+ hash_list = &revoke->hash_table[i];
-+ while (!list_empty(hash_list)) {
-+ record = (struct jbd_revoke_record_s*) hash_list->next;
-+ list_del(&record->hash);
-+ kmem_cache_free(revoke_record_cache, record);
-+ }
-+ }
-+}
-+
-diff -ruP linux.mcp2/fs/jbd/transaction.c linuxppc_2.4.19_final/fs/jbd/transaction.c
---- linux.mcp2/fs/jbd/transaction.c 1969-12-31 16:00:00.000000000 -0800
-+++ linuxppc_2.4.19_final/fs/jbd/transaction.c 2004-05-17 13:56:17.000000000 -0700
-@@ -0,0 +1,2055 @@
-+/*
-+ * linux/fs/transaction.c
-+ *
-+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
-+ *
-+ * Copyright 1998 Red Hat corp --- All Rights Reserved
-+ *
-+ * This file is part of the Linux kernel and is made available under
-+ * the terms of the GNU General Public License, version 2, or at your
-+ * option, any later version, incorporated herein by reference.
-+ *
-+ * Generic filesystem transaction handling code; part of the ext2fs
-+ * journaling system.
-+ *
-+ * This file manages transactions (compound commits managed by the
-+ * journaling code) and handles (individual atomic operations by the
-+ * filesystem).
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/errno.h>
-+#include <linux/slab.h>
-+#include <linux/locks.h>
-+#include <linux/timer.h>
-+#include <linux/smp_lock.h>
-+#include <linux/mm.h>
-+
-+extern spinlock_t journal_datalist_lock;
-+
-+/*
-+ * get_transaction: obtain a new transaction_t object.
-+ *
-+ * Simply allocate and initialise a new transaction. Create it in
-+ * RUNNING state and add it to the current journal (which should not
-+ * have an existing running transaction: we only make a new transaction
-+ * once we have started to commit the old one).
-+ *
-+ * Preconditions:
-+ * The journal MUST be locked. We don't perform atomic mallocs on the
-+ * new transaction and we can't block without protecting against other
-+ * processes trying to touch the journal while it is in transition.
-+ */
-+
-+static transaction_t * get_transaction (journal_t * journal, int is_try)
-+{
-+ transaction_t * transaction;
-+
-+ transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
-+ if (!transaction)
-+ return NULL;
-+
-+ memset (transaction, 0, sizeof (transaction_t));
-+
-+ transaction->t_journal = journal;
-+ transaction->t_state = T_RUNNING;
-+ transaction->t_tid = journal->j_transaction_sequence++;
-+ transaction->t_expires = jiffies + journal->j_commit_interval;
-+
-+ /* Set up the commit timer for the new transaction. */
-+ J_ASSERT (!journal->j_commit_timer_active);
-+ journal->j_commit_timer_active = 1;
-+ journal->j_commit_timer->expires = transaction->t_expires;
-+ add_timer(journal->j_commit_timer);
-+
-+ J_ASSERT (journal->j_running_transaction == NULL);
-+ journal->j_running_transaction = transaction;
-+
-+ return transaction;
-+}
-+
-+/*
-+ * Handle management.
-+ *
-+ * A handle_t is an object which represents a single atomic update to a
-+ * filesystem, and which tracks all of the modifications which form part
-+ * of that one update.
-+ */
-+
-+/*
-+ * start_this_handle: Given a handle, deal with any locking or stalling
-+ * needed to make sure that there is enough journal space for the handle
-+ * to begin. Attach the handle to a transaction and set up the
-+ * transaction's buffer credits.
-+ */
-+
-+static int start_this_handle(journal_t *journal, handle_t *handle)
-+{
-+ transaction_t *transaction;
-+ int needed;
-+ int nblocks = handle->h_buffer_credits;
-+
-+ jbd_debug(3, "New handle %p going live.\n", handle);
-+
-+repeat:
-+
-+ lock_journal(journal);
-+
-+repeat_locked:
-+
-+ if (is_journal_aborted(journal) ||
-+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
-+ unlock_journal(journal);
-+ return -EROFS;
-+ }
-+
-+ /* Wait on the journal's transaction barrier if necessary */
-+ if (journal->j_barrier_count) {
-+ unlock_journal(journal);
-+ sleep_on(&journal->j_wait_transaction_locked);
-+ goto repeat;
-+ }
-+
-+ if (!journal->j_running_transaction)
-+ get_transaction(journal, 0);
-+ /* @@@ Error? */
-+ J_ASSERT(journal->j_running_transaction);
-+
-+ transaction = journal->j_running_transaction;
-+
-+ /* If the current transaction is locked down for commit, wait
-+ * for the lock to be released. */
-+
-+ if (transaction->t_state == T_LOCKED) {
-+ unlock_journal(journal);
-+ jbd_debug(3, "Handle %p stalling...\n", handle);
-+ sleep_on(&journal->j_wait_transaction_locked);
-+ goto repeat;
-+ }
-+
-+ /* If there is not enough space left in the log to write all
-+ * potential buffers requested by this operation, we need to
-+ * stall pending a log checkpoint to free some more log
-+ * space. */
-+
-+ needed = transaction->t_outstanding_credits + nblocks;
-+
-+ if (needed > journal->j_max_transaction_buffers) {
-+ /* If the current transaction is already too large, then
-+ * start to commit it: we can then go back and attach
-+ * this handle to a new transaction. */
-+
-+ jbd_debug(2, "Handle %p starting new commit...\n", handle);
-+ log_start_commit(journal, transaction);
-+ unlock_journal(journal);
-+ sleep_on(&journal->j_wait_transaction_locked);
-+ lock_journal(journal);
-+ goto repeat_locked;
-+ }
-+
-+ /*
-+ * The commit code assumes that it can get enough log space
-+ * without forcing a checkpoint. This is *critical* for
-+ * correctness: a checkpoint of a buffer which is also
-+ * associated with a committing transaction creates a deadlock,
-+ * so commit simply cannot force through checkpoints.
-+ *
-+ * We must therefore ensure the necessary space in the journal
-+ * *before* starting to dirty potentially checkpointed buffers
-+ * in the new transaction.
-+ *
-+ * The worst part is, any transaction currently committing can
-+ * reduce the free space arbitrarily. Be careful to account for
-+ * those buffers when checkpointing.
-+ */
-+
-+ /*
-+ * @@@ AKPM: This seems rather over-defensive. We're giving commit
-+ * a _lot_ of headroom: 1/4 of the journal plus the size of
-+ * the committing transaction. Really, we only need to give it
-+ * committing_transaction->t_outstanding_credits plus "enough" for
-+ * the log control blocks.
-+ * Also, this test is inconsitent with the matching one in
-+ * journal_extend().
-+ */
-+ needed = journal->j_max_transaction_buffers;
-+ if (journal->j_committing_transaction)
-+ needed += journal->j_committing_transaction->
-+ t_outstanding_credits;
-+
-+ if (log_space_left(journal) < needed) {
-+ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-+ log_wait_for_space(journal, needed);
-+ goto repeat_locked;
-+ }
-+
-+ /* OK, account for the buffers that this operation expects to
-+ * use and add the handle to the running transaction. */
-+
-+ handle->h_transaction = transaction;
-+ transaction->t_outstanding_credits += nblocks;
-+ transaction->t_updates++;
-+ transaction->t_handle_count++;
-+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-+ handle, nblocks, transaction->t_outstanding_credits,
-+ log_space_left(journal));
-+
-+ unlock_journal(journal);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Obtain a new handle.
-+ *
-+ * We make sure that the transaction can guarantee at least nblocks of
-+ * modified buffers in the log. We block until the log can guarantee
-+ * that much space.
-+ *
-+ * This function is visible to journal users (like ext2fs), so is not
-+ * called with the journal already locked.
-+ *
-+ * Return a pointer to a newly allocated handle, or NULL on failure
-+ */
-+
-+handle_t *journal_start(journal_t *journal, int nblocks)
-+{
-+ handle_t *handle = journal_current_handle();
-+ int err;
-+
-+ if (!journal)
-+ return ERR_PTR(-EROFS);
-+
-+ if (handle) {
-+ J_ASSERT(handle->h_transaction->t_journal == journal);
-+ handle->h_ref++;
-+ return handle;
-+ }
-+
-+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+ if (!handle)
-+ return ERR_PTR(-ENOMEM);
-+ memset (handle, 0, sizeof (handle_t));
-+
-+ handle->h_buffer_credits = nblocks;
-+ handle->h_ref = 1;
-+ current->journal_info = handle;
-+
-+ err = start_this_handle(journal, handle);
-+ if (err < 0) {
-+ kfree(handle);
-+ current->journal_info = NULL;
-+ return ERR_PTR(err);
-+ }
-+
-+ return handle;
-+}
-+
-+/*
-+ * Return zero on success
-+ */
-+static int try_start_this_handle(journal_t *journal, handle_t *handle)
-+{
-+ transaction_t *transaction;
-+ int needed;
-+ int nblocks = handle->h_buffer_credits;
-+ int ret = 0;
-+
-+ jbd_debug(3, "New handle %p maybe going live.\n", handle);
-+
-+ lock_journal(journal);
-+
-+ if (is_journal_aborted(journal) ||
-+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
-+ ret = -EROFS;
-+ goto fail_unlock;
-+ }
-+
-+ if (journal->j_barrier_count)
-+ goto fail_unlock;
-+
-+ if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
-+ goto fail_unlock;
-+
-+ transaction = journal->j_running_transaction;
-+ if (transaction->t_state == T_LOCKED)
-+ goto fail_unlock;
-+
-+ needed = transaction->t_outstanding_credits + nblocks;
-+ /* We could run log_start_commit here */
-+ if (needed > journal->j_max_transaction_buffers)
-+ goto fail_unlock;
-+
-+ needed = journal->j_max_transaction_buffers;
-+ if (journal->j_committing_transaction)
-+ needed += journal->j_committing_transaction->
-+ t_outstanding_credits;
-+
-+ if (log_space_left(journal) < needed)
-+ goto fail_unlock;
-+
-+ handle->h_transaction = transaction;
-+ transaction->t_outstanding_credits += nblocks;
-+ transaction->t_updates++;
-+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-+ handle, nblocks, transaction->t_outstanding_credits,
-+ log_space_left(journal));
-+ unlock_journal(journal);
-+ return 0;
-+
-+fail_unlock:
-+ unlock_journal(journal);
-+ if (ret >= 0)
-+ ret = -1;
-+ return ret;
-+}
-+
-+/*
-+ * Try to start a handle, but non-blockingly. If we weren't able
-+ * to, return an ERR_PTR value.
-+ */
-+handle_t *journal_try_start(journal_t *journal, int nblocks)
-+{
-+ handle_t *handle = journal_current_handle();
-+ int err;
-+
-+ if (!journal)
-+ return ERR_PTR(-EROFS);
-+
-+ if (handle) {
-+ jbd_debug(4, "h_ref %d -> %d\n",
-+ handle->h_ref,
-+ handle->h_ref + 1);
-+ J_ASSERT(handle->h_transaction->t_journal == journal);
-+ if (is_handle_aborted(handle))
-+ return ERR_PTR(-EIO);
-+ handle->h_ref++;
-+ return handle;
-+ } else {
-+ jbd_debug(4, "no current transaction\n");
-+ }
-+
-+ if (is_journal_aborted(journal))
-+ return ERR_PTR(-EIO);
-+
-+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+ if (!handle)
-+ return ERR_PTR(-ENOMEM);
-+ memset (handle, 0, sizeof (handle_t));
-+
-+ handle->h_buffer_credits = nblocks;
-+ handle->h_ref = 1;
-+ current->journal_info = handle;
-+
-+ err = try_start_this_handle(journal, handle);
-+ if (err < 0) {
-+ kfree(handle);
-+ current->journal_info = NULL;
-+ return ERR_PTR(err);
-+ }
-+
-+ return handle;
-+}
-+
-+/*
-+ * journal_extend: extend buffer credits.
-+ *
-+ * Some transactions, such as large extends and truncates, can be done
-+ * atomically all at once or in several stages. The operation requests
-+ * a credit for a number of buffer modications in advance, but can
-+ * extend its credit if it needs more.
-+ *
-+ * journal_extend tries to give the running handle more buffer credits.
-+ * It does not guarantee that allocation: this is a best-effort only.
-+ * The calling process MUST be able to deal cleanly with a failure to
-+ * extend here.
-+ *
-+ * Return 0 on success, non-zero on failure.
-+ *
-+ * return code < 0 implies an error
-+ * return code > 0 implies normal transaction-full status.
-+ */
-+
-+int journal_extend (handle_t *handle, int nblocks)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ int result;
-+ int wanted;
-+
-+ lock_journal (journal);
-+
-+ result = -EIO;
-+ if (is_handle_aborted(handle))
-+ goto error_out;
-+
-+ result = 1;
-+
-+ /* Don't extend a locked-down transaction! */
-+ if (handle->h_transaction->t_state != T_RUNNING) {
-+ jbd_debug(3, "denied handle %p %d blocks: "
-+ "transaction not running\n", handle, nblocks);
-+ goto error_out;
-+ }
-+
-+ wanted = transaction->t_outstanding_credits + nblocks;
-+
-+ if (wanted > journal->j_max_transaction_buffers) {
-+ jbd_debug(3, "denied handle %p %d blocks: "
-+ "transaction too large\n", handle, nblocks);
-+ goto error_out;
-+ }
-+
-+ if (wanted > log_space_left(journal)) {
-+ jbd_debug(3, "denied handle %p %d blocks: "
-+ "insufficient log space\n", handle, nblocks);
-+ goto error_out;
-+ }
-+
-+ handle->h_buffer_credits += nblocks;
-+ transaction->t_outstanding_credits += nblocks;
-+ result = 0;
-+
-+ jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-+
-+error_out:
-+ unlock_journal (journal);
-+ return result;
-+}
-+
-+
-+/*
-+ * journal_restart: restart a handle for a multi-transaction filesystem
-+ * operation.
-+ *
-+ * If the journal_extend() call above fails to grant new buffer credits
-+ * to a running handle, a call to journal_restart will commit the
-+ * handle's transaction so far and reattach the handle to a new
-+ * transaction capabable of guaranteeing the requested number of
-+ * credits.
-+ */
-+
-+int journal_restart(handle_t *handle, int nblocks)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ int ret;
-+
-+ /* If we've had an abort of any type, don't even think about
-+ * actually doing the restart! */
-+ if (is_handle_aborted(handle))
-+ return 0;
-+
-+ /* First unlink the handle from its current transaction, and
-+ * start the commit on that. */
-+
-+ J_ASSERT (transaction->t_updates > 0);
-+ J_ASSERT (journal_current_handle() == handle);
-+
-+ transaction->t_outstanding_credits -= handle->h_buffer_credits;
-+ transaction->t_updates--;
-+
-+ if (!transaction->t_updates)
-+ wake_up(&journal->j_wait_updates);
-+
-+ jbd_debug(2, "restarting handle %p\n", handle);
-+ log_start_commit(journal, transaction);
-+
-+ handle->h_buffer_credits = nblocks;
-+ ret = start_this_handle(journal, handle);
-+ return ret;
-+}
-+
-+
-+/*
-+ * Barrier operation: establish a transaction barrier.
-+ *
-+ * This locks out any further updates from being started, and blocks
-+ * until all existing updates have completed, returning only once the
-+ * journal is in a quiescent state with no updates running.
-+ *
-+ * The journal lock should not be held on entry.
-+ */
-+
-+void journal_lock_updates (journal_t *journal)
-+{
-+ lock_journal(journal);
-+ ++journal->j_barrier_count;
-+
-+ /* Wait until there are no running updates */
-+ while (1) {
-+ transaction_t *transaction = journal->j_running_transaction;
-+ if (!transaction)
-+ break;
-+ if (!transaction->t_updates)
-+ break;
-+
-+ unlock_journal(journal);
-+ sleep_on(&journal->j_wait_updates);
-+ lock_journal(journal);
-+ }
-+
-+ unlock_journal(journal);
-+
-+ /* We have now established a barrier against other normal
-+ * updates, but we also need to barrier against other
-+ * journal_lock_updates() calls to make sure that we serialise
-+ * special journal-locked operations too. */
-+ down(&journal->j_barrier);
-+}
-+
-+/*
-+ * Release a transaction barrier obtained with journal_lock_updates().
-+ *
-+ * Should be called without the journal lock held.
-+ */
-+
-+void journal_unlock_updates (journal_t *journal)
-+{
-+ lock_journal(journal);
-+
-+ J_ASSERT (journal->j_barrier_count != 0);
-+
-+ up(&journal->j_barrier);
-+ --journal->j_barrier_count;
-+ wake_up(&journal->j_wait_transaction_locked);
-+ unlock_journal(journal);
-+}
-+
-+/*
-+ * journal_get_write_access: notify intent to modify a buffer for metadata
-+ * (not data) update.
-+ *
-+ * If the buffer is already part of the current transaction, then there
-+ * is nothing we need to do. If it is already part of a prior
-+ * transaction which we are still committing to disk, then we need to
-+ * make sure that we do not overwrite the old copy: we do copy-out to
-+ * preserve the copy going to disk. We also account the buffer against
-+ * the handle's metadata buffer credits (unless the buffer is already
-+ * part of the transaction, that is).
-+ *
-+ * Returns an error code or 0 on success.
-+ *
-+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
-+ * because we're write()ing a buffer which is also part of a shared mapping.
-+ */
-+
-+static int
-+do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ int error;
-+ char *frozen_buffer = NULL;
-+ int need_copy = 0;
-+
-+ jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
-+
-+ JBUFFER_TRACE(jh, "entry");
-+repeat:
-+ /* @@@ Need to check for errors here at some point. */
-+
-+ /*
-+ * AKPM: neither bdflush nor kupdate run with the BKL. There's
-+ * nothing we can do to prevent them from starting writeout of a
-+ * BUF_DIRTY buffer at any time. And checkpointing buffers are on
-+ * BUF_DIRTY. So. We no longer assert that the buffer is unlocked.
-+ *
-+ * However. It is very wrong for us to allow ext3 to start directly
-+ * altering the ->b_data of buffers which may at that very time be
-+ * undergoing writeout to the client filesystem. This can leave
-+ * the filesystem in an inconsistent, transient state if we crash.
-+ * So what we do is to steal the buffer if it is in checkpoint
-+ * mode and dirty. The journal lock will keep out checkpoint-mode
-+ * state transitions within journal_remove_checkpoint() and the buffer
-+ * is locked to keep bdflush/kupdate/whoever away from it as well.
-+ *
-+ * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
-+ * simple lock_journal(). This code here will care for locked buffers.
-+ */
-+ /*
-+ * The buffer_locked() || buffer_dirty() tests here are simply an
-+ * optimisation tweak. If anyone else in the system decides to
-+ * lock this buffer later on, we'll blow up. There doesn't seem
-+ * to be a good reason why they should do this.
-+ */
-+ if (jh->b_cp_transaction &&
-+ (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
-+ unlock_journal(journal);
-+ lock_buffer(jh2bh(jh));
-+ spin_lock(&journal_datalist_lock);
-+ if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
-+ /* OK, we need to steal it */
-+ JBUFFER_TRACE(jh, "stealing from checkpoint mode");
-+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+ J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
-+
-+ J_ASSERT(handle->h_buffer_credits > 0);
-+ handle->h_buffer_credits--;
-+
-+ /* This will clear BH_Dirty and set BH_JBDDirty. */
-+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
-+ __journal_file_buffer(jh, transaction, BJ_Reserved);
-+
-+ /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
-+ refile_buffer(jh2bh(jh));
-+
-+ /*
-+ * The buffer is now hidden from bdflush. It is
-+ * metadata against the current transaction.
-+ */
-+ JBUFFER_TRACE(jh, "steal from cp mode is complete");
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_buffer(jh2bh(jh));
-+ lock_journal(journal);
-+ goto repeat;
-+ }
-+
-+ J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
-+
-+ error = -EROFS;
-+ if (is_handle_aborted(handle))
-+ goto out_unlocked;
-+ error = 0;
-+
-+ spin_lock(&journal_datalist_lock);
-+
-+ /* The buffer is already part of this transaction if
-+ * b_transaction or b_next_transaction points to it. */
-+
-+ if (jh->b_transaction == transaction ||
-+ jh->b_next_transaction == transaction)
-+ goto done_locked;
-+
-+ /* If there is already a copy-out version of this buffer, then
-+ * we don't need to make another one. */
-+
-+ if (jh->b_frozen_data) {
-+ JBUFFER_TRACE(jh, "has frozen data");
-+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+ jh->b_next_transaction = transaction;
-+
-+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
-+ handle->h_buffer_credits--;
-+ goto done_locked;
-+ }
-+
-+ /* Is there data here we need to preserve? */
-+
-+ if (jh->b_transaction && jh->b_transaction != transaction) {
-+ JBUFFER_TRACE(jh, "owned by older transaction");
-+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+ J_ASSERT_JH(jh, jh->b_transaction ==
-+ journal->j_committing_transaction);
-+
-+ /* There is one case we have to be very careful about.
-+ * If the committing transaction is currently writing
-+ * this buffer out to disk and has NOT made a copy-out,
-+ * then we cannot modify the buffer contents at all
-+ * right now. The essence of copy-out is that it is the
-+ * extra copy, not the primary copy, which gets
-+ * journaled. If the primary copy is already going to
-+ * disk then we cannot do copy-out here. */
-+
-+ if (jh->b_jlist == BJ_Shadow) {
-+ JBUFFER_TRACE(jh, "on shadow: sleep");
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ /* commit wakes up all shadow buffers after IO */
-+ sleep_on(&jh2bh(jh)->b_wait);
-+ lock_journal(journal);
-+ goto repeat;
-+ }
-+
-+ /* Only do the copy if the currently-owning transaction
-+ * still needs it. If it is on the Forget list, the
-+ * committing transaction is past that stage. The
-+ * buffer had better remain locked during the kmalloc,
-+ * but that should be true --- we hold the journal lock
-+ * still and the buffer is already on the BUF_JOURNAL
-+ * list so won't be flushed.
-+ *
-+ * Subtle point, though: if this is a get_undo_access,
-+ * then we will be relying on the frozen_data to contain
-+ * the new value of the committed_data record after the
-+ * transaction, so we HAVE to force the frozen_data copy
-+ * in that case. */
-+
-+ if (jh->b_jlist != BJ_Forget || force_copy) {
-+ JBUFFER_TRACE(jh, "generate frozen data");
-+ if (!frozen_buffer) {
-+ JBUFFER_TRACE(jh, "allocate memory for buffer");
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
-+ GFP_NOFS);
-+ lock_journal(journal);
-+ if (!frozen_buffer) {
-+ printk(KERN_EMERG __FUNCTION__
-+ "OOM for frozen_buffer\n");
-+ JBUFFER_TRACE(jh, "oom!");
-+ error = -ENOMEM;
-+ spin_lock(&journal_datalist_lock);
-+ goto done_locked;
-+ }
-+ goto repeat;
-+ }
-+
-+ jh->b_frozen_data = frozen_buffer;
-+ frozen_buffer = NULL;
-+ need_copy = 1;
-+ }
-+ jh->b_next_transaction = transaction;
-+ }
-+
-+ J_ASSERT(handle->h_buffer_credits > 0);
-+ handle->h_buffer_credits--;
-+
-+ /* Finally, if the buffer is not journaled right now, we need to
-+ * make sure it doesn't get written to disk before the caller
-+ * actually commits the new data. */
-+
-+ if (!jh->b_transaction) {
-+ JBUFFER_TRACE(jh, "no transaction");
-+ J_ASSERT_JH(jh, !jh->b_next_transaction);
-+ jh->b_transaction = transaction;
-+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
-+ __journal_file_buffer(jh, transaction, BJ_Reserved);
-+ }
-+
-+done_locked:
-+ spin_unlock(&journal_datalist_lock);
-+ if (need_copy) {
-+ struct page *page;
-+ int offset;
-+ char *source;
-+
-+ J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
-+ page = jh2bh(jh)->b_page;
-+ offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
-+ source = kmap(page);
-+ memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-+ kunmap(page);
-+ }
-+
-+
-+ /* If we are about to journal a buffer, then any revoke pending
-+ on it is no longer valid. */
-+ journal_cancel_revoke(handle, jh);
-+
-+out_unlocked:
-+ if (frozen_buffer)
-+ kfree(frozen_buffer);
-+
-+ JBUFFER_TRACE(jh, "exit");
-+ return error;
-+}
-+
-+int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ struct journal_head *jh = journal_add_journal_head(bh);
-+ int rc;
-+
-+ /* We do not want to get caught playing with fields which the
-+ * log thread also manipulates. Make sure that the buffer
-+ * completes any outstanding IO before proceeding. */
-+ lock_journal(journal);
-+ rc = do_get_write_access(handle, jh, 0);
-+ journal_unlock_journal_head(jh);
-+ unlock_journal(journal);
-+ return rc;
-+}
-+
-+
-+/*
-+ * When the user wants to journal a newly created buffer_head
-+ * (ie. getblk() returned a new buffer and we are going to populate it
-+ * manually rather than reading off disk), then we need to keep the
-+ * buffer_head locked until it has been completely filled with new
-+ * data. In this case, we should be able to make the assertion that
-+ * the bh is not already part of an existing transaction.
-+ *
-+ * The buffer should already be locked by the caller by this point.
-+ * There is no lock ranking violation: it was a newly created,
-+ * unlocked buffer beforehand. */
-+
-+int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ struct journal_head *jh = journal_add_journal_head(bh);
-+ int err;
-+
-+ jbd_debug(5, "journal_head %p\n", jh);
-+ lock_journal(journal);
-+ err = -EROFS;
-+ if (is_handle_aborted(handle))
-+ goto out;
-+ err = 0;
-+
-+ JBUFFER_TRACE(jh, "entry");
-+ /* The buffer may already belong to this transaction due to
-+ * pre-zeroing in the filesystem's new_block code. It may also
-+ * be on the previous, committing transaction's lists, but it
-+ * HAS to be in Forget state in that case: the transaction must
-+ * have deleted the buffer for it to be reused here. */
-+ J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
-+ jh->b_transaction == NULL ||
-+ (jh->b_transaction == journal->j_committing_transaction &&
-+ jh->b_jlist == BJ_Forget)));
-+
-+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
-+
-+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
-+ handle->h_buffer_credits--;
-+
-+ spin_lock(&journal_datalist_lock);
-+ if (jh->b_transaction == NULL) {
-+ jh->b_transaction = transaction;
-+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
-+ __journal_file_buffer(jh, transaction, BJ_Reserved);
-+ JBUFFER_TRACE(jh, "refile");
-+ refile_buffer(jh2bh(jh));
-+ } else if (jh->b_transaction == journal->j_committing_transaction) {
-+ JBUFFER_TRACE(jh, "set next transaction");
-+ jh->b_next_transaction = transaction;
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+
-+ /*
-+ * akpm: I added this. ext3_alloc_branch can pick up new indirect
-+ * blocks which contain freed but then revoked metadata. We need
-+ * to cancel the revoke in case we end up freeing it yet again
-+ * and the reallocating as data - this would cause a second revoke,
-+ * which hits an assertion error.
-+ */
-+ JBUFFER_TRACE(jh, "cancelling revoke");
-+ journal_cancel_revoke(handle, jh);
-+ journal_unlock_journal_head(jh);
-+out:
-+ unlock_journal(journal);
-+ return err;
-+}
-+
-+
-+
-+/*
-+ * journal_get_undo_access: Notify intent to modify metadata with non-
-+ * rewindable consequences
-+ *
-+ * Sometimes there is a need to distinguish between metadata which has
-+ * been committed to disk and that which has not. The ext3fs code uses
-+ * this for freeing and allocating space: we have to make sure that we
-+ * do not reuse freed space until the deallocation has been committed,
-+ * since if we overwrote that space we would make the delete
-+ * un-rewindable in case of a crash.
-+ *
-+ * To deal with that, journal_get_undo_access requests write access to a
-+ * buffer for parts of non-rewindable operations such as delete
-+ * operations on the bitmaps. The journaling code must keep a copy of
-+ * the buffer's contents prior to the undo_access call until such time
-+ * as we know that the buffer has definitely been committed to disk.
-+ *
-+ * We never need to know which transaction the committed data is part
-+ * of: buffers touched here are guaranteed to be dirtied later and so
-+ * will be committed to a new transaction in due course, at which point
-+ * we can discard the old committed data pointer.
-+ *
-+ * Returns error number or 0 on success.
-+ */
-+
-+int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
-+{
-+ journal_t *journal = handle->h_transaction->t_journal;
-+ int err;
-+ struct journal_head *jh = journal_add_journal_head(bh);
-+
-+ JBUFFER_TRACE(jh, "entry");
-+ lock_journal(journal);
-+
-+ /* Do this first --- it can drop the journal lock, so we want to
-+ * make sure that obtaining the committed_data is done
-+ * atomically wrt. completion of any outstanding commits. */
-+ err = do_get_write_access (handle, jh, 1);
-+ if (err)
-+ goto out;
-+
-+ if (!jh->b_committed_data) {
-+ /* Copy out the current buffer contents into the
-+ * preserved, committed copy. */
-+ JBUFFER_TRACE(jh, "generate b_committed data");
-+ jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
-+ GFP_NOFS);
-+ if (!jh->b_committed_data) {
-+ printk(KERN_EMERG __FUNCTION__
-+ ": No memory for committed data!\n");
-+ err = -ENOMEM;
-+ goto out;
-+ }
-+
-+ memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
-+ jh2bh(jh)->b_size);
-+ }
-+
-+out:
-+ if (!err)
-+ J_ASSERT_JH(jh, jh->b_committed_data);
-+ journal_unlock_journal_head(jh);
-+ unlock_journal(journal);
-+ return err;
-+}
-+
-+/*
-+ * journal_dirty_data: mark a buffer as containing dirty data which
-+ * needs to be flushed before we can commit the current transaction.
-+ *
-+ * The buffer is placed on the transaction's data list and is marked as
-+ * belonging to the transaction.
-+ *
-+ * If `async' is set then the writebask will be initiated by the caller
-+ * using submit_bh -> end_buffer_io_async. We put the buffer onto
-+ * t_async_datalist.
-+ *
-+ * Returns error number or 0 on success.
-+ *
-+ * journal_dirty_data() can be called via page_launder->ext3_writepage
-+ * by kswapd. So it cannot block. Happily, there's nothing here
-+ * which needs lock_journal if `async' is set.
-+ *
-+ * When the buffer is on the current transaction we freely move it
-+ * between BJ_AsyncData and BJ_SyncData according to who tried to
-+ * change its state last.
-+ */
-+
-+int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
-+{
-+ journal_t *journal = handle->h_transaction->t_journal;
-+ int need_brelse = 0;
-+ int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
-+ struct journal_head *jh;
-+
-+ if (is_handle_aborted(handle))
-+ return 0;
-+
-+ jh = journal_add_journal_head(bh);
-+ JBUFFER_TRACE(jh, "entry");
-+
-+ /*
-+ * The buffer could *already* be dirty. Writeout can start
-+ * at any time.
-+ */
-+ jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-+
-+ /*
-+ * What if the buffer is already part of a running transaction?
-+ *
-+ * There are two cases:
-+ * 1) It is part of the current running transaction. Refile it,
-+ * just in case we have allocated it as metadata, deallocated
-+ * it, then reallocated it as data.
-+ * 2) It is part of the previous, still-committing transaction.
-+ * If all we want to do is to guarantee that the buffer will be
-+ * written to disk before this new transaction commits, then
-+ * being sure that the *previous* transaction has this same
-+ * property is sufficient for us! Just leave it on its old
-+ * transaction.
-+ *
-+ * In case (2), the buffer must not already exist as metadata
-+ * --- that would violate write ordering (a transaction is free
-+ * to write its data at any point, even before the previous
-+ * committing transaction has committed). The caller must
-+ * never, ever allow this to happen: there's nothing we can do
-+ * about it in this layer.
-+ */
-+ spin_lock(&journal_datalist_lock);
-+ if (jh->b_transaction) {
-+ JBUFFER_TRACE(jh, "has transaction");
-+ if (jh->b_transaction != handle->h_transaction) {
-+ JBUFFER_TRACE(jh, "belongs to older transaction");
-+ J_ASSERT_JH(jh, jh->b_transaction ==
-+ journal->j_committing_transaction);
-+
-+ /* @@@ IS THIS TRUE ? */
-+ /*
-+ * Not any more. Scenario: someone does a write()
-+ * in data=journal mode. The buffer's transaction has
-+ * moved into commit. Then someone does another
-+ * write() to the file. We do the frozen data copyout
-+ * and set b_next_transaction to point to j_running_t.
-+ * And while we're in that state, someone does a
-+ * writepage() in an attempt to pageout the same area
-+ * of the file via a shared mapping. At present that
-+ * calls journal_dirty_data(), and we get right here.
-+ * It may be too late to journal the data. Simply
-+ * falling through to the next test will suffice: the
-+ * data will be dirty and wil be checkpointed. The
-+ * ordering comments in the next comment block still
-+ * apply.
-+ */
-+ //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-+
-+ /*
-+ * If we're journalling data, and this buffer was
-+ * subject to a write(), it could be metadata, forget
-+ * or shadow against the committing transaction. Now,
-+ * someone has dirtied the same darn page via a mapping
-+ * and it is being writepage()'d.
-+ * We *could* just steal the page from commit, with some
-+ * fancy locking there. Instead, we just skip it -
-+ * don't tie the page's buffers to the new transaction
-+ * at all.
-+ * Implication: if we crash before the writepage() data
-+ * is written into the filesystem, recovery will replay
-+ * the write() data.
-+ */
-+ if (jh->b_jlist != BJ_None &&
-+ jh->b_jlist != BJ_SyncData &&
-+ jh->b_jlist != BJ_AsyncData) {
-+ JBUFFER_TRACE(jh, "Not stealing");
-+ goto no_journal;
-+ }
-+
-+ /*
-+ * This buffer may be undergoing writeout in commit. We
-+ * can't return from here and let the caller dirty it
-+ * again because that can cause the write-out loop in
-+ * commit to never terminate.
-+ */
-+ if (!async && buffer_dirty(bh)) {
-+ atomic_inc(&bh->b_count);
-+ spin_unlock(&journal_datalist_lock);
-+ need_brelse = 1;
-+ ll_rw_block(WRITE, 1, &bh);
-+ wait_on_buffer(bh);
-+ spin_lock(&journal_datalist_lock);
-+ /* The buffer may become locked again at any
-+ time if it is redirtied */
-+ }
-+
-+ /* journal_clean_data_list() may have got there first */
-+ if (jh->b_transaction != NULL) {
-+ JBUFFER_TRACE(jh, "unfile from commit");
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ }
-+ /* The buffer will be refiled below */
-+
-+ }
-+ /*
-+ * Special case --- the buffer might actually have been
-+ * allocated and then immediately deallocated in the previous,
-+ * committing transaction, so might still be left on that
-+ * transaction's metadata lists.
-+ */
-+ if (jh->b_jlist != wanted_jlist) {
-+ JBUFFER_TRACE(jh, "not on correct data list: unfile");
-+ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = NULL;
-+ JBUFFER_TRACE(jh, "file as data");
-+ __journal_file_buffer(jh, handle->h_transaction,
-+ wanted_jlist);
-+ }
-+ } else {
-+ JBUFFER_TRACE(jh, "not on a transaction");
-+ __journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
-+ }
-+no_journal:
-+ spin_unlock(&journal_datalist_lock);
-+ if (need_brelse) {
-+ BUFFER_TRACE(bh, "brelse");
-+ __brelse(bh);
-+ }
-+ JBUFFER_TRACE(jh, "exit");
-+ journal_unlock_journal_head(jh);
-+ return 0;
-+}
-+
-+/*
-+ * journal_dirty_metadata: mark a buffer as containing dirty metadata
-+ * which needs to be journaled as part of the current transaction.
-+ *
-+ * The buffer is placed on the transaction's metadata list and is marked
-+ * as belonging to the transaction.
-+ *
-+ * Special care needs to be taken if the buffer already belongs to the
-+ * current committing transaction (in which case we should have frozen
-+ * data present for that commit). In that case, we don't relink the
-+ * buffer: that only gets done when the old transaction finally
-+ * completes its commit.
-+ *
-+ * Returns error number or 0 on success.
-+ */
-+
-+int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ struct journal_head *jh = bh2jh(bh);
-+
-+ jbd_debug(5, "journal_head %p\n", jh);
-+ JBUFFER_TRACE(jh, "entry");
-+ lock_journal(journal);
-+ if (is_handle_aborted(handle))
-+ goto out_unlock;
-+
-+ spin_lock(&journal_datalist_lock);
-+ set_bit(BH_JBDDirty, &bh->b_state);
-+ set_buffer_flushtime(bh);
-+
-+ J_ASSERT_JH(jh, jh->b_transaction != NULL);
-+
-+ /*
-+ * Metadata already on the current transaction list doesn't
-+ * need to be filed. Metadata on another transaction's list must
-+ * be committing, and will be refiled once the commit completes:
-+ * leave it alone for now.
-+ */
-+
-+ if (jh->b_transaction != transaction) {
-+ JBUFFER_TRACE(jh, "already on other transaction");
-+ J_ASSERT_JH(jh, jh->b_transaction ==
-+ journal->j_committing_transaction);
-+ J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
-+ /* And this case is illegal: we can't reuse another
-+ * transaction's data buffer, ever. */
-+ /* FIXME: writepage() should be journalled */
-+ J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
-+ goto done_locked;
-+ }
-+
-+ /* That test should have eliminated the following case: */
-+ J_ASSERT_JH(jh, jh->b_frozen_data == 0);
-+
-+ JBUFFER_TRACE(jh, "file as BJ_Metadata");
-+ __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
-+
-+done_locked:
-+ spin_unlock(&journal_datalist_lock);
-+ JBUFFER_TRACE(jh, "exit");
-+out_unlock:
-+ unlock_journal(journal);
-+ return 0;
-+}
-+
-+#if 0
-+/*
-+ * journal_release_buffer: undo a get_write_access without any buffer
-+ * updates, if the update decided in the end that it didn't need access.
-+ *
-+ * journal_get_write_access() can block, so it is quite possible for a
-+ * journaling component to decide after the write access is returned
-+ * that global state has changed and the update is no longer required. */
-+
-+void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ struct journal_head *jh = bh2jh(bh);
-+
-+ lock_journal(journal);
-+ JBUFFER_TRACE(jh, "entry");
-+
-+ /* If the buffer is reserved but not modified by this
-+ * transaction, then it is safe to release it. In all other
-+ * cases, just leave the buffer as it is. */
-+
-+ spin_lock(&journal_datalist_lock);
-+ if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
-+ !buffer_jdirty(jh2bh(jh))) {
-+ JBUFFER_TRACE(jh, "unused: refiling it");
-+ handle->h_buffer_credits++;
-+ __journal_refile_buffer(jh);
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+
-+ JBUFFER_TRACE(jh, "exit");
-+ unlock_journal(journal);
-+}
-+#endif
-+
-+/*
-+ * journal_forget: bforget() for potentially-journaled buffers. We can
-+ * only do the bforget if there are no commits pending against the
-+ * buffer. If the buffer is dirty in the current running transaction we
-+ * can safely unlink it.
-+ *
-+ * bh may not be a journalled buffer at all - it may be a non-JBD
-+ * buffer which came off the hashtable. Check for this.
-+ *
-+ * Decrements bh->b_count by one.
-+ *
-+ * Allow this call even if the handle has aborted --- it may be part of
-+ * the caller's cleanup after an abort.
-+ */
-+
-+void journal_forget (handle_t *handle, struct buffer_head *bh)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ struct journal_head *jh;
-+
-+ BUFFER_TRACE(bh, "entry");
-+
-+ lock_journal(journal);
-+ spin_lock(&journal_datalist_lock);
-+
-+ if (!buffer_jbd(bh))
-+ goto not_jbd;
-+ jh = bh2jh(bh);
-+
-+ if (jh->b_transaction == handle->h_transaction) {
-+ J_ASSERT_JH(jh, !jh->b_frozen_data);
-+
-+ /* If we are forgetting a buffer which is already part
-+ * of this transaction, then we can just drop it from
-+ * the transaction immediately. */
-+ clear_bit(BH_Dirty, &bh->b_state);
-+ clear_bit(BH_JBDDirty, &bh->b_state);
-+
-+ JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
-+ J_ASSERT_JH(jh, !jh->b_committed_data);
-+
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = 0;
-+
-+ /*
-+ * We are no longer going to journal this buffer.
-+ * However, the commit of this transaction is still
-+ * important to the buffer: the delete that we are now
-+ * processing might obsolete an old log entry, so by
-+ * committing, we can satisfy the buffer's checkpoint.
-+ *
-+ * So, if we have a checkpoint on the buffer, we should
-+ * now refile the buffer on our BJ_Forget list so that
-+ * we know to remove the checkpoint after we commit.
-+ */
-+
-+ if (jh->b_cp_transaction) {
-+ __journal_file_buffer(jh, transaction, BJ_Forget);
-+ } else {
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ if (!buffer_jbd(bh)) {
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ __bforget(bh);
-+ return;
-+ }
-+ }
-+
-+ } else if (jh->b_transaction) {
-+ J_ASSERT_JH(jh, (jh->b_transaction ==
-+ journal->j_committing_transaction));
-+ /* However, if the buffer is still owned by a prior
-+ * (committing) transaction, we can't drop it yet... */
-+ JBUFFER_TRACE(jh, "belongs to older transaction");
-+ /* ... but we CAN drop it from the new transaction if we
-+ * have also modified it since the original commit. */
-+
-+ if (jh->b_next_transaction) {
-+ J_ASSERT(jh->b_next_transaction == transaction);
-+ jh->b_next_transaction = NULL;
-+ }
-+ }
-+
-+not_jbd:
-+ spin_unlock(&journal_datalist_lock);
-+ unlock_journal(journal);
-+ __brelse(bh);
-+ return;
-+}
-+
-+#if 0 /* Unused */
-+/*
-+ * journal_sync_buffer: flush a potentially-journaled buffer to disk.
-+ *
-+ * Used for O_SYNC filesystem operations. If the buffer is journaled,
-+ * we need to complete the O_SYNC by waiting for the transaction to
-+ * complete. It is an error to call journal_sync_buffer before
-+ * journal_stop!
-+ */
-+
-+void journal_sync_buffer(struct buffer_head *bh)
-+{
-+ transaction_t *transaction;
-+ journal_t *journal;
-+ long sequence;
-+ struct journal_head *jh;
-+
-+ /* If the buffer isn't journaled, this is easy: just sync it to
-+ * disk. */
-+ BUFFER_TRACE(bh, "entry");
-+
-+ spin_lock(&journal_datalist_lock);
-+ if (!buffer_jbd(bh)) {
-+ spin_unlock(&journal_datalist_lock);
-+ return;
-+ }
-+ jh = bh2jh(bh);
-+ if (jh->b_transaction == NULL) {
-+ /* If the buffer has already been journaled, then this
-+ * is a noop. */
-+ if (jh->b_cp_transaction == NULL) {
-+ spin_unlock(&journal_datalist_lock);
-+ return;
-+ }
-+ atomic_inc(&bh->b_count);
-+ spin_unlock(&journal_datalist_lock);
-+ ll_rw_block (WRITE, 1, &bh);
-+ wait_on_buffer(bh);
-+ __brelse(bh);
-+ goto out;
-+ }
-+
-+ /* Otherwise, just wait until the transaction is synced to disk. */
-+ transaction = jh->b_transaction;
-+ journal = transaction->t_journal;
-+ sequence = transaction->t_tid;
-+ spin_unlock(&journal_datalist_lock);
-+
-+ jbd_debug(2, "requesting commit for jh %p\n", jh);
-+ log_start_commit (journal, transaction);
-+
-+ while (tid_gt(sequence, journal->j_commit_sequence)) {
-+ wake_up(&journal->j_wait_done_commit);
-+ sleep_on(&journal->j_wait_done_commit);
-+ }
-+ JBUFFER_TRACE(jh, "exit");
-+out:
-+ return;
-+}
-+#endif
-+
-+/*
-+ * All done for a particular handle.
-+ *
-+ * There is not much action needed here. We just return any remaining
-+ * buffer credits to the transaction and remove the handle. The only
-+ * complication is that we need to start a commit operation if the
-+ * filesystem is marked for synchronous update.
-+ *
-+ * journal_stop itself will not usually return an error, but it may
-+ * do so in unusual circumstances. In particular, expect it to
-+ * return -EIO if a journal_abort has been executed since the
-+ * transaction began.
-+ */
-+
-+int journal_stop(handle_t *handle)
-+{
-+ transaction_t *transaction = handle->h_transaction;
-+ journal_t *journal = transaction->t_journal;
-+ int old_handle_count, err;
-+
-+ if (!handle)
-+ return 0;
-+
-+ J_ASSERT (transaction->t_updates > 0);
-+ J_ASSERT (journal_current_handle() == handle);
-+
-+ if (is_handle_aborted(handle))
-+ err = -EIO;
-+ else
-+ err = 0;
-+
-+ if (--handle->h_ref > 0) {
-+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-+ handle->h_ref);
-+ return err;
-+ }
-+
-+ jbd_debug(4, "Handle %p going down\n", handle);
-+
-+ /*
-+ * Implement synchronous transaction batching. If the handle
-+ * was synchronous, don't force a commit immediately. Let's
-+ * yield and let another thread piggyback onto this transaction.
-+ * Keep doing that while new threads continue to arrive.
-+ * It doesn't cost much - we're about to run a commit and sleep
-+ * on IO anyway. Speeds up many-threaded, many-dir operations
-+ * by 30x or more...
-+ */
-+ if (handle->h_sync) {
-+ do {
-+ old_handle_count = transaction->t_handle_count;
-+ set_current_state(TASK_RUNNING);
-+ current->policy |= SCHED_YIELD;
-+ schedule();
-+ } while (old_handle_count != transaction->t_handle_count);
-+ }
-+
-+ current->journal_info = NULL;
-+ transaction->t_outstanding_credits -= handle->h_buffer_credits;
-+ transaction->t_updates--;
-+ if (!transaction->t_updates) {
-+ wake_up(&journal->j_wait_updates);
-+ if (journal->j_barrier_count)
-+ wake_up(&journal->j_wait_transaction_locked);
-+ }
-+
-+ /*
-+ * If the handle is marked SYNC, we need to set another commit
-+ * going! We also want to force a commit if the current
-+ * transaction is occupying too much of the log, or if the
-+ * transaction is too old now.
-+ */
-+ if (handle->h_sync ||
-+ transaction->t_outstanding_credits >
-+ journal->j_max_transaction_buffers ||
-+ time_after_eq(jiffies, transaction->t_expires)) {
-+ /* Do this even for aborted journals: an abort still
-+ * completes the commit thread, it just doesn't write
-+ * anything to disk. */
-+ tid_t tid = transaction->t_tid;
-+
-+ jbd_debug(2, "transaction too old, requesting commit for "
-+ "handle %p\n", handle);
-+ /* This is non-blocking */
-+ log_start_commit(journal, transaction);
-+
-+ /*
-+ * Special case: JFS_SYNC synchronous updates require us
-+ * to wait for the commit to complete.
-+ */
-+ if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-+ log_wait_commit(journal, tid);
-+ }
-+ kfree(handle);
-+ return err;
-+}
-+
-+/*
-+ * For synchronous operations: force any uncommitted trasnactions
-+ * to disk. May seem kludgy, but it reuses all the handle batching
-+ * code in a very simple manner.
-+ */
-+int journal_force_commit(journal_t *journal)
-+{
-+ handle_t *handle;
-+ int ret = 0;
-+
-+ lock_kernel();
-+ handle = journal_start(journal, 1);
-+ if (IS_ERR(handle)) {
-+ ret = PTR_ERR(handle);
-+ goto out;
-+ }
-+ handle->h_sync = 1;
-+ journal_stop(handle);
-+out:
-+ unlock_kernel();
-+ return ret;
-+}
-+
-+/*
-+ *
-+ * List management code snippets: various functions for manipulating the
-+ * transaction buffer lists.
-+ *
-+ */
-+
-+/*
-+ * Append a buffer to a transaction list, given the transaction's list head
-+ * pointer.
-+ * journal_datalist_lock is held.
-+ */
-+
-+static inline void
-+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
-+{
-+ if (!*list) {
-+ jh->b_tnext = jh->b_tprev = jh;
-+ *list = jh;
-+ } else {
-+ /* Insert at the tail of the list to preserve order */
-+ struct journal_head *first = *list, *last = first->b_tprev;
-+ jh->b_tprev = last;
-+ jh->b_tnext = first;
-+ last->b_tnext = first->b_tprev = jh;
-+ }
-+}
-+
-+/*
-+ * Remove a buffer from a transaction list, given the transaction's list
-+ * head pointer.
-+ *
-+ * Called with journal_datalist_lock held, and the journal may not
-+ * be locked.
-+ */
-+
-+static inline void
-+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
-+{
-+ if (*list == jh) {
-+ *list = jh->b_tnext;
-+ if (*list == jh)
-+ *list = 0;
-+ }
-+ jh->b_tprev->b_tnext = jh->b_tnext;
-+ jh->b_tnext->b_tprev = jh->b_tprev;
-+}
-+
-+/*
-+ * Remove a buffer from the appropriate transaction list.
-+ *
-+ * Note that this function can *change* the value of
-+ * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
-+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
-+ * is holding onto a copy of one of thee pointers, it could go bad.
-+ * Generally the caller needs to re-read the pointer from the transaction_t.
-+ *
-+ * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
-+ * via journal_try_to_free_buffer() or journal_clean_data_list(). In that
-+ * case, journal_datalist_lock will be held, and the journal may not be locked.
-+ */
-+void __journal_unfile_buffer(struct journal_head *jh)
-+{
-+ struct journal_head **list = 0;
-+ transaction_t * transaction;
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+ transaction = jh->b_transaction;
-+
-+#ifdef __SMP__
-+ J_ASSERT (current->lock_depth >= 0);
-+#endif
-+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
-+
-+ if (jh->b_jlist != BJ_None)
-+ J_ASSERT_JH(jh, transaction != 0);
-+
-+ switch (jh->b_jlist) {
-+ case BJ_None:
-+ return;
-+ case BJ_SyncData:
-+ list = &transaction->t_sync_datalist;
-+ break;
-+ case BJ_AsyncData:
-+ list = &transaction->t_async_datalist;
-+ break;
-+ case BJ_Metadata:
-+ transaction->t_nr_buffers--;
-+ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
-+ list = &transaction->t_buffers;
-+ break;
-+ case BJ_Forget:
-+ list = &transaction->t_forget;
-+ break;
-+ case BJ_IO:
-+ list = &transaction->t_iobuf_list;
-+ break;
-+ case BJ_Shadow:
-+ list = &transaction->t_shadow_list;
-+ break;
-+ case BJ_LogCtl:
-+ list = &transaction->t_log_list;
-+ break;
-+ case BJ_Reserved:
-+ list = &transaction->t_reserved_list;
-+ break;
-+ }
-+
-+ __blist_del_buffer(list, jh);
-+ jh->b_jlist = BJ_None;
-+ if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
-+ set_bit(BH_Dirty, &jh2bh(jh)->b_state);
-+ }
-+}
-+
-+void journal_unfile_buffer(struct journal_head *jh)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ __journal_unfile_buffer(jh);
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+/*
-+ * Called from journal_try_to_free_buffers(). The journal is not
-+ * locked. lru_list_lock is not held.
-+ *
-+ * Here we see why journal_datalist_lock is global and not per-journal.
-+ * We cannot get back to this buffer's journal pointer without locking
-+ * out journal_clean_data_list() in some manner.
-+ *
-+ * One could use journal_datalist_lock to get unracy access to a
-+ * per-journal lock.
-+ *
-+ * Called with journal_datalist_lock held.
-+ *
-+ * Returns non-zero iff we were able to free the journal_head.
-+ */
-+static int __journal_try_to_free_buffer(struct buffer_head *bh,
-+ int *locked_or_dirty)
-+{
-+ struct journal_head *jh;
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+
-+ jh = bh2jh(bh);
-+
-+ if (buffer_locked(bh) || buffer_dirty(bh)) {
-+ *locked_or_dirty = 1;
-+ goto out;
-+ }
-+
-+ if (!buffer_uptodate(bh))
-+ goto out;
-+
-+ if (jh->b_next_transaction != 0)
-+ goto out;
-+
-+ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
-+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
-+ /* A written-back ordered data buffer */
-+ JBUFFER_TRACE(jh, "release data");
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = 0;
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ }
-+ }
-+ else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
-+ /* written-back checkpointed metadata buffer */
-+ if (jh->b_jlist == BJ_None) {
-+ JBUFFER_TRACE(jh, "remove from checkpoint list");
-+ __journal_remove_checkpoint(jh);
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ }
-+ }
-+ return !buffer_jbd(bh);
-+
-+out:
-+ return 0;
-+}
-+
-+/*
-+ * journal_try_to_free_buffers(). For all the buffers on this page,
-+ * if they are fully written out ordered data, move them onto BUF_CLEAN
-+ * so try_to_free_buffers() can reap them. Called with lru_list_lock
-+ * not held. Does its own locking.
-+ *
-+ * This complicates JBD locking somewhat. We aren't protected by the
-+ * BKL here. We wish to remove the buffer from its committing or
-+ * running transaction's ->t_datalist via __journal_unfile_buffer.
-+ *
-+ * This may *change* the value of transaction_t->t_datalist, so anyone
-+ * who looks at t_datalist needs to lock against this function.
-+ *
-+ * Even worse, someone may be doing a journal_dirty_data on this
-+ * buffer. So we need to lock against that. journal_dirty_data()
-+ * will come out of the lock with the buffer dirty, which makes it
-+ * ineligible for release here.
-+ *
-+ * Who else is affected by this? hmm... Really the only contender
-+ * is do_get_write_access() - it could be looking at the buffer while
-+ * journal_try_to_free_buffer() is changing its state. But that
-+ * cannot happen because we never reallocate freed data as metadata
-+ * while the data is part of a transaction. Yes?
-+ *
-+ * This function returns non-zero if we wish try_to_free_buffers()
-+ * to be called. We do this is the page is releasable by try_to_free_buffers().
-+ * We also do it if the page has locked or dirty buffers and the caller wants
-+ * us to perform sync or async writeout.
-+ */
-+int journal_try_to_free_buffers(journal_t *journal,
-+ struct page *page, int gfp_mask)
-+{
-+ struct buffer_head *bh;
-+ struct buffer_head *tmp;
-+ int locked_or_dirty = 0;
-+ int call_ttfb = 1;
-+
-+ J_ASSERT(PageLocked(page));
-+
-+ bh = page->buffers;
-+ tmp = bh;
-+ spin_lock(&journal_datalist_lock);
-+ do {
-+ struct buffer_head *p = tmp;
-+
-+ tmp = tmp->b_this_page;
-+ if (buffer_jbd(p))
-+ if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
-+ call_ttfb = 0;
-+ } while (tmp != bh);
-+ spin_unlock(&journal_datalist_lock);
-+
-+ if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
-+ goto out;
-+ if (!locked_or_dirty)
-+ goto out;
-+ /*
-+ * The VM wants us to do writeout, or to block on IO, or both.
-+ * So we allow try_to_free_buffers to be called even if the page
-+ * still has journalled buffers.
-+ */
-+ call_ttfb = 1;
-+out:
-+ return call_ttfb;
-+}
-+
-+/*
-+ * This buffer is no longer needed. If it is on an older transaction's
-+ * checkpoint list we need to record it on this transaction's forget list
-+ * to pin this buffer (and hence its checkpointing transaction) down until
-+ * this transaction commits. If the buffer isn't on a checkpoint list, we
-+ * release it.
-+ * Returns non-zero if JBD no longer has an interest in the buffer.
-+ */
-+static int dispose_buffer(struct journal_head *jh,
-+ transaction_t *transaction)
-+{
-+ int may_free = 1;
-+ struct buffer_head *bh = jh2bh(jh);
-+
-+ spin_lock(&journal_datalist_lock);
-+ __journal_unfile_buffer(jh);
-+ jh->b_transaction = 0;
-+
-+ if (jh->b_cp_transaction) {
-+ JBUFFER_TRACE(jh, "on running+cp transaction");
-+ __journal_file_buffer(jh, transaction, BJ_Forget);
-+ clear_bit(BH_JBDDirty, &bh->b_state);
-+ may_free = 0;
-+ } else {
-+ JBUFFER_TRACE(jh, "on running transaction");
-+ __journal_remove_journal_head(bh);
-+ __brelse(bh);
-+ }
-+ spin_unlock(&journal_datalist_lock);
-+ return may_free;
-+}
-+
-+/*
-+ * journal_flushpage
-+ *
-+ * This code is tricky. It has a number of cases to deal with.
-+ *
-+ * There are two invariants which this code relies on:
-+ *
-+ * i_size must be updated on disk before we start calling flushpage on the
-+ * data.
-+ *
-+ * This is done in ext3 by defining an ext3_setattr method which
-+ * updates i_size before truncate gets going. By maintaining this
-+ * invariant, we can be sure that it is safe to throw away any buffers
-+ * attached to the current transaction: once the transaction commits,
-+ * we know that the data will not be needed.
-+ *
-+ * Note however that we can *not* throw away data belonging to the
-+ * previous, committing transaction!
-+ *
-+ * Any disk blocks which *are* part of the previous, committing
-+ * transaction (and which therefore cannot be discarded immediately) are
-+ * not going to be reused in the new running transaction
-+ *
-+ * The bitmap committed_data images guarantee this: any block which is
-+ * allocated in one transaction and removed in the next will be marked
-+ * as in-use in the committed_data bitmap, so cannot be reused until
-+ * the next transaction to delete the block commits. This means that
-+ * leaving committing buffers dirty is quite safe: the disk blocks
-+ * cannot be reallocated to a different file and so buffer aliasing is
-+ * not possible.
-+ *
-+ *
-+ * The above applies mainly to ordered data mode. In writeback mode we
-+ * don't make guarantees about the order in which data hits disk --- in
-+ * particular we don't guarantee that new dirty data is flushed before
-+ * transaction commit --- so it is always safe just to discard data
-+ * immediately in that mode. --sct
-+ */
-+
-+/*
-+ * The journal_unmap_buffer helper function returns zero if the buffer
-+ * concerned remains pinned as an anonymous buffer belonging to an older
-+ * transaction.
-+ *
-+ * We're outside-transaction here. Either or both of j_running_transaction
-+ * and j_committing_transaction may be NULL.
-+ */
-+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
-+{
-+ transaction_t *transaction;
-+ struct journal_head *jh;
-+ int may_free = 1;
-+
-+ BUFFER_TRACE(bh, "entry");
-+
-+ if (!buffer_mapped(bh))
-+ return 1;
-+
-+ /* It is safe to proceed here without the
-+ * journal_datalist_spinlock because the buffers cannot be
-+ * stolen by try_to_free_buffers as long as we are holding the
-+ * page lock. --sct */
-+
-+ if (!buffer_jbd(bh))
-+ goto zap_buffer;
-+
-+ jh = bh2jh(bh);
-+ transaction = jh->b_transaction;
-+ if (transaction == NULL) {
-+ /* First case: not on any transaction. If it
-+ * has no checkpoint link, then we can zap it:
-+ * it's a writeback-mode buffer so we don't care
-+ * if it hits disk safely. */
-+ if (!jh->b_cp_transaction) {
-+ JBUFFER_TRACE(jh, "not on any transaction: zap");
-+ goto zap_buffer;
-+ }
-+
-+ if (!buffer_dirty(bh)) {
-+ /* bdflush has written it. We can drop it now */
-+ goto zap_buffer;
-+ }
-+
-+ /* OK, it must be in the journal but still not
-+ * written fully to disk: it's metadata or
-+ * journaled data... */
-+
-+ if (journal->j_running_transaction) {
-+ /* ... and once the current transaction has
-+ * committed, the buffer won't be needed any
-+ * longer. */
-+ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
-+ return dispose_buffer(jh,
-+ journal->j_running_transaction);
-+ } else {
-+ /* There is no currently-running transaction. So the
-+ * orphan record which we wrote for this file must have
-+ * passed into commit. We must attach this buffer to
-+ * the committing transaction, if it exists. */
-+ if (journal->j_committing_transaction) {
-+ JBUFFER_TRACE(jh, "give to committing trans");
-+ return dispose_buffer(jh,
-+ journal->j_committing_transaction);
-+ } else {
-+ /* The orphan record's transaction has
-+ * committed. We can cleanse this buffer */
-+ clear_bit(BH_JBDDirty, &bh->b_state);
-+ goto zap_buffer;
-+ }
-+ }
-+ } else if (transaction == journal->j_committing_transaction) {
-+ /* If it is committing, we simply cannot touch it. We
-+ * can remove it's next_transaction pointer from the
-+ * running transaction if that is set, but nothing
-+ * else. */
-+ JBUFFER_TRACE(jh, "on committing transaction");
-+ if (jh->b_next_transaction) {
-+ J_ASSERT(jh->b_next_transaction ==
-+ journal->j_running_transaction);
-+ jh->b_next_transaction = NULL;
-+ }
-+ return 0;
-+ } else {
-+ /* Good, the buffer belongs to the running transaction.
-+ * We are writing our own transaction's data, not any
-+ * previous one's, so it is safe to throw it away
-+ * (remember that we expect the filesystem to have set
-+ * i_size already for this truncate so recovery will not
-+ * expose the disk blocks we are discarding here.) */
-+ J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
-+ may_free = dispose_buffer(jh, transaction);
-+ }
-+
-+zap_buffer:
-+ if (buffer_dirty(bh))
-+ mark_buffer_clean(bh);
-+ J_ASSERT_BH(bh, !buffer_jdirty(bh));
-+ clear_bit(BH_Uptodate, &bh->b_state);
-+ clear_bit(BH_Mapped, &bh->b_state);
-+ clear_bit(BH_Req, &bh->b_state);
-+ clear_bit(BH_New, &bh->b_state);
-+ return may_free;
-+}
-+
-+/*
-+ * Return non-zero if the page's buffers were successfully reaped
-+ */
-+int journal_flushpage(journal_t *journal,
-+ struct page *page,
-+ unsigned long offset)
-+{
-+ struct buffer_head *head, *bh, *next;
-+ unsigned int curr_off = 0;
-+ int may_free = 1;
-+
-+ if (!PageLocked(page))
-+ BUG();
-+ if (!page->buffers)
-+ return 1;
-+
-+ /* We will potentially be playing with lists other than just the
-+ * data lists (especially for journaled data mode), so be
-+ * cautious in our locking. */
-+ lock_journal(journal);
-+
-+ head = bh = page->buffers;
-+ do {
-+ unsigned int next_off = curr_off + bh->b_size;
-+ next = bh->b_this_page;
-+
-+ /* AKPM: doing lock_buffer here may be overly paranoid */
-+ if (offset <= curr_off) {
-+ /* This block is wholly outside the truncation point */
-+ lock_buffer(bh);
-+ may_free &= journal_unmap_buffer(journal, bh);
-+ unlock_buffer(bh);
-+ }
-+ curr_off = next_off;
-+ bh = next;
-+
-+ } while (bh != head);
-+
-+ unlock_journal(journal);
-+
-+ if (!offset) {
-+ if (!may_free || !try_to_free_buffers(page, 0))
-+ return 0;
-+ J_ASSERT(page->buffers == NULL);
-+ }
-+ return 1;
-+}
-+
-+/*
-+ * File a buffer on the given transaction list.
-+ */
-+void __journal_file_buffer(struct journal_head *jh,
-+ transaction_t *transaction, int jlist)
-+{
-+ struct journal_head **list = 0;
-+
-+ assert_spin_locked(&journal_datalist_lock);
-+
-+#ifdef __SMP__
-+ J_ASSERT (current->lock_depth >= 0);
-+#endif
-+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
-+ J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-+ jh->b_transaction == 0);
-+
-+ if (jh->b_transaction) {
-+ if (jh->b_jlist == jlist)
-+ return;
-+ __journal_unfile_buffer(jh);
-+ } else {
-+ jh->b_transaction = transaction;
-+ }
-+
-+ switch (jlist) {
-+ case BJ_None:
-+ J_ASSERT_JH(jh, !jh->b_committed_data);
-+ J_ASSERT_JH(jh, !jh->b_frozen_data);
-+ return;
-+ case BJ_SyncData:
-+ list = &transaction->t_sync_datalist;
-+ break;
-+ case BJ_AsyncData:
-+ list = &transaction->t_async_datalist;
-+ break;
-+ case BJ_Metadata:
-+ transaction->t_nr_buffers++;
-+ list = &transaction->t_buffers;
-+ break;
-+ case BJ_Forget:
-+ list = &transaction->t_forget;
-+ break;
-+ case BJ_IO:
-+ list = &transaction->t_iobuf_list;
-+ break;
-+ case BJ_Shadow:
-+ list = &transaction->t_shadow_list;
-+ break;
-+ case BJ_LogCtl:
-+ list = &transaction->t_log_list;
-+ break;
-+ case BJ_Reserved:
-+ list = &transaction->t_reserved_list;
-+ break;
-+ }
-+
-+ __blist_add_buffer(list, jh);
-+ jh->b_jlist = jlist;
-+
-+ if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-+ jlist == BJ_Shadow || jlist == BJ_Forget) {
-+ if (atomic_set_buffer_clean(jh2bh(jh))) {
-+ set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
-+ }
-+ }
-+}
-+
-+void journal_file_buffer(struct journal_head *jh,
-+ transaction_t *transaction, int jlist)
-+{
-+ spin_lock(&journal_datalist_lock);
-+ __journal_file_buffer(jh, transaction, jlist);
-+ spin_unlock(&journal_datalist_lock);
-+}
-+
-+/*
-+ * Remove a buffer from its current buffer list in preparation for
-+ * dropping it from its current transaction entirely. If the buffer has
-+ * already started to be used by a subsequent transaction, refile the
-+ * buffer on that transaction's metadata list.
-+ */
-+
-+void __journal_refile_buffer(struct journal_head *jh)
-+{
-+ assert_spin_locked(&journal_datalist_lock);
-+#ifdef __SMP__
-+ J_ASSERT_JH(jh, current->lock_depth >= 0);
-+#endif
-+ __journal_unfile_buffer(jh);
-+
-+ /* If the buffer is now unused, just drop it. If it has been
-+ modified by a later transaction, add it to the new
-+ transaction's metadata list. */
-+
-+ jh->b_transaction = jh->b_next_transaction;
-+ jh->b_next_transaction = NULL;
-+
-+ if (jh->b_transaction != NULL) {
-+ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
-+ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
-+ } else {
-+ /* Onto BUF_DIRTY for writeback */
-+ refile_buffer(jh2bh(jh));
-+ }
-+}
-+
-+/*
-+ * For the unlocked version of this call, also make sure that any
-+ * hanging journal_head is cleaned up if necessary.
-+ *
-+ * __journal_refile_buffer is usually called as part of a single locked
-+ * operation on a buffer_head, in which the caller is probably going to
-+ * be hooking the journal_head onto other lists. In that case it is up
-+ * to the caller to remove the journal_head if necessary. For the
-+ * unlocked journal_refile_buffer call, the caller isn't going to be
-+ * doing anything else to the buffer so we need to do the cleanup
-+ * ourselves to avoid a jh leak.
-+ *
-+ * *** The journal_head may be freed by this call! ***
-+ */
-+void journal_refile_buffer(struct journal_head *jh)
-+{
-+ struct buffer_head *bh;
-+
-+ spin_lock(&journal_datalist_lock);
-+ bh = jh2bh(jh);
-+
-+ __journal_refile_buffer(jh);
-+ __journal_remove_journal_head(bh);
-+
-+ spin_unlock(&journal_datalist_lock);
-+ __brelse(bh);
-+}
+++ /dev/null
- include/linux/mm.h | 1 +
- 1 files changed, 1 insertion(+)
-
-Index: linux.mcp2/include/linux/mm.h
-===================================================================
---- linux.mcp2.orig/include/linux/mm.h 2004-05-05 14:32:29.000000000 -0700
-+++ linux.mcp2/include/linux/mm.h 2004-05-05 14:46:54.000000000 -0700
-@@ -162,6 +162,7 @@
- protected by pagemap_lru_lock !! */
- struct page **pprev_hash; /* Complement to *next_hash. */
- struct buffer_head * buffers; /* Buffer maps us to a disk block. */
-+ unsigned long private;
-
- /*
- * On machines where all RAM is mapped into kernel address space,
+++ /dev/null
-Index: linux-bgl/kernel/sched.c
-===================================================================
---- linux-bgl.orig/kernel/sched.c 2003-07-02 08:43:33.000000000 -0700
-+++ linux-bgl/kernel/sched.c 2004-10-26 23:37:44.314193755 -0700
-@@ -1124,7 +1124,7 @@
- return retval;
- }
-
--static void show_task(struct task_struct * p)
-+void show_task(struct task_struct * p)
- {
- unsigned long free = 0;
- int state;
-Index: linux-bgl/kernel/ksyms.c
-===================================================================
---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 23:23:00.518654978 -0700
-+++ linux-bgl/kernel/ksyms.c 2004-10-26 23:38:29.289071295 -0700
-@@ -76,6 +76,7 @@
- };
- #endif
-
-+void show_task(struct task_struct *);
-
- EXPORT_SYMBOL(inter_module_register);
- EXPORT_SYMBOL(inter_module_unregister);
-@@ -595,3 +596,6 @@
-
- EXPORT_SYMBOL(tasklist_lock);
- EXPORT_SYMBOL(pidhash);
-+
-+/* debug */
-+EXPORT_SYMBOL(show_task);
+++ /dev/null
- include/linux/mm.h | 1 +
- mm/filemap.c | 3 ++-
- 2 files changed, 3 insertions(+), 1 deletion(-)
-
-Index: linux-ion/include/linux/mm.h
-===================================================================
---- linux-ion.orig/include/linux/mm.h 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/include/linux/mm.h 2004-09-27 15:07:50.000000000 -0700
-@@ -593,6 +593,7 @@
- /* filemap.c */
- extern void remove_inode_page(struct page *);
- extern unsigned long page_unuse(struct page *);
-+extern void truncate_complete_page(struct page *);
- extern void truncate_inode_pages(struct address_space *, loff_t);
-
- /* generic vm_area_ops exported for stackable file systems */
-Index: linux-ion/mm/filemap.c
-===================================================================
---- linux-ion.orig/mm/filemap.c 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/mm/filemap.c 2004-09-27 15:08:13.000000000 -0700
-@@ -231,7 +231,7 @@
- do_flushpage(page, partial);
- }
-
--static void truncate_complete_page(struct page *page)
-+void truncate_complete_page(struct page *page)
- {
- /* Leave it on the LRU if it gets converted into anonymous buffers */
- if (!page->buffers || do_flushpage(page, 0))
-@@ -249,6 +249,7 @@
- remove_inode_page(page);
- page_cache_release(page);
- }
-+EXPORT_SYMBOL(truncate_complete_page);
-
- static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
- static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
+++ /dev/null
-
-
-
-Index: linux-ion/kernel/ksyms.c
-===================================================================
---- linux-ion.orig/kernel/ksyms.c 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/kernel/ksyms.c 2004-09-27 15:04:52.000000000 -0700
-@@ -286,6 +286,10 @@
- EXPORT_SYMBOL(dcache_readdir);
- EXPORT_SYMBOL(dcache_dir_ops);
-
-+/* lustre */
-+EXPORT_SYMBOL(panic_notifier_list);
-+EXPORT_SYMBOL(do_kern_mount);
-+
- /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
- EXPORT_SYMBOL(default_llseek);
- EXPORT_SYMBOL(dentry_open);
-Index: linux-ion/include/linux/fs.h
-===================================================================
---- linux-ion.orig/include/linux/fs.h 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/include/linux/fs.h 2004-09-27 15:04:52.000000000 -0700
-@@ -1050,6 +1050,7 @@
- extern struct vfsmount *kern_mount(struct file_system_type *);
- extern int may_umount(struct vfsmount *);
- extern long do_mount(char *, char *, char *, unsigned long, void *);
-+struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data);
- extern void umount_tree(struct vfsmount *);
-
- #define kern_umount mntput
-Index: linux-ion/mm/memory.c
-===================================================================
---- linux-ion.orig/mm/memory.c 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/mm/memory.c 2004-09-27 15:05:56.000000000 -0700
-@@ -401,6 +401,7 @@
- mm->rss = 0;
- spin_unlock(&mm->page_table_lock);
- }
-+EXPORT_SYMBOL(zap_page_range);
-
- /*
- * Do a quick page-table lookup for a single page.
+++ /dev/null
- fs/ext3/Makefile | 2
- fs/ext3/dir.c | 299 +++++++++
- fs/ext3/file.c | 3
- fs/ext3/hash.c | 215 ++++++
- fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++-----
- fs/ext3/super.c | 7
- include/linux/ext3_fs.h | 85 ++
- include/linux/ext3_fs_sb.h | 2
- include/linux/ext3_jbd.h | 2
- include/linux/rbtree.h | 2
- lib/rbtree.c | 42 +
- 11 files changed, 1887 insertions(+), 160 deletions(-)
-
-Index: linux-2.4.19.SuSE/fs/ext3/Makefile
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-05-27 11:07:21.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-05-27 11:08:28.000000000 -0700
-@@ -12,7 +12,7 @@
- export-objs := super.o inode.o
-
- obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-- ioctl.o namei.o super.o symlink.o
-+ ioctl.o namei.o super.o symlink.o hash.o
- obj-m := $(O_TARGET)
-
- obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
-Index: linux-2.4.19.SuSE/fs/ext3/dir.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800
-+++ linux-2.4.19.SuSE/fs/ext3/dir.c 2004-05-27 11:08:28.000000000 -0700
-@@ -21,12 +21,16 @@
- #include <linux/fs.h>
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
-+#include <linux/slab.h>
-+#include <linux/rbtree.h>
-
- static unsigned char ext3_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
- };
-
- static int ext3_readdir(struct file *, void *, filldir_t);
-+static int ext3_dx_readdir(struct file * filp,
-+ void * dirent, filldir_t filldir);
-
- struct file_operations ext3_dir_operations = {
- read: generic_read_dir,
-@@ -35,6 +39,17 @@
- fsync: ext3_sync_file, /* BKL held */
- };
-
-+
-+static unsigned char get_dtype(struct super_block *sb, int filetype)
-+{
-+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
-+ (filetype >= EXT3_FT_MAX))
-+ return DT_UNKNOWN;
-+
-+ return (ext3_filetype_table[filetype]);
-+}
-+
-+
- int ext3_check_dir_entry (const char * function, struct inode * dir,
- struct ext3_dir_entry_2 * de,
- struct buffer_head * bh,
-@@ -79,6 +94,16 @@
-
- sb = inode->i_sb;
-
-+ if (is_dx(inode)) {
-+ err = ext3_dx_readdir(filp, dirent, filldir);
-+ if (err != ERR_BAD_DX_DIR)
-+ return err;
-+ /*
-+ * We don't set the inode dirty flag since it's not
-+ * critical that it get flushed back to the disk.
-+ */
-+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
-+ }
- stored = 0;
- bh = NULL;
- offset = filp->f_pos & (sb->s_blocksize - 1);
-@@ -162,18 +187,12 @@
- * during the copy operation.
- */
- unsigned long version = filp->f_version;
-- unsigned char d_type = DT_UNKNOWN;
-
-- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
-- EXT3_FEATURE_INCOMPAT_FILETYPE)
-- && de->file_type < EXT3_FT_MAX)
-- d_type =
-- ext3_filetype_table[de->file_type];
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
-- d_type);
-+ get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
-@@ -188,3 +207,269 @@
- UPDATE_ATIME(inode);
- return 0;
- }
-+
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * These functions convert from the major/minor hash to an f_pos
-+ * value.
-+ *
-+ * Currently we only use major hash numer. This is unfortunate, but
-+ * on 32-bit machines, the same VFS interface is used for lseek and
-+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
-+ * lseek/telldir/seekdir will blow out spectacularly, and from within
-+ * the ext2 low-level routine, we don't know if we're being called by
-+ * a 64-bit version of the system call or the 32-bit version of the
-+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
-+ * cookie. Sigh.
-+ */
-+#define hash2pos(major, minor) (major >> 1)
-+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-+#define pos2min_hash(pos) (0)
-+
-+/*
-+ * This structure holds the nodes of the red-black tree used to store
-+ * the directory entry in hash order.
-+ */
-+struct fname {
-+ __u32 hash;
-+ __u32 minor_hash;
-+ rb_node_t rb_hash;
-+ struct fname *next;
-+ __u32 inode;
-+ __u8 name_len;
-+ __u8 file_type;
-+ char name[0];
-+};
-+
-+/*
-+ * This functoin implements a non-recursive way of freeing all of the
-+ * nodes in the red-black tree.
-+ */
-+static void free_rb_tree_fname(rb_root_t *root)
-+{
-+ rb_node_t *n = root->rb_node;
-+ rb_node_t *parent;
-+ struct fname *fname;
-+
-+ while (n) {
-+ /* Do the node's children first */
-+ if ((n)->rb_left) {
-+ n = n->rb_left;
-+ continue;
-+ }
-+ if (n->rb_right) {
-+ n = n->rb_right;
-+ continue;
-+ }
-+ /*
-+ * The node has no children; free it, and then zero
-+ * out parent's link to it. Finally go to the
-+ * beginning of the loop and try to free the parent
-+ * node.
-+ */
-+ parent = n->rb_parent;
-+ fname = rb_entry(n, struct fname, rb_hash);
-+ kfree(fname);
-+ if (!parent)
-+ root->rb_node = 0;
-+ else if (parent->rb_left == n)
-+ parent->rb_left = 0;
-+ else if (parent->rb_right == n)
-+ parent->rb_right = 0;
-+ n = parent;
-+ }
-+ root->rb_node = 0;
-+}
-+
-+
-+struct dir_private_info *create_dir_info(loff_t pos)
-+{
-+ struct dir_private_info *p;
-+
-+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
-+ if (!p)
-+ return NULL;
-+ p->root.rb_node = 0;
-+ p->curr_node = 0;
-+ p->extra_fname = 0;
-+ p->last_pos = 0;
-+ p->curr_hash = pos2maj_hash(pos);
-+ p->curr_minor_hash = pos2min_hash(pos);
-+ p->next_hash = 0;
-+ return p;
-+}
-+
-+void ext3_htree_free_dir_info(struct dir_private_info *p)
-+{
-+ free_rb_tree_fname(&p->root);
-+ kfree(p);
-+}
-+
-+/*
-+ * Given a directory entry, enter it into the fname rb tree.
-+ */
-+void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-+ __u32 minor_hash,
-+ struct ext3_dir_entry_2 *dirent)
-+{
-+ rb_node_t **p, *parent = NULL;
-+ struct fname * fname, *new_fn;
-+ struct dir_private_info *info;
-+ int len;
-+
-+ info = (struct dir_private_info *) dir_file->private_data;
-+ p = &info->root.rb_node;
-+
-+ /* Create and allocate the fname structure */
-+ len = sizeof(struct fname) + dirent->name_len + 1;
-+ new_fn = kmalloc(len, GFP_KERNEL);
-+ memset(new_fn, 0, len);
-+ new_fn->hash = hash;
-+ new_fn->minor_hash = minor_hash;
-+ new_fn->inode = le32_to_cpu(dirent->inode);
-+ new_fn->name_len = dirent->name_len;
-+ new_fn->file_type = dirent->file_type;
-+ memcpy(new_fn->name, dirent->name, dirent->name_len);
-+ new_fn->name[dirent->name_len] = 0;
-+
-+ while (*p) {
-+ parent = *p;
-+ fname = rb_entry(parent, struct fname, rb_hash);
-+
-+ /*
-+ * If the hash and minor hash match up, then we put
-+ * them on a linked list. This rarely happens...
-+ */
-+ if ((new_fn->hash == fname->hash) &&
-+ (new_fn->minor_hash == fname->minor_hash)) {
-+ new_fn->next = fname->next;
-+ fname->next = new_fn;
-+ return;
-+ }
-+
-+ if (new_fn->hash < fname->hash)
-+ p = &(*p)->rb_left;
-+ else if (new_fn->hash > fname->hash)
-+ p = &(*p)->rb_right;
-+ else if (new_fn->minor_hash < fname->minor_hash)
-+ p = &(*p)->rb_left;
-+ else /* if (new_fn->minor_hash > fname->minor_hash) */
-+ p = &(*p)->rb_right;
-+ }
-+
-+ rb_link_node(&new_fn->rb_hash, parent, p);
-+ rb_insert_color(&new_fn->rb_hash, &info->root);
-+}
-+
-+
-+
-+/*
-+ * This is a helper function for ext3_dx_readdir. It calls filldir
-+ * for all entres on the fname linked list. (Normally there is only
-+ * one entry on the linked list, unless there are 62 bit hash collisions.)
-+ */
-+static int call_filldir(struct file * filp, void * dirent,
-+ filldir_t filldir, struct fname *fname)
-+{
-+ struct dir_private_info *info = filp->private_data;
-+ loff_t curr_pos;
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct super_block * sb;
-+ int error;
-+
-+ sb = inode->i_sb;
-+
-+ if (!fname) {
-+ printk("call_filldir: called with null fname?!?\n");
-+ return 0;
-+ }
-+ curr_pos = hash2pos(fname->hash, fname->minor_hash);
-+ while (fname) {
-+ error = filldir(dirent, fname->name,
-+ fname->name_len, curr_pos,
-+ fname->inode,
-+ get_dtype(sb, fname->file_type));
-+ if (error) {
-+ filp->f_pos = curr_pos;
-+ info->extra_fname = fname->next;
-+ return error;
-+ }
-+ fname = fname->next;
-+ }
-+ return 0;
-+}
-+
-+static int ext3_dx_readdir(struct file * filp,
-+ void * dirent, filldir_t filldir)
-+{
-+ struct dir_private_info *info = filp->private_data;
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct fname *fname;
-+ int ret;
-+
-+ if (!info) {
-+ info = create_dir_info(filp->f_pos);
-+ if (!info)
-+ return -ENOMEM;
-+ filp->private_data = info;
-+ }
-+
-+ /* Some one has messed with f_pos; reset the world */
-+ if (info->last_pos != filp->f_pos) {
-+ free_rb_tree_fname(&info->root);
-+ info->curr_node = 0;
-+ info->extra_fname = 0;
-+ info->curr_hash = pos2maj_hash(filp->f_pos);
-+ info->curr_minor_hash = pos2min_hash(filp->f_pos);
-+ }
-+
-+ /*
-+ * If there are any leftover names on the hash collision
-+ * chain, return them first.
-+ */
-+ if (info->extra_fname &&
-+ call_filldir(filp, dirent, filldir, info->extra_fname))
-+ goto finished;
-+
-+ if (!info->curr_node)
-+ info->curr_node = rb_get_first(&info->root);
-+
-+ while (1) {
-+ /*
-+ * Fill the rbtree if we have no more entries,
-+ * or the inode has changed since we last read in the
-+ * cached entries.
-+ */
-+ if ((!info->curr_node) ||
-+ (filp->f_version != inode->i_version)) {
-+ info->curr_node = 0;
-+ free_rb_tree_fname(&info->root);
-+ filp->f_version = inode->i_version;
-+ ret = ext3_htree_fill_tree(filp, info->curr_hash,
-+ info->curr_minor_hash,
-+ &info->next_hash);
-+ if (ret < 0)
-+ return ret;
-+ if (ret == 0)
-+ break;
-+ info->curr_node = rb_get_first(&info->root);
-+ }
-+
-+ fname = rb_entry(info->curr_node, struct fname, rb_hash);
-+ info->curr_hash = fname->hash;
-+ info->curr_minor_hash = fname->minor_hash;
-+ if (call_filldir(filp, dirent, filldir, fname))
-+ break;
-+
-+ info->curr_node = rb_get_next(info->curr_node);
-+ if (!info->curr_node) {
-+ info->curr_hash = info->next_hash;
-+ info->curr_minor_hash = 0;
-+ }
-+ }
-+finished:
-+ info->last_pos = filp->f_pos;
-+ UPDATE_ATIME(inode);
-+ return 0;
-+}
-+#endif
-Index: linux-2.4.19.SuSE/fs/ext3/namei.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c 2002-12-04 09:46:03.000000000 -0800
-+++ linux-2.4.19.SuSE/fs/ext3/namei.c 2004-05-27 11:08:52.000000000 -0700
-@@ -16,6 +16,12 @@
- * David S. Miller (davem@caip.rutgers.edu), 1995
- * Directory entry file type support and forward compatibility hooks
- * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
-+ * Hash Tree Directory indexing (c)
-+ * Daniel Phillips, 2001
-+ * Hash Tree Directory indexing porting
-+ * Christopher Li, 2002
-+ * Hash Tree Directory indexing cleanup
-+ * Theodore Ts'o, 2002
- */
-
- #include <linux/fs.h>
-@@ -40,6 +46,630 @@
- #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
- #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-
-+static struct buffer_head *ext3_append(handle_t *handle,
-+ struct inode *inode,
-+ u32 *block, int *err)
-+{
-+ struct buffer_head *bh;
-+
-+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-+
-+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
-+ inode->i_size += inode->i_sb->s_blocksize;
-+ EXT3_I(inode)->i_disksize = inode->i_size;
-+ ext3_journal_get_write_access(handle,bh);
-+ }
-+ return bh;
-+}
-+
-+#ifndef assert
-+#define assert(test) J_ASSERT(test)
-+#endif
-+
-+#ifndef swap
-+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-+#endif
-+
-+typedef struct { u32 v; } le_u32;
-+typedef struct { u16 v; } le_u16;
-+
-+#ifdef DX_DEBUG
-+#define dxtrace(command) command
-+#else
-+#define dxtrace(command)
-+#endif
-+
-+struct fake_dirent
-+{
-+ /*le*/u32 inode;
-+ /*le*/u16 rec_len;
-+ u8 name_len;
-+ u8 file_type;
-+};
-+
-+struct dx_countlimit
-+{
-+ le_u16 limit;
-+ le_u16 count;
-+};
-+
-+struct dx_entry
-+{
-+ le_u32 hash;
-+ le_u32 block;
-+};
-+
-+/*
-+ * dx_root_info is laid out so that if it should somehow get overlaid by a
-+ * dirent the two low bits of the hash version will be zero. Therefore, the
-+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
-+ */
-+
-+struct dx_root
-+{
-+ struct fake_dirent dot;
-+ char dot_name[4];
-+ struct fake_dirent dotdot;
-+ char dotdot_name[4];
-+ struct dx_root_info
-+ {
-+ le_u32 reserved_zero;
-+ u8 hash_version;
-+ u8 info_length; /* 8 */
-+ u8 indirect_levels;
-+ u8 unused_flags;
-+ }
-+ info;
-+ struct dx_entry entries[0];
-+};
-+
-+struct dx_node
-+{
-+ struct fake_dirent fake;
-+ struct dx_entry entries[0];
-+};
-+
-+
-+struct dx_frame
-+{
-+ struct buffer_head *bh;
-+ struct dx_entry *entries;
-+ struct dx_entry *at;
-+};
-+
-+struct dx_map_entry
-+{
-+ u32 hash;
-+ u32 offs;
-+};
-+
-+#ifdef CONFIG_EXT3_INDEX
-+static inline unsigned dx_get_block (struct dx_entry *entry);
-+static void dx_set_block (struct dx_entry *entry, unsigned value);
-+static inline unsigned dx_get_hash (struct dx_entry *entry);
-+static void dx_set_hash (struct dx_entry *entry, unsigned value);
-+static unsigned dx_get_count (struct dx_entry *entries);
-+static unsigned dx_get_limit (struct dx_entry *entries);
-+static void dx_set_count (struct dx_entry *entries, unsigned value);
-+static void dx_set_limit (struct dx_entry *entries, unsigned value);
-+static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-+static unsigned dx_node_limit (struct inode *dir);
-+static struct dx_frame *dx_probe(struct dentry *dentry,
-+ struct inode *dir,
-+ struct dx_hash_info *hinfo,
-+ struct dx_frame *frame,
-+ int *err);
-+static void dx_release (struct dx_frame *frames);
-+static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
-+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
-+ struct dx_map_entry *offsets, int count);
-+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
-+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
-+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-+ struct dx_frame *frame,
-+ struct dx_frame *frames, int *err,
-+ __u32 *start_hash);
-+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-+ struct ext3_dir_entry_2 **res_dir, int *err);
-+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
-+
-+/*
-+ * Future: use high four bits of block for coalesce-on-delete flags
-+ * Mask them off for now.
-+ */
-+
-+static inline unsigned dx_get_block (struct dx_entry *entry)
-+{
-+ return le32_to_cpu(entry->block.v) & 0x00ffffff;
-+}
-+
-+static inline void dx_set_block (struct dx_entry *entry, unsigned value)
-+{
-+ entry->block.v = cpu_to_le32(value);
-+}
-+
-+static inline unsigned dx_get_hash (struct dx_entry *entry)
-+{
-+ return le32_to_cpu(entry->hash.v);
-+}
-+
-+static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
-+{
-+ entry->hash.v = cpu_to_le32(value);
-+}
-+
-+static inline unsigned dx_get_count (struct dx_entry *entries)
-+{
-+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
-+}
-+
-+static inline unsigned dx_get_limit (struct dx_entry *entries)
-+{
-+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
-+}
-+
-+static inline void dx_set_count (struct dx_entry *entries, unsigned value)
-+{
-+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
-+}
-+
-+static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
-+{
-+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
-+}
-+
-+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
-+{
-+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
-+ EXT3_DIR_REC_LEN(2) - infosize;
-+ return 0? 20: entry_space / sizeof(struct dx_entry);
-+}
-+
-+static inline unsigned dx_node_limit (struct inode *dir)
-+{
-+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
-+ return 0? 22: entry_space / sizeof(struct dx_entry);
-+}
-+
-+/*
-+ * Debug
-+ */
-+#ifdef DX_DEBUG
-+struct stats
-+{
-+ unsigned names;
-+ unsigned space;
-+ unsigned bcount;
-+};
-+
-+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
-+ int size, int show_names)
-+{
-+ unsigned names = 0, space = 0;
-+ char *base = (char *) de;
-+ struct dx_hash_info h = *hinfo;
-+
-+ printk("names: ");
-+ while ((char *) de < base + size)
-+ {
-+ if (de->inode)
-+ {
-+ if (show_names)
-+ {
-+ int len = de->name_len;
-+ char *name = de->name;
-+ while (len--) printk("%c", *name++);
-+ ext3fs_dirhash(de->name, de->name_len, &h);
-+ printk(":%x.%u ", h.hash,
-+ ((char *) de - base));
-+ }
-+ space += EXT3_DIR_REC_LEN(de->name_len);
-+ names++;
-+ }
-+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ printk("(%i)\n", names);
-+ return (struct stats) { names, space, 1 };
-+}
-+
-+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
-+ struct dx_entry *entries, int levels)
-+{
-+ unsigned blocksize = dir->i_sb->s_blocksize;
-+ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
-+ unsigned bcount = 0;
-+ struct buffer_head *bh;
-+ int err;
-+ printk("%i indexed blocks...\n", count);
-+ for (i = 0; i < count; i++, entries++)
-+ {
-+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
-+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
-+ struct stats stats;
-+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
-+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
-+ stats = levels?
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
-+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
-+ names += stats.names;
-+ space += stats.space;
-+ bcount += stats.bcount;
-+ brelse (bh);
-+ }
-+ if (bcount)
-+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
-+ names, space/bcount,(space/bcount)*100/blocksize);
-+ return (struct stats) { names, space, bcount};
-+}
-+#endif /* DX_DEBUG */
-+
-+/*
-+ * Probe for a directory leaf block to search.
-+ *
-+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
-+ * error in the directory index, and the caller should fall back to
-+ * searching the directory normally. The callers of dx_probe **MUST**
-+ * check for this error code, and make sure it never gets reflected
-+ * back to userspace.
-+ */
-+static struct dx_frame *
-+dx_probe(struct dentry *dentry, struct inode *dir,
-+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-+{
-+ unsigned count, indirect;
-+ struct dx_entry *at, *entries, *p, *q, *m;
-+ struct dx_root *root;
-+ struct buffer_head *bh;
-+ struct dx_frame *frame = frame_in;
-+ u32 hash;
-+
-+ frame->bh = NULL;
-+ if (dentry)
-+ dir = dentry->d_parent->d_inode;
-+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
-+ goto fail;
-+ root = (struct dx_root *) bh->b_data;
-+ if (root->info.hash_version != DX_HASH_TEA &&
-+ root->info.hash_version != DX_HASH_HALF_MD4 &&
-+ root->info.hash_version != DX_HASH_LEGACY) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unrecognised inode hash code %d",
-+ root->info.hash_version);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+ hinfo->hash_version = root->info.hash_version;
-+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
-+ if (dentry)
-+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
-+ hash = hinfo->hash;
-+
-+ if (root->info.unused_flags & 1) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unimplemented inode hash flags: %#06x",
-+ root->info.unused_flags);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+
-+ if ((indirect = root->info.indirect_levels) > 1) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unimplemented inode hash depth: %#06x",
-+ root->info.indirect_levels);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+
-+ entries = (struct dx_entry *) (((char *)&root->info) +
-+ root->info.info_length);
-+ assert(dx_get_limit(entries) == dx_root_limit(dir,
-+ root->info.info_length));
-+ dxtrace (printk("Look up %x", hash));
-+ while (1)
-+ {
-+ count = dx_get_count(entries);
-+ assert (count && count <= dx_get_limit(entries));
-+ p = entries + 1;
-+ q = entries + count - 1;
-+ while (p <= q)
-+ {
-+ m = p + (q - p)/2;
-+ dxtrace(printk("."));
-+ if (dx_get_hash(m) > hash)
-+ q = m - 1;
-+ else
-+ p = m + 1;
-+ }
-+
-+ if (0) // linear search cross check
-+ {
-+ unsigned n = count - 1;
-+ at = entries;
-+ while (n--)
-+ {
-+ dxtrace(printk(","));
-+ if (dx_get_hash(++at) > hash)
-+ {
-+ at--;
-+ break;
-+ }
-+ }
-+ assert (at == p - 1);
-+ }
-+
-+ at = p - 1;
-+ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
-+ frame->bh = bh;
-+ frame->entries = entries;
-+ frame->at = at;
-+ if (!indirect--) return frame;
-+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
-+ goto fail2;
-+ at = entries = ((struct dx_node *) bh->b_data)->entries;
-+ assert (dx_get_limit(entries) == dx_node_limit (dir));
-+ frame++;
-+ }
-+fail2:
-+ while (frame >= frame_in) {
-+ brelse(frame->bh);
-+ frame--;
-+ }
-+fail:
-+ return NULL;
-+}
-+
-+static void dx_release (struct dx_frame *frames)
-+{
-+ if (frames[0].bh == NULL)
-+ return;
-+
-+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
-+ brelse(frames[1].bh);
-+ brelse(frames[0].bh);
-+}
-+
-+/*
-+ * This function increments the frame pointer to search the next leaf
-+ * block, and reads in the necessary intervening nodes if the search
-+ * should be necessary. Whether or not the search is necessary is
-+ * controlled by the hash parameter. If the hash value is even, then
-+ * the search is only continued if the next block starts with that
-+ * hash value. This is used if we are searching for a specific file.
-+ *
-+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
-+ *
-+ * This function returns 1 if the caller should continue to search,
-+ * or 0 if it should not. If there is an error reading one of the
-+ * index blocks, it will return -1.
-+ *
-+ * If start_hash is non-null, it will be filled in with the starting
-+ * hash of the next page.
-+ */
-+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-+ struct dx_frame *frame,
-+ struct dx_frame *frames, int *err,
-+ __u32 *start_hash)
-+{
-+ struct dx_frame *p;
-+ struct buffer_head *bh;
-+ int num_frames = 0;
-+ __u32 bhash;
-+
-+ *err = ENOENT;
-+ p = frame;
-+ /*
-+ * Find the next leaf page by incrementing the frame pointer.
-+ * If we run out of entries in the interior node, loop around and
-+ * increment pointer in the parent node. When we break out of
-+ * this loop, num_frames indicates the number of interior
-+ * nodes need to be read.
-+ */
-+ while (1) {
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
-+ if (p == frames)
-+ return 0;
-+ num_frames++;
-+ p--;
-+ }
-+
-+ /*
-+ * If the hash is 1, then continue only if the next page has a
-+ * continuation hash of any value. This is used for readdir
-+ * handling. Otherwise, check to see if the hash matches the
-+ * desired contiuation hash. If it doesn't, return since
-+ * there's no point to read in the successive index pages.
-+ */
-+ bhash = dx_get_hash(p->at);
-+ if (start_hash)
-+ *start_hash = bhash;
-+ if ((hash & 1) == 0) {
-+ if ((bhash & ~1) != hash)
-+ return 0;
-+ }
-+ /*
-+ * If the hash is HASH_NB_ALWAYS, we always go to the next
-+ * block so no check is necessary
-+ */
-+ while (num_frames--) {
-+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
-+ 0, err)))
-+ return -1; /* Failure */
-+ p++;
-+ brelse (p->bh);
-+ p->bh = bh;
-+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
-+ }
-+ return 1;
-+}
-+
-+
-+/*
-+ * p is at least 6 bytes before the end of page
-+ */
-+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
-+{
-+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
-+}
-+
-+/*
-+ * This function fills a red-black tree with information from a
-+ * directory. We start scanning the directory in hash order, starting
-+ * at start_hash and start_minor_hash.
-+ *
-+ * This function returns the number of entries inserted into the tree,
-+ * or a negative error code.
-+ */
-+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-+ __u32 start_minor_hash, __u32 *next_hash)
-+{
-+ struct dx_hash_info hinfo;
-+ struct buffer_head *bh;
-+ struct ext3_dir_entry_2 *de, *top;
-+ static struct dx_frame frames[2], *frame;
-+ struct inode *dir;
-+ int block, err;
-+ int count = 0;
-+ int ret;
-+ __u32 hashval;
-+
-+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-+ start_minor_hash));
-+ dir = dir_file->f_dentry->d_inode;
-+ hinfo.hash = start_hash;
-+ hinfo.minor_hash = 0;
-+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
-+ if (!frame)
-+ return err;
-+
-+ while (1) {
-+ block = dx_get_block(frame->at);
-+ dxtrace(printk("Reading block %d\n", block));
-+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
-+ goto errout;
-+
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
-+ EXT3_DIR_REC_LEN(0));
-+ for (; de < top; de = ext3_next_entry(de)) {
-+ ext3fs_dirhash(de->name, de->name_len, &hinfo);
-+ if ((hinfo.hash < start_hash) ||
-+ ((hinfo.hash == start_hash) &&
-+ (hinfo.minor_hash < start_minor_hash)))
-+ continue;
-+ ext3_htree_store_dirent(dir_file, hinfo.hash,
-+ hinfo.minor_hash, de);
-+ count++;
-+ }
-+ brelse (bh);
-+ hashval = ~1;
-+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
-+ frame, frames, &err, &hashval);
-+ if (next_hash)
-+ *next_hash = hashval;
-+ if (ret == -1)
-+ goto errout;
-+ /*
-+ * Stop if: (a) there are no more entries, or
-+ * (b) we have inserted at least one entry and the
-+ * next hash value is not a continuation
-+ */
-+ if ((ret == 0) ||
-+ (count && ((hashval & 1) == 0)))
-+ break;
-+ }
-+ dx_release(frames);
-+ dxtrace(printk("Fill tree: returned %d entries\n", count));
-+ return count;
-+errout:
-+ dx_release(frames);
-+ return (err);
-+}
-+
-+
-+/*
-+ * Directory block splitting, compacting
-+ */
-+
-+static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
-+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
-+{
-+ int count = 0;
-+ char *base = (char *) de;
-+ struct dx_hash_info h = *hinfo;
-+
-+ while ((char *) de < base + size)
-+ {
-+ if (de->name_len && de->inode) {
-+ ext3fs_dirhash(de->name, de->name_len, &h);
-+ map_tail--;
-+ map_tail->hash = h.hash;
-+ map_tail->offs = (u32) ((char *) de - base);
-+ count++;
-+ }
-+ /* XXX: do we need to check rec_len == 0 case? -Chris */
-+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ return count;
-+}
-+
-+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
-+{
-+ struct dx_map_entry *p, *q, *top = map + count - 1;
-+ int more;
-+ /* Combsort until bubble sort doesn't suck */
-+ while (count > 2)
-+ {
-+ count = count*10/13;
-+ if (count - 9 < 2) /* 9, 10 -> 11 */
-+ count = 11;
-+ for (p = top, q = p - count; q >= map; p--, q--)
-+ if (p->hash < q->hash)
-+ swap(*p, *q);
-+ }
-+ /* Garden variety bubble sort */
-+ do {
-+ more = 0;
-+ q = top;
-+ while (q-- > map)
-+ {
-+ if (q[1].hash >= q[0].hash)
-+ continue;
-+ swap(*(q+1), *q);
-+ more = 1;
-+ }
-+ } while(more);
-+}
-+
-+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
-+{
-+ struct dx_entry *entries = frame->entries;
-+ struct dx_entry *old = frame->at, *new = old + 1;
-+ int count = dx_get_count(entries);
-+
-+ assert(count < dx_get_limit(entries));
-+ assert(old < entries + count);
-+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
-+ dx_set_hash(new, hash);
-+ dx_set_block(new, block);
-+ dx_set_count(entries, count + 1);
-+}
-+#endif
-+
-+
-+static void ext3_update_dx_flag(struct inode *inode)
-+{
-+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-+ EXT3_FEATURE_COMPAT_DIR_INDEX))
-+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
-+}
-+
- /*
- * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
- *
-@@ -96,6 +726,7 @@
- return 0;
- }
-
-+
- /*
- * ext3_find_entry()
- *
-@@ -107,6 +738,8 @@
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
-+
-+
- static struct buffer_head * ext3_find_entry (struct dentry *dentry,
- struct ext3_dir_entry_2 ** res_dir)
- {
-@@ -121,12 +754,32 @@
- int num = 0;
- int nblocks, i, err;
- struct inode *dir = dentry->d_parent->d_inode;
-+ int namelen;
-+ const u8 *name;
-+ unsigned blocksize;
-
- *res_dir = NULL;
- sb = dir->i_sb;
--
-+ blocksize = sb->s_blocksize;
-+ namelen = dentry->d_name.len;
-+ name = dentry->d_name.name;
-+ if (namelen > EXT3_NAME_LEN)
-+ return NULL;
-+#ifdef CONFIG_EXT3_INDEX
-+ if (is_dx(dir)) {
-+ bh = ext3_dx_find_entry(dentry, res_dir, &err);
-+ /*
-+ * On success, or if the error was file not found,
-+ * return. Otherwise, fall back to doing a search the
-+ * old fashioned way.
-+ */
-+ if (bh || (err != ERR_BAD_DX_DIR))
-+ return bh;
-+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
-+ }
-+#endif
- nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-- start = dir->u.ext3_i.i_dir_start_lookup;
-+ start = EXT3_I(dir)->i_dir_start_lookup;
- if (start >= nblocks)
- start = 0;
- block = start;
-@@ -167,7 +820,7 @@
- i = search_dirblock(bh, dir, dentry,
- block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
- if (i == 1) {
-- dir->u.ext3_i.i_dir_start_lookup = block;
-+ EXT3_I(dir)->i_dir_start_lookup = block;
- ret = bh;
- goto cleanup_and_exit;
- } else {
-@@ -198,6 +851,74 @@
- return ret;
- }
-
-+#ifdef CONFIG_EXT3_INDEX
-+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-+ struct ext3_dir_entry_2 **res_dir, int *err)
-+{
-+ struct super_block * sb;
-+ struct dx_hash_info hinfo;
-+ u32 hash;
-+ struct dx_frame frames[2], *frame;
-+ struct ext3_dir_entry_2 *de, *top;
-+ struct buffer_head *bh;
-+ unsigned long block;
-+ int retval;
-+ int namelen = dentry->d_name.len;
-+ const u8 *name = dentry->d_name.name;
-+ struct inode *dir = dentry->d_parent->d_inode;
-+
-+ sb = dir->i_sb;
-+ /* NFS may look up ".." - look at dx_root directory block */
-+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-+ if (!(frame = dx_probe(dentry, 0, &hinfo, frames, err)))
-+ return NULL;
-+ } else {
-+ frame = frames;
-+ frame->bh = NULL; /* for dx_release() */
-+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
-+ dx_set_block(frame->at, 0); /* dx_root block is 0 */
-+ }
-+ hash = hinfo.hash;
-+ do {
-+ block = dx_get_block(frame->at);
-+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
-+ goto errout;
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ top = (struct ext3_dir_entry_2 *)((char *)de + sb->s_blocksize -
-+ EXT3_DIR_REC_LEN(0));
-+ for (; de < top; de = ext3_next_entry(de))
-+ if (ext3_match (namelen, name, de)) {
-+ if (!ext3_check_dir_entry("ext3_find_entry",
-+ dir, de, bh,
-+ (block<<EXT3_BLOCK_SIZE_BITS(sb))
-+ +((char *)de - bh->b_data))) {
-+ brelse (bh);
-+ goto errout;
-+ }
-+ *res_dir = de;
-+ dx_release (frames);
-+ return bh;
-+ }
-+ brelse (bh);
-+ /* Check to see if we should continue to search */
-+ retval = ext3_htree_next_block(dir, hash, frame,
-+ frames, err, 0);
-+ if (retval == -1) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "error reading index page in directory #%lu",
-+ dir->i_ino);
-+ goto errout;
-+ }
-+ } while (retval == 1);
-+
-+ *err = -ENOENT;
-+errout:
-+ dxtrace(printk("%s not found\n", name));
-+ dx_release (frames);
-+ return NULL;
-+}
-+#endif
-+
- static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
- {
- struct inode * inode;
-@@ -214,8 +927,9 @@
- brelse (bh);
- inode = iget(dir->i_sb, ino);
-
-- if (!inode)
-+ if (!inode) {
- return ERR_PTR(-EACCES);
-+ }
- }
- d_add(dentry, inode);
- return NULL;
-@@ -239,6 +953,301 @@
- de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
- }
-
-+#ifdef CONFIG_EXT3_INDEX
-+static struct ext3_dir_entry_2 *
-+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
-+{
-+ unsigned rec_len = 0;
-+
-+ while (count--) {
-+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
-+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
-+ memcpy (to, de, rec_len);
-+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
-+ de->inode = 0;
-+ map++;
-+ to += rec_len;
-+ }
-+ return (struct ext3_dir_entry_2 *) (to - rec_len);
-+}
-+
-+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
-+{
-+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
-+ unsigned rec_len = 0;
-+
-+ prev = to = de;
-+ while ((char*)de < base + size) {
-+ next = (struct ext3_dir_entry_2 *) ((char *) de +
-+ le16_to_cpu(de->rec_len));
-+ if (de->inode && de->name_len) {
-+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
-+ if (de > to)
-+ memmove(to, de, rec_len);
-+ to->rec_len = cpu_to_le16(rec_len);
-+ prev = to;
-+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
-+ }
-+ de = next;
-+ }
-+ return prev;
-+}
-+
-+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-+ struct buffer_head **bh,struct dx_frame *frame,
-+ struct dx_hash_info *hinfo, int *error)
-+{
-+ unsigned blocksize = dir->i_sb->s_blocksize;
-+ unsigned count, continued;
-+ struct buffer_head *bh2;
-+ u32 newblock;
-+ u32 hash2;
-+ struct dx_map_entry *map;
-+ char *data1 = (*bh)->b_data, *data2;
-+ unsigned split;
-+ struct ext3_dir_entry_2 *de = NULL, *de2;
-+ int err;
-+
-+ bh2 = ext3_append (handle, dir, &newblock, error);
-+ if (!(bh2)) {
-+ brelse(*bh);
-+ *bh = NULL;
-+ goto errout;
-+ }
-+
-+ BUFFER_TRACE(*bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, *bh);
-+ if (err) {
-+ journal_error:
-+ brelse(*bh);
-+ brelse(bh2);
-+ *bh = NULL;
-+ ext3_std_error(dir->i_sb, err);
-+ goto errout;
-+ }
-+ BUFFER_TRACE(frame->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+
-+ data2 = bh2->b_data;
-+
-+ /* create map in the end of data2 block */
-+ map = (struct dx_map_entry *) (data2 + blocksize);
-+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
-+ blocksize, hinfo, map);
-+ map -= count;
-+ split = count/2; // need to adjust to actual middle
-+ dx_sort_map (map, count);
-+ hash2 = map[split].hash;
-+ continued = hash2 == map[split - 1].hash;
-+ dxtrace(printk("Split block %i at %x, %i/%i\n",
-+ dx_get_block(frame->at), hash2, split, count-split));
-+
-+ /* Fancy dance to stay within two buffers */
-+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
-+ de = dx_pack_dirents(data1,blocksize);
-+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
-+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
-+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
-+
-+ /* Which block gets the new entry? */
-+ if (hinfo->hash >= hash2)
-+ {
-+ swap(*bh, bh2);
-+ de = de2;
-+ }
-+ dx_insert_block (frame, hash2 + continued, newblock);
-+ err = ext3_journal_dirty_metadata (handle, bh2);
-+ if (err)
-+ goto journal_error;
-+ err = ext3_journal_dirty_metadata (handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+ brelse (bh2);
-+ dxtrace(dx_show_index ("frame", frame->entries));
-+errout:
-+ return de;
-+}
-+#endif
-+
-+
-+/*
-+ * Add a new entry into a directory (leaf) block. If de is non-NULL,
-+ * it points to a directory entry which is guaranteed to be large
-+ * enough for new directory entry. If de is NULL, then
-+ * add_dirent_to_buf will attempt search the directory block for
-+ * space. It will return -ENOSPC if no space is available, and -EIO
-+ * and -EEXIST if directory entry already exists.
-+ *
-+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
-+ * all other cases bh is released.
-+ */
-+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct ext3_dir_entry_2 *de,
-+ struct buffer_head * bh)
-+{
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+ unsigned long offset = 0;
-+ unsigned short reclen;
-+ int nlen, rlen, err;
-+ char *top;
-+
-+ reclen = EXT3_DIR_REC_LEN(namelen);
-+ if (!de) {
-+ de = (struct ext3_dir_entry_2 *)bh->b_data;
-+ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
-+ while ((char *) de <= top) {
-+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
-+ bh, offset)) {
-+ brelse (bh);
-+ return -EIO;
-+ }
-+ if (ext3_match (namelen, name, de)) {
-+ brelse (bh);
-+ return -EEXIST;
-+ }
-+ nlen = EXT3_DIR_REC_LEN(de->name_len);
-+ rlen = le16_to_cpu(de->rec_len);
-+ if ((de->inode? rlen - nlen: rlen) >= reclen)
-+ break;
-+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
-+ offset += rlen;
-+ }
-+ if ((char *) de > top)
-+ return -ENOSPC;
-+ }
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err) {
-+ ext3_std_error(dir->i_sb, err);
-+ brelse(bh);
-+ return err;
-+ }
-+
-+ /* By now the buffer is marked for journaling */
-+ nlen = EXT3_DIR_REC_LEN(de->name_len);
-+ rlen = le16_to_cpu(de->rec_len);
-+ if (de->inode) {
-+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
-+ de1->rec_len = cpu_to_le16(rlen - nlen);
-+ de->rec_len = cpu_to_le16(nlen);
-+ de = de1;
-+ }
-+ de->file_type = EXT3_FT_UNKNOWN;
-+ if (inode) {
-+ de->inode = cpu_to_le32(inode->i_ino);
-+ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-+ } else
-+ de->inode = 0;
-+ de->name_len = namelen;
-+ memcpy (de->name, name, namelen);
-+ /*
-+ * XXX shouldn't update any times until successful
-+ * completion of syscall, but too many callers depend
-+ * on this.
-+ *
-+ * XXX similarly, too many callers depend on
-+ * ext3_new_inode() setting the times, but error
-+ * recovery deletes the inode, so the worst that can
-+ * happen is that the times are slightly out of date
-+ * and/or different from the directory change time.
-+ */
-+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-+ ext3_update_dx_flag(dir);
-+ dir->i_version = ++event;
-+ ext3_mark_inode_dirty(handle, dir);
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ ext3_std_error(dir->i_sb, err);
-+ brelse(bh);
-+ return 0;
-+}
-+
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * This converts a one block unindexed directory to a 3 block indexed
-+ * directory, and adds the dentry to the indexed directory.
-+ */
-+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct buffer_head *bh)
-+{
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+ struct buffer_head *bh2;
-+ struct dx_root *root;
-+ struct dx_frame frames[2], *frame;
-+ struct dx_entry *entries;
-+ struct ext3_dir_entry_2 *de, *de2;
-+ char *data1, *top;
-+ unsigned len;
-+ int retval;
-+ unsigned blocksize;
-+ struct dx_hash_info hinfo;
-+ u32 block;
-+
-+ blocksize = dir->i_sb->s_blocksize;
-+ dxtrace(printk("Creating index\n"));
-+ retval = ext3_journal_get_write_access(handle, bh);
-+ if (retval) {
-+ ext3_std_error(dir->i_sb, retval);
-+ brelse(bh);
-+ return retval;
-+ }
-+ root = (struct dx_root *) bh->b_data;
-+
-+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
-+ bh2 = ext3_append (handle, dir, &block, &retval);
-+ if (!(bh2)) {
-+ brelse(bh);
-+ return retval;
-+ }
-+ data1 = bh2->b_data;
-+
-+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *)&root->dotdot;
-+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
-+ len = ((char *) root) + blocksize - (char *) de;
-+ memcpy (data1, de, len);
-+ de = (struct ext3_dir_entry_2 *) data1;
-+ top = data1 + len;
-+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
-+ de = de2;
-+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-+ /* Initialize the root; the dot dirents already exist */
-+ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
-+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
-+ memset (&root->info, 0, sizeof(root->info));
-+ root->info.info_length = sizeof(root->info);
-+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
-+ entries = root->entries;
-+ dx_set_block (entries, 1);
-+ dx_set_count (entries, 1);
-+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
-+
-+ /* Initialize as for dx_probe */
-+ hinfo.hash_version = root->info.hash_version;
-+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
-+ ext3fs_dirhash(name, namelen, &hinfo);
-+ frame = frames;
-+ frame->entries = entries;
-+ frame->at = entries;
-+ frame->bh = bh;
-+ bh = bh2;
-+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+ dx_release (frames);
-+ if (!(de))
-+ return retval;
-+
-+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
-+}
-+#endif
-+
- /*
- * ext3_add_entry()
- *
-@@ -249,127 +1258,198 @@
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--
--/*
-- * AKPM: the journalling code here looks wrong on the error paths
-- */
- static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
- struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
-- const char *name = dentry->d_name.name;
-- int namelen = dentry->d_name.len;
- unsigned long offset;
-- unsigned short rec_len;
- struct buffer_head * bh;
-- struct ext3_dir_entry_2 * de, * de1;
-+ struct ext3_dir_entry_2 *de;
- struct super_block * sb;
- int retval;
-+#ifdef CONFIG_EXT3_INDEX
-+ int dx_fallback=0;
-+#endif
-+ unsigned blocksize;
-+ unsigned nlen, rlen;
-+ u32 block, blocks;
-
- sb = dir->i_sb;
--
-- if (!namelen)
-+ blocksize = sb->s_blocksize;
-+ if (!dentry->d_name.len)
- return -EINVAL;
-- bh = ext3_bread (handle, dir, 0, 0, &retval);
-+#ifdef CONFIG_EXT3_INDEX
-+ if (is_dx(dir)) {
-+ retval = ext3_dx_add_entry(handle, dentry, inode);
-+ if (!retval || (retval != ERR_BAD_DX_DIR))
-+ return retval;
-+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
-+ dx_fallback++;
-+ ext3_mark_inode_dirty(handle, dir);
-+ }
-+#endif
-+ blocks = dir->i_size >> sb->s_blocksize_bits;
-+ for (block = 0, offset = 0; block < blocks; block++) {
-+ bh = ext3_bread(handle, dir, block, 0, &retval);
-+ if(!bh)
-+ return retval;
-+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
-+ if (retval != -ENOSPC)
-+ return retval;
-+
-+#ifdef CONFIG_EXT3_INDEX
-+ if (blocks == 1 && !dx_fallback &&
-+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
-+ return make_indexed_dir(handle, dentry, inode, bh);
-+#endif
-+ brelse(bh);
-+ }
-+ bh = ext3_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
-- rec_len = EXT3_DIR_REC_LEN(namelen);
-- offset = 0;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- while (1) {
-- if ((char *)de >= sb->s_blocksize + bh->b_data) {
-- brelse (bh);
-- bh = NULL;
-- bh = ext3_bread (handle, dir,
-- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
-- if (!bh)
-- return retval;
-- if (dir->i_size <= offset) {
-- if (dir->i_size == 0) {
-- brelse(bh);
-- return -ENOENT;
-- }
--
-- ext3_debug ("creating next block\n");
--
-- BUFFER_TRACE(bh, "get_write_access");
-- ext3_journal_get_write_access(handle, bh);
-- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- de->inode = 0;
-- de->rec_len = le16_to_cpu(sb->s_blocksize);
-- dir->u.ext3_i.i_disksize =
-- dir->i_size = offset + sb->s_blocksize;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-- ext3_mark_inode_dirty(handle, dir);
-- } else {
--
-- ext3_debug ("skipping to next block\n");
-+ de->inode = 0;
-+ de->rec_len = cpu_to_le16(rlen = blocksize);
-+ nlen = 0;
-+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
-+}
-
-- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- }
-- }
-- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
-- offset)) {
-- brelse (bh);
-- return -ENOENT;
-- }
-- if (ext3_match (namelen, name, de)) {
-- brelse (bh);
-- return -EEXIST;
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * Returns 0 for success, or a negative error value
-+ */
-+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
-+{
-+ struct dx_frame frames[2], *frame;
-+ struct dx_entry *entries, *at;
-+ struct dx_hash_info hinfo;
-+ struct buffer_head * bh;
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ struct super_block * sb = dir->i_sb;
-+ struct ext3_dir_entry_2 *de;
-+ int err;
-+
-+ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
-+ if (!frame)
-+ return err;
-+ entries = frame->entries;
-+ at = frame->at;
-+
-+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
-+ goto cleanup;
-+
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto journal_error;
-+
-+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
-+ if (err != -ENOSPC) {
-+ bh = 0;
-+ goto cleanup;
-+ }
-+
-+ /* Block full, should compress but for now just split */
-+ dxtrace(printk("using %u of %u node entries\n",
-+ dx_get_count(entries), dx_get_limit(entries)));
-+ /* Need to split index? */
-+ if (dx_get_count(entries) == dx_get_limit(entries)) {
-+ u32 newblock;
-+ unsigned icount = dx_get_count(entries);
-+ int levels = frame - frames;
-+ struct dx_entry *entries2;
-+ struct dx_node *node2;
-+ struct buffer_head *bh2;
-+
-+ if (levels && (dx_get_count(frames->entries) ==
-+ dx_get_limit(frames->entries))) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "Directory index full!\n");
-+ err = -ENOSPC;
-+ goto cleanup;
- }
-- if ((le32_to_cpu(de->inode) == 0 &&
-- le16_to_cpu(de->rec_len) >= rec_len) ||
-- (le16_to_cpu(de->rec_len) >=
-- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
-- BUFFER_TRACE(bh, "get_write_access");
-- ext3_journal_get_write_access(handle, bh);
-- /* By now the buffer is marked for journaling */
-- offset += le16_to_cpu(de->rec_len);
-- if (le32_to_cpu(de->inode)) {
-- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
-- EXT3_DIR_REC_LEN(de->name_len));
-- de1->rec_len =
-- cpu_to_le16(le16_to_cpu(de->rec_len) -
-- EXT3_DIR_REC_LEN(de->name_len));
-- de->rec_len = cpu_to_le16(
-- EXT3_DIR_REC_LEN(de->name_len));
-- de = de1;
-+
-+ bh2 = ext3_append (handle, dir, &newblock, &err);
-+ if (!(bh2))
-+ goto cleanup;
-+ node2 = (struct dx_node *)(bh2->b_data);
-+ entries2 = node2->entries;
-+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
-+ node2->fake.inode = 0;
-+ BUFFER_TRACE(frame->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+ if (levels) {
-+ unsigned icount1 = icount/2, icount2 = icount - icount1;
-+ unsigned hash2 = dx_get_hash(entries + icount1);
-+ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
-+
-+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-+ err = ext3_journal_get_write_access(handle,
-+ frames[0].bh);
-+ if (err)
-+ goto journal_error;
-+
-+ memcpy ((char *) entries2, (char *) (entries + icount1),+ icount2 * sizeof(struct dx_entry));
-+ dx_set_count (entries, icount1);
-+ dx_set_count (entries2, icount2);
-+ dx_set_limit (entries2, dx_node_limit(dir));
-+
-+ /* Which index block gets the new entry? */
-+ if (at - entries >= icount1) {
-+ frame->at = at = at - entries - icount1 + entries2;
-+ frame->entries = entries = entries2;
-+ swap(frame->bh, bh2);
- }
-- de->file_type = EXT3_FT_UNKNOWN;
-- if (inode) {
-- de->inode = cpu_to_le32(inode->i_ino);
-- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-- } else
-- de->inode = 0;
-- de->name_len = namelen;
-- memcpy (de->name, name, namelen);
-- /*
-- * XXX shouldn't update any times until successful
-- * completion of syscall, but too many callers depend
-- * on this.
-- *
-- * XXX similarly, too many callers depend on
-- * ext3_new_inode() setting the times, but error
-- * recovery deletes the inode, so the worst that can
-- * happen is that the times are slightly out of date
-- * and/or different from the directory change time.
-- */
-- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-- ext3_mark_inode_dirty(handle, dir);
-- dir->i_version = ++event;
-- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-- ext3_journal_dirty_metadata(handle, bh);
-- brelse(bh);
-- return 0;
-+ dx_insert_block (frames + 0, hash2, newblock);
-+ dxtrace(dx_show_index ("node", frames[1].entries));
-+ dxtrace(dx_show_index ("node",
-+ ((struct dx_node *) bh2->b_data)->entries));
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (err)
-+ goto journal_error;
-+ brelse (bh2);
-+ } else {
-+ dxtrace(printk("Creating second level index...\n"));
-+ memcpy((char *) entries2, (char *) entries,
-+ icount * sizeof(struct dx_entry));
-+ dx_set_limit(entries2, dx_node_limit(dir));
-+
-+ /* Set up root */
-+ dx_set_count(entries, 1);
-+ dx_set_block(entries + 0, newblock);
-+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-+
-+ /* Add new access path frame */
-+ frame = frames + 1;
-+ frame->at = at = at - entries + entries2;
-+ frame->entries = entries = entries2;
-+ frame->bh = bh2;
-+ err = ext3_journal_get_write_access(handle,
-+ frame->bh);
-+ if (err)
-+ goto journal_error;
- }
-- offset += le16_to_cpu(de->rec_len);
-- de = (struct ext3_dir_entry_2 *)
-- ((char *) de + le16_to_cpu(de->rec_len));
-+ ext3_journal_dirty_metadata(handle, frames[0].bh);
- }
-- brelse (bh);
-- return -ENOSPC;
-+ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-+ if (!de)
-+ goto cleanup;
-+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-+ bh = 0;
-+ goto cleanup;
-+
-+journal_error:
-+ ext3_std_error(dir->i_sb, err);
-+cleanup:
-+ if (bh)
-+ brelse(bh);
-+ dx_release(frames);
-+ return err;
- }
-+#endif
-
- /*
- * ext3_delete_entry deletes a directory entry by merging it with the
-@@ -453,9 +1533,11 @@
- struct inode * inode;
- int err;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -480,9 +1562,11 @@
- struct inode *inode;
- int err;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -512,9 +1596,11 @@
- if (dir->i_nlink >= EXT3_LINK_MAX)
- return -EMLINK;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -526,7 +1612,8 @@
-
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
-- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
-+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-+ inode->i_blocks = 0;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
- inode->i_nlink--; /* is this nlink == 0? */
-@@ -555,21 +1642,19 @@
- brelse (dir_block);
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_entry (handle, dentry, inode);
-- if (err)
-- goto out_no_entry;
-+ if (err) {
-+ inode->i_nlink = 0;
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+ }
- dir->i_nlink++;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- d_instantiate(dentry, inode);
- out_stop:
- ext3_journal_stop(handle, dir);
- return err;
--
--out_no_entry:
-- inode->i_nlink = 0;
-- ext3_mark_inode_dirty(handle, inode);
-- iput (inode);
-- goto out_stop;
- }
-
- /*
-@@ -656,7 +1741,7 @@
- int err = 0, rc;
-
- lock_super(sb);
-- if (!list_empty(&inode->u.ext3_i.i_orphan))
-+ if (!list_empty(&EXT3_I(inode)->i_orphan))
- goto out_unlock;
-
- /* Orphan handling is only valid for files with data blocks
-@@ -697,7 +1782,7 @@
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
- if (!err)
-- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
-+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-
- jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
- jbd_debug(4, "orphan inode %ld will point to %d\n",
-@@ -715,25 +1800,26 @@
- int ext3_orphan_del(handle_t *handle, struct inode *inode)
- {
- struct list_head *prev;
-+ struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_sb_info *sbi;
- ino_t ino_next;
- struct ext3_iloc iloc;
- int err = 0;
-
- lock_super(inode->i_sb);
-- if (list_empty(&inode->u.ext3_i.i_orphan)) {
-+ if (list_empty(&ei->i_orphan)) {
- unlock_super(inode->i_sb);
- return 0;
- }
-
- ino_next = NEXT_ORPHAN(inode);
-- prev = inode->u.ext3_i.i_orphan.prev;
-+ prev = ei->i_orphan.prev;
- sbi = EXT3_SB(inode->i_sb);
-
- jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
-
-- list_del(&inode->u.ext3_i.i_orphan);
-- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-+ list_del(&ei->i_orphan);
-+ INIT_LIST_HEAD(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
-@@ -794,8 +1880,9 @@
- handle_t *handle;
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- retval = -ENOENT;
- bh = ext3_find_entry (dentry, &de);
-@@ -833,7 +1920,7 @@
- ext3_mark_inode_dirty(handle, inode);
- dir->i_nlink--;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
-
- end_rmdir:
-@@ -851,8 +1938,9 @@
- handle_t *handle;
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -879,7 +1967,7 @@
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- inode->i_nlink--;
- if (!inode->i_nlink)
-@@ -905,9 +1993,11 @@
- if (l > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -917,7 +2007,7 @@
- if (IS_ERR(inode))
- goto out_stop;
-
-- if (l > sizeof (inode->u.ext3_i.i_data)) {
-+ if (l > sizeof (EXT3_I(inode)->i_data)) {
- inode->i_op = &ext3_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext3_aops;
- /*
-@@ -926,25 +2016,23 @@
- * i_size in generic_commit_write().
- */
- err = block_symlink(inode, symname, l);
-- if (err)
-- goto out_no_entry;
-+ if (err) {
-+ ext3_dec_count(handle, inode);
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+ }
- } else {
- inode->i_op = &ext3_fast_symlink_inode_operations;
-- memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
-+ memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
- inode->i_size = l-1;
- }
-- inode->u.ext3_i.i_disksize = inode->i_size;
-+ EXT3_I(inode)->i_disksize = inode->i_size;
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- out_stop:
- ext3_journal_stop(handle, dir);
- return err;
--
--out_no_entry:
-- ext3_dec_count(handle, inode);
-- ext3_mark_inode_dirty(handle, inode);
-- iput (inode);
-- goto out_stop;
- }
-
- static int ext3_link (struct dentry * old_dentry,
-@@ -957,12 +2045,15 @@
- if (S_ISDIR(inode->i_mode))
- return -EPERM;
-
-- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (inode->i_nlink >= EXT3_LINK_MAX) {
- return -EMLINK;
-+ }
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -996,9 +2087,11 @@
-
- old_bh = new_bh = dir_bh = NULL;
-
-- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
- handle->h_sync = 1;
-@@ -1078,7 +2171,7 @@
- new_inode->i_ctime = CURRENT_TIME;
- }
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(old_dir);
- if (dir_bh) {
- BUFFER_TRACE(dir_bh, "get_write_access");
- ext3_journal_get_write_access(handle, dir_bh);
-@@ -1090,7 +2183,7 @@
- new_inode->i_nlink--;
- } else {
- new_dir->i_nlink++;
-- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(new_dir);
- ext3_mark_inode_dirty(handle, new_dir);
- }
- }
-Index: linux-2.4.19.SuSE/fs/ext3/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-05-27 11:07:21.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-05-27 11:08:28.000000000 -0700
-@@ -741,6 +741,7 @@
- es->s_mtime = cpu_to_le32(CURRENT_TIME);
- ext3_update_dynamic_rev(sb);
- EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+
- ext3_commit_super (sb, es, 1);
- if (test_opt (sb, DEBUG))
- printk (KERN_INFO
-@@ -751,6 +752,7 @@
- EXT3_BLOCKS_PER_GROUP(sb),
- EXT3_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
-+
- printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
- bdevname(sb->s_dev));
- if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
-@@ -925,6 +927,7 @@
- return res;
- }
-
-+
- struct super_block * ext3_read_super (struct super_block * sb, void * data,
- int silent)
- {
-@@ -1113,6 +1116,9 @@
- sbi->s_mount_state = le16_to_cpu(es->s_state);
- sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
- sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
-+ for (i=0; i < 4; i++)
-+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
-+ sbi->s_def_hash_version = es->s_def_hash_version;
-
- if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
-@@ -1821,6 +1827,7 @@
- exit_ext3_xattr();
- }
-
-+EXPORT_SYMBOL(ext3_force_commit);
- EXPORT_SYMBOL(ext3_bread);
-
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-Index: linux-2.4.19.SuSE/fs/ext3/file.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/file.c 2002-12-04 09:46:18.000000000 -0800
-+++ linux-2.4.19.SuSE/fs/ext3/file.c 2004-05-27 11:08:28.000000000 -0700
-@@ -38,6 +38,9 @@
- {
- if (filp->f_mode & FMODE_WRITE)
- ext3_discard_prealloc (inode);
-+ if (is_dx(inode) && filp->private_data)
-+ ext3_htree_free_dir_info(filp->private_data);
-+
- return 0;
- }
-
-Index: linux-2.4.19.SuSE/fs/ext3/hash.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/hash.c 1970-01-02 14:15:01.000000000 -0800
-+++ linux-2.4.19.SuSE/fs/ext3/hash.c 2004-05-27 11:08:28.000000000 -0700
-@@ -0,0 +1,215 @@
-+/*
-+ * linux/fs/ext3/hash.c
-+ *
-+ * Copyright (C) 2002 by Theodore Ts'o
-+ *
-+ * This file is released under the GPL v2.
-+ *
-+ * This file may be redistributed under the terms of the GNU Public
-+ * License.
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/sched.h>
-+#include <linux/ext3_fs.h>
-+
-+#define DELTA 0x9E3779B9
-+
-+static void TEA_transform(__u32 buf[4], __u32 const in[])
-+{
-+ __u32 sum = 0;
-+ __u32 b0 = buf[0], b1 = buf[1];
-+ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
-+ int n = 16;
-+
-+ do {
-+ sum += DELTA;
-+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
-+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-+ } while(--n);
-+
-+ buf[0] += b0;
-+ buf[1] += b1;
-+}
-+
-+/* F, G and H are basic MD4 functions: selection, majority, parity */
-+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
-+#define H(x, y, z) ((x) ^ (y) ^ (z))
-+
-+/*
-+ * The generic round function. The application is so specific that
-+ * we don't bother protecting all the arguments with parens, as is generally
-+ * good macro practice, in favor of extra legibility.
-+ * Rotation is separate from addition to prevent recomputation
-+ */
-+#define ROUND(f, a, b, c, d, x, s) \
-+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
-+#define K1 0
-+#define K2 013240474631UL
-+#define K3 015666365641UL
-+
-+/*
-+ * Basic cut-down MD4 transform. Returns only 32 bits of result.
-+ */
-+static void halfMD4Transform (__u32 buf[4], __u32 const in[])
-+{
-+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-+
-+ /* Round 1 */
-+ ROUND(F, a, b, c, d, in[0] + K1, 3);
-+ ROUND(F, d, a, b, c, in[1] + K1, 7);
-+ ROUND(F, c, d, a, b, in[2] + K1, 11);
-+ ROUND(F, b, c, d, a, in[3] + K1, 19);
-+ ROUND(F, a, b, c, d, in[4] + K1, 3);
-+ ROUND(F, d, a, b, c, in[5] + K1, 7);
-+ ROUND(F, c, d, a, b, in[6] + K1, 11);
-+ ROUND(F, b, c, d, a, in[7] + K1, 19);
-+
-+ /* Round 2 */
-+ ROUND(G, a, b, c, d, in[1] + K2, 3);
-+ ROUND(G, d, a, b, c, in[3] + K2, 5);
-+ ROUND(G, c, d, a, b, in[5] + K2, 9);
-+ ROUND(G, b, c, d, a, in[7] + K2, 13);
-+ ROUND(G, a, b, c, d, in[0] + K2, 3);
-+ ROUND(G, d, a, b, c, in[2] + K2, 5);
-+ ROUND(G, c, d, a, b, in[4] + K2, 9);
-+ ROUND(G, b, c, d, a, in[6] + K2, 13);
-+
-+ /* Round 3 */
-+ ROUND(H, a, b, c, d, in[3] + K3, 3);
-+ ROUND(H, d, a, b, c, in[7] + K3, 9);
-+ ROUND(H, c, d, a, b, in[2] + K3, 11);
-+ ROUND(H, b, c, d, a, in[6] + K3, 15);
-+ ROUND(H, a, b, c, d, in[1] + K3, 3);
-+ ROUND(H, d, a, b, c, in[5] + K3, 9);
-+ ROUND(H, c, d, a, b, in[0] + K3, 11);
-+ ROUND(H, b, c, d, a, in[4] + K3, 15);
-+
-+ buf[0] += a;
-+ buf[1] += b;
-+ buf[2] += c;
-+ buf[3] += d;
-+}
-+
-+#undef ROUND
-+#undef F
-+#undef G
-+#undef H
-+#undef K1
-+#undef K2
-+#undef K3
-+
-+/* The old legacy hash */
-+static __u32 dx_hack_hash (const char *name, int len)
-+{
-+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
-+ while (len--) {
-+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
-+
-+ if (hash & 0x80000000) hash -= 0x7fffffff;
-+ hash1 = hash0;
-+ hash0 = hash;
-+ }
-+ return (hash0 << 1);
-+}
-+
-+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-+{
-+ __u32 pad, val;
-+ int i;
-+
-+ pad = (__u32)len | ((__u32)len << 8);
-+ pad |= pad << 16;
-+
-+ val = pad;
-+ if (len > num*4)
-+ len = num * 4;
-+ for (i=0; i < len; i++) {
-+ if ((i % 4) == 0)
-+ val = pad;
-+ val = msg[i] + (val << 8);
-+ if ((i % 4) == 3) {
-+ *buf++ = val;
-+ val = pad;
-+ num--;
-+ }
-+ }
-+ if (--num >= 0)
-+ *buf++ = val;
-+ while (--num >= 0)
-+ *buf++ = pad;
-+}
-+
-+/*
-+ * Returns the hash of a filename. If len is 0 and name is NULL, then
-+ * this function can be used to test whether or not a hash version is
-+ * supported.
-+ *
-+ * The seed is an 4 longword (32 bits) "secret" which can be used to
-+ * uniquify a hash. If the seed is all zero's, then some default seed
-+ * may be used.
-+ *
-+ * A particular hash version specifies whether or not the seed is
-+ * represented, and whether or not the returned hash is 32 bits or 64
-+ * bits. 32 bit hashes will return 0 for the minor hash.
-+ */
-+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
-+{
-+ __u32 hash;
-+ __u32 minor_hash = 0;
-+ const char *p;
-+ int i;
-+ __u32 in[8], buf[4];
-+
-+ /* Initialize the default seed for the hash checksum functions */
-+ buf[0] = 0x67452301;
-+ buf[1] = 0xefcdab89;
-+ buf[2] = 0x98badcfe;
-+ buf[3] = 0x10325476;
-+
-+ /* Check to see if the seed is all zero's */
-+ if (hinfo->seed) {
-+ for (i=0; i < 4; i++) {
-+ if (hinfo->seed[i])
-+ break;
-+ }
-+ if (i < 4)
-+ memcpy(buf, hinfo->seed, sizeof(buf));
-+ }
-+
-+ switch (hinfo->hash_version) {
-+ case DX_HASH_LEGACY:
-+ hash = dx_hack_hash(name, len);
-+ break;
-+ case DX_HASH_HALF_MD4:
-+ p = name;
-+ while (len > 0) {
-+ str2hashbuf(p, len, in, 8);
-+ halfMD4Transform(buf, in);
-+ len -= 32;
-+ p += 32;
-+ }
-+ minor_hash = buf[2];
-+ hash = buf[1];
-+ break;
-+ case DX_HASH_TEA:
-+ p = name;
-+ while (len > 0) {
-+ str2hashbuf(p, len, in, 4);
-+ TEA_transform(buf, in);
-+ len -= 16;
-+ p += 16;
-+ }
-+ hash = buf[0];
-+ minor_hash = buf[1];
-+ break;
-+ default:
-+ hinfo->hash = 0;
-+ return -1;
-+ }
-+ hinfo->hash = hash & ~1;
-+ hinfo->minor_hash = minor_hash;
-+ return 0;
-+}
-Index: linux-2.4.19.SuSE/lib/rbtree.c
-===================================================================
---- linux-2.4.19.SuSE.orig/lib/rbtree.c 2002-08-02 17:39:46.000000000 -0700
-+++ linux-2.4.19.SuSE/lib/rbtree.c 2004-05-27 11:08:28.000000000 -0700
-@@ -17,6 +17,8 @@
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
- linux/lib/rbtree.c
-+
-+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
- */
-
- #include <linux/rbtree.h>
-@@ -294,3 +296,43 @@
- __rb_erase_color(child, parent, root);
- }
- EXPORT_SYMBOL(rb_erase);
-+
-+/*
-+ * This function returns the first node (in sort order) of the tree.
-+ */
-+rb_node_t *rb_get_first(rb_root_t *root)
-+{
-+ rb_node_t *n;
-+
-+ n = root->rb_node;
-+ if (!n)
-+ return 0;
-+ while (n->rb_left)
-+ n = n->rb_left;
-+ return n;
-+}
-+EXPORT_SYMBOL(rb_get_first);
-+
-+/*
-+ * Given a node, this function will return the next node in the tree.
-+ */
-+rb_node_t *rb_get_next(rb_node_t *n)
-+{
-+ rb_node_t *parent;
-+
-+ if (n->rb_right) {
-+ n = n->rb_right;
-+ while (n->rb_left)
-+ n = n->rb_left;
-+ return n;
-+ } else {
-+ while ((parent = n->rb_parent)) {
-+ if (n == parent->rb_left)
-+ return parent;
-+ n = parent;
-+ }
-+ return 0;
-+ }
-+}
-+EXPORT_SYMBOL(rb_get_next);
-+
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h 2003-10-05 09:30:34.000000000 -0700
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h 2004-05-27 11:08:28.000000000 -0700
-@@ -40,6 +40,11 @@
- #define EXT3FS_VERSION "2.4-0.9.18"
-
- /*
-+ * Always enable hashed directories
-+ */
-+#define CONFIG_EXT3_INDEX
-+
-+/*
- * Debug code
- */
- #ifdef EXT3FS_DEBUG
-@@ -414,8 +419,11 @@
- /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
- __u32 s_journal_dev; /* device number of journal file */
- __u32 s_last_orphan; /* start of list of inodes to delete */
--
--/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
-+ __u32 s_hash_seed[4]; /* HTREE hash seed */
-+ __u8 s_def_hash_version; /* Default hash version to use */
-+ __u8 s_reserved_char_pad;
-+ __u16 s_reserved_word_pad;
-+ __u32 s_reserved[192]; /* Padding to the end of the block */
- };
-
- #ifdef __KERNEL__
-@@ -552,9 +560,46 @@
- #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
- #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
- ~EXT3_DIR_ROUND)
-+/*
-+ * Hash Tree Directory indexing
-+ * (c) Daniel Phillips, 2001
-+ */
-+
-+#ifdef CONFIG_EXT3_INDEX
-+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
-+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#else
-+ #define is_dx(dir) 0
-+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
-+#endif
-+
-+/* Legal values for the dx_root hash_version field: */
-+
-+#define DX_HASH_LEGACY 0
-+#define DX_HASH_HALF_MD4 1
-+#define DX_HASH_TEA 2
-+
-+/* hash info structure used by the directory hash */
-+struct dx_hash_info
-+{
-+ u32 hash;
-+ u32 minor_hash;
-+ int hash_version;
-+ u32 *seed;
-+};
-
- #ifdef __KERNEL__
- /*
-+ * Control parameters used by ext3_htree_next_block
-+ */
-+#define HASH_NB_ALWAYS 1
-+
-+
-+/*
- * Describe an inode's exact location on disk and in memory
- */
- struct ext3_iloc
-@@ -564,6 +609,27 @@
- unsigned long block_group;
- };
-
-+
-+/*
-+ * This structure is stuffed into the struct file's private_data field
-+ * for directories. It is where we put information so that we can do
-+ * readdir operations in hash tree order.
-+ */
-+struct dir_private_info {
-+ rb_root_t root;
-+ rb_node_t *curr_node;
-+ struct fname *extra_fname;
-+ loff_t last_pos;
-+ __u32 curr_hash;
-+ __u32 curr_minor_hash;
-+ __u32 next_hash;
-+};
-+
-+/*
-+ * Special error return code only used by dx_probe() and its callers.
-+ */
-+#define ERR_BAD_DX_DIR -75000
-+
- /*
- * Function prototypes
- */
-@@ -591,11 +657,20 @@
-
- /* dir.c */
- extern int ext3_check_dir_entry(const char *, struct inode *,
-- struct ext3_dir_entry_2 *, struct buffer_head *,
-- unsigned long);
-+ struct ext3_dir_entry_2 *,
-+ struct buffer_head *, unsigned long);
-+extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-+ __u32 minor_hash,
-+ struct ext3_dir_entry_2 *dirent);
-+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
-+
- /* fsync.c */
- extern int ext3_sync_file (struct file *, struct dentry *, int);
-
-+/* hash.c */
-+extern int ext3fs_dirhash(const char *name, int len, struct
-+ dx_hash_info *hinfo);
-+
- /* ialloc.c */
- extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
- extern void ext3_free_inode (handle_t *, struct inode *);
-@@ -628,6 +703,8 @@
- /* namei.c */
- extern int ext3_orphan_add(handle_t *, struct inode *);
- extern int ext3_orphan_del(handle_t *, struct inode *);
-+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-+ __u32 start_minor_hash, __u32 *next_hash);
-
- /* super.c */
- extern void ext3_error (struct super_block *, const char *, const char *, ...)
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h 2003-10-05 09:16:36.000000000 -0700
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h 2004-05-27 11:08:28.000000000 -0700
-@@ -62,6 +62,8 @@
- int s_inode_size;
- int s_first_ino;
- u32 s_next_generation;
-+ u32 s_hash_seed[4];
-+ int s_def_hash_version;
-
- /* Journaling */
- struct inode * s_journal_inode;
-Index: linux-2.4.19.SuSE/include/linux/ext3_jbd.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_jbd.h 2003-10-05 09:30:34.000000000 -0700
-+++ linux-2.4.19.SuSE/include/linux/ext3_jbd.h 2004-05-27 11:08:28.000000000 -0700
-@@ -69,6 +69,8 @@
-
- #define EXT3_RESERVE_TRANS_BLOCKS 12
-
-+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
-+
- int
- ext3_mark_iloc_dirty(handle_t *handle,
- struct inode *inode,
-Index: linux-2.4.19.SuSE/include/linux/rbtree.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/rbtree.h 2003-10-05 09:16:36.000000000 -0700
-+++ linux-2.4.19.SuSE/include/linux/rbtree.h 2004-05-27 11:08:28.000000000 -0700
-@@ -120,6 +120,8 @@
-
- extern void rb_insert_color(rb_node_t *, rb_root_t *);
- extern void rb_erase(rb_node_t *, rb_root_t *);
-+extern rb_node_t *rb_get_first(rb_root_t *root);
-+extern rb_node_t *rb_get_next(rb_node_t *n);
-
- static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
- {
+++ /dev/null
- fs/ext3/file.c | 4
- fs/ext3/inode.c | 116 ++++++++++++++++++++++
- fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++
- include/linux/ext3_fs.h | 5
- include/linux/ext3_fs_sb.h | 10 +
- 5 files changed, 365 insertions(+)
-
-Index: linux-2.4.19.SuSE/fs/ext3/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:18:04 2003
-+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:19:22 2003
-@@ -401,6 +401,220 @@
- }
- }
-
-+#ifdef EXT3_DELETE_THREAD
-+/*
-+ * Delete inodes in a loop until there are no more to be deleted.
-+ * Normally, we run in the background doing the deletes and sleeping again,
-+ * and clients just add new inodes to be deleted onto the end of the list.
-+ * If someone is concerned about free space (e.g. block allocation or similar)
-+ * then they can sleep on s_delete_waiter_queue and be woken up when space
-+ * has been freed.
-+ */
-+int ext3_delete_thread(void *data)
-+{
-+ struct super_block *sb = data;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct task_struct *tsk = current;
-+
-+ /* Almost like daemonize, but not quite */
-+ exit_mm(current);
-+ tsk->session = 1;
-+ tsk->pgrp = 1;
-+ tsk->tty = NULL;
-+ exit_files(current);
-+ reparent_to_init();
-+
-+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
-+ sigfillset(&tsk->blocked);
-+
-+ /*tsk->flags |= PF_KERNTHREAD;*/
-+
-+ INIT_LIST_HEAD(&sbi->s_delete_list);
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
-+
-+ /* main loop */
-+ for (;;) {
-+ wait_event_interruptible(sbi->s_delete_thread_queue,
-+ !list_empty(&sbi->s_delete_list) ||
-+ !test_opt(sb, ASYNCDEL));
-+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
-+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ if (list_empty(&sbi->s_delete_list)) {
-+ clear_opt(sbi->s_mount_opt, ASYNCDEL);
-+ memset(&sbi->s_delete_list, 0,
-+ sizeof(sbi->s_delete_list));
-+ spin_unlock(&sbi->s_delete_lock);
-+ ext3_debug("delete thread on %s exiting\n",
-+ kdevname(sb->s_dev));
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ break;
-+ }
-+
-+ while (!list_empty(&sbi->s_delete_list)) {
-+ struct inode *inode=list_entry(sbi->s_delete_list.next,
-+ struct inode, i_dentry);
-+ unsigned long blocks = inode->i_blocks >>
-+ (inode->i_blkbits - 9);
-+
-+ list_del_init(&inode->i_dentry);
-+ spin_unlock(&sbi->s_delete_lock);
-+ ext3_debug("%s delete ino %lu blk %lu\n",
-+ tsk->comm, inode->i_ino, blocks);
-+
-+ iput(inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ sbi->s_delete_blocks -= blocks;
-+ sbi->s_delete_inodes--;
-+ }
-+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "%lu blocks, %lu inodes on list?\n",
-+ sbi->s_delete_blocks,sbi->s_delete_inodes);
-+ sbi->s_delete_blocks = 0;
-+ sbi->s_delete_inodes = 0;
-+ }
-+ spin_unlock(&sbi->s_delete_lock);
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ }
-+
-+ return 0;
-+}
-+
-+static void ext3_start_delete_thread(struct super_block *sb)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int rc;
-+
-+ spin_lock_init(&sbi->s_delete_lock);
-+ init_waitqueue_head(&sbi->s_delete_thread_queue);
-+ init_waitqueue_head(&sbi->s_delete_waiter_queue);
-+
-+ if (!test_opt(sb, ASYNCDEL))
-+ return;
-+
-+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
-+ if (rc < 0)
-+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
-+ rc);
-+ else
-+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
-+}
-+
-+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
-+{
-+ if (sbi->s_delete_list.next == 0) /* thread never started */
-+ return;
-+
-+ clear_opt(sbi->s_mount_opt, ASYNCDEL);
-+ wake_up(&sbi->s_delete_thread_queue);
-+ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
-+}
-+
-+/* Instead of playing games with the inode flags, destruction, etc we just
-+ * create a new inode locally and put it on a list for the truncate thread.
-+ * We need large parts of the inode struct in order to complete the
-+ * truncate and unlink, so we may as well just have a real inode to do it.
-+ *
-+ * If we have any problem deferring the delete, just delete it right away.
-+ * If we defer it, we also mark how many blocks it would free, so that we
-+ * can keep the statfs data correct, and we know if we should sleep on the
-+ * delete thread when we run out of space.
-+ */
-+static void ext3_delete_inode_thread(struct inode *old_inode)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
-+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
-+ struct inode *new_inode;
-+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
-+
-+ if (is_bad_inode(old_inode)) {
-+ clear_inode(old_inode);
-+ return;
-+ }
-+
-+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
-+ goto out_delete;
-+
-+ /* We may want to delete the inode immediately and not defer it */
-+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
-+ goto out_delete;
-+
-+ /* We can't use the delete thread as-is during real orphan recovery,
-+ * as we add to the orphan list here, causing ext3_orphan_cleanup()
-+ * to loop endlessly. It would be nice to do so, but needs work.
-+ */
-+ if (oei->i_state & EXT3_STATE_DELETE ||
-+ sbi->s_mount_state & EXT3_ORPHAN_FS) {
-+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
-+ old_inode->i_ino, blocks);
-+ goto out_delete;
-+ }
-+
-+ /* We can iget this inode again here, because our caller has unhashed
-+ * old_inode, so new_inode will be in a different inode struct.
-+ *
-+ * We need to ensure that the i_orphan pointers in the other inodes
-+ * point at the new inode copy instead of the old one so the orphan
-+ * list doesn't get corrupted when the old orphan inode is freed.
-+ */
-+ down(&sbi->s_orphan_lock);
-+
-+ sbi->s_mount_state |= EXT3_ORPHAN_FS;
-+ new_inode = iget(old_inode->i_sb, old_inode->i_ino);
-+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
-+ if (is_bad_inode(new_inode)) {
-+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
-+ iput(new_inode);
-+ new_inode = NULL;
-+ }
-+ if (!new_inode) {
-+ up(&sbi->s_orphan_lock);
-+ ext3_debug("delete inode %lu directly (bad read)\n",
-+ old_inode->i_ino);
-+ goto out_delete;
-+ }
-+ J_ASSERT(new_inode != old_inode);
-+
-+ J_ASSERT(!list_empty(&oei->i_orphan));
-+
-+ nei = EXT3_I(new_inode);
-+ /* Ugh. We need to insert new_inode into the same spot on the list
-+ * as old_inode was, to ensure the in-memory orphan list is still
-+ * in the same order as the on-disk orphan list (badness otherwise).
-+ */
-+ nei->i_orphan = oei->i_orphan;
-+ nei->i_orphan.next->prev = &nei->i_orphan;
-+ nei->i_orphan.prev->next = &nei->i_orphan;
-+ nei->i_state |= EXT3_STATE_DELETE;
-+ up(&sbi->s_orphan_lock);
-+
-+ clear_inode(old_inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ J_ASSERT(list_empty(&new_inode->i_dentry));
-+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
-+ sbi->s_delete_blocks += blocks;
-+ sbi->s_delete_inodes++;
-+ spin_unlock(&sbi->s_delete_lock);
-+
-+ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
-+ new_inode->i_ino, blocks);
-+
-+ wake_up(&sbi->s_delete_thread_queue);
-+ return;
-+
-+out_delete:
-+ ext3_delete_inode(old_inode);
-+}
-+#else
-+#define ext3_start_delete_thread(sbi) do {} while(0)
-+#define ext3_stop_delete_thread(sbi) do {} while(0)
-+#endif /* EXT3_DELETE_THREAD */
-+
- void ext3_put_super (struct super_block * sb)
- {
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-@@ -408,6 +622,7 @@
- kdev_t j_dev = sbi->s_journal->j_dev;
- int i;
-
-+ ext3_stop_delete_thread(sbi);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
- if (!(sb->s_flags & MS_RDONLY)) {
-@@ -476,7 +691,11 @@
- write_inode: ext3_write_inode, /* BKL not held. Don't need */
- dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
- put_inode: ext3_put_inode, /* BKL not held. Don't need */
-+#ifdef EXT3_DELETE_THREAD
-+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */
-+#else
- delete_inode: ext3_delete_inode, /* BKL not held. We take it */
-+#endif
- put_super: ext3_put_super, /* BKL held */
- write_super: ext3_write_super, /* BKL held */
- sync_fs: ext3_sync_fs,
-@@ -553,6 +772,13 @@
- clear_opt (*mount_options, POSIX_ACL);
- else
- #endif
-+#ifdef EXT3_DELETE_THREAD
-+ if (!strcmp(this_char, "asyncdel"))
-+ set_opt(*mount_options, ASYNCDEL);
-+ else if (!strcmp(this_char, "noasyncdel"))
-+ clear_opt(*mount_options, ASYNCDEL);
-+ else
-+#endif
- if (!strcmp (this_char, "bsddf"))
- clear_opt (*mount_options, MINIX_DF);
- else if (!strcmp (this_char, "nouid32")) {
-@@ -1254,6 +1480,7 @@
- }
-
- ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-+ ext3_start_delete_thread(sb);
- /*
- * akpm: core read_super() calls in here with the superblock locked.
- * That deadlocks, because orphan cleanup needs to lock the superblock
-@@ -1692,6 +1919,9 @@
- if (!parse_options(data, &tmp, sbi, &tmp, 1))
- return -EINVAL;
-
-+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
-+ ext3_stop_delete_thread(sbi);
-+
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
- ext3_abort(sb, __FUNCTION__, "Abort forced by user");
-
-Index: linux-2.4.19.SuSE/fs/ext3/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:02:56 2003
-+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:19:22 2003
-@@ -2114,6 +2114,118 @@
- ext3_journal_stop(handle, inode);
- }
-
-+#ifdef EXT3_DELETE_THREAD
-+/* Move blocks from to-be-truncated inode over to a new inode, and delete
-+ * that one from the delete thread instead. This avoids a lot of latency
-+ * when truncating large files.
-+ *
-+ * If we have any problem deferring the truncate, just truncate it right away.
-+ * If we defer it, we also mark how many blocks it would free, so that we
-+ * can keep the statfs data correct, and we know if we should sleep on the
-+ * delete thread when we run out of space.
-+ */
-+void ext3_truncate_thread(struct inode *old_inode)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
-+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
-+ struct inode *new_inode;
-+ handle_t *handle;
-+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
-+
-+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
-+ goto out_truncate;
-+
-+ /* XXX This is a temporary limitation for code simplicity.
-+ * We could truncate to arbitrary sizes at some later time.
-+ */
-+ if (old_inode->i_size != 0)
-+ goto out_truncate;
-+
-+ /* We may want to truncate the inode immediately and not defer it */
-+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
-+ old_inode->i_size > oei->i_disksize)
-+ goto out_truncate;
-+
-+ /* We can't use the delete thread as-is during real orphan recovery,
-+ * as we add to the orphan list here, causing ext3_orphan_cleanup()
-+ * to loop endlessly. It would be nice to do so, but needs work.
-+ */
-+ if (oei->i_state & EXT3_STATE_DELETE ||
-+ sbi->s_mount_state & EXT3_ORPHAN_FS) {
-+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
-+ old_inode->i_ino, blocks);
-+ goto out_truncate;
-+ }
-+
-+ ext3_discard_prealloc(old_inode);
-+
-+ /* old_inode = 1
-+ * new_inode = sb + GDT + ibitmap
-+ * orphan list = 1 inode/superblock for add, 2 inodes for del
-+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
-+ */
-+ handle = ext3_journal_start(old_inode, 7);
-+ if (IS_ERR(handle))
-+ goto out_truncate;
-+
-+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
-+ if (IS_ERR(new_inode)) {
-+ ext3_debug("truncate inode %lu directly (no new inodes)\n",
-+ old_inode->i_ino);
-+ goto out_journal;
-+ }
-+
-+ nei = EXT3_I(new_inode);
-+
-+ down_write(&oei->truncate_sem);
-+ new_inode->i_size = old_inode->i_size;
-+ new_inode->i_blocks = old_inode->i_blocks;
-+ new_inode->i_uid = old_inode->i_uid;
-+ new_inode->i_gid = old_inode->i_gid;
-+ new_inode->i_nlink = 0;
-+
-+ /* FIXME when we do arbitrary truncates */
-+ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
-+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
-+
-+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
-+ memset(oei->i_data, 0, sizeof(oei->i_data));
-+
-+ nei->i_disksize = oei->i_disksize;
-+ nei->i_state |= EXT3_STATE_DELETE;
-+ up_write(&oei->truncate_sem);
-+
-+ if (ext3_orphan_add(handle, new_inode) < 0)
-+ goto out_journal;
-+
-+ if (ext3_orphan_del(handle, old_inode) < 0) {
-+ ext3_orphan_del(handle, new_inode);
-+ iput(new_inode);
-+ goto out_journal;
-+ }
-+
-+ ext3_journal_stop(handle, old_inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ J_ASSERT(list_empty(&new_inode->i_dentry));
-+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
-+ sbi->s_delete_blocks += blocks;
-+ sbi->s_delete_inodes++;
-+ spin_unlock(&sbi->s_delete_lock);
-+
-+ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
-+ new_inode->i_ino, blocks);
-+
-+ wake_up(&sbi->s_delete_thread_queue);
-+ return;
-+
-+out_journal:
-+ ext3_journal_stop(handle, old_inode);
-+out_truncate:
-+ ext3_truncate(old_inode);
-+}
-+#endif /* EXT3_DELETE_THREAD */
-+
- /*
- * ext3_get_inode_loc returns with an extra refcount against the
- * inode's underlying buffer_head on success.
-Index: linux-2.4.19.SuSE/fs/ext3/file.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/file.c Sun Nov 16 00:40:59 2003
-+++ linux-2.4.19.SuSE/fs/ext3/file.c Sun Nov 16 01:19:22 2003
-@@ -132,7 +132,11 @@
- };
-
- struct inode_operations ext3_file_inode_operations = {
-+#ifdef EXT3_DELETE_THREAD
-+ truncate: ext3_truncate_thread, /* BKL held */
-+#else
- truncate: ext3_truncate, /* BKL held */
-+#endif
- setattr: ext3_setattr, /* BKL held */
- setxattr: ext3_setxattr, /* BKL held */
- getxattr: ext3_getxattr, /* BKL held */
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:02:51 2003
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:20:06 2003
-@@ -193,6 +193,7 @@
- */
- #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
- #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
-+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
-
- /*
- * ioctl commands
-@@ -321,6 +322,7 @@
- #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
- #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
- #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
-+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
-@@ -695,6 +697,9 @@
- extern void ext3_dirty_inode(struct inode *);
- extern int ext3_change_inode_journal_flag(struct inode *, int);
- extern void ext3_truncate (struct inode *);
-+#ifdef EXT3_DELETE_THREAD
-+extern void ext3_truncate_thread(struct inode *inode);
-+#endif
-
- /* ioctl.c */
- extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h Sun Nov 16 01:18:41 2003
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h Sun Nov 16 01:19:22 2003
-@@ -29,6 +29,8 @@
-
- #define EXT3_MAX_GROUP_LOADED 8
-
-+#define EXT3_DELETE_THREAD
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -75,6 +77,14 @@
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
- #endif
-+#ifdef EXT3_DELETE_THREAD
-+ spinlock_t s_delete_lock;
-+ struct list_head s_delete_list;
-+ unsigned long s_delete_blocks;
-+ unsigned long s_delete_inodes;
-+ wait_queue_head_t s_delete_thread_queue;
-+ wait_queue_head_t s_delete_waiter_queue;
-+#endif
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
+++ /dev/null
- fs/ext3/file.c | 4
- fs/ext3/inode.c | 116 ++++++++++++++++++++++
- fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++
- include/linux/ext3_fs.h | 5
- include/linux/ext3_fs_sb.h | 10 +
- 5 files changed, 365 insertions(+)
-
-Index: linux-2.4.20/fs/ext3/super.c
-===================================================================
---- linux-2.4.20.orig/fs/ext3/super.c 2004-01-12 20:13:37.000000000 +0300
-+++ linux-2.4.20/fs/ext3/super.c 2004-01-13 16:59:54.000000000 +0300
-@@ -48,6 +48,8 @@
- static void ext3_clear_journal_err(struct super_block * sb,
- struct ext3_super_block * es);
-
-+static int ext3_sync_fs(struct super_block * sb);
-+
- #ifdef CONFIG_JBD_DEBUG
- int journal_no_write[2];
-
-@@ -398,6 +400,221 @@
- }
- }
-
-+#ifdef EXT3_DELETE_THREAD
-+/*
-+ * Delete inodes in a loop until there are no more to be deleted.
-+ * Normally, we run in the background doing the deletes and sleeping again,
-+ * and clients just add new inodes to be deleted onto the end of the list.
-+ * If someone is concerned about free space (e.g. block allocation or similar)
-+ * then they can sleep on s_delete_waiter_queue and be woken up when space
-+ * has been freed.
-+ */
-+int ext3_delete_thread(void *data)
-+{
-+ struct super_block *sb = data;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct task_struct *tsk = current;
-+
-+ /* Almost like daemonize, but not quite */
-+ exit_mm(current);
-+ tsk->session = 1;
-+ tsk->pgrp = 1;
-+ tsk->tty = NULL;
-+ exit_files(current);
-+ reparent_to_init();
-+
-+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
-+ sigfillset(&tsk->blocked);
-+
-+ /*tsk->flags |= PF_KERNTHREAD;*/
-+
-+ INIT_LIST_HEAD(&sbi->s_delete_list);
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
-+
-+ /* main loop */
-+ for (;;) {
-+ wait_event_interruptible(sbi->s_delete_thread_queue,
-+ !list_empty(&sbi->s_delete_list) ||
-+ !test_opt(sb, ASYNCDEL));
-+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
-+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ if (list_empty(&sbi->s_delete_list)) {
-+ clear_opt(sbi->s_mount_opt, ASYNCDEL);
-+ memset(&sbi->s_delete_list, 0,
-+ sizeof(sbi->s_delete_list));
-+ spin_unlock(&sbi->s_delete_lock);
-+ ext3_debug("delete thread on %s exiting\n",
-+ kdevname(sb->s_dev));
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ break;
-+ }
-+
-+ while (!list_empty(&sbi->s_delete_list)) {
-+ struct inode *inode=list_entry(sbi->s_delete_list.next,
-+ struct inode, i_dentry);
-+ unsigned long blocks = inode->i_blocks >>
-+ (inode->i_blkbits - 9);
-+
-+ list_del_init(&inode->i_dentry);
-+ spin_unlock(&sbi->s_delete_lock);
-+ ext3_debug("%s delete ino %lu blk %lu\n",
-+ tsk->comm, inode->i_ino, blocks);
-+
-+ iput(inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ sbi->s_delete_blocks -= blocks;
-+ sbi->s_delete_inodes--;
-+ }
-+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "%lu blocks, %lu inodes on list?\n",
-+ sbi->s_delete_blocks,sbi->s_delete_inodes);
-+ sbi->s_delete_blocks = 0;
-+ sbi->s_delete_inodes = 0;
-+ }
-+ spin_unlock(&sbi->s_delete_lock);
-+ wake_up(&sbi->s_delete_waiter_queue);
-+ }
-+
-+ return 0;
-+}
-+
-+static void ext3_start_delete_thread(struct super_block *sb)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int rc;
-+
-+ spin_lock_init(&sbi->s_delete_lock);
-+ init_waitqueue_head(&sbi->s_delete_thread_queue);
-+ init_waitqueue_head(&sbi->s_delete_waiter_queue);
-+
-+ if (!test_opt(sb, ASYNCDEL))
-+ return;
-+
-+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
-+ if (rc < 0)
-+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
-+ rc);
-+ else
-+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
-+}
-+
-+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
-+{
-+ if (sbi->s_delete_list.next == 0) /* thread never started */
-+ return;
-+
-+ clear_opt(sbi->s_mount_opt, ASYNCDEL);
-+ wake_up(&sbi->s_delete_thread_queue);
-+ wait_event(sbi->s_delete_waiter_queue,
-+ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
-+}
-+
-+/* Instead of playing games with the inode flags, destruction, etc we just
-+ * create a new inode locally and put it on a list for the truncate thread.
-+ * We need large parts of the inode struct in order to complete the
-+ * truncate and unlink, so we may as well just have a real inode to do it.
-+ *
-+ * If we have any problem deferring the delete, just delete it right away.
-+ * If we defer it, we also mark how many blocks it would free, so that we
-+ * can keep the statfs data correct, and we know if we should sleep on the
-+ * delete thread when we run out of space.
-+ */
-+static void ext3_delete_inode_thread(struct inode *old_inode)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
-+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
-+ struct inode *new_inode;
-+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
-+
-+ if (is_bad_inode(old_inode)) {
-+ clear_inode(old_inode);
-+ return;
-+ }
-+
-+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
-+ goto out_delete;
-+
-+ /* We may want to delete the inode immediately and not defer it */
-+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
-+ goto out_delete;
-+
-+ /* We can't use the delete thread as-is during real orphan recovery,
-+ * as we add to the orphan list here, causing ext3_orphan_cleanup()
-+ * to loop endlessly. It would be nice to do so, but needs work.
-+ */
-+ if (oei->i_state & EXT3_STATE_DELETE ||
-+ sbi->s_mount_state & EXT3_ORPHAN_FS) {
-+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
-+ old_inode->i_ino, blocks);
-+ goto out_delete;
-+ }
-+
-+ /* We can iget this inode again here, because our caller has unhashed
-+ * old_inode, so new_inode will be in a different inode struct.
-+ *
-+ * We need to ensure that the i_orphan pointers in the other inodes
-+ * point at the new inode copy instead of the old one so the orphan
-+ * list doesn't get corrupted when the old orphan inode is freed.
-+ */
-+ down(&sbi->s_orphan_lock);
-+
-+ sbi->s_mount_state |= EXT3_ORPHAN_FS;
-+ new_inode = iget(old_inode->i_sb, old_inode->i_ino);
-+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
-+ if (is_bad_inode(new_inode)) {
-+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
-+ iput(new_inode);
-+ new_inode = NULL;
-+ }
-+ if (!new_inode) {
-+ up(&sbi->s_orphan_lock);
-+ ext3_debug("delete inode %lu directly (bad read)\n",
-+ old_inode->i_ino);
-+ goto out_delete;
-+ }
-+ J_ASSERT(new_inode != old_inode);
-+
-+ J_ASSERT(!list_empty(&oei->i_orphan));
-+
-+ nei = EXT3_I(new_inode);
-+ /* Ugh. We need to insert new_inode into the same spot on the list
-+ * as old_inode was, to ensure the in-memory orphan list is still
-+ * in the same order as the on-disk orphan list (badness otherwise).
-+ */
-+ nei->i_orphan = oei->i_orphan;
-+ nei->i_orphan.next->prev = &nei->i_orphan;
-+ nei->i_orphan.prev->next = &nei->i_orphan;
-+ nei->i_state |= EXT3_STATE_DELETE;
-+ up(&sbi->s_orphan_lock);
-+
-+ clear_inode(old_inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ J_ASSERT(list_empty(&new_inode->i_dentry));
-+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
-+ sbi->s_delete_blocks += blocks;
-+ sbi->s_delete_inodes++;
-+ spin_unlock(&sbi->s_delete_lock);
-+
-+ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
-+ new_inode->i_ino, blocks);
-+
-+ wake_up(&sbi->s_delete_thread_queue);
-+ return;
-+
-+out_delete:
-+ ext3_delete_inode(old_inode);
-+}
-+#else
-+#define ext3_start_delete_thread(sbi) do {} while(0)
-+#define ext3_stop_delete_thread(sbi) do {} while(0)
-+#endif /* EXT3_DELETE_THREAD */
-+
- void ext3_put_super (struct super_block * sb)
- {
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-@@ -405,6 +622,7 @@
- kdev_t j_dev = sbi->s_journal->j_dev;
- int i;
-
-+ J_ASSERT(sbi->s_delete_inodes == 0);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
- if (!(sb->s_flags & MS_RDONLY)) {
-@@ -453,9 +671,14 @@
- write_inode: ext3_write_inode, /* BKL not held. Don't need */
- dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
- put_inode: ext3_put_inode, /* BKL not held. Don't need */
-+#ifdef EXT3_DELETE_THREAD
-+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */
-+#else
- delete_inode: ext3_delete_inode, /* BKL not held. We take it */
-+#endif
- put_super: ext3_put_super, /* BKL held */
- write_super: ext3_write_super, /* BKL held */
-+ sync_fs: ext3_sync_fs,
- write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
- unlockfs: ext3_unlockfs, /* BKL not held. We take it */
- statfs: ext3_statfs, /* BKL held */
-@@ -521,6 +744,13 @@
- clear_opt (*mount_options, XATTR_USER);
- else
- #endif
-+#ifdef EXT3_DELETE_THREAD
-+ if (!strcmp(this_char, "asyncdel"))
-+ set_opt(*mount_options, ASYNCDEL);
-+ else if (!strcmp(this_char, "noasyncdel"))
-+ clear_opt(*mount_options, ASYNCDEL);
-+ else
-+#endif
- if (!strcmp (this_char, "bsddf"))
- clear_opt (*mount_options, MINIX_DF);
- else if (!strcmp (this_char, "nouid32")) {
-@@ -1220,6 +1450,7 @@
- }
-
- ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-+ ext3_start_delete_thread(sb);
- /*
- * akpm: core read_super() calls in here with the superblock locked.
- * That deadlocks, because orphan cleanup needs to lock the superblock
-@@ -1625,6 +1856,21 @@
- }
- }
-
-+static int ext3_sync_fs(struct super_block *sb)
-+{
-+ tid_t target;
-+
-+ if (atomic_read(&sb->s_active) == 0) {
-+ /* fs is being umounted: time to stop delete thread */
-+ ext3_stop_delete_thread(EXT3_SB(sb));
-+ }
-+
-+ sb->s_dirt = 0;
-+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
-+ log_wait_commit(EXT3_SB(sb)->s_journal, target);
-+ return 0;
-+}
-+
- /*
- * LVM calls this function before a (read-only) snapshot is created. This
- * gives us a chance to flush the journal completely and mark the fs clean.
-@@ -1682,6 +1928,9 @@
- if (!parse_options(data, &tmp, sbi, &tmp, 1))
- return -EINVAL;
-
-+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
-+ ext3_stop_delete_thread(sbi);
-+
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
- ext3_abort(sb, __FUNCTION__, "Abort forced by user");
-
-Index: linux-2.4.20/fs/ext3/inode.c
-===================================================================
---- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-12 20:13:37.000000000 +0300
-+++ linux-2.4.20/fs/ext3/inode.c 2004-01-13 16:55:45.000000000 +0300
-@@ -2552,6 +2552,118 @@
- return err;
- }
-
-+#ifdef EXT3_DELETE_THREAD
-+/* Move blocks from to-be-truncated inode over to a new inode, and delete
-+ * that one from the delete thread instead. This avoids a lot of latency
-+ * when truncating large files.
-+ *
-+ * If we have any problem deferring the truncate, just truncate it right away.
-+ * If we defer it, we also mark how many blocks it would free, so that we
-+ * can keep the statfs data correct, and we know if we should sleep on the
-+ * delete thread when we run out of space.
-+ */
-+void ext3_truncate_thread(struct inode *old_inode)
-+{
-+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
-+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
-+ struct inode *new_inode;
-+ handle_t *handle;
-+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
-+
-+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
-+ goto out_truncate;
-+
-+ /* XXX This is a temporary limitation for code simplicity.
-+ * We could truncate to arbitrary sizes at some later time.
-+ */
-+ if (old_inode->i_size != 0)
-+ goto out_truncate;
-+
-+ /* We may want to truncate the inode immediately and not defer it */
-+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
-+ old_inode->i_size > oei->i_disksize)
-+ goto out_truncate;
-+
-+ /* We can't use the delete thread as-is during real orphan recovery,
-+ * as we add to the orphan list here, causing ext3_orphan_cleanup()
-+ * to loop endlessly. It would be nice to do so, but needs work.
-+ */
-+ if (oei->i_state & EXT3_STATE_DELETE ||
-+ sbi->s_mount_state & EXT3_ORPHAN_FS) {
-+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
-+ old_inode->i_ino, blocks);
-+ goto out_truncate;
-+ }
-+
-+ ext3_discard_prealloc(old_inode);
-+
-+ /* old_inode = 1
-+ * new_inode = sb + GDT + ibitmap
-+ * orphan list = 1 inode/superblock for add, 2 inodes for del
-+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
-+ */
-+ handle = ext3_journal_start(old_inode, 7);
-+ if (IS_ERR(handle))
-+ goto out_truncate;
-+
-+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
-+ if (IS_ERR(new_inode)) {
-+ ext3_debug("truncate inode %lu directly (no new inodes)\n",
-+ old_inode->i_ino);
-+ goto out_journal;
-+ }
-+
-+ nei = EXT3_I(new_inode);
-+
-+ down_write(&oei->truncate_sem);
-+ new_inode->i_size = old_inode->i_size;
-+ new_inode->i_blocks = old_inode->i_blocks;
-+ new_inode->i_uid = old_inode->i_uid;
-+ new_inode->i_gid = old_inode->i_gid;
-+ new_inode->i_nlink = 0;
-+
-+ /* FIXME when we do arbitrary truncates */
-+ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
-+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
-+
-+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
-+ memset(oei->i_data, 0, sizeof(oei->i_data));
-+
-+ nei->i_disksize = oei->i_disksize;
-+ nei->i_state |= EXT3_STATE_DELETE;
-+ up_write(&oei->truncate_sem);
-+
-+ if (ext3_orphan_add(handle, new_inode) < 0)
-+ goto out_journal;
-+
-+ if (ext3_orphan_del(handle, old_inode) < 0) {
-+ ext3_orphan_del(handle, new_inode);
-+ iput(new_inode);
-+ goto out_journal;
-+ }
-+
-+ ext3_journal_stop(handle, old_inode);
-+
-+ spin_lock(&sbi->s_delete_lock);
-+ J_ASSERT(list_empty(&new_inode->i_dentry));
-+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
-+ sbi->s_delete_blocks += blocks;
-+ sbi->s_delete_inodes++;
-+ spin_unlock(&sbi->s_delete_lock);
-+
-+ ext3_debug("delete inode %lu (%lu blocks) by thread\n",
-+ new_inode->i_ino, blocks);
-+
-+ wake_up(&sbi->s_delete_thread_queue);
-+ return;
-+
-+out_journal:
-+ ext3_journal_stop(handle, old_inode);
-+out_truncate:
-+ ext3_truncate(old_inode);
-+}
-+#endif /* EXT3_DELETE_THREAD */
-+
- /*
- * On success, We end up with an outstanding reference count against
- * iloc->bh. This _must_ be cleaned up later.
-Index: linux-2.4.20/fs/ext3/file.c
-===================================================================
---- linux-2.4.20.orig/fs/ext3/file.c 2004-01-12 20:13:36.000000000 +0300
-+++ linux-2.4.20/fs/ext3/file.c 2004-01-13 16:55:45.000000000 +0300
-@@ -125,7 +125,11 @@
- };
-
- struct inode_operations ext3_file_inode_operations = {
-+#ifdef EXT3_DELETE_THREAD
-+ truncate: ext3_truncate_thread, /* BKL held */
-+#else
- truncate: ext3_truncate, /* BKL held */
-+#endif
- setattr: ext3_setattr, /* BKL held */
- setxattr: ext3_setxattr, /* BKL held */
- getxattr: ext3_getxattr, /* BKL held */
-Index: linux-2.4.20/fs/buffer.c
-===================================================================
---- linux-2.4.20.orig/fs/buffer.c 2003-05-16 05:29:12.000000000 +0400
-+++ linux-2.4.20/fs/buffer.c 2004-01-13 16:55:45.000000000 +0300
-@@ -328,6 +328,8 @@
- if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
- sb->s_op->write_super(sb);
- unlock_super(sb);
-+ if (sb->s_op && sb->s_op->sync_fs)
-+ sb->s_op->sync_fs(sb);
- unlock_kernel();
-
- return sync_buffers(dev, 1);
-Index: linux-2.4.20/include/linux/ext3_fs.h
-===================================================================
---- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-12 20:13:37.000000000 +0300
-+++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-13 16:55:45.000000000 +0300
-@@ -193,6 +193,7 @@
- */
- #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
- #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
-+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
-
- /*
- * ioctl commands
-@@ -320,6 +321,7 @@
- #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
- #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
- #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
-+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
-@@ -696,6 +698,9 @@
- extern void ext3_dirty_inode(struct inode *);
- extern int ext3_change_inode_journal_flag(struct inode *, int);
- extern void ext3_truncate (struct inode *);
-+#ifdef EXT3_DELETE_THREAD
-+extern void ext3_truncate_thread(struct inode *inode);
-+#endif
-
- /* ioctl.c */
- extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
-Index: linux-2.4.20/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.4.20.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:13:37.000000000 +0300
-+++ linux-2.4.20/include/linux/ext3_fs_sb.h 2004-01-13 16:55:45.000000000 +0300
-@@ -29,6 +29,8 @@
-
- #define EXT3_MAX_GROUP_LOADED 8
-
-+#define EXT3_DELETE_THREAD
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -76,6 +78,14 @@
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
- #endif
-+#ifdef EXT3_DELETE_THREAD
-+ spinlock_t s_delete_lock;
-+ struct list_head s_delete_list;
-+ unsigned long s_delete_blocks;
-+ unsigned long s_delete_inodes;
-+ wait_queue_head_t s_delete_thread_queue;
-+ wait_queue_head_t s_delete_waiter_queue;
-+#endif
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.4.20/include/linux/fs.h
-===================================================================
---- linux-2.4.20.orig/include/linux/fs.h 2004-01-12 20:13:36.000000000 +0300
-+++ linux-2.4.20/include/linux/fs.h 2004-01-13 16:55:45.000000000 +0300
-@@ -917,6 +917,7 @@
- void (*delete_inode) (struct inode *);
- void (*put_super) (struct super_block *);
- void (*write_super) (struct super_block *);
-+ int (*sync_fs) (struct super_block *);
- void (*write_super_lockfs) (struct super_block *);
- void (*unlockfs) (struct super_block *);
- int (*statfs) (struct super_block *, struct statfs *);
+++ /dev/null
- fs/ext3/Makefile | 2
- fs/ext3/dir.c | 302 +++++++++
- fs/ext3/file.c | 3
- fs/ext3/hash.c | 215 ++++++
- fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++-----
- fs/ext3/super.c | 7
- include/linux/ext3_fs.h | 85 ++
- include/linux/ext3_fs_sb.h | 2
- include/linux/ext3_jbd.h | 2
- include/linux/rbtree.h | 2
- lib/rbtree.c | 42 +
- 11 files changed, 1921 insertions(+), 161 deletions(-)
-
-Index: linux.mcp2/fs/ext3/dir.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/dir.c 2004-05-17 15:03:55.000000000 -0700
-+++ linux.mcp2/fs/ext3/dir.c 2004-05-17 15:07:06.000000000 -0700
-@@ -21,12 +21,16 @@
- #include <linux/fs.h>
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
-+#include <linux/slab.h>
-+#include <linux/rbtree.h>
-
- static unsigned char ext3_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
- };
-
- static int ext3_readdir(struct file *, void *, filldir_t);
-+static int ext3_dx_readdir(struct file * filp,
-+ void * dirent, filldir_t filldir);
-
- struct file_operations ext3_dir_operations = {
- read: generic_read_dir,
-@@ -35,6 +39,17 @@
- fsync: ext3_sync_file, /* BKL held */
- };
-
-+
-+static unsigned char get_dtype(struct super_block *sb, int filetype)
-+{
-+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
-+ (filetype >= EXT3_FT_MAX))
-+ return DT_UNKNOWN;
-+
-+ return (ext3_filetype_table[filetype]);
-+}
-+
-+
- int ext3_check_dir_entry (const char * function, struct inode * dir,
- struct ext3_dir_entry_2 * de,
- struct buffer_head * bh,
-@@ -79,6 +94,16 @@
-
- sb = inode->i_sb;
-
-+ if (is_dx(inode)) {
-+ err = ext3_dx_readdir(filp, dirent, filldir);
-+ if (err != ERR_BAD_DX_DIR)
-+ return err;
-+ /*
-+ * We don't set the inode dirty flag since it's not
-+ * critical that it get flushed back to the disk.
-+ */
-+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
-+ }
- stored = 0;
- bh = NULL;
- offset = filp->f_pos & (sb->s_blocksize - 1);
-@@ -162,18 +187,12 @@
- * during the copy operation.
- */
- unsigned long version = filp->f_version;
-- unsigned char d_type = DT_UNKNOWN;
-
-- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
-- EXT3_FEATURE_INCOMPAT_FILETYPE)
-- && de->file_type < EXT3_FT_MAX)
-- d_type =
-- ext3_filetype_table[de->file_type];
- error = filldir(dirent, de->name,
- de->name_len,
- filp->f_pos,
- le32_to_cpu(de->inode),
-- d_type);
-+ get_dtype(sb, de->file_type));
- if (error)
- break;
- if (version != filp->f_version)
-@@ -188,3 +207,272 @@
- UPDATE_ATIME(inode);
- return 0;
- }
-+
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * These functions convert from the major/minor hash to an f_pos
-+ * value.
-+ *
-+ * Currently we only use major hash numer. This is unfortunate, but
-+ * on 32-bit machines, the same VFS interface is used for lseek and
-+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
-+ * lseek/telldir/seekdir will blow out spectacularly, and from within
-+ * the ext2 low-level routine, we don't know if we're being called by
-+ * a 64-bit version of the system call or the 32-bit version of the
-+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
-+ * cookie. Sigh.
-+ */
-+#define hash2pos(major, minor) (major >> 1)
-+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
-+#define pos2min_hash(pos) (0)
-+
-+/*
-+ * This structure holds the nodes of the red-black tree used to store
-+ * the directory entry in hash order.
-+ */
-+struct fname {
-+ __u32 hash;
-+ __u32 minor_hash;
-+ rb_node_t rb_hash;
-+ struct fname *next;
-+ __u32 inode;
-+ __u8 name_len;
-+ __u8 file_type;
-+ char name[0];
-+};
-+
-+/*
-+ * This functoin implements a non-recursive way of freeing all of the
-+ * nodes in the red-black tree.
-+ */
-+static void free_rb_tree_fname(rb_root_t *root)
-+{
-+ rb_node_t *n = root->rb_node;
-+ rb_node_t *parent;
-+ struct fname *fname;
-+
-+ while (n) {
-+ /* Do the node's children first */
-+ if ((n)->rb_left) {
-+ n = n->rb_left;
-+ continue;
-+ }
-+ if (n->rb_right) {
-+ n = n->rb_right;
-+ continue;
-+ }
-+ /*
-+ * The node has no children; free it, and then zero
-+ * out parent's link to it. Finally go to the
-+ * beginning of the loop and try to free the parent
-+ * node.
-+ */
-+ parent = n->rb_parent;
-+ fname = rb_entry(n, struct fname, rb_hash);
-+ kfree(fname);
-+ if (!parent)
-+ root->rb_node = 0;
-+ else if (parent->rb_left == n)
-+ parent->rb_left = 0;
-+ else if (parent->rb_right == n)
-+ parent->rb_right = 0;
-+ n = parent;
-+ }
-+ root->rb_node = 0;
-+}
-+
-+
-+struct dir_private_info *create_dir_info(loff_t pos)
-+{
-+ struct dir_private_info *p;
-+
-+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
-+ if (!p)
-+ return NULL;
-+ p->root.rb_node = 0;
-+ p->curr_node = 0;
-+ p->extra_fname = 0;
-+ p->last_pos = 0;
-+ p->curr_hash = pos2maj_hash(pos);
-+ p->curr_minor_hash = pos2min_hash(pos);
-+ p->next_hash = 0;
-+ return p;
-+}
-+
-+void ext3_htree_free_dir_info(struct dir_private_info *p)
-+{
-+ free_rb_tree_fname(&p->root);
-+ kfree(p);
-+}
-+
-+/*
-+ * Given a directory entry, enter it into the fname rb tree.
-+ */
-+int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-+ __u32 minor_hash,
-+ struct ext3_dir_entry_2 *dirent)
-+{
-+ rb_node_t **p, *parent = NULL;
-+ struct fname * fname, *new_fn;
-+ struct dir_private_info *info;
-+ int len;
-+
-+ info = (struct dir_private_info *) dir_file->private_data;
-+ p = &info->root.rb_node;
-+
-+ /* Create and allocate the fname structure */
-+ len = sizeof(struct fname) + dirent->name_len + 1;
-+ new_fn = kmalloc(len, GFP_KERNEL);
-+ if (!new_fn)
-+ return -ENOMEM;
-+ memset(new_fn, 0, len);
-+ new_fn->hash = hash;
-+ new_fn->minor_hash = minor_hash;
-+ new_fn->inode = le32_to_cpu(dirent->inode);
-+ new_fn->name_len = dirent->name_len;
-+ new_fn->file_type = dirent->file_type;
-+ memcpy(new_fn->name, dirent->name, dirent->name_len);
-+ new_fn->name[dirent->name_len] = 0;
-+
-+ while (*p) {
-+ parent = *p;
-+ fname = rb_entry(parent, struct fname, rb_hash);
-+
-+ /*
-+ * If the hash and minor hash match up, then we put
-+ * them on a linked list. This rarely happens...
-+ */
-+ if ((new_fn->hash == fname->hash) &&
-+ (new_fn->minor_hash == fname->minor_hash)) {
-+ new_fn->next = fname->next;
-+ fname->next = new_fn;
-+ return 0;
-+ }
-+
-+ if (new_fn->hash < fname->hash)
-+ p = &(*p)->rb_left;
-+ else if (new_fn->hash > fname->hash)
-+ p = &(*p)->rb_right;
-+ else if (new_fn->minor_hash < fname->minor_hash)
-+ p = &(*p)->rb_left;
-+ else /* if (new_fn->minor_hash > fname->minor_hash) */
-+ p = &(*p)->rb_right;
-+ }
-+
-+ rb_link_node(&new_fn->rb_hash, parent, p);
-+ rb_insert_color(&new_fn->rb_hash, &info->root);
-+ return 0;
-+}
-+
-+
-+
-+/*
-+ * This is a helper function for ext3_dx_readdir. It calls filldir
-+ * for all entres on the fname linked list. (Normally there is only
-+ * one entry on the linked list, unless there are 62 bit hash collisions.)
-+ */
-+static int call_filldir(struct file * filp, void * dirent,
-+ filldir_t filldir, struct fname *fname)
-+{
-+ struct dir_private_info *info = filp->private_data;
-+ loff_t curr_pos;
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct super_block * sb;
-+ int error;
-+
-+ sb = inode->i_sb;
-+
-+ if (!fname) {
-+ printk("call_filldir: called with null fname?!?\n");
-+ return 0;
-+ }
-+ curr_pos = hash2pos(fname->hash, fname->minor_hash);
-+ while (fname) {
-+ error = filldir(dirent, fname->name,
-+ fname->name_len, curr_pos,
-+ fname->inode,
-+ get_dtype(sb, fname->file_type));
-+ if (error) {
-+ filp->f_pos = curr_pos;
-+ info->extra_fname = fname->next;
-+ return error;
-+ }
-+ fname = fname->next;
-+ }
-+ return 0;
-+}
-+
-+static int ext3_dx_readdir(struct file * filp,
-+ void * dirent, filldir_t filldir)
-+{
-+ struct dir_private_info *info = filp->private_data;
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct fname *fname;
-+ int ret;
-+
-+ if (!info) {
-+ info = create_dir_info(filp->f_pos);
-+ if (!info)
-+ return -ENOMEM;
-+ filp->private_data = info;
-+ }
-+
-+ /* Some one has messed with f_pos; reset the world */
-+ if (info->last_pos != filp->f_pos) {
-+ free_rb_tree_fname(&info->root);
-+ info->curr_node = 0;
-+ info->extra_fname = 0;
-+ info->curr_hash = pos2maj_hash(filp->f_pos);
-+ info->curr_minor_hash = pos2min_hash(filp->f_pos);
-+ }
-+
-+ /*
-+ * If there are any leftover names on the hash collision
-+ * chain, return them first.
-+ */
-+ if (info->extra_fname &&
-+ call_filldir(filp, dirent, filldir, info->extra_fname))
-+ goto finished;
-+
-+ if (!info->curr_node)
-+ info->curr_node = rb_get_first(&info->root);
-+
-+ while (1) {
-+ /*
-+ * Fill the rbtree if we have no more entries,
-+ * or the inode has changed since we last read in the
-+ * cached entries.
-+ */
-+ if ((!info->curr_node) ||
-+ (filp->f_version != inode->i_version)) {
-+ info->curr_node = 0;
-+ free_rb_tree_fname(&info->root);
-+ filp->f_version = inode->i_version;
-+ ret = ext3_htree_fill_tree(filp, info->curr_hash,
-+ info->curr_minor_hash,
-+ &info->next_hash);
-+ if (ret < 0)
-+ return ret;
-+ if (ret == 0)
-+ break;
-+ info->curr_node = rb_get_first(&info->root);
-+ }
-+
-+ fname = rb_entry(info->curr_node, struct fname, rb_hash);
-+ info->curr_hash = fname->hash;
-+ info->curr_minor_hash = fname->minor_hash;
-+ if (call_filldir(filp, dirent, filldir, fname))
-+ break;
-+
-+ info->curr_node = rb_get_next(info->curr_node);
-+ if (!info->curr_node) {
-+ info->curr_hash = info->next_hash;
-+ info->curr_minor_hash = 0;
-+ }
-+ }
-+finished:
-+ info->last_pos = filp->f_pos;
-+ UPDATE_ATIME(inode);
-+ return 0;
-+}
-+#endif
-Index: linux.mcp2/fs/ext3/file.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/file.c 2004-05-17 15:03:55.000000000 -0700
-+++ linux.mcp2/fs/ext3/file.c 2004-05-17 15:07:06.000000000 -0700
-@@ -35,6 +35,9 @@
- {
- if (filp->f_mode & FMODE_WRITE)
- ext3_discard_prealloc (inode);
-+ if (is_dx(inode) && filp->private_data)
-+ ext3_htree_free_dir_info(filp->private_data);
-+
- return 0;
- }
-
-Index: linux.mcp2/fs/ext3/hash.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/hash.c 2002-04-11 07:25:15.000000000 -0700
-+++ linux.mcp2/fs/ext3/hash.c 2004-05-17 15:07:06.000000000 -0700
-@@ -0,0 +1,215 @@
-+/*
-+ * linux/fs/ext3/hash.c
-+ *
-+ * Copyright (C) 2002 by Theodore Ts'o
-+ *
-+ * This file is released under the GPL v2.
-+ *
-+ * This file may be redistributed under the terms of the GNU Public
-+ * License.
-+ */
-+
-+#include <linux/fs.h>
-+#include <linux/jbd.h>
-+#include <linux/sched.h>
-+#include <linux/ext3_fs.h>
-+
-+#define DELTA 0x9E3779B9
-+
-+static void TEA_transform(__u32 buf[4], __u32 const in[])
-+{
-+ __u32 sum = 0;
-+ __u32 b0 = buf[0], b1 = buf[1];
-+ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
-+ int n = 16;
-+
-+ do {
-+ sum += DELTA;
-+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
-+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-+ } while(--n);
-+
-+ buf[0] += b0;
-+ buf[1] += b1;
-+}
-+
-+/* F, G and H are basic MD4 functions: selection, majority, parity */
-+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
-+#define H(x, y, z) ((x) ^ (y) ^ (z))
-+
-+/*
-+ * The generic round function. The application is so specific that
-+ * we don't bother protecting all the arguments with parens, as is generally
-+ * good macro practice, in favor of extra legibility.
-+ * Rotation is separate from addition to prevent recomputation
-+ */
-+#define ROUND(f, a, b, c, d, x, s) \
-+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
-+#define K1 0
-+#define K2 013240474631UL
-+#define K3 015666365641UL
-+
-+/*
-+ * Basic cut-down MD4 transform. Returns only 32 bits of result.
-+ */
-+static void halfMD4Transform (__u32 buf[4], __u32 const in[])
-+{
-+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-+
-+ /* Round 1 */
-+ ROUND(F, a, b, c, d, in[0] + K1, 3);
-+ ROUND(F, d, a, b, c, in[1] + K1, 7);
-+ ROUND(F, c, d, a, b, in[2] + K1, 11);
-+ ROUND(F, b, c, d, a, in[3] + K1, 19);
-+ ROUND(F, a, b, c, d, in[4] + K1, 3);
-+ ROUND(F, d, a, b, c, in[5] + K1, 7);
-+ ROUND(F, c, d, a, b, in[6] + K1, 11);
-+ ROUND(F, b, c, d, a, in[7] + K1, 19);
-+
-+ /* Round 2 */
-+ ROUND(G, a, b, c, d, in[1] + K2, 3);
-+ ROUND(G, d, a, b, c, in[3] + K2, 5);
-+ ROUND(G, c, d, a, b, in[5] + K2, 9);
-+ ROUND(G, b, c, d, a, in[7] + K2, 13);
-+ ROUND(G, a, b, c, d, in[0] + K2, 3);
-+ ROUND(G, d, a, b, c, in[2] + K2, 5);
-+ ROUND(G, c, d, a, b, in[4] + K2, 9);
-+ ROUND(G, b, c, d, a, in[6] + K2, 13);
-+
-+ /* Round 3 */
-+ ROUND(H, a, b, c, d, in[3] + K3, 3);
-+ ROUND(H, d, a, b, c, in[7] + K3, 9);
-+ ROUND(H, c, d, a, b, in[2] + K3, 11);
-+ ROUND(H, b, c, d, a, in[6] + K3, 15);
-+ ROUND(H, a, b, c, d, in[1] + K3, 3);
-+ ROUND(H, d, a, b, c, in[5] + K3, 9);
-+ ROUND(H, c, d, a, b, in[0] + K3, 11);
-+ ROUND(H, b, c, d, a, in[4] + K3, 15);
-+
-+ buf[0] += a;
-+ buf[1] += b;
-+ buf[2] += c;
-+ buf[3] += d;
-+}
-+
-+#undef ROUND
-+#undef F
-+#undef G
-+#undef H
-+#undef K1
-+#undef K2
-+#undef K3
-+
-+/* The old legacy hash */
-+static __u32 dx_hack_hash (const char *name, int len)
-+{
-+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
-+ while (len--) {
-+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
-+
-+ if (hash & 0x80000000) hash -= 0x7fffffff;
-+ hash1 = hash0;
-+ hash0 = hash;
-+ }
-+ return (hash0 << 1);
-+}
-+
-+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-+{
-+ __u32 pad, val;
-+ int i;
-+
-+ pad = (__u32)len | ((__u32)len << 8);
-+ pad |= pad << 16;
-+
-+ val = pad;
-+ if (len > num*4)
-+ len = num * 4;
-+ for (i=0; i < len; i++) {
-+ if ((i % 4) == 0)
-+ val = pad;
-+ val = msg[i] + (val << 8);
-+ if ((i % 4) == 3) {
-+ *buf++ = val;
-+ val = pad;
-+ num--;
-+ }
-+ }
-+ if (--num >= 0)
-+ *buf++ = val;
-+ while (--num >= 0)
-+ *buf++ = pad;
-+}
-+
-+/*
-+ * Returns the hash of a filename. If len is 0 and name is NULL, then
-+ * this function can be used to test whether or not a hash version is
-+ * supported.
-+ *
-+ * The seed is an 4 longword (32 bits) "secret" which can be used to
-+ * uniquify a hash. If the seed is all zero's, then some default seed
-+ * may be used.
-+ *
-+ * A particular hash version specifies whether or not the seed is
-+ * represented, and whether or not the returned hash is 32 bits or 64
-+ * bits. 32 bit hashes will return 0 for the minor hash.
-+ */
-+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
-+{
-+ __u32 hash;
-+ __u32 minor_hash = 0;
-+ const char *p;
-+ int i;
-+ __u32 in[8], buf[4];
-+
-+ /* Initialize the default seed for the hash checksum functions */
-+ buf[0] = 0x67452301;
-+ buf[1] = 0xefcdab89;
-+ buf[2] = 0x98badcfe;
-+ buf[3] = 0x10325476;
-+
-+ /* Check to see if the seed is all zero's */
-+ if (hinfo->seed) {
-+ for (i=0; i < 4; i++) {
-+ if (hinfo->seed[i])
-+ break;
-+ }
-+ if (i < 4)
-+ memcpy(buf, hinfo->seed, sizeof(buf));
-+ }
-+
-+ switch (hinfo->hash_version) {
-+ case DX_HASH_LEGACY:
-+ hash = dx_hack_hash(name, len);
-+ break;
-+ case DX_HASH_HALF_MD4:
-+ p = name;
-+ while (len > 0) {
-+ str2hashbuf(p, len, in, 8);
-+ halfMD4Transform(buf, in);
-+ len -= 32;
-+ p += 32;
-+ }
-+ minor_hash = buf[2];
-+ hash = buf[1];
-+ break;
-+ case DX_HASH_TEA:
-+ p = name;
-+ while (len > 0) {
-+ str2hashbuf(p, len, in, 4);
-+ TEA_transform(buf, in);
-+ len -= 16;
-+ p += 16;
-+ }
-+ hash = buf[0];
-+ minor_hash = buf[1];
-+ break;
-+ default:
-+ hinfo->hash = 0;
-+ return -1;
-+ }
-+ hinfo->hash = hash & ~1;
-+ hinfo->minor_hash = minor_hash;
-+ return 0;
-+}
-Index: linux.mcp2/fs/ext3/Makefile
-===================================================================
---- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:03:55.000000000 -0700
-+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:07:06.000000000 -0700
-@@ -10,7 +10,7 @@
- O_TARGET := ext3.o
-
- obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-- ioctl.o namei.o super.o symlink.o
-+ ioctl.o namei.o super.o symlink.o hash.o
- obj-m := $(O_TARGET)
-
- include $(TOPDIR)/Rules.make
-Index: linux.mcp2/fs/ext3/namei.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:03:55.000000000 -0700
-+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:07:06.000000000 -0700
-@@ -16,6 +16,12 @@
- * David S. Miller (davem@caip.rutgers.edu), 1995
- * Directory entry file type support and forward compatibility hooks
- * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
-+ * Hash Tree Directory indexing (c)
-+ * Daniel Phillips, 2001
-+ * Hash Tree Directory indexing porting
-+ * Christopher Li, 2002
-+ * Hash Tree Directory indexing cleanup
-+ * Theodore Ts'o, 2002
- */
-
- #include <linux/fs.h>
-@@ -38,6 +44,642 @@
- #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
- #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-
-+static struct buffer_head *ext3_append(handle_t *handle,
-+ struct inode *inode,
-+ u32 *block, int *err)
-+{
-+ struct buffer_head *bh;
-+
-+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-+
-+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
-+ inode->i_size += inode->i_sb->s_blocksize;
-+ EXT3_I(inode)->i_disksize = inode->i_size;
-+ ext3_journal_get_write_access(handle,bh);
-+ }
-+ return bh;
-+}
-+
-+#ifndef assert
-+#define assert(test) J_ASSERT(test)
-+#endif
-+
-+#ifndef swap
-+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-+#endif
-+
-+typedef struct { u32 v; } le_u32;
-+typedef struct { u16 v; } le_u16;
-+
-+#ifdef DX_DEBUG
-+#define dxtrace(command) command
-+#else
-+#define dxtrace(command)
-+#endif
-+
-+struct fake_dirent
-+{
-+ /*le*/u32 inode;
-+ /*le*/u16 rec_len;
-+ u8 name_len;
-+ u8 file_type;
-+};
-+
-+struct dx_countlimit
-+{
-+ le_u16 limit;
-+ le_u16 count;
-+};
-+
-+struct dx_entry
-+{
-+ le_u32 hash;
-+ le_u32 block;
-+};
-+
-+/*
-+ * dx_root_info is laid out so that if it should somehow get overlaid by a
-+ * dirent the two low bits of the hash version will be zero. Therefore, the
-+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
-+ */
-+
-+struct dx_root
-+{
-+ struct fake_dirent dot;
-+ char dot_name[4];
-+ struct fake_dirent dotdot;
-+ char dotdot_name[4];
-+ struct dx_root_info
-+ {
-+ le_u32 reserved_zero;
-+ u8 hash_version;
-+ u8 info_length; /* 8 */
-+ u8 indirect_levels;
-+ u8 unused_flags;
-+ }
-+ info;
-+ struct dx_entry entries[0];
-+};
-+
-+struct dx_node
-+{
-+ struct fake_dirent fake;
-+ struct dx_entry entries[0];
-+};
-+
-+
-+struct dx_frame
-+{
-+ struct buffer_head *bh;
-+ struct dx_entry *entries;
-+ struct dx_entry *at;
-+};
-+
-+struct dx_map_entry
-+{
-+ u32 hash;
-+ u32 offs;
-+};
-+
-+#ifdef CONFIG_EXT3_INDEX
-+static inline unsigned dx_get_block (struct dx_entry *entry);
-+static void dx_set_block (struct dx_entry *entry, unsigned value);
-+static inline unsigned dx_get_hash (struct dx_entry *entry);
-+static void dx_set_hash (struct dx_entry *entry, unsigned value);
-+static unsigned dx_get_count (struct dx_entry *entries);
-+static unsigned dx_get_limit (struct dx_entry *entries);
-+static void dx_set_count (struct dx_entry *entries, unsigned value);
-+static void dx_set_limit (struct dx_entry *entries, unsigned value);
-+static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-+static unsigned dx_node_limit (struct inode *dir);
-+static struct dx_frame *dx_probe(struct dentry *dentry,
-+ struct inode *dir,
-+ struct dx_hash_info *hinfo,
-+ struct dx_frame *frame,
-+ int *err);
-+static void dx_release (struct dx_frame *frames);
-+static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
-+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
-+ struct dx_map_entry *offsets, int count);
-+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
-+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
-+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-+ struct dx_frame *frame,
-+ struct dx_frame *frames, int *err,
-+ __u32 *start_hash);
-+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-+ struct ext3_dir_entry_2 **res_dir, int *err);
-+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode);
-+
-+/*
-+ * Future: use high four bits of block for coalesce-on-delete flags
-+ * Mask them off for now.
-+ */
-+
-+static inline unsigned dx_get_block (struct dx_entry *entry)
-+{
-+ return le32_to_cpu(entry->block.v) & 0x00ffffff;
-+}
-+
-+static inline void dx_set_block (struct dx_entry *entry, unsigned value)
-+{
-+ entry->block.v = cpu_to_le32(value);
-+}
-+
-+static inline unsigned dx_get_hash (struct dx_entry *entry)
-+{
-+ return le32_to_cpu(entry->hash.v);
-+}
-+
-+static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
-+{
-+ entry->hash.v = cpu_to_le32(value);
-+}
-+
-+static inline unsigned dx_get_count (struct dx_entry *entries)
-+{
-+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
-+}
-+
-+static inline unsigned dx_get_limit (struct dx_entry *entries)
-+{
-+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
-+}
-+
-+static inline void dx_set_count (struct dx_entry *entries, unsigned value)
-+{
-+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
-+}
-+
-+static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
-+{
-+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
-+}
-+
-+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
-+{
-+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
-+ EXT3_DIR_REC_LEN(2) - infosize;
-+ return 0? 20: entry_space / sizeof(struct dx_entry);
-+}
-+
-+static inline unsigned dx_node_limit (struct inode *dir)
-+{
-+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
-+ return 0? 22: entry_space / sizeof(struct dx_entry);
-+}
-+
-+/*
-+ * Debug
-+ */
-+#ifdef DX_DEBUG
-+struct stats
-+{
-+ unsigned names;
-+ unsigned space;
-+ unsigned bcount;
-+};
-+
-+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
-+ int size, int show_names)
-+{
-+ unsigned names = 0, space = 0;
-+ char *base = (char *) de;
-+ struct dx_hash_info h = *hinfo;
-+
-+ printk("names: ");
-+ while ((char *) de < base + size)
-+ {
-+ if (de->inode)
-+ {
-+ if (show_names)
-+ {
-+ int len = de->name_len;
-+ char *name = de->name;
-+ while (len--) printk("%c", *name++);
-+ ext3fs_dirhash(de->name, de->name_len, &h);
-+ printk(":%x.%u ", h.hash,
-+ ((char *) de - base));
-+ }
-+ space += EXT3_DIR_REC_LEN(de->name_len);
-+ names++;
-+ }
-+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ printk("(%i)\n", names);
-+ return (struct stats) { names, space, 1 };
-+}
-+
-+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
-+ struct dx_entry *entries, int levels)
-+{
-+ unsigned blocksize = dir->i_sb->s_blocksize;
-+ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
-+ unsigned bcount = 0;
-+ struct buffer_head *bh;
-+ int err;
-+ printk("%i indexed blocks...\n", count);
-+ for (i = 0; i < count; i++, entries++)
-+ {
-+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
-+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
-+ struct stats stats;
-+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
-+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
-+ stats = levels?
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
-+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
-+ names += stats.names;
-+ space += stats.space;
-+ bcount += stats.bcount;
-+ brelse (bh);
-+ }
-+ if (bcount)
-+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
-+ names, space/bcount,(space/bcount)*100/blocksize);
-+ return (struct stats) { names, space, bcount};
-+}
-+#endif /* DX_DEBUG */
-+
-+/*
-+ * Probe for a directory leaf block to search.
-+ *
-+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
-+ * error in the directory index, and the caller should fall back to
-+ * searching the directory normally. The callers of dx_probe **MUST**
-+ * check for this error code, and make sure it never gets reflected
-+ * back to userspace.
-+ */
-+static struct dx_frame *
-+dx_probe(struct dentry *dentry, struct inode *dir,
-+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-+{
-+ unsigned count, indirect;
-+ struct dx_entry *at, *entries, *p, *q, *m;
-+ struct dx_root *root;
-+ struct buffer_head *bh;
-+ struct dx_frame *frame = frame_in;
-+ u32 hash;
-+
-+ frame->bh = NULL;
-+ if (dentry)
-+ dir = dentry->d_parent->d_inode;
-+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
-+ goto fail;
-+ root = (struct dx_root *) bh->b_data;
-+ if (root->info.hash_version != DX_HASH_TEA &&
-+ root->info.hash_version != DX_HASH_HALF_MD4 &&
-+ root->info.hash_version != DX_HASH_LEGACY) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unrecognised inode hash code %d",
-+ root->info.hash_version);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+ hinfo->hash_version = root->info.hash_version;
-+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
-+ if (dentry)
-+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
-+ hash = hinfo->hash;
-+
-+ if (root->info.unused_flags & 1) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unimplemented inode hash flags: %#06x",
-+ root->info.unused_flags);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+
-+ if ((indirect = root->info.indirect_levels) > 1) {
-+ ext3_warning(dir->i_sb, __FUNCTION__,
-+ "Unimplemented inode hash depth: %#06x",
-+ root->info.indirect_levels);
-+ brelse(bh);
-+ *err = ERR_BAD_DX_DIR;
-+ goto fail;
-+ }
-+
-+ entries = (struct dx_entry *) (((char *)&root->info) +
-+ root->info.info_length);
-+ assert(dx_get_limit(entries) == dx_root_limit(dir,
-+ root->info.info_length));
-+ dxtrace (printk("Look up %x", hash));
-+ while (1)
-+ {
-+ count = dx_get_count(entries);
-+ assert (count && count <= dx_get_limit(entries));
-+ p = entries + 1;
-+ q = entries + count - 1;
-+ while (p <= q)
-+ {
-+ m = p + (q - p)/2;
-+ dxtrace(printk("."));
-+ if (dx_get_hash(m) > hash)
-+ q = m - 1;
-+ else
-+ p = m + 1;
-+ }
-+
-+ if (0) // linear search cross check
-+ {
-+ unsigned n = count - 1;
-+ at = entries;
-+ while (n--)
-+ {
-+ dxtrace(printk(","));
-+ if (dx_get_hash(++at) > hash)
-+ {
-+ at--;
-+ break;
-+ }
-+ }
-+ assert (at == p - 1);
-+ }
-+
-+ at = p - 1;
-+ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
-+ frame->bh = bh;
-+ frame->entries = entries;
-+ frame->at = at;
-+ if (!indirect--) return frame;
-+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
-+ goto fail2;
-+ at = entries = ((struct dx_node *) bh->b_data)->entries;
-+ assert (dx_get_limit(entries) == dx_node_limit (dir));
-+ frame++;
-+ }
-+fail2:
-+ while (frame >= frame_in) {
-+ brelse(frame->bh);
-+ frame--;
-+ }
-+fail:
-+ return NULL;
-+}
-+
-+static void dx_release (struct dx_frame *frames)
-+{
-+ if (frames[0].bh == NULL)
-+ return;
-+
-+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
-+ brelse(frames[1].bh);
-+ brelse(frames[0].bh);
-+}
-+
-+/*
-+ * This function increments the frame pointer to search the next leaf
-+ * block, and reads in the necessary intervening nodes if the search
-+ * should be necessary. Whether or not the search is necessary is
-+ * controlled by the hash parameter. If the hash value is even, then
-+ * the search is only continued if the next block starts with that
-+ * hash value. This is used if we are searching for a specific file.
-+ *
-+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
-+ *
-+ * This function returns 1 if the caller should continue to search,
-+ * or 0 if it should not. If there is an error reading one of the
-+ * index blocks, it will return -1.
-+ *
-+ * If start_hash is non-null, it will be filled in with the starting
-+ * hash of the next page.
-+ */
-+static int ext3_htree_next_block(struct inode *dir, __u32 hash,
-+ struct dx_frame *frame,
-+ struct dx_frame *frames, int *err,
-+ __u32 *start_hash)
-+{
-+ struct dx_frame *p;
-+ struct buffer_head *bh;
-+ int num_frames = 0;
-+ __u32 bhash;
-+
-+ *err = ENOENT;
-+ p = frame;
-+ /*
-+ * Find the next leaf page by incrementing the frame pointer.
-+ * If we run out of entries in the interior node, loop around and
-+ * increment pointer in the parent node. When we break out of
-+ * this loop, num_frames indicates the number of interior
-+ * nodes need to be read.
-+ */
-+ while (1) {
-+ if (++(p->at) < p->entries + dx_get_count(p->entries))
-+ break;
-+ if (p == frames)
-+ return 0;
-+ num_frames++;
-+ p--;
-+ }
-+
-+ /*
-+ * If the hash is 1, then continue only if the next page has a
-+ * continuation hash of any value. This is used for readdir
-+ * handling. Otherwise, check to see if the hash matches the
-+ * desired contiuation hash. If it doesn't, return since
-+ * there's no point to read in the successive index pages.
-+ */
-+ bhash = dx_get_hash(p->at);
-+ if (start_hash)
-+ *start_hash = bhash;
-+ if ((hash & 1) == 0) {
-+ if ((bhash & ~1) != hash)
-+ return 0;
-+ }
-+ /*
-+ * If the hash is HASH_NB_ALWAYS, we always go to the next
-+ * block so no check is necessary
-+ */
-+ while (num_frames--) {
-+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
-+ 0, err)))
-+ return -1; /* Failure */
-+ p++;
-+ brelse (p->bh);
-+ p->bh = bh;
-+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
-+ }
-+ return 1;
-+}
-+
-+
-+/*
-+ * p is at least 6 bytes before the end of page
-+ */
-+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
-+{
-+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
-+}
-+
-+/*
-+ * This function fills a red-black tree with information from a
-+ * directory. We start scanning the directory in hash order, starting
-+ * at start_hash and start_minor_hash.
-+ *
-+ * This function returns the number of entries inserted into the tree,
-+ * or a negative error code.
-+ */
-+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-+ __u32 start_minor_hash, __u32 *next_hash)
-+{
-+ struct dx_hash_info hinfo;
-+ struct buffer_head *bh;
-+ struct ext3_dir_entry_2 *de, *top;
-+ static struct dx_frame frames[2], *frame;
-+ struct inode *dir;
-+ int block, err;
-+ int count = 0;
-+ int ret;
-+ __u32 hashval;
-+
-+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-+ start_minor_hash));
-+ dir = dir_file->f_dentry->d_inode;
-+ hinfo.hash = start_hash;
-+ hinfo.minor_hash = 0;
-+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
-+ if (!frame)
-+ return err;
-+
-+ /* Add '.' and '..' from the htree header */
-+ if (!start_hash && !start_minor_hash) {
-+ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
-+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
-+ goto errout;
-+ de = ext3_next_entry(de);
-+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
-+ goto errout;
-+ count += 2;
-+ }
-+
-+ while (1) {
-+ block = dx_get_block(frame->at);
-+ dxtrace(printk("Reading block %d\n", block));
-+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
-+ goto errout;
-+
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
-+ EXT3_DIR_REC_LEN(0));
-+ for (; de < top; de = ext3_next_entry(de)) {
-+ ext3fs_dirhash(de->name, de->name_len, &hinfo);
-+ if ((hinfo.hash < start_hash) ||
-+ ((hinfo.hash == start_hash) &&
-+ (hinfo.minor_hash < start_minor_hash)))
-+ continue;
-+ if ((err = ext3_htree_store_dirent(dir_file,
-+ hinfo.hash, hinfo.minor_hash, de)) != 0)
-+ goto errout;
-+ count++;
-+ }
-+ brelse (bh);
-+ hashval = ~1;
-+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
-+ frame, frames, &err, &hashval);
-+ if (next_hash)
-+ *next_hash = hashval;
-+ if (ret == -1)
-+ goto errout;
-+ /*
-+ * Stop if: (a) there are no more entries, or
-+ * (b) we have inserted at least one entry and the
-+ * next hash value is not a continuation
-+ */
-+ if ((ret == 0) ||
-+ (count && ((hashval & 1) == 0)))
-+ break;
-+ }
-+ dx_release(frames);
-+ dxtrace(printk("Fill tree: returned %d entries\n", count));
-+ return count;
-+errout:
-+ dx_release(frames);
-+ return (err);
-+}
-+
-+
-+/*
-+ * Directory block splitting, compacting
-+ */
-+
-+static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
-+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
-+{
-+ int count = 0;
-+ char *base = (char *) de;
-+ struct dx_hash_info h = *hinfo;
-+
-+ while ((char *) de < base + size)
-+ {
-+ if (de->name_len && de->inode) {
-+ ext3fs_dirhash(de->name, de->name_len, &h);
-+ map_tail--;
-+ map_tail->hash = h.hash;
-+ map_tail->offs = (u32) ((char *) de - base);
-+ count++;
-+ }
-+ /* XXX: do we need to check rec_len == 0 case? -Chris */
-+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
-+ }
-+ return count;
-+}
-+
-+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
-+{
-+ struct dx_map_entry *p, *q, *top = map + count - 1;
-+ int more;
-+ /* Combsort until bubble sort doesn't suck */
-+ while (count > 2)
-+ {
-+ count = count*10/13;
-+ if (count - 9 < 2) /* 9, 10 -> 11 */
-+ count = 11;
-+ for (p = top, q = p - count; q >= map; p--, q--)
-+ if (p->hash < q->hash)
-+ swap(*p, *q);
-+ }
-+ /* Garden variety bubble sort */
-+ do {
-+ more = 0;
-+ q = top;
-+ while (q-- > map)
-+ {
-+ if (q[1].hash >= q[0].hash)
-+ continue;
-+ swap(*(q+1), *q);
-+ more = 1;
-+ }
-+ } while(more);
-+}
-+
-+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
-+{
-+ struct dx_entry *entries = frame->entries;
-+ struct dx_entry *old = frame->at, *new = old + 1;
-+ int count = dx_get_count(entries);
-+
-+ assert(count < dx_get_limit(entries));
-+ assert(old < entries + count);
-+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
-+ dx_set_hash(new, hash);
-+ dx_set_block(new, block);
-+ dx_set_count(entries, count + 1);
-+}
-+#endif
-+
-+
-+static void ext3_update_dx_flag(struct inode *inode)
-+{
-+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
-+ EXT3_FEATURE_COMPAT_DIR_INDEX))
-+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
-+}
-+
- /*
- * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
- *
-@@ -94,6 +736,7 @@
- return 0;
- }
-
-+
- /*
- * ext3_find_entry()
- *
-@@ -105,6 +748,8 @@
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
-+
-+
- static struct buffer_head * ext3_find_entry (struct dentry *dentry,
- struct ext3_dir_entry_2 ** res_dir)
- {
-@@ -119,12 +764,32 @@
- int num = 0;
- int nblocks, i, err;
- struct inode *dir = dentry->d_parent->d_inode;
-+ int namelen;
-+ const u8 *name;
-+ unsigned blocksize;
-
- *res_dir = NULL;
- sb = dir->i_sb;
--
-+ blocksize = sb->s_blocksize;
-+ namelen = dentry->d_name.len;
-+ name = dentry->d_name.name;
-+ if (namelen > EXT3_NAME_LEN)
-+ return NULL;
-+#ifdef CONFIG_EXT3_INDEX
-+ if (is_dx(dir)) {
-+ bh = ext3_dx_find_entry(dentry, res_dir, &err);
-+ /*
-+ * On success, or if the error was file not found,
-+ * return. Otherwise, fall back to doing a search the
-+ * old fashioned way.
-+ */
-+ if (bh || (err != ERR_BAD_DX_DIR))
-+ return bh;
-+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
-+ }
-+#endif
- nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
-- start = dir->u.ext3_i.i_dir_start_lookup;
-+ start = EXT3_I(dir)->i_dir_start_lookup;
- if (start >= nblocks)
- start = 0;
- block = start;
-@@ -165,7 +830,7 @@
- i = search_dirblock(bh, dir, dentry,
- block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
- if (i == 1) {
-- dir->u.ext3_i.i_dir_start_lookup = block;
-+ EXT3_I(dir)->i_dir_start_lookup = block;
- ret = bh;
- goto cleanup_and_exit;
- } else {
-@@ -196,6 +861,66 @@
- return ret;
- }
-
-+#ifdef CONFIG_EXT3_INDEX
-+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-+ struct ext3_dir_entry_2 **res_dir, int *err)
-+{
-+ struct super_block * sb;
-+ struct dx_hash_info hinfo;
-+ u32 hash;
-+ struct dx_frame frames[2], *frame;
-+ struct ext3_dir_entry_2 *de, *top;
-+ struct buffer_head *bh;
-+ unsigned long block;
-+ int retval;
-+ int namelen = dentry->d_name.len;
-+ const u8 *name = dentry->d_name.name;
-+ struct inode *dir = dentry->d_parent->d_inode;
-+
-+ sb = dir->i_sb;
-+ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
-+ return NULL;
-+ hash = hinfo.hash;
-+ do {
-+ block = dx_get_block(frame->at);
-+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
-+ goto errout;
-+ de = (struct ext3_dir_entry_2 *) bh->b_data;
-+ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-+ EXT3_DIR_REC_LEN(0));
-+ for (; de < top; de = ext3_next_entry(de))
-+ if (ext3_match (namelen, name, de)) {
-+ if (!ext3_check_dir_entry("ext3_find_entry",
-+ dir, de, bh,
-+ (block<<EXT3_BLOCK_SIZE_BITS(sb))
-+ +((char *)de - bh->b_data))) {
-+ brelse (bh);
-+ goto errout;
-+ }
-+ *res_dir = de;
-+ dx_release (frames);
-+ return bh;
-+ }
-+ brelse (bh);
-+ /* Check to see if we should continue to search */
-+ retval = ext3_htree_next_block(dir, hash, frame,
-+ frames, err, 0);
-+ if (retval == -1) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "error reading index page in directory #%lu",
-+ dir->i_ino);
-+ goto errout;
-+ }
-+ } while (retval == 1);
-+
-+ *err = -ENOENT;
-+errout:
-+ dxtrace(printk("%s not found\n", name));
-+ dx_release (frames);
-+ return NULL;
-+}
-+#endif
-+
- static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
- {
- struct inode * inode;
-@@ -212,8 +937,9 @@
- brelse (bh);
- inode = iget(dir->i_sb, ino);
-
-- if (!inode)
-+ if (!inode) {
- return ERR_PTR(-EACCES);
-+ }
- }
- d_add(dentry, inode);
- return NULL;
-@@ -237,6 +963,301 @@
- de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
- }
-
-+#ifdef CONFIG_EXT3_INDEX
-+static struct ext3_dir_entry_2 *
-+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
-+{
-+ unsigned rec_len = 0;
-+
-+ while (count--) {
-+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
-+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
-+ memcpy (to, de, rec_len);
-+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
-+ de->inode = 0;
-+ map++;
-+ to += rec_len;
-+ }
-+ return (struct ext3_dir_entry_2 *) (to - rec_len);
-+}
-+
-+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
-+{
-+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
-+ unsigned rec_len = 0;
-+
-+ prev = to = de;
-+ while ((char*)de < base + size) {
-+ next = (struct ext3_dir_entry_2 *) ((char *) de +
-+ le16_to_cpu(de->rec_len));
-+ if (de->inode && de->name_len) {
-+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
-+ if (de > to)
-+ memmove(to, de, rec_len);
-+ to->rec_len = cpu_to_le16(rec_len);
-+ prev = to;
-+ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
-+ }
-+ de = next;
-+ }
-+ return prev;
-+}
-+
-+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
-+ struct buffer_head **bh,struct dx_frame *frame,
-+ struct dx_hash_info *hinfo, int *error)
-+{
-+ unsigned blocksize = dir->i_sb->s_blocksize;
-+ unsigned count, continued;
-+ struct buffer_head *bh2;
-+ u32 newblock;
-+ u32 hash2;
-+ struct dx_map_entry *map;
-+ char *data1 = (*bh)->b_data, *data2;
-+ unsigned split;
-+ struct ext3_dir_entry_2 *de = NULL, *de2;
-+ int err;
-+
-+ bh2 = ext3_append (handle, dir, &newblock, error);
-+ if (!(bh2)) {
-+ brelse(*bh);
-+ *bh = NULL;
-+ goto errout;
-+ }
-+
-+ BUFFER_TRACE(*bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, *bh);
-+ if (err) {
-+ journal_error:
-+ brelse(*bh);
-+ brelse(bh2);
-+ *bh = NULL;
-+ ext3_std_error(dir->i_sb, err);
-+ goto errout;
-+ }
-+ BUFFER_TRACE(frame->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+
-+ data2 = bh2->b_data;
-+
-+ /* create map in the end of data2 block */
-+ map = (struct dx_map_entry *) (data2 + blocksize);
-+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
-+ blocksize, hinfo, map);
-+ map -= count;
-+ split = count/2; // need to adjust to actual middle
-+ dx_sort_map (map, count);
-+ hash2 = map[split].hash;
-+ continued = hash2 == map[split - 1].hash;
-+ dxtrace(printk("Split block %i at %x, %i/%i\n",
-+ dx_get_block(frame->at), hash2, split, count-split));
-+
-+ /* Fancy dance to stay within two buffers */
-+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
-+ de = dx_pack_dirents(data1,blocksize);
-+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
-+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
-+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
-+
-+ /* Which block gets the new entry? */
-+ if (hinfo->hash >= hash2)
-+ {
-+ swap(*bh, bh2);
-+ de = de2;
-+ }
-+ dx_insert_block (frame, hash2 + continued, newblock);
-+ err = ext3_journal_dirty_metadata (handle, bh2);
-+ if (err)
-+ goto journal_error;
-+ err = ext3_journal_dirty_metadata (handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+ brelse (bh2);
-+ dxtrace(dx_show_index ("frame", frame->entries));
-+errout:
-+ return de;
-+}
-+#endif
-+
-+
-+/*
-+ * Add a new entry into a directory (leaf) block. If de is non-NULL,
-+ * it points to a directory entry which is guaranteed to be large
-+ * enough for new directory entry. If de is NULL, then
-+ * add_dirent_to_buf will attempt search the directory block for
-+ * space. It will return -ENOSPC if no space is available, and -EIO
-+ * and -EEXIST if directory entry already exists.
-+ *
-+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
-+ * all other cases bh is released.
-+ */
-+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct ext3_dir_entry_2 *de,
-+ struct buffer_head * bh)
-+{
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+ unsigned long offset = 0;
-+ unsigned short reclen;
-+ int nlen, rlen, err;
-+ char *top;
-+
-+ reclen = EXT3_DIR_REC_LEN(namelen);
-+ if (!de) {
-+ de = (struct ext3_dir_entry_2 *)bh->b_data;
-+ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
-+ while ((char *) de <= top) {
-+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
-+ bh, offset)) {
-+ brelse (bh);
-+ return -EIO;
-+ }
-+ if (ext3_match (namelen, name, de)) {
-+ brelse (bh);
-+ return -EEXIST;
-+ }
-+ nlen = EXT3_DIR_REC_LEN(de->name_len);
-+ rlen = le16_to_cpu(de->rec_len);
-+ if ((de->inode? rlen - nlen: rlen) >= reclen)
-+ break;
-+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
-+ offset += rlen;
-+ }
-+ if ((char *) de > top)
-+ return -ENOSPC;
-+ }
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err) {
-+ ext3_std_error(dir->i_sb, err);
-+ brelse(bh);
-+ return err;
-+ }
-+
-+ /* By now the buffer is marked for journaling */
-+ nlen = EXT3_DIR_REC_LEN(de->name_len);
-+ rlen = le16_to_cpu(de->rec_len);
-+ if (de->inode) {
-+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
-+ de1->rec_len = cpu_to_le16(rlen - nlen);
-+ de->rec_len = cpu_to_le16(nlen);
-+ de = de1;
-+ }
-+ de->file_type = EXT3_FT_UNKNOWN;
-+ if (inode) {
-+ de->inode = cpu_to_le32(inode->i_ino);
-+ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-+ } else
-+ de->inode = 0;
-+ de->name_len = namelen;
-+ memcpy (de->name, name, namelen);
-+ /*
-+ * XXX shouldn't update any times until successful
-+ * completion of syscall, but too many callers depend
-+ * on this.
-+ *
-+ * XXX similarly, too many callers depend on
-+ * ext3_new_inode() setting the times, but error
-+ * recovery deletes the inode, so the worst that can
-+ * happen is that the times are slightly out of date
-+ * and/or different from the directory change time.
-+ */
-+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-+ ext3_update_dx_flag(dir);
-+ dir->i_version = ++event;
-+ ext3_mark_inode_dirty(handle, dir);
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ ext3_std_error(dir->i_sb, err);
-+ brelse(bh);
-+ return 0;
-+}
-+
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * This converts a one block unindexed directory to a 3 block indexed
-+ * directory, and adds the dentry to the indexed directory.
-+ */
-+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode, struct buffer_head *bh)
-+{
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ const char *name = dentry->d_name.name;
-+ int namelen = dentry->d_name.len;
-+ struct buffer_head *bh2;
-+ struct dx_root *root;
-+ struct dx_frame frames[2], *frame;
-+ struct dx_entry *entries;
-+ struct ext3_dir_entry_2 *de, *de2;
-+ char *data1, *top;
-+ unsigned len;
-+ int retval;
-+ unsigned blocksize;
-+ struct dx_hash_info hinfo;
-+ u32 block;
-+
-+ blocksize = dir->i_sb->s_blocksize;
-+ dxtrace(printk("Creating index\n"));
-+ retval = ext3_journal_get_write_access(handle, bh);
-+ if (retval) {
-+ ext3_std_error(dir->i_sb, retval);
-+ brelse(bh);
-+ return retval;
-+ }
-+ root = (struct dx_root *) bh->b_data;
-+
-+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
-+ bh2 = ext3_append (handle, dir, &block, &retval);
-+ if (!(bh2)) {
-+ brelse(bh);
-+ return retval;
-+ }
-+ data1 = bh2->b_data;
-+
-+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *)&root->dotdot;
-+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
-+ len = ((char *) root) + blocksize - (char *) de;
-+ memcpy (data1, de, len);
-+ de = (struct ext3_dir_entry_2 *) data1;
-+ top = data1 + len;
-+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
-+ de = de2;
-+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-+ /* Initialize the root; the dot dirents already exist */
-+ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
-+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
-+ memset (&root->info, 0, sizeof(root->info));
-+ root->info.info_length = sizeof(root->info);
-+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
-+ entries = root->entries;
-+ dx_set_block (entries, 1);
-+ dx_set_count (entries, 1);
-+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
-+
-+ /* Initialize as for dx_probe */
-+ hinfo.hash_version = root->info.hash_version;
-+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
-+ ext3fs_dirhash(name, namelen, &hinfo);
-+ frame = frames;
-+ frame->entries = entries;
-+ frame->at = entries;
-+ frame->bh = bh;
-+ bh = bh2;
-+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-+ dx_release (frames);
-+ if (!(de))
-+ return retval;
-+
-+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
-+}
-+#endif
-+
- /*
- * ext3_add_entry()
- *
-@@ -247,127 +1268,198 @@
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
--
--/*
-- * AKPM: the journalling code here looks wrong on the error paths
-- */
- static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
- struct inode *inode)
- {
- struct inode *dir = dentry->d_parent->d_inode;
-- const char *name = dentry->d_name.name;
-- int namelen = dentry->d_name.len;
- unsigned long offset;
-- unsigned short rec_len;
- struct buffer_head * bh;
-- struct ext3_dir_entry_2 * de, * de1;
-+ struct ext3_dir_entry_2 *de;
- struct super_block * sb;
- int retval;
-+#ifdef CONFIG_EXT3_INDEX
-+ int dx_fallback=0;
-+#endif
-+ unsigned blocksize;
-+ unsigned nlen, rlen;
-+ u32 block, blocks;
-
- sb = dir->i_sb;
--
-- if (!namelen)
-+ blocksize = sb->s_blocksize;
-+ if (!dentry->d_name.len)
- return -EINVAL;
-- bh = ext3_bread (handle, dir, 0, 0, &retval);
-+#ifdef CONFIG_EXT3_INDEX
-+ if (is_dx(dir)) {
-+ retval = ext3_dx_add_entry(handle, dentry, inode);
-+ if (!retval || (retval != ERR_BAD_DX_DIR))
-+ return retval;
-+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
-+ dx_fallback++;
-+ ext3_mark_inode_dirty(handle, dir);
-+ }
-+#endif
-+ blocks = dir->i_size >> sb->s_blocksize_bits;
-+ for (block = 0, offset = 0; block < blocks; block++) {
-+ bh = ext3_bread(handle, dir, block, 0, &retval);
-+ if(!bh)
-+ return retval;
-+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
-+ if (retval != -ENOSPC)
-+ return retval;
-+
-+#ifdef CONFIG_EXT3_INDEX
-+ if (blocks == 1 && !dx_fallback &&
-+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
-+ return make_indexed_dir(handle, dentry, inode, bh);
-+#endif
-+ brelse(bh);
-+ }
-+ bh = ext3_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
-- rec_len = EXT3_DIR_REC_LEN(namelen);
-- offset = 0;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- while (1) {
-- if ((char *)de >= sb->s_blocksize + bh->b_data) {
-- brelse (bh);
-- bh = NULL;
-- bh = ext3_bread (handle, dir,
-- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
-- if (!bh)
-- return retval;
-- if (dir->i_size <= offset) {
-- if (dir->i_size == 0) {
-- brelse(bh);
-- return -ENOENT;
-- }
-+ de->inode = 0;
-+ de->rec_len = cpu_to_le16(rlen = blocksize);
-+ nlen = 0;
-+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
-+}
-
-- ext3_debug ("creating next block\n");
-+#ifdef CONFIG_EXT3_INDEX
-+/*
-+ * Returns 0 for success, or a negative error value
-+ */
-+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
-+{
-+ struct dx_frame frames[2], *frame;
-+ struct dx_entry *entries, *at;
-+ struct dx_hash_info hinfo;
-+ struct buffer_head * bh;
-+ struct inode *dir = dentry->d_parent->d_inode;
-+ struct super_block * sb = dir->i_sb;
-+ struct ext3_dir_entry_2 *de;
-+ int err;
-
-- BUFFER_TRACE(bh, "get_write_access");
-- ext3_journal_get_write_access(handle, bh);
-- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- de->inode = 0;
-- de->rec_len = le16_to_cpu(sb->s_blocksize);
-- dir->u.ext3_i.i_disksize =
-- dir->i_size = offset + sb->s_blocksize;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-- ext3_mark_inode_dirty(handle, dir);
-- } else {
-+ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
-+ if (!frame)
-+ return err;
-+ entries = frame->entries;
-+ at = frame->at;
-
-- ext3_debug ("skipping to next block\n");
-+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
-+ goto cleanup;
-
-- de = (struct ext3_dir_entry_2 *) bh->b_data;
-- }
-- }
-- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
-- offset)) {
-- brelse (bh);
-- return -ENOENT;
-- }
-- if (ext3_match (namelen, name, de)) {
-- brelse (bh);
-- return -EEXIST;
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto journal_error;
-+
-+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
-+ if (err != -ENOSPC) {
-+ bh = 0;
-+ goto cleanup;
-+ }
-+
-+ /* Block full, should compress but for now just split */
-+ dxtrace(printk("using %u of %u node entries\n",
-+ dx_get_count(entries), dx_get_limit(entries)));
-+ /* Need to split index? */
-+ if (dx_get_count(entries) == dx_get_limit(entries)) {
-+ u32 newblock;
-+ unsigned icount = dx_get_count(entries);
-+ int levels = frame - frames;
-+ struct dx_entry *entries2;
-+ struct dx_node *node2;
-+ struct buffer_head *bh2;
-+
-+ if (levels && (dx_get_count(frames->entries) ==
-+ dx_get_limit(frames->entries))) {
-+ ext3_warning(sb, __FUNCTION__,
-+ "Directory index full!\n");
-+ err = -ENOSPC;
-+ goto cleanup;
- }
-- if ((le32_to_cpu(de->inode) == 0 &&
-- le16_to_cpu(de->rec_len) >= rec_len) ||
-- (le16_to_cpu(de->rec_len) >=
-- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
-- BUFFER_TRACE(bh, "get_write_access");
-- ext3_journal_get_write_access(handle, bh);
-- /* By now the buffer is marked for journaling */
-- offset += le16_to_cpu(de->rec_len);
-- if (le32_to_cpu(de->inode)) {
-- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
-- EXT3_DIR_REC_LEN(de->name_len));
-- de1->rec_len =
-- cpu_to_le16(le16_to_cpu(de->rec_len) -
-- EXT3_DIR_REC_LEN(de->name_len));
-- de->rec_len = cpu_to_le16(
-- EXT3_DIR_REC_LEN(de->name_len));
-- de = de1;
-+ bh2 = ext3_append (handle, dir, &newblock, &err);
-+ if (!(bh2))
-+ goto cleanup;
-+ node2 = (struct dx_node *)(bh2->b_data);
-+ entries2 = node2->entries;
-+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
-+ node2->fake.inode = 0;
-+ BUFFER_TRACE(frame->bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, frame->bh);
-+ if (err)
-+ goto journal_error;
-+ if (levels) {
-+ unsigned icount1 = icount/2, icount2 = icount - icount1;
-+ unsigned hash2 = dx_get_hash(entries + icount1);
-+ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
-+
-+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
-+ err = ext3_journal_get_write_access(handle,
-+ frames[0].bh);
-+ if (err)
-+ goto journal_error;
-+
-+ memcpy ((char *) entries2, (char *) (entries + icount1),
-+ icount2 * sizeof(struct dx_entry));
-+ dx_set_count (entries, icount1);
-+ dx_set_count (entries2, icount2);
-+ dx_set_limit (entries2, dx_node_limit(dir));
-+
-+ /* Which index block gets the new entry? */
-+ if (at - entries >= icount1) {
-+ frame->at = at = at - entries - icount1 + entries2;
-+ frame->entries = entries = entries2;
-+ swap(frame->bh, bh2);
- }
-- de->file_type = EXT3_FT_UNKNOWN;
-- if (inode) {
-- de->inode = cpu_to_le32(inode->i_ino);
-- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
-- } else
-- de->inode = 0;
-- de->name_len = namelen;
-- memcpy (de->name, name, namelen);
-- /*
-- * XXX shouldn't update any times until successful
-- * completion of syscall, but too many callers depend
-- * on this.
-- *
-- * XXX similarly, too many callers depend on
-- * ext3_new_inode() setting the times, but error
-- * recovery deletes the inode, so the worst that can
-- * happen is that the times are slightly out of date
-- * and/or different from the directory change time.
-- */
-- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-- ext3_mark_inode_dirty(handle, dir);
-- dir->i_version = ++event;
-- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-- ext3_journal_dirty_metadata(handle, bh);
-- brelse(bh);
-- return 0;
-+ dx_insert_block (frames + 0, hash2, newblock);
-+ dxtrace(dx_show_index ("node", frames[1].entries));
-+ dxtrace(dx_show_index ("node",
-+ ((struct dx_node *) bh2->b_data)->entries));
-+ err = ext3_journal_dirty_metadata(handle, bh2);
-+ if (err)
-+ goto journal_error;
-+ brelse (bh2);
-+ } else {
-+ dxtrace(printk("Creating second level index...\n"));
-+ memcpy((char *) entries2, (char *) entries,
-+ icount * sizeof(struct dx_entry));
-+ dx_set_limit(entries2, dx_node_limit(dir));
-+
-+ /* Set up root */
-+ dx_set_count(entries, 1);
-+ dx_set_block(entries + 0, newblock);
-+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-+
-+ /* Add new access path frame */
-+ frame = frames + 1;
-+ frame->at = at = at - entries + entries2;
-+ frame->entries = entries = entries2;
-+ frame->bh = bh2;
-+ err = ext3_journal_get_write_access(handle,
-+ frame->bh);
-+ if (err)
-+ goto journal_error;
- }
-- offset += le16_to_cpu(de->rec_len);
-- de = (struct ext3_dir_entry_2 *)
-- ((char *) de + le16_to_cpu(de->rec_len));
-+ ext3_journal_dirty_metadata(handle, frames[0].bh);
- }
-- brelse (bh);
-- return -ENOSPC;
-+ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
-+ if (!de)
-+ goto cleanup;
-+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-+ bh = 0;
-+ goto cleanup;
-+
-+journal_error:
-+ ext3_std_error(dir->i_sb, err);
-+cleanup:
-+ if (bh)
-+ brelse(bh);
-+ dx_release(frames);
-+ return err;
- }
-+#endif
-
- /*
- * ext3_delete_entry deletes a directory entry by merging it with the
-@@ -451,9 +1543,11 @@
- struct inode * inode;
- int err;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -478,9 +1572,11 @@
- struct inode *inode;
- int err;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -507,9 +1603,11 @@
- if (dir->i_nlink >= EXT3_LINK_MAX)
- return -EMLINK;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -521,7 +1619,7 @@
-
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
-- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
-+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- inode->i_blocks = 0;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
-@@ -554,21 +1652,19 @@
- inode->i_mode |= S_ISGID;
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_entry (handle, dentry, inode);
-- if (err)
-- goto out_no_entry;
-+ if (err) {
-+ inode->i_nlink = 0;
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+ }
- dir->i_nlink++;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- d_instantiate(dentry, inode);
- out_stop:
- ext3_journal_stop(handle, dir);
- return err;
--
--out_no_entry:
-- inode->i_nlink = 0;
-- ext3_mark_inode_dirty(handle, inode);
-- iput (inode);
-- goto out_stop;
- }
-
- /*
-@@ -655,7 +1751,7 @@
- int err = 0, rc;
-
- lock_super(sb);
-- if (!list_empty(&inode->u.ext3_i.i_orphan))
-+ if (!list_empty(&EXT3_I(inode)->i_orphan))
- goto out_unlock;
-
- /* Orphan handling is only valid for files with data blocks
-@@ -696,7 +1792,7 @@
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
- if (!err)
-- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
-+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-
- jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
- jbd_debug(4, "orphan inode %ld will point to %d\n",
-@@ -714,25 +1810,26 @@
- int ext3_orphan_del(handle_t *handle, struct inode *inode)
- {
- struct list_head *prev;
-+ struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_sb_info *sbi;
- ino_t ino_next;
- struct ext3_iloc iloc;
- int err = 0;
-
- lock_super(inode->i_sb);
-- if (list_empty(&inode->u.ext3_i.i_orphan)) {
-+ if (list_empty(&ei->i_orphan)) {
- unlock_super(inode->i_sb);
- return 0;
- }
-
- ino_next = NEXT_ORPHAN(inode);
-- prev = inode->u.ext3_i.i_orphan.prev;
-+ prev = ei->i_orphan.prev;
- sbi = EXT3_SB(inode->i_sb);
-
- jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
-
-- list_del(&inode->u.ext3_i.i_orphan);
-- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-+ list_del(&ei->i_orphan);
-+ INIT_LIST_HEAD(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
-@@ -793,8 +1890,9 @@
- handle_t *handle;
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- retval = -ENOENT;
- bh = ext3_find_entry (dentry, &de);
-@@ -832,7 +1930,7 @@
- ext3_mark_inode_dirty(handle, inode);
- dir->i_nlink--;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
-
- end_rmdir:
-@@ -850,8 +1948,9 @@
- handle_t *handle;
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -878,7 +1977,7 @@
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- inode->i_nlink--;
- if (!inode->i_nlink)
-@@ -904,9 +2003,11 @@
- if (l > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -916,7 +2017,7 @@
- if (IS_ERR(inode))
- goto out_stop;
-
-- if (l > sizeof (inode->u.ext3_i.i_data)) {
-+ if (l > sizeof (EXT3_I(inode)->i_data)) {
- inode->i_op = &page_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext3_aops;
- /*
-@@ -925,8 +2026,12 @@
- * i_size in generic_commit_write().
- */
- err = block_symlink(inode, symname, l);
-- if (err)
-- goto out_no_entry;
-+ if (err) {
-+ ext3_dec_count(handle, inode);
-+ ext3_mark_inode_dirty(handle, inode);
-+ iput (inode);
-+ goto out_stop;
-+ }
- } else {
- inode->i_op = &ext3_fast_symlink_inode_operations;
- memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
-@@ -938,12 +2043,6 @@
- out_stop:
- ext3_journal_stop(handle, dir);
- return err;
--
--out_no_entry:
-- ext3_dec_count(handle, inode);
-- ext3_mark_inode_dirty(handle, inode);
-- iput (inode);
-- goto out_stop;
- }
-
- static int ext3_link (struct dentry * old_dentry,
-@@ -956,12 +2055,15 @@
- if (S_ISDIR(inode->i_mode))
- return -EPERM;
-
-- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (inode->i_nlink >= EXT3_LINK_MAX) {
- return -EMLINK;
-+ }
-
-- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-@@ -995,9 +2097,11 @@
-
- old_bh = new_bh = dir_bh = NULL;
-
-- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
-- if (IS_ERR(handle))
-+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
-+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
-+ if (IS_ERR(handle)) {
- return PTR_ERR(handle);
-+ }
-
- if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
- handle->h_sync = 1;
-@@ -1070,14 +2174,33 @@
- /*
- * ok, that's it
- */
-- ext3_delete_entry(handle, old_dir, old_de, old_bh);
-+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
-+ if (retval == -ENOENT) {
-+ /*
-+ * old_de could have moved out from under us.
-+ */
-+ struct buffer_head *old_bh2;
-+ struct ext3_dir_entry_2 *old_de2;
-+
-+ old_bh2 = ext3_find_entry(old_dentry, &old_de2);
-+ if (old_bh2) {
-+ retval = ext3_delete_entry(handle, old_dir,
-+ old_de2, old_bh2);
-+ brelse(old_bh2);
-+ }
-+ }
-+ if (retval) {
-+ ext3_warning(old_dir->i_sb, "ext3_rename",
-+ "Deleting old file (%lu), %d, error=%d",
-+ old_dir->i_ino, old_dir->i_nlink, retval);
-+ }
-
- if (new_inode) {
- new_inode->i_nlink--;
- new_inode->i_ctime = CURRENT_TIME;
- }
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(old_dir);
- if (dir_bh) {
- BUFFER_TRACE(dir_bh, "get_write_access");
- ext3_journal_get_write_access(handle, dir_bh);
-@@ -1089,7 +2212,7 @@
- new_inode->i_nlink--;
- } else {
- new_dir->i_nlink++;
-- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-+ ext3_update_dx_flag(new_dir);
- ext3_mark_inode_dirty(handle, new_dir);
- }
- }
-Index: linux.mcp2/fs/ext3/super.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:03:55.000000000 -0700
-+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:08:50.000000000 -0700
-@@ -702,6 +702,7 @@
- es->s_mtime = cpu_to_le32(CURRENT_TIME);
- ext3_update_dynamic_rev(sb);
- EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-+
- ext3_commit_super (sb, es, 1);
- if (test_opt (sb, DEBUG))
- printk (KERN_INFO
-@@ -712,6 +713,7 @@
- EXT3_BLOCKS_PER_GROUP(sb),
- EXT3_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
-+
- printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
- bdevname(sb->s_dev));
- if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
-@@ -886,6 +888,7 @@
- return res;
- }
-
-+
- struct super_block * ext3_read_super (struct super_block * sb, void * data,
- int silent)
- {
-@@ -1062,6 +1065,9 @@
- sbi->s_mount_state = le16_to_cpu(es->s_state);
- sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
- sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
-+ for (i=0; i < 4; i++)
-+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
-+ sbi->s_def_hash_version = es->s_def_hash_version;
-
- if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
-@@ -1744,7 +1750,7 @@
- unregister_filesystem(&ext3_fs_type);
- }
-
--EXPORT_NO_SYMBOLS;
-+EXPORT_SYMBOL(ext3_force_commit);
-
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
-Index: linux.mcp2/include/linux/ext3_fs.h
-===================================================================
---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 14:53:17.000000000 -0700
-+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:07:07.000000000 -0700
-@@ -40,6 +40,11 @@
- #define EXT3FS_VERSION "2.4-0.9.17"
-
- /*
-+ * Always enable hashed directories
-+ */
-+#define CONFIG_EXT3_INDEX
-+
-+/*
- * Debug code
- */
- #ifdef EXT3FS_DEBUG
-@@ -437,8 +442,11 @@
- /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
- __u32 s_journal_dev; /* device number of journal file */
- __u32 s_last_orphan; /* start of list of inodes to delete */
--
--/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
-+ __u32 s_hash_seed[4]; /* HTREE hash seed */
-+ __u8 s_def_hash_version; /* Default hash version to use */
-+ __u8 s_reserved_char_pad;
-+ __u16 s_reserved_word_pad;
-+ __u32 s_reserved[192]; /* Padding to the end of the block */
- };
-
- #ifdef __KERNEL__
-@@ -575,9 +583,46 @@
- #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
- #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
- ~EXT3_DIR_ROUND)
-+/*
-+ * Hash Tree Directory indexing
-+ * (c) Daniel Phillips, 2001
-+ */
-+
-+#ifdef CONFIG_EXT3_INDEX
-+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
-+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#else
-+ #define is_dx(dir) 0
-+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
-+#endif
-+
-+/* Legal values for the dx_root hash_version field: */
-+
-+#define DX_HASH_LEGACY 0
-+#define DX_HASH_HALF_MD4 1
-+#define DX_HASH_TEA 2
-+
-+/* hash info structure used by the directory hash */
-+struct dx_hash_info
-+{
-+ u32 hash;
-+ u32 minor_hash;
-+ int hash_version;
-+ u32 *seed;
-+};
-
- #ifdef __KERNEL__
- /*
-+ * Control parameters used by ext3_htree_next_block
-+ */
-+#define HASH_NB_ALWAYS 1
-+
-+
-+/*
- * Describe an inode's exact location on disk and in memory
- */
- struct ext3_iloc
-@@ -587,6 +632,27 @@
- unsigned long block_group;
- };
-
-+
-+/*
-+ * This structure is stuffed into the struct file's private_data field
-+ * for directories. It is where we put information so that we can do
-+ * readdir operations in hash tree order.
-+ */
-+struct dir_private_info {
-+ rb_root_t root;
-+ rb_node_t *curr_node;
-+ struct fname *extra_fname;
-+ loff_t last_pos;
-+ __u32 curr_hash;
-+ __u32 curr_minor_hash;
-+ __u32 next_hash;
-+};
-+
-+/*
-+ * Special error return code only used by dx_probe() and its callers.
-+ */
-+#define ERR_BAD_DX_DIR -75000
-+
- /*
- * Function prototypes
- */
-@@ -614,11 +680,20 @@
-
- /* dir.c */
- extern int ext3_check_dir_entry(const char *, struct inode *,
-- struct ext3_dir_entry_2 *, struct buffer_head *,
-- unsigned long);
-+ struct ext3_dir_entry_2 *,
-+ struct buffer_head *, unsigned long);
-+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
-+ __u32 minor_hash,
-+ struct ext3_dir_entry_2 *dirent);
-+extern void ext3_htree_free_dir_info(struct dir_private_info *p);
-+
- /* fsync.c */
- extern int ext3_sync_file (struct file *, struct dentry *, int);
-
-+/* hash.c */
-+extern int ext3fs_dirhash(const char *name, int len, struct
-+ dx_hash_info *hinfo);
-+
- /* ialloc.c */
- extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
- extern void ext3_free_inode (handle_t *, struct inode *);
-@@ -650,6 +725,8 @@
- /* namei.c */
- extern int ext3_orphan_add(handle_t *, struct inode *);
- extern int ext3_orphan_del(handle_t *, struct inode *);
-+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
-+ __u32 start_minor_hash, __u32 *next_hash);
-
- /* super.c */
- extern void ext3_error (struct super_block *, const char *, const char *, ...)
-Index: linux.mcp2/include/linux/ext3_fs_sb.h
-===================================================================
---- linux.mcp2.orig/include/linux/ext3_fs_sb.h 2004-05-17 14:41:25.000000000 -0700
-+++ linux.mcp2/include/linux/ext3_fs_sb.h 2004-05-17 15:07:07.000000000 -0700
-@@ -62,6 +62,8 @@
- int s_inode_size;
- int s_first_ino;
- u32 s_next_generation;
-+ u32 s_hash_seed[4];
-+ int s_def_hash_version;
-
- /* Journaling */
- struct inode * s_journal_inode;
-Index: linux.mcp2/include/linux/ext3_jbd.h
-===================================================================
---- linux.mcp2.orig/include/linux/ext3_jbd.h 2004-05-17 14:53:17.000000000 -0700
-+++ linux.mcp2/include/linux/ext3_jbd.h 2004-05-17 15:07:07.000000000 -0700
-@@ -63,6 +63,8 @@
-
- #define EXT3_RESERVE_TRANS_BLOCKS 12
-
-+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
-+
- int
- ext3_mark_iloc_dirty(handle_t *handle,
- struct inode *inode,
-Index: linux.mcp2/include/linux/rbtree.h
-===================================================================
---- linux.mcp2.orig/include/linux/rbtree.h 2004-05-17 14:41:25.000000000 -0700
-+++ linux.mcp2/include/linux/rbtree.h 2004-05-17 15:07:07.000000000 -0700
-@@ -120,6 +120,8 @@
-
- extern void rb_insert_color(rb_node_t *, rb_root_t *);
- extern void rb_erase(rb_node_t *, rb_root_t *);
-+extern rb_node_t *rb_get_first(rb_root_t *root);
-+extern rb_node_t *rb_get_next(rb_node_t *n);
-
- static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
- {
-Index: linux.mcp2/lib/rbtree.c
-===================================================================
---- linux.mcp2.orig/lib/rbtree.c 2004-01-19 07:49:44.000000000 -0800
-+++ linux.mcp2/lib/rbtree.c 2004-05-17 15:10:39.000000000 -0700
-@@ -17,6 +17,8 @@
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
- linux/lib/rbtree.c
-+
-+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
- */
-
- #include <linux/rbtree.h>
-@@ -294,3 +296,42 @@
- __rb_erase_color(child, parent, root);
- }
- EXPORT_SYMBOL(rb_erase);
-+
-+/*
-+ * This function returns the first node (in sort order) of the tree.
-+ */
-+rb_node_t *rb_get_first(rb_root_t *root)
-+{
-+ rb_node_t *n;
-+
-+ n = root->rb_node;
-+ if (!n)
-+ return 0;
-+ while (n->rb_left)
-+ n = n->rb_left;
-+ return n;
-+}
-+EXPORT_SYMBOL(rb_get_first);
-+
-+/*
-+ * Given a node, this function will return the next node in the tree.
-+ */
-+rb_node_t *rb_get_next(rb_node_t *n)
-+{
-+ rb_node_t *parent;
-+
-+ if (n->rb_right) {
-+ n = n->rb_right;
-+ while (n->rb_left)
-+ n = n->rb_left;
-+ return n;
-+ } else {
-+ while ((parent = n->rb_parent)) {
-+ if (n == parent->rb_left)
-+ return parent;
-+ n = parent;
-+ }
-+ return 0;
-+ }
-+}
-+EXPORT_SYMBOL(rb_get_next);
+++ /dev/null
-Index: linux-2.6.7/fs/ext3/namei.c
-===================================================================
---- linux-2.6.7.orig/fs/ext3/namei.c 2004-06-15 23:19:36.000000000 -0600
-+++ linux-2.6.7/fs/ext3/namei.c 2004-08-20 17:48:54.000000000 -0600
-@@ -1596,11 +1596,17 @@ static int ext3_delete_entry (handle_t *
- static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
- {
- inode->i_nlink++;
-+ if (is_dx(inode) && inode->i_nlink > 1) {
-+ /* limit is 16-bit i_links_count */
-+ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2)
-+ inode->i_nlink = 1;
-+ }
- }
-
- static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
- {
-- inode->i_nlink--;
-+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
-+ inode->i_nlink--;
- }
-
- static int ext3_add_nondir(handle_t *handle,
-@@ -1693,7 +1698,7 @@ static int ext3_mkdir(struct inode * dir
- struct ext3_dir_entry_2 * de;
- int err;
-
-- if (dir->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(dir))
- return -EMLINK;
-
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-@@ -1715,7 +1720,7 @@ static int ext3_mkdir(struct inode * dir
- inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
-- inode->i_nlink--; /* is this nlink == 0? */
-+ ext3_dec_count(handle, inode); /* is this nlink == 0? */
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
-@@ -1747,7 +1752,7 @@ static int ext3_mkdir(struct inode * dir
- iput (inode);
- goto out_stop;
- }
-- dir->i_nlink++;
-+ ext3_inc_count(handle, dir);
- ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- d_instantiate(dentry, inode);
-@@ -2010,10 +2015,10 @@ static int ext3_rmdir (struct inode * di
- retval = ext3_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_rmdir;
-- if (inode->i_nlink != 2)
-- ext3_warning (inode->i_sb, "ext3_rmdir",
-- "empty directory has nlink!=2 (%d)",
-- inode->i_nlink);
-+ if (!EXT3_DIR_LINK_EMPTY(inode))
-+ ext3_warning(inode->i_sb, "ext3_rmdir",
-+ "empty directory has too many links (%d)",
-+ inode->i_nlink);
- inode->i_version++;
- inode->i_nlink = 0;
- /* There's no need to set i_disksize: the fact that i_nlink is
-@@ -2023,7 +2028,7 @@ static int ext3_rmdir (struct inode * di
- ext3_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- ext3_mark_inode_dirty(handle, inode);
-- dir->i_nlink--;
-+ ext3_dec_count(handle, dir);
- ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
-
-@@ -2074,7 +2079,7 @@ static int ext3_unlink(struct inode * di
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
-- inode->i_nlink--;
-+ ext3_dec_count(handle, inode);
- if (!inode->i_nlink)
- ext3_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime;
-@@ -2146,7 +2151,7 @@ static int ext3_link (struct dentry * ol
- struct inode *inode = old_dentry->d_inode;
- int err;
-
-- if (inode->i_nlink >= EXT3_LINK_MAX)
-+ if (EXT3_DIR_LINK_MAXED(inode))
- return -EMLINK;
-
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
-@@ -2230,8 +2235,8 @@ static int ext3_rename (struct inode * o
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
-- if (!new_inode && new_dir!=old_dir &&
-- new_dir->i_nlink >= EXT3_LINK_MAX)
-+ if (!new_inode && new_dir != old_dir &&
-+ EXT3_DIR_LINK_MAXED(new_dir))
- goto end_rename;
- }
- if (!new_bh) {
-@@ -2288,7 +2293,7 @@ static int ext3_rename (struct inode * o
- }
-
- if (new_inode) {
-- new_inode->i_nlink--;
-+ ext3_dec_count(handle, new_inode);
- new_inode->i_ctime = CURRENT_TIME_SEC;
- }
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-@@ -2299,11 +2304,13 @@ static int ext3_rename (struct inode * o
- PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_bh);
-- old_dir->i_nlink--;
-+ ext3_dec_count(handle, old_dir);
- if (new_inode) {
-- new_inode->i_nlink--;
-+ /* checked empty_dir above, can't have another parent,
-+ * ext3_dec_count() won't work for many-linked dirs */
-+ new_inode->i_nlink = 0;
- } else {
-- new_dir->i_nlink++;
-+ ext3_inc_count(handle, new_dir);
- ext3_update_dx_flag(new_dir);
- ext3_mark_inode_dirty(handle, new_dir);
- }
---- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600
-+++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600
-@@ -79,7 +81,7 @@
- /*
- * Maximal count of links to a file
- */
--#define EXT3_LINK_MAX 32000
-+#define EXT3_LINK_MAX 65000
-
- /*
- * Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
- */
-
- #ifdef CONFIG_EXT3_INDEX
-- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+ (is_dx(dir) && (dir)->i_nlink == 1))
- #else
- #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
-
+++ /dev/null
- 0 files changed
-
---- linux-2.4.20/fs/ext3/super.c~ext3-no-write-super 2003-08-11 13:20:17.000000000 +0400
-+++ linux-2.4.20-alexey/fs/ext3/super.c 2003-08-11 13:31:35.000000000 +0400
-@@ -1849,7 +1849,6 @@ void ext3_write_super (struct super_bloc
- if (down_trylock(&sb->s_lock) == 0)
- BUG(); /* aviro detector */
- sb->s_dirt = 0;
-- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
-
- /*
- * Tricky --- if we are unmounting, the write really does need
-@@ -1857,6 +1856,7 @@ void ext3_write_super (struct super_bloc
- * sb->s_root.
- */
- if (do_sync_supers || !sb->s_root) {
-+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
- unlock_super(sb);
- log_wait_commit(EXT3_SB(sb)->s_journal, target);
- lock_super(sb);
-
-_
+++ /dev/null
-Index: linux-2.4.19/fs/ext3/namei.c
-===================================================================
---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400
-+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:37:37.000000000 -0400
-@@ -1751,8 +1751,8 @@
- struct super_block *sb = inode->i_sb;
- struct ext3_iloc iloc;
- int err = 0, rc;
--
-- lock_super(sb);
-+
-+ down(&EXT3_SB(sb)->s_orphan_lock);
- if (!list_empty(&EXT3_I(inode)->i_orphan))
- goto out_unlock;
-
-@@ -1800,7 +1800,7 @@
- jbd_debug(4, "orphan inode %ld will point to %d\n",
- inode->i_ino, NEXT_ORPHAN(inode));
- out_unlock:
-- unlock_super(sb);
-+ up(&EXT3_SB(sb)->s_orphan_lock);
- ext3_std_error(inode->i_sb, err);
- return err;
- }
-@@ -1813,20 +1813,19 @@
- {
- struct list_head *prev;
- struct ext3_inode_info *ei = EXT3_I(inode);
-- struct ext3_sb_info *sbi;
-+ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
- unsigned long ino_next;
- struct ext3_iloc iloc;
- int err = 0;
-
-- lock_super(inode->i_sb);
-+ down(&sbi->s_orphan_lock);
- if (list_empty(&ei->i_orphan)) {
-- unlock_super(inode->i_sb);
-+ up(&sbi->s_orphan_lock);
- return 0;
- }
-
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
-- sbi = EXT3_SB(inode->i_sb);
-
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
-@@ -1872,10 +1871,10 @@
- if (err)
- goto out_brelse;
-
--out_err:
-+out_err:
- ext3_std_error(inode->i_sb, err);
- out:
-- unlock_super(inode->i_sb);
-+ up(&sbi->s_orphan_lock);
- return err;
-
- out_brelse:
-Index: linux-2.4.19/fs/ext3/super.c
-===================================================================
---- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 22:30:41.000000000 -0400
-+++ linux-2.4.19/fs/ext3/super.c 2004-04-23 22:36:22.000000000 -0400
-@@ -1179,6 +1179,7 @@
- */
- sb->s_op = &ext3_sops;
- INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
-+ sema_init(&sbi->s_orphan_lock, 1);
-
- sb->s_root = 0;
-
-Index: linux-2.4.19/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.4.19.orig/include/linux/ext3_fs_sb.h 2004-04-23 18:26:27.000000000 -0400
-+++ linux-2.4.19/include/linux/ext3_fs_sb.h 2004-04-23 22:36:22.000000000 -0400
-@@ -69,6 +69,7 @@
- struct inode * s_journal_inode;
- struct journal_s * s_journal;
- struct list_head s_orphan;
-+ struct semaphore s_orphan_lock;
- struct block_device *journal_bdev;
- #ifdef CONFIG_JBD_DEBUG
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+++ /dev/null
- fs/ext3/super.c | 7 ++++++-
- 1 files changed, 6 insertions(+), 1 deletion(-)
-
---- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600
-+++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600
-@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc
- sb->s_dirt = 0;
- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
-
-- if (do_sync_supers) {
-+ /*
-+ * Tricky --- if we are unmounting, the write really does need
-+ * to be synchronous. We can detect that by looking for NULL in
-+ * sb->s_root.
-+ */
-+ if (do_sync_supers || !sb->s_root) {
- unlock_super(sb);
- log_wait_commit(EXT3_SB(sb)->s_journal, target);
- lock_super(sb);
-
-_
+++ /dev/null
- ./fs/ext3/namei.c | 11 +++++------
- 1 files changed, 5 insertions(+), 6 deletions(-)
-
-Index: linux-2.4.19-pre1/./fs/ext3/namei.c
-===================================================================
---- linux-2.4.19-pre1.orig/./fs/ext3/namei.c 2003-11-21 01:52:06.000000000 +0300
-+++ linux-2.4.19-pre1/./fs/ext3/namei.c 2003-11-21 01:58:15.000000000 +0300
-@@ -1522,8 +1522,11 @@
- {
- int err = ext3_add_entry(handle, dentry, inode);
- if (!err) {
-- d_instantiate(dentry, inode);
-- return 0;
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ if (err == 0) {
-+ d_instantiate(dentry, inode);
-+ return 0;
-+ }
- }
- ext3_dec_count(handle, inode);
- iput(inode);
-@@ -1559,7 +1562,6 @@
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- inode->i_mapping->a_ops = &ext3_aops;
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle, dir);
-@@ -1586,7 +1588,6 @@
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, mode, rdev);
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle, dir);
-@@ -2035,7 +2036,6 @@
- inode->i_size = l-1;
- }
- inode->u.ext3_i.i_disksize = inode->i_size;
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- out_stop:
- ext3_journal_stop(handle, dir);
-@@ -2069,7 +2069,6 @@
- ext3_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
-
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- ext3_journal_stop(handle, dir);
- return err;
+++ /dev/null
- ./fs/ext3/namei.c | 11 +++++------
- 1 files changed, 5 insertions(+), 6 deletions(-)
-
-Index: linux-2.4.19/fs/ext3/namei.c
-===================================================================
---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:30:41.000000000 -0400
-+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400
-@@ -1522,8 +1522,11 @@
- {
- int err = ext3_add_entry(handle, dentry, inode);
- if (!err) {
-- d_instantiate(dentry, inode);
-- return 0;
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ if (err == 0) {
-+ d_instantiate(dentry, inode);
-+ return 0;
-+ }
- }
- ext3_dec_count(handle, inode);
- iput(inode);
-@@ -1559,7 +1562,6 @@
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- inode->i_mapping->a_ops = &ext3_aops;
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle, dir);
-@@ -1589,7 +1591,6 @@
- #ifdef CONFIG_EXT3_FS_XATTR
- inode->i_op = &ext3_special_inode_operations;
- #endif
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle, dir);
-@@ -2039,7 +2040,6 @@
- inode->i_size = l-1;
- }
- EXT3_I(inode)->i_disksize = inode->i_size;
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- out_stop:
- ext3_journal_stop(handle, dir);
-@@ -2073,7 +2073,6 @@
- ext3_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
-
-- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_nondir(handle, dentry, inode);
- ext3_journal_stop(handle, dir);
- return err;
+++ /dev/null
- fs/ext3/ialloc.c | 40 ++++++++++++++++++++++++++++++++++++++--
- fs/ext3/inode.c | 2 +-
- fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++
- fs/ext3/namei.c | 21 +++++++++++++++++----
- include/linux/dcache.h | 5 +++++
- include/linux/ext3_fs.h | 5 ++++-
- 6 files changed, 90 insertions(+), 8 deletions(-)
-
-Index: linux-2.4.19.SuSE/fs/ext3/namei.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:18:04 2003
-+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:23:20 2003
-@@ -1534,6 +1534,19 @@
- return err;
- }
-
-+static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
-+ int mode, struct dentry *dentry)
-+{
-+ unsigned long inum = 0;
-+
-+ if (dentry->d_fsdata != NULL) {
-+ struct dentry_params *param =
-+ (struct dentry_params *) dentry->d_fsdata;
-+ inum = param->p_inum;
-+ }
-+ return ext3_new_inode(handle, dir, mode, inum);
-+}
-+
- /*
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
-@@ -1557,7 +1570,7 @@
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-
-- inode = ext3_new_inode (handle, dir, mode);
-+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext3_file_inode_operations;
-@@ -1585,7 +1598,7 @@
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-
-- inode = ext3_new_inode (handle, dir, mode);
-+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
-@@ -1618,7 +1631,7 @@
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-
-- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
-+ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-@@ -2013,7 +2026,7 @@
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-
-- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
-+ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-Index: linux-2.4.19.SuSE/fs/ext3/ialloc.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/ialloc.c Sun Nov 16 01:20:17 2003
-+++ linux-2.4.19.SuSE/fs/ext3/ialloc.c Sun Nov 16 01:24:49 2003
-@@ -330,7 +330,8 @@
- * For other inodes, search forward from the parent directory's block
- * group to find a free inode.
- */
--struct inode * ext3_new_inode (handle_t *handle, struct inode * dir, int mode)
-+struct inode * ext3_new_inode(handle_t *handle, const struct inode * dir,
-+ int mode, unsigned long goal)
- {
- struct super_block * sb;
- struct buffer_head * bh;
-@@ -355,7 +356,41 @@
- init_rwsem(&inode->u.ext3_i.truncate_sem);
-
- lock_super (sb);
-- es = sb->u.ext3_sb.s_es;
-+ es = EXT3_SB(sb)->s_es;
-+
-+ if (goal) {
-+ i = (goal - 1) / EXT3_INODES_PER_GROUP(sb);
-+ j = (goal - 1) % EXT3_INODES_PER_GROUP(sb);
-+ gdp = ext3_get_group_desc(sb, i, &bh2);
-+
-+ bitmap_nr = load_inode_bitmap (sb, i);
-+ if (bitmap_nr < 0) {
-+ err = bitmap_nr;
-+ goto fail;
-+ }
-+
-+ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
-+
-+ BUFFER_TRACE(bh, "get_write_access");
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err) goto fail;
-+
-+ if (ext3_set_bit(j, bh->b_data)) {
-+ printk(KERN_ERR "goal inode %lu unavailable\n", goal);
-+ /* Oh well, we tried. */
-+ goto repeat;
-+ }
-+
-+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err) goto fail;
-+
-+ /* We've shortcircuited the allocation system successfully,
-+ * now finish filling in the inode.
-+ */
-+ goto have_bit_and_group;
-+ }
-+
- repeat:
- gdp = NULL;
- i = 0;
-@@ -470,6 +505,7 @@
- }
- goto repeat;
- }
-+ have_bit_and_group:
- j += i * EXT3_INODES_PER_GROUP(sb) + 1;
- if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
- ext3_error (sb, "ext3_new_inode",
-Index: linux-2.4.19.SuSE/fs/ext3/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:20:17 2003
-+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:23:20 2003
-@@ -2168,7 +2168,7 @@
- if (IS_ERR(handle))
- goto out_truncate;
-
-- new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
-+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0);
- if (IS_ERR(new_inode)) {
- ext3_debug("truncate inode %lu directly (no new inodes)\n",
- old_inode->i_ino);
-Index: linux-2.4.19.SuSE/fs/ext3/ioctl.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/ioctl.c Fri Nov 9 14:25:04 2001
-+++ linux-2.4.19.SuSE/fs/ext3/ioctl.c Sun Nov 16 01:23:20 2003
-@@ -23,6 +23,31 @@
- ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
-
- switch (cmd) {
-+ case EXT3_IOC_CREATE_INUM: {
-+ char name[32];
-+ struct dentry *dchild, *dparent;
-+ int rc = 0;
-+
-+ dparent = list_entry(inode->i_dentry.next, struct dentry,
-+ d_alias);
-+ snprintf(name, sizeof name, "%lu", arg);
-+ dchild = lookup_one_len(name, dparent, strlen(name));
-+ if (dchild->d_inode) {
-+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
-+ dparent->d_name.len, dparent->d_name.name, arg,
-+ dchild->d_inode->i_ino);
-+ rc = -EEXIST;
-+ } else {
-+ dchild->d_fsdata = (void *)arg;
-+ rc = vfs_create(inode, dchild, 0644);
-+ if (rc)
-+ printk(KERN_ERR "vfs_create: %d\n", rc);
-+ else if (dchild->d_inode->i_ino != arg)
-+ rc = -EEXIST;
-+ }
-+ dput(dchild);
-+ return rc;
-+ }
- case EXT3_IOC_GETFLAGS:
- flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
- return put_user(flags, (int *) arg);
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:20:17 2003
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003
-@@ -202,6 +202,7 @@
- #define EXT3_IOC_SETFLAGS _IOW('f', 2, long)
- #define EXT3_IOC_GETVERSION _IOR('f', 3, long)
- #define EXT3_IOC_SETVERSION _IOW('f', 4, long)
-+/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
- #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
- #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
- #ifdef CONFIG_JBD_DEBUG
-@@ -674,7 +675,8 @@
- dx_hash_info *hinfo);
-
- /* ialloc.c */
--extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
-+extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int,
-+ unsigned long);
- extern void ext3_free_inode (handle_t *, struct inode *);
- extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
- extern unsigned long ext3_count_free_inodes (struct super_block *);
-@@ -765,4 +767,5 @@
-
- #endif /* __KERNEL__ */
-
-+#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
- #endif /* _LINUX_EXT3_FS_H */
-Index: linux-2.4.19.SuSE/include/linux/dcache.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Sat Nov 15 17:35:46 2003
-+++ linux-2.4.19.SuSE/include/linux/dcache.h Sun Nov 16 01:23:20 2003
-@@ -62,6 +62,11 @@
-
- #define IS_ROOT(x) ((x) == (x)->d_parent)
-
-+struct dentry_params {
-+ unsigned long p_inum;
-+ void *p_ptr;
-+};
-+
- /*
- * "quick string" -- eases parameter passing, but more importantly
- * saves "metadata" about the string (ie length and the hash).
+++ /dev/null
-
-
-
- fs/inode.c | 21 ++++++++++++++-------
- fs/smbfs/inode.c | 2 +-
- fs/super.c | 4 ++--
- include/linux/fs.h | 2 +-
- 4 files changed, 18 insertions(+), 11 deletions(-)
-
-Index: linux.mcp2/fs/inode.c
-===================================================================
---- linux.mcp2.orig/fs/inode.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/inode.c 2004-05-05 14:31:31.000000000 -0700
-@@ -553,7 +553,8 @@
- /*
- * Invalidate all inodes for a device.
- */
--static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
-+static int invalidate_list(struct list_head *head, struct super_block * sb,
-+ struct list_head * dispose, int show)
- {
- struct list_head *next;
- int busy = 0, count = 0;
-@@ -578,6 +579,11 @@
- count++;
- continue;
- }
-+ if (show)
-+ printk(KERN_ERR
-+ "inode busy: dev %s:%lu (%p) mode %o count %u\n",
-+ kdevname(sb->s_dev), inode->i_ino, inode,
-+ inode->i_mode, atomic_read(&inode->i_count));
- busy = 1;
- }
- /* only unused inodes may be cached with i_count zero */
-@@ -596,22 +602,23 @@
- /**
- * invalidate_inodes - discard the inodes on a device
- * @sb: superblock
-+ * @show: whether we should display any busy inodes found
- *
- * Discard all of the inodes for a given superblock. If the discard
- * fails because there are busy inodes then a non zero value is returned.
- * If the discard is successful all the inodes have been discarded.
- */
-
--int invalidate_inodes(struct super_block * sb)
-+int invalidate_inodes(struct super_block * sb, int show)
- {
- int busy;
- LIST_HEAD(throw_away);
-
- spin_lock(&inode_lock);
-- busy = invalidate_list(&inode_in_use, sb, &throw_away);
-- busy |= invalidate_list(&inode_unused, sb, &throw_away);
-- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
-+ busy = invalidate_list(&inode_in_use, sb, &throw_away, show);
-+ busy |= invalidate_list(&inode_unused, sb, &throw_away, show);
-+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show);
-+ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show);
- spin_unlock(&inode_lock);
-
- dispose_list(&throw_away);
-@@ -637,7 +644,7 @@
- * hold).
- */
- shrink_dcache_sb(sb);
-- res = invalidate_inodes(sb);
-+ res = invalidate_inodes(sb, 0);
- drop_super(sb);
- }
- invalidate_buffers(dev);
-Index: linux.mcp2/fs/super.c
-===================================================================
---- linux.mcp2.orig/fs/super.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/super.c 2004-05-05 14:32:06.000000000 -0700
-@@ -838,7 +838,7 @@
- lock_super(sb);
- lock_kernel();
- sb->s_flags &= ~MS_ACTIVE;
-- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */
-+ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */
- if (sop) {
- if (sop->write_super && sb->s_dirt)
- sop->write_super(sb);
-@@ -847,7 +847,7 @@
- }
-
- /* Forget any remaining inodes */
-- if (invalidate_inodes(sb)) {
-+ if (invalidate_inodes(sb, 1)) {
- printk(KERN_ERR "VFS: Busy inodes after unmount. "
- "Self-destruct in 5 seconds. Have a nice day...\n");
- }
-Index: linux.mcp2/fs/smbfs/inode.c
-===================================================================
---- linux.mcp2.orig/fs/smbfs/inode.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/smbfs/inode.c 2004-05-05 14:31:31.000000000 -0700
-@@ -166,7 +166,7 @@
- {
- VERBOSE("\n");
- shrink_dcache_sb(SB_of(server));
-- invalidate_inodes(SB_of(server));
-+ invalidate_inodes(SB_of(server), 0);
- }
-
- /*
-Index: linux.mcp2/include/linux/fs.h
-===================================================================
---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:31:06.000000000 -0700
-+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:31:31.000000000 -0700
-@@ -1283,7 +1283,7 @@
- extern void set_buffer_flushtime(struct buffer_head *);
- extern void balance_dirty(void);
- extern int check_disk_change(kdev_t);
--extern int invalidate_inodes(struct super_block *);
-+extern int invalidate_inodes(struct super_block *, int);
- extern int invalidate_device(kdev_t, int);
- extern void invalidate_inode_pages(struct inode *);
- extern void invalidate_inode_pages2(struct address_space *);
+++ /dev/null
- fs/Makefile | 2 +-
- fs/inode.c | 4 +++-
- mm/page_alloc.c | 1 +
- 3 files changed, 5 insertions(+), 2 deletions(-)
-
-Index: linux-ion/fs/inode.c
-===================================================================
---- linux-ion.orig/fs/inode.c 2004-09-27 14:58:03.000000000 -0700
-+++ linux-ion/fs/inode.c 2004-09-27 14:58:34.000000000 -0700
-@@ -5,6 +5,7 @@
- */
-
- #include <linux/config.h>
-+#include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/string.h>
- #include <linux/mm.h>
-@@ -66,7 +67,8 @@
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
--static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
-+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
-+EXPORT_SYMBOL(inode_lock);
-
- /*
- * Statistics gathering..
-Index: linux-ion/fs/Makefile
-===================================================================
---- linux-ion.orig/fs/Makefile 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/fs/Makefile 2004-09-27 14:59:37.000000000 -0700
-@@ -7,7 +7,7 @@
-
- O_TARGET := fs.o
-
--export-objs := filesystems.o open.o dcache.o buffer.o
-+export-objs := filesystems.o open.o dcache.o buffer.o inode.o
- mod-subdirs := nls
-
- obj-y := open.o read_write.o devices.o file_table.o buffer.o \
-Index: linux-ion/mm/page_alloc.c
-===================================================================
---- linux-ion.orig/mm/page_alloc.c 2004-07-28 14:34:57.000000000 -0700
-+++ linux-ion/mm/page_alloc.c 2004-09-27 14:58:34.000000000 -0700
-@@ -28,6 +28,7 @@
- LIST_HEAD(inactive_list);
- LIST_HEAD(active_list);
- pg_data_t *pgdat_list;
-+EXPORT_SYMBOL(pgdat_list);
-
- /* Used to look up the address of the struct zone encoded in page->zone */
- zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
+++ /dev/null
- fs/Makefile | 2 +-
- fs/inode.c | 4 +++-
- mm/page_alloc.c | 1 +
- 3 files changed, 5 insertions(+), 2 deletions(-)
-
-Index: linux-2.4.19.SuSE/fs/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/inode.c Sat Nov 15 18:02:13 2003
-+++ linux-2.4.19.SuSE/fs/inode.c Sat Nov 15 18:03:04 2003
-@@ -5,6 +5,7 @@
- */
-
- #include <linux/config.h>
-+#include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/string.h>
- #include <linux/mm.h>
-@@ -67,7 +68,8 @@
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
--static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
-+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
-+EXPORT_SYMBOL(inode_lock);
-
- /*
- * Statistics gathering..
-Index: linux-2.4.19.SuSE/fs/Makefile
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/Makefile Mon Jan 27 05:08:56 2003
-+++ linux-2.4.19.SuSE/fs/Makefile Sat Nov 15 18:03:54 2003
-@@ -7,7 +7,7 @@
-
- O_TARGET := fs.o
-
--export-objs := filesystems.o open.o dcache.o buffer.o
-+export-objs := filesystems.o open.o dcache.o buffer.o inode.o
- mod-subdirs := nls
-
- obj-y := open.o read_write.o devices.o file_table.o buffer.o \
-Index: linux-2.4.19.SuSE/mm/page_alloc.c
-===================================================================
---- linux-2.4.19.SuSE.orig/mm/page_alloc.c Mon Jan 27 05:08:55 2003
-+++ linux-2.4.19.SuSE/mm/page_alloc.c Sat Nov 15 18:03:04 2003
-@@ -32,6 +32,7 @@
- LIST_HEAD(inactive_list);
- LIST_HEAD(active_list);
- pg_data_t *pgdat_list;
-+EXPORT_SYMBOL(pgdat_list);
-
- /* Used to look up the address of the struct zone encoded in page->zone */
- zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
+++ /dev/null
- Documentation/filesystems/ext2.txt | 16 ++
- fs/ext3/Makefile | 2
- fs/ext3/inode.c | 4
- fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++
- fs/ext3/iopen.h | 13 +
- fs/ext3/namei.c | 13 +
- fs/ext3/super.c | 11 +
- include/linux/ext3_fs.h | 2
- 8 files changed, 318 insertions(+), 2 deletions(-)
-
-Index: linux-2.4.19/Documentation/filesystems/ext2.txt
-===================================================================
---- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400
-+++ linux-2.4.19/Documentation/filesystems/ext2.txt 2004-04-23 22:37:48.000000000 -0400
-@@ -35,6 +35,22 @@
-
- sb=n Use alternate superblock at this location.
-
-+iopen Makes an invisible pseudo-directory called
-+ __iopen__ available in the root directory
-+ of the filesystem. Allows open-by-inode-
-+ number. i.e., inode 3145 can be accessed
-+ via /mntpt/__iopen__/3145
-+
-+iopen_nopriv This option makes the iopen directory be
-+ world-readable. This may be safer since it
-+ allows daemons to run as an unprivileged user,
-+ however it significantly changes the security
-+ model of a Unix filesystem, since previously
-+ all files under a mode 700 directory were not
-+ generally avilable even if the
-+ permissions on the file itself is
-+ world-readable.
-+
- grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
-
-
-Index: linux.mcp2/fs/ext3/Makefile
-===================================================================
---- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:20:52.000000000 -0700
-+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:21:55.000000000 -0700
-@@ -11,7 +11,7 @@
-
- export-objs := ext3-exports.o
-
--obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
- obj-m := $(O_TARGET)
-
-Index: linux.mcp2/fs/ext3/inode.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/inode.c 2004-05-17 15:20:59.000000000 -0700
-+++ linux.mcp2/fs/ext3/inode.c 2004-05-17 15:21:55.000000000 -0700
-@@ -31,6 +31,7 @@
- #include <linux/highuid.h>
- #include <linux/quotaops.h>
- #include <linux/module.h>
-+#include "iopen.h"
-
- /*
- * SEARCH_FROM_ZERO forces each block allocation to search from the start
-@@ -2125,6 +2126,9 @@
- struct buffer_head *bh;
- int block;
-
-+ if (ext3_iopen_get_inode(inode))
-+ return;
-+
- if(ext3_get_inode_loc(inode, &iloc))
- goto bad_inode;
- bh = iloc.bh;
-Index: linux.mcp2/fs/ext3/iopen.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700
-+++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700
-@@ -0,0 +1,285 @@
-+/*
-+ * linux/fs/ext3/iopen.c
-+ *
-+ * Special support for open by inode number
-+ *
-+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
-+ *
-+ * This file may be redistributed under the terms of the GNU General
-+ * Public License.
-+ *
-+ *
-+ * Invariants:
-+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
-+ * for an inode at one time.
-+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
-+ * aliases on an inode at the same time.
-+ *
-+ * If we have any connected dentry aliases for an inode, use one of those
-+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED
-+ * dentry for this inode, which thereafter will be found by the dcache
-+ * when looking up this inode number in __iopen__, so we don't return here
-+ * until it is gone.
-+ *
-+ * If we get an inode via a regular name lookup, then we "rename" the
-+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures
-+ * existing users of the disconnected dentry will continue to use the same
-+ * dentry as the connected users, and there will never be both kinds of
-+ * dentry aliases at one time.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/locks.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/smp_lock.h>
-+#include "iopen.h"
-+
-+#ifndef assert
-+#define assert(test) J_ASSERT(test)
-+#endif
-+
-+#define IOPEN_NAME_LEN 32
-+
-+/*
-+ * This implements looking up an inode by number.
-+ */
-+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
-+{
-+ struct inode *inode;
-+ unsigned long ino;
-+ struct list_head *lp;
-+ struct dentry *alternate;
-+ char buf[IOPEN_NAME_LEN];
-+
-+ if (dentry->d_name.len >= IOPEN_NAME_LEN)
-+ return ERR_PTR(-ENAMETOOLONG);
-+
-+ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
-+ buf[dentry->d_name.len] = 0;
-+
-+ if (strcmp(buf, ".") == 0)
-+ ino = dir->i_ino;
-+ else if (strcmp(buf, "..") == 0)
-+ ino = EXT3_ROOT_INO;
-+ else
-+ ino = simple_strtoul(buf, 0, 0);
-+
-+ if ((ino != EXT3_ROOT_INO &&
-+ //ino != EXT3_ACL_IDX_INO &&
-+ //ino != EXT3_ACL_DATA_INO &&
-+ ino < EXT3_FIRST_INO(dir->i_sb)) ||
-+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
-+ return ERR_PTR(-ENOENT);
-+
-+ inode = iget(dir->i_sb, ino);
-+ if (!inode)
-+ return ERR_PTR(-EACCES);
-+ if (is_bad_inode(inode)) {
-+ iput(inode);
-+ return ERR_PTR(-ENOENT);
-+ }
-+
-+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
-+
-+ /* preferrably return a connected dentry */
-+ spin_lock(&dcache_lock);
-+ list_for_each(lp, &inode->i_dentry) {
-+ alternate = list_entry(lp, struct dentry, d_alias);
-+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
-+ }
-+
-+ if (!list_empty(&inode->i_dentry)) {
-+ alternate = list_entry(inode->i_dentry.next,
-+ struct dentry, d_alias);
-+ dget_locked(alternate);
-+ alternate->d_vfs_flags |= DCACHE_REFERENCED;
-+ iput(inode);
-+ spin_unlock(&dcache_lock);
-+ return alternate;
-+ }
-+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
-+
-+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
-+ dentry->d_inode = inode;
-+
-+ __d_rehash(dentry, 0); /* d_rehash */
-+ spin_unlock(&dcache_lock);
-+
-+ return NULL;
-+}
-+
-+#define do_switch(x,y) do { \
-+ __typeof__ (x) __tmp = x; \
-+ x = y; y = __tmp; } while (0)
-+
-+static inline void switch_names(struct dentry *dentry, struct dentry *target)
-+{
-+ const unsigned char *old_name, *new_name;
-+
-+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN);
-+ old_name = target->d_name.name;
-+ new_name = dentry->d_name.name;
-+ if (old_name == target->d_iname)
-+ old_name = dentry->d_iname;
-+ if (new_name == dentry->d_iname)
-+ new_name = target->d_iname;
-+ target->d_name.name = new_name;
-+ dentry->d_name.name = old_name;
-+}
-+
-+/* This function is spliced into ext3_lookup and does the move of a
-+ * disconnected dentry (if it exists) to a connected dentry.
-+ */
-+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
-+ int rehash)
-+{
-+ struct dentry *tmp, *goal = NULL;
-+ struct list_head *lp;
-+
-+ /* verify this dentry is really new */
-+ assert(dentry->d_inode == NULL);
-+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ if (rehash)
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
-+ assert(list_empty(&dentry->d_subdirs));
-+
-+ spin_lock(&dcache_lock);
-+ if (!inode)
-+ goto do_rehash;
-+
-+ if (!test_opt(inode->i_sb, IOPEN))
-+ goto do_instantiate;
-+
-+ /* preferrably return a connected dentry */
-+ list_for_each(lp, &inode->i_dentry) {
-+ tmp = list_entry(lp, struct dentry, d_alias);
-+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
-+ assert(tmp->d_alias.next == &inode->i_dentry);
-+ assert(tmp->d_alias.prev == &inode->i_dentry);
-+ goal = tmp;
-+ dget_locked(goal);
-+ break;
-+ }
-+ }
-+
-+ if (!goal)
-+ goto do_instantiate;
-+
-+ /* Move the goal to the de hash queue - like d_move() */
-+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
-+ list_del_init(&goal->d_hash);
-+
-+ list_del(&goal->d_child);
-+ list_del(&dentry->d_child);
-+
-+ /* Switch the parents and the names.. */
-+ switch_names(goal, dentry);
-+ do_switch(goal->d_parent, dentry->d_parent);
-+ do_switch(goal->d_name.len, dentry->d_name.len);
-+ do_switch(goal->d_name.hash, dentry->d_name.hash);
-+
-+ /* And add them back to the (new) parent lists */
-+ list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
-+ __d_rehash(goal, 0);
-+ spin_unlock(&dcache_lock);
-+ iput(inode);
-+
-+ return goal;
-+
-+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+do_instantiate:
-+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
-+ dentry->d_inode = inode;
-+do_rehash:
-+ if (rehash)
-+ __d_rehash(dentry, 0); /* d_rehash */
-+ spin_unlock(&dcache_lock);
-+
-+ return NULL;
-+}
-+
-+/*
-+ * These are the special structures for the iopen pseudo directory.
-+ */
-+
-+static struct inode_operations iopen_inode_operations = {
-+ lookup: iopen_lookup, /* BKL held */
-+};
-+
-+static struct file_operations iopen_file_operations = {
-+ read: generic_read_dir,
-+};
-+
-+static int match_dentry(struct dentry *dentry, const char *name)
-+{
-+ int len;
-+
-+ len = strlen(name);
-+ if (dentry->d_name.len != len)
-+ return 0;
-+ if (strncmp(dentry->d_name.name, name, len))
-+ return 0;
-+ return 1;
-+}
-+
-+/*
-+ * This function is spliced into ext3_lookup and returns 1 the file
-+ * name is __iopen__ and dentry has been filled in appropriately.
-+ */
-+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
-+{
-+ struct inode *inode;
-+
-+ if (dir->i_ino != EXT3_ROOT_INO ||
-+ !test_opt(dir->i_sb, IOPEN) ||
-+ !match_dentry(dentry, "__iopen__"))
-+ return 0;
-+
-+ inode = iget(dir->i_sb, EXT3_BAD_INO);
-+
-+ if (!inode)
-+ return 0;
-+ d_add(dentry, inode);
-+ return 1;
-+}
-+
-+/*
-+ * This function is spliced into read_inode; it returns 1 if inode
-+ * number is the one for /__iopen__, in which case the inode is filled
-+ * in appropriately. Otherwise, this fuction returns 0.
-+ */
-+int ext3_iopen_get_inode(struct inode *inode)
-+{
-+ if (inode->i_ino != EXT3_BAD_INO)
-+ return 0;
-+
-+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
-+ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
-+ inode->i_mode |= 0777;
-+ inode->i_uid = 0;
-+ inode->i_gid = 0;
-+ inode->i_nlink = 1;
-+ inode->i_size = 4096;
-+ inode->i_atime = CURRENT_TIME;
-+ inode->i_ctime = CURRENT_TIME;
-+ inode->i_mtime = CURRENT_TIME;
-+ inode->u.ext3_i.i_dtime = 0;
-+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
-+ * (for stat), not the fs block
-+ * size */
-+ inode->i_blocks = 0;
-+ inode->i_version = 1;
-+ inode->i_generation = 0;
-+
-+ inode->i_op = &iopen_inode_operations;
-+ inode->i_fop = &iopen_file_operations;
-+ inode->i_mapping->a_ops = 0;
-+
-+ return 1;
-+}
-Index: linux.mcp2/fs/ext3/iopen.h
-===================================================================
---- linux.mcp2.orig/fs/ext3/iopen.h 2002-04-11 07:25:15.000000000 -0700
-+++ linux.mcp2/fs/ext3/iopen.h 2004-05-17 15:21:55.000000000 -0700
-@@ -0,0 +1,15 @@
-+/*
-+ * iopen.h
-+ *
-+ * Special support for opening files by inode number.
-+ *
-+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
-+ *
-+ * This file may be redistributed under the terms of the GNU General
-+ * Public License.
-+ */
-+
-+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
-+extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
-+ struct inode *inode, int rehash);
-Index: linux.mcp2/fs/ext3/namei.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:20:59.000000000 -0700
-+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:21:55.000000000 -0700
-@@ -35,7 +35,7 @@
- #include <linux/string.h>
- #include <linux/locks.h>
- #include <linux/quotaops.h>
--
-+#include "iopen.h"
-
- /*
- * define how far ahead to read directories while searching them.
-@@ -931,6 +931,9 @@
- if (dentry->d_name.len > EXT3_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
-+ if (ext3_check_for_iopen(dir, dentry))
-+ return NULL;
-+
- bh = ext3_find_entry(dentry, &de);
- inode = NULL;
- if (bh) {
-@@ -942,8 +945,8 @@
- return ERR_PTR(-EACCES);
- }
- }
-- d_add(dentry, inode);
-- return NULL;
-+
-+ return iopen_connect_dentry(dentry, inode, 1);
- }
-
- #define S_SHIFT 12
-@@ -1932,10 +1935,6 @@
- inode->i_nlink);
- inode->i_version = ++event;
- inode->i_nlink = 0;
-- /* There's no need to set i_disksize: the fact that i_nlink is
-- * zero will ensure that the right thing happens during any
-- * recovery. */
-- inode->i_size = 0;
- ext3_orphan_add(handle, inode);
- ext3_mark_inode_dirty(handle, inode);
- dir->i_nlink--;
-@@ -2054,6 +2053,23 @@
- return err;
- }
-
-+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
-+static int ext3_add_link(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
-+{
-+ int err = ext3_add_entry(handle, dentry, inode);
-+ if (!err) {
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ if (err == 0) {
-+ dput(iopen_connect_dentry(dentry, inode, 0));
-+ return 0;
-+ }
-+ }
-+ ext3_dec_count(handle, inode);
-+ iput(inode);
-+ return err;
-+}
-+
- static int ext3_link (struct dentry * old_dentry,
- struct inode * dir, struct dentry *dentry)
- {
-@@ -2081,7 +2097,8 @@
- ext3_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
-
-- err = ext3_add_nondir(handle, dentry, inode);
-+ err = ext3_add_link(handle, dentry, inode);
-+ ext3_orphan_del(handle, inode);
- ext3_journal_stop(handle, dir);
- return err;
- }
-Index: linux.mcp2/fs/ext3/super.c
-===================================================================
---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:20:59.000000000 -0700
-+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:21:55.000000000 -0700
-@@ -836,6 +836,18 @@
- || !strcmp (this_char, "quota")
- || !strcmp (this_char, "usrquota"))
- /* Don't do anything ;-) */ ;
-+ else if (!strcmp (this_char, "iopen")) {
-+ set_opt (sbi->s_mount_opt, IOPEN);
-+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "noiopen")) {
-+ clear_opt (sbi->s_mount_opt, IOPEN);
-+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "iopen_nopriv")) {
-+ set_opt (sbi->s_mount_opt, IOPEN);
-+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
- else if (!strcmp (this_char, "journal")) {
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
-Index: linux.mcp2/include/linux/ext3_fs.h
-===================================================================
---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 15:20:59.000000000 -0700
-+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:21:55.000000000 -0700
-@@ -323,6 +323,8 @@
- #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
- #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
- #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
-+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
-+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
+++ /dev/null
- Documentation/filesystems/ext2.txt | 16 ++
- fs/ext3/Makefile | 2
- fs/ext3/inode.c | 4
- fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++
- fs/ext3/iopen.h | 13 +
- fs/ext3/namei.c | 13 +
- fs/ext3/super.c | 11 +
- include/linux/ext3_fs.h | 2
- 8 files changed, 318 insertions(+), 2 deletions(-)
-
-Index: linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt
-===================================================================
---- linux-2.4.19.SuSE.orig/Documentation/filesystems/ext2.txt Wed Jul 11 15:44:45 2001
-+++ linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt Sun Nov 16 01:27:31 2003
-@@ -35,6 +35,22 @@
-
- sb=n Use alternate superblock at this location.
-
-+iopen Makes an invisible pseudo-directory called
-+ __iopen__ available in the root directory
-+ of the filesystem. Allows open-by-inode-
-+ number. i.e., inode 3145 can be accessed
-+ via /mntpt/__iopen__/3145
-+
-+iopen_nopriv This option makes the iopen directory be
-+ world-readable. This may be safer since it
-+ allows daemons to run as an unprivileged user,
-+ however it significantly changes the security
-+ model of a Unix filesystem, since previously
-+ all files under a mode 700 directory were not
-+ generally avilable even if the
-+ permissions on the file itself is
-+ world-readable.
-+
- grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
-
-
-Index: linux-2.4.19.SuSE/fs/ext3/Makefile
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile Sun Nov 16 00:40:59 2003
-+++ linux-2.4.19.SuSE/fs/ext3/Makefile Sun Nov 16 01:27:31 2003
-@@ -11,7 +11,7 @@
-
- export-objs := ext3-exports.o
-
--obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
- obj-m := $(O_TARGET)
-
-Index: linux-2.4.19.SuSE/fs/ext3/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:26:04 2003
-+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:27:31 2003
-@@ -34,6 +34,7 @@
- #include <linux/highuid.h>
- #include <linux/quotaops.h>
- #include <linux/module.h>
-+#include "iopen.h"
-
- /*
- * SEARCH_FROM_ZERO forces each block allocation to search from the start
-@@ -2350,6 +2351,9 @@
- struct buffer_head *bh;
- int block;
-
-+ if (ext3_iopen_get_inode(inode))
-+ return;
-+
- if(ext3_get_inode_loc(inode, &iloc))
- goto bad_inode;
- bh = iloc.bh;
-Index: lum/fs/ext3/iopen.c
-===================================================================
---- lum.orig/fs/ext3/iopen.c 2004-03-09 16:46:37.000000000 -0700
-+++ lum/fs/ext3/iopen.c 2004-03-09 16:48:03.000000000 -0700
-@@ -0,0 +1,285 @@
-+/*
-+ * linux/fs/ext3/iopen.c
-+ *
-+ * Special support for open by inode number
-+ *
-+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
-+ *
-+ * This file may be redistributed under the terms of the GNU General
-+ * Public License.
-+ *
-+ *
-+ * Invariants:
-+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
-+ * for an inode at one time.
-+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
-+ * aliases on an inode at the same time.
-+ *
-+ * If we have any connected dentry aliases for an inode, use one of those
-+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED
-+ * dentry for this inode, which thereafter will be found by the dcache
-+ * when looking up this inode number in __iopen__, so we don't return here
-+ * until it is gone.
-+ *
-+ * If we get an inode via a regular name lookup, then we "rename" the
-+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures
-+ * existing users of the disconnected dentry will continue to use the same
-+ * dentry as the connected users, and there will never be both kinds of
-+ * dentry aliases at one time.
-+ */
-+
-+#include <linux/sched.h>
-+#include <linux/fs.h>
-+#include <linux/locks.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/smp_lock.h>
-+#include "iopen.h"
-+
-+#ifndef assert
-+#define assert(test) J_ASSERT(test)
-+#endif
-+
-+#define IOPEN_NAME_LEN 32
-+
-+/*
-+ * This implements looking up an inode by number.
-+ */
-+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
-+{
-+ struct inode *inode;
-+ unsigned long ino;
-+ struct list_head *lp;
-+ struct dentry *alternate;
-+ char buf[IOPEN_NAME_LEN];
-+
-+ if (dentry->d_name.len >= IOPEN_NAME_LEN)
-+ return ERR_PTR(-ENAMETOOLONG);
-+
-+ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
-+ buf[dentry->d_name.len] = 0;
-+
-+ if (strcmp(buf, ".") == 0)
-+ ino = dir->i_ino;
-+ else if (strcmp(buf, "..") == 0)
-+ ino = EXT3_ROOT_INO;
-+ else
-+ ino = simple_strtoul(buf, 0, 0);
-+
-+ if ((ino != EXT3_ROOT_INO &&
-+ //ino != EXT3_ACL_IDX_INO &&
-+ //ino != EXT3_ACL_DATA_INO &&
-+ ino < EXT3_FIRST_INO(dir->i_sb)) ||
-+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
-+ return ERR_PTR(-ENOENT);
-+
-+ inode = iget(dir->i_sb, ino);
-+ if (!inode)
-+ return ERR_PTR(-EACCES);
-+ if (is_bad_inode(inode)) {
-+ iput(inode);
-+ return ERR_PTR(-ENOENT);
-+ }
-+
-+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
-+
-+ /* preferrably return a connected dentry */
-+ spin_lock(&dcache_lock);
-+ list_for_each(lp, &inode->i_dentry) {
-+ alternate = list_entry(lp, struct dentry, d_alias);
-+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
-+ }
-+
-+ if (!list_empty(&inode->i_dentry)) {
-+ alternate = list_entry(inode->i_dentry.next,
-+ struct dentry, d_alias);
-+ dget_locked(alternate);
-+ alternate->d_vfs_flags |= DCACHE_REFERENCED;
-+ iput(inode);
-+ spin_unlock(&dcache_lock);
-+ return alternate;
-+ }
-+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
-+
-+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
-+ dentry->d_inode = inode;
-+
-+ __d_rehash(dentry, 0); /* d_rehash */
-+ spin_unlock(&dcache_lock);
-+
-+ return NULL;
-+}
-+
-+#define do_switch(x,y) do { \
-+ __typeof__ (x) __tmp = x; \
-+ x = y; y = __tmp; } while (0)
-+
-+static inline void switch_names(struct dentry *dentry, struct dentry *target)
-+{
-+ const unsigned char *old_name, *new_name;
-+
-+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN);
-+ old_name = target->d_name.name;
-+ new_name = dentry->d_name.name;
-+ if (old_name == target->d_iname)
-+ old_name = dentry->d_iname;
-+ if (new_name == dentry->d_iname)
-+ new_name = target->d_iname;
-+ target->d_name.name = new_name;
-+ dentry->d_name.name = old_name;
-+}
-+
-+/* This function is spliced into ext3_lookup and does the move of a
-+ * disconnected dentry (if it exists) to a connected dentry.
-+ */
-+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
-+ int rehash)
-+{
-+ struct dentry *tmp, *goal = NULL;
-+ struct list_head *lp;
-+
-+ /* verify this dentry is really new */
-+ assert(dentry->d_inode == NULL);
-+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ if (rehash)
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
-+ assert(list_empty(&dentry->d_subdirs));
-+
-+ spin_lock(&dcache_lock);
-+ if (!inode)
-+ goto do_rehash;
-+
-+ if (!test_opt(inode->i_sb, IOPEN))
-+ goto do_instantiate;
-+
-+ /* preferrably return a connected dentry */
-+ list_for_each(lp, &inode->i_dentry) {
-+ tmp = list_entry(lp, struct dentry, d_alias);
-+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
-+ assert(tmp->d_alias.next == &inode->i_dentry);
-+ assert(tmp->d_alias.prev == &inode->i_dentry);
-+ goal = tmp;
-+ dget_locked(goal);
-+ break;
-+ }
-+ }
-+
-+ if (!goal)
-+ goto do_instantiate;
-+
-+ /* Move the goal to the de hash queue - like d_move() */
-+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
-+ list_del_init(&goal->d_hash);
-+
-+ list_del(&goal->d_child);
-+ list_del(&dentry->d_child);
-+
-+ /* Switch the parents and the names.. */
-+ switch_names(goal, dentry);
-+ do_switch(goal->d_parent, dentry->d_parent);
-+ do_switch(goal->d_name.len, dentry->d_name.len);
-+ do_switch(goal->d_name.hash, dentry->d_name.hash);
-+
-+ /* And add them back to the (new) parent lists */
-+ list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
-+ __d_rehash(goal, 0);
-+ spin_unlock(&dcache_lock);
-+ iput(inode);
-+
-+ return goal;
-+
-+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+do_instantiate:
-+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
-+ dentry->d_inode = inode;
-+do_rehash:
-+ if (rehash)
-+ __d_rehash(dentry, 0); /* d_rehash */
-+ spin_unlock(&dcache_lock);
-+
-+ return NULL;
-+}
-+
-+/*
-+ * These are the special structures for the iopen pseudo directory.
-+ */
-+
-+static struct inode_operations iopen_inode_operations = {
-+ lookup: iopen_lookup, /* BKL held */
-+};
-+
-+static struct file_operations iopen_file_operations = {
-+ read: generic_read_dir,
-+};
-+
-+static int match_dentry(struct dentry *dentry, const char *name)
-+{
-+ int len;
-+
-+ len = strlen(name);
-+ if (dentry->d_name.len != len)
-+ return 0;
-+ if (strncmp(dentry->d_name.name, name, len))
-+ return 0;
-+ return 1;
-+}
-+
-+/*
-+ * This function is spliced into ext3_lookup and returns 1 the file
-+ * name is __iopen__ and dentry has been filled in appropriately.
-+ */
-+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
-+{
-+ struct inode *inode;
-+
-+ if (dir->i_ino != EXT3_ROOT_INO ||
-+ !test_opt(dir->i_sb, IOPEN) ||
-+ !match_dentry(dentry, "__iopen__"))
-+ return 0;
-+
-+ inode = iget(dir->i_sb, EXT3_BAD_INO);
-+
-+ if (!inode)
-+ return 0;
-+ d_add(dentry, inode);
-+ return 1;
-+}
-+
-+/*
-+ * This function is spliced into read_inode; it returns 1 if inode
-+ * number is the one for /__iopen__, in which case the inode is filled
-+ * in appropriately. Otherwise, this fuction returns 0.
-+ */
-+int ext3_iopen_get_inode(struct inode *inode)
-+{
-+ if (inode->i_ino != EXT3_BAD_INO)
-+ return 0;
-+
-+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
-+ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
-+ inode->i_mode |= 0777;
-+ inode->i_uid = 0;
-+ inode->i_gid = 0;
-+ inode->i_nlink = 1;
-+ inode->i_size = 4096;
-+ inode->i_atime = CURRENT_TIME;
-+ inode->i_ctime = CURRENT_TIME;
-+ inode->i_mtime = CURRENT_TIME;
-+ inode->u.ext3_i.i_dtime = 0;
-+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
-+ * (for stat), not the fs block
-+ * size */
-+ inode->i_blocks = 0;
-+ inode->i_version = 1;
-+ inode->i_generation = 0;
-+
-+ inode->i_op = &iopen_inode_operations;
-+ inode->i_fop = &iopen_file_operations;
-+ inode->i_mapping->a_ops = 0;
-+
-+ return 1;
-+}
-Index: lum/fs/ext3/iopen.h
-===================================================================
---- lum.orig/fs/ext3/iopen.h 2004-03-09 16:46:37.000000000 -0700
-+++ lum/fs/ext3/iopen.h 2004-03-09 16:48:03.000000000 -0700
-@@ -0,0 +1,15 @@
-+/*
-+ * iopen.h
-+ *
-+ * Special support for opening files by inode number.
-+ *
-+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
-+ *
-+ * This file may be redistributed under the terms of the GNU General
-+ * Public License.
-+ */
-+
-+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
-+extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
-+ struct inode *inode, int rehash);
-Index: linux-2.4.19.SuSE/fs/ext3/namei.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003
-+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:27:31 2003
-@@ -36,7 +36,7 @@
- #include <linux/string.h>
- #include <linux/locks.h>
- #include <linux/quotaops.h>
--
-+#include "iopen.h"
-
- /*
- * define how far ahead to read directories while searching them.
-@@ -926,6 +927,9 @@
- if (dentry->d_name.len > EXT3_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
-+ if (ext3_check_for_iopen(dir, dentry))
-+ return NULL;
-+
- bh = ext3_find_entry(dentry, &de);
- inode = NULL;
- if (bh) {
-@@ -943,8 +948,8 @@
- return ERR_PTR(-EACCES);
- }
- }
-- d_add(dentry, inode);
-- return NULL;
-+
-+ return iopen_connect_dentry(dentry, inode, 1);
- }
-
- #define S_SHIFT 12
-@@ -1932,10 +1935,6 @@
- inode->i_nlink);
- inode->i_version = ++event;
- inode->i_nlink = 0;
-- /* There's no need to set i_disksize: the fact that i_nlink is
-- * zero will ensure that the right thing happens during any
-- * recovery. */
-- inode->i_size = 0;
- ext3_orphan_add(handle, inode);
- dir->i_nlink--;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-@@ -2086,6 +2085,23 @@
- return err;
- }
-
-+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
-+static int ext3_add_link(handle_t *handle, struct dentry *dentry,
-+ struct inode *inode)
-+{
-+ int err = ext3_add_entry(handle, dentry, inode);
-+ if (!err) {
-+ err = ext3_mark_inode_dirty(handle, inode);
-+ if (err == 0) {
-+ dput(iopen_connect_dentry(dentry, inode, 0));
-+ return 0;
-+ }
-+ }
-+ ext3_dec_count(handle, inode);
-+ iput(inode);
-+ return err;
-+}
-+
- static int ext3_link (struct dentry * old_dentry,
- struct inode * dir, struct dentry *dentry)
- {
-@@ -2113,7 +2129,8 @@
- ext3_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
-
-- err = ext3_add_nondir(handle, dentry, inode);
-+ err = ext3_add_link(handle, dentry, inode);
-+ ext3_orphan_del(handle, inode);
- ext3_journal_stop(handle, dir);
- return err;
- }
-Index: linux-2.4.19.SuSE/fs/ext3/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003
-+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:27:31 2003
-@@ -864,6 +864,18 @@
- || !strcmp (this_char, "quota")
- || !strcmp (this_char, "usrquota"))
- /* Don't do anything ;-) */ ;
-+ else if (!strcmp (this_char, "iopen")) {
-+ set_opt (sbi->s_mount_opt, IOPEN);
-+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "noiopen")) {
-+ clear_opt (sbi->s_mount_opt, IOPEN);
-+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "iopen_nopriv")) {
-+ set_opt (sbi->s_mount_opt, IOPEN);
-+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
- else if (!strcmp (this_char, "journal")) {
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003
-+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:30:05 2003
-@@ -324,6 +324,8 @@
- #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
- #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
- #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
-+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
-+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
+++ /dev/null
-Index: linux-2.4.19.SuSE/include/linux/jbd.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/jbd.h Sun Nov 16 13:51:03 2003
-+++ linux-2.4.19.SuSE/include/linux/jbd.h Sun Nov 16 15:10:48 2003
-@@ -283,6 +283,13 @@
- return bh->b_private;
- }
-
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+struct journal_callback {
-+ struct list_head jcb_list;
-+ void (*jcb_func)(struct journal_callback *jcb, int error);
-+ /* user data goes here */
-+};
-+
- struct jbd_revoke_table_s;
-
- /* The handle_t type represents a single atomic update being performed
-@@ -313,6 +320,12 @@
- operations */
- int h_err;
-
-+ /* List of application registered callbacks for this handle.
-+ * The function(s) will be called after the transaction that
-+ * this handle is part of has been committed to disk.
-+ */
-+ struct list_head h_jcb;
-+
- /* Flags */
- unsigned int h_sync: 1; /* sync-on-close */
- unsigned int h_jdata: 1; /* force data journaling */
-@@ -432,6 +445,10 @@
-
- /* How many handles used this transaction? */
- int t_handle_count;
-+
-+ /* List of registered callback functions for this transaction.
-+ * Called when the transaction is committed. */
-+ struct list_head t_jcb;
- };
-
-
-@@ -676,6 +693,9 @@
- extern int journal_try_to_free_buffers(journal_t *, struct page *, int);
- extern int journal_stop(handle_t *);
- extern int journal_flush (journal_t *);
-+extern void journal_callback_set(handle_t *handle,
-+ void (*fn)(struct journal_callback *,int),
-+ struct journal_callback *jcb);
-
- extern void journal_lock_updates (journal_t *);
- extern void journal_unlock_updates (journal_t *);
-Index: linux-2.4.19.SuSE/fs/jbd/checkpoint.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/jbd/checkpoint.c Mon Feb 25 11:38:08 2002
-+++ linux-2.4.19.SuSE/fs/jbd/checkpoint.c Sun Nov 16 15:10:48 2003
-@@ -594,7 +594,8 @@
- J_ASSERT (transaction->t_log_list == NULL);
- J_ASSERT (transaction->t_checkpoint_list == NULL);
- J_ASSERT (transaction->t_updates == 0);
--
-+ J_ASSERT (list_empty(&transaction->t_jcb));
-+
- J_ASSERT (transaction->t_journal->j_committing_transaction !=
- transaction);
-
-Index: linux-2.4.19.SuSE/fs/jbd/commit.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/jbd/commit.c Mon Jan 27 05:08:04 2003
-+++ linux-2.4.19.SuSE/fs/jbd/commit.c Sun Nov 16 15:13:53 2003
-@@ -485,7 +485,7 @@
- transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
-
-- Wait for the transactions in reverse order. That way we are
-+ Wait for the buffers in reverse order. That way we are
- less likely to be woken up until all IOs have completed, and
- so we incur less scheduling load.
- */
-@@ -576,8 +576,10 @@
-
- jbd_debug(3, "JBD: commit phase 6\n");
-
-- if (is_journal_aborted(journal))
-+ if (is_journal_aborted(journal)) {
-+ unlock_journal(journal);
- goto skip_commit;
-+ }
-
- /* Done it all: now write the commit record. We should have
- * cleaned up our previous buffers by now, so if we are in abort
-@@ -587,9 +589,10 @@
- descriptor = journal_get_descriptor_buffer(journal);
- if (!descriptor) {
- __journal_abort_hard(journal);
-+ unlock_journal(journal);
- goto skip_commit;
- }
--
-+
- /* AKPM: buglet - add `i' to tmp! */
- for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
- journal_header_t *tmp =
-@@ -610,14 +614,32 @@
- put_bh(bh); /* One for getblk() */
- journal_unlock_journal_head(descriptor);
- }
-- lock_journal(journal);
-
- /* End of a transaction! Finally, we can do checkpoint
- processing: any buffers committed as a result of this
- transaction can be removed from any checkpoint list it was on
- before. */
-
--skip_commit:
-+skip_commit: /* The journal should be unlocked by now. */
-+
-+ /* Call any callbacks that had been registered for handles in this
-+ * transaction. It is up to the callback to free any allocated
-+ * memory.
-+ */
-+ if (!list_empty(&commit_transaction->t_jcb)) {
-+ struct list_head *p, *n;
-+ int error = is_journal_aborted(journal);
-+
-+ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+ struct journal_callback *jcb;
-+
-+ jcb = list_entry(p, struct journal_callback, jcb_list);
-+ list_del(p);
-+ jcb->jcb_func(jcb, error);
-+ }
-+ }
-+
-+ lock_journal(journal);
-
- jbd_debug(3, "JBD: commit phase 7\n");
-
-Index: linux-2.4.19.SuSE/fs/jbd/journal.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/jbd/journal.c Mon Jan 27 05:08:00 2003
-+++ linux-2.4.19.SuSE/fs/jbd/journal.c Sun Nov 16 15:10:48 2003
-@@ -59,6 +59,7 @@
- #endif
- EXPORT_SYMBOL(journal_flush);
- EXPORT_SYMBOL(journal_revoke);
-+EXPORT_SYMBOL(journal_callback_set);
-
- EXPORT_SYMBOL(journal_init_dev);
- EXPORT_SYMBOL(journal_init_inode);
-Index: linux-2.4.19.SuSE/fs/jbd/transaction.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:45:26 2003
-+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 15:15:34 2003
-@@ -58,6 +58,7 @@
- transaction->t_state = T_RUNNING;
- transaction->t_tid = journal->j_transaction_sequence++;
- transaction->t_expires = jiffies + bdflush_interval();
-+ INIT_LIST_HEAD(&transaction->t_jcb);
-
- /* Set up the commit timer for the new transaction. */
- J_ASSERT (!journal->j_commit_timer_active);
-@@ -91,7 +92,14 @@
- transaction_t *transaction;
- int needed;
- int nblocks = handle->h_buffer_credits;
--
-+
-+ if (nblocks > journal->j_max_transaction_buffers) {
-+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n",
-+ current->comm, nblocks,
-+ journal->j_max_transaction_buffers);
-+ return -ENOSPC;
-+ }
-+
- jbd_debug(3, "New handle %p going live.\n", handle);
-
- repeat:
-@@ -202,6 +210,20 @@
- return 0;
- }
-
-+/* Allocate a new handle. This should probably be in a slab... */
-+static handle_t *new_handle(int nblocks)
-+{
-+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+ if (!handle)
-+ return NULL;
-+ memset(handle, 0, sizeof (handle_t));
-+ handle->h_buffer_credits = nblocks;
-+ handle->h_ref = 1;
-+ INIT_LIST_HEAD(&handle->h_jcb);
-+
-+ return handle;
-+}
-+
- /*
- * Obtain a new handle.
- *
-@@ -228,14 +250,11 @@
- handle->h_ref++;
- return handle;
- }
--
-- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+ handle = new_handle(nblocks);
- if (!handle)
- return ERR_PTR(-ENOMEM);
-- memset (handle, 0, sizeof (handle_t));
-
-- handle->h_buffer_credits = nblocks;
-- handle->h_ref = 1;
- current->journal_info = handle;
-
- err = start_this_handle(journal, handle);
-@@ -334,14 +353,11 @@
-
- if (is_journal_aborted(journal))
- return ERR_PTR(-EIO);
--
-- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+ handle = new_handle(nblocks);
- if (!handle)
- return ERR_PTR(-ENOMEM);
-- memset (handle, 0, sizeof (handle_t));
-
-- handle->h_buffer_credits = nblocks;
-- handle->h_ref = 1;
- current->journal_info = handle;
-
- err = try_start_this_handle(journal, handle);
-@@ -1321,6 +1337,28 @@
- #endif
-
- /*
-+ * Register a callback function for this handle. The function will be
-+ * called when the transaction that this handle is part of has been
-+ * committed to disk with the original callback data struct and the
-+ * error status of the journal as parameters. There is no guarantee of
-+ * ordering between handles within a single transaction, nor between
-+ * callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This is to allow the caller to add as much extra data to the callback
-+ * as needed, but reduce the overhead of multiple allocations. The caller
-+ * allocated struct must start with a struct journal_callback at offset 0,
-+ * and has the caller-specific data afterwards.
-+ */
-+void journal_callback_set(handle_t *handle,
-+ void (*func)(struct journal_callback *jcb, int error),
-+ struct journal_callback *jcb)
-+{
-+ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
-+ jcb->jcb_func = func;
-+}
-+
-+/*
- * All done for a particular handle.
- *
- * There is not much action needed here. We just return any remaining
-@@ -1385,7 +1423,10 @@
- wake_up(&journal->j_wait_transaction_locked);
- }
-
-- /*
-+ /* Move callbacks from the handle to the transaction. */
-+ list_splice(&handle->h_jcb, &transaction->t_jcb);
-+
-+ /*
- * If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
+++ /dev/null
-Index: linux-2.4.19-pre1/include/linux/jbd.h
-===================================================================
---- linux-2.4.19-pre1.orig/include/linux/jbd.h 2003-11-21 03:00:11.000000000 +0300
-+++ linux-2.4.19-pre1/include/linux/jbd.h 2003-11-21 03:04:47.000000000 +0300
-@@ -275,6 +275,13 @@
- return bh->b_private;
- }
-
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+struct journal_callback {
-+ struct list_head jcb_list;
-+ void (*jcb_func)(struct journal_callback *jcb, int error);
-+ /* user data goes here */
-+};
-+
- struct jbd_revoke_table_s;
-
- /* The handle_t type represents a single atomic update being performed
-@@ -305,6 +312,12 @@
- operations */
- int h_err;
-
-+ /* List of application registered callbacks for this handle.
-+ * The function(s) will be called after the transaction that
-+ * this handle is part of has been committed to disk.
-+ */
-+ struct list_head h_jcb;
-+
- /* Flags */
- unsigned int h_sync: 1; /* sync-on-close */
- unsigned int h_jdata: 1; /* force data journaling */
-@@ -424,6 +437,10 @@
-
- /* How many handles used this transaction? */
- int t_handle_count;
-+
-+ /* List of registered callback functions for this transaction.
-+ * Called when the transaction is committed. */
-+ struct list_head t_jcb;
- };
-
-
-@@ -672,6 +689,9 @@
- extern int journal_try_to_free_buffers(journal_t *, struct page *, int);
- extern int journal_stop(handle_t *);
- extern int journal_flush (journal_t *);
-+extern void journal_callback_set(handle_t *handle,
-+ void (*fn)(struct journal_callback *,int),
-+ struct journal_callback *jcb);
-
- extern void journal_lock_updates (journal_t *);
- extern void journal_unlock_updates (journal_t *);
-Index: linux-2.4.19-pre1/fs/jbd/checkpoint.c
-===================================================================
---- linux-2.4.19-pre1.orig/fs/jbd/checkpoint.c 2003-11-21 02:53:20.000000000 +0300
-+++ linux-2.4.19-pre1/fs/jbd/checkpoint.c 2003-11-21 03:04:47.000000000 +0300
-@@ -601,7 +601,8 @@
- J_ASSERT (transaction->t_log_list == NULL);
- J_ASSERT (transaction->t_checkpoint_list == NULL);
- J_ASSERT (transaction->t_updates == 0);
--
-+ J_ASSERT (list_empty(&transaction->t_jcb));
-+
- J_ASSERT (transaction->t_journal->j_committing_transaction !=
- transaction);
-
-Index: linux-2.4.19-pre1/fs/jbd/commit.c
-===================================================================
---- linux-2.4.19-pre1.orig/fs/jbd/commit.c 2003-11-21 02:53:20.000000000 +0300
-+++ linux-2.4.19-pre1/fs/jbd/commit.c 2003-11-21 03:04:47.000000000 +0300
-@@ -480,7 +480,7 @@
- transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
-
-- Wait for the transactions in reverse order. That way we are
-+ Wait for the buffers in reverse order. That way we are
- less likely to be woken up until all IOs have completed, and
- so we incur less scheduling load.
- */
-@@ -571,8 +571,10 @@
-
- jbd_debug(3, "JBD: commit phase 6\n");
-
-- if (is_journal_aborted(journal))
-+ if (is_journal_aborted(journal)) {
-+ unlock_journal(journal);
- goto skip_commit;
-+ }
-
- /* Done it all: now write the commit record. We should have
- * cleaned up our previous buffers by now, so if we are in abort
-@@ -582,9 +584,10 @@
- descriptor = journal_get_descriptor_buffer(journal);
- if (!descriptor) {
- __journal_abort_hard(journal);
-+ unlock_journal(journal);
- goto skip_commit;
- }
--
-+
- /* AKPM: buglet - add `i' to tmp! */
- for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
- journal_header_t *tmp =
-@@ -605,14 +608,32 @@
- put_bh(bh); /* One for getblk() */
- journal_unlock_journal_head(descriptor);
- }
-- lock_journal(journal);
-
- /* End of a transaction! Finally, we can do checkpoint
- processing: any buffers committed as a result of this
- transaction can be removed from any checkpoint list it was on
- before. */
-
--skip_commit:
-+skip_commit: /* The journal should be unlocked by now. */
-+
-+ /* Call any callbacks that had been registered for handles in this
-+ * transaction. It is up to the callback to free any allocated
-+ * memory.
-+ */
-+ if (!list_empty(&commit_transaction->t_jcb)) {
-+ struct list_head *p, *n;
-+ int error = is_journal_aborted(journal);
-+
-+ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+ struct journal_callback *jcb;
-+
-+ jcb = list_entry(p, struct journal_callback, jcb_list);
-+ list_del(p);
-+ jcb->jcb_func(jcb, error);
-+ }
-+ }
-+
-+ lock_journal(journal);
-
- jbd_debug(3, "JBD: commit phase 7\n");
-
-Index: linux-2.4.19-pre1/fs/jbd/journal.c
-===================================================================
---- linux-2.4.19-pre1.orig/fs/jbd/journal.c 2003-11-21 02:53:20.000000000 +0300
-+++ linux-2.4.19-pre1/fs/jbd/journal.c 2003-11-21 03:04:47.000000000 +0300
-@@ -58,6 +58,7 @@
- #endif
- EXPORT_SYMBOL(journal_flush);
- EXPORT_SYMBOL(journal_revoke);
-+EXPORT_SYMBOL(journal_callback_set);
-
- EXPORT_SYMBOL(journal_init_dev);
- EXPORT_SYMBOL(journal_init_inode);
-Index: linux-2.4.19-pre1/fs/jbd/transaction.c
-===================================================================
---- linux-2.4.19-pre1.orig/fs/jbd/transaction.c 2003-11-21 02:53:20.000000000 +0300
-+++ linux-2.4.19-pre1/fs/jbd/transaction.c 2003-11-21 03:05:14.000000000 +0300
-@@ -57,6 +57,7 @@
- transaction->t_state = T_RUNNING;
- transaction->t_tid = journal->j_transaction_sequence++;
- transaction->t_expires = jiffies + journal->j_commit_interval;
-+ INIT_LIST_HEAD(&transaction->t_jcb);
-
- /* Set up the commit timer for the new transaction. */
- J_ASSERT (!journal->j_commit_timer_active);
-@@ -90,7 +91,14 @@
- transaction_t *transaction;
- int needed;
- int nblocks = handle->h_buffer_credits;
--
-+
-+ if (nblocks > journal->j_max_transaction_buffers) {
-+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n",
-+ current->comm, nblocks,
-+ journal->j_max_transaction_buffers);
-+ return -ENOSPC;
-+ }
-+
- jbd_debug(3, "New handle %p going live.\n", handle);
-
- repeat:
-@@ -196,6 +204,20 @@
- return 0;
- }
-
-+/* Allocate a new handle. This should probably be in a slab... */
-+static handle_t *new_handle(int nblocks)
-+{
-+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+ if (!handle)
-+ return NULL;
-+ memset(handle, 0, sizeof (handle_t));
-+ handle->h_buffer_credits = nblocks;
-+ handle->h_ref = 1;
-+ INIT_LIST_HEAD(&handle->h_jcb);
-+
-+ return handle;
-+}
-+
- /*
- * Obtain a new handle.
- *
-@@ -222,14 +244,11 @@
- handle->h_ref++;
- return handle;
- }
--
-- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+ handle = new_handle(nblocks);
- if (!handle)
- return ERR_PTR(-ENOMEM);
-- memset (handle, 0, sizeof (handle_t));
-
-- handle->h_buffer_credits = nblocks;
-- handle->h_ref = 1;
- current->journal_info = handle;
-
- err = start_this_handle(journal, handle);
-@@ -328,14 +347,11 @@
-
- if (is_journal_aborted(journal))
- return ERR_PTR(-EIO);
--
-- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+ handle = new_handle(nblocks);
- if (!handle)
- return ERR_PTR(-ENOMEM);
-- memset (handle, 0, sizeof (handle_t));
-
-- handle->h_buffer_credits = nblocks;
-- handle->h_ref = 1;
- current->journal_info = handle;
-
- err = try_start_this_handle(journal, handle);
-@@ -1324,6 +1340,28 @@
- #endif
-
- /*
-+ * Register a callback function for this handle. The function will be
-+ * called when the transaction that this handle is part of has been
-+ * committed to disk with the original callback data struct and the
-+ * error status of the journal as parameters. There is no guarantee of
-+ * ordering between handles within a single transaction, nor between
-+ * callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This is to allow the caller to add as much extra data to the callback
-+ * as needed, but reduce the overhead of multiple allocations. The caller
-+ * allocated struct must start with a struct journal_callback at offset 0,
-+ * and has the caller-specific data afterwards.
-+ */
-+void journal_callback_set(handle_t *handle,
-+ void (*func)(struct journal_callback *jcb, int error),
-+ struct journal_callback *jcb)
-+{
-+ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
-+ jcb->jcb_func = func;
-+}
-+
-+/*
- * All done for a particular handle.
- *
- * There is not much action needed here. We just return any remaining
-@@ -1389,7 +1427,10 @@
- wake_up(&journal->j_wait_transaction_locked);
- }
-
-- /*
-+ /* Move callbacks from the handle to the transaction. */
-+ list_splice(&handle->h_jcb, &transaction->t_jcb);
-+
-+ /*
- * If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
+++ /dev/null
-Index: linux-2.4.19.SuSE/fs/jbd/transaction.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:38:25 2003
-+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 01:44:26 2003
-@@ -1094,7 +1094,6 @@
-
- spin_lock(&journal_datalist_lock);
- set_bit(BH_JBDDirty, &bh->b_state);
-- set_buffer_flushtime(bh);
-
- J_ASSERT_JH(jh, jh->b_transaction != NULL);
-
-@@ -1995,6 +1994,13 @@
- spin_unlock(&journal_datalist_lock);
- }
-
-+static void jbd_refile_buffer(struct buffer_head *bh)
-+{
-+ if (buffer_dirty(bh) && (bh->b_list != BUF_DIRTY))
-+ set_buffer_flushtime(bh);
-+ refile_buffer(bh);
-+}
-+
- /*
- * Remove a buffer from its current buffer list in preparation for
- * dropping it from its current transaction entirely. If the buffer has
-@@ -2022,7 +2028,7 @@
- J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
- } else {
- /* Onto BUF_DIRTY for writeback */
-- refile_buffer(jh2bh(jh));
-+ jbd_refile_buffer(jh2bh(jh));
- }
- }
-
+++ /dev/null
-Index: linux-bgl/arch/arm/vmlinux-armo.lds.in
-===================================================================
---- linux-bgl.orig/arch/arm/vmlinux-armo.lds.in 2003-07-02 08:44:12.000000000 -0700
-+++ linux-bgl/arch/arm/vmlinux-armo.lds.in 2004-10-26 22:52:50.037677957 -0700
-@@ -62,6 +62,10 @@
- *(__ksymtab)
- __stop___ksymtab = .;
-
-+ __start___kallsyms = .; /* All kernel symbols */
-+ *(__kallsyms)
-+ __stop___kallsyms = .;
-+
- *(.got) /* Global offset table */
-
- _etext = .; /* End of text section */
-Index: linux-bgl/arch/arm/vmlinux-armv.lds.in
-===================================================================
---- linux-bgl.orig/arch/arm/vmlinux-armv.lds.in 2003-07-02 08:44:12.000000000 -0700
-+++ linux-bgl/arch/arm/vmlinux-armv.lds.in 2004-10-26 22:52:50.038677801 -0700
-@@ -67,6 +67,12 @@
- __stop___ksymtab = .;
- }
-
-+ __kallsyms : { /* Kernel debugging table */
-+ __start___kallsyms = .; /* All kernel symbols */
-+ *(__kallsyms)
-+ __stop___kallsyms = .;
-+ }
-+
- . = ALIGN(8192);
-
- .data : {
-Index: linux-bgl/arch/ppc/config.in
-===================================================================
---- linux-bgl.orig/arch/ppc/config.in 2004-10-04 09:55:49.000000000 -0700
-+++ linux-bgl/arch/ppc/config.in 2004-10-26 23:11:56.416643929 -0700
-@@ -732,6 +732,7 @@
- string 'Additional compile arguments' CONFIG_COMPILE_OPTIONS "-g -ggdb"
- fi
- fi
-+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
-
- if [ "$CONFIG_ALL_PPC" = "y" ]; then
- bool 'Support for early boot text console (BootX or OpenFirmware only)' CONFIG_BOOTX_TEXT
-Index: linux-bgl/arch/ppc/vmlinux.lds
-===================================================================
---- linux-bgl.orig/arch/ppc/vmlinux.lds 2003-07-02 08:43:30.000000000 -0700
-+++ linux-bgl/arch/ppc/vmlinux.lds 2004-10-26 22:52:50.043677020 -0700
-@@ -73,6 +73,10 @@
- __ksymtab : { *(__ksymtab) }
- __stop___ksymtab = .;
-
-+ __start___kallsyms = .; /* All kernel symbols */
-+ __kallsyms : { *(__kallsyms) }
-+ __stop___kallsyms = .;
-+
- __start___ftr_fixup = .;
- __ftr_fixup : { *(__ftr_fixup) }
- __stop___ftr_fixup = .;
-Index: linux-bgl/arch/i386/config.in
-===================================================================
---- linux-bgl.orig/arch/i386/config.in 2003-07-02 08:43:46.000000000 -0700
-+++ linux-bgl/arch/i386/config.in 2004-10-26 22:52:50.040677488 -0700
-@@ -363,6 +363,7 @@
- if [ "$CONFIG_ISDN" != "n" ]; then
- source drivers/isdn/Config.in
- fi
-+ bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
- fi
- endmenu
-
-Index: linux-bgl/arch/i386/vmlinux.lds
-===================================================================
---- linux-bgl.orig/arch/i386/vmlinux.lds 2003-07-02 08:44:32.000000000 -0700
-+++ linux-bgl/arch/i386/vmlinux.lds 2004-10-26 22:52:50.040677488 -0700
-@@ -27,6 +27,9 @@
- __start___ksymtab = .; /* Kernel symbol table */
- __ksymtab : { *(__ksymtab) }
- __stop___ksymtab = .;
-+ __start___kallsyms = .; /* All kernel symbols */
-+ __kallsyms : { *(__kallsyms) }
-+ __stop___kallsyms = .;
-
- .data : { /* Data */
- *(.data)
-Index: linux-bgl/arch/ia64/config.in
-===================================================================
---- linux-bgl.orig/arch/ia64/config.in 2003-07-02 08:44:12.000000000 -0700
-+++ linux-bgl/arch/ia64/config.in 2004-10-26 22:52:50.055675147 -0700
-@@ -278,4 +278,6 @@
- bool ' Turn on irq debug checks (slow!)' CONFIG_IA64_DEBUG_IRQ
- fi
-
-+bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
-+
- endmenu
-Index: linux-bgl/arch/alpha/vmlinux.lds.in
-===================================================================
---- linux-bgl.orig/arch/alpha/vmlinux.lds.in 2003-07-02 08:43:45.000000000 -0700
-+++ linux-bgl/arch/alpha/vmlinux.lds.in 2004-10-26 22:52:50.036678113 -0700
-@@ -28,6 +28,10 @@
- __stop___ksymtab = .;
- .kstrtab : { *(.kstrtab) }
-
-+ __start___kallsyms = .; /* All kernel symbols */
-+ __kallsyms : { *(__kallsyms) }
-+ __stop___kallsyms = .;
-+
- /* Startup code */
- . = ALIGN(8192);
- __init_begin = .;
-Index: linux-bgl/Makefile
-===================================================================
---- linux-bgl.orig/Makefile 2004-10-04 09:55:49.000000000 -0700
-+++ linux-bgl/Makefile 2004-10-26 22:54:44.018588371 -0700
-@@ -38,10 +38,13 @@
- MAKEFILES = $(TOPDIR)/.config
- GENKSYMS = /sbin/genksyms
- DEPMOD = /sbin/depmod
-+KALLSYMS = /sbin/kallsyms
- MODFLAGS = -DMODULE
- CFLAGS_KERNEL =
- PERL = perl
-
-+TMPPREFIX =
-+
- export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \
- CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \
- CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL
-@@ -198,7 +201,7 @@
- CLEAN_FILES = \
- kernel/ksyms.lst include/linux/compile.h \
- vmlinux System.map \
-- .tmp* \
-+ $(TMPPREFIX).tmp* \
- drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \
- drivers/char/conmakehash \
- drivers/char/drm/*-mod.c \
-@@ -278,16 +281,39 @@
- boot: vmlinux
- @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot
-
-+LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \
-+ --start-group \
-+ $(CORE_FILES) \
-+ $(DRIVERS) \
-+ $(NETWORKS) \
-+ $(LIBS) \
-+ --end-group
-+ifeq ($(CONFIG_KALLSYMS),y)
-+LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o
-+else
-+LD_VMLINUX_KALLSYMS :=
-+endif
-+
- vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs
-- $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \
-- --start-group \
-- $(CORE_FILES) \
-- $(DRIVERS) \
-- $(NETWORKS) \
-- $(LIBS) \
-- --end-group \
-- -o vmlinux
-+ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms
-+
-+.PHONY: kallsyms
-+
-+kallsyms:
-+ifeq ($(CONFIG_KALLSYMS),y)
-+ @echo kallsyms pass 1
-+ $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1
-+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o
-+ @echo kallsyms pass 2
-+ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2
-+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o
-+ @echo kallsyms pass 3
-+ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3
-+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o
-+endif
-+ $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o vmlinux
- $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map
-+ @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms*
-
- symlinks:
- rm -f include/asm
-Index: linux-bgl/kernel/Makefile
-===================================================================
---- linux-bgl.orig/kernel/Makefile 2003-07-02 08:44:29.000000000 -0700
-+++ linux-bgl/kernel/Makefile 2004-10-26 22:59:34.101037916 -0700
-@@ -19,6 +19,7 @@
- obj-$(CONFIG_UID16) += uid16.o
- obj-$(CONFIG_MODULES) += ksyms.o
- obj-$(CONFIG_PM) += pm.o
-+obj-$(CONFIG_KALLSYMS) += kallsyms.o
-
- ifneq ($(CONFIG_IA64),y)
- # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-Index: linux-bgl/kernel/ksyms.c
-===================================================================
---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 21:49:59.922431839 -0700
-+++ linux-bgl/kernel/ksyms.c 2004-10-26 22:52:50.050675927 -0700
-@@ -56,6 +56,9 @@
- #ifdef CONFIG_KMOD
- #include <linux/kmod.h>
- #endif
-+#ifdef CONFIG_KALLSYMS
-+#include <linux/kallsyms.h>
-+#endif
-
- extern void set_device_ro(kdev_t dev,int flag);
-
-@@ -81,6 +84,15 @@
- EXPORT_SYMBOL(inter_module_put);
- EXPORT_SYMBOL(try_inc_mod_count);
-
-+#ifdef CONFIG_KALLSYMS
-+extern const char __start___kallsyms[];
-+extern const char __stop___kallsyms[];
-+EXPORT_SYMBOL(__start___kallsyms);
-+EXPORT_SYMBOL(__stop___kallsyms);
-+
-+
-+#endif
-+
- /* process memory management */
- EXPORT_SYMBOL(do_mmap_pgoff);
- EXPORT_SYMBOL(do_munmap);
-Index: linux-bgl/kernel/kallsyms.c
-===================================================================
---- linux-bgl.orig/kernel/kallsyms.c 2004-10-26 17:10:51.404753448 -0700
-+++ linux-bgl/kernel/kallsyms.c 2004-10-26 22:52:50.048676240 -0700
-@@ -0,0 +1,306 @@
-+/* An example of using kallsyms data in a kernel debugger.
-+
-+ Copyright 2000 Keith Owens <kaos@ocs.com.au> April 2000
-+
-+ This file is part of the Linux modutils.
-+
-+ This program is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by the
-+ Free Software Foundation; either version 2 of the License, or (at your
-+ option) any later version.
-+
-+ This program is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with this program; if not, write to the Free Software Foundation,
-+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-+ */
-+
-+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $"
-+
-+/*
-+ This code uses the list of all kernel and module symbols to :-
-+
-+ * Find any non-stack symbol in a kernel or module. Symbols do
-+ not have to be exported for debugging.
-+
-+ * Convert an address to the module (or kernel) that owns it, the
-+ section it is in and the nearest symbol. This finds all non-stack
-+ symbols, not just exported ones.
-+
-+ You need modutils >= 2.3.11 and a kernel with the kallsyms patch
-+ which was compiled with CONFIG_KALLSYMS.
-+ */
-+
-+#include <linux/elf.h>
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/kallsyms.h>
-+
-+/* These external symbols are only set on kernels compiled with
-+ * CONFIG_KALLSYMS.
-+ */
-+
-+extern const char __start___kallsyms[];
-+extern const char __stop___kallsyms[];
-+
-+static struct module **kallsyms_module_list;
-+
-+static void kallsyms_get_module_list(void)
-+{
-+ const struct kallsyms_header *ka_hdr;
-+ const struct kallsyms_section *ka_sec;
-+ const struct kallsyms_symbol *ka_sym;
-+ const char *ka_str;
-+ int i;
-+ const char *p;
-+
-+ if (__start___kallsyms >= __stop___kallsyms)
-+ return;
-+ ka_hdr = (struct kallsyms_header *)__start___kallsyms;
-+ ka_sec = (struct kallsyms_section *)
-+ ((char *)(ka_hdr) + ka_hdr->section_off);
-+ ka_sym = (struct kallsyms_symbol *)
-+ ((char *)(ka_hdr) + ka_hdr->symbol_off);
-+ ka_str =
-+ ((char *)(ka_hdr) + ka_hdr->string_off);
-+
-+ for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) {
-+ p = ka_str + ka_sym->name_off;
-+ if (strcmp(p, "module_list") == 0) {
-+ if (ka_sym->symbol_addr)
-+ kallsyms_module_list = (struct module **)(ka_sym->symbol_addr);
-+ break;
-+ }
-+ }
-+}
-+
-+static inline void kallsyms_do_first_time(void)
-+{
-+ static int first_time = 1;
-+ if (first_time)
-+ kallsyms_get_module_list();
-+ first_time = 0;
-+}
-+
-+/* A symbol can appear in more than one module. A token is used to
-+ * restart the scan at the next module, set the token to 0 for the
-+ * first scan of each symbol.
-+ */
-+
-+int kallsyms_symbol_to_address(
-+ const char *name, /* Name to lookup */
-+ unsigned long *token, /* Which module to start at */
-+ const char **mod_name, /* Set to module name */
-+ unsigned long *mod_start, /* Set to start address of module */
-+ unsigned long *mod_end, /* Set to end address of module */
-+ const char **sec_name, /* Set to section name */
-+ unsigned long *sec_start, /* Set to start address of section */
-+ unsigned long *sec_end, /* Set to end address of section */
-+ const char **sym_name, /* Set to full symbol name */
-+ unsigned long *sym_start, /* Set to start address of symbol */
-+ unsigned long *sym_end /* Set to end address of symbol */
-+ )
-+{
-+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
-+ const struct kallsyms_section *ka_sec;
-+ const struct kallsyms_symbol *ka_sym = NULL;
-+ const char *ka_str = NULL;
-+ const struct module *m;
-+ int i = 0, l;
-+ const char *p, *pt_R;
-+ char *p2;
-+
-+ kallsyms_do_first_time();
-+ if (!kallsyms_module_list)
-+ return(0);
-+
-+ /* Restart? */
-+ m = *kallsyms_module_list;
-+ if (token && *token) {
-+ for (; m; m = m->next)
-+ if ((unsigned long)m == *token)
-+ break;
-+ if (m)
-+ m = m->next;
-+ }
-+
-+ for (; m; m = m->next) {
-+ if (!mod_member_present(m, kallsyms_start) ||
-+ !mod_member_present(m, kallsyms_end) ||
-+ m->kallsyms_start >= m->kallsyms_end)
-+ continue;
-+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
-+ ka_sym = (struct kallsyms_symbol *)
-+ ((char *)(ka_hdr) + ka_hdr->symbol_off);
-+ ka_str =
-+ ((char *)(ka_hdr) + ka_hdr->string_off);
-+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) {
-+ p = ka_str + ka_sym->name_off;
-+ if (strcmp(p, name) == 0)
-+ break;
-+ /* Unversioned requests match versioned names */
-+ if (!(pt_R = strstr(p, "_R")))
-+ continue;
-+ l = strlen(pt_R);
-+ if (l < 10)
-+ continue; /* Not _R.*xxxxxxxx */
-+ (void)simple_strtoul(pt_R+l-8, &p2, 16);
-+ if (*p2)
-+ continue; /* Not _R.*xxxxxxxx */
-+ if (strncmp(p, name, pt_R-p) == 0)
-+ break; /* Match with version */
-+ }
-+ if (i < ka_hdr->symbols)
-+ break;
-+ }
-+
-+ if (token)
-+ *token = (unsigned long)m;
-+ if (!m)
-+ return(0); /* not found */
-+
-+ ka_sec = (const struct kallsyms_section *)
-+ ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off);
-+ *mod_name = *(m->name) ? m->name : "kernel";
-+ *mod_start = ka_hdr->start;
-+ *mod_end = ka_hdr->end;
-+ *sec_name = ka_sec->name_off + ka_str;
-+ *sec_start = ka_sec->start;
-+ *sec_end = ka_sec->start + ka_sec->size;
-+ *sym_name = ka_sym->name_off + ka_str;
-+ *sym_start = ka_sym->symbol_addr;
-+ if (i < ka_hdr->symbols-1) {
-+ const struct kallsyms_symbol *ka_symn = ka_sym;
-+ kallsyms_next_sym(ka_hdr, ka_symn);
-+ *sym_end = ka_symn->symbol_addr;
-+ }
-+ else
-+ *sym_end = *sec_end;
-+ return(1);
-+}
-+
-+int kallsyms_address_to_symbol(
-+ unsigned long address, /* Address to lookup */
-+ const char **mod_name, /* Set to module name */
-+ unsigned long *mod_start, /* Set to start address of module */
-+ unsigned long *mod_end, /* Set to end address of module */
-+ const char **sec_name, /* Set to section name */
-+ unsigned long *sec_start, /* Set to start address of section */
-+ unsigned long *sec_end, /* Set to end address of section */
-+ const char **sym_name, /* Set to full symbol name */
-+ unsigned long *sym_start, /* Set to start address of symbol */
-+ unsigned long *sym_end /* Set to end address of symbol */
-+ )
-+{
-+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
-+ const struct kallsyms_section *ka_sec = NULL;
-+ const struct kallsyms_symbol *ka_sym;
-+ const char *ka_str;
-+ const struct module *m;
-+ int i;
-+ unsigned long end;
-+
-+ kallsyms_do_first_time();
-+ if (!kallsyms_module_list)
-+ return(0);
-+
-+ for (m = *kallsyms_module_list; m; m = m->next) {
-+ if (!mod_member_present(m, kallsyms_start) ||
-+ !mod_member_present(m, kallsyms_end) ||
-+ m->kallsyms_start >= m->kallsyms_end)
-+ continue;
-+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
-+ ka_sec = (const struct kallsyms_section *)
-+ ((char *)ka_hdr + ka_hdr->section_off);
-+ /* Is the address in any section in this module? */
-+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) {
-+ if (ka_sec->start <= address &&
-+ (ka_sec->start + ka_sec->size) > address)
-+ break;
-+ }
-+ if (i < ka_hdr->sections)
-+ break; /* Found a matching section */
-+ }
-+
-+ if (!m)
-+ return(0); /* not found */
-+
-+ ka_sym = (struct kallsyms_symbol *)
-+ ((char *)(ka_hdr) + ka_hdr->symbol_off);
-+ ka_str =
-+ ((char *)(ka_hdr) + ka_hdr->string_off);
-+ *mod_name = *(m->name) ? m->name : "kernel";
-+ *mod_start = ka_hdr->start;
-+ *mod_end = ka_hdr->end;
-+ *sec_name = ka_sec->name_off + ka_str;
-+ *sec_start = ka_sec->start;
-+ *sec_end = ka_sec->start + ka_sec->size;
-+ *sym_name = *sec_name; /* In case we find no matching symbol */
-+ *sym_start = *sec_start;
-+ *sym_end = *sec_end;
-+
-+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) {
-+ if (ka_sym->symbol_addr > address)
-+ continue;
-+ if (i < ka_hdr->symbols-1) {
-+ const struct kallsyms_symbol *ka_symn = ka_sym;
-+ kallsyms_next_sym(ka_hdr, ka_symn);
-+ end = ka_symn->symbol_addr;
-+ }
-+ else
-+ end = *sec_end;
-+ if (end <= address)
-+ continue;
-+ if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off
-+ != (char *)ka_sec)
-+ continue; /* wrong section */
-+ *sym_name = ka_str + ka_sym->name_off;
-+ *sym_start = ka_sym->symbol_addr;
-+ *sym_end = end;
-+ break;
-+ }
-+ return(1);
-+}
-+
-+/* List all sections in all modules. The callback routine is invoked with
-+ * token, module name, section name, section start, section end, section flags.
-+ */
-+int kallsyms_sections(void *token,
-+ int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word)))
-+{
-+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */
-+ const struct kallsyms_section *ka_sec = NULL;
-+ const char *ka_str;
-+ const struct module *m;
-+ int i;
-+
-+ kallsyms_do_first_time();
-+ if (!kallsyms_module_list)
-+ return(0);
-+
-+ for (m = *kallsyms_module_list; m; m = m->next) {
-+ if (!mod_member_present(m, kallsyms_start) ||
-+ !mod_member_present(m, kallsyms_end) ||
-+ m->kallsyms_start >= m->kallsyms_end)
-+ continue;
-+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start;
-+ ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off);
-+ ka_str = ((char *)(ka_hdr) + ka_hdr->string_off);
-+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) {
-+ if (callback(
-+ token,
-+ *(m->name) ? m->name : "kernel",
-+ ka_sec->name_off + ka_str,
-+ ka_sec->start,
-+ ka_sec->start + ka_sec->size,
-+ ka_sec->flags))
-+ return(0);
-+ }
-+ }
-+ return(1);
-+}
-Index: linux-bgl/include/linux/kallsyms.h
-===================================================================
---- linux-bgl.orig/include/linux/kallsyms.h 2004-10-26 17:10:51.404753448 -0700
-+++ linux-bgl/include/linux/kallsyms.h 2004-10-26 22:52:50.045676708 -0700
-@@ -0,0 +1,141 @@
-+/* kallsyms headers
-+ Copyright 2000 Keith Owens <kaos@ocs.com.au>
-+
-+ This file is part of the Linux modutils. It is exported to kernel
-+ space so debuggers can access the kallsyms data.
-+
-+ The kallsyms data contains all the non-stack symbols from a kernel
-+ or a module. The kernel symbols are held between __start___kallsyms
-+ and __stop___kallsyms. The symbols for a module are accessed via
-+ the struct module chain which is based at module_list.
-+
-+ This program is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by the
-+ Free Software Foundation; either version 2 of the License, or (at your
-+ option) any later version.
-+
-+ This program is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with this program; if not, write to the Free Software Foundation,
-+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-+ */
-+
-+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $"
-+
-+#ifndef MODUTILS_KALLSYMS_H
-+#define MODUTILS_KALLSYMS_H 1
-+
-+/* Have to (re)define these ElfW entries here because external kallsyms
-+ * code does not have access to modutils/include/obj.h. This code is
-+ * included from user spaces tools (modutils) and kernel, they need
-+ * different includes.
-+ */
-+
-+#ifndef ELFCLASS32
-+#ifdef __KERNEL__
-+#include <linux/elf.h>
-+#else /* __KERNEL__ */
-+#include <elf.h>
-+#endif /* __KERNEL__ */
-+#endif /* ELFCLASS32 */
-+
-+#ifndef ELFCLASSM
-+#define ELFCLASSM ELF_CLASS
-+#endif
-+
-+#ifndef ElfW
-+# if ELFCLASSM == ELFCLASS32
-+# define ElfW(x) Elf32_ ## x
-+# define ELFW(x) ELF32_ ## x
-+# else
-+# define ElfW(x) Elf64_ ## x
-+# define ELFW(x) ELF64_ ## x
-+# endif
-+#endif
-+
-+/* Format of data in the kallsyms section.
-+ * Most of the fields are small numbers but the total size and all
-+ * offsets can be large so use the 32/64 bit types for these fields.
-+ *
-+ * Do not use sizeof() on these structures, modutils may be using extra
-+ * fields. Instead use the size fields in the header to access the
-+ * other bits of data.
-+ */
-+
-+struct kallsyms_header {
-+ int size; /* Size of this header */
-+ ElfW(Word) total_size; /* Total size of kallsyms data */
-+ int sections; /* Number of section entries */
-+ ElfW(Off) section_off; /* Offset to first section entry */
-+ int section_size; /* Size of one section entry */
-+ int symbols; /* Number of symbol entries */
-+ ElfW(Off) symbol_off; /* Offset to first symbol entry */
-+ int symbol_size; /* Size of one symbol entry */
-+ ElfW(Off) string_off; /* Offset to first string */
-+ ElfW(Addr) start; /* Start address of first section */
-+ ElfW(Addr) end; /* End address of last section */
-+};
-+
-+struct kallsyms_section {
-+ ElfW(Addr) start; /* Start address of section */
-+ ElfW(Word) size; /* Size of this section */
-+ ElfW(Off) name_off; /* Offset to section name */
-+ ElfW(Word) flags; /* Flags from section */
-+};
-+
-+struct kallsyms_symbol {
-+ ElfW(Off) section_off; /* Offset to section that owns this symbol */
-+ ElfW(Addr) symbol_addr; /* Address of symbol */
-+ ElfW(Off) name_off; /* Offset to symbol name */
-+};
-+
-+#define KALLSYMS_SEC_NAME "__kallsyms"
-+#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */
-+
-+#define kallsyms_next_sec(h,s) \
-+ ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size))
-+#define kallsyms_next_sym(h,s) \
-+ ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size))
-+
-+int kallsyms_symbol_to_address(
-+ const char *name, /* Name to lookup */
-+ unsigned long *token, /* Which module to start with */
-+ const char **mod_name, /* Set to module name or "kernel" */
-+ unsigned long *mod_start, /* Set to start address of module */
-+ unsigned long *mod_end, /* Set to end address of module */
-+ const char **sec_name, /* Set to section name */
-+ unsigned long *sec_start, /* Set to start address of section */
-+ unsigned long *sec_end, /* Set to end address of section */
-+ const char **sym_name, /* Set to full symbol name */
-+ unsigned long *sym_start, /* Set to start address of symbol */
-+ unsigned long *sym_end /* Set to end address of symbol */
-+ );
-+
-+int kallsyms_address_to_symbol(
-+ unsigned long address, /* Address to lookup */
-+ const char **mod_name, /* Set to module name */
-+ unsigned long *mod_start, /* Set to start address of module */
-+ unsigned long *mod_end, /* Set to end address of module */
-+ const char **sec_name, /* Set to section name */
-+ unsigned long *sec_start, /* Set to start address of section */
-+ unsigned long *sec_end, /* Set to end address of section */
-+ const char **sym_name, /* Set to full symbol name */
-+ unsigned long *sym_start, /* Set to start address of symbol */
-+ unsigned long *sym_end /* Set to end address of symbol */
-+ );
-+
-+int kallsyms_sections(void *token,
-+ int (*callback)(void *, /* token */
-+ const char *, /* module name */
-+ const char *, /* section name */
-+ ElfW(Addr), /* Section start */
-+ ElfW(Addr), /* Section end */
-+ ElfW(Word) /* Section flags */
-+ )
-+ );
-+
-+#endif /* kallsyms.h */
+++ /dev/null
-Index: linux-bgl/arch/i386/kernel/traps.c
-===================================================================
---- linux-bgl.orig/arch/i386/kernel/traps.c 2003-07-02 08:43:23.000000000 -0700
-+++ linux-bgl/arch/i386/kernel/traps.c 2004-10-26 23:25:17.950442396 -0700
-@@ -24,6 +24,7 @@
- #include <linux/spinlock.h>
- #include <linux/interrupt.h>
- #include <linux/highmem.h>
-+#include <linux/version.h>
-
- #ifdef CONFIG_MCA
- #include <linux/mca.h>
-@@ -135,6 +136,8 @@
- {
- int i;
- unsigned long addr;
-+ /* static to not take up stackspace; if we race here too bad */
-+ static char buffer[512];
-
- if (!stack)
- stack = (unsigned long*)&stack;
-@@ -144,9 +147,8 @@
- while (((long) stack & (THREAD_SIZE-1)) != 0) {
- addr = *stack++;
- if (kernel_text_address(addr)) {
-- if (i && ((i % 6) == 0))
-- printk("\n ");
-- printk(" [<%08lx>]", addr);
-+ lookup_symbol(addr, buffer, 512);
-+ printk("[<%08lx>] %s (0x%p)\n", addr,buffer,stack-1);
- i++;
- }
- }
-@@ -186,12 +188,19 @@
- show_trace(esp);
- }
-
-+#ifdef CONFIG_MK7
-+#define ARCHIT "/athlon"
-+#else
-+#define ARCHIT "/i686"
-+#endif
-+
- void show_registers(struct pt_regs *regs)
- {
- int i;
- int in_kernel = 1;
- unsigned long esp;
- unsigned short ss;
-+ static char buffer[512];
-
- esp = (unsigned long) (®s->esp);
- ss = __KERNEL_DS;
-@@ -200,8 +209,12 @@
- esp = regs->esp;
- ss = regs->xss & 0xffff;
- }
-+
-+ print_modules();
-+ lookup_symbol(regs->eip, buffer, 512);
- printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n",
- smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags);
-+ printk("\nEIP is at %s (" UTS_RELEASE ARCHIT ")\n",buffer);
- printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
- regs->eax, regs->ebx, regs->ecx, regs->edx);
- printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
-@@ -261,7 +274,7 @@
- if (__get_user(file, (char **)(eip + 4)) ||
- (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
- file = "<bad filename>";
--
-+ printk("------------[ cut here ]------------\n");
- printk("kernel BUG at %s:%d!\n", file, line);
-
- no_bug:
-Index: linux-bgl/arch/i386/kernel/process.c
-===================================================================
---- linux-bgl.orig/arch/i386/kernel/process.c 2003-07-02 08:44:07.000000000 -0700
-+++ linux-bgl/arch/i386/kernel/process.c 2004-10-26 23:28:53.017015082 -0700
-@@ -33,6 +33,7 @@
- #include <linux/reboot.h>
- #include <linux/init.h>
- #include <linux/mc146818rtc.h>
-+#include <linux/version.h>
-
- #include <asm/uaccess.h>
- #include <asm/pgtable.h>
-@@ -437,10 +438,14 @@
- void show_regs(struct pt_regs * regs)
- {
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
-+ static char buffer[512];
-+
-+ lookup_symbol(regs->eip, buffer, 512);
-
- printk("\n");
- printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
- printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id());
-+ printk("\nEIP is at %s (" UTS_RELEASE ")\n", buffer);
- if (regs->xcs & 3)
- printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
- printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted());
-Index: linux-bgl/arch/ia64/kernel/process.c
-===================================================================
---- linux-bgl.orig/arch/ia64/kernel/process.c 2003-07-02 08:43:26.000000000 -0700
-+++ linux-bgl/arch/ia64/kernel/process.c 2004-10-26 23:29:56.340005959 -0700
-@@ -18,6 +18,7 @@
- #include <linux/smp_lock.h>
- #include <linux/stddef.h>
- #include <linux/unistd.h>
-+#include <linux/version.h>
-
- #include <asm/delay.h>
- #include <asm/efi.h>
-@@ -33,9 +34,10 @@
- #include <asm/sn/idle.h>
- #endif
-
--static void
--do_show_stack (struct unw_frame_info *info, void *arg)
-+void
-+ia64_do_show_stack (struct unw_frame_info *info, void *arg)
- {
-+ static char buffer[512];
- unsigned long ip, sp, bsp;
-
- printk("\nCall Trace: ");
-@@ -46,7 +48,8 @@
-
- unw_get_sp(info, &sp);
- unw_get_bsp(info, &bsp);
-- printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx\n", ip, sp, bsp);
-+ lookup_symbol(ip, buffer, 512);
-+ printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx %s\n", ip, sp, bsp, buffer);
- } while (unw_unwind(info) >= 0);
- }
-
-@@ -56,19 +59,19 @@
- struct unw_frame_info info;
-
- unw_init_from_blocked_task(&info, task);
-- do_show_stack(&info, 0);
-+ ia64_do_show_stack(&info, 0);
- }
-
- void
- show_stack (struct task_struct *task)
- {
- if (!task)
-- unw_init_running(do_show_stack, 0);
-+ unw_init_running(ia64_do_show_stack, 0);
- else {
- struct unw_frame_info info;
-
- unw_init_from_blocked_task(&info, task);
-- do_show_stack(&info, 0);
-+ ia64_do_show_stack(&info, 0);
- }
- }
-
-@@ -76,8 +79,11 @@
- show_regs (struct pt_regs *regs)
- {
- unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
-+ static char buffer[512];
-
- printk("\nPid: %d, comm: %20s\n", current->pid, current->comm);
-+ lookup_symbol(ip, buffer, 512);
-+ printk("EIP is at %s (" UTS_RELEASE ")\n", buffer);
- printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
- regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
- printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
-Index: linux-bgl/arch/s390/config.in
-===================================================================
---- linux-bgl.orig/arch/s390/config.in 2003-07-02 08:43:27.000000000 -0700
-+++ linux-bgl/arch/s390/config.in 2004-10-26 23:25:17.961440685 -0700
-@@ -73,5 +73,6 @@
- # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG
- #fi
- bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
-+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
- endmenu
-
-Index: linux-bgl/arch/s390/kernel/traps.c
-===================================================================
---- linux-bgl.orig/arch/s390/kernel/traps.c 2003-07-02 08:44:02.000000000 -0700
-+++ linux-bgl/arch/s390/kernel/traps.c 2004-10-26 23:25:17.964440218 -0700
-@@ -27,6 +27,7 @@
- #include <linux/init.h>
- #include <linux/delay.h>
- #include <linux/module.h>
-+#include <linux/version.h>
-
- #include <asm/system.h>
- #include <asm/uaccess.h>
-@@ -108,27 +109,26 @@
-
- void show_trace(unsigned long * stack)
- {
-+ static char buffer[512];
- unsigned long backchain, low_addr, high_addr, ret_addr;
- int i;
-
- if (!stack)
- stack = (unsigned long*)&stack;
-
-- printk("Call Trace: ");
- low_addr = ((unsigned long) stack) & PSW_ADDR_MASK;
- high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE;
- /* Skip the first frame (biased stack) */
- backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK;
-- /* Print up to 8 lines */
-- for (i = 0; i < 8; i++) {
-+ /* Print up to 20 lines */
-+ for (i = 0; i < 20; i++) {
- if (backchain < low_addr || backchain >= high_addr)
- break;
- ret_addr = *((unsigned long *) (backchain+56)) & PSW_ADDR_MASK;
- if (!kernel_text_address(ret_addr))
- break;
-- if (i && ((i % 6) == 0))
-- printk("\n ");
-- printk("[<%08lx>] ", ret_addr);
-+ lookup_symbol(ret_addr, buffer, 512);
-+ printk("[<%08lx>] %s (0x%lx)\n", ret_addr,buffer,backchain+56);
- low_addr = backchain;
- backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK;
- }
-@@ -171,6 +171,7 @@
-
- void show_registers(struct pt_regs *regs)
- {
-+ static char buffer[512];
- mm_segment_t old_fs;
- char *mode;
- int i;
-@@ -179,6 +180,10 @@
- printk("%s PSW : %08lx %08lx\n",
- mode, (unsigned long) regs->psw.mask,
- (unsigned long) regs->psw.addr);
-+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
-+ lookup_symbol(regs->psw.addr & 0x7FFFFFFF, buffer, 512);
-+ printk(" %s (" UTS_RELEASE ")\n", buffer);
-+ }
- printk("%s GPRS: %08x %08x %08x %08x\n", mode,
- regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
- printk(" %08x %08x %08x %08x\n",
-Index: linux-bgl/arch/s390x/config.in
-===================================================================
---- linux-bgl.orig/arch/s390x/config.in 2003-07-02 08:43:07.000000000 -0700
-+++ linux-bgl/arch/s390x/config.in 2004-10-26 23:25:17.964440218 -0700
-@@ -75,5 +75,6 @@
- # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG
- #fi
- bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ
-+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS
- endmenu
-
-Index: linux-bgl/arch/s390x/kernel/traps.c
-===================================================================
---- linux-bgl.orig/arch/s390x/kernel/traps.c 2003-07-02 08:43:25.000000000 -0700
-+++ linux-bgl/arch/s390x/kernel/traps.c 2004-10-26 23:25:17.966439907 -0700
-@@ -27,6 +27,7 @@
- #include <linux/init.h>
- #include <linux/delay.h>
- #include <linux/module.h>
-+#include <linux/version.h>
-
- #include <asm/system.h>
- #include <asm/uaccess.h>
-@@ -112,25 +113,25 @@
- {
- unsigned long backchain, low_addr, high_addr, ret_addr;
- int i;
-+ /* static to not take up stackspace; if we race here too bad */
-+ static char buffer[512];
-
- if (!stack)
- stack = (unsigned long*)&stack;
-
-- printk("Call Trace: ");
- low_addr = ((unsigned long) stack) & PSW_ADDR_MASK;
- high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE;
- /* Skip the first frame (biased stack) */
- backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK;
-- /* Print up to 8 lines */
-- for (i = 0; i < 8; i++) {
-+ /* Print up to 20 lines */
-+ for (i = 0; i < 20; i++) {
- if (backchain < low_addr || backchain >= high_addr)
- break;
- ret_addr = *((unsigned long *) (backchain+112)) & PSW_ADDR_MASK;
- if (!kernel_text_address(ret_addr))
- break;
-- if (i && ((i % 3) == 0))
-- printk("\n ");
-- printk("[<%016lx>] ", ret_addr);
-+ lookup_symbol(ret_addr, buffer, 512);
-+ printk("[<%016lx>] %s (0x%lx)\n", ret_addr, buffer, backchain+112);
- low_addr = backchain;
- backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK;
- }
-@@ -173,6 +174,7 @@
-
- void show_registers(struct pt_regs *regs)
- {
-+ static char buffer[512];
- mm_segment_t old_fs;
- char *mode;
- int i;
-@@ -181,6 +183,10 @@
- printk("%s PSW : %016lx %016lx\n",
- mode, (unsigned long) regs->psw.mask,
- (unsigned long) regs->psw.addr);
-+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
-+ lookup_symbol(regs->psw.addr, buffer, 512);
-+ printk(" %s (" UTS_RELEASE ")\n", buffer);
-+ }
- printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
- regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
- printk(" %016lx %016lx %016lx %016lx\n",
-Index: linux-bgl/arch/ppc64/mm/fault.c
-===================================================================
---- linux-bgl.orig/arch/ppc64/mm/fault.c 2003-07-02 08:43:12.000000000 -0700
-+++ linux-bgl/arch/ppc64/mm/fault.c 2004-10-26 23:30:24.467942247 -0700
-@@ -224,7 +224,6 @@
- if (debugger_kernel_faults)
- debugger(regs);
- #endif
-- print_backtrace( (unsigned long *)regs->gpr[1] );
- panic("kernel access of bad area pc %lx lr %lx address %lX tsk %s/%d",
- regs->nip,regs->link,address,current->comm,current->pid);
- }
-Index: linux-bgl/arch/ppc64/kernel/traps.c
-===================================================================
---- linux-bgl.orig/arch/ppc64/kernel/traps.c 2003-07-02 08:44:03.000000000 -0700
-+++ linux-bgl/arch/ppc64/kernel/traps.c 2004-10-26 23:33:45.297572484 -0700
-@@ -89,7 +89,6 @@
- #if defined(CONFIG_KDB)
- kdb(KDB_REASON_OOPS, 0, (kdb_eframe_t) regs);
- #endif
-- print_backtrace((unsigned long *)regs->gpr[1]);
- panic("Exception in kernel pc %lx signal %d",regs->nip,signr);
- #if defined(CONFIG_PPCDBG) && (defined(CONFIG_XMON) || defined(CONFIG_KGDB))
- /* Allow us to catch SIGILLs for 64-bit app/glibc debugging. -Peter */
-@@ -187,7 +186,6 @@
- if (kdb(KDB_REASON_FAULT, 0, regs))
- return ;
- #endif
-- print_backtrace((unsigned long *)regs->gpr[1]);
- panic("machine check");
- }
- _exception(SIGSEGV, regs);
-@@ -209,7 +207,6 @@
- }
- #endif
- show_regs(regs);
-- print_backtrace((unsigned long *)regs->gpr[1]);
- panic("System Management Interrupt");
- }
-
-Index: linux-bgl/arch/ppc64/kernel/process.c
-===================================================================
---- linux-bgl.orig/arch/ppc64/kernel/process.c 2003-07-02 08:44:31.000000000 -0700
-+++ linux-bgl/arch/ppc64/kernel/process.c 2004-10-26 23:33:01.060713583 -0700
-@@ -30,6 +30,8 @@
- #include <linux/user.h>
- #include <linux/elf.h>
- #include <linux/init.h>
-+#include <linux/version.h>
-+#include <linux/module.h>
-
- #include <asm/pgtable.h>
- #include <asm/uaccess.h>
-@@ -130,12 +132,61 @@
- __restore_flags(s);
- }
-
-+/*
-+ * If the address is either in the .text section of the
-+ * kernel, or in the vmalloc'ed module regions, it *may*
-+ * be the address of a calling routine
-+ */
-+
-+#ifdef CONFIG_MODULES
-+
-+extern struct module *module_list;
-+extern struct module kernel_module;
-+extern char _stext[], _etext[];
-+
-+static inline int kernel_text_address(unsigned long addr)
-+{
-+ int retval = 0;
-+ struct module *mod;
-+
-+ if (addr >= (unsigned long) &_stext &&
-+ addr <= (unsigned long) &_etext)
-+ return 1;
-+
-+ for (mod = module_list; mod != &kernel_module; mod = mod->next) {
-+ /* mod_bound tests for addr being inside the vmalloc'ed
-+ * module area. Of course it'd be better to test only
-+ * for the .text subset... */
-+ if (mod_bound(addr, 0, mod)) {
-+ retval = 1;
-+ break;
-+ }
-+ }
-+
-+ return retval;
-+}
-+
-+#else
-+
-+static inline int kernel_text_address(unsigned long addr)
-+{
-+ return (addr >= (unsigned long) &_stext &&
-+ addr <= (unsigned long) &_etext);
-+}
-+
-+#endif
-+
-+
- void show_regs(struct pt_regs * regs)
- {
- int i;
-+ static char buffer[512];
-
-- printk("NIP: %016lX XER: %016lX LR: %016lX REGS: %p TRAP: %04lx %s\n",
-+ print_modules();
-+ printk("NIP: %016lx XER: %016lx LR: %016lx REGS: %p TRAP: %04lx %s\n",
- regs->nip, regs->xer, regs->link, regs,regs->trap, print_tainted());
-+ lookup_symbol(regs->nip, buffer, 512);
-+ printk("NIP is at %s (" UTS_RELEASE ")\n", buffer);
- printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x IR/DR: %01x%01x\n",
- regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0,
- regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 1 : 0,
-@@ -147,27 +198,22 @@
- printk("\nlast math %p ", last_task_used_math);
-
- #ifdef CONFIG_SMP
-- /* printk(" CPU: %d last CPU: %d", current->processor,current->last_processor); */
-+ printk("CPU: %d", smp_processor_id());
- #endif /* CONFIG_SMP */
-
-- printk("\n");
- for (i = 0; i < 32; i++)
- {
- long r;
- if ((i % 4) == 0)
-- {
-- printk("GPR%02d: ", i);
-- }
-+ printk("\nGPR%02d: ", i);
-
- if ( __get_user(r, &(regs->gpr[i])) )
- return;
-
-- printk("%016lX ", r);
-- if ((i % 4) == 3)
-- {
-- printk("\n");
-- }
-+ printk("%016lx ", r);
- }
-+ printk("\n");
-+ print_backtrace((unsigned long *)regs->gpr[1]);
- }
-
- void exit_thread(void)
-@@ -415,67 +461,24 @@
- }
- }
-
--extern char _stext[], _etext[];
--
--char * ppc_find_proc_name( unsigned * p, char * buf, unsigned buflen )
--{
-- unsigned long tb_flags;
-- unsigned short name_len;
-- unsigned long tb_start, code_start, code_ptr, code_offset;
-- unsigned code_len;
-- strcpy( buf, "Unknown" );
-- code_ptr = (unsigned long)p;
-- code_offset = 0;
-- if ( ( (unsigned long)p >= (unsigned long)_stext ) && ( (unsigned long)p <= (unsigned long)_etext ) ) {
-- while ( (unsigned long)p <= (unsigned long)_etext ) {
-- if ( *p == 0 ) {
-- tb_start = (unsigned long)p;
-- ++p; /* Point to traceback flags */
-- tb_flags = *((unsigned long *)p);
-- p += 2; /* Skip over traceback flags */
-- if ( tb_flags & TB_NAME_PRESENT ) {
-- if ( tb_flags & TB_PARMINFO )
-- ++p; /* skip over parminfo data */
-- if ( tb_flags & TB_HAS_TBOFF ) {
-- code_len = *p; /* get code length */
-- code_start = tb_start - code_len;
-- code_offset = code_ptr - code_start + 1;
-- if ( code_offset > 0x100000 )
-- break;
-- ++p; /* skip over code size */
-- }
-- name_len = *((unsigned short *)p);
-- if ( name_len > (buflen-20) )
-- name_len = buflen-20;
-- memcpy( buf, ((char *)p)+2, name_len );
-- buf[name_len] = 0;
-- if ( code_offset )
-- sprintf( buf+name_len, "+0x%lx", code_offset-1 );
-- }
-- break;
-- }
-- ++p;
-- }
-- }
-- return buf;
--}
--
- void
- print_backtrace(unsigned long *sp)
- {
- int cnt = 0;
- unsigned long i;
-- char name_buf[256];
-+ char buffer[512];
-
-- printk("Call backtrace: \n");
-+ printk("Call Trace: \n");
- while (sp) {
- if (__get_user( i, &sp[2] ))
- break;
-- printk("%016lX ", i);
-- printk("%s\n", ppc_find_proc_name( (unsigned *)i, name_buf, 256 ));
-+ if (kernel_text_address(i)) {
-+ if (__get_user(sp, (unsigned long **)sp))
-+ break;
-+ lookup_symbol(i, buffer, 512);
-+ printk("[<%016lx>] %s\n", i, buffer);
-+ }
- if (cnt > 32) break;
-- if (__get_user(sp, (unsigned long **)sp))
-- break;
- }
- printk("\n");
- }
-@@ -515,6 +518,7 @@
- unsigned long ip, sp;
- unsigned long stack_page = (unsigned long)p;
- int count = 0;
-+ static char buffer[512];
-
- if (!p)
- return;
-@@ -528,7 +532,8 @@
- break;
- if (count > 0) {
- ip = *(unsigned long *)(sp + 16);
-- printk("[%016lx] ", ip);
-+ lookup_symbol(ip, buffer, 512);
-+ printk("[<%016lx>] %s\n", ip, buffer);
- }
- } while (count++ < 16);
- printk("\n");
-Index: linux-bgl/kernel/Makefile
-===================================================================
---- linux-bgl.orig/kernel/Makefile 2004-10-26 23:23:00.516655289 -0700
-+++ linux-bgl/kernel/Makefile 2004-10-26 23:35:04.930451186 -0700
-@@ -14,7 +14,7 @@
- obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
- module.o exit.o itimer.o info.o time.o softirq.o resource.o \
- sysctl.o acct.o capability.o ptrace.o timer.o user.o \
-- signal.o sys.o kmod.o context.o
-+ signal.o sys.o kmod.o context.o kksymoops.o
-
- obj-$(CONFIG_UID16) += uid16.o
- obj-$(CONFIG_MODULES) += ksyms.o
-Index: linux-bgl/kernel/kksymoops.c
-===================================================================
---- linux-bgl.orig/kernel/kksymoops.c 2004-10-26 17:10:51.404753448 -0700
-+++ linux-bgl/kernel/kksymoops.c 2004-10-26 23:25:17.971439129 -0700
-@@ -0,0 +1,82 @@
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/errno.h>
-+#include <linux/kernel.h>
-+#include <linux/config.h>
-+#ifdef CONFIG_KALLSYMS
-+#include <linux/kallsyms.h>
-+#endif
-+
-+
-+
-+int lookup_symbol(unsigned long address, char *buffer, int buflen)
-+{
-+ struct module *this_mod;
-+ unsigned long bestsofar;
-+
-+ const char *mod_name = NULL, *sec_name = NULL, *sym_name = NULL;
-+ unsigned long mod_start,mod_end,sec_start,sec_end,sym_start,sym_end;
-+
-+ if (!buffer)
-+ return -EFAULT;
-+
-+ if (buflen<256)
-+ return -ENOMEM;
-+
-+ memset(buffer,0,buflen);
-+
-+#ifdef CONFIG_KALLSYMS
-+ if (!kallsyms_address_to_symbol(address,&mod_name,&mod_start,&mod_end,&sec_name,
-+ &sec_start, &sec_end, &sym_name, &sym_start, &sym_end)) {
-+ /* kallsyms doesn't have a clue; lets try harder */
-+ bestsofar = 0;
-+ snprintf(buffer,buflen-1,"[unresolved]");
-+
-+ this_mod = module_list;
-+
-+ while (this_mod != NULL) {
-+ int i;
-+ /* walk the symbol list of this module. Only symbols
-+ who's address is smaller than the searched for address
-+ are relevant; and only if it's better than the best so far */
-+ for (i=0; i< this_mod->nsyms; i++)
-+ if ((this_mod->syms[i].value<=address) &&
-+ (bestsofar<this_mod->syms[i].value)) {
-+ snprintf(buffer,buflen-1,"%s [%s] 0x%x",
-+ this_mod->syms[i].name,
-+ this_mod->name,
-+ (unsigned int)(address - this_mod->syms[i].value));
-+ bestsofar = this_mod->syms[i].value;
-+ }
-+ this_mod = this_mod->next;
-+ }
-+
-+ } else { /* kallsyms success */
-+ snprintf(buffer,buflen-1,"%s [%s] 0x%x",sym_name,mod_name,(unsigned int)(address-sym_start));
-+ }
-+#endif
-+ return strlen(buffer);
-+}
-+
-+static char modlist[4096];
-+/* this function isn't smp safe but that's not really a problem; it's called from
-+ * oops context only and any locking could actually prevent the oops from going out;
-+ * the line that is generated is informational only and should NEVER prevent the real oops
-+ * from going out.
-+ */
-+void print_modules(void)
-+{
-+ struct module *this_mod;
-+ int pos = 0, i;
-+ memset(modlist,0,4096);
-+
-+#ifdef CONFIG_KALLSYMS
-+ this_mod = module_list;
-+ while (this_mod != NULL) {
-+ if (this_mod->name != NULL)
-+ pos +=snprintf(modlist+pos,160-pos-1,"%s ",this_mod->name);
-+ this_mod = this_mod->next;
-+ }
-+ printk("%s\n",modlist);
-+#endif
-+}
-Index: linux-bgl/include/linux/kernel.h
-===================================================================
---- linux-bgl.orig/include/linux/kernel.h 2003-07-02 08:44:16.000000000 -0700
-+++ linux-bgl/include/linux/kernel.h 2004-10-26 23:25:17.968439596 -0700
-@@ -107,6 +107,9 @@
- extern int tainted;
- extern const char *print_tainted(void);
-
-+extern int lookup_symbol(unsigned long address, char *buffer, int buflen);
-+extern void print_modules(void);
-+
- #if DEBUG
- #define pr_debug(fmt,arg...) \
- printk(KERN_DEBUG fmt,##arg)
+++ /dev/null
-Index: linux-2.4.24/arch/i386/kernel/i386_ksyms.c
-===================================================================
---- linux-2.4.24.orig/arch/i386/kernel/i386_ksyms.c 2003-11-28 13:26:19.000000000 -0500
-+++ linux-2.4.24/arch/i386/kernel/i386_ksyms.c 2004-05-07 16:58:39.000000000 -0400
-@@ -186,3 +186,8 @@
- EXPORT_SYMBOL(edd);
- EXPORT_SYMBOL(eddnr);
- #endif
-+
-+EXPORT_SYMBOL_GPL(show_mem);
-+EXPORT_SYMBOL_GPL(show_state);
-+EXPORT_SYMBOL_GPL(show_regs);
-+
-Index: linux-2.4.24/arch/i386/kernel/process.c
-===================================================================
---- linux-2.4.24.orig/arch/i386/kernel/process.c 2003-11-28 13:26:19.000000000 -0500
-+++ linux-2.4.24/arch/i386/kernel/process.c 2004-05-07 17:08:18.000000000 -0400
-@@ -400,7 +400,8 @@
- * Stop all CPUs and turn off local APICs and the IO-APIC, so
- * other OSs see a clean IRQ state.
- */
-- smp_send_stop();
-+ if (!netdump_func)
-+ smp_send_stop();
- #elif CONFIG_X86_LOCAL_APIC
- if (cpu_has_apic) {
- __cli();
-Index: linux-2.4.24/arch/i386/kernel/traps.c
-===================================================================
---- linux-2.4.24.orig/arch/i386/kernel/traps.c 2004-05-07 16:57:00.000000000 -0400
-+++ linux-2.4.24/arch/i386/kernel/traps.c 2004-05-07 17:09:17.000000000 -0400
-@@ -280,6 +280,9 @@
- printk("Kernel BUG\n");
- }
-
-+void (*netdump_func) (struct pt_regs *regs) = NULL;
-+int netdump_mode = 0;
-+
- spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
-
- void die(const char * str, struct pt_regs * regs, long err)
-@@ -290,6 +293,8 @@
- handle_BUG(regs);
- printk("%s: %04lx\n", str, err & 0xffff);
- show_registers(regs);
-+ if (netdump_func)
-+ netdump_func(regs);
- bust_spinlocks(0);
- spin_unlock_irq(&die_lock);
- do_exit(SIGSEGV);
-@@ -1041,3 +1046,9 @@
-
- EXPORT_SYMBOL_GPL(is_kernel_text_address);
- EXPORT_SYMBOL_GPL(lookup_symbol);
-+
-+EXPORT_SYMBOL_GPL(netdump_func);
-+EXPORT_SYMBOL_GPL(netdump_mode);
-+#if CONFIG_X86_LOCAL_APIC
-+EXPORT_SYMBOL_GPL(nmi_watchdog);
-+#endif
-Index: linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c
-===================================================================
---- linux-2.4.24.orig/arch/x86_64/kernel/x8664_ksyms.c 2003-11-28 13:26:19.000000000 -0500
-+++ linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c 2004-05-07 17:01:51.000000000 -0400
-@@ -41,6 +41,9 @@
- EXPORT_SYMBOL(drive_info);
- #endif
-
-+//extern void (*netdump_func) (struct pt_regs *regs) = NULL;
-+int netdump_mode = 0;
-+
- /* platform dependent support */
- EXPORT_SYMBOL(boot_cpu_data);
- EXPORT_SYMBOL(dump_fpu);
-@@ -229,3 +232,6 @@
- EXPORT_SYMBOL(touch_nmi_watchdog);
-
- EXPORT_SYMBOL(do_fork);
-+
-+EXPORT_SYMBOL_GPL(netdump_func);
-+EXPORT_SYMBOL_GPL(netdump_mode);
-Index: linux-2.4.24/drivers/net/3c59x.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/3c59x.c 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/3c59x.c 2004-05-07 17:01:00.000000000 -0400
-@@ -874,6 +874,7 @@
- static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
- static void vortex_tx_timeout(struct net_device *dev);
- static void acpi_set_WOL(struct net_device *dev);
-+static void vorboom_poll(struct net_device *dev);
- static struct ethtool_ops vortex_ethtool_ops;
- \f
- /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */
-@@ -1343,6 +1344,9 @@
- dev->set_multicast_list = set_rx_mode;
- dev->tx_timeout = vortex_tx_timeout;
- dev->watchdog_timeo = (watchdog * HZ) / 1000;
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = &vorboom_poll;
-+#endif
- if (pdev && vp->enable_wol) {
- vp->pm_state_valid = 1;
- pci_save_state(vp->pdev, vp->power_state);
-@@ -2322,6 +2326,29 @@
- spin_unlock(&vp->lock);
- }
-
-+#ifdef HAVE_POLL_CONTROLLER
-+
-+/*
-+ * Polling 'interrupt' - used by things like netconsole to send skbs
-+ * without having to re-enable interrupts. It's not called while
-+ * the interrupt routine is executing.
-+ */
-+
-+static void vorboom_poll (struct net_device *dev)
-+{
-+ struct vortex_private *vp = (struct vortex_private *)dev->priv;
-+
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ if (vp->full_bus_master_tx)
-+ boomerang_interrupt(dev->irq, dev, 0);
-+ else
-+ vortex_interrupt(dev->irq, dev, 0);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+
-+#endif
-+
-+
- static int vortex_rx(struct net_device *dev)
- {
- struct vortex_private *vp = (struct vortex_private *)dev->priv;
-Index: linux-2.4.24/drivers/net/Config.in
-===================================================================
---- linux-2.4.24.orig/drivers/net/Config.in 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/Config.in 2004-05-07 16:58:39.000000000 -0400
-@@ -295,6 +295,8 @@
- dep_tristate ' SysKonnect FDDI PCI support' CONFIG_SKFP $CONFIG_PCI
- fi
-
-+tristate 'Network logging support' CONFIG_NETCONSOLE
-+
- if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
- if [ "$CONFIG_INET" = "y" ]; then
- bool 'HIPPI driver support (EXPERIMENTAL)' CONFIG_HIPPI
-Index: linux-2.4.24/drivers/net/eepro100.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/eepro100.c 2003-08-25 07:44:42.000000000 -0400
-+++ linux-2.4.24/drivers/net/eepro100.c 2004-05-07 16:58:39.000000000 -0400
-@@ -543,6 +543,7 @@
- static int speedo_rx(struct net_device *dev);
- static void speedo_tx_buffer_gc(struct net_device *dev);
- static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
-+static void poll_speedo (struct net_device *dev);
- static int speedo_close(struct net_device *dev);
- static struct net_device_stats *speedo_get_stats(struct net_device *dev);
- static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-@@ -879,6 +880,9 @@
- dev->get_stats = &speedo_get_stats;
- dev->set_multicast_list = &set_rx_mode;
- dev->do_ioctl = &speedo_ioctl;
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = &poll_speedo;
-+#endif
-
- return 0;
- }
-@@ -1176,10 +1180,8 @@
-
-
- /* Media monitoring and control. */
--static void speedo_timer(unsigned long data)
-+static void speedo_timeout(struct net_device *dev, struct speedo_private *sp)
- {
-- struct net_device *dev = (struct net_device *)data;
-- struct speedo_private *sp = (struct speedo_private *)dev->priv;
- long ioaddr = dev->base_addr;
- int phy_num = sp->phy[0] & 0x1f;
-
-@@ -1217,6 +1219,15 @@
- dev->name, sp->rx_mode, jiffies, sp->last_rx_time);
- set_rx_mode(dev);
- }
-+}
-+
-+static void speedo_timer(unsigned long data)
-+{
-+ struct net_device *dev = (struct net_device *)data;
-+ struct speedo_private *sp = (struct speedo_private *)dev->priv;
-+
-+ speedo_timeout(dev, sp);
-+
- /* We must continue to monitor the media. */
- sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */
- add_timer(&sp->timer);
-@@ -1661,6 +1672,29 @@
- return;
- }
-
-+#ifdef HAVE_POLL_CONTROLLER
-+
-+/*
-+ * Polling 'interrupt' - used by things like netconsole to send skbs
-+ * without having to re-enable interrupts. It's not called while
-+ * the interrupt routine is executing.
-+ */
-+
-+static void poll_speedo (struct net_device *dev)
-+{
-+ struct speedo_private *sp = (struct speedo_private *)dev->priv;
-+
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ if (sp->timer.expires == jiffies) {
-+ sp->timer.expires = RUN_AT(2*HZ);
-+ speedo_timeout(dev, sp);
-+ }
-+ speedo_interrupt (dev->irq, dev, NULL);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+
-+#endif
-+
- static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry)
- {
- struct speedo_private *sp = (struct speedo_private *)dev->priv;
-Index: linux-2.4.24/drivers/net/Makefile
-===================================================================
---- linux-2.4.24.orig/drivers/net/Makefile 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/Makefile 2004-05-07 16:58:39.000000000 -0400
-@@ -250,6 +250,8 @@
- obj-y += ../acorn/net/acorn-net.o
- endif
-
-+obj-$(CONFIG_NETCONSOLE) += netconsole.o
-+
- #
- # HIPPI adapters
- #
-Index: linux-2.4.24/drivers/net/netconsole.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/netconsole.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.4.24/drivers/net/netconsole.c 2004-05-07 16:58:39.000000000 -0400
-@@ -0,0 +1,1246 @@
-+/*
-+ * linux/drivers/net/netconsole.c
-+ *
-+ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
-+ * Copyright (C) 2002 Red Hat, Inc.
-+ *
-+ * This file contains the implementation of an IRQ-safe, crash-safe
-+ * kernel console implementation that outputs kernel messages to the
-+ * network.
-+ *
-+ * Modification history:
-+ *
-+ * 2001-09-17 started by Ingo Molnar.
-+ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson
-+ */
-+
-+/****************************************************************
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2, or (at your option)
-+ * any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-+ *
-+ ****************************************************************/
-+
-+#include <net/tcp.h>
-+#include <net/udp.h>
-+#include <linux/mm.h>
-+#include <linux/tty.h>
-+#include <linux/init.h>
-+#include <linux/delay.h>
-+#include <linux/random.h>
-+#include <linux/reboot.h>
-+#include <linux/module.h>
-+#include <asm/unaligned.h>
-+#include <asm/pgtable.h>
-+#if CONFIG_X86_LOCAL_APIC
-+#include <asm/apic.h>
-+#endif
-+#include <linux/console.h>
-+#include <linux/smp_lock.h>
-+#include <linux/netdevice.h>
-+#include <linux/tty_driver.h>
-+#include <linux/etherdevice.h>
-+#include <linux/elf.h>
-+
-+static struct net_device *netconsole_dev;
-+static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port;
-+static u32 source_ip, netdump_target_ip, netlog_target_ip, syslog_target_ip;
-+static unsigned char netdump_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
-+static unsigned char netlog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
-+static unsigned char syslog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ;
-+
-+static unsigned int mhz = 500, idle_timeout;
-+static unsigned long long mhz_cycles, jiffy_cycles;
-+
-+#include "netconsole.h"
-+
-+#define MAX_UDP_CHUNK 1460
-+#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN)
-+
-+#define DEBUG 0
-+#if DEBUG
-+# define Dprintk(x...) printk(KERN_INFO x)
-+#else
-+# define Dprintk(x...)
-+#endif
-+/*
-+ * We maintain a small pool of fully-sized skbs,
-+ * to make sure the message gets out even in
-+ * extreme OOM situations.
-+ */
-+#define MAX_NETCONSOLE_SKBS 128
-+
-+static spinlock_t netconsole_lock = SPIN_LOCK_UNLOCKED;
-+static int nr_netconsole_skbs;
-+static struct sk_buff *netconsole_skbs;
-+
-+#define MAX_SKB_SIZE \
-+ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
-+ sizeof(struct iphdr) + sizeof(struct ethhdr))
-+
-+static int new_arp = 0;
-+static unsigned char arp_sha[ETH_ALEN], arp_tha[ETH_ALEN];
-+static u32 arp_sip, arp_tip;
-+
-+static void send_netconsole_arp(struct net_device *dev);
-+
-+static void __refill_netconsole_skbs(void)
-+{
-+ struct sk_buff *skb;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&netconsole_lock, flags);
-+ while (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) {
-+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
-+ if (!skb)
-+ break;
-+ if (netconsole_skbs)
-+ skb->next = netconsole_skbs;
-+ else
-+ skb->next = NULL;
-+ netconsole_skbs = skb;
-+ nr_netconsole_skbs++;
-+ }
-+ spin_unlock_irqrestore(&netconsole_lock, flags);
-+}
-+
-+static struct sk_buff * get_netconsole_skb(void)
-+{
-+ struct sk_buff *skb;
-+
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&netconsole_lock, flags);
-+ skb = netconsole_skbs;
-+ if (skb) {
-+ netconsole_skbs = skb->next;
-+ skb->next = NULL;
-+ nr_netconsole_skbs--;
-+ }
-+ spin_unlock_irqrestore(&netconsole_lock, flags);
-+
-+ return skb;
-+}
-+
-+static unsigned long long t0;
-+
-+/*
-+ * Do cleanups:
-+ * - zap completed output skbs.
-+ * - send ARPs if requested
-+ * - reboot the box if inactive for more than N seconds.
-+ */
-+static void zap_completion_queue(void)
-+{
-+ unsigned long long t1;
-+ int cpu = smp_processor_id();
-+
-+ if (softnet_data[cpu].completion_queue) {
-+ struct sk_buff *clist;
-+
-+ local_irq_disable();
-+ clist = softnet_data[cpu].completion_queue;
-+ softnet_data[cpu].completion_queue = NULL;
-+ local_irq_enable();
-+
-+ while (clist != NULL) {
-+ struct sk_buff *skb = clist;
-+ clist = clist->next;
-+ __kfree_skb(skb);
-+ }
-+ }
-+
-+ if (new_arp) {
-+ Dprintk("got ARP req - sending reply.\n");
-+ new_arp = 0;
-+ send_netconsole_arp(netconsole_dev);
-+ }
-+
-+ rdtscll(t1);
-+ if (idle_timeout) {
-+ if (t0) {
-+ if (((t1 - t0) >> 20) > mhz_cycles * (unsigned long long)idle_timeout) {
-+ t0 = t1;
-+ printk("netdump idle timeout - rebooting in 3 seconds.\n");
-+ mdelay(3000);
-+ machine_restart(NULL);
-+ }
-+ }
-+ }
-+ /* maintain jiffies in a polling fashion, based on rdtsc. */
-+ {
-+ static unsigned long long prev_tick;
-+
-+ if (t1 - prev_tick >= jiffy_cycles) {
-+ prev_tick += jiffy_cycles;
-+ jiffies++;
-+ }
-+ }
-+}
-+
-+static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve)
-+{
-+ int once = 1;
-+ int count = 0;
-+ struct sk_buff *skb = NULL;
-+
-+repeat:
-+ zap_completion_queue();
-+ if (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS)
-+ __refill_netconsole_skbs();
-+
-+ skb = alloc_skb(len, GFP_ATOMIC);
-+ if (!skb) {
-+ skb = get_netconsole_skb();
-+ if (!skb) {
-+ count++;
-+ if (once && (count == 1000000)) {
-+ printk("possibly FATAL: out of netconsole skbs!!! will keep retrying.\n");
-+ once = 0;
-+ }
-+ Dprintk("alloc skb: polling controller ...\n");
-+ dev->poll_controller(dev);
-+ goto repeat;
-+ }
-+ }
-+
-+ atomic_set(&skb->users, 1);
-+ skb_reserve(skb, reserve);
-+ return skb;
-+}
-+
-+static void transmit_raw_skb(struct sk_buff *skb, struct net_device *dev)
-+{
-+
-+repeat_poll:
-+ spin_lock(&dev->xmit_lock);
-+ dev->xmit_lock_owner = smp_processor_id();
-+
-+ if (netif_queue_stopped(dev)) {
-+ dev->xmit_lock_owner = -1;
-+ spin_unlock(&dev->xmit_lock);
-+
-+ Dprintk("xmit skb: polling controller ...\n");
-+ dev->poll_controller(dev);
-+ zap_completion_queue();
-+ goto repeat_poll;
-+ }
-+
-+ dev->hard_start_xmit(skb, dev);
-+
-+ dev->xmit_lock_owner = -1;
-+ spin_unlock(&dev->xmit_lock);
-+}
-+
-+static void transmit_netconsole_skb(struct sk_buff *skb, struct net_device *dev,
-+ int ip_len, int udp_len,
-+ u16 source_port, u16 target_port, u32 source_ip, u32 target_ip,
-+ unsigned char * macdaddr)
-+{
-+ struct udphdr *udph;
-+ struct iphdr *iph;
-+ struct ethhdr *eth;
-+
-+ udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
-+ udph->source = source_port;
-+ udph->dest = target_port;
-+ udph->len = htons(udp_len);
-+ udph->check = 0;
-+
-+ iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
-+
-+ iph->version = 4;
-+ iph->ihl = 5;
-+ iph->tos = 0;
-+ iph->tot_len = htons(ip_len);
-+ iph->id = 0;
-+ iph->frag_off = 0;
-+ iph->ttl = 64;
-+ iph->protocol = IPPROTO_UDP;
-+ iph->check = 0;
-+ iph->saddr = source_ip;
-+ iph->daddr = target_ip;
-+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-+
-+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
-+
-+ eth->h_proto = htons(ETH_P_IP);
-+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
-+ memcpy(eth->h_dest, macdaddr, dev->addr_len);
-+
-+ transmit_raw_skb(skb, dev);
-+}
-+
-+static void send_netconsole_arp(struct net_device *dev)
-+{
-+ int total_len, arp_len, arp_data_len;
-+ struct sk_buff *skb;
-+ unsigned char *arp;
-+ struct arphdr *arph;
-+ struct ethhdr *eth;
-+
-+ arp_data_len = 2*4 + 2*ETH_ALEN;
-+ arp_len = arp_data_len + sizeof(struct arphdr);
-+ total_len = arp_len + ETH_HLEN;
-+
-+ skb = alloc_netconsole_skb(dev, total_len, total_len - arp_data_len);
-+
-+ arp = skb->data;
-+
-+ memcpy(arp, dev->dev_addr, ETH_ALEN);
-+ arp += ETH_ALEN;
-+
-+ memcpy(arp, &source_ip, 4);
-+ arp += 4;
-+
-+ memcpy(arp, arp_sha, ETH_ALEN);
-+ arp += ETH_ALEN;
-+
-+ memcpy(arp, &arp_sip, 4);
-+ arp += 4;
-+
-+ skb->len += 2*4 + 2*ETH_ALEN;
-+
-+ arph = (struct arphdr *)skb_push(skb, sizeof(*arph));
-+
-+ arph->ar_hrd = htons(dev->type);
-+ arph->ar_pro = __constant_htons(ETH_P_IP);
-+ arph->ar_hln = ETH_ALEN;
-+ arph->ar_pln = 4;
-+ arph->ar_op = __constant_htons(ARPOP_REPLY);
-+
-+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
-+
-+ eth->h_proto = htons(ETH_P_ARP);
-+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
-+ memcpy(eth->h_dest, arp_sha, dev->addr_len);
-+
-+ transmit_raw_skb(skb, dev);
-+}
-+
-+static void send_netdump_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply)
-+{
-+ int total_len, ip_len, udp_len;
-+ struct sk_buff *skb;
-+
-+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr);
-+ ip_len = udp_len + sizeof(struct iphdr);
-+ total_len = ip_len + ETH_HLEN;
-+
-+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN);
-+
-+ skb->data[0] = NETCONSOLE_VERSION;
-+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1));
-+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5));
-+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9));
-+
-+ memcpy(skb->data + HEADER_LEN, msg, msg_len);
-+ skb->len += msg_len + HEADER_LEN;
-+
-+ transmit_netconsole_skb(skb, dev, ip_len, udp_len,
-+ source_port, netdump_target_port, source_ip, netdump_target_ip, netdump_daddr);
-+}
-+
-+#define SYSLOG_HEADER_LEN 4
-+
-+static void send_netlog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply)
-+{
-+ int total_len, ip_len, udp_len;
-+ struct sk_buff *skb;
-+
-+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr);
-+ ip_len = udp_len + sizeof(struct iphdr);
-+ total_len = ip_len + ETH_HLEN;
-+
-+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN);
-+
-+ skb->data[0] = NETCONSOLE_VERSION;
-+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1));
-+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5));
-+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9));
-+
-+ memcpy(skb->data + HEADER_LEN, msg, msg_len);
-+ skb->len += msg_len + HEADER_LEN;
-+
-+ transmit_netconsole_skb(skb, dev, ip_len, udp_len,
-+ source_port, netlog_target_port, source_ip, netlog_target_ip, netlog_daddr);
-+}
-+
-+#define SYSLOG_HEADER_LEN 4
-+
-+static void send_syslog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, int pri)
-+{
-+ int total_len, ip_len, udp_len;
-+ struct sk_buff *skb;
-+
-+ udp_len = msg_len + SYSLOG_HEADER_LEN + sizeof(struct udphdr);
-+ ip_len = udp_len + sizeof(struct iphdr);
-+ total_len = ip_len + ETH_HLEN;
-+
-+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - SYSLOG_HEADER_LEN);
-+
-+ skb->data[0] = '<';
-+ skb->data[1] = pri + '0';
-+ skb->data[2]= '>';
-+ skb->data[3]= ' ';
-+
-+ memcpy(skb->data + SYSLOG_HEADER_LEN, msg, msg_len);
-+ skb->len += msg_len + SYSLOG_HEADER_LEN;
-+
-+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, source_port,
-+ syslog_target_port, source_ip, syslog_target_ip, syslog_daddr);
-+}
-+
-+#define MAX_SYSLOG_CHARS 1000
-+
-+static spinlock_t syslog_lock = SPIN_LOCK_UNLOCKED;
-+static int syslog_chars;
-+static unsigned char syslog_line [MAX_SYSLOG_CHARS + 10];
-+
-+/*
-+ * We feed kernel messages char by char, and send the UDP packet
-+ * one linefeed. We buffer all characters received.
-+ */
-+static inline void feed_syslog_char(struct net_device *dev, const unsigned char c)
-+{
-+ if (syslog_chars == MAX_SYSLOG_CHARS)
-+ syslog_chars--;
-+ syslog_line[syslog_chars] = c;
-+ syslog_chars++;
-+ if (c == '\n') {
-+ send_syslog_skb(dev, syslog_line, syslog_chars, 5);
-+ syslog_chars = 0;
-+ }
-+}
-+
-+static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED;
-+static unsigned int log_offset;
-+
-+static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len)
-+{
-+ int len, left, i;
-+ struct net_device *dev;
-+ const char *msg = msg0;
-+ reply_t reply;
-+
-+ dev = netconsole_dev;
-+ if (!dev || netdump_mode)
-+ return;
-+
-+ if (dev->poll_controller && netif_running(dev)) {
-+ unsigned long flags;
-+
-+ __save_flags(flags);
-+ __cli();
-+ left = msg_len;
-+ if (netlog_target_ip) {
-+ while (left) {
-+ if (left > MAX_PRINT_CHUNK)
-+ len = MAX_PRINT_CHUNK;
-+ else
-+ len = left;
-+ reply.code = REPLY_LOG;
-+ reply.nr = 0;
-+ spin_lock(&sequence_lock);
-+ reply.info = log_offset;
-+ log_offset += len;
-+ spin_unlock(&sequence_lock);
-+ send_netlog_skb(dev, msg, len, &reply);
-+ msg += len;
-+ left -= len;
-+ }
-+ }
-+ if (syslog_target_ip) {
-+ spin_lock(&syslog_lock);
-+ for (i = 0; i < msg_len; i++)
-+ feed_syslog_char(dev, msg0[i]);
-+ spin_unlock(&syslog_lock);
-+ }
-+
-+ __restore_flags(flags);
-+ }
-+}
-+
-+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
-+{
-+ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
-+}
-+
-+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
-+ unsigned short ulen, u32 saddr, u32 daddr)
-+{
-+ if (uh->check == 0) {
-+ skb->ip_summed = CHECKSUM_UNNECESSARY;
-+ } else if (skb->ip_summed == CHECKSUM_HW) {
-+ skb->ip_summed = CHECKSUM_UNNECESSARY;
-+ if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
-+ return 0;
-+ skb->ip_summed = CHECKSUM_NONE;
-+ }
-+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP,
-+0);
-+ /* Probably, we should checksum udp header (it should be in cache
-+ * in any case) and data in tiny packets (< rx copybreak).
-+ */
-+ return 0;
-+}
-+
-+static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
-+{
-+ return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
-+}
-+
-+static __inline__ int udp_checksum_complete(struct sk_buff *skb)
-+{
-+ return skb->ip_summed != CHECKSUM_UNNECESSARY &&
-+ __udp_checksum_complete(skb);
-+}
-+
-+/*
-+ * NOTE: security depends on the trusted path between the netconsole
-+ * server and netconsole client, since none of the packets are
-+ * encrypted. The random magic number protects the protocol
-+ * against spoofing.
-+ */
-+static u64 netconsole_magic;
-+static u32 magic1, magic2;
-+
-+static spinlock_t req_lock = SPIN_LOCK_UNLOCKED;
-+static int nr_req = 0;
-+static LIST_HEAD(request_list);
-+
-+static void add_new_req(req_t *req)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&req_lock, flags);
-+ list_add_tail(&req->list, &request_list);
-+ nr_req++;
-+ Dprintk("pending requests: %d.\n", nr_req);
-+ spin_unlock_irqrestore(&req_lock, flags);
-+
-+ rdtscll(t0);
-+}
-+
-+static req_t *get_new_req(void)
-+{
-+ req_t *req = NULL;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&req_lock, flags);
-+ if (nr_req) {
-+ req = list_entry(request_list.next, req_t, list);
-+ list_del(&req->list);
-+ nr_req--;
-+ }
-+ spin_unlock_irqrestore(&req_lock, flags);
-+
-+ return req;
-+}
-+
-+static req_t *alloc_req(void)
-+{
-+ req_t *req;
-+
-+ req = (req_t *) kmalloc(sizeof(*req), GFP_ATOMIC);
-+ return req;
-+}
-+
-+static int netconsole_rx_hook(struct sk_buff *skb)
-+{
-+ int proto;
-+ struct iphdr *iph;
-+ struct udphdr *uh;
-+ __u32 len, saddr, daddr, ulen;
-+ req_t *__req;
-+ req_t *req;
-+ struct net_device *dev;
-+
-+ if (!netdump_mode)
-+ return NET_RX_SUCCESS;
-+#if DEBUG
-+ {
-+ static int packet_count;
-+ Dprintk(" %d\r", ++packet_count);
-+ }
-+#endif
-+ dev = skb->dev;
-+ if (dev->type != ARPHRD_ETHER)
-+ goto out;
-+ proto = ntohs(skb->mac.ethernet->h_proto);
-+ Dprintk("rx got skb %p (len: %d, users: %d), dev %s, h_proto: %04x.\n", skb, skb->len, atomic_read(&skb->users), dev->name, proto);
-+ #define D(x) skb->mac.ethernet->h_dest[x]
-+ Dprintk("... h_dest: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
-+ #undef D
-+ #define D(x) skb->mac.ethernet->h_source[x]
-+ Dprintk("... h_source: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
-+ #undef D
-+ if (skb->pkt_type == PACKET_OTHERHOST)
-+ goto out;
-+ if (skb_shared(skb))
-+ goto out;
-+ if (proto == ETH_P_ARP) {
-+ struct arphdr *arp;
-+ unsigned char *arp_ptr;
-+
-+ Dprintk("got arp skb.\n");
-+ arp = (struct arphdr *)skb->data;
-+ if (!pskb_may_pull(skb, sizeof(struct arphdr) + 2*4 + 2*ETH_ALEN))
-+ goto out;
-+ if (htons(dev->type) != arp->ar_hrd)
-+ goto out;
-+ if (arp->ar_pro != __constant_htons(ETH_P_IP))
-+ goto out;
-+ if (arp->ar_hln != ETH_ALEN)
-+ goto out;
-+ if (arp->ar_pln != 4)
-+ goto out;
-+ if (arp->ar_op != __constant_htons(ARPOP_REQUEST))
-+ goto out;
-+ /*
-+ * ARP header looks ok so far, extract fields:
-+ */
-+ arp_ptr = (unsigned char *)(arp + 1);
-+
-+ memcpy(arp_sha, arp_ptr, ETH_ALEN);
-+ arp_ptr += ETH_ALEN;
-+
-+ memcpy(&arp_sip, arp_ptr, 4);
-+ arp_ptr += 4;
-+
-+ memcpy(arp_tha, arp_ptr, ETH_ALEN);
-+ arp_ptr += ETH_ALEN;
-+
-+ memcpy(&arp_tip, arp_ptr, 4);
-+
-+ #define D(x) arp_sha[x]
-+ Dprintk("... arp_sha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
-+ #undef D
-+ #define D(x) ((unsigned char *)&arp_sip)[x]
-+ Dprintk("... arp_sip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
-+ #undef D
-+ #define D(x) arp_tha[x]
-+ Dprintk("... arp_tha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5));
-+ #undef D
-+ #define D(x) ((unsigned char *)&arp_tip)[x]
-+ Dprintk("... arp_tip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
-+ #undef D
-+ #define D(x) ((unsigned char *)&source_ip)[x]
-+ Dprintk("... (source_ip): %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3));
-+ #undef D
-+
-+ if (LOOPBACK(arp_tip) || MULTICAST(arp_tip))
-+ goto out;
-+
-+ if (arp_tip != source_ip)
-+ goto out;
-+ new_arp = 1;
-+ goto out;
-+ }
-+ if (proto != ETH_P_IP)
-+ goto out;
-+ /*
-+ * IP header correctness testing:
-+ */
-+ iph = (struct iphdr *)skb->data;
-+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-+ goto out;
-+ Dprintk("... IP ihl*4: %d, version: %d.\n", iph->ihl*4, iph->version);
-+ if (iph->ihl < 5 || iph->version != 4)
-+ goto out;
-+ if (!pskb_may_pull(skb, iph->ihl*4))
-+ goto out;
-+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
-+ goto out;
-+ len = ntohs(iph->tot_len);
-+ Dprintk("... IP len: %d.\n", len);
-+ if (skb->len < len || len < iph->ihl*4)
-+ goto out;
-+ saddr = iph->saddr;
-+ daddr = iph->daddr;
-+ Dprintk("... IP src: %08x, dst: %08x.\n", saddr, daddr);
-+ Dprintk("... IP protocol: %d.\n", iph->protocol);
-+ if (iph->protocol != IPPROTO_UDP)
-+ goto out;
-+ Dprintk("... netdump src: %08x, dst: %08x.\n", source_ip, netlog_target_ip);
-+ if (source_ip != daddr)
-+ goto out;
-+ if (netlog_target_ip != saddr)
-+ goto out;
-+ len -= iph->ihl*4;
-+ uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
-+ ulen = ntohs(uh->len);
-+ Dprintk("... UDP len: %d (left %d).\n", ulen, len);
-+
-+#define MIN_COMM_SIZE (sizeof(*uh) + NETDUMP_REQ_SIZE)
-+ if (ulen != len || ulen < MIN_COMM_SIZE) {
-+ Dprintk("... UDP, hm, len not ok.\n");
-+ goto out;
-+ }
-+ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) {
-+ Dprintk("... UDP, hm, checksum init not ok.\n");
-+ goto out;
-+ }
-+ if (udp_checksum_complete(skb)) {
-+ Dprintk("... UDP, hm, checksum complete not ok.\n");
-+ goto out;
-+ }
-+ Dprintk("... UDP packet OK!\n");
-+ Dprintk("... UDP src port: %d, dst port: %d.\n", uh->source, uh->dest);
-+ if (source_port != uh->source)
-+ goto out;
-+ if (netlog_target_port != uh->dest)
-+ goto out;
-+ __req = (req_t *)(uh + 1);
-+ Dprintk("... UDP netdump packet OK!\n");
-+
-+ req = alloc_req();
-+ if (!req) {
-+ printk("no more RAM to allocate request - dropping it.\n");
-+ goto out;
-+ }
-+
-+ req->magic = ntohl(__req->magic);
-+ req->command = ntohl(__req->command);
-+ req->from = ntohl(__req->from);
-+ req->to = ntohl(__req->to);
-+ req->nr = ntohl(__req->nr);
-+
-+ Dprintk("... netdump magic: %08Lx.\n", req->magic);
-+ Dprintk("... netdump command: %08x.\n", req->command);
-+ Dprintk("... netdump from: %08x.\n", req->from);
-+ Dprintk("... netdump to: %08x.\n", req->to);
-+
-+ add_new_req(req);
-+out:
-+ return NET_RX_DROP;
-+}
-+
-+#define INVALID_PAGE "page is not valid!\n"
-+
-+static void send_netdump_mem (struct net_device *dev, req_t *req)
-+{
-+ int i;
-+ char *kaddr;
-+ char str[1024];
-+ struct page *page;
-+ unsigned long nr = req->from;
-+ int nr_chunks = PAGE_SIZE/1024;
-+ reply_t reply;
-+
-+ reply.nr = req->nr;
-+ reply.info = 0;
-+ if (req->from >= max_mapnr) {
-+ sprintf(str, "page %08lx is bigger than max page # %08lx!\n", nr, max_mapnr);
-+ reply.code = REPLY_ERROR;
-+ send_netdump_skb(dev, str, strlen(str), &reply);
-+ return;
-+ }
-+ page = mem_map + nr;
-+ if (PageReserved(page))
-+ page = ZERO_PAGE(0);
-+
-+ kaddr = (char *)kmap_atomic(page, KM_NETDUMP);
-+
-+ for (i = 0; i < nr_chunks; i++) {
-+ unsigned int offset = i*1024;
-+ reply.code = REPLY_MEM;
-+ reply.info = offset;
-+ send_netdump_skb(dev, kaddr + offset, 1024, &reply);
-+ }
-+
-+ kunmap_atomic(kaddr, KM_NETDUMP);
-+}
-+
-+/*
-+ * This function waits for the client to acknowledge the receipt
-+ * of the netdump startup reply, with the possibility of packets
-+ * getting lost. We resend the startup packet if no ACK is received,
-+ * after a 1 second delay.
-+ *
-+ * (The client can test the success of the handshake via the HELLO
-+ * command, and send ACKs until we enter netdump mode.)
-+ */
-+static void netdump_startup_handshake(struct net_device *dev)
-+{
-+ char tmp[200];
-+ reply_t reply;
-+ req_t *req = NULL;
-+ int i;
-+
-+ netdump_mode = 1;
-+
-+repeat:
-+ sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n");
-+ reply.code = REPLY_START_NETDUMP;
-+ reply.nr = 0;
-+ reply.info = 0;
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+
-+ for (i = 0; i < 10000; i++) {
-+ // wait 1 sec.
-+ udelay(100);
-+ Dprintk("handshake: polling controller ...\n");
-+ dev->poll_controller(dev);
-+ zap_completion_queue();
-+ req = get_new_req();
-+ if (req)
-+ break;
-+ }
-+ if (!req)
-+ goto repeat;
-+ if (req->command != COMM_START_NETDUMP_ACK) {
-+ kfree(req);
-+ goto repeat;
-+ }
-+ kfree(req);
-+
-+ printk("NETDUMP START!\n");
-+}
-+
-+#if 0
-+
-+static inline void print_status (req_t *req)
-+{
-+ static int count = 0;
-+
-+ switch (++count & 3) {
-+ case 0: printk("/\r"); break;
-+ case 1: printk("|\r"); break;
-+ case 2: printk("\\\r"); break;
-+ case 3: printk("-\r"); break;
-+ }
-+}
-+
-+#else
-+
-+static inline void print_status (req_t *req)
-+{
-+ static int count = 0;
-+ static int prev_jiffies = 0;
-+
-+ if (jiffies/HZ != prev_jiffies/HZ) {
-+ prev_jiffies = jiffies;
-+ count++;
-+ switch (count & 3) {
-+ case 0: printk("%d(%ld)/\r", nr_req, jiffies); break;
-+ case 1: printk("%d(%ld)|\r", nr_req, jiffies); break;
-+ case 2: printk("%d(%ld)\\\r", nr_req, jiffies); break;
-+ case 3: printk("%d(%ld)-\r", nr_req, jiffies); break;
-+ }
-+ }
-+}
-+
-+#endif
-+
-+#define CLI 1
-+
-+#if CONFIG_SMP
-+static void freeze_cpu (void * dummy)
-+{
-+ printk("CPU#%d is frozen.\n", smp_processor_id());
-+#if CLI
-+ for (;;) __cli();
-+#else
-+ for (;;) __sti();
-+#endif
-+}
-+#endif
-+
-+static void netconsole_netdump (struct pt_regs *regs)
-+{
-+ reply_t reply;
-+ char tmp[200];
-+ unsigned long flags;
-+ struct net_device *dev = netconsole_dev;
-+ unsigned long esp;
-+ unsigned short ss;
-+ struct pt_regs myregs;
-+ req_t *req;
-+
-+ __save_flags(flags);
-+ __cli();
-+#if CONFIG_X86_LOCAL_APIC
-+ nmi_watchdog = 0;
-+#endif
-+#if CONFIG_SMP
-+ smp_call_function(freeze_cpu, NULL, 1, 0);
-+#endif
-+ mdelay(1000);
-+ /*
-+ * Just in case we are crashing within the networking code
-+ * ... attempt to fix up.
-+ */
-+ spin_lock_init(&dev->xmit_lock);
-+
-+ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs));
-+ ss = __KERNEL_DS;
-+ if (regs->xcs & 3) {
-+ esp = regs->esp;
-+ ss = regs->xss & 0xffff;
-+ }
-+ myregs = *regs;
-+ myregs.esp = esp;
-+ myregs.xss = (myregs.xss & 0xffff0000) | ss;
-+
-+ rdtscll(t0);
-+
-+ printk("< netdump activated - performing handshake with the client. >\n");
-+ netdump_startup_handshake(dev);
-+
-+ printk("< handshake completed - listening for dump requests. >\n");
-+
-+ while (netdump_mode) {
-+ __cli();
-+ Dprintk("main netdump loop: polling controller ...\n");
-+ dev->poll_controller(dev);
-+ zap_completion_queue();
-+#if !CLI
-+ __sti();
-+#endif
-+ req = get_new_req();
-+ if (!req)
-+ continue;
-+ Dprintk("got new req, command %d.\n", req->command);
-+ print_status(req);
-+ switch (req->command) {
-+ case COMM_NONE:
-+ Dprintk("got NO command.\n");
-+ break;
-+
-+ case COMM_SEND_MEM:
-+ Dprintk("got MEM command.\n");
-+ // send ->from ->to.
-+ send_netdump_mem(dev, req);
-+ break;
-+
-+ case COMM_EXIT:
-+ Dprintk("got EXIT command.\n");
-+ netdump_mode = 0;
-+ break;
-+
-+ case COMM_REBOOT:
-+ Dprintk("got REBOOT command.\n");
-+ printk("netdump: rebooting in 3 seconds.\n");
-+ mdelay(3000);
-+ machine_restart(NULL);
-+ break;
-+
-+ case COMM_HELLO:
-+ sprintf(tmp, "Hello, this is netdump version 0.%02d\n", NETCONSOLE_VERSION);
-+ reply.code = REPLY_HELLO;
-+ reply.nr = req->nr;
-+ reply.info = NETCONSOLE_VERSION;
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ break;
-+
-+ case COMM_GET_PAGE_SIZE:
-+ sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE);
-+ reply.code = REPLY_PAGE_SIZE;
-+ reply.nr = req->nr;
-+ reply.info = PAGE_SIZE;
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ break;
-+
-+ case COMM_GET_REGS:
-+ {
-+ char *tmp2 = tmp;
-+ elf_gregset_t elf_regs;
-+
-+ reply.code = REPLY_REGS;
-+ reply.nr = req->nr;
-+ reply.info = max_mapnr;
-+ tmp2 = tmp + sprintf(tmp, "Sending register info.\n");
-+ ELF_CORE_COPY_REGS(elf_regs, regs);
-+ memcpy(tmp2, &elf_regs, sizeof(elf_regs));
-+ send_netdump_skb(dev, tmp, strlen(tmp) + sizeof(elf_regs), &reply);
-+ break;
-+ }
-+
-+ case COMM_GET_NR_PAGES:
-+ reply.code = REPLY_NR_PAGES;
-+ reply.nr = req->nr;
-+ reply.info = max_mapnr;
-+ sprintf(tmp, "Number of pages: %ld\n", max_mapnr);
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ break;
-+
-+ case COMM_SHOW_STATE:
-+ netdump_mode = 0;
-+ if (regs)
-+ show_regs(regs);
-+ show_state();
-+ show_mem();
-+ netdump_mode = 1;
-+ reply.code = REPLY_SHOW_STATE;
-+ reply.nr = req->nr;
-+ reply.info = 0;
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ break;
-+
-+ default:
-+ reply.code = REPLY_ERROR;
-+ reply.nr = req->nr;
-+ reply.info = req->command;
-+ Dprintk("got UNKNOWN command!\n");
-+ sprintf(tmp, "Got unknown command code %d!\n", req->command);
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ break;
-+ }
-+ kfree(req);
-+ req = NULL;
-+ }
-+ sprintf(tmp, "NETDUMP end.\n");
-+ reply.code = REPLY_END_NETDUMP;
-+ reply.nr = 0;
-+ reply.info = 0;
-+ send_netdump_skb(dev, tmp, strlen(tmp), &reply);
-+ printk("NETDUMP END!\n");
-+ __restore_flags(flags);
-+}
-+
-+static char *dev;
-+static int netdump_target_eth_byte0 = 255;
-+static int netdump_target_eth_byte1 = 255;
-+static int netdump_target_eth_byte2 = 255;
-+static int netdump_target_eth_byte3 = 255;
-+static int netdump_target_eth_byte4 = 255;
-+static int netdump_target_eth_byte5 = 255;
-+
-+static int netlog_target_eth_byte0 = 255;
-+static int netlog_target_eth_byte1 = 255;
-+static int netlog_target_eth_byte2 = 255;
-+static int netlog_target_eth_byte3 = 255;
-+static int netlog_target_eth_byte4 = 255;
-+static int netlog_target_eth_byte5 = 255;
-+
-+static int syslog_target_eth_byte0 = 255;
-+static int syslog_target_eth_byte1 = 255;
-+static int syslog_target_eth_byte2 = 255;
-+static int syslog_target_eth_byte3 = 255;
-+static int syslog_target_eth_byte4 = 255;
-+static int syslog_target_eth_byte5 = 255;
-+
-+MODULE_PARM(netdump_target_ip, "i");
-+MODULE_PARM_DESC(netdump_target_ip,
-+ "remote netdump IP address as a native (not network) endian integer");
-+MODULE_PARM(netlog_target_ip, "i");
-+MODULE_PARM_DESC(netlog_target_ip,
-+ "remote netlog IP address as a native (not network) endian integer");
-+MODULE_PARM(syslog_target_ip, "i");
-+MODULE_PARM_DESC(syslog_target_ip,
-+ "remote syslog IP address as a native (not network) endian integer");
-+
-+MODULE_PARM(source_port, "h");
-+MODULE_PARM_DESC(source_port,
-+ "local port from which to send netdump packets");
-+
-+MODULE_PARM(netdump_target_port, "h");
-+MODULE_PARM_DESC(netdump_target_port,
-+ "remote port to which to send netdump packets");
-+MODULE_PARM(netlog_target_port, "h");
-+MODULE_PARM_DESC(netlog_target_port,
-+ "remote port to which to send netlog packets");
-+MODULE_PARM(syslog_target_port, "h");
-+MODULE_PARM_DESC(syslog_target_port,
-+ "remote port to which to send syslog packets");
-+
-+#define ETH_BYTE(name,nr) \
-+ MODULE_PARM(name##_target_eth_byte##nr, "i"); \
-+ MODULE_PARM_DESC(name##_target_eth_byte##nr, \
-+ "byte "#nr" of the netdump server MAC address")
-+
-+#define ETH_BYTES(name) \
-+ ETH_BYTE(name, 0); ETH_BYTE(name, 1); ETH_BYTE(name, 2); \
-+ ETH_BYTE(name, 3); ETH_BYTE(name, 4); ETH_BYTE(name, 5);
-+
-+ETH_BYTES(netdump);
-+ETH_BYTES(netlog);
-+ETH_BYTES(syslog);
-+
-+MODULE_PARM(magic1, "i");
-+MODULE_PARM_DESC(magic1,
-+ "lower 32 bits of magic cookie shared between client and server");
-+MODULE_PARM(magic2, "i");
-+MODULE_PARM_DESC(magic2,
-+ "upper 32 bits of magic cookie shared between client and server");
-+MODULE_PARM(dev, "s");
-+MODULE_PARM_DESC(dev,
-+ "name of the device from which to send netdump and syslog packets");
-+MODULE_PARM(mhz, "i");
-+MODULE_PARM_DESC(mhz,
-+ "one second wall clock time takes this many million CPU cycles");
-+MODULE_PARM(idle_timeout, "i");
-+MODULE_PARM_DESC(idle_timeout,
-+ "reboot system after this many idle seconds");
-+
-+static struct console netconsole =
-+ { flags: CON_ENABLED, write: write_netconsole_msg };
-+
-+static int init_netconsole(void)
-+{
-+ struct net_device *ndev = NULL;
-+ struct in_device *in_dev;
-+
-+ printk(KERN_INFO "netlog: using network device <%s>\n", dev);
-+ // this will be valid once the device goes up.
-+ if (dev)
-+ ndev = dev_get_by_name(dev);
-+ if (!ndev) {
-+ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev);
-+ return -1;
-+ }
-+ if (!ndev->poll_controller) {
-+ printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev);
-+ return -1;
-+ }
-+ in_dev = in_dev_get(ndev);
-+ if (!in_dev) {
-+ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev);
-+ return -1;
-+ }
-+
-+ if (!magic1 || !magic2) {
-+ printk(KERN_ERR "netlog: magic cookie (magic1,magic2) not specified.\n");
-+ return -1;
-+ }
-+ netconsole_magic = magic1 + (((u64)magic2)<<32);
-+
-+ source_ip = ntohl(in_dev->ifa_list->ifa_local);
-+ if (!source_ip) {
-+ printk(KERN_ERR "netlog: network device %s has no local address, aborting.\n", dev);
-+ return -1;
-+ }
-+#define IP(x) ((unsigned char *)&source_ip)[x]
-+ printk(KERN_INFO "netlog: using source IP %u.%u.%u.%u\n",
-+ IP(3), IP(2), IP(1), IP(0));
-+#undef IP
-+ source_ip = htonl(source_ip);
-+ if (!source_port) {
-+ printk(KERN_ERR "netlog: source_port parameter not specified, aborting.\n");
-+ return -1;
-+ }
-+ printk(KERN_INFO "netlog: using source UDP port: %u\n", source_port);
-+ source_port = htons(source_port);
-+
-+ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) {
-+ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n");
-+ return -1;
-+ }
-+ if (netdump_target_ip) {
-+#define IP(x) ((unsigned char *)&netdump_target_ip)[x]
-+ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n",
-+ IP(3), IP(2), IP(1), IP(0));
-+#undef IP
-+ netdump_target_ip = htonl(netdump_target_ip);
-+ }
-+ if (netlog_target_ip) {
-+#define IP(x) ((unsigned char *)&netlog_target_ip)[x]
-+ printk(KERN_INFO "netlog: using netlog target IP %u.%u.%u.%u\n",
-+ IP(3), IP(2), IP(1), IP(0));
-+#undef IP
-+ netlog_target_ip = htonl(netlog_target_ip);
-+ }
-+ if (syslog_target_ip) {
-+ if (!syslog_target_port)
-+ syslog_target_port = 514;
-+#define IP(x) ((unsigned char *)&syslog_target_ip)[x]
-+ printk("netlog: using syslog target IP %u.%u.%u.%u, port: %d\n", IP(3), IP(2), IP(1), IP(0), syslog_target_port);
-+#undef IP
-+ syslog_target_ip = htonl(syslog_target_ip);
-+ syslog_target_port = htons(syslog_target_port);
-+ }
-+ if (!netdump_target_port && !netlog_target_port && !syslog_target_port) {
-+ printk(KERN_ERR "netlog: target_port parameter not specified, aborting.\n");
-+ return -1;
-+ }
-+ if (netdump_target_port) {
-+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netdump_target_port);
-+ netdump_target_port = htons(netdump_target_port);
-+ }
-+ if (netlog_target_port) {
-+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netlog_target_port);
-+ netlog_target_port = htons(netlog_target_port);
-+ }
-+
-+ netdump_daddr[0] = netdump_target_eth_byte0;
-+ netdump_daddr[1] = netdump_target_eth_byte1;
-+ netdump_daddr[2] = netdump_target_eth_byte2;
-+ netdump_daddr[3] = netdump_target_eth_byte3;
-+ netdump_daddr[4] = netdump_target_eth_byte4;
-+ netdump_daddr[5] = netdump_target_eth_byte5;
-+
-+ if ((netdump_daddr[0] & netdump_daddr[1] & netdump_daddr[2] & netdump_daddr[3] & netdump_daddr[4] & netdump_daddr[5]) == 255)
-+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n");
-+ else
-+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
-+ netdump_daddr[0], netdump_daddr[1], netdump_daddr[2], netdump_daddr[3], netdump_daddr[4], netdump_daddr[5]);
-+
-+ netlog_daddr[0] = netlog_target_eth_byte0;
-+ netlog_daddr[1] = netlog_target_eth_byte1;
-+ netlog_daddr[2] = netlog_target_eth_byte2;
-+ netlog_daddr[3] = netlog_target_eth_byte3;
-+ netlog_daddr[4] = netlog_target_eth_byte4;
-+ netlog_daddr[5] = netlog_target_eth_byte5;
-+
-+ if ((netlog_daddr[0] & netlog_daddr[1] & netlog_daddr[2] & netlog_daddr[3] & netlog_daddr[4] & netlog_daddr[5]) == 255)
-+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n");
-+ else
-+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
-+ netlog_daddr[0], netlog_daddr[1], netlog_daddr[2], netlog_daddr[3], netlog_daddr[4], netlog_daddr[5]);
-+ syslog_daddr[0] = syslog_target_eth_byte0;
-+ syslog_daddr[1] = syslog_target_eth_byte1;
-+ syslog_daddr[2] = syslog_target_eth_byte2;
-+ syslog_daddr[3] = syslog_target_eth_byte3;
-+ syslog_daddr[4] = syslog_target_eth_byte4;
-+ syslog_daddr[5] = syslog_target_eth_byte5;
-+
-+ if ((syslog_daddr[0] & syslog_daddr[1] & syslog_daddr[2] & syslog_daddr[3] & syslog_daddr[4] & syslog_daddr[5]) == 255)
-+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send syslog packets.\n");
-+ else
-+ printk(KERN_INFO "netlog: using syslog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n",
-+ syslog_daddr[0], syslog_daddr[1], syslog_daddr[2], syslog_daddr[3], syslog_daddr[4], syslog_daddr[5]);
-+
-+ mhz_cycles = (unsigned long long)mhz * 1000000ULL;
-+ jiffy_cycles = (unsigned long long)mhz * (1000000/HZ);
-+
-+ INIT_LIST_HEAD(&request_list);
-+
-+ ndev->rx_hook = netconsole_rx_hook;
-+ netdump_func = netconsole_netdump;
-+ netconsole_dev = ndev;
-+#define STARTUP_MSG "[...network console startup...]\n"
-+ write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG));
-+
-+ register_console(&netconsole);
-+ printk(KERN_INFO "netlog: network logging started up successfully!\n");
-+ return 0;
-+}
-+
-+static void cleanup_netconsole(void)
-+{
-+ printk(KERN_INFO "netlog: network logging shut down.\n");
-+ unregister_console(&netconsole);
-+
-+#define SHUTDOWN_MSG "[...network console shutdown...]\n"
-+ write_netconsole_msg(NULL, SHUTDOWN_MSG, strlen(SHUTDOWN_MSG));
-+ netconsole_dev->rx_hook = NULL;
-+ netconsole_dev = NULL;
-+}
-+
-+module_init(init_netconsole);
-+module_exit(cleanup_netconsole);
-+
-+MODULE_LICENSE("GPL");
-+
-Index: linux-2.4.24/drivers/net/netconsole.h
-===================================================================
---- linux-2.4.24.orig/drivers/net/netconsole.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.4.24/drivers/net/netconsole.h 2004-05-07 16:58:39.000000000 -0400
-@@ -0,0 +1,81 @@
-+/*
-+ * linux/drivers/net/netconsole.h
-+ *
-+ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
-+ *
-+ * This file contains the implementation of an IRQ-safe, crash-safe
-+ * kernel console implementation that outputs kernel messages to the
-+ * network.
-+ *
-+ * Modification history:
-+ *
-+ * 2001-09-17 started by Ingo Molnar.
-+ */
-+
-+/****************************************************************
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2, or (at your option)
-+ * any later version.
-+ *
-+ * This program is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with this program; if not, write to the Free Software
-+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-+ *
-+ ****************************************************************/
-+
-+#define NETCONSOLE_VERSION 0x04
-+
-+enum netdump_commands {
-+ COMM_NONE = 0,
-+ COMM_SEND_MEM = 1,
-+ COMM_EXIT = 2,
-+ COMM_REBOOT = 3,
-+ COMM_HELLO = 4,
-+ COMM_GET_NR_PAGES = 5,
-+ COMM_GET_PAGE_SIZE = 6,
-+ COMM_START_NETDUMP_ACK = 7,
-+ COMM_GET_REGS = 8,
-+ COMM_SHOW_STATE = 9,
-+};
-+
-+#define NETDUMP_REQ_SIZE (8+4*4)
-+
-+typedef struct netdump_req_s {
-+ u64 magic;
-+ u32 nr;
-+ u32 command;
-+ u32 from;
-+ u32 to;
-+ struct list_head list;
-+} req_t;
-+
-+enum netdump_replies {
-+ REPLY_NONE = 0,
-+ REPLY_ERROR = 1,
-+ REPLY_LOG = 2,
-+ REPLY_MEM = 3,
-+ REPLY_RESERVED = 4,
-+ REPLY_HELLO = 5,
-+ REPLY_NR_PAGES = 6,
-+ REPLY_PAGE_SIZE = 7,
-+ REPLY_START_NETDUMP = 8,
-+ REPLY_END_NETDUMP = 9,
-+ REPLY_REGS = 10,
-+ REPLY_MAGIC = 11,
-+ REPLY_SHOW_STATE = 12,
-+};
-+
-+typedef struct netdump_reply_s {
-+ u32 nr;
-+ u32 code;
-+ u32 info;
-+} reply_t;
-+
-+#define HEADER_LEN (1 + sizeof(reply_t))
-+
-Index: linux-2.4.24/drivers/net/tlan.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/tlan.c 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/tlan.c 2004-05-07 16:58:39.000000000 -0400
-@@ -345,6 +345,8 @@
- static void TLan_EeReceiveByte( u16, u8 *, int );
- static int TLan_EeReadByte( struct net_device *, u8, u8 * );
-
-+static void TLan_Poll(struct net_device *);
-+
-
- static void
- TLan_StoreSKB( struct tlan_list_tag *tag, struct sk_buff *skb)
-@@ -891,6 +893,9 @@
- dev->get_stats = &TLan_GetStats;
- dev->set_multicast_list = &TLan_SetMulticastList;
- dev->do_ioctl = &TLan_ioctl;
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = &TLan_Poll;
-+#endif
- dev->tx_timeout = &TLan_tx_timeout;
- dev->watchdog_timeo = TX_TIMEOUT;
-
-@@ -1176,7 +1181,14 @@
-
- } /* TLan_HandleInterrupts */
-
--
-+#ifdef HAVE_POLL_CONTROLLER
-+static void TLan_Poll(struct net_device *dev)
-+{
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ TLan_HandleInterrupt(dev->irq, dev, NULL);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+#endif
-
-
- /***************************************************************
-Index: linux-2.4.24/drivers/net/tulip/tulip_core.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/tulip/tulip_core.c 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/tulip/tulip_core.c 2004-05-07 16:58:39.000000000 -0400
-@@ -266,6 +266,7 @@
- static struct net_device_stats *tulip_get_stats(struct net_device *dev);
- static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
- static void set_rx_mode(struct net_device *dev);
-+static void poll_tulip(struct net_device *dev);
-
-
-
-@@ -1728,6 +1729,9 @@
- dev->get_stats = tulip_get_stats;
- dev->do_ioctl = private_ioctl;
- dev->set_multicast_list = set_rx_mode;
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = &poll_tulip;
-+#endif
-
- if (register_netdev(dev))
- goto err_out_free_ring;
-@@ -1902,6 +1906,24 @@
- }
-
-
-+#ifdef HAVE_POLL_CONTROLLER
-+
-+/*
-+ * Polling 'interrupt' - used by things like netconsole to send skbs
-+ * without having to re-enable interrupts. It's not called while
-+ * the interrupt routine is executing.
-+ */
-+
-+static void poll_tulip (struct net_device *dev)
-+{
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ tulip_interrupt (dev->irq, dev, NULL);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+
-+#endif
-+
-+
- static struct pci_driver tulip_driver = {
- name: DRV_NAME,
- id_table: tulip_pci_tbl,
-Index: linux-2.4.24/drivers/net/e100/e100_main.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/e100/e100_main.c 2004-05-07 16:58:39.000000000 -0400
-+++ linux-2.4.24/drivers/net/e100/e100_main.c 2004-05-07 17:00:21.000000000 -0400
-@@ -664,6 +664,10 @@
- goto err_unregister_netdev;
- }
-
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = e100_netpoll;
-+#endif
-+
- e100nics++;
-
- e100_get_speed_duplex_caps(bdp);
-Index: linux-2.4.24/drivers/net/e1000/e1000_main.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/e1000/e1000_main.c 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/e1000/e1000_main.c 2004-05-07 16:58:39.000000000 -0400
-@@ -182,6 +182,9 @@
- static int e1000_resume(struct pci_dev *pdev);
- #endif
-
-+/* for netdump / net console */
-+static void e1000_netpoll (struct net_device *dev);
-+
- struct notifier_block e1000_notifier_reboot = {
- .notifier_call = e1000_notify_reboot,
- .next = NULL,
-@@ -434,6 +437,10 @@
- netdev->vlan_rx_add_vid = e1000_vlan_rx_add_vid;
- netdev->vlan_rx_kill_vid = e1000_vlan_rx_kill_vid;
-
-+#ifdef HAVE_POLL_CONTROLLER
-+ netdev->poll_controller = e1000_netpoll;
-+#endif
-+
- netdev->irq = pdev->irq;
- netdev->mem_start = mmio_start;
- netdev->mem_end = mmio_start + mmio_len;
-@@ -2899,4 +2906,20 @@
- }
- #endif
-
-+#ifdef HAVE_POLL_CONTROLLER
-+/*
-+ * Polling 'interrupt' - used by things like netconsole to send skbs
-+ * without having to re-enable interrupts. It's not called while
-+ * the interrupt routine is executing.
-+ */
-+
-+static void e1000_netpoll (struct net_device *dev)
-+{
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ e1000_intr (dev->irq, dev, NULL);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+
-+#endif
-+
- /* e1000_main.c */
-Index: linux-2.4.24/drivers/net/tg3.c
-===================================================================
---- linux-2.4.24.orig/drivers/net/tg3.c 2003-11-28 13:26:20.000000000 -0500
-+++ linux-2.4.24/drivers/net/tg3.c 2004-05-07 16:58:39.000000000 -0400
-@@ -216,6 +216,9 @@
- #define tr16(reg) readw(tp->regs + (reg))
- #define tr8(reg) readb(tp->regs + (reg))
-
-+/* Added by mark.fasheh@oracle.com to help enable netdump on these cards */
-+static void poll_tg3 (struct net_device *dev);
-+
- static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val)
- {
- unsigned long flags;
-@@ -7630,6 +7633,9 @@
- dev->watchdog_timeo = TG3_TX_TIMEOUT;
- dev->change_mtu = tg3_change_mtu;
- dev->irq = pdev->irq;
-+#ifdef HAVE_POLL_CONTROLLER
-+ dev->poll_controller = &poll_tg3;
-+#endif
-
- err = tg3_get_invariants(tp);
- if (err) {
-@@ -7862,5 +7868,23 @@
- pci_unregister_driver(&tg3_driver);
- }
-
-+#ifdef HAVE_POLL_CONTROLLER
-+
-+/*
-+ * Polling 'interrupt' - used by things like netconsole to send skbs
-+ * without having to re-enable interrupts. It's not called while
-+ * the interrupt routine is executing.
-+ */
-+
-+static void poll_tg3 (struct net_device *dev)
-+{
-+ if (!netdump_mode) disable_irq(dev->irq);
-+ tg3_interrupt (dev->irq, dev, NULL);
-+ if (!netdump_mode) enable_irq(dev->irq);
-+}
-+
-+#endif
-+
-+
- module_init(tg3_init);
- module_exit(tg3_cleanup);
-Index: linux-2.4.24/include/asm-i386/kmap_types.h
-===================================================================
---- linux-2.4.24.orig/include/asm-i386/kmap_types.h 2003-08-25 07:44:43.000000000 -0400
-+++ linux-2.4.24/include/asm-i386/kmap_types.h 2004-05-07 16:59:12.000000000 -0400
-@@ -10,6 +10,7 @@
- KM_BH_IRQ,
- KM_SOFTIRQ0,
- KM_SOFTIRQ1,
-+ KM_NETDUMP,
- KM_TYPE_NR
- };
-
-Index: linux-2.4.24/include/linux/kernel.h
-===================================================================
---- linux-2.4.24.orig/include/linux/kernel.h 2004-05-07 16:56:55.000000000 -0400
-+++ linux-2.4.24/include/linux/kernel.h 2004-05-07 16:58:39.000000000 -0400
-@@ -104,6 +104,9 @@
-
- extern void bust_spinlocks(int yes);
- extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
-+struct pt_regs;
-+extern void (*netdump_func) (struct pt_regs *regs);
-+extern int netdump_mode;
-
- extern int tainted;
- extern const char *print_tainted(void);
-Index: linux-2.4.24/include/linux/netdevice.h
-===================================================================
---- linux-2.4.24.orig/include/linux/netdevice.h 2003-11-28 13:26:21.000000000 -0500
-+++ linux-2.4.24/include/linux/netdevice.h 2004-05-07 16:58:39.000000000 -0400
-@@ -435,6 +435,9 @@
- unsigned char *haddr);
- int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
- int (*accept_fastpath)(struct net_device *, struct dst_entry*);
-+#define HAVE_POLL_CONTROLLER
-+ void (*poll_controller)(struct net_device *dev);
-+ int (*rx_hook)(struct sk_buff *skb);
-
- /* open/release and usage marking */
- struct module *owner;
-Index: linux-2.4.24/kernel/panic.c
-===================================================================
---- linux-2.4.24.orig/kernel/panic.c 2004-05-07 16:56:56.000000000 -0400
-+++ linux-2.4.24/kernel/panic.c 2004-05-07 16:58:39.000000000 -0400
-@@ -62,6 +62,8 @@
- vsprintf(buf, fmt, args);
- va_end(args);
- printk(KERN_EMERG "Kernel panic: %s\n",buf);
-+ if (netdump_func)
-+ BUG();
- if (in_interrupt())
- printk(KERN_EMERG "In interrupt handler - not syncing\n");
- else if (!current->pid)
-Index: linux-2.4.24/net/core/dev.c
-===================================================================
---- linux-2.4.24.orig/net/core/dev.c 2003-11-28 13:26:21.000000000 -0500
-+++ linux-2.4.24/net/core/dev.c 2004-05-07 16:58:39.000000000 -0400
-@@ -1288,6 +1288,13 @@
-
- local_irq_save(flags);
-
-+ if (unlikely(skb->dev->rx_hook != NULL)) {
-+ int ret;
-+
-+ ret = skb->dev->rx_hook(skb);
-+ if (ret == NET_RX_DROP)
-+ goto drop;
-+ }
- netdev_rx_stat[this_cpu].total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
+++ /dev/null
- Documentation/Configure.help | 66 ++
- arch/alpha/defconfig | 7
- arch/alpha/kernel/entry.S | 12
- arch/arm/defconfig | 7
- arch/arm/kernel/calls.S | 24
- arch/i386/defconfig | 7
- arch/ia64/defconfig | 7
- arch/ia64/kernel/entry.S | 24
- arch/m68k/defconfig | 7
- arch/mips/defconfig | 7
- arch/mips64/defconfig | 7
- arch/ppc/defconfig | 14
- arch/ppc64/kernel/misc.S | 2
- arch/s390/defconfig | 7
- arch/s390/kernel/entry.S | 24
- arch/s390x/defconfig | 7
- arch/s390x/kernel/entry.S | 24
- arch/s390x/kernel/wrapper32.S | 92 +++
- arch/sparc/defconfig | 7
- arch/sparc/kernel/systbls.S | 10
- arch/sparc64/defconfig | 7
- arch/sparc64/kernel/systbls.S | 20
- fs/Config.in | 14
- fs/Makefile | 3
- fs/ext2/Makefile | 4
- fs/ext2/file.c | 5
- fs/ext2/ialloc.c | 2
- fs/ext2/inode.c | 34 -
- fs/ext2/namei.c | 14
- fs/ext2/super.c | 29
- fs/ext2/symlink.c | 14
- fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++
- fs/ext2/xattr_user.c | 103 +++
- fs/ext3/Makefile | 10
- fs/ext3/file.c | 5
- fs/ext3/ialloc.c | 2
- fs/ext3/inode.c | 35 -
- fs/ext3/namei.c | 21
- fs/ext3/super.c | 36 +
- fs/ext3/symlink.c | 14
- fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++
- fs/ext3/xattr_user.c | 111 +++
- fs/jfs/jfs_xattr.h | 6
- fs/jfs/xattr.c | 6
- fs/mbcache.c | 648 ++++++++++++++++++++++
- include/asm-arm/unistd.h | 2
- include/asm-ia64/unistd.h | 13
- include/asm-ppc64/unistd.h | 2
- include/asm-s390/unistd.h | 15
- include/asm-s390x/unistd.h | 15
- include/asm-sparc/unistd.h | 24
- include/asm-sparc64/unistd.h | 24
- include/linux/cache_def.h | 15
- include/linux/errno.h | 4
- include/linux/ext2_fs.h | 31 -
- include/linux/ext2_xattr.h | 157 +++++
- include/linux/ext3_fs.h | 31 -
- include/linux/ext3_jbd.h | 8
- include/linux/ext3_xattr.h | 157 +++++
- include/linux/fs.h | 2
- include/linux/mbcache.h | 69 ++
- kernel/ksyms.c | 4
- mm/vmscan.c | 35 +
- fs/ext3/ext3-exports.c | 14 +
- 64 files changed, 4355 insertions(+), 195 deletions(-)
-
-Index: linux-DRV401/arch/ppc/defconfig
-===================================================================
---- linux-DRV401.orig/arch/ppc/defconfig 2004-10-15 10:24:32.000000000 -0700
-+++ linux-DRV401/arch/ppc/defconfig 2004-10-15 11:03:51.000000000 -0700
-@@ -1,6 +1,13 @@
- #
- # Automatically generated by make menuconfig: don't edit
- #
-+CONFIG_EXT3_FS_XATTR=y
-+# CONFIG_EXT3_FS_XATTR_SHARING is not set
-+# CONFIG_EXT3_FS_XATTR_USER is not set
-+# CONFIG_EXT2_FS_XATTR is not set
-+# CONFIG_EXT2_FS_XATTR_SHARING is not set
-+# CONFIG_EXT2_FS_XATTR_USER is not set
-+# CONFIG_FS_MBCACHE is not set
- # CONFIG_UID16 is not set
- # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
- CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-Index: linux-DRV401/fs/Config.in
-===================================================================
---- linux-DRV401.orig/fs/Config.in 2004-10-15 10:24:06.000000000 -0700
-+++ linux-DRV401/fs/Config.in 2004-10-15 11:03:51.000000000 -0700
-@@ -22,6 +22,11 @@
- dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
-
- tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
-+dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
-+dep_bool ' Ext3 extended attribute block sharing' \
-+ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
-+dep_bool ' Ext3 extended user attributes' \
-+ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
- # CONFIG_JBD could be its own option (even modular), but until there are
- # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
- # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
-@@ -77,6 +82,11 @@
- tristate 'ROM file system support' CONFIG_ROMFS_FS
-
- tristate 'Second extended fs support' CONFIG_EXT2_FS
-+dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
-+dep_bool ' Ext2 extended attribute block sharing' \
-+ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
-+dep_bool ' Ext2 extended user attributes' \
-+ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
-
- tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
-
-@@ -156,6 +166,10 @@
- fi
- fi
-
-+# Meta block cache for Extended Attributes (ext2/ext3)
-+#tristate 'Meta block cache' CONFIG_FS_MBCACHE
-+define_tristate CONFIG_FS_MBCACHE y
-+
- mainmenu_option next_comment
- comment 'Partition Types'
- source fs/partitions/Config.in
-Index: linux-DRV401/fs/Makefile
-===================================================================
---- linux-DRV401.orig/fs/Makefile 2004-10-15 10:39:15.000000000 -0700
-+++ linux-DRV401/fs/Makefile 2004-10-15 11:03:51.000000000 -0700
-@@ -14,7 +14,7 @@
- super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \
- fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
- dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
-- filesystems.o namespace.o seq_file.o quota.o
-+ filesystems.o namespace.o seq_file.o quota.o xattr.o
-
- ifeq ($(CONFIG_QUOTA),y)
- obj-y += dquot.o
-@@ -76,6 +76,9 @@
-
- obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
-
-+export-objs += mbcache.o
-+obj-$(CONFIG_FS_MBCACHE) += mbcache.o
-+
- # persistent filesystems
- obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
-
-Index: linux-DRV401/fs/ext2/Makefile
-===================================================================
---- linux-DRV401.orig/fs/ext2/Makefile 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/Makefile 2004-10-15 11:03:51.000000000 -0700
-@@ -13,4 +13,8 @@
- ioctl.o namei.o super.o symlink.o
- obj-m := $(O_TARGET)
-
-+export-objs += xattr.o
-+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
-+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
-+
- include $(TOPDIR)/Rules.make
-Index: linux-DRV401/fs/ext2/file.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/file.c 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/file.c 2004-10-15 11:03:51.000000000 -0700
-@@ -20,6 +20,7 @@
-
- #include <linux/fs.h>
- #include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
- #include <linux/sched.h>
-
- /*
-@@ -51,4 +52,8 @@
-
- struct inode_operations ext2_file_inode_operations = {
- truncate: ext2_truncate,
-+ setxattr: ext2_setxattr,
-+ getxattr: ext2_getxattr,
-+ listxattr: ext2_listxattr,
-+ removexattr: ext2_removexattr,
- };
-Index: linux-DRV401/fs/ext2/ialloc.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/ialloc.c 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/ialloc.c 2004-10-15 11:03:51.000000000 -0700
-@@ -15,6 +15,7 @@
- #include <linux/config.h>
- #include <linux/fs.h>
- #include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
- #include <linux/locks.h>
- #include <linux/quotaops.h>
-
-@@ -167,6 +168,7 @@
- */
- if (!is_bad_inode(inode)) {
- /* Quota is already initialized in iput() */
-+ ext2_xattr_delete_inode(inode);
- DQUOT_FREE_INODE(inode);
- DQUOT_DROP(inode);
- }
-Index: linux-DRV401/fs/ext2/inode.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/inode.c 2004-10-15 10:24:00.000000000 -0700
-+++ linux-DRV401/fs/ext2/inode.c 2004-10-15 11:03:51.000000000 -0700
-@@ -39,6 +39,18 @@
- static int ext2_update_inode(struct inode * inode, int do_sync);
-
- /*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext2_inode_is_fast_symlink(struct inode *inode)
-+{
-+ int ea_blocks = inode->u.ext2_i.i_file_acl ?
-+ (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+ return (S_ISLNK(inode->i_mode) &&
-+ inode->i_blocks - ea_blocks == 0);
-+}
-+
-+/*
- * Called at each iput()
- */
- void ext2_put_inode (struct inode * inode)
-@@ -53,9 +65,7 @@
- {
- lock_kernel();
-
-- if (is_bad_inode(inode) ||
-- inode->i_ino == EXT2_ACL_IDX_INO ||
-- inode->i_ino == EXT2_ACL_DATA_INO)
-+ if (is_bad_inode(inode))
- goto no_delete;
- inode->u.ext2_i.i_dtime = CURRENT_TIME;
- mark_inode_dirty(inode);
-@@ -792,6 +802,8 @@
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
-+ if (ext2_inode_is_fast_symlink(inode))
-+ return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
-@@ -879,8 +891,7 @@
- unsigned long offset;
- struct ext2_group_desc * gdp;
-
-- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
-- inode->i_ino != EXT2_ACL_DATA_INO &&
-+ if ((inode->i_ino != EXT2_ROOT_INO &&
- inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
- ext2_error (inode->i_sb, "ext2_read_inode",
-@@ -965,10 +976,7 @@
- for (block = 0; block < EXT2_N_BLOCKS; block++)
- inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
-
-- if (inode->i_ino == EXT2_ACL_IDX_INO ||
-- inode->i_ino == EXT2_ACL_DATA_INO)
-- /* Nothing to do */ ;
-- else if (S_ISREG(inode->i_mode)) {
-+ if (S_ISREG(inode->i_mode)) {
- inode->i_op = &ext2_file_inode_operations;
- inode->i_fop = &ext2_file_operations;
- inode->i_mapping->a_ops = &ext2_aops;
-@@ -977,15 +985,17 @@
- inode->i_fop = &ext2_dir_operations;
- inode->i_mapping->a_ops = &ext2_aops;
- } else if (S_ISLNK(inode->i_mode)) {
-- if (!inode->i_blocks)
-+ if (ext2_inode_is_fast_symlink(inode))
- inode->i_op = &ext2_fast_symlink_inode_operations;
- else {
-- inode->i_op = &page_symlink_inode_operations;
-+ inode->i_op = &ext2_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext2_aops;
- }
-- } else
-+ } else {
-+ inode->i_op = &ext2_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- le32_to_cpu(raw_inode->i_block[0]));
-+ }
- brelse (bh);
- inode->i_attr_flags = 0;
- if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
-Index: linux-DRV401/fs/ext2/namei.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/namei.c 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/namei.c 2004-10-15 11:03:51.000000000 -0700
-@@ -31,6 +31,7 @@
-
- #include <linux/fs.h>
- #include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
- #include <linux/pagemap.h>
-
- /*
-@@ -136,7 +137,7 @@
-
- if (l > sizeof (inode->u.ext2_i.i_data)) {
- /* slow symlink */
-- inode->i_op = &page_symlink_inode_operations;
-+ inode->i_op = &ext2_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext2_aops;
- err = block_symlink(inode, symname, l);
- if (err)
-@@ -345,4 +346,15 @@
- rmdir: ext2_rmdir,
- mknod: ext2_mknod,
- rename: ext2_rename,
-+ setxattr: ext2_setxattr,
-+ getxattr: ext2_getxattr,
-+ listxattr: ext2_listxattr,
-+ removexattr: ext2_removexattr,
-+};
-+
-+struct inode_operations ext2_special_inode_operations = {
-+ setxattr: ext2_setxattr,
-+ getxattr: ext2_getxattr,
-+ listxattr: ext2_listxattr,
-+ removexattr: ext2_removexattr,
- };
-Index: linux-DRV401/fs/ext2/super.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/super.c 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/super.c 2004-10-15 11:03:51.000000000 -0700
-@@ -21,6 +21,7 @@
- #include <linux/string.h>
- #include <linux/fs.h>
- #include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
- #include <linux/slab.h>
- #include <linux/init.h>
- #include <linux/locks.h>
-@@ -125,6 +126,7 @@
- int db_count;
- int i;
-
-+ ext2_xattr_put_super(sb);
- if (!(sb->s_flags & MS_RDONLY)) {
- struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-
-@@ -175,6 +177,13 @@
- this_char = strtok (NULL, ",")) {
- if ((value = strchr (this_char, '=')) != NULL)
- *value++ = 0;
-+#ifdef CONFIG_EXT2_FS_XATTR_USER
-+ if (!strcmp (this_char, "user_xattr"))
-+ set_opt (*mount_options, XATTR_USER);
-+ else if (!strcmp (this_char, "nouser_xattr"))
-+ clear_opt (*mount_options, XATTR_USER);
-+ else
-+#endif
- if (!strcmp (this_char, "bsddf"))
- clear_opt (*mount_options, MINIX_DF);
- else if (!strcmp (this_char, "nouid32")) {
-@@ -424,6 +433,9 @@
- blocksize = BLOCK_SIZE;
-
- sb->u.ext2_sb.s_mount_opt = 0;
-+#ifdef CONFIG_EXT2_FS_XATTR_USER
-+ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
-+#endif
- if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
- &sb->u.ext2_sb.s_mount_opt)) {
- return NULL;
-@@ -810,12 +822,27 @@
-
- static int __init init_ext2_fs(void)
- {
-- return register_filesystem(&ext2_fs_type);
-+ int error = init_ext2_xattr();
-+ if (error)
-+ return error;
-+ error = init_ext2_xattr_user();
-+ if (error)
-+ goto fail;
-+ error = register_filesystem(&ext2_fs_type);
-+ if (!error)
-+ return 0;
-+
-+ exit_ext2_xattr_user();
-+fail:
-+ exit_ext2_xattr();
-+ return error;
- }
-
- static void __exit exit_ext2_fs(void)
- {
- unregister_filesystem(&ext2_fs_type);
-+ exit_ext2_xattr_user();
-+ exit_ext2_xattr();
- }
-
- EXPORT_NO_SYMBOLS;
-Index: linux-DRV401/fs/ext2/symlink.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/symlink.c 2004-10-15 10:23:59.000000000 -0700
-+++ linux-DRV401/fs/ext2/symlink.c 2004-10-15 11:03:51.000000000 -0700
-@@ -19,6 +19,7 @@
-
- #include <linux/fs.h>
- #include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
-
- static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
- {
-@@ -32,7 +33,20 @@
- return vfs_follow_link(nd, s);
- }
-
-+struct inode_operations ext2_symlink_inode_operations = {
-+ readlink: page_readlink,
-+ follow_link: page_follow_link,
-+ setxattr: ext2_setxattr,
-+ getxattr: ext2_getxattr,
-+ listxattr: ext2_listxattr,
-+ removexattr: ext2_removexattr,
-+};
-+
- struct inode_operations ext2_fast_symlink_inode_operations = {
- readlink: ext2_readlink,
- follow_link: ext2_follow_link,
-+ setxattr: ext2_setxattr,
-+ getxattr: ext2_getxattr,
-+ listxattr: ext2_listxattr,
-+ removexattr: ext2_removexattr,
- };
-Index: linux-DRV401/fs/ext2/xattr.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/xattr.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/ext2/xattr.c 2004-10-15 11:03:51.000000000 -0700
-@@ -0,0 +1,1212 @@
-+/*
-+ * linux/fs/ext2/xattr.c
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ *
-+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
-+ * Extended attributes for symlinks and special files added per
-+ * suggestion of Luka Renko <luka.renko@hermes.si>.
-+ */
-+
-+/*
-+ * Extended attributes are stored on disk blocks allocated outside of
-+ * any inode. The i_file_acl field is then made to point to this allocated
-+ * block. If all extended attributes of an inode are identical, these
-+ * inodes may share the same extended attribute block. Such situations
-+ * are automatically detected by keeping a cache of recent attribute block
-+ * numbers and hashes over the block's contents in memory.
-+ *
-+ *
-+ * Extended attribute block layout:
-+ *
-+ * +------------------+
-+ * | header |
-+ * | entry 1 | |
-+ * | entry 2 | | growing downwards
-+ * | entry 3 | v
-+ * | four null bytes |
-+ * | . . . |
-+ * | value 1 | ^
-+ * | value 3 | | growing upwards
-+ * | value 2 | |
-+ * +------------------+
-+ *
-+ * The block header is followed by multiple entry descriptors. These entry
-+ * descriptors are variable in size, and alligned to EXT2_XATTR_PAD
-+ * byte boundaries. The entry descriptors are sorted by attribute name,
-+ * so that two extended attribute blocks can be compared efficiently.
-+ *
-+ * Attribute values are aligned to the end of the block, stored in
-+ * no specific order. They are also padded to EXT2_XATTR_PAD byte
-+ * boundaries. No additional gaps are left between them.
-+ *
-+ * Locking strategy
-+ * ----------------
-+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
-+ * the xattr inode operations are called, so we are guaranteed that only one
-+ * processes accesses extended attributes of an inode at any time.
-+ *
-+ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
-+ * only a single process is modifying an extended attribute block, even
-+ * if the block is shared among inodes.
-+ *
-+ * Note for porting to 2.5
-+ * -----------------------
-+ * The BKL will no longer be held in the xattr inode operations.
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/locks.h>
-+#include <linux/slab.h>
-+#include <linux/fs.h>
-+#include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
-+#include <linux/mbcache.h>
-+#include <linux/quotaops.h>
-+#include <asm/semaphore.h>
-+#include <linux/compatmac.h>
-+
-+/* These symbols may be needed by a module. */
-+EXPORT_SYMBOL(ext2_xattr_register);
-+EXPORT_SYMBOL(ext2_xattr_unregister);
-+EXPORT_SYMBOL(ext2_xattr_get);
-+EXPORT_SYMBOL(ext2_xattr_list);
-+EXPORT_SYMBOL(ext2_xattr_set);
-+
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
-+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
-+#endif
-+
-+#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
-+#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
-+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
-+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-+
-+#ifdef EXT2_XATTR_DEBUG
-+# define ea_idebug(inode, f...) do { \
-+ printk(KERN_DEBUG "inode %s:%ld: ", \
-+ kdevname(inode->i_dev), inode->i_ino); \
-+ printk(f); \
-+ printk("\n"); \
-+ } while (0)
-+# define ea_bdebug(bh, f...) do { \
-+ printk(KERN_DEBUG "block %s:%ld: ", \
-+ kdevname(bh->b_dev), bh->b_blocknr); \
-+ printk(f); \
-+ printk("\n"); \
-+ } while (0)
-+#else
-+# define ea_idebug(f...)
-+# define ea_bdebug(f...)
-+#endif
-+
-+static int ext2_xattr_set2(struct inode *, struct buffer_head *,
-+ struct ext2_xattr_header *);
-+
-+#ifdef CONFIG_EXT2_FS_XATTR_SHARING
-+
-+static int ext2_xattr_cache_insert(struct buffer_head *);
-+static struct buffer_head *ext2_xattr_cache_find(struct inode *,
-+ struct ext2_xattr_header *);
-+static void ext2_xattr_cache_remove(struct buffer_head *);
-+static void ext2_xattr_rehash(struct ext2_xattr_header *,
-+ struct ext2_xattr_entry *);
-+
-+static struct mb_cache *ext2_xattr_cache;
-+
-+#else
-+# define ext2_xattr_cache_insert(bh) 0
-+# define ext2_xattr_cache_find(inode, header) NULL
-+# define ext2_xattr_cache_remove(bh) while(0) {}
-+# define ext2_xattr_rehash(header, entry) while(0) {}
-+#endif
-+
-+/*
-+ * If a file system does not share extended attributes among inodes,
-+ * we should not need the ext2_xattr_sem semaphore. However, the
-+ * filesystem may still contain shared blocks, so we always take
-+ * the lock.
-+ */
-+
-+DECLARE_MUTEX(ext2_xattr_sem);
-+
-+static inline int
-+ext2_xattr_new_block(struct inode *inode, int * errp, int force)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
-+ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
-+
-+ /* How can we enforce the allocation? */
-+ int block = ext2_new_block(inode, goal, 0, 0, errp);
-+#ifdef OLD_QUOTAS
-+ if (!*errp)
-+ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
-+#endif
-+ return block;
-+}
-+
-+static inline int
-+ext2_xattr_quota_alloc(struct inode *inode, int force)
-+{
-+ /* How can we enforce the allocation? */
-+#ifdef OLD_QUOTAS
-+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
-+ if (!error)
-+ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
-+#else
-+ int error = DQUOT_ALLOC_BLOCK(inode, 1);
-+#endif
-+ return error;
-+}
-+
-+#ifdef OLD_QUOTAS
-+
-+static inline void
-+ext2_xattr_quota_free(struct inode *inode)
-+{
-+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
-+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
-+}
-+
-+static inline void
-+ext2_xattr_free_block(struct inode * inode, unsigned long block)
-+{
-+ ext2_free_blocks(inode, block, 1);
-+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
-+}
-+
-+#else
-+# define ext2_xattr_quota_free(inode) \
-+ DQUOT_FREE_BLOCK(inode, 1)
-+# define ext2_xattr_free_block(inode, block) \
-+ ext2_free_blocks(inode, block, 1)
-+#endif
-+
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
-+
-+static inline struct buffer_head *
-+sb_bread(struct super_block *sb, int block)
-+{
-+ return bread(sb->s_dev, block, sb->s_blocksize);
-+}
-+
-+static inline struct buffer_head *
-+sb_getblk(struct super_block *sb, int block)
-+{
-+ return getblk(sb->s_dev, block, sb->s_blocksize);
-+}
-+
-+#endif
-+
-+struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
-+rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
-+
-+int
-+ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
-+{
-+ int error = -EINVAL;
-+
-+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
-+ write_lock(&ext2_handler_lock);
-+ if (!ext2_xattr_handlers[name_index-1]) {
-+ ext2_xattr_handlers[name_index-1] = handler;
-+ error = 0;
-+ }
-+ write_unlock(&ext2_handler_lock);
-+ }
-+ return error;
-+}
-+
-+void
-+ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
-+{
-+ if (name_index > 0 || name_index <= EXT2_XATTR_INDEX_MAX) {
-+ write_lock(&ext2_handler_lock);
-+ ext2_xattr_handlers[name_index-1] = NULL;
-+ write_unlock(&ext2_handler_lock);
-+ }
-+}
-+
-+static inline const char *
-+strcmp_prefix(const char *a, const char *a_prefix)
-+{
-+ while (*a_prefix && *a == *a_prefix) {
-+ a++;
-+ a_prefix++;
-+ }
-+ return *a_prefix ? NULL : a;
-+}
-+
-+/*
-+ * Decode the extended attribute name, and translate it into
-+ * the name_index and name suffix.
-+ */
-+static struct ext2_xattr_handler *
-+ext2_xattr_resolve_name(const char **name)
-+{
-+ struct ext2_xattr_handler *handler = NULL;
-+ int i;
-+
-+ if (!*name)
-+ return NULL;
-+ read_lock(&ext2_handler_lock);
-+ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
-+ if (ext2_xattr_handlers[i]) {
-+ const char *n = strcmp_prefix(*name,
-+ ext2_xattr_handlers[i]->prefix);
-+ if (n) {
-+ handler = ext2_xattr_handlers[i];
-+ *name = n;
-+ break;
-+ }
-+ }
-+ }
-+ read_unlock(&ext2_handler_lock);
-+ return handler;
-+}
-+
-+static inline struct ext2_xattr_handler *
-+ext2_xattr_handler(int name_index)
-+{
-+ struct ext2_xattr_handler *handler = NULL;
-+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
-+ read_lock(&ext2_handler_lock);
-+ handler = ext2_xattr_handlers[name_index-1];
-+ read_unlock(&ext2_handler_lock);
-+ }
-+ return handler;
-+}
-+
-+/*
-+ * Inode operation getxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+ssize_t
-+ext2_getxattr(struct dentry *dentry, const char *name,
-+ void *buffer, size_t size)
-+{
-+ struct ext2_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ handler = ext2_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->get(inode, name, buffer, size);
-+}
-+
-+/*
-+ * Inode operation listxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+ssize_t
-+ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-+{
-+ return ext2_xattr_list(dentry->d_inode, buffer, size);
-+}
-+
-+/*
-+ * Inode operation setxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+int
-+ext2_setxattr(struct dentry *dentry, const char *name,
-+ const void *value, size_t size, int flags)
-+{
-+ struct ext2_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ if (size == 0)
-+ value = ""; /* empty EA, do not remove */
-+ handler = ext2_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->set(inode, name, value, size, flags);
-+}
-+
-+/*
-+ * Inode operation removexattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+int
-+ext2_removexattr(struct dentry *dentry, const char *name)
-+{
-+ struct ext2_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ handler = ext2_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
-+}
-+
-+/*
-+ * ext2_xattr_get()
-+ *
-+ * Copy an extended attribute into the buffer
-+ * provided, or compute the buffer size required.
-+ * Buffer is NULL to compute the size of the buffer required.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext2_xattr_get(struct inode *inode, int name_index, const char *name,
-+ void *buffer, size_t buffer_size)
-+{
-+ struct buffer_head *bh = NULL;
-+ struct ext2_xattr_entry *entry;
-+ unsigned int block, size;
-+ char *end;
-+ int name_len, error;
-+
-+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
-+ name_index, name, buffer, (long)buffer_size);
-+
-+ if (name == NULL)
-+ return -EINVAL;
-+ if (!EXT2_I(inode)->i_file_acl)
-+ return -ENOATTR;
-+ block = EXT2_I(inode)->i_file_acl;
-+ ea_idebug(inode, "reading block %d", block);
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh)
-+ return -EIO;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+ end = bh->b_data + bh->b_size;
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* find named attribute */
-+ name_len = strlen(name);
-+
-+ error = -ERANGE;
-+ if (name_len > 255)
-+ goto cleanup;
-+ entry = FIRST_ENTRY(bh);
-+ while (!IS_LAST_ENTRY(entry)) {
-+ struct ext2_xattr_entry *next =
-+ EXT2_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (name_index == entry->e_name_index &&
-+ name_len == entry->e_name_len &&
-+ memcmp(name, entry->e_name, name_len) == 0)
-+ goto found;
-+ entry = next;
-+ }
-+ /* Check the remaining name entries */
-+ while (!IS_LAST_ENTRY(entry)) {
-+ struct ext2_xattr_entry *next =
-+ EXT2_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ entry = next;
-+ }
-+ if (ext2_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ error = -ENOATTR;
-+ goto cleanup;
-+found:
-+ /* check the buffer size */
-+ if (entry->e_value_block != 0)
-+ goto bad_block;
-+ size = le32_to_cpu(entry->e_value_size);
-+ if (size > inode->i_sb->s_blocksize ||
-+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
-+ goto bad_block;
-+
-+ if (ext2_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ if (buffer) {
-+ error = -ERANGE;
-+ if (size > buffer_size)
-+ goto cleanup;
-+ /* return value of attribute */
-+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-+ size);
-+ }
-+ error = size;
-+
-+cleanup:
-+ brelse(bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * ext2_xattr_list()
-+ *
-+ * Copy a list of attribute names into the buffer
-+ * provided, or compute the buffer size required.
-+ * Buffer is NULL to compute the size of the buffer required.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
-+{
-+ struct buffer_head *bh = NULL;
-+ struct ext2_xattr_entry *entry;
-+ unsigned int block, size = 0;
-+ char *buf, *end;
-+ int error;
-+
-+ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
-+ buffer, (long)buffer_size);
-+
-+ if (!EXT2_I(inode)->i_file_acl)
-+ return 0;
-+ block = EXT2_I(inode)->i_file_acl;
-+ ea_idebug(inode, "reading block %d", block);
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh)
-+ return -EIO;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+ end = bh->b_data + bh->b_size;
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* compute the size required for the list of attribute names */
-+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+ entry = EXT2_XATTR_NEXT(entry)) {
-+ struct ext2_xattr_handler *handler;
-+ struct ext2_xattr_entry *next =
-+ EXT2_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+
-+ handler = ext2_xattr_handler(entry->e_name_index);
-+ if (handler)
-+ size += handler->list(NULL, inode, entry->e_name,
-+ entry->e_name_len);
-+ }
-+
-+ if (ext2_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ if (!buffer) {
-+ error = size;
-+ goto cleanup;
-+ } else {
-+ error = -ERANGE;
-+ if (size > buffer_size)
-+ goto cleanup;
-+ }
-+
-+ /* list the attribute names */
-+ buf = buffer;
-+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+ entry = EXT2_XATTR_NEXT(entry)) {
-+ struct ext2_xattr_handler *handler;
-+
-+ handler = ext2_xattr_handler(entry->e_name_index);
-+ if (handler)
-+ buf += handler->list(buf, inode, entry->e_name,
-+ entry->e_name_len);
-+ }
-+ error = size;
-+
-+cleanup:
-+ brelse(bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
-+ * not set, set it.
-+ */
-+static void ext2_xattr_update_super_block(struct super_block *sb)
-+{
-+ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
-+ return;
-+
-+ lock_super(sb);
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
-+ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
-+#endif
-+ EXT2_SB(sb)->s_es->s_feature_compat |=
-+ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
-+ sb->s_dirt = 1;
-+ mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-+ unlock_super(sb);
-+}
-+
-+/*
-+ * ext2_xattr_set()
-+ *
-+ * Create, replace or remove an extended attribute for this inode. Buffer
-+ * is NULL to remove an existing extended attribute, and non-NULL to
-+ * either replace an existing extended attribute, or create a new extended
-+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
-+ * specify that an extended attribute must exist and must not exist
-+ * previous to the call, respectively.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+int
-+ext2_xattr_set(struct inode *inode, int name_index, const char *name,
-+ const void *value, size_t value_len, int flags)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ struct buffer_head *bh = NULL;
-+ struct ext2_xattr_header *header = NULL;
-+ struct ext2_xattr_entry *here, *last;
-+ unsigned int name_len;
-+ int block = EXT2_I(inode)->i_file_acl;
-+ int min_offs = sb->s_blocksize, not_found = 1, free, error;
-+ char *end;
-+
-+ /*
-+ * header -- Points either into bh, or to a temporarily
-+ * allocated buffer.
-+ * here -- The named entry found, or the place for inserting, within
-+ * the block pointed to by header.
-+ * last -- Points right after the last named entry within the block
-+ * pointed to by header.
-+ * min_offs -- The offset of the first value (values are aligned
-+ * towards the end of the block).
-+ * end -- Points right after the block pointed to by header.
-+ */
-+
-+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
-+ name_index, name, value, (long)value_len);
-+
-+ if (IS_RDONLY(inode))
-+ return -EROFS;
-+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-+ return -EPERM;
-+ if (value == NULL)
-+ value_len = 0;
-+ if (name == NULL)
-+ return -EINVAL;
-+ name_len = strlen(name);
-+ if (name_len > 255 || value_len > sb->s_blocksize)
-+ return -ERANGE;
-+ down(&ext2_xattr_sem);
-+
-+ if (block) {
-+ /* The inode already has an extended attribute block. */
-+
-+ bh = sb_bread(sb, block);
-+ error = -EIO;
-+ if (!bh)
-+ goto cleanup;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)),
-+ le32_to_cpu(HDR(bh)->h_refcount));
-+ header = HDR(bh);
-+ end = bh->b_data + bh->b_size;
-+ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-+ header->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext2_error(sb, "ext2_xattr_set",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* Find the named attribute. */
-+ here = FIRST_ENTRY(bh);
-+ while (!IS_LAST_ENTRY(here)) {
-+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (!here->e_value_block && here->e_value_size) {
-+ int offs = le16_to_cpu(here->e_value_offs);
-+ if (offs < min_offs)
-+ min_offs = offs;
-+ }
-+ not_found = name_index - here->e_name_index;
-+ if (!not_found)
-+ not_found = name_len - here->e_name_len;
-+ if (!not_found)
-+ not_found = memcmp(name, here->e_name,name_len);
-+ if (not_found <= 0)
-+ break;
-+ here = next;
-+ }
-+ last = here;
-+ /* We still need to compute min_offs and last. */
-+ while (!IS_LAST_ENTRY(last)) {
-+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (!last->e_value_block && last->e_value_size) {
-+ int offs = le16_to_cpu(last->e_value_offs);
-+ if (offs < min_offs)
-+ min_offs = offs;
-+ }
-+ last = next;
-+ }
-+
-+ /* Check whether we have enough space left. */
-+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
-+ } else {
-+ /* We will use a new extended attribute block. */
-+ free = sb->s_blocksize -
-+ sizeof(struct ext2_xattr_header) - sizeof(__u32);
-+ here = last = NULL; /* avoid gcc uninitialized warning. */
-+ }
-+
-+ if (not_found) {
-+ /* Request to remove a nonexistent attribute? */
-+ error = -ENOATTR;
-+ if (flags & XATTR_REPLACE)
-+ goto cleanup;
-+ error = 0;
-+ if (value == NULL)
-+ goto cleanup;
-+ else
-+ free -= EXT2_XATTR_LEN(name_len);
-+ } else {
-+ /* Request to create an existing attribute? */
-+ error = -EEXIST;
-+ if (flags & XATTR_CREATE)
-+ goto cleanup;
-+ if (!here->e_value_block && here->e_value_size) {
-+ unsigned int size = le32_to_cpu(here->e_value_size);
-+
-+ if (le16_to_cpu(here->e_value_offs) + size >
-+ sb->s_blocksize || size > sb->s_blocksize)
-+ goto bad_block;
-+ free += EXT2_XATTR_SIZE(size);
-+ }
-+ }
-+ free -= EXT2_XATTR_SIZE(value_len);
-+ error = -ENOSPC;
-+ if (free < 0)
-+ goto cleanup;
-+
-+ /* Here we know that we can set the new attribute. */
-+
-+ if (header) {
-+ if (header->h_refcount == cpu_to_le32(1)) {
-+ ea_bdebug(bh, "modifying in-place");
-+ ext2_xattr_cache_remove(bh);
-+ } else {
-+ int offset;
-+
-+ ea_bdebug(bh, "cloning");
-+ header = kmalloc(bh->b_size, GFP_KERNEL);
-+ error = -ENOMEM;
-+ if (header == NULL)
-+ goto cleanup;
-+ memcpy(header, HDR(bh), bh->b_size);
-+ header->h_refcount = cpu_to_le32(1);
-+ offset = (char *)header - bh->b_data;
-+ here = ENTRY((char *)here + offset);
-+ last = ENTRY((char *)last + offset);
-+ }
-+ } else {
-+ /* Allocate a buffer where we construct the new block. */
-+ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
-+ error = -ENOMEM;
-+ if (header == NULL)
-+ goto cleanup;
-+ memset(header, 0, sb->s_blocksize);
-+ end = (char *)header + sb->s_blocksize;
-+ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
-+ header->h_blocks = header->h_refcount = cpu_to_le32(1);
-+ last = here = ENTRY(header+1);
-+ }
-+
-+ if (not_found) {
-+ /* Insert the new name. */
-+ int size = EXT2_XATTR_LEN(name_len);
-+ int rest = (char *)last - (char *)here;
-+ memmove((char *)here + size, here, rest);
-+ memset(here, 0, size);
-+ here->e_name_index = name_index;
-+ here->e_name_len = name_len;
-+ memcpy(here->e_name, name, name_len);
-+ } else {
-+ /* Remove the old value. */
-+ if (!here->e_value_block && here->e_value_size) {
-+ char *first_val = (char *)header + min_offs;
-+ int offs = le16_to_cpu(here->e_value_offs);
-+ char *val = (char *)header + offs;
-+ size_t size = EXT2_XATTR_SIZE(
-+ le32_to_cpu(here->e_value_size));
-+ memmove(first_val + size, first_val, val - first_val);
-+ memset(first_val, 0, size);
-+ here->e_value_offs = 0;
-+ min_offs += size;
-+
-+ /* Adjust all value offsets. */
-+ last = ENTRY(header+1);
-+ while (!IS_LAST_ENTRY(last)) {
-+ int o = le16_to_cpu(last->e_value_offs);
-+ if (!last->e_value_block && o < offs)
-+ last->e_value_offs =
-+ cpu_to_le16(o + size);
-+ last = EXT2_XATTR_NEXT(last);
-+ }
-+ }
-+ if (value == NULL) {
-+ /* Remove this attribute. */
-+ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
-+ /* This block is now empty. */
-+ error = ext2_xattr_set2(inode, bh, NULL);
-+ goto cleanup;
-+ } else {
-+ /* Remove the old name. */
-+ int size = EXT2_XATTR_LEN(name_len);
-+ last = ENTRY((char *)last - size);
-+ memmove(here, (char*)here + size,
-+ (char*)last - (char*)here);
-+ memset(last, 0, size);
-+ }
-+ }
-+ }
-+
-+ if (value != NULL) {
-+ /* Insert the new value. */
-+ here->e_value_size = cpu_to_le32(value_len);
-+ if (value_len) {
-+ size_t size = EXT2_XATTR_SIZE(value_len);
-+ char *val = (char *)header + min_offs - size;
-+ here->e_value_offs =
-+ cpu_to_le16((char *)val - (char *)header);
-+ memset(val + size - EXT2_XATTR_PAD, 0,
-+ EXT2_XATTR_PAD); /* Clear the pad bytes. */
-+ memcpy(val, value, value_len);
-+ }
-+ }
-+ ext2_xattr_rehash(header, here);
-+
-+ error = ext2_xattr_set2(inode, bh, header);
-+
-+cleanup:
-+ brelse(bh);
-+ if (!(bh && header == HDR(bh)))
-+ kfree(header);
-+ up(&ext2_xattr_sem);
-+
-+ return error;
-+}
-+
-+/*
-+ * Second half of ext2_xattr_set(): Update the file system.
-+ */
-+static int
-+ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
-+ struct ext2_xattr_header *header)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ struct buffer_head *new_bh = NULL;
-+ int error;
-+
-+ if (header) {
-+ new_bh = ext2_xattr_cache_find(inode, header);
-+ if (new_bh) {
-+ /*
-+ * We found an identical block in the cache.
-+ * The old block will be released after updating
-+ * the inode.
-+ */
-+ ea_bdebug(old_bh, "reusing block %ld",
-+ new_bh->b_blocknr);
-+
-+ error = -EDQUOT;
-+ if (ext2_xattr_quota_alloc(inode, 1))
-+ goto cleanup;
-+
-+ HDR(new_bh)->h_refcount = cpu_to_le32(
-+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
-+ ea_bdebug(new_bh, "refcount now=%d",
-+ le32_to_cpu(HDR(new_bh)->h_refcount));
-+ } else if (old_bh && header == HDR(old_bh)) {
-+ /* Keep this block. */
-+ new_bh = old_bh;
-+ ext2_xattr_cache_insert(new_bh);
-+ } else {
-+ /* We need to allocate a new block */
-+ int force = EXT2_I(inode)->i_file_acl != 0;
-+ int block = ext2_xattr_new_block(inode, &error, force);
-+ if (error)
-+ goto cleanup;
-+ ea_idebug(inode, "creating block %d", block);
-+
-+ new_bh = sb_getblk(sb, block);
-+ if (!new_bh) {
-+ ext2_xattr_free_block(inode, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ lock_buffer(new_bh);
-+ memcpy(new_bh->b_data, header, new_bh->b_size);
-+ mark_buffer_uptodate(new_bh, 1);
-+ unlock_buffer(new_bh);
-+ ext2_xattr_cache_insert(new_bh);
-+
-+ ext2_xattr_update_super_block(sb);
-+ }
-+ mark_buffer_dirty(new_bh);
-+ if (IS_SYNC(inode)) {
-+ ll_rw_block(WRITE, 1, &new_bh);
-+ wait_on_buffer(new_bh);
-+ error = -EIO;
-+ if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
-+ goto cleanup;
-+ }
-+ }
-+
-+ /* Update the inode. */
-+ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
-+ inode->i_ctime = CURRENT_TIME;
-+ if (IS_SYNC(inode)) {
-+ error = ext2_sync_inode (inode);
-+ if (error)
-+ goto cleanup;
-+ } else
-+ mark_inode_dirty(inode);
-+
-+ error = 0;
-+ if (old_bh && old_bh != new_bh) {
-+ /*
-+ * If there was an old block, and we are not still using it,
-+ * we now release the old block.
-+ */
-+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
-+
-+ if (refcount == 1) {
-+ /* Free the old block. */
-+ ea_bdebug(old_bh, "freeing");
-+ ext2_xattr_free_block(inode, old_bh->b_blocknr);
-+ mark_buffer_clean(old_bh);
-+ } else {
-+ /* Decrement the refcount only. */
-+ refcount--;
-+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
-+ ext2_xattr_quota_free(inode);
-+ mark_buffer_dirty(old_bh);
-+ ea_bdebug(old_bh, "refcount now=%d", refcount);
-+ }
-+ }
-+
-+cleanup:
-+ if (old_bh != new_bh)
-+ brelse(new_bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * ext2_xattr_delete_inode()
-+ *
-+ * Free extended attribute resources associated with this inode. This
-+ * is called immediately before an inode is freed.
-+ */
-+void
-+ext2_xattr_delete_inode(struct inode *inode)
-+{
-+ struct buffer_head *bh;
-+ unsigned int block = EXT2_I(inode)->i_file_acl;
-+
-+ if (!block)
-+ return;
-+ down(&ext2_xattr_sem);
-+
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh) {
-+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
-+ "inode %ld: block %d read error", inode->i_ino, block);
-+ goto cleanup;
-+ }
-+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ goto cleanup;
-+ }
-+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-+ ext2_xattr_cache_remove(bh);
-+ ext2_xattr_free_block(inode, block);
-+ bforget(bh);
-+ bh = NULL;
-+ } else {
-+ HDR(bh)->h_refcount = cpu_to_le32(
-+ le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+ mark_buffer_dirty(bh);
-+ if (IS_SYNC(inode)) {
-+ ll_rw_block(WRITE, 1, &bh);
-+ wait_on_buffer(bh);
-+ }
-+ ext2_xattr_quota_free(inode);
-+ }
-+ EXT2_I(inode)->i_file_acl = 0;
-+
-+cleanup:
-+ brelse(bh);
-+ up(&ext2_xattr_sem);
-+}
-+
-+/*
-+ * ext2_xattr_put_super()
-+ *
-+ * This is called when a file system is unmounted.
-+ */
-+void
-+ext2_xattr_put_super(struct super_block *sb)
-+{
-+#ifdef CONFIG_EXT2_FS_XATTR_SHARING
-+ mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
-+#endif
-+}
-+
-+#ifdef CONFIG_EXT2_FS_XATTR_SHARING
-+
-+/*
-+ * ext2_xattr_cache_insert()
-+ *
-+ * Create a new entry in the extended attribute cache, and insert
-+ * it unless such an entry is already in the cache.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+static int
-+ext2_xattr_cache_insert(struct buffer_head *bh)
-+{
-+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-+ struct mb_cache_entry *ce;
-+ int error;
-+
-+ ce = mb_cache_entry_alloc(ext2_xattr_cache);
-+ if (!ce)
-+ return -ENOMEM;
-+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
-+ if (error) {
-+ mb_cache_entry_free(ce);
-+ if (error == -EBUSY) {
-+ ea_bdebug(bh, "already in cache (%d cache entries)",
-+ atomic_read(&ext2_xattr_cache->c_entry_count));
-+ error = 0;
-+ }
-+ } else {
-+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
-+ atomic_read(&ext2_xattr_cache->c_entry_count));
-+ mb_cache_entry_release(ce);
-+ }
-+ return error;
-+}
-+
-+/*
-+ * ext2_xattr_cmp()
-+ *
-+ * Compare two extended attribute blocks for equality.
-+ *
-+ * Returns 0 if the blocks are equal, 1 if they differ, and
-+ * a negative error number on errors.
-+ */
-+static int
-+ext2_xattr_cmp(struct ext2_xattr_header *header1,
-+ struct ext2_xattr_header *header2)
-+{
-+ struct ext2_xattr_entry *entry1, *entry2;
-+
-+ entry1 = ENTRY(header1+1);
-+ entry2 = ENTRY(header2+1);
-+ while (!IS_LAST_ENTRY(entry1)) {
-+ if (IS_LAST_ENTRY(entry2))
-+ return 1;
-+ if (entry1->e_hash != entry2->e_hash ||
-+ entry1->e_name_len != entry2->e_name_len ||
-+ entry1->e_value_size != entry2->e_value_size ||
-+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
-+ return 1;
-+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-+ return -EIO;
-+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
-+ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
-+ le32_to_cpu(entry1->e_value_size)))
-+ return 1;
-+
-+ entry1 = EXT2_XATTR_NEXT(entry1);
-+ entry2 = EXT2_XATTR_NEXT(entry2);
-+ }
-+ if (!IS_LAST_ENTRY(entry2))
-+ return 1;
-+ return 0;
-+}
-+
-+/*
-+ * ext2_xattr_cache_find()
-+ *
-+ * Find an identical extended attribute block.
-+ *
-+ * Returns a pointer to the block found, or NULL if such a block was
-+ * not found or an error occurred.
-+ */
-+static struct buffer_head *
-+ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
-+{
-+ __u32 hash = le32_to_cpu(header->h_hash);
-+ struct mb_cache_entry *ce;
-+
-+ if (!header->h_hash)
-+ return NULL; /* never share */
-+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-+ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
-+ while (ce) {
-+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
-+
-+ if (!bh) {
-+ ext2_error(inode->i_sb, "ext2_xattr_cache_find",
-+ "inode %ld: block %ld read error",
-+ inode->i_ino, ce->e_block);
-+ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
-+ EXT2_XATTR_REFCOUNT_MAX) {
-+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
-+ le32_to_cpu(HDR(bh)->h_refcount),
-+ EXT2_XATTR_REFCOUNT_MAX);
-+ } else if (!ext2_xattr_cmp(header, HDR(bh))) {
-+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
-+ mb_cache_entry_release(ce);
-+ return bh;
-+ }
-+ brelse(bh);
-+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
-+ }
-+ return NULL;
-+}
-+
-+/*
-+ * ext2_xattr_cache_remove()
-+ *
-+ * Remove the cache entry of a block from the cache. Called when a
-+ * block becomes invalid.
-+ */
-+static void
-+ext2_xattr_cache_remove(struct buffer_head *bh)
-+{
-+ struct mb_cache_entry *ce;
-+
-+ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
-+ if (ce) {
-+ ea_bdebug(bh, "removing (%d cache entries remaining)",
-+ atomic_read(&ext2_xattr_cache->c_entry_count)-1);
-+ mb_cache_entry_free(ce);
-+ } else
-+ ea_bdebug(bh, "no cache entry");
-+}
-+
-+#define NAME_HASH_SHIFT 5
-+#define VALUE_HASH_SHIFT 16
-+
-+/*
-+ * ext2_xattr_hash_entry()
-+ *
-+ * Compute the hash of an extended attribute.
-+ */
-+static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
-+ struct ext2_xattr_entry *entry)
-+{
-+ __u32 hash = 0;
-+ char *name = entry->e_name;
-+ int n;
-+
-+ for (n=0; n < entry->e_name_len; n++) {
-+ hash = (hash << NAME_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
-+ *name++;
-+ }
-+
-+ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
-+ __u32 *value = (__u32 *)((char *)header +
-+ le16_to_cpu(entry->e_value_offs));
-+ for (n = (le32_to_cpu(entry->e_value_size) +
-+ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
-+ hash = (hash << VALUE_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
-+ le32_to_cpu(*value++);
-+ }
-+ }
-+ entry->e_hash = cpu_to_le32(hash);
-+}
-+
-+#undef NAME_HASH_SHIFT
-+#undef VALUE_HASH_SHIFT
-+
-+#define BLOCK_HASH_SHIFT 16
-+
-+/*
-+ * ext2_xattr_rehash()
-+ *
-+ * Re-compute the extended attribute hash value after an entry has changed.
-+ */
-+static void ext2_xattr_rehash(struct ext2_xattr_header *header,
-+ struct ext2_xattr_entry *entry)
-+{
-+ struct ext2_xattr_entry *here;
-+ __u32 hash = 0;
-+
-+ ext2_xattr_hash_entry(header, entry);
-+ here = ENTRY(header+1);
-+ while (!IS_LAST_ENTRY(here)) {
-+ if (!here->e_hash) {
-+ /* Block is not shared if an entry's hash value == 0 */
-+ hash = 0;
-+ break;
-+ }
-+ hash = (hash << BLOCK_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
-+ le32_to_cpu(here->e_hash);
-+ here = EXT2_XATTR_NEXT(here);
-+ }
-+ header->h_hash = cpu_to_le32(hash);
-+}
-+
-+#undef BLOCK_HASH_SHIFT
-+
-+int __init
-+init_ext2_xattr(void)
-+{
-+ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
-+ sizeof(struct mb_cache_entry) +
-+ sizeof(struct mb_cache_entry_index), 1, 61);
-+ if (!ext2_xattr_cache)
-+ return -ENOMEM;
-+
-+ return 0;
-+}
-+
-+void
-+exit_ext2_xattr(void)
-+{
-+ mb_cache_destroy(ext2_xattr_cache);
-+}
-+
-+#else /* CONFIG_EXT2_FS_XATTR_SHARING */
-+
-+int __init
-+init_ext2_xattr(void)
-+{
-+ return 0;
-+}
-+
-+void
-+exit_ext2_xattr(void)
-+{
-+}
-+
-+#endif /* CONFIG_EXT2_FS_XATTR_SHARING */
-Index: linux-DRV401/fs/ext2/xattr_user.c
-===================================================================
---- linux-DRV401.orig/fs/ext2/xattr_user.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/ext2/xattr_user.c 2004-10-15 11:03:51.000000000 -0700
-@@ -0,0 +1,103 @@
-+/*
-+ * linux/fs/ext2/xattr_user.c
-+ * Handler for extended user attributes.
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/fs.h>
-+#include <linux/ext2_fs.h>
-+#include <linux/ext2_xattr.h>
-+
-+#ifdef CONFIG_EXT2_FS_POSIX_ACL
-+# include <linux/ext2_acl.h>
-+#endif
-+
-+#define XATTR_USER_PREFIX "user."
-+
-+static size_t
-+ext2_xattr_user_list(char *list, struct inode *inode,
-+ const char *name, int name_len)
-+{
-+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
-+
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return 0;
-+
-+ if (list) {
-+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
-+ memcpy(list+prefix_len, name, name_len);
-+ list[prefix_len + name_len] = '\0';
-+ }
-+ return prefix_len + name_len + 1;
-+}
-+
-+static int
-+ext2_xattr_user_get(struct inode *inode, const char *name,
-+ void *buffer, size_t size)
-+{
-+ int error;
-+
-+ if (strcmp(name, "") == 0)
-+ return -EINVAL;
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return -ENOTSUP;
-+#ifdef CONFIG_EXT2_FS_POSIX_ACL
-+ error = ext2_permission_locked(inode, MAY_READ);
-+#else
-+ error = permission(inode, MAY_READ);
-+#endif
-+ if (error)
-+ return error;
-+
-+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
-+ buffer, size);
-+}
-+
-+static int
-+ext2_xattr_user_set(struct inode *inode, const char *name,
-+ const void *value, size_t size, int flags)
-+{
-+ int error;
-+
-+ if (strcmp(name, "") == 0)
-+ return -EINVAL;
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return -ENOTSUP;
-+ if ( !S_ISREG(inode->i_mode) &&
-+ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-+ return -EPERM;
-+#ifdef CONFIG_EXT2_FS_POSIX_ACL
-+ error = ext2_permission_locked(inode, MAY_WRITE);
-+#else
-+ error = permission(inode, MAY_WRITE);
-+#endif
-+ if (error)
-+ return error;
-+
-+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
-+ value, size, flags);
-+}
-+
-+struct ext2_xattr_handler ext2_xattr_user_handler = {
-+ prefix: XATTR_USER_PREFIX,
-+ list: ext2_xattr_user_list,
-+ get: ext2_xattr_user_get,
-+ set: ext2_xattr_user_set,
-+};
-+
-+int __init
-+init_ext2_xattr_user(void)
-+{
-+ return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
-+ &ext2_xattr_user_handler);
-+}
-+
-+void
-+exit_ext2_xattr_user(void)
-+{
-+ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
-+ &ext2_xattr_user_handler);
-+}
-Index: linux-DRV401/fs/ext3/Makefile
-===================================================================
---- linux-DRV401.orig/fs/ext3/Makefile 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/fs/ext3/Makefile 2004-10-15 11:03:51.000000000 -0700
-@@ -1,5 +1,5 @@
- #
--# Makefile for the linux ext2-filesystem routines.
-+# Makefile for the linux ext3-filesystem routines.
- #
- # Note! Dependencies are done automagically by 'make dep', which also
- # removes any old dependencies. DON'T put your own dependencies here
-@@ -9,8 +9,14 @@
-
- O_TARGET := ext3.o
-
-+export-objs := ext3-exports.o
-+
- obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-- ioctl.o namei.o super.o symlink.o hash.o
-+ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
- obj-m := $(O_TARGET)
-
-+export-objs += xattr.o
-+obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
-+obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
-+
- include $(TOPDIR)/Rules.make
-Index: linux-DRV401/fs/ext3/file.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/file.c 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/fs/ext3/file.c 2004-10-15 11:03:51.000000000 -0700
-@@ -23,6 +23,7 @@
- #include <linux/locks.h>
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
-+#include <linux/ext3_xattr.h>
- #include <linux/ext3_jbd.h>
- #include <linux/smp_lock.h>
-
-@@ -93,5 +94,9 @@
- struct inode_operations ext3_file_inode_operations = {
- truncate: ext3_truncate, /* BKL held */
- setattr: ext3_setattr, /* BKL held */
-+ setxattr: ext3_setxattr, /* BKL held */
-+ getxattr: ext3_getxattr, /* BKL held */
-+ listxattr: ext3_listxattr, /* BKL held */
-+ removexattr: ext3_removexattr, /* BKL held */
- };
-
-Index: linux-DRV401/fs/ext3/ialloc.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/ialloc.c 2004-10-15 10:24:00.000000000 -0700
-+++ linux-DRV401/fs/ext3/ialloc.c 2004-10-15 11:03:52.000000000 -0700
-@@ -17,6 +17,7 @@
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
- #include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
- #include <linux/stat.h>
- #include <linux/string.h>
- #include <linux/locks.h>
-@@ -216,6 +217,7 @@
- * as writing the quota to disk may need the lock as well.
- */
- DQUOT_INIT(inode);
-+ ext3_xattr_delete_inode(handle, inode);
- DQUOT_FREE_INODE(inode);
- DQUOT_DROP(inode);
-
-Index: linux-DRV401/fs/ext3/inode.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/inode.c 2004-10-15 10:24:00.000000000 -0700
-+++ linux-DRV401/fs/ext3/inode.c 2004-10-15 11:03:52.000000000 -0700
-@@ -39,6 +39,18 @@
- */
- #undef SEARCH_FROM_ZERO
-
-+/*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext3_inode_is_fast_symlink(struct inode *inode)
-+{
-+ int ea_blocks = inode->u.ext3_i.i_file_acl ?
-+ (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+ return (S_ISLNK(inode->i_mode) &&
-+ inode->i_blocks - ea_blocks == 0);
-+}
-+
- /* The ext3 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
-@@ -48,7 +60,7 @@
- * still needs to be revoked.
- */
-
--static int ext3_forget(handle_t *handle, int is_metadata,
-+int ext3_forget(handle_t *handle, int is_metadata,
- struct inode *inode, struct buffer_head *bh,
- int blocknr)
- {
-@@ -164,9 +176,7 @@
- {
- handle_t *handle;
-
-- if (is_bad_inode(inode) ||
-- inode->i_ino == EXT3_ACL_IDX_INO ||
-- inode->i_ino == EXT3_ACL_DATA_INO)
-+ if (is_bad_inode(inode))
- goto no_delete;
-
- lock_kernel();
-@@ -1843,6 +1853,8 @@
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
-+ if (ext3_inode_is_fast_symlink(inode))
-+ return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
-@@ -1990,8 +2002,6 @@
- struct ext3_group_desc * gdp;
-
- if ((inode->i_ino != EXT3_ROOT_INO &&
-- inode->i_ino != EXT3_ACL_IDX_INO &&
-- inode->i_ino != EXT3_ACL_DATA_INO &&
- inode->i_ino != EXT3_JOURNAL_INO &&
- inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(
-@@ -2118,10 +2128,7 @@
-
- brelse (iloc.bh);
-
-- if (inode->i_ino == EXT3_ACL_IDX_INO ||
-- inode->i_ino == EXT3_ACL_DATA_INO)
-- /* Nothing to do */ ;
-- else if (S_ISREG(inode->i_mode)) {
-+ if (S_ISREG(inode->i_mode)) {
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- inode->i_mapping->a_ops = &ext3_aops;
-@@ -2129,15 +2136,17 @@
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
- } else if (S_ISLNK(inode->i_mode)) {
-- if (!inode->i_blocks)
-+ if (ext3_inode_is_fast_symlink(inode))
- inode->i_op = &ext3_fast_symlink_inode_operations;
- else {
-- inode->i_op = &page_symlink_inode_operations;
-+ inode->i_op = &ext3_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext3_aops;
- }
-- } else
-+ } else {
-+ inode->i_op = &ext3_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- le32_to_cpu(iloc.raw_inode->i_block[0]));
-+ }
- /* inode->i_attr_flags = 0; unused */
- if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
- /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
-Index: linux-DRV401/fs/ext3/namei.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/namei.c 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/fs/ext3/namei.c 2004-10-15 11:03:52.000000000 -0700
-@@ -29,6 +29,7 @@
- #include <linux/sched.h>
- #include <linux/ext3_fs.h>
- #include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
- #include <linux/fcntl.h>
- #include <linux/stat.h>
- #include <linux/string.h>
-@@ -1612,7 +1613,7 @@
- if (IS_SYNC(dir))
- handle->h_sync = 1;
-
-- inode = ext3_new_inode (handle, dir, S_IFDIR);
-+ inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-@@ -1620,7 +1621,6 @@
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
- inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-- inode->i_blocks = 0;
- dir_block = ext3_bread (handle, inode, 0, 1, &err);
- if (!dir_block) {
- inode->i_nlink--; /* is this nlink == 0? */
-@@ -1647,9 +1647,6 @@
- BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, dir_block);
- brelse (dir_block);
-- inode->i_mode = S_IFDIR | mode;
-- if (dir->i_mode & S_ISGID)
-- inode->i_mode |= S_ISGID;
- ext3_mark_inode_dirty(handle, inode);
- err = ext3_add_entry (handle, dentry, inode);
- if (err) {
-@@ -2018,7 +2015,7 @@
- goto out_stop;
-
- if (l > sizeof (EXT3_I(inode)->i_data)) {
-- inode->i_op = &page_symlink_inode_operations;
-+ inode->i_op = &ext3_symlink_inode_operations;
- inode->i_mapping->a_ops = &ext3_aops;
- /*
- * block_symlink() calls back into ext3_prepare/commit_write.
-@@ -2245,4 +2242,16 @@
- rmdir: ext3_rmdir, /* BKL held */
- mknod: ext3_mknod, /* BKL held */
- rename: ext3_rename, /* BKL held */
-+ setxattr: ext3_setxattr, /* BKL held */
-+ getxattr: ext3_getxattr, /* BKL held */
-+ listxattr: ext3_listxattr, /* BKL held */
-+ removexattr: ext3_removexattr, /* BKL held */
- };
-+
-+struct inode_operations ext3_special_inode_operations = {
-+ setxattr: ext3_setxattr, /* BKL held */
-+ getxattr: ext3_getxattr, /* BKL held */
-+ listxattr: ext3_listxattr, /* BKL held */
-+ removexattr: ext3_removexattr, /* BKL held */
-+};
-+
-Index: linux-DRV401/fs/ext3/super.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/super.c 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/fs/ext3/super.c 2004-10-15 11:03:52.000000000 -0700
-@@ -24,6 +24,7 @@
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
- #include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
- #include <linux/slab.h>
- #include <linux/init.h>
- #include <linux/locks.h>
-@@ -404,6 +405,7 @@
- kdev_t j_dev = sbi->s_journal->j_dev;
- int i;
-
-+ ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
- if (!(sb->s_flags & MS_RDONLY)) {
- EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-@@ -499,6 +501,7 @@
- int is_remount)
- {
- unsigned long *mount_options = &sbi->s_mount_opt;
-+
- uid_t *resuid = &sbi->s_resuid;
- gid_t *resgid = &sbi->s_resgid;
- char * this_char;
-@@ -511,6 +514,13 @@
- this_char = strtok (NULL, ",")) {
- if ((value = strchr (this_char, '=')) != NULL)
- *value++ = 0;
-+#ifdef CONFIG_EXT3_FS_XATTR_USER
-+ if (!strcmp (this_char, "user_xattr"))
-+ set_opt (*mount_options, XATTR_USER);
-+ else if (!strcmp (this_char, "nouser_xattr"))
-+ clear_opt (*mount_options, XATTR_USER);
-+ else
-+#endif
- if (!strcmp (this_char, "bsddf"))
- clear_opt (*mount_options, MINIX_DF);
- else if (!strcmp (this_char, "nouid32")) {
-@@ -924,6 +934,12 @@
- sbi->s_mount_opt = 0;
- sbi->s_resuid = EXT3_DEF_RESUID;
- sbi->s_resgid = EXT3_DEF_RESGID;
-+
-+ /* Default extended attribute flags */
-+#ifdef CONFIG_EXT3_FS_XATTR_USER
-+ /* set_opt(sbi->s_mount_opt, XATTR_USER); */
-+#endif
-+
- if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
- sb->s_dev = 0;
- goto out_fail;
-@@ -1742,12 +1758,27 @@
-
- static int __init init_ext3_fs(void)
- {
-- return register_filesystem(&ext3_fs_type);
-+ int error = init_ext3_xattr();
-+ if (error)
-+ return error;
-+ error = init_ext3_xattr_user();
-+ if (error)
-+ goto fail;
-+ error = register_filesystem(&ext3_fs_type);
-+ if (!error)
-+ return 0;
-+
-+ exit_ext3_xattr_user();
-+fail:
-+ exit_ext3_xattr();
-+ return error;
- }
-
- static void __exit exit_ext3_fs(void)
- {
- unregister_filesystem(&ext3_fs_type);
-+ exit_ext3_xattr_user();
-+ exit_ext3_xattr();
- }
-
- EXPORT_SYMBOL(ext3_force_commit);
-Index: linux-DRV401/fs/ext3/symlink.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/symlink.c 2004-10-15 10:24:00.000000000 -0700
-+++ linux-DRV401/fs/ext3/symlink.c 2004-10-15 11:03:52.000000000 -0700
-@@ -20,6 +20,7 @@
- #include <linux/fs.h>
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
-+#include <linux/ext3_xattr.h>
-
- static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
- {
-@@ -33,7 +34,20 @@
- return vfs_follow_link(nd, s);
- }
-
-+struct inode_operations ext3_symlink_inode_operations = {
-+ readlink: page_readlink, /* BKL not held. Don't need */
-+ follow_link: page_follow_link, /* BKL not held. Don't need */
-+ setxattr: ext3_setxattr, /* BKL held */
-+ getxattr: ext3_getxattr, /* BKL held */
-+ listxattr: ext3_listxattr, /* BKL held */
-+ removexattr: ext3_removexattr, /* BKL held */
-+};
-+
- struct inode_operations ext3_fast_symlink_inode_operations = {
- readlink: ext3_readlink, /* BKL not held. Don't need */
- follow_link: ext3_follow_link, /* BKL not held. Don't need */
-+ setxattr: ext3_setxattr, /* BKL held */
-+ getxattr: ext3_getxattr, /* BKL held */
-+ listxattr: ext3_listxattr, /* BKL held */
-+ removexattr: ext3_removexattr, /* BKL held */
- };
-Index: linux-DRV401/fs/ext3/xattr.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/xattr.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/ext3/xattr.c 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,1225 @@
-+/*
-+ * linux/fs/ext3/xattr.c
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ *
-+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
-+ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
-+ * Extended attributes for symlinks and special files added per
-+ * suggestion of Luka Renko <luka.renko@hermes.si>.
-+ */
-+
-+/*
-+ * Extended attributes are stored on disk blocks allocated outside of
-+ * any inode. The i_file_acl field is then made to point to this allocated
-+ * block. If all extended attributes of an inode are identical, these
-+ * inodes may share the same extended attribute block. Such situations
-+ * are automatically detected by keeping a cache of recent attribute block
-+ * numbers and hashes over the block's contents in memory.
-+ *
-+ *
-+ * Extended attribute block layout:
-+ *
-+ * +------------------+
-+ * | header |
-+ * | entry 1 | |
-+ * | entry 2 | | growing downwards
-+ * | entry 3 | v
-+ * | four null bytes |
-+ * | . . . |
-+ * | value 1 | ^
-+ * | value 3 | | growing upwards
-+ * | value 2 | |
-+ * +------------------+
-+ *
-+ * The block header is followed by multiple entry descriptors. These entry
-+ * descriptors are variable in size, and alligned to EXT3_XATTR_PAD
-+ * byte boundaries. The entry descriptors are sorted by attribute name,
-+ * so that two extended attribute blocks can be compared efficiently.
-+ *
-+ * Attribute values are aligned to the end of the block, stored in
-+ * no specific order. They are also padded to EXT3_XATTR_PAD byte
-+ * boundaries. No additional gaps are left between them.
-+ *
-+ * Locking strategy
-+ * ----------------
-+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
-+ * the xattr inode operations are called, so we are guaranteed that only one
-+ * processes accesses extended attributes of an inode at any time.
-+ *
-+ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
-+ * only a single process is modifying an extended attribute block, even
-+ * if the block is shared among inodes.
-+ *
-+ * Note for porting to 2.5
-+ * -----------------------
-+ * The BKL will no longer be held in the xattr inode operations.
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/fs.h>
-+#include <linux/locks.h>
-+#include <linux/slab.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_xattr.h>
-+#include <linux/mbcache.h>
-+#include <linux/quotaops.h>
-+#include <asm/semaphore.h>
-+#include <linux/compatmac.h>
-+
-+#define EXT3_EA_USER "user."
-+
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
-+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
-+#endif
-+
-+#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
-+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
-+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
-+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-+
-+#ifdef EXT3_XATTR_DEBUG
-+# define ea_idebug(inode, f...) do { \
-+ printk(KERN_DEBUG "inode %s:%ld: ", \
-+ kdevname(inode->i_dev), inode->i_ino); \
-+ printk(f); \
-+ printk("\n"); \
-+ } while (0)
-+# define ea_bdebug(bh, f...) do { \
-+ printk(KERN_DEBUG "block %s:%ld: ", \
-+ kdevname(bh->b_dev), bh->b_blocknr); \
-+ printk(f); \
-+ printk("\n"); \
-+ } while (0)
-+#else
-+# define ea_idebug(f...)
-+# define ea_bdebug(f...)
-+#endif
-+
-+static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
-+ struct ext3_xattr_header *);
-+
-+#ifdef CONFIG_EXT3_FS_XATTR_SHARING
-+
-+static int ext3_xattr_cache_insert(struct buffer_head *);
-+static struct buffer_head *ext3_xattr_cache_find(struct inode *,
-+ struct ext3_xattr_header *);
-+static void ext3_xattr_cache_remove(struct buffer_head *);
-+static void ext3_xattr_rehash(struct ext3_xattr_header *,
-+ struct ext3_xattr_entry *);
-+
-+static struct mb_cache *ext3_xattr_cache;
-+
-+#else
-+# define ext3_xattr_cache_insert(bh) 0
-+# define ext3_xattr_cache_find(inode, header) NULL
-+# define ext3_xattr_cache_remove(bh) while(0) {}
-+# define ext3_xattr_rehash(header, entry) while(0) {}
-+#endif
-+
-+/*
-+ * If a file system does not share extended attributes among inodes,
-+ * we should not need the ext3_xattr_sem semaphore. However, the
-+ * filesystem may still contain shared blocks, so we always take
-+ * the lock.
-+ */
-+
-+DECLARE_MUTEX(ext3_xattr_sem);
-+
-+static inline int
-+ext3_xattr_new_block(handle_t *handle, struct inode *inode,
-+ int * errp, int force)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-+ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
-+
-+ /* How can we enforce the allocation? */
-+ int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
-+#ifdef OLD_QUOTAS
-+ if (!*errp)
-+ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
-+#endif
-+ return block;
-+}
-+
-+static inline int
-+ext3_xattr_quota_alloc(struct inode *inode, int force)
-+{
-+ /* How can we enforce the allocation? */
-+#ifdef OLD_QUOTAS
-+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
-+ if (!error)
-+ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
-+#else
-+ int error = DQUOT_ALLOC_BLOCK(inode, 1);
-+#endif
-+ return error;
-+}
-+
-+#ifdef OLD_QUOTAS
-+
-+static inline void
-+ext3_xattr_quota_free(struct inode *inode)
-+{
-+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
-+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
-+}
-+
-+static inline void
-+ext3_xattr_free_block(handle_t *handle, struct inode * inode,
-+ unsigned long block)
-+{
-+ ext3_free_blocks(handle, inode, block, 1);
-+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
-+}
-+
-+#else
-+# define ext3_xattr_quota_free(inode) \
-+ DQUOT_FREE_BLOCK(inode, 1)
-+# define ext3_xattr_free_block(handle, inode, block) \
-+ ext3_free_blocks(handle, inode, block, 1)
-+#endif
-+
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
-+
-+static inline struct buffer_head *
-+sb_bread(struct super_block *sb, int block)
-+{
-+ return bread(sb->s_dev, block, sb->s_blocksize);
-+}
-+
-+static inline struct buffer_head *
-+sb_getblk(struct super_block *sb, int block)
-+{
-+ return getblk(sb->s_dev, block, sb->s_blocksize);
-+}
-+
-+#endif
-+
-+struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
-+rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
-+
-+int
-+ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
-+{
-+ int error = -EINVAL;
-+
-+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
-+ write_lock(&ext3_handler_lock);
-+ if (!ext3_xattr_handlers[name_index-1]) {
-+ ext3_xattr_handlers[name_index-1] = handler;
-+ error = 0;
-+ }
-+ write_unlock(&ext3_handler_lock);
-+ }
-+ return error;
-+}
-+
-+void
-+ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
-+{
-+ if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) {
-+ write_lock(&ext3_handler_lock);
-+ ext3_xattr_handlers[name_index-1] = NULL;
-+ write_unlock(&ext3_handler_lock);
-+ }
-+}
-+
-+static inline const char *
-+strcmp_prefix(const char *a, const char *a_prefix)
-+{
-+ while (*a_prefix && *a == *a_prefix) {
-+ a++;
-+ a_prefix++;
-+ }
-+ return *a_prefix ? NULL : a;
-+}
-+
-+/*
-+ * Decode the extended attribute name, and translate it into
-+ * the name_index and name suffix.
-+ */
-+static inline struct ext3_xattr_handler *
-+ext3_xattr_resolve_name(const char **name)
-+{
-+ struct ext3_xattr_handler *handler = NULL;
-+ int i;
-+
-+ if (!*name)
-+ return NULL;
-+ read_lock(&ext3_handler_lock);
-+ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
-+ if (ext3_xattr_handlers[i]) {
-+ const char *n = strcmp_prefix(*name,
-+ ext3_xattr_handlers[i]->prefix);
-+ if (n) {
-+ handler = ext3_xattr_handlers[i];
-+ *name = n;
-+ break;
-+ }
-+ }
-+ }
-+ read_unlock(&ext3_handler_lock);
-+ return handler;
-+}
-+
-+static inline struct ext3_xattr_handler *
-+ext3_xattr_handler(int name_index)
-+{
-+ struct ext3_xattr_handler *handler = NULL;
-+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
-+ read_lock(&ext3_handler_lock);
-+ handler = ext3_xattr_handlers[name_index-1];
-+ read_unlock(&ext3_handler_lock);
-+ }
-+ return handler;
-+}
-+
-+/*
-+ * Inode operation getxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+ssize_t
-+ext3_getxattr(struct dentry *dentry, const char *name,
-+ void *buffer, size_t size)
-+{
-+ struct ext3_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ handler = ext3_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->get(inode, name, buffer, size);
-+}
-+
-+/*
-+ * Inode operation listxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+ssize_t
-+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
-+{
-+ return ext3_xattr_list(dentry->d_inode, buffer, size);
-+}
-+
-+/*
-+ * Inode operation setxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+int
-+ext3_setxattr(struct dentry *dentry, const char *name,
-+ const void *value, size_t size, int flags)
-+{
-+ struct ext3_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ if (size == 0)
-+ value = ""; /* empty EA, do not remove */
-+ handler = ext3_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->set(inode, name, value, size, flags);
-+}
-+
-+/*
-+ * Inode operation removexattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ * BKL held [before 2.5.x]
-+ */
-+int
-+ext3_removexattr(struct dentry *dentry, const char *name)
-+{
-+ struct ext3_xattr_handler *handler;
-+ struct inode *inode = dentry->d_inode;
-+
-+ handler = ext3_xattr_resolve_name(&name);
-+ if (!handler)
-+ return -ENOTSUP;
-+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
-+}
-+
-+/*
-+ * ext3_xattr_get()
-+ *
-+ * Copy an extended attribute into the buffer
-+ * provided, or compute the buffer size required.
-+ * Buffer is NULL to compute the size of the buffer required.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-+ void *buffer, size_t buffer_size)
-+{
-+ struct buffer_head *bh = NULL;
-+ struct ext3_xattr_entry *entry;
-+ unsigned int block, size;
-+ char *end;
-+ int name_len, error;
-+
-+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
-+ name_index, name, buffer, (long)buffer_size);
-+
-+ if (name == NULL)
-+ return -EINVAL;
-+ if (!EXT3_I(inode)->i_file_acl)
-+ return -ENOATTR;
-+ block = EXT3_I(inode)->i_file_acl;
-+ ea_idebug(inode, "reading block %d", block);
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh)
-+ return -EIO;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+ end = bh->b_data + bh->b_size;
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* find named attribute */
-+ name_len = strlen(name);
-+
-+ error = -ERANGE;
-+ if (name_len > 255)
-+ goto cleanup;
-+ entry = FIRST_ENTRY(bh);
-+ while (!IS_LAST_ENTRY(entry)) {
-+ struct ext3_xattr_entry *next =
-+ EXT3_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (name_index == entry->e_name_index &&
-+ name_len == entry->e_name_len &&
-+ memcmp(name, entry->e_name, name_len) == 0)
-+ goto found;
-+ entry = next;
-+ }
-+ /* Check the remaining name entries */
-+ while (!IS_LAST_ENTRY(entry)) {
-+ struct ext3_xattr_entry *next =
-+ EXT3_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ entry = next;
-+ }
-+ if (ext3_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ error = -ENOATTR;
-+ goto cleanup;
-+found:
-+ /* check the buffer size */
-+ if (entry->e_value_block != 0)
-+ goto bad_block;
-+ size = le32_to_cpu(entry->e_value_size);
-+ if (size > inode->i_sb->s_blocksize ||
-+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
-+ goto bad_block;
-+
-+ if (ext3_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ if (buffer) {
-+ error = -ERANGE;
-+ if (size > buffer_size)
-+ goto cleanup;
-+ /* return value of attribute */
-+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-+ size);
-+ }
-+ error = size;
-+
-+cleanup:
-+ brelse(bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * ext3_xattr_list()
-+ *
-+ * Copy a list of attribute names into the buffer
-+ * provided, or compute the buffer size required.
-+ * Buffer is NULL to compute the size of the buffer required.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
-+{
-+ struct buffer_head *bh = NULL;
-+ struct ext3_xattr_entry *entry;
-+ unsigned int block, size = 0;
-+ char *buf, *end;
-+ int error;
-+
-+ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
-+ buffer, (long)buffer_size);
-+
-+ if (!EXT3_I(inode)->i_file_acl)
-+ return 0;
-+ block = EXT3_I(inode)->i_file_acl;
-+ ea_idebug(inode, "reading block %d", block);
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh)
-+ return -EIO;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+ end = bh->b_data + bh->b_size;
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext3_error(inode->i_sb, "ext3_xattr_list",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* compute the size required for the list of attribute names */
-+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+ entry = EXT3_XATTR_NEXT(entry)) {
-+ struct ext3_xattr_handler *handler;
-+ struct ext3_xattr_entry *next =
-+ EXT3_XATTR_NEXT(entry);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+
-+ handler = ext3_xattr_handler(entry->e_name_index);
-+ if (handler)
-+ size += handler->list(NULL, inode, entry->e_name,
-+ entry->e_name_len);
-+ }
-+
-+ if (ext3_xattr_cache_insert(bh))
-+ ea_idebug(inode, "cache insert failed");
-+ if (!buffer) {
-+ error = size;
-+ goto cleanup;
-+ } else {
-+ error = -ERANGE;
-+ if (size > buffer_size)
-+ goto cleanup;
-+ }
-+
-+ /* list the attribute names */
-+ buf = buffer;
-+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+ entry = EXT3_XATTR_NEXT(entry)) {
-+ struct ext3_xattr_handler *handler;
-+
-+ handler = ext3_xattr_handler(entry->e_name_index);
-+ if (handler)
-+ buf += handler->list(buf, inode, entry->e_name,
-+ entry->e_name_len);
-+ }
-+ error = size;
-+
-+cleanup:
-+ brelse(bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
-+ * not set, set it.
-+ */
-+static void ext3_xattr_update_super_block(handle_t *handle,
-+ struct super_block *sb)
-+{
-+ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
-+ return;
-+
-+ lock_super(sb);
-+ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
-+ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
-+#endif
-+ EXT3_SB(sb)->s_es->s_feature_compat |=
-+ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
-+ sb->s_dirt = 1;
-+ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-+ unlock_super(sb);
-+}
-+
-+/*
-+ * ext3_xattr_set()
-+ *
-+ * Create, replace or remove an extended attribute for this inode. Buffer
-+ * is NULL to remove an existing extended attribute, and non-NULL to
-+ * either replace an existing extended attribute, or create a new extended
-+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
-+ * specify that an extended attribute must exist and must not exist
-+ * previous to the call, respectively.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+int
-+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
-+ const char *name, const void *value, size_t value_len, int flags)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ struct buffer_head *bh = NULL;
-+ struct ext3_xattr_header *header = NULL;
-+ struct ext3_xattr_entry *here, *last;
-+ unsigned int name_len;
-+ int block = EXT3_I(inode)->i_file_acl;
-+ int min_offs = sb->s_blocksize, not_found = 1, free, error;
-+ char *end;
-+
-+ /*
-+ * header -- Points either into bh, or to a temporarily
-+ * allocated buffer.
-+ * here -- The named entry found, or the place for inserting, within
-+ * the block pointed to by header.
-+ * last -- Points right after the last named entry within the block
-+ * pointed to by header.
-+ * min_offs -- The offset of the first value (values are aligned
-+ * towards the end of the block).
-+ * end -- Points right after the block pointed to by header.
-+ */
-+
-+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
-+ name_index, name, value, (long)value_len);
-+
-+ if (IS_RDONLY(inode))
-+ return -EROFS;
-+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-+ return -EPERM;
-+ if (value == NULL)
-+ value_len = 0;
-+ if (name == NULL)
-+ return -EINVAL;
-+ name_len = strlen(name);
-+ if (name_len > 255 || value_len > sb->s_blocksize)
-+ return -ERANGE;
-+ down(&ext3_xattr_sem);
-+
-+ if (block) {
-+ /* The inode already has an extended attribute block. */
-+ bh = sb_bread(sb, block);
-+ error = -EIO;
-+ if (!bh)
-+ goto cleanup;
-+ ea_bdebug(bh, "b_count=%d, refcount=%d",
-+ atomic_read(&(bh->b_count)),
-+ le32_to_cpu(HDR(bh)->h_refcount));
-+ header = HDR(bh);
-+ end = bh->b_data + bh->b_size;
-+ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+ header->h_blocks != cpu_to_le32(1)) {
-+bad_block: ext3_error(sb, "ext3_xattr_set",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ /* Find the named attribute. */
-+ here = FIRST_ENTRY(bh);
-+ while (!IS_LAST_ENTRY(here)) {
-+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (!here->e_value_block && here->e_value_size) {
-+ int offs = le16_to_cpu(here->e_value_offs);
-+ if (offs < min_offs)
-+ min_offs = offs;
-+ }
-+ not_found = name_index - here->e_name_index;
-+ if (!not_found)
-+ not_found = name_len - here->e_name_len;
-+ if (!not_found)
-+ not_found = memcmp(name, here->e_name,name_len);
-+ if (not_found <= 0)
-+ break;
-+ here = next;
-+ }
-+ last = here;
-+ /* We still need to compute min_offs and last. */
-+ while (!IS_LAST_ENTRY(last)) {
-+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
-+ if ((char *)next >= end)
-+ goto bad_block;
-+ if (!last->e_value_block && last->e_value_size) {
-+ int offs = le16_to_cpu(last->e_value_offs);
-+ if (offs < min_offs)
-+ min_offs = offs;
-+ }
-+ last = next;
-+ }
-+
-+ /* Check whether we have enough space left. */
-+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
-+ } else {
-+ /* We will use a new extended attribute block. */
-+ free = sb->s_blocksize -
-+ sizeof(struct ext3_xattr_header) - sizeof(__u32);
-+ here = last = NULL; /* avoid gcc uninitialized warning. */
-+ }
-+
-+ if (not_found) {
-+ /* Request to remove a nonexistent attribute? */
-+ error = -ENOATTR;
-+ if (flags & XATTR_REPLACE)
-+ goto cleanup;
-+ error = 0;
-+ if (value == NULL)
-+ goto cleanup;
-+ else
-+ free -= EXT3_XATTR_LEN(name_len);
-+ } else {
-+ /* Request to create an existing attribute? */
-+ error = -EEXIST;
-+ if (flags & XATTR_CREATE)
-+ goto cleanup;
-+ if (!here->e_value_block && here->e_value_size) {
-+ unsigned int size = le32_to_cpu(here->e_value_size);
-+
-+ if (le16_to_cpu(here->e_value_offs) + size >
-+ sb->s_blocksize || size > sb->s_blocksize)
-+ goto bad_block;
-+ free += EXT3_XATTR_SIZE(size);
-+ }
-+ }
-+ free -= EXT3_XATTR_SIZE(value_len);
-+ error = -ENOSPC;
-+ if (free < 0)
-+ goto cleanup;
-+
-+ /* Here we know that we can set the new attribute. */
-+
-+ if (header) {
-+ if (header->h_refcount == cpu_to_le32(1)) {
-+ ea_bdebug(bh, "modifying in-place");
-+ ext3_xattr_cache_remove(bh);
-+ error = ext3_journal_get_write_access(handle, bh);
-+ if (error)
-+ goto cleanup;
-+ } else {
-+ int offset;
-+
-+ ea_bdebug(bh, "cloning");
-+ header = kmalloc(bh->b_size, GFP_KERNEL);
-+ error = -ENOMEM;
-+ if (header == NULL)
-+ goto cleanup;
-+ memcpy(header, HDR(bh), bh->b_size);
-+ header->h_refcount = cpu_to_le32(1);
-+ offset = (char *)header - bh->b_data;
-+ here = ENTRY((char *)here + offset);
-+ last = ENTRY((char *)last + offset);
-+ }
-+ } else {
-+ /* Allocate a buffer where we construct the new block. */
-+ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
-+ error = -ENOMEM;
-+ if (header == NULL)
-+ goto cleanup;
-+ memset(header, 0, sb->s_blocksize);
-+ end = (char *)header + sb->s_blocksize;
-+ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-+ header->h_blocks = header->h_refcount = cpu_to_le32(1);
-+ last = here = ENTRY(header+1);
-+ }
-+
-+ if (not_found) {
-+ /* Insert the new name. */
-+ int size = EXT3_XATTR_LEN(name_len);
-+ int rest = (char *)last - (char *)here;
-+ memmove((char *)here + size, here, rest);
-+ memset(here, 0, size);
-+ here->e_name_index = name_index;
-+ here->e_name_len = name_len;
-+ memcpy(here->e_name, name, name_len);
-+ } else {
-+ /* Remove the old value. */
-+ if (!here->e_value_block && here->e_value_size) {
-+ char *first_val = (char *)header + min_offs;
-+ int offs = le16_to_cpu(here->e_value_offs);
-+ char *val = (char *)header + offs;
-+ size_t size = EXT3_XATTR_SIZE(
-+ le32_to_cpu(here->e_value_size));
-+ memmove(first_val + size, first_val, val - first_val);
-+ memset(first_val, 0, size);
-+ here->e_value_offs = 0;
-+ min_offs += size;
-+
-+ /* Adjust all value offsets. */
-+ last = ENTRY(header+1);
-+ while (!IS_LAST_ENTRY(last)) {
-+ int o = le16_to_cpu(last->e_value_offs);
-+ if (!last->e_value_block && o < offs)
-+ last->e_value_offs =
-+ cpu_to_le16(o + size);
-+ last = EXT3_XATTR_NEXT(last);
-+ }
-+ }
-+ if (value == NULL) {
-+ /* Remove this attribute. */
-+ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
-+ /* This block is now empty. */
-+ error = ext3_xattr_set2(handle, inode, bh,NULL);
-+ goto cleanup;
-+ } else {
-+ /* Remove the old name. */
-+ int size = EXT3_XATTR_LEN(name_len);
-+ last = ENTRY((char *)last - size);
-+ memmove(here, (char*)here + size,
-+ (char*)last - (char*)here);
-+ memset(last, 0, size);
-+ }
-+ }
-+ }
-+
-+ if (value != NULL) {
-+ /* Insert the new value. */
-+ here->e_value_size = cpu_to_le32(value_len);
-+ if (value_len) {
-+ size_t size = EXT3_XATTR_SIZE(value_len);
-+ char *val = (char *)header + min_offs - size;
-+ here->e_value_offs =
-+ cpu_to_le16((char *)val - (char *)header);
-+ memset(val + size - EXT3_XATTR_PAD, 0,
-+ EXT3_XATTR_PAD); /* Clear the pad bytes. */
-+ memcpy(val, value, value_len);
-+ }
-+ }
-+ ext3_xattr_rehash(header, here);
-+
-+ error = ext3_xattr_set2(handle, inode, bh, header);
-+
-+cleanup:
-+ brelse(bh);
-+ if (!(bh && header == HDR(bh)))
-+ kfree(header);
-+ up(&ext3_xattr_sem);
-+
-+ return error;
-+}
-+
-+/*
-+ * Second half of ext3_xattr_set(): Update the file system.
-+ */
-+static int
-+ext3_xattr_set2(handle_t *handle, struct inode *inode,
-+ struct buffer_head *old_bh, struct ext3_xattr_header *header)
-+{
-+ struct super_block *sb = inode->i_sb;
-+ struct buffer_head *new_bh = NULL;
-+ int error;
-+
-+ if (header) {
-+ new_bh = ext3_xattr_cache_find(inode, header);
-+ if (new_bh) {
-+ /*
-+ * We found an identical block in the cache.
-+ * The old block will be released after updating
-+ * the inode.
-+ */
-+ ea_bdebug(old_bh, "reusing block %ld",
-+ new_bh->b_blocknr);
-+
-+ error = -EDQUOT;
-+ if (ext3_xattr_quota_alloc(inode, 1))
-+ goto cleanup;
-+
-+ error = ext3_journal_get_write_access(handle, new_bh);
-+ if (error)
-+ goto cleanup;
-+ HDR(new_bh)->h_refcount = cpu_to_le32(
-+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
-+ ea_bdebug(new_bh, "refcount now=%d",
-+ le32_to_cpu(HDR(new_bh)->h_refcount));
-+ } else if (old_bh && header == HDR(old_bh)) {
-+ /* Keep this block. */
-+ new_bh = old_bh;
-+ ext3_xattr_cache_insert(new_bh);
-+ } else {
-+ /* We need to allocate a new block */
-+ int force = EXT3_I(inode)->i_file_acl != 0;
-+ int block = ext3_xattr_new_block(handle, inode,
-+ &error, force);
-+ if (error)
-+ goto cleanup;
-+ ea_idebug(inode, "creating block %d", block);
-+
-+ new_bh = sb_getblk(sb, block);
-+ if (!new_bh) {
-+getblk_failed: ext3_xattr_free_block(handle, inode, block);
-+ error = -EIO;
-+ goto cleanup;
-+ }
-+ lock_buffer(new_bh);
-+ error = ext3_journal_get_create_access(handle, new_bh);
-+ if (error) {
-+ unlock_buffer(new_bh);
-+ goto getblk_failed;
-+ }
-+ memcpy(new_bh->b_data, header, new_bh->b_size);
-+ mark_buffer_uptodate(new_bh, 1);
-+ unlock_buffer(new_bh);
-+ ext3_xattr_cache_insert(new_bh);
-+
-+ ext3_xattr_update_super_block(handle, sb);
-+ }
-+ error = ext3_journal_dirty_metadata(handle, new_bh);
-+ if (error)
-+ goto cleanup;
-+ }
-+
-+ /* Update the inode. */
-+ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
-+ inode->i_ctime = CURRENT_TIME;
-+ ext3_mark_inode_dirty(handle, inode);
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+
-+ error = 0;
-+ if (old_bh && old_bh != new_bh) {
-+ /*
-+ * If there was an old block, and we are not still using it,
-+ * we now release the old block.
-+ */
-+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
-+
-+ error = ext3_journal_get_write_access(handle, old_bh);
-+ if (error)
-+ goto cleanup;
-+ if (refcount == 1) {
-+ /* Free the old block. */
-+ ea_bdebug(old_bh, "freeing");
-+ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
-+
-+ /* ext3_forget() calls bforget() for us, but we
-+ let our caller release old_bh, so we need to
-+ duplicate the handle before. */
-+ get_bh(old_bh);
-+ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
-+ } else {
-+ /* Decrement the refcount only. */
-+ refcount--;
-+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
-+ ext3_xattr_quota_free(inode);
-+ ext3_journal_dirty_metadata(handle, old_bh);
-+ ea_bdebug(old_bh, "refcount now=%d", refcount);
-+ }
-+ }
-+
-+cleanup:
-+ if (old_bh != new_bh)
-+ brelse(new_bh);
-+
-+ return error;
-+}
-+
-+/*
-+ * ext3_xattr_delete_inode()
-+ *
-+ * Free extended attribute resources associated with this inode. This
-+ * is called immediately before an inode is freed.
-+ */
-+void
-+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-+{
-+ struct buffer_head *bh;
-+ unsigned int block = EXT3_I(inode)->i_file_acl;
-+
-+ if (!block)
-+ return;
-+ down(&ext3_xattr_sem);
-+
-+ bh = sb_bread(inode->i_sb, block);
-+ if (!bh) {
-+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
-+ "inode %ld: block %d read error", inode->i_ino, block);
-+ goto cleanup;
-+ }
-+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
-+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+ HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
-+ "inode %ld: bad block %d", inode->i_ino, block);
-+ goto cleanup;
-+ }
-+ ext3_journal_get_write_access(handle, bh);
-+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-+ ext3_xattr_cache_remove(bh);
-+ ext3_xattr_free_block(handle, inode, block);
-+ ext3_forget(handle, 1, inode, bh, block);
-+ bh = NULL;
-+ } else {
-+ HDR(bh)->h_refcount = cpu_to_le32(
-+ le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+ ext3_journal_dirty_metadata(handle, bh);
-+ if (IS_SYNC(inode))
-+ handle->h_sync = 1;
-+ ext3_xattr_quota_free(inode);
-+ }
-+ EXT3_I(inode)->i_file_acl = 0;
-+
-+cleanup:
-+ brelse(bh);
-+ up(&ext3_xattr_sem);
-+}
-+
-+/*
-+ * ext3_xattr_put_super()
-+ *
-+ * This is called when a file system is unmounted.
-+ */
-+void
-+ext3_xattr_put_super(struct super_block *sb)
-+{
-+#ifdef CONFIG_EXT3_FS_XATTR_SHARING
-+ mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
-+#endif
-+}
-+
-+#ifdef CONFIG_EXT3_FS_XATTR_SHARING
-+
-+/*
-+ * ext3_xattr_cache_insert()
-+ *
-+ * Create a new entry in the extended attribute cache, and insert
-+ * it unless such an entry is already in the cache.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+static int
-+ext3_xattr_cache_insert(struct buffer_head *bh)
-+{
-+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-+ struct mb_cache_entry *ce;
-+ int error;
-+
-+ ce = mb_cache_entry_alloc(ext3_xattr_cache);
-+ if (!ce)
-+ return -ENOMEM;
-+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
-+ if (error) {
-+ mb_cache_entry_free(ce);
-+ if (error == -EBUSY) {
-+ ea_bdebug(bh, "already in cache (%d cache entries)",
-+ atomic_read(&ext3_xattr_cache->c_entry_count));
-+ error = 0;
-+ }
-+ } else {
-+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
-+ atomic_read(&ext3_xattr_cache->c_entry_count));
-+ mb_cache_entry_release(ce);
-+ }
-+ return error;
-+}
-+
-+/*
-+ * ext3_xattr_cmp()
-+ *
-+ * Compare two extended attribute blocks for equality.
-+ *
-+ * Returns 0 if the blocks are equal, 1 if they differ, and
-+ * a negative error number on errors.
-+ */
-+static int
-+ext3_xattr_cmp(struct ext3_xattr_header *header1,
-+ struct ext3_xattr_header *header2)
-+{
-+ struct ext3_xattr_entry *entry1, *entry2;
-+
-+ entry1 = ENTRY(header1+1);
-+ entry2 = ENTRY(header2+1);
-+ while (!IS_LAST_ENTRY(entry1)) {
-+ if (IS_LAST_ENTRY(entry2))
-+ return 1;
-+ if (entry1->e_hash != entry2->e_hash ||
-+ entry1->e_name_len != entry2->e_name_len ||
-+ entry1->e_value_size != entry2->e_value_size ||
-+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
-+ return 1;
-+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-+ return -EIO;
-+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
-+ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
-+ le32_to_cpu(entry1->e_value_size)))
-+ return 1;
-+
-+ entry1 = EXT3_XATTR_NEXT(entry1);
-+ entry2 = EXT3_XATTR_NEXT(entry2);
-+ }
-+ if (!IS_LAST_ENTRY(entry2))
-+ return 1;
-+ return 0;
-+}
-+
-+/*
-+ * ext3_xattr_cache_find()
-+ *
-+ * Find an identical extended attribute block.
-+ *
-+ * Returns a pointer to the block found, or NULL if such a block was
-+ * not found or an error occurred.
-+ */
-+static struct buffer_head *
-+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
-+{
-+ __u32 hash = le32_to_cpu(header->h_hash);
-+ struct mb_cache_entry *ce;
-+
-+ if (!header->h_hash)
-+ return NULL; /* never share */
-+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-+ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
-+ while (ce) {
-+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
-+
-+ if (!bh) {
-+ ext3_error(inode->i_sb, "ext3_xattr_cache_find",
-+ "inode %ld: block %ld read error",
-+ inode->i_ino, ce->e_block);
-+ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
-+ EXT3_XATTR_REFCOUNT_MAX) {
-+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
-+ le32_to_cpu(HDR(bh)->h_refcount),
-+ EXT3_XATTR_REFCOUNT_MAX);
-+ } else if (!ext3_xattr_cmp(header, HDR(bh))) {
-+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
-+ mb_cache_entry_release(ce);
-+ return bh;
-+ }
-+ brelse(bh);
-+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
-+ }
-+ return NULL;
-+}
-+
-+/*
-+ * ext3_xattr_cache_remove()
-+ *
-+ * Remove the cache entry of a block from the cache. Called when a
-+ * block becomes invalid.
-+ */
-+static void
-+ext3_xattr_cache_remove(struct buffer_head *bh)
-+{
-+ struct mb_cache_entry *ce;
-+
-+ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
-+ if (ce) {
-+ ea_bdebug(bh, "removing (%d cache entries remaining)",
-+ atomic_read(&ext3_xattr_cache->c_entry_count)-1);
-+ mb_cache_entry_free(ce);
-+ } else
-+ ea_bdebug(bh, "no cache entry");
-+}
-+
-+#define NAME_HASH_SHIFT 5
-+#define VALUE_HASH_SHIFT 16
-+
-+/*
-+ * ext3_xattr_hash_entry()
-+ *
-+ * Compute the hash of an extended attribute.
-+ */
-+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
-+ struct ext3_xattr_entry *entry)
-+{
-+ __u32 hash = 0;
-+ char *name = entry->e_name;
-+ int n;
-+
-+ for (n=0; n < entry->e_name_len; n++) {
-+ hash = (hash << NAME_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
-+ *name++;
-+ }
-+
-+ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
-+ __u32 *value = (__u32 *)((char *)header +
-+ le16_to_cpu(entry->e_value_offs));
-+ for (n = (le32_to_cpu(entry->e_value_size) +
-+ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
-+ hash = (hash << VALUE_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
-+ le32_to_cpu(*value++);
-+ }
-+ }
-+ entry->e_hash = cpu_to_le32(hash);
-+}
-+
-+#undef NAME_HASH_SHIFT
-+#undef VALUE_HASH_SHIFT
-+
-+#define BLOCK_HASH_SHIFT 16
-+
-+/*
-+ * ext3_xattr_rehash()
-+ *
-+ * Re-compute the extended attribute hash value after an entry has changed.
-+ */
-+static void ext3_xattr_rehash(struct ext3_xattr_header *header,
-+ struct ext3_xattr_entry *entry)
-+{
-+ struct ext3_xattr_entry *here;
-+ __u32 hash = 0;
-+
-+ ext3_xattr_hash_entry(header, entry);
-+ here = ENTRY(header+1);
-+ while (!IS_LAST_ENTRY(here)) {
-+ if (!here->e_hash) {
-+ /* Block is not shared if an entry's hash value == 0 */
-+ hash = 0;
-+ break;
-+ }
-+ hash = (hash << BLOCK_HASH_SHIFT) ^
-+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
-+ le32_to_cpu(here->e_hash);
-+ here = EXT3_XATTR_NEXT(here);
-+ }
-+ header->h_hash = cpu_to_le32(hash);
-+}
-+
-+#undef BLOCK_HASH_SHIFT
-+
-+int __init
-+init_ext3_xattr(void)
-+{
-+ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
-+ sizeof(struct mb_cache_entry) +
-+ sizeof(struct mb_cache_entry_index), 1, 61);
-+ if (!ext3_xattr_cache)
-+ return -ENOMEM;
-+
-+ return 0;
-+}
-+
-+void
-+exit_ext3_xattr(void)
-+{
-+ if (ext3_xattr_cache)
-+ mb_cache_destroy(ext3_xattr_cache);
-+ ext3_xattr_cache = NULL;
-+}
-+
-+#else /* CONFIG_EXT3_FS_XATTR_SHARING */
-+
-+int __init
-+init_ext3_xattr(void)
-+{
-+ return 0;
-+}
-+
-+void
-+exit_ext3_xattr(void)
-+{
-+}
-+
-+#endif /* CONFIG_EXT3_FS_XATTR_SHARING */
-Index: linux-DRV401/fs/ext3/xattr_user.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/xattr_user.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/ext3/xattr_user.c 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,111 @@
-+/*
-+ * linux/fs/ext3/xattr_user.c
-+ * Handler for extended user attributes.
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_xattr.h>
-+
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+# include <linux/ext3_acl.h>
-+#endif
-+
-+#define XATTR_USER_PREFIX "user."
-+
-+static size_t
-+ext3_xattr_user_list(char *list, struct inode *inode,
-+ const char *name, int name_len)
-+{
-+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
-+
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return 0;
-+
-+ if (list) {
-+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
-+ memcpy(list+prefix_len, name, name_len);
-+ list[prefix_len + name_len] = '\0';
-+ }
-+ return prefix_len + name_len + 1;
-+}
-+
-+static int
-+ext3_xattr_user_get(struct inode *inode, const char *name,
-+ void *buffer, size_t size)
-+{
-+ int error;
-+
-+ if (strcmp(name, "") == 0)
-+ return -EINVAL;
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return -ENOTSUP;
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+ error = ext3_permission_locked(inode, MAY_READ);
-+#else
-+ error = permission(inode, MAY_READ);
-+#endif
-+ if (error)
-+ return error;
-+
-+ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
-+ buffer, size);
-+}
-+
-+static int
-+ext3_xattr_user_set(struct inode *inode, const char *name,
-+ const void *value, size_t size, int flags)
-+{
-+ handle_t *handle;
-+ int error;
-+
-+ if (strcmp(name, "") == 0)
-+ return -EINVAL;
-+ if (!test_opt(inode->i_sb, XATTR_USER))
-+ return -ENOTSUP;
-+ if ( !S_ISREG(inode->i_mode) &&
-+ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-+ return -EPERM;
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+ error = ext3_permission_locked(inode, MAY_WRITE);
-+#else
-+ error = permission(inode, MAY_WRITE);
-+#endif
-+ if (error)
-+ return error;
-+
-+ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
-+ if (IS_ERR(handle))
-+ return PTR_ERR(handle);
-+ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
-+ value, size, flags);
-+ ext3_journal_stop(handle, inode);
-+
-+ return error;
-+}
-+
-+struct ext3_xattr_handler ext3_xattr_user_handler = {
-+ prefix: XATTR_USER_PREFIX,
-+ list: ext3_xattr_user_list,
-+ get: ext3_xattr_user_get,
-+ set: ext3_xattr_user_set,
-+};
-+
-+int __init
-+init_ext3_xattr_user(void)
-+{
-+ return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
-+ &ext3_xattr_user_handler);
-+}
-+
-+void
-+exit_ext3_xattr_user(void)
-+{
-+ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
-+ &ext3_xattr_user_handler);
-+}
-Index: linux-DRV401/fs/ext3/ext3-exports.c
-===================================================================
---- linux-DRV401.orig/fs/ext3/ext3-exports.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/ext3/ext3-exports.c 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,13 @@
-+#include <linux/config.h>
-+#include <linux/module.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
-+
-+EXPORT_SYMBOL(ext3_force_commit);
-+EXPORT_SYMBOL(ext3_bread);
-+EXPORT_SYMBOL(ext3_xattr_register);
-+EXPORT_SYMBOL(ext3_xattr_unregister);
-+EXPORT_SYMBOL(ext3_xattr_get);
-+EXPORT_SYMBOL(ext3_xattr_list);
-+EXPORT_SYMBOL(ext3_xattr_set);
-Index: linux-DRV401/fs/mbcache.c
-===================================================================
---- linux-DRV401.orig/fs/mbcache.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/mbcache.c 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,648 @@
-+/*
-+ * linux/fs/mbcache.c
-+ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+/*
-+ * Filesystem Meta Information Block Cache (mbcache)
-+ *
-+ * The mbcache caches blocks of block devices that need to be located
-+ * by their device/block number, as well as by other criteria (such
-+ * as the block's contents).
-+ *
-+ * There can only be one cache entry in a cache per device and block number.
-+ * Additional indexes need not be unique in this sense. The number of
-+ * additional indexes (=other criteria) can be hardwired at compile time
-+ * or specified at cache create time.
-+ *
-+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
-+ * in the cache. A valid entry is in the main hash tables of the cache,
-+ * and may also be in the lru list. An invalid entry is not in any hashes
-+ * or lists.
-+ *
-+ * A valid cache entry is only in the lru list if no handles refer to it.
-+ * Invalid cache entries will be freed when the last handle to the cache
-+ * entry is released. Entries that cannot be freed immediately are put
-+ * back on the lru list.
-+ */
-+
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+
-+#include <linux/fs.h>
-+#include <linux/slab.h>
-+#include <linux/sched.h>
-+#include <linux/cache_def.h>
-+#include <linux/version.h>
-+#include <linux/init.h>
-+#include <linux/mbcache.h>
-+
-+
-+#ifdef MB_CACHE_DEBUG
-+# define mb_debug(f...) do { \
-+ printk(KERN_DEBUG f); \
-+ printk("\n"); \
-+ } while (0)
-+#define mb_assert(c) do { if (!(c)) \
-+ printk(KERN_ERR "assertion " #c " failed\n"); \
-+ } while(0)
-+#else
-+# define mb_debug(f...) do { } while(0)
-+# define mb_assert(c) do { } while(0)
-+#endif
-+#define mb_error(f...) do { \
-+ printk(KERN_ERR f); \
-+ printk("\n"); \
-+ } while(0)
-+
-+MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
-+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
-+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
-+MODULE_LICENSE("GPL");
-+#endif
-+
-+EXPORT_SYMBOL(mb_cache_create);
-+EXPORT_SYMBOL(mb_cache_shrink);
-+EXPORT_SYMBOL(mb_cache_destroy);
-+EXPORT_SYMBOL(mb_cache_entry_alloc);
-+EXPORT_SYMBOL(mb_cache_entry_insert);
-+EXPORT_SYMBOL(mb_cache_entry_release);
-+EXPORT_SYMBOL(mb_cache_entry_takeout);
-+EXPORT_SYMBOL(mb_cache_entry_free);
-+EXPORT_SYMBOL(mb_cache_entry_dup);
-+EXPORT_SYMBOL(mb_cache_entry_get);
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+EXPORT_SYMBOL(mb_cache_entry_find_first);
-+EXPORT_SYMBOL(mb_cache_entry_find_next);
-+#endif
-+
-+
-+/*
-+ * Global data: list of all mbcache's, lru list, and a spinlock for
-+ * accessing cache data structures on SMP machines. The lru list is
-+ * global across all mbcaches.
-+ */
-+
-+static LIST_HEAD(mb_cache_list);
-+static LIST_HEAD(mb_cache_lru_list);
-+static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
-+
-+static inline int
-+mb_cache_indexes(struct mb_cache *cache)
-+{
-+#ifdef MB_CACHE_INDEXES_COUNT
-+ return MB_CACHE_INDEXES_COUNT;
-+#else
-+ return cache->c_indexes_count;
-+#endif
-+}
-+
-+/*
-+ * What the mbcache registers as to get shrunk dynamically.
-+ */
-+
-+static void
-+mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
-+
-+static struct cache_definition mb_cache_definition = {
-+ "mb_cache",
-+ mb_cache_memory_pressure
-+};
-+
-+
-+static inline int
-+__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
-+{
-+ return !list_empty(&ce->e_block_list);
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_unhash(struct mb_cache_entry *ce)
-+{
-+ int n;
-+
-+ if (__mb_cache_entry_is_hashed(ce)) {
-+ list_del_init(&ce->e_block_list);
-+ for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
-+ list_del(&ce->e_indexes[n].o_list);
-+ }
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
-+{
-+ struct mb_cache *cache = ce->e_cache;
-+
-+ mb_assert(atomic_read(&ce->e_used) == 0);
-+ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
-+ /* free failed -- put back on the lru list
-+ for freeing later. */
-+ spin_lock(&mb_cache_spinlock);
-+ list_add(&ce->e_lru_list, &mb_cache_lru_list);
-+ spin_unlock(&mb_cache_spinlock);
-+ } else {
-+ kmem_cache_free(cache->c_entry_cache, ce);
-+ atomic_dec(&cache->c_entry_count);
-+ }
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
-+{
-+ if (atomic_dec_and_test(&ce->e_used)) {
-+ if (__mb_cache_entry_is_hashed(ce))
-+ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
-+ else {
-+ spin_unlock(&mb_cache_spinlock);
-+ __mb_cache_entry_forget(ce, GFP_KERNEL);
-+ return;
-+ }
-+ }
-+ spin_unlock(&mb_cache_spinlock);
-+}
-+
-+
-+/*
-+ * mb_cache_memory_pressure() memory pressure callback
-+ *
-+ * This function is called by the kernel memory management when memory
-+ * gets low.
-+ *
-+ * @priority: Amount by which to shrink the cache (0 = highes priority)
-+ * @gfp_mask: (ignored)
-+ */
-+static void
-+mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
-+{
-+ LIST_HEAD(free_list);
-+ struct list_head *l, *ltmp;
-+ int count = 0;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_for_each(l, &mb_cache_list) {
-+ struct mb_cache *cache =
-+ list_entry(l, struct mb_cache, c_cache_list);
-+ mb_debug("cache %s (%d)", cache->c_name,
-+ atomic_read(&cache->c_entry_count));
-+ count += atomic_read(&cache->c_entry_count);
-+ }
-+ mb_debug("trying to free %d of %d entries",
-+ count / (priority ? priority : 1), count);
-+ if (priority)
-+ count /= priority;
-+ while (count-- && !list_empty(&mb_cache_lru_list)) {
-+ struct mb_cache_entry *ce =
-+ list_entry(mb_cache_lru_list.next,
-+ struct mb_cache_entry, e_lru_list);
-+ list_del(&ce->e_lru_list);
-+ __mb_cache_entry_unhash(ce);
-+ list_add_tail(&ce->e_lru_list, &free_list);
-+ }
-+ spin_unlock(&mb_cache_spinlock);
-+ list_for_each_safe(l, ltmp, &free_list) {
-+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-+ e_lru_list), gfp_mask);
-+ }
-+}
-+
-+
-+/*
-+ * mb_cache_create() create a new cache
-+ *
-+ * All entries in one cache are equal size. Cache entries may be from
-+ * multiple devices. If this is the first mbcache created, registers
-+ * the cache with kernel memory management. Returns NULL if no more
-+ * memory was available.
-+ *
-+ * @name: name of the cache (informal)
-+ * @cache_op: contains the callback called when freeing a cache entry
-+ * @entry_size: The size of a cache entry, including
-+ * struct mb_cache_entry
-+ * @indexes_count: number of additional indexes in the cache. Must equal
-+ * MB_CACHE_INDEXES_COUNT if the number of indexes is
-+ * hardwired.
-+ * @bucket_count: number of hash buckets
-+ */
-+struct mb_cache *
-+mb_cache_create(const char *name, struct mb_cache_op *cache_op,
-+ size_t entry_size, int indexes_count, int bucket_count)
-+{
-+ int m=0, n;
-+ struct mb_cache *cache = NULL;
-+
-+ if(entry_size < sizeof(struct mb_cache_entry) +
-+ indexes_count * sizeof(struct mb_cache_entry_index))
-+ return NULL;
-+
-+ MOD_INC_USE_COUNT;
-+ cache = kmalloc(sizeof(struct mb_cache) +
-+ indexes_count * sizeof(struct list_head), GFP_KERNEL);
-+ if (!cache)
-+ goto fail;
-+ cache->c_name = name;
-+ cache->c_op.free = NULL;
-+ if (cache_op)
-+ cache->c_op.free = cache_op->free;
-+ atomic_set(&cache->c_entry_count, 0);
-+ cache->c_bucket_count = bucket_count;
-+#ifdef MB_CACHE_INDEXES_COUNT
-+ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
-+#else
-+ cache->c_indexes_count = indexes_count;
-+#endif
-+ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
-+ GFP_KERNEL);
-+ if (!cache->c_block_hash)
-+ goto fail;
-+ for (n=0; n<bucket_count; n++)
-+ INIT_LIST_HEAD(&cache->c_block_hash[n]);
-+ for (m=0; m<indexes_count; m++) {
-+ cache->c_indexes_hash[m] = kmalloc(bucket_count *
-+ sizeof(struct list_head),
-+ GFP_KERNEL);
-+ if (!cache->c_indexes_hash[m])
-+ goto fail;
-+ for (n=0; n<bucket_count; n++)
-+ INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
-+ }
-+ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
-+ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
-+ if (!cache->c_entry_cache)
-+ goto fail;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_add(&cache->c_cache_list, &mb_cache_list);
-+ spin_unlock(&mb_cache_spinlock);
-+ return cache;
-+
-+fail:
-+ if (cache) {
-+ while (--m >= 0)
-+ kfree(cache->c_indexes_hash[m]);
-+ if (cache->c_block_hash)
-+ kfree(cache->c_block_hash);
-+ kfree(cache);
-+ }
-+ MOD_DEC_USE_COUNT;
-+ return NULL;
-+}
-+
-+
-+/*
-+ * mb_cache_shrink()
-+ *
-+ * Removes all cache entires of a device from the cache. All cache entries
-+ * currently in use cannot be freed, and thus remain in the cache.
-+ *
-+ * @cache: which cache to shrink
-+ * @dev: which device's cache entries to shrink
-+ */
-+void
-+mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
-+{
-+ LIST_HEAD(free_list);
-+ struct list_head *l, *ltmp;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
-+ struct mb_cache_entry *ce =
-+ list_entry(l, struct mb_cache_entry, e_lru_list);
-+ if (ce->e_dev == dev) {
-+ list_del(&ce->e_lru_list);
-+ list_add_tail(&ce->e_lru_list, &free_list);
-+ __mb_cache_entry_unhash(ce);
-+ }
-+ }
-+ spin_unlock(&mb_cache_spinlock);
-+ list_for_each_safe(l, ltmp, &free_list) {
-+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-+ e_lru_list), GFP_KERNEL);
-+ }
-+}
-+
-+
-+/*
-+ * mb_cache_destroy()
-+ *
-+ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
-+ * and then destroys it. If this was the last mbcache, un-registers the
-+ * mbcache from kernel memory management.
-+ */
-+void
-+mb_cache_destroy(struct mb_cache *cache)
-+{
-+ LIST_HEAD(free_list);
-+ struct list_head *l, *ltmp;
-+ int n;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
-+ struct mb_cache_entry *ce =
-+ list_entry(l, struct mb_cache_entry, e_lru_list);
-+ if (ce->e_cache == cache) {
-+ list_del(&ce->e_lru_list);
-+ list_add_tail(&ce->e_lru_list, &free_list);
-+ __mb_cache_entry_unhash(ce);
-+ }
-+ }
-+ list_del(&cache->c_cache_list);
-+ spin_unlock(&mb_cache_spinlock);
-+ list_for_each_safe(l, ltmp, &free_list) {
-+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-+ e_lru_list), GFP_KERNEL);
-+ }
-+
-+ if (atomic_read(&cache->c_entry_count) > 0) {
-+ mb_error("cache %s: %d orphaned entries",
-+ cache->c_name,
-+ atomic_read(&cache->c_entry_count));
-+ }
-+
-+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
-+ /* We don't have kmem_cache_destroy() in 2.2.x */
-+ kmem_cache_shrink(cache->c_entry_cache);
-+#else
-+ kmem_cache_destroy(cache->c_entry_cache);
-+#endif
-+ for (n=0; n < mb_cache_indexes(cache); n++)
-+ kfree(cache->c_indexes_hash[n]);
-+ kfree(cache->c_block_hash);
-+ kfree(cache);
-+
-+ MOD_DEC_USE_COUNT;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_alloc()
-+ *
-+ * Allocates a new cache entry. The new entry will not be valid initially,
-+ * and thus cannot be looked up yet. It should be filled with data, and
-+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
-+ * if no more memory was available.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_alloc(struct mb_cache *cache)
-+{
-+ struct mb_cache_entry *ce;
-+
-+ atomic_inc(&cache->c_entry_count);
-+ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
-+ if (ce) {
-+ INIT_LIST_HEAD(&ce->e_lru_list);
-+ INIT_LIST_HEAD(&ce->e_block_list);
-+ ce->e_cache = cache;
-+ atomic_set(&ce->e_used, 1);
-+ }
-+ return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_insert()
-+ *
-+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
-+ * the cache. After this, the cache entry can be looked up, but is not yet
-+ * in the lru list as the caller still holds a handle to it. Returns 0 on
-+ * success, or -EBUSY if a cache entry for that device + inode exists
-+ * already (this may happen after a failed lookup, if another process has
-+ * inserted the same cache entry in the meantime).
-+ *
-+ * @dev: device the cache entry belongs to
-+ * @block: block number
-+ * @keys: array of additional keys. There must be indexes_count entries
-+ * in the array (as specified when creating the cache).
-+ */
-+int
-+mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
-+ unsigned long block, unsigned int keys[])
-+{
-+ struct mb_cache *cache = ce->e_cache;
-+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
-+ struct list_head *l;
-+ int error = -EBUSY, n;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_for_each(l, &cache->c_block_hash[bucket]) {
-+ struct mb_cache_entry *ce =
-+ list_entry(l, struct mb_cache_entry, e_block_list);
-+ if (ce->e_dev == dev && ce->e_block == block)
-+ goto out;
-+ }
-+ __mb_cache_entry_unhash(ce);
-+ ce->e_dev = dev;
-+ ce->e_block = block;
-+ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
-+ for (n=0; n<mb_cache_indexes(cache); n++) {
-+ ce->e_indexes[n].o_key = keys[n];
-+ bucket = keys[n] % cache->c_bucket_count;
-+ list_add(&ce->e_indexes[n].o_list,
-+ &cache->c_indexes_hash[n][bucket]);
-+ }
-+out:
-+ spin_unlock(&mb_cache_spinlock);
-+ return error;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_release()
-+ *
-+ * Release a handle to a cache entry. When the last handle to a cache entry
-+ * is released it is either freed (if it is invalid) or otherwise inserted
-+ * in to the lru list.
-+ */
-+void
-+mb_cache_entry_release(struct mb_cache_entry *ce)
-+{
-+ spin_lock(&mb_cache_spinlock);
-+ __mb_cache_entry_release_unlock(ce);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_takeout()
-+ *
-+ * Take a cache entry out of the cache, making it invalid. The entry can later
-+ * be re-inserted using mb_cache_entry_insert(), or released using
-+ * mb_cache_entry_release().
-+ */
-+void
-+mb_cache_entry_takeout(struct mb_cache_entry *ce)
-+{
-+ spin_lock(&mb_cache_spinlock);
-+ mb_assert(list_empty(&ce->e_lru_list));
-+ __mb_cache_entry_unhash(ce);
-+ spin_unlock(&mb_cache_spinlock);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_free()
-+ *
-+ * This is equivalent to the sequence mb_cache_entry_takeout() --
-+ * mb_cache_entry_release().
-+ */
-+void
-+mb_cache_entry_free(struct mb_cache_entry *ce)
-+{
-+ spin_lock(&mb_cache_spinlock);
-+ mb_assert(list_empty(&ce->e_lru_list));
-+ __mb_cache_entry_unhash(ce);
-+ __mb_cache_entry_release_unlock(ce);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_dup()
-+ *
-+ * Duplicate a handle to a cache entry (does not duplicate the cache entry
-+ * itself). After the call, both the old and the new handle must be released.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_dup(struct mb_cache_entry *ce)
-+{
-+ atomic_inc(&ce->e_used);
-+ return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_get()
-+ *
-+ * Get a cache entry by device / block number. (There can only be one entry
-+ * in the cache per device and block.) Returns NULL if no such cache entry
-+ * exists.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
-+{
-+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
-+ struct list_head *l;
-+ struct mb_cache_entry *ce;
-+
-+ spin_lock(&mb_cache_spinlock);
-+ list_for_each(l, &cache->c_block_hash[bucket]) {
-+ ce = list_entry(l, struct mb_cache_entry, e_block_list);
-+ if (ce->e_dev == dev && ce->e_block == block) {
-+ if (!list_empty(&ce->e_lru_list))
-+ list_del_init(&ce->e_lru_list);
-+ atomic_inc(&ce->e_used);
-+ goto cleanup;
-+ }
-+ }
-+ ce = NULL;
-+
-+cleanup:
-+ spin_unlock(&mb_cache_spinlock);
-+ return ce;
-+}
-+
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+
-+static struct mb_cache_entry *
-+__mb_cache_entry_find(struct list_head *l, struct list_head *head,
-+ int index, kdev_t dev, unsigned int key)
-+{
-+ while (l != head) {
-+ struct mb_cache_entry *ce =
-+ list_entry(l, struct mb_cache_entry,
-+ e_indexes[index].o_list);
-+ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
-+ if (!list_empty(&ce->e_lru_list))
-+ list_del_init(&ce->e_lru_list);
-+ atomic_inc(&ce->e_used);
-+ return ce;
-+ }
-+ l = l->next;
-+ }
-+ return NULL;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_find_first()
-+ *
-+ * Find the first cache entry on a given device with a certain key in
-+ * an additional index. Additonal matches can be found with
-+ * mb_cache_entry_find_next(). Returns NULL if no match was found.
-+ *
-+ * @cache: the cache to search
-+ * @index: the number of the additonal index to search (0<=index<indexes_count)
-+ * @dev: the device the cache entry should belong to
-+ * @key: the key in the index
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
-+ unsigned int key)
-+{
-+ unsigned int bucket = key % cache->c_bucket_count;
-+ struct list_head *l;
-+ struct mb_cache_entry *ce;
-+
-+ mb_assert(index < mb_cache_indexes(cache));
-+ spin_lock(&mb_cache_spinlock);
-+ l = cache->c_indexes_hash[index][bucket].next;
-+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-+ index, dev, key);
-+ spin_unlock(&mb_cache_spinlock);
-+ return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_find_next()
-+ *
-+ * Find the next cache entry on a given device with a certain key in an
-+ * additional index. Returns NULL if no match could be found. The previous
-+ * entry is atomatically released, so that mb_cache_entry_find_next() can
-+ * be called like this:
-+ *
-+ * entry = mb_cache_entry_find_first();
-+ * while (entry) {
-+ * ...
-+ * entry = mb_cache_entry_find_next(entry, ...);
-+ * }
-+ *
-+ * @prev: The previous match
-+ * @index: the number of the additonal index to search (0<=index<indexes_count)
-+ * @dev: the device the cache entry should belong to
-+ * @key: the key in the index
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
-+ unsigned int key)
-+{
-+ struct mb_cache *cache = prev->e_cache;
-+ unsigned int bucket = key % cache->c_bucket_count;
-+ struct list_head *l;
-+ struct mb_cache_entry *ce;
-+
-+ mb_assert(index < mb_cache_indexes(cache));
-+ spin_lock(&mb_cache_spinlock);
-+ l = prev->e_indexes[index].o_list.next;
-+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-+ index, dev, key);
-+ __mb_cache_entry_release_unlock(prev);
-+ return ce;
-+}
-+
-+#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
-+
-+static int __init init_mbcache(void)
-+{
-+ register_cache(&mb_cache_definition);
-+ return 0;
-+}
-+
-+static void __exit exit_mbcache(void)
-+{
-+ unregister_cache(&mb_cache_definition);
-+}
-+
-+module_init(init_mbcache)
-+module_exit(exit_mbcache)
-+
-Index: linux-DRV401/fs/xattr.c
-===================================================================
---- linux-DRV401.orig/fs/xattr.c 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/fs/xattr.c 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,355 @@
-+/*
-+ File: fs/xattr.c
-+
-+ Extended attribute handling.
-+
-+ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
-+ Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
-+ */
-+#include <linux/fs.h>
-+#include <linux/slab.h>
-+#include <linux/vmalloc.h>
-+#include <linux/smp_lock.h>
-+#include <linux/file.h>
-+#include <linux/xattr.h>
-+#include <asm/uaccess.h>
-+
-+/*
-+ * Extended attribute memory allocation wrappers, originally
-+ * based on the Intermezzo PRESTO_ALLOC/PRESTO_FREE macros.
-+ * The vmalloc use here is very uncommon - extended attributes
-+ * are supposed to be small chunks of metadata, and it is quite
-+ * unusual to have very many extended attributes, so lists tend
-+ * to be quite short as well. The 64K upper limit is derived
-+ * from the extended attribute size limit used by XFS.
-+ * Intentionally allow zero @size for value/list size requests.
-+ */
-+static void *
-+xattr_alloc(size_t size, size_t limit)
-+{
-+ void *ptr;
-+
-+ if (size > limit)
-+ return ERR_PTR(-E2BIG);
-+
-+ if (!size) /* size request, no buffer is needed */
-+ return NULL;
-+ else if (size <= PAGE_SIZE)
-+ ptr = kmalloc((unsigned long) size, GFP_KERNEL);
-+ else
-+ ptr = vmalloc((unsigned long) size);
-+ if (!ptr)
-+ return ERR_PTR(-ENOMEM);
-+ return ptr;
-+}
-+
-+static void
-+xattr_free(void *ptr, size_t size)
-+{
-+ if (!size) /* size request, no buffer was needed */
-+ return;
-+ else if (size <= PAGE_SIZE)
-+ kfree(ptr);
-+ else
-+ vfree(ptr);
-+}
-+
-+/*
-+ * Extended attribute SET operations
-+ */
-+static long
-+setxattr(struct dentry *d, char *name, void *value, size_t size, int flags)
-+{
-+ int error;
-+ void *kvalue;
-+ char kname[XATTR_NAME_MAX + 1];
-+
-+ if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
-+ return -EINVAL;
-+
-+ error = strncpy_from_user(kname, name, sizeof(kname));
-+ if (error == 0 || error == sizeof(kname))
-+ error = -ERANGE;
-+ if (error < 0)
-+ return error;
-+
-+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX);
-+ if (IS_ERR(kvalue))
-+ return PTR_ERR(kvalue);
-+
-+ if (size > 0 && copy_from_user(kvalue, value, size)) {
-+ xattr_free(kvalue, size);
-+ return -EFAULT;
-+ }
-+
-+ error = -EOPNOTSUPP;
-+ if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
-+ down(&d->d_inode->i_sem);
-+ lock_kernel();
-+ error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags);
-+ unlock_kernel();
-+ up(&d->d_inode->i_sem);
-+ }
-+
-+ xattr_free(kvalue, size);
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_setxattr(char *path, char *name, void *value, size_t size, int flags)
-+{
-+ struct nameidata nd;
-+ int error;
-+
-+ error = user_path_walk(path, &nd);
-+ if (error)
-+ return error;
-+ error = setxattr(nd.dentry, name, value, size, flags);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_lsetxattr(char *path, char *name, void *value, size_t size, int flags)
-+{
-+ struct nameidata nd;
-+ int error;
-+
-+ error = user_path_walk_link(path, &nd);
-+ if (error)
-+ return error;
-+ error = setxattr(nd.dentry, name, value, size, flags);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_fsetxattr(int fd, char *name, void *value, size_t size, int flags)
-+{
-+ struct file *f;
-+ int error = -EBADF;
-+
-+ f = fget(fd);
-+ if (!f)
-+ return error;
-+ error = setxattr(f->f_dentry, name, value, size, flags);
-+ fput(f);
-+ return error;
-+}
-+
-+/*
-+ * Extended attribute GET operations
-+ */
-+static ssize_t
-+getxattr(struct dentry *d, char *name, void *value, size_t size)
-+{
-+ ssize_t error;
-+ void *kvalue;
-+ char kname[XATTR_NAME_MAX + 1];
-+
-+ error = strncpy_from_user(kname, name, sizeof(kname));
-+ if (error == 0 || error == sizeof(kname))
-+ error = -ERANGE;
-+ if (error < 0)
-+ return error;
-+
-+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX);
-+ if (IS_ERR(kvalue))
-+ return PTR_ERR(kvalue);
-+
-+ error = -EOPNOTSUPP;
-+ if (d->d_inode->i_op && d->d_inode->i_op->getxattr) {
-+ down(&d->d_inode->i_sem);
-+ lock_kernel();
-+ error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
-+ unlock_kernel();
-+ up(&d->d_inode->i_sem);
-+ }
-+
-+ if (kvalue && error > 0)
-+ if (copy_to_user(value, kvalue, error))
-+ error = -EFAULT;
-+ xattr_free(kvalue, size);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_getxattr(char *path, char *name, void *value, size_t size)
-+{
-+ struct nameidata nd;
-+ ssize_t error;
-+
-+ error = user_path_walk(path, &nd);
-+ if (error)
-+ return error;
-+ error = getxattr(nd.dentry, name, value, size);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_lgetxattr(char *path, char *name, void *value, size_t size)
-+{
-+ struct nameidata nd;
-+ ssize_t error;
-+
-+ error = user_path_walk_link(path, &nd);
-+ if (error)
-+ return error;
-+ error = getxattr(nd.dentry, name, value, size);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_fgetxattr(int fd, char *name, void *value, size_t size)
-+{
-+ struct file *f;
-+ ssize_t error = -EBADF;
-+
-+ f = fget(fd);
-+ if (!f)
-+ return error;
-+ error = getxattr(f->f_dentry, name, value, size);
-+ fput(f);
-+ return error;
-+}
-+
-+/*
-+ * Extended attribute LIST operations
-+ */
-+static ssize_t
-+listxattr(struct dentry *d, char *list, size_t size)
-+{
-+ ssize_t error;
-+ char *klist;
-+
-+ klist = (char *)xattr_alloc(size, XATTR_LIST_MAX);
-+ if (IS_ERR(klist))
-+ return PTR_ERR(klist);
-+
-+ error = -EOPNOTSUPP;
-+ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-+ down(&d->d_inode->i_sem);
-+ lock_kernel();
-+ error = d->d_inode->i_op->listxattr(d, klist, size);
-+ unlock_kernel();
-+ up(&d->d_inode->i_sem);
-+ }
-+
-+ if (klist && error > 0)
-+ if (copy_to_user(list, klist, error))
-+ error = -EFAULT;
-+ xattr_free(klist, size);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_listxattr(char *path, char *list, size_t size)
-+{
-+ struct nameidata nd;
-+ ssize_t error;
-+
-+ error = user_path_walk(path, &nd);
-+ if (error)
-+ return error;
-+ error = listxattr(nd.dentry, list, size);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_llistxattr(char *path, char *list, size_t size)
-+{
-+ struct nameidata nd;
-+ ssize_t error;
-+
-+ error = user_path_walk_link(path, &nd);
-+ if (error)
-+ return error;
-+ error = listxattr(nd.dentry, list, size);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage ssize_t
-+sys_flistxattr(int fd, char *list, size_t size)
-+{
-+ struct file *f;
-+ ssize_t error = -EBADF;
-+
-+ f = fget(fd);
-+ if (!f)
-+ return error;
-+ error = listxattr(f->f_dentry, list, size);
-+ fput(f);
-+ return error;
-+}
-+
-+/*
-+ * Extended attribute REMOVE operations
-+ */
-+static long
-+removexattr(struct dentry *d, char *name)
-+{
-+ int error;
-+ char kname[XATTR_NAME_MAX + 1];
-+
-+ error = strncpy_from_user(kname, name, sizeof(kname));
-+ if (error == 0 || error == sizeof(kname))
-+ error = -ERANGE;
-+ if (error < 0)
-+ return error;
-+
-+ error = -EOPNOTSUPP;
-+ if (d->d_inode->i_op && d->d_inode->i_op->removexattr) {
-+ down(&d->d_inode->i_sem);
-+ lock_kernel();
-+ error = d->d_inode->i_op->removexattr(d, kname);
-+ unlock_kernel();
-+ up(&d->d_inode->i_sem);
-+ }
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_removexattr(char *path, char *name)
-+{
-+ struct nameidata nd;
-+ int error;
-+
-+ error = user_path_walk(path, &nd);
-+ if (error)
-+ return error;
-+ error = removexattr(nd.dentry, name);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_lremovexattr(char *path, char *name)
-+{
-+ struct nameidata nd;
-+ int error;
-+
-+ error = user_path_walk_link(path, &nd);
-+ if (error)
-+ return error;
-+ error = removexattr(nd.dentry, name);
-+ path_release(&nd);
-+ return error;
-+}
-+
-+asmlinkage long
-+sys_fremovexattr(int fd, char *name)
-+{
-+ struct file *f;
-+ int error = -EBADF;
-+
-+ f = fget(fd);
-+ if (!f)
-+ return error;
-+ error = removexattr(f->f_dentry, name);
-+ fput(f);
-+ return error;
-+}
-Index: linux-DRV401/include/linux/cache_def.h
-===================================================================
---- linux-DRV401.orig/include/linux/cache_def.h 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/include/linux/cache_def.h 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,15 @@
-+/*
-+ * linux/cache_def.h
-+ * Handling of caches defined in drivers, filesystems, ...
-+ *
-+ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+struct cache_definition {
-+ const char *name;
-+ void (*shrink)(int, unsigned int);
-+ struct list_head link;
-+};
-+
-+extern void register_cache(struct cache_definition *);
-+extern void unregister_cache(struct cache_definition *);
-Index: linux-DRV401/include/linux/errno.h
-===================================================================
---- linux-DRV401.orig/include/linux/errno.h 2004-10-15 10:26:15.000000000 -0700
-+++ linux-DRV401/include/linux/errno.h 2004-10-15 11:03:52.000000000 -0700
-@@ -23,4 +23,8 @@
-
- #endif
-
-+/* Defined for extended attributes */
-+#define ENOATTR ENODATA /* No such attribute */
-+#define ENOTSUP EOPNOTSUPP /* Operation not supported */
-+
- #endif
-Index: linux-DRV401/include/linux/ext2_fs.h
-===================================================================
---- linux-DRV401.orig/include/linux/ext2_fs.h 2004-10-15 10:26:11.000000000 -0700
-+++ linux-DRV401/include/linux/ext2_fs.h 2004-10-15 11:03:52.000000000 -0700
-@@ -57,8 +57,6 @@
- */
- #define EXT2_BAD_INO 1 /* Bad blocks inode */
- #define EXT2_ROOT_INO 2 /* Root inode */
--#define EXT2_ACL_IDX_INO 3 /* ACL inode */
--#define EXT2_ACL_DATA_INO 4 /* ACL inode */
- #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */
- #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */
-
-@@ -86,7 +84,6 @@
- #else
- # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
- #endif
--#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
- #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
- #ifdef __KERNEL__
- # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
-@@ -121,28 +118,6 @@
- #endif
-
- /*
-- * ACL structures
-- */
--struct ext2_acl_header /* Header of Access Control Lists */
--{
-- __u32 aclh_size;
-- __u32 aclh_file_count;
-- __u32 aclh_acle_count;
-- __u32 aclh_first_acle;
--};
--
--struct ext2_acl_entry /* Access Control List Entry */
--{
-- __u32 acle_size;
-- __u16 acle_perms; /* Access permissions */
-- __u16 acle_type; /* Type of entry */
-- __u16 acle_tag; /* User or group identity */
-- __u16 acle_pad1;
-- __u32 acle_next; /* Pointer on next entry for the */
-- /* same inode or on next free entry */
--};
--
--/*
- * Structure of a blocks group descriptor
- */
- struct ext2_group_desc
-@@ -314,6 +289,7 @@
- #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
- #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
- #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */
-+#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
-
- #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
- #define set_opt(o, opt) o |= EXT2_MOUNT_##opt
-@@ -397,6 +373,7 @@
-
- #ifdef __KERNEL__
- #define EXT2_SB(sb) (&((sb)->u.ext2_sb))
-+#define EXT2_I(inode) (&((inode)->u.ext2_i))
- #else
- /* Assume that user mode programs are passing in an ext2fs superblock, not
- * a kernel struct super_block. This will allow us to call the feature-test
-@@ -466,7 +443,7 @@
- #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
- #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
-
--#define EXT2_FEATURE_COMPAT_SUPP 0
-+#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE
- #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
-@@ -623,8 +600,10 @@
-
- /* namei.c */
- extern struct inode_operations ext2_dir_inode_operations;
-+extern struct inode_operations ext2_special_inode_operations;
-
- /* symlink.c */
-+extern struct inode_operations ext2_symlink_inode_operations;
- extern struct inode_operations ext2_fast_symlink_inode_operations;
-
- #endif /* __KERNEL__ */
-Index: linux-DRV401/include/linux/ext2_xattr.h
-===================================================================
---- linux-DRV401.orig/include/linux/ext2_xattr.h 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/include/linux/ext2_xattr.h 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,157 @@
-+/*
-+ File: linux/ext2_xattr.h
-+
-+ On-disk format of extended attributes for the ext2 filesystem.
-+
-+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+*/
-+
-+#include <linux/config.h>
-+#include <linux/init.h>
-+#include <linux/xattr.h>
-+
-+/* Magic value in attribute blocks */
-+#define EXT2_XATTR_MAGIC 0xEA020000
-+
-+/* Maximum number of references to one attribute block */
-+#define EXT2_XATTR_REFCOUNT_MAX 1024
-+
-+/* Name indexes */
-+#define EXT2_XATTR_INDEX_MAX 10
-+#define EXT2_XATTR_INDEX_USER 1
-+#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2
-+#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3
-+
-+struct ext2_xattr_header {
-+ __u32 h_magic; /* magic number for identification */
-+ __u32 h_refcount; /* reference count */
-+ __u32 h_blocks; /* number of disk blocks used */
-+ __u32 h_hash; /* hash value of all attributes */
-+ __u32 h_reserved[4]; /* zero right now */
-+};
-+
-+struct ext2_xattr_entry {
-+ __u8 e_name_len; /* length of name */
-+ __u8 e_name_index; /* attribute name index */
-+ __u16 e_value_offs; /* offset in disk block of value */
-+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
-+ __u32 e_value_size; /* size of attribute value */
-+ __u32 e_hash; /* hash value of name and value */
-+ char e_name[0]; /* attribute name */
-+};
-+
-+#define EXT2_XATTR_PAD_BITS 2
-+#define EXT2_XATTR_PAD (1<<EXT2_XATTR_PAD_BITS)
-+#define EXT2_XATTR_ROUND (EXT2_XATTR_PAD-1)
-+#define EXT2_XATTR_LEN(name_len) \
-+ (((name_len) + EXT2_XATTR_ROUND + \
-+ sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
-+#define EXT2_XATTR_NEXT(entry) \
-+ ( (struct ext2_xattr_entry *)( \
-+ (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
-+#define EXT2_XATTR_SIZE(size) \
-+ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
-+
-+#ifdef __KERNEL__
-+
-+# ifdef CONFIG_EXT2_FS_XATTR
-+
-+struct ext2_xattr_handler {
-+ char *prefix;
-+ size_t (*list)(char *list, struct inode *inode, const char *name,
-+ int name_len);
-+ int (*get)(struct inode *inode, const char *name, void *buffer,
-+ size_t size);
-+ int (*set)(struct inode *inode, const char *name, const void *buffer,
-+ size_t size, int flags);
-+};
-+
-+extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
-+extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
-+
-+extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
-+extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
-+extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
-+extern int ext2_removexattr(struct dentry *, const char *);
-+
-+extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
-+extern int ext2_xattr_list(struct inode *, char *, size_t);
-+extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
-+
-+extern void ext2_xattr_delete_inode(struct inode *);
-+extern void ext2_xattr_put_super(struct super_block *);
-+
-+extern int init_ext2_xattr(void) __init;
-+extern void exit_ext2_xattr(void);
-+
-+# else /* CONFIG_EXT2_FS_XATTR */
-+# define ext2_setxattr NULL
-+# define ext2_getxattr NULL
-+# define ext2_listxattr NULL
-+# define ext2_removexattr NULL
-+
-+static inline int
-+ext2_xattr_get(struct inode *inode, int name_index,
-+ const char *name, void *buffer, size_t size)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline int
-+ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline int
-+ext2_xattr_set(struct inode *inode, int name_index, const char *name,
-+ const void *value, size_t size, int flags)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline void
-+ext2_xattr_delete_inode(struct inode *inode)
-+{
-+}
-+
-+static inline void
-+ext2_xattr_put_super(struct super_block *sb)
-+{
-+}
-+
-+static inline int
-+init_ext2_xattr(void)
-+{
-+ return 0;
-+}
-+
-+static inline void
-+exit_ext2_xattr(void)
-+{
-+}
-+
-+# endif /* CONFIG_EXT2_FS_XATTR */
-+
-+# ifdef CONFIG_EXT2_FS_XATTR_USER
-+
-+extern int init_ext2_xattr_user(void) __init;
-+extern void exit_ext2_xattr_user(void);
-+
-+# else /* CONFIG_EXT2_FS_XATTR_USER */
-+
-+static inline int
-+init_ext2_xattr_user(void)
-+{
-+ return 0;
-+}
-+
-+static inline void
-+exit_ext2_xattr_user(void)
-+{
-+}
-+
-+# endif /* CONFIG_EXT2_FS_XATTR_USER */
-+
-+#endif /* __KERNEL__ */
-+
-Index: linux-DRV401/include/linux/ext3_fs.h
-===================================================================
---- linux-DRV401.orig/include/linux/ext3_fs.h 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/include/linux/ext3_fs.h 2004-10-15 11:03:52.000000000 -0700
-@@ -63,8 +63,6 @@
- */
- #define EXT3_BAD_INO 1 /* Bad blocks inode */
- #define EXT3_ROOT_INO 2 /* Root inode */
--#define EXT3_ACL_IDX_INO 3 /* ACL inode */
--#define EXT3_ACL_DATA_INO 4 /* ACL inode */
- #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
- #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
- #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
-@@ -94,7 +92,6 @@
- #else
- # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
- #endif
--#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
- #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
- #ifdef __KERNEL__
- # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
-@@ -129,28 +126,6 @@
- #endif
-
- /*
-- * ACL structures
-- */
--struct ext3_acl_header /* Header of Access Control Lists */
--{
-- __u32 aclh_size;
-- __u32 aclh_file_count;
-- __u32 aclh_acle_count;
-- __u32 aclh_first_acle;
--};
--
--struct ext3_acl_entry /* Access Control List Entry */
--{
-- __u32 acle_size;
-- __u16 acle_perms; /* Access permissions */
-- __u16 acle_type; /* Type of entry */
-- __u16 acle_tag; /* User or group identity */
-- __u16 acle_pad1;
-- __u32 acle_next; /* Pointer on next entry for the */
-- /* same inode or on next free entry */
--};
--
--/*
- * Structure of a blocks group descriptor
- */
- struct ext3_group_desc
-@@ -344,6 +319,7 @@
- #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */
- #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
- #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
-+#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
-
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
-@@ -520,7 +496,7 @@
- #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
- #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
-
--#define EXT3_FEATURE_COMPAT_SUPP 0
-+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
- EXT3_FEATURE_INCOMPAT_RECOVER)
- #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-@@ -703,6 +679,7 @@
- extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
-
- /* inode.c */
-+extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
- extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
- extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
-
-@@ -771,8 +748,10 @@
-
- /* namei.c */
- extern struct inode_operations ext3_dir_inode_operations;
-+extern struct inode_operations ext3_special_inode_operations;
-
- /* symlink.c */
-+extern struct inode_operations ext3_symlink_inode_operations;
- extern struct inode_operations ext3_fast_symlink_inode_operations;
-
-
-Index: linux-DRV401/include/linux/ext3_jbd.h
-===================================================================
---- linux-DRV401.orig/include/linux/ext3_jbd.h 2004-10-15 10:39:16.000000000 -0700
-+++ linux-DRV401/include/linux/ext3_jbd.h 2004-10-15 11:03:52.000000000 -0700
-@@ -30,13 +30,19 @@
-
- #define EXT3_SINGLEDATA_TRANS_BLOCKS 8
-
-+/* Extended attributes may touch two data buffers, two bitmap buffers,
-+ * and two group and summaries. */
-+
-+#define EXT3_XATTR_TRANS_BLOCKS 8
-+
- /* Define the minimum size for a transaction which modifies data. This
- * needs to take into account the fact that we may end up modifying two
- * quota files too (one for the group, one for the user quota). The
- * superblock only gets updated once, of course, so don't bother
- * counting that again for the quota updates. */
-
--#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
-+#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
-+ EXT3_XATTR_TRANS_BLOCKS - 2)
-
- extern int ext3_writepage_trans_blocks(struct inode *inode);
-
-Index: linux-DRV401/include/linux/ext3_xattr.h
-===================================================================
---- linux-DRV401.orig/include/linux/ext3_xattr.h 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/include/linux/ext3_xattr.h 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,157 @@
-+/*
-+ File: linux/ext3_xattr.h
-+
-+ On-disk format of extended attributes for the ext3 filesystem.
-+
-+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+*/
-+
-+#include <linux/config.h>
-+#include <linux/init.h>
-+#include <linux/xattr.h>
-+
-+/* Magic value in attribute blocks */
-+#define EXT3_XATTR_MAGIC 0xEA020000
-+
-+/* Maximum number of references to one attribute block */
-+#define EXT3_XATTR_REFCOUNT_MAX 1024
-+
-+/* Name indexes */
-+#define EXT3_XATTR_INDEX_MAX 10
-+#define EXT3_XATTR_INDEX_USER 1
-+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
-+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
-+
-+struct ext3_xattr_header {
-+ __u32 h_magic; /* magic number for identification */
-+ __u32 h_refcount; /* reference count */
-+ __u32 h_blocks; /* number of disk blocks used */
-+ __u32 h_hash; /* hash value of all attributes */
-+ __u32 h_reserved[4]; /* zero right now */
-+};
-+
-+struct ext3_xattr_entry {
-+ __u8 e_name_len; /* length of name */
-+ __u8 e_name_index; /* attribute name index */
-+ __u16 e_value_offs; /* offset in disk block of value */
-+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
-+ __u32 e_value_size; /* size of attribute value */
-+ __u32 e_hash; /* hash value of name and value */
-+ char e_name[0]; /* attribute name */
-+};
-+
-+#define EXT3_XATTR_PAD_BITS 2
-+#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
-+#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
-+#define EXT3_XATTR_LEN(name_len) \
-+ (((name_len) + EXT3_XATTR_ROUND + \
-+ sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
-+#define EXT3_XATTR_NEXT(entry) \
-+ ( (struct ext3_xattr_entry *)( \
-+ (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
-+#define EXT3_XATTR_SIZE(size) \
-+ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
-+
-+#ifdef __KERNEL__
-+
-+# ifdef CONFIG_EXT3_FS_XATTR
-+
-+struct ext3_xattr_handler {
-+ char *prefix;
-+ size_t (*list)(char *list, struct inode *inode, const char *name,
-+ int name_len);
-+ int (*get)(struct inode *inode, const char *name, void *buffer,
-+ size_t size);
-+ int (*set)(struct inode *inode, const char *name, const void *buffer,
-+ size_t size, int flags);
-+};
-+
-+extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
-+extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
-+
-+extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
-+extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
-+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
-+extern int ext3_removexattr(struct dentry *, const char *);
-+
-+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
-+extern int ext3_xattr_list(struct inode *, char *, size_t);
-+extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
-+
-+extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
-+extern void ext3_xattr_put_super(struct super_block *);
-+
-+extern int init_ext3_xattr(void) __init;
-+extern void exit_ext3_xattr(void);
-+
-+# else /* CONFIG_EXT3_FS_XATTR */
-+# define ext3_setxattr NULL
-+# define ext3_getxattr NULL
-+# define ext3_listxattr NULL
-+# define ext3_removexattr NULL
-+
-+static inline int
-+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-+ void *buffer, size_t size)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline int
-+ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline int
-+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
-+ const char *name, const void *value, size_t size, int flags)
-+{
-+ return -ENOTSUP;
-+}
-+
-+static inline void
-+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-+{
-+}
-+
-+static inline void
-+ext3_xattr_put_super(struct super_block *sb)
-+{
-+}
-+
-+static inline int
-+init_ext3_xattr(void)
-+{
-+ return 0;
-+}
-+
-+static inline void
-+exit_ext3_xattr(void)
-+{
-+}
-+
-+# endif /* CONFIG_EXT3_FS_XATTR */
-+
-+# ifdef CONFIG_EXT3_FS_XATTR_USER
-+
-+extern int init_ext3_xattr_user(void) __init;
-+extern void exit_ext3_xattr_user(void);
-+
-+# else /* CONFIG_EXT3_FS_XATTR_USER */
-+
-+static inline int
-+init_ext3_xattr_user(void)
-+{
-+ return 0;
-+}
-+
-+static inline void
-+exit_ext3_xattr_user(void)
-+{
-+}
-+
-+#endif /* CONFIG_EXT3_FS_XATTR_USER */
-+
-+#endif /* __KERNEL__ */
-+
-Index: linux-DRV401/include/linux/fs.h
-===================================================================
---- linux-DRV401.orig/include/linux/fs.h 2004-10-15 10:39:15.000000000 -0700
-+++ linux-DRV401/include/linux/fs.h 2004-10-15 11:03:52.000000000 -0700
-@@ -936,6 +936,10 @@
- int (*setattr) (struct dentry *, struct iattr *);
- int (*setattr_raw) (struct inode *, struct iattr *);
- int (*getattr) (struct dentry *, struct iattr *);
-+ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
-+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
-+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
-+ int (*removexattr) (struct dentry *, const char *);
- };
-
- struct seq_file;
-Index: linux-DRV401/include/linux/mbcache.h
-===================================================================
---- linux-DRV401.orig/include/linux/mbcache.h 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/include/linux/mbcache.h 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,69 @@
-+/*
-+ File: linux/mbcache.h
-+
-+ (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+*/
-+
-+/* Hardwire the number of additional indexes */
-+#define MB_CACHE_INDEXES_COUNT 1
-+
-+struct mb_cache_entry;
-+
-+struct mb_cache_op {
-+ int (*free)(struct mb_cache_entry *, int);
-+};
-+
-+struct mb_cache {
-+ struct list_head c_cache_list;
-+ const char *c_name;
-+ struct mb_cache_op c_op;
-+ atomic_t c_entry_count;
-+ int c_bucket_count;
-+#ifndef MB_CACHE_INDEXES_COUNT
-+ int c_indexes_count;
-+#endif
-+ kmem_cache_t *c_entry_cache;
-+ struct list_head *c_block_hash;
-+ struct list_head *c_indexes_hash[0];
-+};
-+
-+struct mb_cache_entry_index {
-+ struct list_head o_list;
-+ unsigned int o_key;
-+};
-+
-+struct mb_cache_entry {
-+ struct list_head e_lru_list;
-+ struct mb_cache *e_cache;
-+ atomic_t e_used;
-+ kdev_t e_dev;
-+ unsigned long e_block;
-+ struct list_head e_block_list;
-+ struct mb_cache_entry_index e_indexes[0];
-+};
-+
-+/* Functions on caches */
-+
-+struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
-+ int, int);
-+void mb_cache_shrink(struct mb_cache *, kdev_t);
-+void mb_cache_destroy(struct mb_cache *);
-+
-+/* Functions on cache entries */
-+
-+struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
-+int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
-+ unsigned int[]);
-+void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
-+void mb_cache_entry_release(struct mb_cache_entry *);
-+void mb_cache_entry_takeout(struct mb_cache_entry *);
-+void mb_cache_entry_free(struct mb_cache_entry *);
-+struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
-+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
-+ unsigned long);
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
-+ kdev_t, unsigned int);
-+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
-+ kdev_t, unsigned int);
-+#endif
-Index: linux-DRV401/include/linux/xattr.h
-===================================================================
---- linux-DRV401.orig/include/linux/xattr.h 2004-10-12 08:56:38.404764448 -0700
-+++ linux-DRV401/include/linux/xattr.h 2004-10-15 11:03:52.000000000 -0700
-@@ -0,0 +1,15 @@
-+/*
-+ File: linux/xattr.h
-+
-+ Extended attributes handling.
-+
-+ Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
-+ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
-+*/
-+#ifndef _LINUX_XATTR_H
-+#define _LINUX_XATTR_H
-+
-+#define XATTR_CREATE 0x1 /* set the value, fail if attr already exists */
-+#define XATTR_REPLACE 0x2 /* set the value, fail if attr does not exist */
-+
-+#endif /* _LINUX_XATTR_H */
-Index: linux-DRV401/include/linux/limits.h
-===================================================================
---- linux-DRV401.orig/include/linux/limits.h 2004-10-15 10:26:20.000000000 -0700
-+++ linux-DRV401/include/linux/limits.h 2004-10-15 11:03:52.000000000 -0700
-@@ -13,6 +13,9 @@
- #define NAME_MAX 255 /* # chars in a file name */
- #define PATH_MAX 4096 /* # chars in a path name including nul */
- #define PIPE_BUF 4096 /* # bytes in atomic write to a pipe */
-+#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */
-+#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */
-+#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */
-
- #define RTSIG_MAX 32
-
-Index: linux-DRV401/kernel/ksyms.c
-===================================================================
---- linux-DRV401.orig/kernel/ksyms.c 2004-10-15 10:39:15.000000000 -0700
-+++ linux-DRV401/kernel/ksyms.c 2004-10-15 11:03:52.000000000 -0700
-@@ -11,6 +11,7 @@
-
- #include <linux/config.h>
- #include <linux/slab.h>
-+#include <linux/cache_def.h>
- #include <linux/module.h>
- #include <linux/blkdev.h>
- #include <linux/cdrom.h>
-@@ -88,6 +89,7 @@
- EXPORT_SYMBOL(exit_files);
- EXPORT_SYMBOL(exit_fs);
- EXPORT_SYMBOL(exit_sighand);
-+EXPORT_SYMBOL(copy_fs_struct);
- EXPORT_SYMBOL(unshare_files);
-
- /* internal kernel memory management */
-@@ -105,6 +107,8 @@
- EXPORT_SYMBOL(kmem_cache_shrink);
- EXPORT_SYMBOL(kmem_cache_alloc);
- EXPORT_SYMBOL(kmem_cache_free);
-+EXPORT_SYMBOL(register_cache);
-+EXPORT_SYMBOL(unregister_cache);
- EXPORT_SYMBOL(kmalloc);
- EXPORT_SYMBOL(kfree);
- EXPORT_SYMBOL(vfree);
-Index: linux-DRV401/mm/vmscan.c
-===================================================================
---- linux-DRV401.orig/mm/vmscan.c 2004-10-15 10:24:07.000000000 -0700
-+++ linux-DRV401/mm/vmscan.c 2004-10-15 11:08:53.000000000 -0700
-@@ -15,6 +15,7 @@
- #include <linux/kernel_stat.h>
- #include <linux/swap.h>
- #include <linux/swapctl.h>
-+#include <linux/cache_def.h>
- #include <linux/smp_lock.h>
- #include <linux/pagemap.h>
- #include <linux/init.h>
-@@ -31,6 +32,39 @@
- */
- #define DEF_PRIORITY (6)
-
-+static DECLARE_MUTEX(other_caches_sem);
-+static LIST_HEAD(cache_definitions);
-+
-+void register_cache(struct cache_definition *cache)
-+{
-+ down(&other_caches_sem);
-+ list_add(&cache->link, &cache_definitions);
-+ up(&other_caches_sem);
-+}
-+
-+void unregister_cache(struct cache_definition *cache)
-+{
-+ down(&other_caches_sem);
-+ list_del(&cache->link);
-+ up(&other_caches_sem);
-+}
-+
-+static void shrink_other_caches(unsigned int priority, int gfp_mask)
-+{
-+ struct list_head *p;
-+
-+ if (down_trylock(&other_caches_sem))
-+ return;
-+
-+ list_for_each_prev(p, &cache_definitions) {
-+ struct cache_definition *cache =
-+ list_entry(p, struct cache_definition, link);
-+
-+ cache->shrink(priority, gfp_mask);
-+ }
-+ up(&other_caches_sem);
-+}
-+
- /*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
-@@ -584,6 +618,7 @@
-
- shrink_dcache_memory(priority, gfp_mask);
- shrink_icache_memory(priority, gfp_mask);
-+ shrink_other_caches(priority, gfp_mask);
- #ifdef CONFIG_QUOTA
- shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
- #endif
+++ /dev/null
- Documentation/Configure.help | 66 ++
- arch/ia64/defconfig | 7
- fs/Config.in | 14
- fs/Makefile | 3
- fs/ext2/Makefile | 4
- fs/ext2/file.c | 5
- fs/ext2/ialloc.c | 2
- fs/ext2/inode.c | 34 -
- fs/ext2/namei.c | 14
- fs/ext2/super.c | 29
- fs/ext2/symlink.c | 14
- fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++
- fs/ext2/xattr_user.c | 103 +++
- fs/ext3/Makefile | 9
- fs/ext3/ext3-exports.c | 13
- fs/ext3/file.c | 5
- fs/ext3/ialloc.c | 2
- fs/ext3/inode.c | 35 -
- fs/ext3/namei.c | 21
- fs/ext3/super.c | 36 +
- fs/ext3/symlink.c | 14
- fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++
- fs/ext3/xattr_user.c | 111 +++
- fs/jfs/jfs_xattr.h | 6
- fs/jfs/xattr.c | 6
- fs/mbcache.c | 648 ++++++++++++++++++++++
- include/linux/cache_def.h | 15
- include/linux/errno.h | 4
- include/linux/ext2_fs.h | 31 -
- include/linux/ext2_xattr.h | 157 +++++
- include/linux/ext3_fs.h | 31 -
- include/linux/ext3_jbd.h | 8
- include/linux/ext3_xattr.h | 157 +++++
- include/linux/fs.h | 2
- include/linux/mbcache.h | 69 ++
- kernel/ksyms.c | 4
- mm/vmscan.c | 35 +
- 62 files changed, 4343 insertions(+), 182 deletions(-)
-
-Index: linux-2.4.19.SuSE/Documentation/Configure.help
-===================================================================
---- linux-2.4.19.SuSE.orig/Documentation/Configure.help 2004-05-03 11:20:17.000000000 -0700
-+++ linux-2.4.19.SuSE/Documentation/Configure.help 2004-05-03 11:50:22.000000000 -0700
-@@ -15296,6 +15296,39 @@
-
- If unsure, say N.
-
-+Ext2 extended attributes
-+CONFIG_EXT2_FS_XATTR
-+ Extended attributes are name:value pairs associated with inodes by
-+ the kernel or by users (see the attr(5) manual page, or visit
-+ <http://acl.bestbits.at/> for details).
-+
-+ If unsure, say N.
-+
-+Ext2 extended attribute block sharing
-+CONFIG_EXT2_FS_XATTR_SHARING
-+ This options enables code for sharing identical extended attribute
-+ blocks among multiple inodes.
-+
-+ Usually, say Y.
-+
-+Ext2 extended user attributes
-+CONFIG_EXT2_FS_XATTR_USER
-+ This option enables extended user attributes on ext2. Processes can
-+ associate extended user attributes with inodes to store additional
-+ information such as the character encoding of files, etc. (see the
-+ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
-+
-+ If unsure, say N.
-+
-+Ext2 trusted extended attributes
-+CONFIG_EXT2_FS_XATTR_TRUSTED
-+ This option enables extended attributes on ext2 that are accessible
-+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
-+ is only the super user. Trusted extended attributes are meant for
-+ implementing system/security services.
-+
-+ If unsure, say N.
-+
- Ext3 journalling file system support (EXPERIMENTAL)
- CONFIG_EXT3_FS
- This is the journalling version of the Second extended file system
-@@ -15354,6 +15387,39 @@
-
- If unsure, say N.
-
-+Ext3 extended attributes
-+CONFIG_EXT3_FS_XATTR
-+ Extended attributes are name:value pairs associated with inodes by
-+ the kernel or by users (see the attr(5) manual page, or visit
-+ <http://acl.bestbits.at/> for details).
-+
-+ If unsure, say N.
-+
-+Ext3 extended attribute block sharing
-+CONFIG_EXT3_FS_XATTR_SHARING
-+ This options enables code for sharing identical extended attribute
-+ blocks among multiple inodes.
-+
-+ Usually, say Y.
-+
-+Ext3 extended user attributes
-+CONFIG_EXT3_FS_XATTR_USER
-+ This option enables extended user attributes on ext3. Processes can
-+ associate extended user attributes with inodes to store additional
-+ information such as the character encoding of files, etc. (see the
-+ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
-+
-+ If unsure, say N.
-+
-+Ext3 trusted extended attributes
-+CONFIG_EXT3_FS_XATTR_TRUSTED
-+ This option enables extended attributes on ext3 that are accessible
-+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
-+ is only the super user. Trusted extended attributes are meant for
-+ implementing system/security services.
-+
-+ If unsure, say N.
-+
- Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
- CONFIG_JBD
- This is a generic journalling layer for block devices. It is
-Index: linux-2.4.19.SuSE/arch/ia64/defconfig
-===================================================================
---- linux-2.4.19.SuSE.orig/arch/ia64/defconfig 2004-05-03 11:19:10.000000000 -0700
-+++ linux-2.4.19.SuSE/arch/ia64/defconfig 2004-05-03 11:50:22.000000000 -0700
-@@ -1,6 +1,13 @@
- #
- # Automatically generated make config: don't edit
- #
-+CONFIG_EXT3_FS_XATTR=y
-+# CONFIG_EXT3_FS_XATTR_SHARING is not set
-+# CONFIG_EXT3_FS_XATTR_USER is not set
-+# CONFIG_EXT2_FS_XATTR is not set
-+# CONFIG_EXT2_FS_XATTR_SHARING is not set
-+# CONFIG_EXT2_FS_XATTR_USER is not set
-+# CONFIG_FS_MBCACHE is not set
-
- #
- # Code maturity level options
-Index: linux-2.4.19.SuSE/fs/Config.in
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/Config.in 2004-05-03 11:18:52.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/Config.in 2004-05-03 11:50:22.000000000 -0700
-@@ -203,6 +203,10 @@
- #tristate 'Meta block cache' CONFIG_FS_MBCACHE
- define_tristate CONFIG_FS_MBCACHE y
-
-+# Meta block cache for Extended Attributes (ext2/ext3)
-+#tristate 'Meta block cache' CONFIG_FS_MBCACHE
-+define_tristate CONFIG_FS_MBCACHE y
-+
- mainmenu_option next_comment
- comment 'Partition Types'
- source fs/partitions/Config.in
-Index: linux-2.4.19.SuSE/fs/Makefile
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/Makefile 2004-05-03 11:22:49.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/Makefile 2004-05-03 11:50:22.000000000 -0700
-@@ -104,6 +104,9 @@
- obj-$(CONFIG_FS_MBCACHE) += mbcache.o
- obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
-
-+export-objs += mbcache.o
-+obj-$(CONFIG_FS_MBCACHE) += mbcache.o
-+
- # persistent filesystems
- obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
-
-Index: linux-2.4.19.SuSE/fs/ext2/Makefile
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext2/Makefile 2004-05-03 11:18:46.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext2/Makefile 2004-05-03 11:50:22.000000000 -0700
-@@ -18,4 +18,8 @@
- obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
- obj-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
-
-+export-objs += xattr.o
-+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
-+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
-+
- include $(TOPDIR)/Rules.make
-Index: linux-2.4.19.SuSE/fs/ext2/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext2/inode.c 2004-05-03 11:18:47.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext2/inode.c 2004-05-03 11:50:22.000000000 -0700
-@@ -52,6 +52,18 @@
- }
-
- /*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext2_inode_is_fast_symlink(struct inode *inode)
-+{
-+ int ea_blocks = inode->u.ext2_i.i_file_acl ?
-+ (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+ return (S_ISLNK(inode->i_mode) &&
-+ inode->i_blocks - ea_blocks == 0);
-+}
-+
-+/*
- * Called at each iput()
- */
- void ext2_put_inode (struct inode * inode)
-@@ -806,6 +818,8 @@
- return;
- if (ext2_inode_is_fast_symlink(inode))
- return;
-+ if (ext2_inode_is_fast_symlink(inode))
-+ return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
-Index: linux-2.4.19.SuSE/fs/ext2/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext2/super.c 2004-05-03 11:18:47.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext2/super.c 2004-05-03 11:50:22.000000000 -0700
-@@ -70,6 +70,7 @@
- {
- va_list args;
-
-+ ext2_xattr_put_super(sb);
- if (!(sb->s_flags & MS_RDONLY)) {
- sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS;
- sb->u.ext2_sb.s_es->s_state =
-Index: linux-2.4.19.SuSE/fs/ext3/inode.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c 2004-05-03 11:18:47.000000000 -0700
-+++ linux-2.4.19.SuSE/fs/ext3/inode.c 2004-05-03 11:50:22.000000000 -0700
-@@ -54,6 +54,18 @@
- inode->i_blocks - ea_blocks == 0);
- }
-
-+/*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext3_inode_is_fast_symlink(struct inode *inode)
-+{
-+ int ea_blocks = inode->u.ext3_i.i_file_acl ?
-+ (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+ return (S_ISLNK(inode->i_mode) &&
-+ inode->i_blocks - ea_blocks == 0);
-+}
-+
- /* The ext3 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
-@@ -1968,6 +1980,8 @@
- return;
- if (ext3_inode_is_fast_symlink(inode))
- return;
-+ if (ext3_inode_is_fast_symlink(inode))
-+ return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
-Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c 2004-02-18 07:26:44.000000000 -0800
-+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c 2004-05-03 11:50:22.000000000 -0700
-@@ -0,0 +1,13 @@
-+#include <linux/config.h>
-+#include <linux/module.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
-+
-+EXPORT_SYMBOL(ext3_force_commit);
-+EXPORT_SYMBOL(ext3_bread);
-+EXPORT_SYMBOL(ext3_xattr_register);
-+EXPORT_SYMBOL(ext3_xattr_unregister);
-+EXPORT_SYMBOL(ext3_xattr_get);
-+EXPORT_SYMBOL(ext3_xattr_list);
-+EXPORT_SYMBOL(ext3_xattr_set);
-Index: linux-2.4.19.SuSE/include/linux/errno.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/errno.h 2004-05-03 11:20:21.000000000 -0700
-+++ linux-2.4.19.SuSE/include/linux/errno.h 2004-05-03 11:50:22.000000000 -0700
-@@ -30,4 +30,8 @@
-
- #endif
-
-+/* Defined for extended attributes */
-+#define ENOATTR ENODATA /* No such attribute */
-+#define ENOTSUP EOPNOTSUPP /* Operation not supported */
-+
- #endif
-Index: linux-2.4.19.SuSE/kernel/ksyms.c
-===================================================================
---- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-05-03 11:22:48.000000000 -0700
-+++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-05-03 11:50:22.000000000 -0700
-@@ -12,6 +12,7 @@
- #define __KERNEL_SYSCALLS__
- #include <linux/config.h>
- #include <linux/slab.h>
-+#include <linux/cache_def.h>
- #include <linux/module.h>
- #include <linux/blkdev.h>
- #include <linux/cdrom.h>
-Index: linux-2.4.19.SuSE/mm/vmscan.c
-===================================================================
---- linux-2.4.19.SuSE.orig/mm/vmscan.c 2004-05-03 11:18:53.000000000 -0700
-+++ linux-2.4.19.SuSE/mm/vmscan.c 2004-05-03 11:50:22.000000000 -0700
-@@ -32,6 +32,39 @@
- */
- int vm_passes = 60;
-
-+static DECLARE_MUTEX(other_caches_sem);
-+static LIST_HEAD(cache_definitions);
-+
-+void register_cache(struct cache_definition *cache)
-+{
-+ down(&other_caches_sem);
-+ list_add(&cache->link, &cache_definitions);
-+ up(&other_caches_sem);
-+}
-+
-+void unregister_cache(struct cache_definition *cache)
-+{
-+ down(&other_caches_sem);
-+ list_del(&cache->link);
-+ up(&other_caches_sem);
-+}
-+
-+static void shrink_other_caches(unsigned int priority, int gfp_mask)
-+{
-+ struct list_head *p;
-+
-+ if (down_trylock(&other_caches_sem))
-+ return;
-+
-+ list_for_each_prev(p, &cache_definitions) {
-+ struct cache_definition *cache =
-+ list_entry(p, struct cache_definition, link);
-+
-+ cache->shrink(priority, gfp_mask);
-+ }
-+ up(&other_caches_sem);
-+}
-+
- /*
- * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
- * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
+++ /dev/null
- ext2/super.c | 3 +--
- ext3/ext3-exports.c | 13 +++++++++++++
- 2 files changed, 14 insertions(+), 2 deletions(-)
-
-Index: linux-2.4.19.SuSE/fs/ext2/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext2/super.c Mon Jan 27 05:08:00 2003
-+++ linux-2.4.19.SuSE/fs/ext2/super.c Sun Nov 16 00:40:59 2003
-@@ -70,6 +70,7 @@
- {
- va_list args;
-
-+ ext2_xattr_put_super(sb);
- if (!(sb->s_flags & MS_RDONLY)) {
- sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS;
- sb->u.ext2_sb.s_es->s_state =
-Index: linux-2.4.19.SuSE/fs/ext3/super.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Mon Jan 27 05:08:00 2003
-+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 00:40:59 2003
-@@ -1822,8 +1828,6 @@
- exit_ext3_xattr();
- }
-
--EXPORT_SYMBOL(ext3_force_commit);
--EXPORT_SYMBOL(ext3_bread);
-
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
-Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c Sun Nov 16 00:40:58 2003
-+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c Sun Nov 16 00:40:59 2003
-@@ -0,0 +1,13 @@
-+#include <linux/config.h>
-+#include <linux/module.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_xattr.h>
-+
-+EXPORT_SYMBOL(ext3_force_commit);
-+EXPORT_SYMBOL(ext3_bread);
-+EXPORT_SYMBOL(ext3_xattr_register);
-+EXPORT_SYMBOL(ext3_xattr_unregister);
-+EXPORT_SYMBOL(ext3_xattr_get);
-+EXPORT_SYMBOL(ext3_xattr_list);
-+EXPORT_SYMBOL(ext3_xattr_set);
+++ /dev/null
-Index: linux-2.4.18-chaos/include/linux/list.h
-===================================================================
---- linux-2.4.18-chaos.orig/include/linux/list.h 2003-11-23 00:07:05.000000000 +0300
-+++ linux-2.4.18-chaos/include/linux/list.h 2003-12-11 00:25:15.000000000 +0300
-@@ -173,6 +173,67 @@
- for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
- pos = pos->prev, prefetch(pos->prev))
-
-+/**
-+ * list_for_each_entry - iterate over list of given type
-+ * @pos: the type * to use as a loop counter.
-+ * @head: the head for your list.
-+ * @member: the name of the list_struct within the struct.
-+ */
-+#define list_for_each_entry(pos, head, member) \
-+ for (pos = list_entry((head)->next, typeof(*pos), member), \
-+ prefetch(pos->member.next); \
-+ &pos->member != (head); \
-+ pos = list_entry(pos->member.next, typeof(*pos), member), \
-+ prefetch(pos->member.next))
-+
-+#ifndef list_for_each_entry_safe
-+/**
-+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
-+ * @pos: the type * to use as a loop counter.
-+ * @n: another type * to use as temporary storage
-+ * @head: the head for your list.
-+ * @member: the name of the list_struct within the struct.
-+ */
-+#define list_for_each_entry_safe(pos, n, head, member) \
-+ for (pos = list_entry((head)->next, typeof(*pos), member), \
-+ n = list_entry(pos->member.next, typeof(*pos), member); \
-+ &pos->member != (head); \
-+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
-+#endif
-+
-+/**
-+ * list_move - delete from one list and add as another's head
-+ * @list: the entry to move
-+ * @head: the head that will precede our entry
-+ */
-+static inline void list_move(struct list_head *list, struct list_head *head)
-+{
-+ __list_del(list->prev, list->next);
-+ list_add(list, head);
-+}
-+
-+/**
-+ * list_move_tail - delete from one list and add as another's tail
-+ * @list: the entry to move
-+ * @head: the head that will follow our entry
-+ */
-+static inline void list_move_tail(struct list_head *list,
-+ struct list_head *head)
-+{
-+ __list_del(list->prev, list->next);
-+ list_add_tail(list, head);
-+}
-+
-+/* 2.5 uses hlists for some things, like the d_hash. we'll treat them
-+ * as 2.5 and let macros drop back.. */
-+#define hlist_entry list_entry
-+#define hlist_head list_head
-+#define hlist_node list_head
-+#define HLIST_HEAD LIST_HEAD
-+#define INIT_HLIST_HEAD INIT_LIST_HEAD
-+#define hlist_del_init list_del_init
-+#define hlist_add_head list_add
-+#define hlist_for_each_safe list_for_each_safe
-
- #endif /* __KERNEL__ || _LVM_H_INCLUDE */
-
+++ /dev/null
-? linux/.config
-? linux/include/linux/autoconf.h
-? linux/include/linux/modules
-Index: linux/Makefile
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/Makefile,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/Makefile 12 Mar 2003 19:48:52 -0000 1.3.2.1
-+++ linux/Makefile 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1
-@@ -99,6 +99,10 @@
- CFLAGS += -fomit-frame-pointer
- endif
- AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS)
-+ifeq ($(CONFIG_MCL_COREDUMP),y)
-+ CFLAGS += -g
-+endif
-+
-
- #
- # ROOT_DEV specifies the default root-device when making the image.
-Index: linux/Documentation/Configure.help
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/Documentation/Configure.help,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/Documentation/Configure.help 12 Mar 2003 19:48:52 -0000 1.3.2.1
-+++ linux/Documentation/Configure.help 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1
-@@ -21660,6 +21660,35 @@
- This option allows you to run the kernel with data cache disabled.
- Say Y if you experience CPM lock-ups.
-
-+Boot kernel image support
-+CONFIG_BOOTIMG
-+ Add support for booting a new Linux kernel from a running Linux
-+ system. You need to download the bootimg(8) utility from
-+ ftp://icaftp.epfl.ch/pub/people/almesber/misc/bootimg-current.tar.gz
-+ in order to use this functionality.
-+
-+Protect SMP configuration tables
-+CONFIG_BOOTIMG_SMP
-+ On SMP systems, the BIOS stores tables with configuration data in
-+ memory and an SMP-enabled kernel reads these tables. However, a
-+ kernel without SMP support will overwrite such tables. If a kernel
-+ without SMP support used bootimg to boot an SMP-enabled kernel, the
-+ latter will probably crash when trying to read the SMP tables. The
-+ CONFIG_BOOTIMG_SMP option enables minimal support for scanning and
-+ protecting of SMP configuration tables also for kernels without SMP
-+ support.
-+
-+In-memory kernel core dump facility
-+CONFIG_MCL_COREDUMP
-+ In conjunction with bootimg, this allows you to get kernel core dumps
-+ of your system at panic() time. The panic call is modified so that it
-+ calls the core dump facility and reboots the system. On the way back
-+ up, the kernel dump image is written out to disk by the accompanying
-+ init script. You can use the crash analysis tool to analyze the core
-+ dump. This tool can be found at :
-+
-+ http://www.missioncriticallinux.com/download
-+
- #
- # m68k-specific kernel options
- # Documented by Chris Lawrence <mailto:quango@themall.net> et al.
-Index: linux/arch/i386/config.in
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/config.in,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.2
-diff -u -r1.3.2.1 -r1.3.2.1.2.2
---- linux/arch/i386/config.in 12 Mar 2003 19:49:05 -0000 1.3.2.1
-+++ linux/arch/i386/config.in 1 Apr 2003 19:35:12 -0000 1.3.2.1.2.2
-@@ -502,6 +502,12 @@
- bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ
- bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK
- bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER
-+ if [ "$CONFIG_FRAME_POINTER " != "n" ]; then
-+ bool ' Kernel Core Dump Facility' CONFIG_MCL_COREDUMP
-+ if [ "$CONFIG_MCL_COREDUMP" = "y" ]; then
-+ bool ' Reboot using bootimg' CONFIG_BOOTIMG
-+ fi
-+ fi
- fi
-
- endmenu
-Index: linux/arch/i386/vmlinux.lds
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/vmlinux.lds,v
-retrieving revision 1.1.1.1.4.1
-retrieving revision 1.1.1.1.4.1.2.1
-diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
---- linux/arch/i386/vmlinux.lds 12 Mar 2003 19:49:05 -0000 1.1.1.1.4.1
-+++ linux/arch/i386/vmlinux.lds 1 Apr 2003 12:17:40 -0000 1.1.1.1.4.1.2.1
-@@ -19,6 +19,13 @@
- .rodata : { *(.rodata) *(.rodata.*) }
- .kstrtab : { *(.kstrtab) }
-
-+ . = ALIGN(16); /* Relocatable bootimage code */
-+ __bootimg_start = .;
-+ .bootimg : {
-+ *(.bootimg)
-+ }
-+ __bootimg_end = .;
-+
- . = ALIGN(16); /* Exception table */
- __start___ex_table = .;
- __ex_table : { *(__ex_table) }
-Index: linux/arch/i386/boot/setup.S
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/setup.S,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.1
-diff -u -r1.2.2.1 -r1.2.2.1.2.1
---- linux/arch/i386/boot/setup.S 12 Mar 2003 19:49:05 -0000 1.2.2.1
-+++ linux/arch/i386/boot/setup.S 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1
-@@ -105,16 +105,22 @@
- # flags, unused bits must be zero (RFU) bit within loadflags
- loadflags:
- LOADED_HIGH = 1 # If set, the kernel is loaded high
-+RELOADS_GDT = 2 # if set, kernel reloads GDT, such that
-+ # boot loader does not have to provide
-+ # GDT in a "safe" memory location
- CAN_USE_HEAP = 0x80 # If set, the loader also has set
- # heap_end_ptr to tell how much
- # space behind setup.S can be used for
- # heap purposes.
- # Only the loader knows what is free
--#ifndef __BIG_KERNEL__
-- .byte 0
--#else
-- .byte LOADED_HIGH
-+_FLAGS = 0
-+#ifdef __BIG_KERNEL__
-+ _FLAGS = _FLAGS | LOADED_HIGH
- #endif
-+#ifdef CONFIG_BOOTIMG
-+ _FLAGS = _FLAGS | RELOADS_GDT
-+#endif
-+ .byte _FLAGS
-
- setup_move_size: .word 0x8000 # size to move, when setup is not
- # loaded at 0x90000. We will move setup
-Index: linux/arch/i386/kernel/Makefile
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/Makefile,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.1
-diff -u -r1.2.2.1 -r1.2.2.1.2.1
---- linux/arch/i386/kernel/Makefile 12 Mar 2003 19:49:05 -0000 1.2.2.1
-+++ linux/arch/i386/kernel/Makefile 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1
-@@ -49,6 +49,7 @@
- obj-$(CONFIG_X86_LONGRUN) += longrun.o
- obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
- obj-$(CONFIG_PROFILING) += profile.o
-+obj-$(CONFIG_MCL_COREDUMP) += crash.o
-
-
- include $(TOPDIR)/Rules.make
-Index: linux/arch/i386/kernel/crash.c
-===================================================================
-RCS file: linux/arch/i386/kernel/crash.c
-diff -N linux/arch/i386/kernel/crash.c
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/arch/i386/kernel/crash.c 1 Apr 2003 12:17:40 -0000 1.1.6.1
-@@ -0,0 +1,82 @@
-+/*
-+ * linux/arch/i386/crash.c
-+ *
-+ * Architecture dependant code for MCL in-memory core dump.
-+ */
-+#include <linux/sched.h>
-+#include <linux/types.h>
-+#include <linux/smp.h>
-+#include <linux/crash.h>
-+#include <linux/reboot.h>
-+#include <linux/bootimg.h>
-+
-+inline void crash_save_regs(void) {
-+ static unsigned long regs[8];
-+
-+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs[0]));
-+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs[1]));
-+ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs[2]));
-+ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs[3]));
-+ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs[4]));
-+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs[5]));
-+ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs[6]));
-+ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs[7]));
-+
-+ panic_regs = regs;
-+}
-+
-+/*
-+ * Save the current stack pointer and EIP.
-+ */
-+void crash_save_current_state(struct task_struct *tp)
-+{
-+ /*
-+ * Here we save ebp instead of esp just in case the compiler
-+ * decides to put an extra push in before we execute this
-+ * instruction (thus invalidating our frame pointer).
-+ */
-+ asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp));
-+ tp->thread.eip = (u_long)crash_save_current_state;
-+ panic_ksp[smp_processor_id()] = tp->thread.esp;
-+ mb();
-+
-+ save_core();
-+
-+ crash_halt_or_reboot(1);
-+}
-+
-+/*
-+ * If we are not the panicking thread, we simply halt. Otherwise,
-+ * we take care of calling the reboot code.
-+ */
-+void crash_halt_or_reboot(int boot_cpu)
-+{
-+#ifdef CONFIG_SMP
-+ if (!boot_cpu) {
-+ stop_this_cpu(NULL);
-+ /* NOTREACHED */
-+ }
-+#endif
-+ machine_restart(NULL);
-+}
-+
-+void crash_cleanup_smp_state(void)
-+{
-+ /*
-+ * Here we duplicate smp_send_stop. Crash_halt_or_reboot() calls
-+ * stop_this_cpu. We now know that we are the only one running,
-+ * so we finish off the smp_send_stop function.
-+ */
-+ __cli();
-+#ifdef CONFIG_SMP
-+ disable_local_APIC();
-+#endif
-+}
-+
-+/*
-+ * Core dump IPI
-+ */
-+void smp_crash_funnel_cpu(void)
-+{
-+ crash_save_current_state(current);
-+}
-Index: linux/arch/i386/kernel/nmi.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/nmi.c,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.1
-diff -u -r1.2.2.1 -r1.2.2.1.2.1
---- linux/arch/i386/kernel/nmi.c 12 Mar 2003 19:49:06 -0000 1.2.2.1
-+++ linux/arch/i386/kernel/nmi.c 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1
-@@ -374,11 +374,18 @@
- bust_spinlocks(1);
- printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
- show_registers(regs);
-+#ifdef CONFIG_MCL_COREDUMP
-+ spin_unlock(&nmi_print_lock);
-+ bust_spinlocks(0);
-+ panic("die");
-+ /* NOTREACHED */
-+#else
- printk("console shuts up ...\n");
- console_silent();
- spin_unlock(&nmi_print_lock);
- bust_spinlocks(0);
- do_exit(SIGSEGV);
-+#endif
- }
- } else {
- last_irq_sums[cpu] = sum;
-Index: linux/arch/i386/kernel/process.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/process.c,v
-retrieving revision 1.2.2.2
-retrieving revision 1.2.2.2.2.1
-diff -u -r1.2.2.2 -r1.2.2.2.2.1
---- linux/arch/i386/kernel/process.c 1 Apr 2003 02:11:17 -0000 1.2.2.2
-+++ linux/arch/i386/kernel/process.c 1 Apr 2003 12:17:40 -0000 1.2.2.2.2.1
-@@ -50,6 +50,9 @@
- #ifdef CONFIG_MATH_EMULATION
- #include <asm/math_emu.h>
- #endif
-+#ifdef CONFIG_BOOTIMG
-+#include <linux/bootimg.h>
-+#endif
-
- #include <linux/irq.h>
-
-@@ -377,7 +380,21 @@
-
- void machine_restart(char * __unused)
- {
-+#ifdef CONFIG_MCL_COREDUMP
-+ extern char *panicmsg;
-+ /*
-+ * Only call bootimg if we have a valid descriptor and
-+ * we are in a panic() context.
-+ */
-+ if (panicmsg)
-+#endif
-+#ifdef CONFIG_BOOTIMG
-+ if (bootimg_dsc.page_dir)
-+ boot_image();
-+#endif
-+
- #if CONFIG_SMP
-+{
- int cpuid;
-
- cpuid = GET_APIC_ID(apic_read(APIC_ID));
-@@ -413,6 +430,7 @@
- if (!netdump_func)
- smp_send_stop();
- disable_IO_APIC();
-+}
- #endif
-
- if(!reboot_thru_bios) {
-Index: linux/arch/i386/kernel/setup.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/setup.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.2
-diff -u -r1.3.2.1 -r1.3.2.1.2.2
---- linux/arch/i386/kernel/setup.c 12 Mar 2003 19:49:06 -0000 1.3.2.1
-+++ linux/arch/i386/kernel/setup.c 1 Apr 2003 17:55:35 -0000 1.3.2.1.2.2
-@@ -116,6 +116,9 @@
- #include <asm/mpspec.h>
- #include <asm/mmu_context.h>
- #include <asm/edd.h>
-+#ifdef CONFIG_MCL_COREDUMP
-+#include <linux/crash.h>
-+#endif
- /*
- * Machine setup..
- */
-@@ -973,6 +976,7 @@
- static unsigned long __init setup_memory(void)
- {
- unsigned long bootmap_size, start_pfn, max_low_pfn;
-+ unsigned long bootmap_pages = 0UL, crash_pages = 0UL;
-
- /*
- * partially used pages are not usable - thus
-@@ -992,6 +996,21 @@
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
- #endif
-+
-+#ifdef CONFIG_MCL_COREDUMP
-+ bootmap_pages = bootmem_bootmap_pages(max_low_pfn);
-+ crash_pages = crash_pages_needed();
-+
-+ printk("start_pfn: %d, bootmap_pages: %d\n", start_pfn, bootmap_pages);
-+
-+ crash_init((u_long)phys_to_virt(PFN_PHYS(start_pfn)),
-+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn)),
-+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn +
-+ crash_pages)));
-+
-+ printk("new start_pfn: %08lx\n", PFN_PHYS(start_pfn));
-+ printk("crash map starts at %lx\n",(start_pfn+bootmap_pages)*PAGE_SIZE);
-+#endif
- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(max_low_pfn));
- /*
-@@ -1007,8 +1026,8 @@
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- */
-- reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
-- bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
-+ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size +
-+ ((1+crash_pages)*PAGE_SIZE) + PAGE_SIZE-1) - (HIGH_MEMORY));
-
- /*
- * reserve physical page 0 - it's a special BIOS page on many boxes,
-@@ -1016,6 +1035,16 @@
- */
- reserve_bootmem(0, PAGE_SIZE);
-
-+#ifdef CONFIG_BOOTIMG
-+ /*
-+ * bootimg(8) reads the old parameter block. Note that the copy in
-+ * empty_zero_page will vanish when mem_init runs. (Should we
-+ * memcpy(phys_to_virt(0x90000), PARAM, PAGE_SIZE);
-+ * now ?)
-+ */
-+ reserve_bootmem(0x90000, PAGE_SIZE);
-+#endif
-+
- #ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
-@@ -1032,6 +1061,7 @@
- find_smp_config();
- #endif
- #ifdef CONFIG_BLK_DEV_INITRD
-+ printk("caution: initrd may overwrite dump\n"); /* phro */
- if (LOADER_TYPE && INITRD_START) {
- if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
- reserve_bootmem(INITRD_START, INITRD_SIZE);
-@@ -1172,6 +1202,12 @@
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
- #endif
- paging_init();
-+#ifdef CONFIG_MCL_COREDUMP
-+ /*
-+ * Reserve crash pages
-+ */
-+ crash_mark_dump_reserved();
-+#endif
- #ifdef CONFIG_X86_LOCAL_APIC
- /*
- * get boot-time SMP configuration:
-Index: linux/arch/i386/kernel/smp.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/smp.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/arch/i386/kernel/smp.c 12 Mar 2003 19:49:06 -0000 1.3.2.1
-+++ linux/arch/i386/kernel/smp.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1
-@@ -23,6 +23,9 @@
- #include <asm/pgalloc.h>
- #include <asm/smpboot.h>
-
-+#ifdef CONFIG_MCL_COREDUMP
-+#include <asm/crash.h>
-+#endif
- /*
- * Some notes on x86 processor bugs affecting SMP operation:
- *
-@@ -579,7 +582,7 @@
- return 0;
- }
-
--static void stop_this_cpu (void * dummy)
-+void stop_this_cpu (void * dummy)
- {
- /*
- * Remove this CPU:
-Index: linux/arch/i386/kernel/traps.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/traps.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/arch/i386/kernel/traps.c 12 Mar 2003 19:49:06 -0000 1.3.2.1
-+++ linux/arch/i386/kernel/traps.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1
-@@ -52,6 +52,10 @@
- #include <linux/irq.h>
- #include <linux/module.h>
-
-+#ifdef CONFIG_MCL_COREDUMP
-+#include <linux/crash.h>
-+#endif
-+
- asmlinkage int system_call(void);
- asmlinkage void lcall7(void);
- asmlinkage void lcall27(void);
-@@ -309,7 +313,11 @@
- netdump_func(regs);
- bust_spinlocks(0);
- spin_unlock_irq(&die_lock);
-- do_exit(SIGSEGV);
-+#ifdef CONFIG_MCL_COREDUMP
-+ if(panic_on_oops)
-+ panic("die");
-+#endif
-+ do_exit(SIGSEGV);/* NOTREACHED */
- }
-
- static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
-Index: linux/drivers/char/misc.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/misc.c,v
-retrieving revision 1.2
-retrieving revision 1.2.4.1
-diff -u -r1.2 -r1.2.4.1
---- linux/drivers/char/misc.c 25 Sep 2002 17:11:05 -0000 1.2
-+++ linux/drivers/char/misc.c 1 Apr 2003 12:17:41 -0000 1.2.4.1
-@@ -78,6 +78,8 @@
- extern int i8k_init(void);
- extern int lcd_init(void);
-
-+extern int crash_init_chrdev(void);
-+
- static int misc_read_proc(char *buf, char **start, off_t offset,
- int len, int *eof, void *private)
- {
-@@ -255,6 +257,9 @@
- int __init misc_init(void)
- {
- create_proc_read_entry("misc", 0, 0, misc_read_proc, NULL);
-+#ifdef CONFIG_MCL_COREDUMP
-+ crash_init_chrdev();
-+#endif
- #ifdef CONFIG_MVME16x
- rtc_MK48T08_init();
- #endif
-Index: linux/drivers/char/sysrq.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/sysrq.c,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.2
-diff -u -r1.2.2.1 -r1.2.2.1.2.2
---- linux/drivers/char/sysrq.c 12 Mar 2003 19:49:47 -0000 1.2.2.1
-+++ linux/drivers/char/sysrq.c 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2
-@@ -97,7 +97,18 @@
- action_msg: "Resetting",
- };
-
--
-+#ifdef CONFIG_MCL_COREDUMP
-+/* kernel core dump sysrq */
-+static void sysrq_handle_coredump(int key, struct pt_regs *pt_regs,
-+ struct kbd_struct *kbd, struct tty_struct *ttty) {
-+ panic("sysrq");
-+}
-+static struct sysrq_key_op sysrq_coredump_op = {
-+ handler: sysrq_handle_coredump,
-+ help_msg: "Crash",
-+ action_msg: "Dumping core",
-+};
-+#endif
-
- /* SYNC SYSRQ HANDLERS BLOCK */
-
-@@ -334,7 +345,11 @@
- it is handled specially on the spark
- and will never arive */
- /* b */ &sysrq_reboot_op,
-+#ifdef CONFIG_MCL_COREDUMP
-+/* c */ &sysrq_coredump_op,
-+#else
- /* c */ NULL,
-+#endif
- /* d */ NULL,
- /* e */ &sysrq_term_op,
- /* f */ NULL,
-Index: linux/include/asm-i386/bootimg.h
-===================================================================
-RCS file: linux/include/asm-i386/bootimg.h
-diff -N linux/include/asm-i386/bootimg.h
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/include/asm-i386/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,141 @@
-+/* asm-i386/bootimg.h - Boot image, i386-specific code */
-+
-+/* Written 2000 by Werner Almesberger */
-+
-+/*
-+ * When porting bootimg(2) to a new architcture, you need to adapt the
-+ * functions and definitions in this file.
-+ */
-+
-+
-+#ifndef _ASM_I386_BOOTIMG_H
-+#define _ASM_I386_BOOTIMG_H
-+
-+#include <linux/config.h>
-+#include <asm/system.h>
-+
-+#ifdef CONFIG_SMP
-+#include <linux/smp.h>
-+#include <linux/irq.h>
-+#endif
-+
-+
-+/*
-+ * The memory page with the code currently executing has been copied from
-+ * old_page to new_page. Jump there.
-+ *
-+ * Note: flush_icache_range has already been called on the new page.
-+ */
-+
-+static inline void jump_relocated(unsigned long old_page,unsigned long new_page)
-+{
-+ int tmp;
-+
-+ __asm__ __volatile__(
-+ "stc\n\t"
-+ "call 1f\n"
-+ "1:\tjnc 2f\n\t"
-+ "popl %0\n\t"
-+ "addl %1,%0\n\t"
-+ "addl %1,%%esp\n\t"
-+ "clc\n\t"
-+ "jmp *%0\n"
-+ "2:"
-+ : "=&r" (tmp) : "r" (new_page-old_page));
-+}
-+
-+
-+/*
-+ * Stop paging, such that
-+ * - page tables can be overwritten
-+ * - all physical memory can be accessed
-+ * - all physical memory is identity-mapped
-+ *
-+ * (Other rules are possible, but need to be encoded in bootimg(8).)
-+ */
-+
-+static inline void stop_paging(void)
-+{
-+ unsigned long msw;
-+
-+ __asm__ __volatile__(
-+ "movl %%cr0,%0\n\t"
-+ "andl $0x7fffffff,%0\n\t"
-+ "movl %0,%%cr0\n\t"
-+ "jmp 1f\n\t" /* i486 and such */
-+ "1:"
-+
-+/* Clear the PAE bit in register %cr4 if we were in PAE mode. The initial
-+ * page table set up by the new kernel's bootstrap code is non-PAE regardless
-+ * of whether the new kernel is a PAE kernel. By clearing the PAE bit here,
-+ * we make sure the bootstrap code doesn't accidentally enable PAE mode when
-+ * it turns on address translation.
-+ */
-+#ifdef CONFIG_X86_PAE
-+ "movl %%cr4,%0\n\t"
-+ "andl $0xffffffdf,%0\n\t"
-+ "movl %0,%%cr4\n\t"
-+#endif
-+
-+ : "=&r" (msw) : : "memory");
-+}
-+
-+
-+/*
-+ * Stop any remaining concurrency in the system. If become_only_thread fails
-+ * but the system is still usable, become_only_thread should return an error
-+ * code. If no recovery is possible, it may as well panic.
-+ */
-+
-+static inline int become_only_thread(void)
-+{
-+#ifdef CONFIG_SMP
-+ smp_send_stop();
-+ disable_IO_APIC();
-+#endif
-+ cli();
-+ return 0;
-+}
-+
-+
-+/*
-+ * A conservative estimate of the number of bytes relocate_and_jump allocated
-+ * on the stack. This is only used for sanity checking before running code,
-+ * because we can't recover from failure in relocate_and_jump.
-+ */
-+
-+#define RESERVE_MIN_RELOC_STACK 256
-+
-+
-+/*
-+ * Change the stack pointer such that stack is at the end of the specified
-+ * page. No data on the old stack will be accessed anymore, so no copying is
-+ * required.
-+ */
-+
-+static inline void stack_on_page(void *page)
-+{
-+ __asm__ __volatile__(
-+ "push %%ds\n\t"
-+ "pop %%ss\n\t"
-+ "movl %0,%%esp\n\t"
-+ "addl $0x1000,%%esp\n\t"
-+ : : "r" (page));
-+}
-+
-+/*
-+ * Set up things such that the kernel will be comfortable (e.g. some
-+ * architectures expect the boot loader to set registers in certain ways),
-+ * and then jump to the kernel's entry address.
-+ */
-+
-+static inline void jump_to_kernel(void (*kernel_entry)(void))
-+{
-+ __asm__ __volatile__(
-+ "mov $0x90000,%%esi\n\t"
-+ : : );
-+
-+ kernel_entry();
-+}
-+
-+#endif
-Index: linux/include/asm-i386/crash.h
-===================================================================
-RCS file: linux/include/asm-i386/crash.h
-diff -N linux/include/asm-i386/crash.h
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/include/asm-i386/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,15 @@
-+#ifndef __ASM_CRASH_H
-+#define __ASM_CRASH_H
-+
-+#define UPPER_MEM_BACKUP 0
-+#define LOWER_MEM_FORWARD 0
-+#define LOW_OFFSET 100
-+
-+/*
-+ * These two functions are inlined on alpha. That's why they appear
-+ * in the arch dependent include file.
-+ */
-+void crash_save_current_state(struct task_struct *);
-+void crash_halt_or_reboot(int);
-+
-+#endif
-Index: linux/include/linux/bootimg.h
-===================================================================
-RCS file: linux/include/linux/bootimg.h
-diff -N linux/include/linux/bootimg.h
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/include/linux/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,84 @@
-+/* linux/bootimg.h - Boot image, general definitions */
-+
-+/* Written 2000 by Werner Almesberger */
-+
-+
-+#ifndef _LINUX_BOOTIMG_H
-+#define _LINUX_BOOTIMG_H
-+
-+
-+/*
-+ * Constraints on image_map:
-+ * - each image_map[n] is the virtual address of a page-sized memory region
-+ * readable by the user
-+ * - currently, image_map[n] is not required to be page-aligned, but this may
-+ * change in the future if we want to map pages directly to lower memory
-+ * pressure (NB: mapping works for ELF and plain binary images, but usually
-+ * not for (b)zImages, because the prepended boot and setup sectors
-+ * mis-align them)
-+ *
-+ * Constraints on load_map:
-+ * - each load_map[] is the physical address of a page in RAM
-+ */
-+
-+struct boot_image {
-+ void **image_map; /* pointers to image pages in user memory */
-+ int pages; /* length in pages */
-+ unsigned long *load_map;/* list of destination pages (physical addr) */
-+ unsigned long start; /* jump to this physical address */
-+ int flags; /* for future use, must be zero for now */
-+};
-+
-+
-+#ifdef __KERNEL__
-+
-+#define __bootimg __attribute__ ((__section__ (".bootimg")))
-+
-+
-+struct bootimg_dsc {
-+ unsigned long self; /* code page ALL ADDRESSES */
-+ unsigned long scratch; /* scratch page ARE PHYSICAL !*/
-+ unsigned long **page_dir; /* src & dst page tables */
-+ void (*jump_to)(void); /* start address */
-+ int pages; /* number of pages */
-+ unsigned long csum; /* Kernel Image checksum */
-+};
-+
-+/*
-+ * page_dir contains pointers to pages containing pointers to pages. We call
-+ * page_dir a "directory" and the page page_dir[n] points to a "table". The
-+ * first PAGES_PER_TABLE/2 entries of page_dir are for source pages, and other
-+ * half are for destination pages.
-+ */
-+
-+/*
-+ * Note that the definitions used here do not necessarily correspond to the
-+ * architecture-specific PTRS_PER_PTE, __pte_offset, etc.
-+ */
-+
-+#define PAGES_PER_TABLE (PAGE_SIZE/sizeof(void *))
-+#define FROM_TABLE(i) ((i)/PAGES_PER_TABLE)
-+#define TO_TABLE(i) ((i)/PAGES_PER_TABLE+PAGES_PER_TABLE/2)
-+#define PAGE_NR(i) ((i) % PAGES_PER_TABLE)
-+
-+
-+extern char __bootimg_start,__bootimg_end; /* linker segment boundaries */
-+extern unsigned long *unity_page; /* unity-mapped page for i386 */
-+
-+/*
-+ * relocate_and_jump runs in its own page with its own stack. This makes it
-+ * difficult to pass parameters. The solution chosen here is to use the global
-+ * variable bootimg_dsc, which is copied into an "auto" variable by
-+ * relocate_and_jump before any copying or relocation takes place.
-+ */
-+
-+extern struct bootimg_dsc bootimg_dsc;
-+
-+typedef void (*relocate_and_jump_t)(void);
-+
-+void relocate_and_jump(void);
-+int boot_image(void);
-+
-+#endif /* __KERNEL__ */
-+
-+#endif
-Index: linux/include/linux/crash.h
-===================================================================
-RCS file: linux/include/linux/crash.h
-diff -N linux/include/linux/crash.h
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/include/linux/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,119 @@
-+#ifndef __LINUX_CRASH_H
-+#define __LINUX_CRASH_H
-+
-+/* defines for interfacing with user-space (ioctls, etc) */
-+struct ioctl_getdump {
-+ unsigned long kva;
-+ unsigned long buf;
-+};
-+
-+#define CRASH_IOC_MAGIC 'C'
-+
-+#define CRASH_IOCFREEDUMP _IO(CRASH_IOC_MAGIC, 0)
-+#define CRASH_IOCGETDUMP _IOWR(CRASH_IOC_MAGIC, 1, struct ioctl_getdump)
-+#define CRASH_IOCBOOTIMG _IOWR(CRASH_IOC_MAGIC, 2, struct boot_image)
-+#define CRASH_IOCVERSION _IO(CRASH_IOC_MAGIC, 3)
-+
-+/* kernel-only part of crash.h */
-+#ifdef __KERNEL__
-+#include <asm/crash.h>
-+
-+#define CRASH_K_MINOR (1)
-+#define CRASH_K_MAJOR (0)
-+
-+/*
-+ * Crash prototypes.
-+ */
-+void save_core(void);
-+void crash_mark_dump_reserved(void);
-+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va);
-+u_long crash_pages_needed(void);
-+void smp_crash_funnel_cpu(void);
-+void crash_cleanup_smp_state(void);
-+
-+/*
-+ * Arch dependant crash.c funcs
-+ */
-+void crash_save_current_state(struct task_struct *);
-+void crash_halt_or_reboot(int);
-+inline void crash_save_regs(void);
-+
-+/*
-+ * Crash globals
-+ */
-+extern u_long crash_dump_header;
-+extern volatile u_long panic_ksp[];
-+extern volatile int crash_release;
-+extern int panic_on_oops;
-+extern char *panicmsg;
-+extern int panic_processor;
-+extern int crash_perform_sync;
-+extern unsigned long *panic_regs;
-+
-+/*
-+ * symbols not exported by linux header files
-+ */
-+extern void stop_this_cpu(void *);
-+
-+/* struct crash_map_hdr located at byte offset 0 */
-+/* on-disk formats */
-+
-+#define trunc_page(x) ((void *)(((unsigned long)(x)) & ~((unsigned long)(PAGE_SIZE - 1))))
-+#define round_page(x) trunc_page(((unsigned long)(x)) + ((unsigned long)(PAGE_SIZE - 1)))
-+
-+#define CRASH_MAGIC 0x9a8bccdd
-+#define CRASH_SOURCE_PAGES 128
-+#define CRASH_SUB_MAP_BYTES ((u_long)round_page((CRASH_SOURCE_PAGES+1)*sizeof(u_long)))
-+#define CRASH_SUB_MAP_PAGES (CRASH_SUB_MAP_BYTES / PAGE_SIZE)
-+#define CRASH_UNCOMPR_BUF_PAGES (CRASH_SOURCE_PAGES + CRASH_SUB_MAP_PAGES)
-+#define CRASH_COMPR_BUF_PAGES (CRASH_UNCOMPR_BUF_PAGES + (CRASH_UNCOMPR_BUF_PAGES/4))
-+#define CRASH_COMPESS_PRIME_PAGES (2*CRASH_COMPR_BUF_PAGES)
-+#define CRASH_ZALLOC_PAGES 16*5*2 /* 2 to handle crash in crash */
-+#define CRASH_LOW_WATER_PAGES 100
-+
-+#define CRASH_CPU_TIMEOUT 5000 /* 5 sec wait for other cpus to stop */
-+
-+#define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
-+#define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
-+#define CRASH_MARK_BOOT_RESERVED(addr) reserve_bootmem(virt_to_phys((void *)addr), PAGE_SIZE);
-+
-+typedef int boolean_t;
-+
-+#define TRUE 1
-+#define FALSE 0
-+
-+/* mem structure */
-+struct mem_crash_map_hdr {
-+ long magic[4]; /* identify crash dump */
-+ u_long map; /* location of map */
-+ u_long map_pages;
-+ u_long data_pages;
-+ u_long compr_units;
-+ u_long boot_reserved_start;
-+ u_long boot_reserved_end;
-+};
-+struct mem_crash_map_entry {
-+ u_long src_va; /* source start of larger non-contig
-+ * block. a src_va of -1 means that
-+ * the dest_page_va is the location of
-+ * the next map page */
-+ u_long dest_page_va; /* dest of this sub block */
-+ u_long check_sum; /* check_sum for dest data */
-+};
-+
-+/* file structure */
-+struct crash_map_hdr {
-+ long magic[4]; /* identify crash dump */
-+ int blk_size; /* block size for this device */
-+ int map_block; /* location of map */
-+ int map_blocks; /* number of blocks for map */
-+};
-+struct crash_map_entry {
-+ u_long start_va; /* virtual address */
-+ char *exp_data; /* expanded data in memory */
-+ int start_blk; /* device location */
-+ int num_blks;
-+};
-+
-+#endif /* __KERNEL__ */
-+#endif /* __LINUX_CRASH_H */
-Index: linux/include/linux/mm.h
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/include/linux/mm.h,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.2
-diff -u -r1.2.2.1 -r1.2.2.1.2.2
---- linux/include/linux/mm.h 12 Mar 2003 19:51:27 -0000 1.2.2.1
-+++ linux/include/linux/mm.h 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2
-@@ -331,6 +331,11 @@
- #define PG_lru 18
- #define PG_active_cache 19
- #define PG_fs_1 20 /* Filesystem specific */
-+#ifdef CONFIG_MCL_COREDUMP
-+#define PG_free 21
-+#define PG_shm 22
-+#define PG_anon 23
-+#endif
-
- /* Make it prettier to test the above... */
- #define UnlockPage(page) unlock_page(page)
-@@ -452,6 +457,11 @@
- #define PageSetSlab(page) set_bit(PG_slab, &(page)->flags)
- #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
- #define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
-+#ifdef CONFIG_MCL_COREDUMP
-+#define PageFree(page) (test_bit(PG_free, &(page)->flags))
-+#define PageAnon(page) (test_bit(PG_anon, &(page)->flags))
-+#define PageShm(page) (test_bit(PG_shm, &(page)->flags))
-+#endif
-
- #define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags)
- #define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags)
-Index: linux/include/linux/reboot.h
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/include/linux/reboot.h,v
-retrieving revision 1.1.1.1
-retrieving revision 1.1.1.1.10.2
-diff -u -r1.1.1.1 -r1.1.1.1.10.2
---- linux/include/linux/reboot.h 7 May 2002 21:53:47 -0000 1.1.1.1
-+++ linux/include/linux/reboot.h 1 Apr 2003 17:55:35 -0000 1.1.1.1.10.2
-@@ -20,6 +20,7 @@
- * CAD_OFF Ctrl-Alt-Del sequence sends SIGINT to init task.
- * POWER_OFF Stop OS and remove all power from system, if possible.
- * RESTART2 Restart system using given command string.
-+ * COREDUMP We're taking a core dump, secondary cpus already stopped.
- */
-
- #define LINUX_REBOOT_CMD_RESTART 0x01234567
-@@ -28,7 +29,9 @@
- #define LINUX_REBOOT_CMD_CAD_OFF 0x00000000
- #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
- #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
--
-+#ifdef CONFIG_MCL_COREDUMP
-+#define LINUX_REBOOT_CMD_COREDUMP 0x9A8BCCDD
-+#endif
-
- #ifdef __KERNEL__
-
-Index: linux/include/linux/sysctl.h
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/include/linux/sysctl.h,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/include/linux/sysctl.h 12 Mar 2003 19:51:30 -0000 1.3.2.1
-+++ linux/include/linux/sysctl.h 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1
-@@ -126,6 +126,7 @@
- KERN_CADPID=54, /* int: PID of the process to notify on CAD */
- KERN_CORE_PATTERN=56, /* string: pattern for core-files */
- KERN_PID_MAX=55, /* int: max PID value of processes */
-+ KERN_PANIC_ON_OOPS /* int: panic on oops enabled */
- };
-
-
-Index: linux/init/main.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/init/main.c,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.1
-diff -u -r1.2.2.1 -r1.2.2.1.2.1
---- linux/init/main.c 12 Mar 2003 19:51:35 -0000 1.2.2.1
-+++ linux/init/main.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1
-@@ -70,6 +70,10 @@
- #include <asm/smp.h>
- #endif
-
-+#ifdef CONFIG_BOOTIMG
-+#include <linux/bootimg.h>
-+#endif
-+
- /*
- * Versions of gcc older than that listed below may actually compile
- * and link okay, but the end product can have subtle run time bugs.
-@@ -352,10 +356,14 @@
- {
- char * command_line;
- extern char saved_command_line[];
-+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC)
-+ unsigned long value;
-+#endif
- /*
- * Interrupts are still disabled. Do necessary setups, then
- * enable them
- */
-+ printk("start_kernel\n");
- lock_kernel();
- printk(linux_banner);
- setup_arch(&command_line);
-@@ -373,12 +381,26 @@
- * this. But we do want output early, in case something goes wrong.
- */
- console_init();
-+
-+#ifdef CONFIG_BOOTIMG
-+ unity_page = alloc_bootmem_pages(PAGE_SIZE);
-+ printk("unity_page addr: %p\n",unity_page);
-+#endif
- #ifdef CONFIG_MODULES
- init_modules();
- #endif
- profile_init();
- kmem_cache_init();
- sti();
-+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC)
-+ /* If we don't make sure the APIC is enabled, AND the LVT0
-+ * register is programmed properly, we won't get timer interrupts
-+ */
-+ setup_local_APIC();
-+
-+ value = apic_read(APIC_LVT0);
-+ apic_write_around(APIC_LVT0, value & ~APIC_LVT_MASKED);
-+#endif
- calibrate_delay();
- #ifdef CONFIG_BLK_DEV_INITRD
- if (initrd_start && !initrd_below_start_ok &&
-Index: linux/kernel/Makefile
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/kernel/Makefile,v
-retrieving revision 1.1.1.1.4.1
-retrieving revision 1.1.1.1.4.1.2.1
-diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
---- linux/kernel/Makefile 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1
-+++ linux/kernel/Makefile 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1
-@@ -22,7 +22,8 @@
- obj-$(CONFIG_PM) += pm.o
- obj-$(CONFIG_KALLSYMS) += kallsyms.o
- obj-$(CONFIG_CPU_FREQ) += cpufreq.o
--
-+obj-$(CONFIG_BOOTIMG) += bootimg.o bootimg_pic.o
-+obj-$(CONFIG_MCL_COREDUMP) += crash.o
-
- ifneq ($(CONFIG_IA64),y)
- # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-Index: linux/kernel/bootimg.c
-===================================================================
-RCS file: linux/kernel/bootimg.c
-diff -N linux/kernel/bootimg.c
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/kernel/bootimg.c 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,301 @@
-+/* bootimg.c - Boot another (kernel) image */
-+
-+/* Written 2000 by Werner Almesberger */
-+
-+
-+#include <linux/config.h>
-+#include <linux/kernel.h>
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/capability.h>
-+#include <linux/bootimg.h>
-+#include <asm/bootimg.h>
-+#include <asm/uaccess.h>
-+#include <asm/io.h>
-+#include <asm/pgtable.h>
-+#include <linux/delay.h>
-+
-+#if 0
-+#define DPRINTK_CONT(format,args...) printk(format,##args)
-+#else
-+#define DPRINTK_CONT(format,args...)
-+#endif
-+#define DPRINTK(format,args...) DPRINTK_CONT(KERN_DEBUG format,##args)
-+
-+unsigned long **bootimg_page_dir;
-+
-+struct bootimg_dsc bootimg_dsc; /* communication with PIC */
-+unsigned long *unity_page; /* unity-mapped page for i386 */
-+
-+static unsigned long bootimg_checksum(unsigned long **page_dir, int num_pages)
-+{
-+ unsigned long checksum, *page;
-+ int i, j;
-+
-+ checksum = 0;
-+
-+ for (i = 0; i < num_pages; i++) {
-+ page = __va((unsigned long *)
-+ page_dir[FROM_TABLE(i)][PAGE_NR(i)]);
-+
-+ for (j = 0; j < PAGES_PER_TABLE; j++)
-+ checksum ^= page[j];
-+
-+ checksum ^= page_dir[TO_TABLE(i)][PAGE_NR(i)];
-+ }
-+
-+ return checksum;
-+}
-+
-+#ifdef CONFIG_X86_PAE
-+
-+static unsigned long get_identity_mapped_page(void)
-+{
-+ pgd_t *pgd;
-+ pmd_t *pmd;
-+ unsigned long phys_addr, page_base;
-+
-+ /* Set up a 2 Mb identity-mapped page. */
-+
-+ phys_addr = virt_to_phys(unity_page);
-+ pgd = pgd_offset(current->active_mm, phys_addr);
-+ pmd = pmd_offset(pgd, phys_addr);
-+
-+ /* We hardcode this rather than using PMD_MASK just in case the PAE
-+ * mode setup ever changes so that 2 Mb pages are no longer used.
-+ */
-+ page_base = phys_addr & ~((1 << 21) - 1);
-+
-+ set_pmd(pmd, __pmd(page_base | _PAGE_PSE | _KERNPG_TABLE));
-+ __flush_tlb_one(phys_addr);
-+
-+ return (unsigned long) unity_page;
-+}
-+
-+#else
-+
-+static unsigned long get_identity_mapped_page(void)
-+{
-+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),
-+ __pgd((_KERNPG_TABLE + _PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK))));
-+ __flush_tlb_one(virt_to_phys(unity_page));
-+ return (unsigned long)unity_page;
-+}
-+
-+#endif
-+
-+#if 0 /* Perhaps we'll need this in the future? */
-+static void unmap_identity_mapped_page(void)
-+{
-+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),__pgd(0));
-+ __flush_tlb();
-+}
-+#endif
-+
-+static int fill_page_dir(unsigned long **page_dir,struct boot_image *image)
-+{
-+ int i, count=0;
-+
-+ memset(page_dir,0,PAGE_SIZE);
-+ for (i = 0; i < image->pages; i += PAGES_PER_TABLE) {
-+ unsigned long **table;
-+ int bytes_left;
-+
-+ table = page_dir+FROM_TABLE(i);
-+ *table = (unsigned long *) get_free_page(GFP_KERNEL);
-+ if (!*table) return -ENOMEM;
-+
-+ memset(*table,0,PAGE_SIZE);
-+ DPRINTK("page %d: from table %p @ %p\n",i,*table,table);
-+ table = page_dir+TO_TABLE(i);
-+ *table = (unsigned long *) get_free_page(GFP_KERNEL);
-+ if (!*table) return -ENOMEM;
-+
-+ bytes_left = (image->pages-i)*sizeof(unsigned long);
-+ if (copy_from_user(*table,image->load_map+i,
-+ bytes_left > PAGE_SIZE ? PAGE_SIZE : bytes_left))
-+ return -EFAULT;
-+ DPRINTK("page %d: to table %p @ %p\n",i,*table,table);
-+ count+=2; /* 2 pages per loop */
-+ }
-+
-+ for (i = 0; i < image->pages; i++) {
-+ unsigned long page = get_free_page(GFP_KERNEL);
-+ void *src;
-+
-+ if (!page) return -ENOMEM;
-+ count++;
-+
-+ page_dir[FROM_TABLE(i)][PAGE_NR(i)] =
-+ virt_to_phys((void *) page);
-+ if (get_user(src,image->image_map+i) ||
-+ copy_from_user((void *) page,src,PAGE_SIZE))
-+ return -EFAULT;
-+
-+ DPRINTK("page %d: %p->%p->%p @ %p\n",i,src,(void *) page,
-+ (void *) page_dir[FROM_TABLE(i)][PAGE_NR(i)],
-+ &page_dir[FROM_TABLE(i)][PAGE_NR(i)]);
-+ }
-+
-+ DPRINTK("fill_page_dir: %d pages allocated\n", count);
-+
-+ return 0;
-+}
-+
-+
-+static void free_page_dir(unsigned long **page_dir)
-+{
-+ int i,j,count=0;
-+
-+ for (i = 0; i < PAGES_PER_TABLE/2; i++)
-+ if (page_dir[i])
-+ for (j = 0; j < PAGES_PER_TABLE; j++)
-+ if (page_dir[i][j]) {
-+ free_page((unsigned long)
-+ phys_to_virt(page_dir[i][j]));
-+ count++;
-+ }
-+ for (i = 0; i < PAGES_PER_TABLE; i++)
-+ if (page_dir[i]) {
-+ free_page((unsigned long) *page_dir[i]);
-+ count++;
-+ }
-+ DPRINTK("free_page_dir: %d pages freed\n", count);
-+}
-+
-+
-+static void convert_table_refs_to_phys(unsigned long **page_dir)
-+{
-+ int i;
-+
-+ DPRINTK("PAGES_PER_TABLE: %d\n",PAGES_PER_TABLE);
-+ for (i = 0; i < PAGES_PER_TABLE; i++)
-+ if (page_dir[i]) {
-+ DPRINTK("table %i: mapped %p -> ",i,page_dir[i]);
-+ page_dir[i] = (unsigned long *)
-+ virt_to_phys(page_dir[i]);
-+ DPRINTK_CONT("%p\n",page_dir[i]);
-+ }
-+}
-+
-+
-+
-+static int fill_bootimg_dsc(struct boot_image *image)
-+{
-+ unsigned long scratch;
-+ int error = -ENOMEM;
-+
-+ if(bootimg_page_dir) {
-+ /* free previously allocated memory */
-+ free_page_dir(bootimg_page_dir);
-+ free_page((unsigned long) bootimg_page_dir);
-+ DPRINTK("free_page (bootimg_page_dir)\n");
-+ }
-+
-+ bootimg_page_dir = (unsigned long **) get_free_page(GFP_KERNEL);
-+ if (!bootimg_page_dir) goto out0;
-+ DPRINTK("get_free_page (bootimg_page_dir)\n");
-+
-+ error = fill_page_dir(bootimg_page_dir,image);
-+ if (error) goto out1;
-+
-+ if(!bootimg_dsc.scratch) {
-+ scratch = get_free_page(GFP_KERNEL);
-+ DPRINTK("get_free_page (scratch)\n");
-+ } else
-+ scratch = 1; /* already allocated */
-+
-+ if (!scratch) goto out1;
-+ /*
-+ * Not all architectures need the code to be identity-mapped, but it
-+ * can't hurt ...
-+ */
-+ DPRINTK("bootimg_page_dir: mapped %p -> ",bootimg_page_dir);
-+ bootimg_dsc.page_dir = (unsigned long **) virt_to_phys(bootimg_page_dir);
-+ DPRINTK_CONT("%p\n",bootimg_dsc.page_dir);
-+ if(!bootimg_dsc.scratch)
-+ bootimg_dsc.scratch = virt_to_phys((void *) scratch);
-+ bootimg_dsc.jump_to = (void (*)(void)) image->start;
-+ bootimg_dsc.pages = image->pages;
-+ bootimg_dsc.csum = bootimg_checksum(bootimg_page_dir, image->pages);
-+
-+ return 0;
-+
-+out1:
-+ free_page_dir(bootimg_page_dir);
-+ free_page((unsigned long) bootimg_page_dir);
-+ DPRINTK("free_page (bootimg_page_dir)\n");
-+ bootimg_page_dir = 0;
-+out0:
-+ return error;
-+}
-+
-+extern char *panicmsg;
-+int boot_image()
-+{
-+ relocate_and_jump_t code;
-+ unsigned long code_page;
-+ int error = -ENOMEM;
-+
-+ if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages)
-+ != bootimg_dsc.csum)
-+ printk("Checksum of kernel image failed. Rebooting via BIOS\n");
-+
-+ code_page = get_identity_mapped_page();
-+ if (!code_page) goto out3;
-+ code = (relocate_and_jump_t) virt_to_phys((void *) code_page);
-+ memcpy(code,&__bootimg_start,&__bootimg_end-&__bootimg_start);
-+ flush_icache_range(&__bootimg_start, &__bootimg_end-&__bootimg_start);
-+
-+ bootimg_dsc.self = (unsigned long) code;
-+ printk(KERN_INFO "Running boot code at 0x%p\n",code);
-+
-+ /*
-+ * The point of no return. Not even printk may work after a successful
-+ * return from become_only_thread.
-+ */
-+
-+ if (!panicmsg) {
-+ error = become_only_thread();
-+ if (error) goto out3;
-+ } else {
-+#ifdef CONFIG_SMP
-+ disable_IO_APIC();
-+#endif
-+ __cli();
-+ }
-+
-+ convert_table_refs_to_phys((unsigned long **)__va(bootimg_dsc.page_dir));
-+ stack_on_page(code);
-+
-+ code();
-+
-+ panic("PIC code exec failed");
-+out3:
-+ printk("boot_image() failed!\n");
-+ for(;;);
-+}
-+
-+/* changed from asmlinkage because we're called via an IOCTL on /dev/crash now */
-+int sys_bootimg(struct boot_image *user_dsc)
-+{
-+ struct boot_image dsc;
-+
-+ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_MODULE)) return -EPERM;
-+ if (&__bootimg_end-&__bootimg_start > PAGE_SIZE-RESERVE_MIN_RELOC_STACK)
-+ {
-+ printk(KERN_ERR "boot_image: PIC too large (%d bytes)\n",
-+ &__bootimg_end-&__bootimg_start);
-+ return -EIO;
-+ }
-+ if ((void *) relocate_and_jump != (void *) &__bootimg_start) {
-+ printk(KERN_ERR "boot_image: relocate_and_jump is mis-placed"
-+ "(0x%p != 0x%p)\n",relocate_and_jump,&__bootimg_start);
-+ return -EIO;
-+ }
-+
-+ if (copy_from_user(&dsc,user_dsc,sizeof(dsc))) return -EFAULT;
-+ if (dsc.pages >= PAGES_PER_TABLE*PAGES_PER_TABLE/2) return -EFBIG;
-+ if (dsc.flags) return -EINVAL; /* for future use */
-+ return fill_bootimg_dsc(&dsc);
-+}
-Index: linux/kernel/bootimg_pic.c
-===================================================================
-RCS file: linux/kernel/bootimg_pic.c
-diff -N linux/kernel/bootimg_pic.c
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/kernel/bootimg_pic.c 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,91 @@
-+/* bootimg_pic.c - Boot image, position-independent code */
-+
-+/* Written 2000 by Werner Almesberger */
-+
-+/*
-+ * Strongly inspired by FiPaBoL designed mainly by Otfried Cheong and Roger
-+ * Gammans, and written by the latter.
-+ */
-+
-+/*
-+ * This code is position-independent and must fit in a single page !
-+ * Furthermore, everything (text+data+stack) has to go into the
-+ * .bootimg segment.
-+ */
-+
-+
-+#include <linux/config.h>
-+#include <linux/kernel.h>
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/bootimg.h>
-+#include <asm/bootimg.h>
-+
-+#include <asm/io.h>
-+
-+#define copy_and_swap(from,to) \
-+ ( { my_copy_page(from,to); \
-+ tmp = from; \
-+ from = to; \
-+ to = tmp; } )
-+
-+
-+static inline void my_copy_page(unsigned long from,unsigned long to)
-+{
-+ unsigned long end = from+PAGE_SIZE;
-+
-+ do *((unsigned long *) to)++ = *((unsigned long *) from)++;
-+ while (from != end);
-+}
-+
-+
-+void __bootimg relocate_and_jump(void)
-+{
-+ struct bootimg_dsc dsc = bootimg_dsc;
-+ int i;
-+
-+ stop_paging();
-+ for (i = 0; i < dsc.pages; i++) {
-+ unsigned long from,to,tmp;
-+
-+ from = dsc.page_dir[FROM_TABLE(i)][PAGE_NR(i)];
-+ to = dsc.page_dir[TO_TABLE(i)][PAGE_NR(i)];
-+ if (from == to) continue;
-+ if (to == dsc.self) {
-+ copy_and_swap(dsc.self,dsc.scratch);
-+ /* WARNING: flush_icache_range MUST BE INLINED !!! */
-+ flush_icache_range(dsc.self,dsc.self+PAGE_SIZE-1);
-+ jump_relocated(dsc.scratch,dsc.self);
-+ }
-+ else if (to == (unsigned long) dsc.page_dir)
-+ copy_and_swap((unsigned long) dsc.page_dir,dsc.scratch);
-+ else {
-+ /*
-+ * O((n^2-n)/2), sigh ...
-+ */
-+ unsigned long **table;
-+ int j;
-+
-+ for (j = i+1; j < dsc.pages; j++) {
-+ table = dsc.page_dir+FROM_TABLE(j);
-+ if (((unsigned long) *table) == to) {
-+ copy_and_swap(*table,dsc.scratch);
-+ break;
-+ }
-+ if ((*table)[PAGE_NR(j)] == to) {
-+ copy_and_swap((*table)[PAGE_NR(j)],
-+ dsc.scratch);
-+ break;
-+ }
-+ table = dsc.page_dir+TO_TABLE(j);
-+ if (((unsigned long) *table) == to) {
-+ copy_and_swap(*table,dsc.scratch);
-+ break;
-+ }
-+ }
-+ }
-+ my_copy_page(from,to);
-+ dsc.scratch = from;
-+ }
-+ jump_to_kernel(dsc.jump_to);
-+}
-Index: linux/kernel/crash.c
-===================================================================
-RCS file: linux/kernel/crash.c
-diff -N linux/kernel/crash.c
---- /dev/null 1 Jan 1970 00:00:00 -0000
-+++ linux/kernel/crash.c 1 Apr 2003 12:17:41 -0000 1.1.6.1
-@@ -0,0 +1,886 @@
-+#include <linux/locks.h>
-+#include <linux/slab.h>
-+#include <linux/crash.h>
-+#include <linux/vmalloc.h>
-+#include <linux/mm.h>
-+#include <linux/fs.h>
-+#include <linux/ext2_fs.h>
-+#include <asm/param.h>
-+#include <asm/uaccess.h>
-+#include <linux/zlib.h>
-+#include <linux/reboot.h>
-+#include <linux/delay.h>
-+#include <asm/io.h>
-+#include <linux/miscdevice.h>
-+#include <linux/bootmem.h>
-+
-+#ifdef CONFIG_BOOTIMG
-+#include <linux/bootimg.h>
-+#endif
-+
-+static void crash_print_data_around(u_long p);
-+static void crash_free_page(u_long addr);
-+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr);
-+static void *czalloc(void *arg, unsigned int items, unsigned int size);
-+static void czfree(void *arg, void *ptr);
-+static u_long crash_alloc_dest_page(void);
-+static void crash_free_dest_page(u_long dest);
-+static void init_dest_page_alloc(void);
-+static int crash_audit_maps(void);
-+static u_long crash_get_source_page(void);
-+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages);
-+static int crash_reset_stream(z_stream * stream);
-+static boolean_t crash_is_kseg(u_long addr);
-+static u_long *crash_link(u_long p);
-+static int crash_chksum(u_long limit, u_long * sum_addr);
-+static int crash_audit_map_page(u_long map);
-+static void crash_wait_cpus(void);
-+static int crash_is_dir_page(struct page *page);
-+
-+/* for the /dev/crash interface */
-+int crash_init_chrdev(void);
-+static int crashdev_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
-+
-+#define CRASH_DEBUG 1
-+
-+#ifdef CONFIG_BOOTIMG
-+extern int sys_bootimg(struct boot_image *);
-+#endif
-+
-+static u_long crash_compr_buf;
-+static u_long crash_uncompr_buf;
-+static u_long crash_dump_header = 0;
-+static u_long crash_dest_free_list = 0;
-+static u_long crash_debug = 0;
-+
-+static u_long crash_cur_pfn;
-+
-+static u_long src_pages_skipped = 0;
-+static u_long src_pages_saved = 0;
-+static u_long dest_pages_free = 0;
-+
-+/* this information is saved from within panic() */
-+char *panicmsg = (char *)0;
-+int panic_processor = 0;
-+int crash_perform_sync = 0;
-+
-+u_int console_crash = 0; /* should be moved to alpha branch */
-+
-+// typedef struct task_struct *task_t;
-+
-+/*
-+ * Threads active at time of panic:
-+ */
-+volatile task_t *panic_threads[NR_CPUS];
-+volatile unsigned long panic_ksp[NR_CPUS];
-+unsigned long *panic_regs = NULL;
-+
-+int panic_on_oops; /* for /proc/sys/kernel/panic_on_oops */
-+
-+extern unsigned long max_low_pfn;
-+
-+u_long crash_zalloc_start; // , crash_zalloc_end, crash_zalloc_cur;
-+
-+/*
-+ * Crash Kernel API functions below
-+ * crash_pages_needed, computes pages needed for header and compression temp
-+ * crash_init, partitions out the allocated pages, sets defaults and
-+ * initializes the character device.
-+ * crash_mark_dump_reserved, marks pages reserved from a previous dump.
-+ * save_core, called at panic time to save a dump to memory.
-+ */
-+u_long crash_pages_needed(void)
-+{
-+ /* one for the header */
-+ return (1 + CRASH_ZALLOC_PAGES + CRASH_UNCOMPR_BUF_PAGES + CRASH_COMPR_BUF_PAGES);
-+}
-+
-+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va)
-+{
-+ struct mem_crash_map_hdr *header;
-+ int i;
-+
-+ /* the default behavior is not NOT panic on a kernel OOPS */
-+ panic_on_oops = 0;
-+
-+ printk("crash_init (crash_va: %08lx)\n", crash_va);
-+ for (i = 0; i < NR_CPUS; i++)
-+ panic_threads[i] = 0;
-+ crash_dump_header = crash_va;
-+ crash_va += PAGE_SIZE;
-+ crash_zalloc_start = crash_va;
-+ crash_va += CRASH_ZALLOC_PAGES * PAGE_SIZE;
-+ crash_uncompr_buf = crash_va;
-+ crash_va += CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE;
-+ crash_compr_buf = crash_va;
-+ crash_va += CRASH_COMPR_BUF_PAGES * PAGE_SIZE;
-+#if 0
-+ if (crash_va != end_alloc_va)
-+ panic("crash_init inconsistency-1\n");
-+#endif
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+#ifdef CRASH_DEBUG
-+ printk("crash_dump_header %p {\n", header);
-+ printk(" magic[0] = %lx\n", header->magic[0]);
-+ printk(" map = %lx\n", header->map);
-+ printk(" map_pages = %lx\n", header->map_pages);
-+ printk(" data_pages = %lx\n", header->data_pages);
-+ printk(" compr_units = %lx\n", header->compr_units);
-+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start);
-+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end);
-+#endif
-+
-+ if (header->magic[0] == CRASH_MAGIC) {
-+ printk("crash found\n");
-+ if ((header->boot_reserved_start != bootmap_va) ||
-+ (header->boot_reserved_end != end_alloc_va)) {
-+ /* crash audit will catch the corruption */
-+ printk("crash_init inconsistency, dump may be corrupted\n");
-+ }
-+ } else {
-+printk("memset...");
-+ memset(header, 0, sizeof(*header));
-+printk("done\n");
-+ }
-+
-+ header->boot_reserved_start = bootmap_va;
-+ header->boot_reserved_end = end_alloc_va;
-+
-+}
-+
-+void crash_mark_dump_reserved(void)
-+{
-+ struct mem_crash_map_hdr *header;
-+ struct mem_crash_map_entry *m;
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ if (header->magic[0] != CRASH_MAGIC)
-+ return;
-+ m = (struct mem_crash_map_entry *)header->map;
-+#ifdef CRASH_DEBUG
-+ printk("\n\n\ncrash_mark_dump_reserved\n\n");
-+ printk("crash_dump_header %p {\n", header);
-+ printk(" magic[0] = %lx\n", header->magic[0]);
-+ printk(" map = %lx\n", header->map);
-+ printk(" map_pages = %lx\n", header->map_pages);
-+ printk(" data_pages = %lx\n", header->data_pages);
-+ printk(" compr_units = %lx\n", header->compr_units);
-+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start);
-+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end);
-+ printk("mem_crash_map_entry %p {\n", m);
-+ printk(" src_va = %lx\n", m->src_va);
-+ printk(" dest_page_va = %lx\n", m->dest_page_va);
-+ printk(" check_sum = %lx\n", m->check_sum);
-+#endif
-+
-+ if (crash_audit_maps()) {
-+ header->magic[0] = 0;
-+ return;
-+ }
-+
-+ m = (struct mem_crash_map_entry *)header->map;
-+ again:
-+ CRASH_MARK_BOOT_RESERVED(m);
-+ for (; m->src_va; m++) {
-+ if (m->src_va == -1) {
-+ m = (struct mem_crash_map_entry *)m->dest_page_va;
-+ goto again;
-+ }
-+ CRASH_MARK_BOOT_RESERVED(m->dest_page_va);
-+ }
-+ return;
-+}
-+
-+void save_core(void)
-+{
-+ int i, j, k;
-+ z_stream stream;
-+ int err;
-+ struct task_struct *tp;
-+ struct mem_crash_map_hdr *header;
-+ u_long *sub_map;
-+ u_long map;
-+ u_long src, dest, unc, cp, src_base, comp_pages;
-+
-+ k = 0;
-+ dest = 0;
-+ __cli();
-+ tp = current;
-+ mb();
-+ if (smp_processor_id() != 0) { /* boot_cpu_id is always 0, i think */
-+ panic_threads[smp_processor_id()] = tp;
-+ crash_halt_or_reboot(0);
-+ } else {
-+ if (console_crash)
-+ panic_threads[smp_processor_id()] = &init_task_union.task;
-+ else
-+ panic_threads[smp_processor_id()] = tp;
-+
-+ crash_wait_cpus();
-+ }
-+
-+ printk("save_core: started on CPU%d\n", smp_processor_id());
-+ if (!crash_dump_header) {
-+ printk("save_core: not initialized\n");
-+ return;
-+ }
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ header->magic[0] = 0;
-+ header->map_pages = 0;
-+ header->data_pages = 0;
-+ header->compr_units = 0;
-+ header->map = 0;
-+
-+ stream.workspace=(void*)crash_zalloc_start;
-+ // stream.zalloc = czalloc;
-+ // stream.zfree = czfree;
-+ // stream.opaque = (voidpf) 0;
-+ stream.next_out = (Bytef *) crash_compr_buf;
-+ stream.avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE);
-+ stream.next_in = (Bytef *) crash_uncompr_buf;
-+ stream.avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE);
-+ err = zlib_deflateInit(&stream, Z_BEST_SPEED);
-+ if (err != Z_OK) {
-+ printk("save_core: bad return %d from deflateInit\n", err);
-+ return;
-+ }
-+
-+ init_dest_page_alloc();
-+ header->map = map = crash_update_map(0, 0, 0, &header->map_pages);
-+ if (!map) {
-+ printk("save_core: no dest pages\n");
-+ return;
-+ }
-+ crash_cur_pfn = 0;
-+ src_base = 0;
-+ src = 0;
-+ for (;;) {
-+ sub_map = (u_long *) crash_uncompr_buf;
-+ unc = crash_uncompr_buf + CRASH_SUB_MAP_PAGES * PAGE_SIZE;
-+ for (i = 0; i < CRASH_SOURCE_PAGES; i++) {
-+ src = crash_get_source_page();
-+ if (!src)
-+ break;
-+ if (!i)
-+ src_base = src;
-+ if (!crash_is_kseg(unc) || !crash_is_kseg(src)) {
-+ printk("unc = 0x%lx, src = 0x%lx, i = %d\n", unc, src, i);
-+ i = src = 0;
-+ break;
-+ }
-+ memcpy((void *)unc, (void *)src, PAGE_SIZE);
-+ unc += PAGE_SIZE;
-+ *sub_map++ = src;
-+ }
-+ *sub_map = 0;
-+ if (!i && !src)
-+ break;
-+ err = zlib_deflate(&stream, Z_FINISH);
-+ if (!(err == Z_STREAM_END)) {
-+ zlib_deflateEnd(&stream);
-+ printk("save_core: bad return %d from deflate, src_base = 0x%lx\n", err,
-+ src_base);
-+ return;
-+ }
-+ comp_pages = (u_long) round_page(stream.total_out) / PAGE_SIZE;
-+ if (crash_debug)
-+ printk("src_base = 0x%lx compressed data in 0x%lx pages\n", src_base,
-+ comp_pages);
-+
-+ cp = crash_compr_buf;
-+ j = 0;
-+ if (crash_debug)
-+ printk("\nsrc = %lx\n", src_base);
-+ else {
-+ printk(".");
-+ if (!(k++ % 64))
-+ printk("\n");
-+ }
-+ for (i = 0; i < comp_pages; i++) {
-+ dest = crash_alloc_dest_page();
-+ if (crash_debug) {
-+ printk("%lx ", dest);
-+ if (!(j++ % 8))
-+ printk("\n");
-+ }
-+ header->data_pages++;
-+ if (!dest) {
-+ printk("save_core: no dest pages\n");
-+ return;
-+ }
-+ if (!crash_is_kseg(dest) || !crash_is_kseg(cp)) {
-+ printk("dest = 0x%lx, cp = 0x%lx, i = %d, comp_pages = 0x%lx\n",
-+ dest, cp, i, comp_pages);
-+ src = 0;
-+ break;
-+ }
-+ memcpy((void *)dest, (void *)cp, PAGE_SIZE);
-+ cp += PAGE_SIZE;
-+ map = crash_update_map(map, src_base, dest, &header->map_pages); /* links a new map page, if necessary */
-+ if (!map) {
-+ printk("save_core: no map\n");
-+ return;
-+ }
-+ }
-+ header->compr_units++;
-+ if (!src)
-+ break;
-+ if (crash_reset_stream(&stream))
-+ return;
-+ }
-+
-+ map = crash_update_map(map, 0, 0, &header->map_pages);
-+ header->magic[0] = CRASH_MAGIC;
-+
-+ if (crash_audit_maps()) {
-+ header->magic[0] = 0;
-+ return;
-+ }
-+
-+ printk("\nsave_core: src pages skipped = 0x%lx src pages saved = 0x%lx\n",
-+ src_pages_skipped, src_pages_saved);
-+ printk("save_core: data_pages = 0x%lx map_pages = 0x%lx\n", header->data_pages,
-+ header->map_pages);
-+ printk("save_core: completed, crash_dump_header = 0x%lx\n", crash_dump_header);
-+}
-+
-+/* helper functions private to this file */
-+static int crash_reset_stream(z_stream * stream)
-+{
-+ int err;
-+
-+ stream->workspace=(void*)crash_zalloc_start;
-+ // stream->zalloc = czalloc;
-+ // stream->zfree = czfree;
-+ // stream->opaque = (voidpf) 0;
-+ stream->next_out = (Bytef *) crash_compr_buf;
-+ stream->avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE);
-+ stream->next_in = (Bytef *) crash_uncompr_buf;
-+ stream->avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE);
-+ err = zlib_deflateReset(stream);
-+ if (err != Z_OK) {
-+ printk("crash_reset_stream: bad return %d from deflateReset\n", err);
-+ return 1;
-+ }
-+ return 0;
-+}
-+
-+static u_long crash_alloc_dest_page(void)
-+{
-+ u_long addr;
-+
-+ addr = crash_dest_free_list;
-+ if (addr) {
-+ crash_dest_free_list = *(u_long *) addr;
-+ dest_pages_free--;
-+ } else
-+ printk("crash_alloc_dest_page: free list empty\n");
-+ return addr;
-+}
-+
-+static void crash_free_dest_page(u_long dest)
-+{
-+ if (!dest) {
-+ printk("crash_free_dest_page: freeing addr 0\n");
-+ return;
-+ }
-+ dest_pages_free++;
-+ dest = (u_long) trunc_page(dest);
-+ *(u_long *) dest = crash_dest_free_list;
-+ crash_dest_free_list = dest;
-+}
-+
-+/*
-+ * Stolen from setup.c
-+ */
-+#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
-+
-+static void init_dest_page_alloc(void)
-+{
-+ u_long va;
-+ long i;
-+ struct page *page;
-+ struct mem_crash_map_hdr *header;
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ for (i = ((1 << 24) >> PAGE_SHIFT) + LOWER_MEM_FORWARD;
-+ i < (max_low_pfn - UPPER_MEM_BACKUP); i++) {
-+ va = (u_long) phys_to_virt(PFN_PHYS(i));
-+ if ((va >= header->boot_reserved_start) && (va < header->boot_reserved_end))
-+ continue;
-+ page = mem_map + i;
-+ if (PageLocked(page) || PageReserved(page))
-+ continue;
-+ if (PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers)
-+ crash_free_dest_page(va);
-+ }
-+ if (crash_debug)
-+ printk("init_dest_page_alloc: dest_pages_free = 0x%lx\n", dest_pages_free);
-+}
-+
-+static int crash_is_dir_page(struct page *page) {
-+ struct inode *tmp_inode;
-+
-+ if(page->mapping && page->mapping->host) {
-+ tmp_inode = (struct inode *)page->mapping->host;
-+ if((tmp_inode->i_sb->s_magic == EXT2_SUPER_MAGIC) &&
-+ (S_ISDIR(tmp_inode->i_mode)))
-+ return 1;
-+ }
-+
-+ return 0;
-+}
-+
-+static u_long crash_get_source_page(void)
-+{
-+ struct page *page;
-+ u_long va;
-+
-+ while (crash_cur_pfn < max_low_pfn) {
-+ page = mem_map + crash_cur_pfn;
-+ if (!(PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers))
-+ break;
-+ src_pages_skipped++;
-+ crash_cur_pfn++;
-+ }
-+ if (crash_cur_pfn == max_low_pfn)
-+ return 0;
-+
-+ va = (u_long) phys_to_virt(PFN_PHYS(crash_cur_pfn));
-+ src_pages_saved++;
-+ crash_cur_pfn++;
-+ return va;
-+}
-+
-+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages)
-+{
-+ struct mem_crash_map_entry *m;
-+
-+
-+ if (!map) {
-+ (*pages)++;
-+ return crash_alloc_dest_page();
-+ }
-+ m = (struct mem_crash_map_entry *)map;
-+ m->src_va = src_base;
-+ m->dest_page_va = dest;
-+ if (dest)
-+ if (crash_chksum_page(dest, &m->check_sum))
-+ return 0;
-+
-+ map += sizeof(struct mem_crash_map_entry);
-+
-+ m = (struct mem_crash_map_entry *)map;
-+ if (!src_base) { /* end of list */
-+ if (crash_chksum((u_long) m, &m->src_va))
-+ return 0;
-+ } else if ((map + 3 * sizeof(struct mem_crash_map_entry)) > (u_long) round_page(map)) {
-+ m->src_va = -1;
-+ map = m->dest_page_va = crash_alloc_dest_page();
-+ if (crash_debug)
-+ printk("\nm = 0x%lx m->src_va = 0x%lx m->dest_page_va = 0x%lx\n",
-+ (u_long) trunc_page(m), m->src_va, m->dest_page_va);
-+ m++;
-+ if (crash_chksum((u_long) m, &m->src_va))
-+ return 0;
-+ if (crash_debug)
-+ printk("m = 0x%lx chksum = m->src_va = 0x%lx\n", (u_long) trunc_page(m),
-+ m->src_va);
-+ if (crash_audit_map_page((u_long) m))
-+ return 0;
-+ (*pages)++;
-+ }
-+ return map;
-+}
-+
-+static int crash_chksum(u_long limit, u_long * sum_addr)
-+{
-+ u_long sum;
-+ u_long *addr;
-+
-+ if (!crash_is_kseg(limit)) {
-+ printk("bad addr = 0x%lx to crash_chksum\n", limit);
-+ return 1;
-+ }
-+ sum = 0;
-+ addr = (u_long *) trunc_page(limit);
-+ for (; (u_long) addr < limit; addr++)
-+ sum += *addr;
-+ *sum_addr = sum;
-+ return 0;
-+}
-+
-+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr)
-+{
-+ u_long sum, limit;
-+ u_long *addr;
-+
-+ if (!crash_is_kseg(pg_addr)) {
-+ printk("bad addr = 0x%lx to crash_chksum_page\n", pg_addr);
-+ return 1;
-+ }
-+
-+ sum = 0;
-+ addr = (u_long *) trunc_page(pg_addr);
-+ limit = (u_long) addr + PAGE_SIZE;
-+ for (; (u_long) addr < limit; addr++)
-+ sum += *addr;
-+ *sum_addr = sum;
-+ return 0;
-+}
-+
-+static int crash_audit_maps(void)
-+{
-+ u_long m, count;
-+ u_long *link_addr;
-+ struct mem_crash_map_hdr *header;
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ if (header->magic[0] != CRASH_MAGIC)
-+ return 1;
-+
-+ link_addr = &header->map;
-+ m = header->map;
-+
-+ count = 0;
-+ for (;;) {
-+ if (!crash_is_kseg(m)) {
-+ printk("crash_audit_maps: bad link 0x%lx at 0x%lx\n", m,
-+ (u_long) link_addr);
-+ return 1;
-+ }
-+ if (crash_audit_map_page(m)) {
-+ printk("audit failed while on map page %ld\n", count);
-+ return 1;
-+ }
-+ if (!crash_link(m))
-+ break;
-+ link_addr = crash_link(m);
-+ m = *link_addr;
-+
-+ count++;
-+ }
-+ return 0;
-+}
-+
-+static int crash_audit_map_page(u_long map)
-+{
-+ struct mem_crash_map_entry *m;
-+ u_long sum;
-+
-+ if (!map || !crash_is_kseg(map)) {
-+ printk("crash_audit_map_page: bad map = 0x%lx\n", map);
-+ return 1;
-+ }
-+ map = (u_long) trunc_page((u_long) map);
-+ m = (struct mem_crash_map_entry *)map;
-+ for (;;) {
-+ if ((m->src_va == -1) || (m->src_va == 0)) {
-+ m++;
-+ if (crash_chksum((u_long) m, &sum))
-+ return 1;
-+ if (m->src_va != sum) {
-+ printk("crash_audit_map_page: checksum failure1\n");
-+ printk("m = 0x%lx, sum = 0x%lx, m->src_va = 0x%lx\n",
-+ (u_long) m, (u_long) sum, (u_long) m->src_va);
-+ crash_print_data_around((u_long) & m->src_va);
-+ return 1;
-+ } else {
-+ return 0;
-+ }
-+ } else {
-+ if (crash_chksum_page((u_long) m->dest_page_va, &sum)
-+ || (m->check_sum != sum)) {
-+ printk("crash_audit_map_page: checksum failure2\n");
-+ printk
-+ ("dest_page_va = 0x%lx, &dest_page_va = 0x%lx, sum = 0x%lx, m->check_sum = 0x%lx\n",
-+ (u_long) m->dest_page_va, (u_long) (&m->check_sum),
-+ (u_long) sum, (u_long) m->check_sum);
-+ crash_print_data_around((u_long) & m->check_sum);
-+ return 1;
-+ }
-+ }
-+ m++;
-+ }
-+}
-+
-+static void crash_print_data_around(u_long p)
-+{
-+ u_long *a;
-+ int i;
-+
-+ if (!crash_is_kseg(p)) {
-+ printk("crash_print_data_around: p = 0x%lx not kseg\n", p);
-+ return;
-+ }
-+ a = (u_long *) p;
-+ a -= 20;
-+ for (i = 0; i < 40; i++)
-+ printk("%lx\n", *a++);
-+}
-+
-+#ifdef CRASH_DEBUG
-+static void crash_print_map_page(u_long map)
-+{
-+ struct mem_crash_map_entry *m;
-+ int j = 0;
-+ u_long sum;
-+
-+ map = (u_long) trunc_page((u_long) map);
-+ m = (struct mem_crash_map_entry *)map;
-+ for (;;) {
-+ printk("%lx %lx %lx ", m->src_va, m->dest_page_va, m->check_sum);
-+ if (!(j++ % 4))
-+ printk("\n");
-+ if ((m->src_va == -1) || (m->src_va == 0)) {
-+ m++;
-+ printk("%lx %lx ", m->src_va, m->dest_page_va);
-+ if (crash_chksum((u_long) m, &sum));
-+ else
-+ printk("\nchksum = 0x%lx\n", sum);
-+ return;
-+ }
-+ m++;
-+ }
-+}
-+#endif /* CRASH_DEBUG */
-+
-+static void crash_wait_cpus(void)
-+{
-+ int i;
-+ int msecs = 0;
-+
-+ for (i = 0; i < smp_num_cpus; i++) {
-+ if (i != smp_processor_id()) {
-+ while (!panic_threads[i]) {
-+ msecs++;
-+ mdelay(1);
-+ if (msecs > CRASH_CPU_TIMEOUT) {
-+ /* if other cpus are still running
-+ * we have to halt, otherwise we could
-+ * risk using buffer cache pages which
-+ * could subsequently get flushed to disk.
-+ */
-+ printk("Unable to halt other CPUs, halting system.\n");
-+ crash_halt_or_reboot(0);
-+ }
-+ }
-+ }
-+ }
-+
-+ crash_cleanup_smp_state();
-+}
-+
-+
-+#if 0
-+static void *czalloc(void *arg, unsigned int items, unsigned int size)
-+{
-+ u_long nbytes;
-+ u_long addr;
-+
-+ nbytes = (u_long) (items * size);
-+ nbytes = (u_long) round_page(nbytes);
-+ if ((crash_zalloc_cur + nbytes) > crash_zalloc_end)
-+ return 0;
-+ addr = crash_zalloc_cur;
-+ crash_zalloc_cur += nbytes;
-+ return ((void *)addr);
-+}
-+
-+static void czfree(void *arg, void *ptr)
-+{
-+ printk("zfree: ptr = 0x%lx\n", (u_long) ptr);
-+}
-+#endif
-+
-+static boolean_t crash_is_kseg(u_long addr)
-+{
-+ u_long phys;
-+
-+ phys = virt_to_phys((void *)addr);
-+ if (phys < PFN_PHYS(max_low_pfn))
-+ return TRUE;
-+ else
-+ return FALSE;
-+}
-+
-+static u_long *crash_link(u_long p)
-+{
-+ struct mem_crash_map_entry *m;
-+
-+ p = (u_long) trunc_page(p);
-+ m = (struct mem_crash_map_entry *)p;
-+ for (; m->src_va; m++)
-+ if (m->src_va == -1)
-+ return &m->dest_page_va;
-+
-+ return 0;
-+}
-+
-+/* Call this after data written to disk. */
-+static int crash_free_crashmem(void)
-+{
-+ struct mem_crash_map_hdr *header;
-+ struct mem_crash_map_entry *m, *last_m;
-+
-+ if (crash_debug)
-+ printk("crash_free_crashmem: \n");
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ if (crash_audit_maps()) {
-+ header->magic[0] = 0;
-+ return 1;
-+ }
-+ m = (struct mem_crash_map_entry *)header->map;
-+ again:
-+ for (; m->src_va; m++) {
-+ if (m->src_va == -1) {
-+ last_m = m;
-+ m = (struct mem_crash_map_entry *)m->dest_page_va;
-+ crash_free_page((unsigned long)last_m);
-+ goto again;
-+ }
-+ crash_free_page(m->dest_page_va);
-+ }
-+ if (crash_debug)
-+ printk("crash_free_crashmem: 0x%lx freed\n",
-+ (header->data_pages + header->map_pages) * PAGE_SIZE);
-+ header->magic[0] = 0;
-+ return 0;
-+}
-+
-+static void crash_free_page(u_long addr)
-+{
-+ struct page *page;
-+
-+ page = virt_to_page(addr);
-+ ClearPageReserved(page);
-+ set_page_count(page, 1);
-+ __free_page(page);
-+}
-+
-+static int get_dump_helper(u_long kva, u_long buf)
-+{
-+ struct page *page;
-+ struct mem_crash_map_hdr *header;
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ if (header->magic[0] != CRASH_MAGIC)
-+ return 1;
-+
-+ if (!kva) {
-+ if (crash_audit_maps()) {
-+ printk("get_dump_helper: audit failure\n");
-+ header->magic[0] = 0;
-+ return 1;
-+ }
-+ page = virt_to_page((u_long) crash_dump_header);
-+ if (!PageReserved(page)) {
-+ printk("not reserved: crash_dump_header = 0x%lx\n", crash_dump_header);
-+ return 1;
-+ }
-+ if (copy_to_user((char *)buf, (char *)crash_dump_header,
-+ sizeof(struct mem_crash_map_hdr))) {
-+ printk("get_dump_helper: copy_to_user failed1\n");
-+ return 1;
-+ }
-+ } else {
-+ page = virt_to_page(kva);
-+ if (!PageReserved(page)) {
-+ printk("not reserved: kva = 0x%lx\n", kva);
-+ return 1;
-+ }
-+ if (copy_to_user((char *)buf, (char *)trunc_page(kva), PAGE_SIZE)) {
-+ printk("get_dump_helper: copy_to_user failed2\n");
-+ return 1;
-+ }
-+ }
-+ return 0;
-+}
-+
-+static void free_dump_helper(void)
-+{
-+ struct mem_crash_map_hdr *header;
-+
-+ header = (struct mem_crash_map_hdr *)crash_dump_header;
-+ if (header->magic[0] != CRASH_MAGIC)
-+ return;
-+ if (crash_debug)
-+ printk("free_dump_helper\n");
-+ crash_free_crashmem();
-+}
-+
-+static int crashdev_open(struct inode *inode, struct file *file)
-+{
-+ /* always return success -- nothing to do here */
-+ return 0;
-+}
-+
-+/* character device implementation */
-+static struct file_operations crashdev_fops = {
-+ ioctl:crashdev_ioctl,
-+ open:crashdev_open,
-+};
-+
-+static struct miscdevice crash_miscdev = {
-+ 190, "crash", &crashdev_fops
-+};
-+
-+int crash_init_chrdev(void)
-+{
-+ int result;
-+
-+ result = misc_register(&crash_miscdev);
-+
-+ if (result < 0)
-+ printk(KERN_WARNING "crash: can't register crash device (c 10 190)\n");
-+
-+ return result;
-+}
-+
-+/* call the original syscalls, just to get things going */
-+static int crashdev_ioctl(struct inode *inode, struct file *file,
-+ unsigned int cmd, unsigned long arg)
-+{
-+ int retval = 0;
-+
-+ switch (cmd) {
-+ case CRASH_IOCFREEDUMP:
-+ free_dump_helper();
-+ break;
-+
-+ case CRASH_IOCGETDUMP:
-+ if (crash_debug) {
-+ printk("crashdev_ioctl: get dump\n");
-+ printk("vals: %08lx %08lx\n",
-+ ((struct ioctl_getdump *)arg)->kva,
-+ ((struct ioctl_getdump *)arg)->buf);
-+ }
-+
-+ retval = get_dump_helper((u_long) ((struct ioctl_getdump *)arg)->kva,
-+ (u_long) ((struct ioctl_getdump *)arg)->buf);
-+ break;
-+
-+#ifdef CONFIG_BOOTIMG
-+ case CRASH_IOCBOOTIMG:
-+ if (crash_debug)
-+ printk("crashdev_ioctl: bootimg\n");
-+
-+ retval = sys_bootimg((struct boot_image *)arg);
-+ break;
-+#endif
-+
-+ case CRASH_IOCVERSION:
-+ if (crash_debug)
-+ printk("crashdev_ioctl: version\n");
-+ retval = CRASH_K_MINOR | (CRASH_K_MAJOR << 16);
-+ break;
-+
-+ default:
-+ return -EINVAL;
-+ }
-+
-+ return retval;
-+}
-Index: linux/kernel/module.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/kernel/module.c,v
-retrieving revision 1.1.1.1.4.1
-retrieving revision 1.1.1.1.4.1.2.1
-diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
---- linux/kernel/module.c 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1
-+++ linux/kernel/module.c 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1
-@@ -311,7 +311,14 @@
- error = -EEXIST;
- goto err1;
- }
-+#if defined(CONFIG_MCL_COREDUMP)
-+ /* Call vmalloc_32 instead of module_map (vmalloc for i386)
-+ * to avoid being mapped in highmem where mcore can't see us.
-+ */
-+ if ((mod = (struct module *)vmalloc_32(size)) == NULL) {
-+#else
- if ((mod = (struct module *)module_map(size)) == NULL) {
-+#endif
- error = -ENOMEM;
- goto err1;
- }
-Index: linux/kernel/panic.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/kernel/panic.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/kernel/panic.c 12 Mar 2003 19:51:36 -0000 1.3.2.1
-+++ linux/kernel/panic.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1
-@@ -19,6 +19,10 @@
- #include <linux/vt_kern.h>
- #include <linux/pc_keyb.h>
-
-+#ifdef CONFIG_MCL_COREDUMP
-+#include <linux/crash.h>
-+#endif
-+
- asmlinkage void sys_sync(void); /* it's really int */
-
- int panic_timeout;
-@@ -197,20 +201,43 @@
- unsigned long caller = (unsigned long) __builtin_return_address(0);
- #endif
-
-+#ifdef CONFIG_MCL_COREDUMP
-+ crash_save_regs();
-+#endif
-+
- bust_spinlocks(1);
- va_start(args, fmt);
- vsprintf(buf, fmt, args);
- va_end(args);
- printk(KERN_EMERG "Kernel panic: %s\n",buf);
-+
-+#ifdef CONFIG_MCL_COREDUMP
-+ if (!panicmsg) {
-+ panicmsg = buf;
-+ panic_processor = smp_processor_id();
-+ mb();
-+ }
-+#endif
-+
- if (netdump_func)
- BUG();
- if (in_interrupt())
- printk(KERN_EMERG "In interrupt handler - not syncing\n");
- else if (!current->pid)
- printk(KERN_EMERG "In idle task - not syncing\n");
-+#ifdef CONFIG_MCL_COREDUMP
-+ else if (crash_perform_sync)
-+#else
- else
-+#endif
- sys_sync();
-+
- bust_spinlocks(0);
-+
-+#ifdef CONFIG_MCL_COREDUMP
-+ smp_call_function((void *)smp_crash_funnel_cpu,0,0,0);
-+ crash_save_current_state(current);
-+#endif
-
- #ifdef CONFIG_SMP
- smp_send_stop();
-Index: linux/kernel/sysctl.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/kernel/sysctl.c,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.1
-diff -u -r1.2.2.1 -r1.2.2.1.2.1
---- linux/kernel/sysctl.c 12 Mar 2003 19:51:36 -0000 1.2.2.1
-+++ linux/kernel/sysctl.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1
-@@ -37,6 +37,10 @@
- #include <linux/nfs_fs.h>
- #endif
-
-+#ifdef CONFIG_MCL_COREDUMP
-+#include <linux/crash.h>
-+#endif
-+
- #if defined(CONFIG_SYSCTL)
-
- /* External variables not in a header file. */
-@@ -247,6 +251,10 @@
- {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int),
- 0644, NULL, &proc_dointvec},
- #endif
-+#ifdef CONFIG_MCL_COREDUMP
-+ {KERN_PANIC_ON_OOPS, "panic_on_oops", &panic_on_oops, sizeof(int),
-+ 0644, NULL, &proc_dointvec},
-+#endif
- {KERN_CADPID, "cad_pid", &cad_pid, sizeof (int),
- 0600, NULL, &proc_dointvec},
- {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int),
-Index: linux/lib/Config.in
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/lib/Config.in,v
-retrieving revision 1.2
-retrieving revision 1.2.4.1
-diff -u -r1.2 -r1.2.4.1
---- linux/lib/Config.in 14 Feb 2003 22:59:23 -0000 1.2
-+++ linux/lib/Config.in 1 Apr 2003 12:17:41 -0000 1.2.4.1
-@@ -23,12 +23,14 @@
- fi
- fi
-
--if [ "$CONFIG_PPP_DEFLATE" = "y" -o \
-+if [ "$CONFIG_MCL_COREDUMP" = "y" -o \
-+ "$CONFIG_PPP_DEFLATE" = "y" -o \
- "$CONFIG_JFFS2_FS" = "y" ]; then
- define_tristate CONFIG_ZLIB_DEFLATE y
- else
- if [ "$CONFIG_PPP_DEFLATE" = "m" -o \
-- "$CONFIG_JFFS2_FS" = "m" ]; then
-+ "$CONFIG_JFFS2_FS" = "m" -o \
-+ "$CONFIG_MCL_COREDUMP" = "m" ]; then
- define_tristate CONFIG_ZLIB_DEFLATE m
- else
- tristate 'zlib compression support' CONFIG_ZLIB_DEFLATE
-Index: linux/mm/memory.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/mm/memory.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/mm/memory.c 12 Mar 2003 19:51:37 -0000 1.3.2.1
-+++ linux/mm/memory.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1
-@@ -1381,6 +1381,10 @@
- }
- lock_page(page);
-
-+#ifdef CONFIG_MCL_COREDUMP
-+ set_bit(PG_anon, &page->flags);
-+#endif
-+
- /*
- * Back out if somebody else faulted in this pte while we
- * released the page table lock.
-@@ -1470,6 +1474,9 @@
- mm->rss++;
- flush_page_to_ram(page);
- entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-+#ifdef CONFIG_MCL_COREDUMP
-+ set_bit(PG_anon, &page->flags);
-+#endif
- lru_cache_add(page);
- }
-
-Index: linux/mm/page_alloc.c
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/mm/page_alloc.c,v
-retrieving revision 1.3.2.1
-retrieving revision 1.3.2.1.2.1
-diff -u -r1.3.2.1 -r1.3.2.1.2.1
---- linux/mm/page_alloc.c 12 Mar 2003 19:51:37 -0000 1.3.2.1
-+++ linux/mm/page_alloc.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1
-@@ -95,6 +95,10 @@
- struct page *base;
- per_cpu_t *per_cpu;
- zone_t *zone;
-+#ifdef CONFIG_MCL_COREDUMP
-+ struct page *pagemap;
-+ int count = 1<<order;
-+#endif
-
- /*
- * Yes, think what happens when other parts of the kernel take
-@@ -163,6 +167,15 @@
-
- spin_lock(&zone->lock);
-
-+#ifdef CONFIG_MCL_COREDUMP
-+ pagemap = page;
-+ do {
-+ pagemap->flags |= (1<<PG_free);
-+ pagemap->flags &= ~((1<<PG_anon)|(1<<PG_shm));
-+ pagemap++;
-+ } while(--count);
-+#endif
-+
- zone->free_pages -= mask;
-
- while (mask + (1 << (MAX_ORDER-1))) {
-@@ -268,6 +281,16 @@
- zone->free_pages -= 1UL << order;
-
- page = expand(zone, page, index, order, curr_order, area);
-+#ifdef CONFIG_MCL_COREDUMP
-+ {
-+ struct page *pagemap = page;
-+ int count = 1<<order;
-+ do {
-+ pagemap->flags &= ~(1<<PG_free);
-+ pagemap++;
-+ } while (--count);
-+ }
-+#endif
- spin_unlock_irqrestore(&zone->lock, flags);
-
- set_page_count(page, 1);
-Index: linux/arch/i386//boot/compressed/head.S
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/compressed/head.S,v
-retrieving revision 1.1.1.1
-retrieving revision 1.1.1.1.12.6
-diff -u -r1.1.1.1 -r1.1.1.1.12.6
---- linux/arch/i386//boot/compressed/head.S 7 May 2002 21:53:54 -0000 1.1.1.1
-+++ linux/arch/i386//boot/compressed/head.S 5 Apr 2003 05:51:27 -0000 1.1.1.1.12.6
-@@ -23,6 +23,7 @@
- */
- .text
-
-+#include <linux/config.h>
- #include <linux/linkage.h>
- #include <asm/segment.h>
-
-@@ -31,6 +32,55 @@
- startup_32:
- cld
- cli
-+
-+#ifdef CONFIG_BOOTIMG
-+/*
-+ * GDT is invalid if we're booted by bootimg, so reload it now
-+ */
-+ lgdt %cs:gdt_descr
-+ ljmp $(__KERNEL_CS),$1f
-+
-+gdt_table_limit = gdt_table_end - gdt_table - 1
-+gdt_descr:
-+ .word gdt_table_limit
-+ .long gdt_table
-+
-+gdt_table: /* stolen from arch/i386/kernel/head.S */
-+ .quad 0x0000000000000000 /* NULL descriptor */
-+ .quad 0x0000000000000000 /* 0x0b reserved */
-+ .quad 0x0000000000000000 /* 0x13 reserved */
-+ .quad 0x0000000000000000 /* 0x1b reserved */
-+ .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */
-+ .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */
-+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
-+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
-+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
-+ .quad 0x0000000000000000 /* 0x4b reserved */
-+ .quad 0x0000000000000000 /* 0x53 reserved */
-+ .quad 0x0000000000000000 /* 0x5b reserved */
-+
-+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
-+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
-+ .quad 0x0000000000000000 /* 0x70 TSS descriptor */
-+ .quad 0x0000000000000000 /* 0x78 LDT descriptor */
-+
-+ /* Segments used for calling PnP BIOS */
-+ .quad 0x00c09a0000000000 /* 0x80 32-bit code */
-+ .quad 0x00809a0000000000 /* 0x88 16-bit code */
-+ .quad 0x0080920000000000 /* 0x90 16-bit data */
-+ .quad 0x0080920000000000 /* 0x98 16-bit data */
-+ .quad 0x0080920000000000 /* 0xa0 16-bit data */
-+ /*
-+ * The APM segments have byte granularity and their bases
-+ * and limits are set at run time.
-+ */
-+ .quad 0x00409a0000000000 /* 0xa8 APM CS code */
-+ .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */
-+ .quad 0x0040920000000000 /* 0xb8 APM DS data */
-+gdt_table_end:
-+
-+1:
-+#endif
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
-@@ -92,7 +142,6 @@
- cld
- rep
- movsl
--
- popl %esi # discard the address
- popl %ebx # real mode pointer
- popl %esi # low_buffer_start
-@@ -124,5 +173,10 @@
- movsl
- movl %ebx,%esi # Restore setup pointer
- xorl %ebx,%ebx
-+#ifdef CONFIG_BOOTIMG
-+ movl $0x100000,%eax
-+ jmpl *%eax
-+#else
- ljmp $(__KERNEL_CS), $0x100000
-+#endif
- move_routine_end:
-Index: linux/arch/i386//kernel/head.S
-===================================================================
-RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/head.S,v
-retrieving revision 1.2.2.1
-retrieving revision 1.2.2.1.2.5
-diff -u -r1.2.2.1 -r1.2.2.1.2.5
---- linux/arch/i386//kernel/head.S 12 Mar 2003 19:49:06 -0000 1.2.2.1
-+++ linux/arch/i386//kernel/head.S 5 Apr 2003 05:51:27 -0000 1.2.2.1.2.5
-@@ -42,6 +42,21 @@
- * On entry, %esi points to the real-mode code as a 32-bit pointer.
- */
- startup_32:
-+#ifdef CONFIG_BOOTIMG
-+/*
-+ * GDT is invalid if we're booted by bootimg, so reload it now
-+ */
-+ lgdt %cs:_gdt_descr-__PAGE_OFFSET
-+ ljmp $(__KERNEL_CS),$1f-__PAGE_OFFSET
-+
-+gdt_limit = SYMBOL_NAME(cpu_gdt_table_end) - SYMBOL_NAME(cpu_gdt_table) - 1
-+
-+_gdt_descr:
-+ .word gdt_limit
-+ .long SYMBOL_NAME(cpu_gdt_table)-__PAGE_OFFSET
-+
-+1:
-+#endif
- /*
- * Set segments to known values
- */
-@@ -452,6 +467,7 @@
- .quad 0x00409a0000000000 /* 0xa8 APM CS code */
- .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */
- .quad 0x0040920000000000 /* 0xb8 APM DS data */
-+ENTRY(cpu_gdt_table_end)
-
- #if CONFIG_SMP
- .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */
+++ /dev/null
-Index: linux-2.4.20-30.9/scripts/mkdep.c
-===================================================================
---- linux-2.4.20-30.9.orig/scripts/mkdep.c 2004-02-19 19:40:51.000000000 -0500
-+++ linux-2.4.20-30.9/scripts/mkdep.c 2004-04-28 17:24:54.000000000 -0400
-@@ -48,8 +48,6 @@
- char __depname[512] = "\n\t@touch ";
- #define depname (__depname+9)
- int hasdep;
--char cwd[PATH_MAX];
--int lcwd;
-
- struct path_struct {
- int len;
-@@ -204,22 +202,8 @@
- memcpy(path->buffer+path->len, name, len);
- path->buffer[path->len+len] = '\0';
- if (access(path->buffer, F_OK) == 0) {
-- int l = lcwd + strlen(path->buffer);
-- char name2[l+2], *p;
-- if (path->buffer[0] == '/') {
-- memcpy(name2, path->buffer, l+1);
-- }
-- else {
-- memcpy(name2, cwd, lcwd);
-- name2[lcwd] = '/';
-- memcpy(name2+lcwd+1, path->buffer, path->len+len+1);
-- }
-- while ((p = strstr(name2, "/../"))) {
-- *p = '\0';
-- strcpy(strrchr(name2, '/'), p+3);
-- }
- do_depname();
-- printf(" \\\n %s", name2);
-+ printf(" \\\n %s", path->buffer);
- return;
- }
- }
-@@ -601,12 +585,6 @@
- return 1;
- }
-
-- if (!getcwd(cwd, sizeof(cwd))) {
-- fprintf(stderr, "mkdep: getcwd() failed %m\n");
-- return 1;
-- }
-- lcwd = strlen(cwd);
--
- add_path("."); /* for #include "..." */
-
- while (++argv, --argc > 0) {
+++ /dev/null
- fs/Makefile | 3
- fs/file_table.c | 11 ++
- fs/inode.c | 23 ++++-
- fs/namei.c | 12 ++
- fs/nfsd/export.c | 5 +
- fs/nfsd/nfsfh.c | 65 +++++++++++++-
- fs/nfsd/vfs.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++-----
- include/linux/fs.h | 10 ++
- kernel/ksyms.c | 2
- 9 files changed, 337 insertions(+), 34 deletions(-)
-
-Index: linux-bgl/fs/nfsd/vfs.c
-===================================================================
---- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700
-+++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800
-@@ -77,6 +77,129 @@
- static struct raparms * raparml;
- static struct raparms * raparm_cache;
-
-+static int link_raw(struct dentry *dold, struct dentry *ddir,
-+ struct dentry *dnew)
-+{
-+ int err;
-+
-+ struct nameidata old_nd = { .dentry = dold };
-+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->link_raw(&old_nd, &nd);
-+ igrab(dold->d_inode);
-+ d_instantiate(dnew, dold->d_inode);
-+ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
-+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
-+
-+ return err;
-+}
-+
-+static int unlink_raw(struct dentry *dentry, char *fname, int flen,
-+ struct dentry *rdentry)
-+{
-+ int err;
-+ struct qstr last = { .name = fname, .len = flen };
-+ struct nameidata nd = { .dentry = dentry, .last = last };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->unlink_raw(&nd);
-+ if (!err)
-+ d_delete(rdentry);
-+
-+ return err;
-+}
-+
-+static int rmdir_raw(struct dentry *dentry, char *fname, int flen,
-+ struct dentry *rdentry)
-+{
-+ int err;
-+ struct qstr last = { .name = fname, .len = flen };
-+ struct nameidata nd = { .dentry = dentry, .last = last };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->rmdir_raw(&nd);
-+ if(!err) {
-+ rdentry->d_inode->i_flags |= S_DEAD;
-+ d_delete(rdentry);
-+ }
-+
-+ return err;
-+}
-+
-+static int symlink_raw(struct dentry *dentry, char *fname, int flen,
-+ char *path)
-+{
-+ int err;
-+ struct qstr last = { .name = fname, .len = flen };
-+ struct nameidata nd = { .dentry = dentry, .last = last };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->symlink_raw(&nd, path);
-+
-+ return err;
-+}
-+
-+static int mkdir_raw(struct dentry *dentry, char *fname, int flen, int mode)
-+{
-+ int err;
-+ struct qstr last = { .name = fname, .len = flen };
-+ struct nameidata nd = { .dentry = dentry, .last = last };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->mkdir_raw(&nd, mode);
-+
-+ return err;
-+}
-+
-+static int mknod_raw(struct dentry *dentry, char *fname, int flen, int mode,
-+ dev_t dev)
-+{
-+ int err;
-+ struct qstr last = { .name = fname, .len = flen };
-+ struct nameidata nd = { .dentry = dentry, .last = last };
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ err = op->mknod_raw(&nd, mode, dev);
-+
-+ return err;
-+}
-+
-+static int rename_raw(struct dentry *fdentry, struct dentry *tdentry,
-+ struct dentry *odentry, struct dentry *ndentry)
-+{
-+ int err;
-+
-+ struct nameidata old_nd = { .dentry = fdentry, .last = odentry->d_name};
-+ struct nameidata new_nd = { .dentry = tdentry, .last = ndentry->d_name};
-+ struct inode_operations *op = old_nd.dentry->d_inode->i_op;
-+ err = op->rename_raw(&old_nd, &new_nd);
-+ d_move(odentry, ndentry);
-+
-+ return err;
-+}
-+
-+static int setattr_raw(struct inode *inode, struct iattr *iap)
-+{
-+ int err;
-+
-+ iap->ia_valid |= ATTR_RAW;
-+ err = inode->i_op->setattr_raw(inode, iap);
-+
-+ return err;
-+}
-+
-+int revalidate_it(struct dentry *dentry, struct lookup_intent *it)
-+{
-+ int err = 0;
-+
-+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it) &&
-+ !d_invalidate(dentry)) {
-+ dput(dentry);
-+ err = -EINVAL;
-+ dentry = NULL;
-+ return err;
-+ }
-+ }
-+
-+ return err;
-+}
-+
- /*
- * Look up one component of a pathname.
- * N.B. After this call _both_ fhp and resfh need an fh_put
-@@ -304,7 +426,10 @@
- }
- err = nfserr_notsync;
- if (!check_guard || guardtime == inode->i_ctime) {
-- err = notify_change(dentry, iap);
-+ if ( dentry->d_inode->i_op && dentry->d_inode->i_op->setattr_raw)
-+ err = setattr_raw(dentry->d_inode, iap);
-+ else
-+ err = notify_change(dentry, iap);
- err = nfserrno(err);
- }
- if (size_change) {
-@@ -431,6 +556,7 @@
- {
- struct dentry *dentry;
- struct inode *inode;
-+ struct lookup_intent it;
- int err;
-
- /* If we get here, then the client has already done an "open", and (hopefully)
-@@ -477,6 +603,14 @@
- filp->f_mode = FMODE_READ;
- }
-
-+ intent_init(&it, IT_OPEN, (filp->f_flags & ~O_ACCMODE) | filp->f_mode);
-+
-+ err = revalidate_it(dentry, &it);
-+ if (err)
-+ goto out_nfserr;
-+
-+ filp->f_it = ⁢
-+
- err = 0;
- if (filp->f_op && filp->f_op->open) {
- err = filp->f_op->open(inode, filp);
-@@ -491,7 +625,11 @@
- atomic_dec(&filp->f_count);
- }
- }
-+
- out_nfserr:
-+ if (it.it_op_release)
-+ intent_release(&it);
-+
- if (err)
- err = nfserrno(err);
- out:
-@@ -822,7 +960,7 @@
- {
- struct dentry *dentry, *dchild;
- struct inode *dirp;
-- int err;
-+ int err, error = -EOPNOTSUPP;
-
- err = nfserr_perm;
- if (!flen)
-@@ -838,20 +976,44 @@
- dentry = fhp->fh_dentry;
- dirp = dentry->d_inode;
-
-+ switch (type) {
-+ case S_IFDIR:
-+ if (dirp->i_op->mkdir_raw)
-+ error = mkdir_raw(dentry, fname, flen, iap->ia_mode);
-+ break;
-+ case S_IFCHR:
-+ case S_IFBLK:
-+ case S_IFIFO:
-+ case S_IFSOCK:
-+ case S_IFREG:
-+ if (dirp->i_op->mknod_raw) {
-+ if (type == S_IFREG)
-+ rdev = 0;
-+ error = mknod_raw(dentry, fname, flen, iap->ia_mode, rdev);
-+ }
-+ break;
-+ default:
-+ printk("nfsd: bad file type %o in nfsd_create\n", type);
-+ }
-+
- err = nfserr_notdir;
-- if(!dirp->i_op || !dirp->i_op->lookup)
-+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it))
- goto out;
- /*
- * Check whether the response file handle has been verified yet.
- * If it has, the parent directory should already be locked.
- */
-- if (!resfhp->fh_dentry) {
-- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
-- fh_lock(fhp);
-+ if (!resfhp->fh_dentry || dirp->i_op->lookup_it) {
-+ /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create
-+ and nfsd_proc_create in case of lustre
-+ */
-+ if (!resfhp->fh_dentry)
-+ fh_lock(fhp);
- dchild = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
-+ resfhp->fh_dentry = NULL;
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
-@@ -872,10 +1034,12 @@
- * Make sure the child dentry is still negative ...
- */
- err = nfserr_exist;
-- if (dchild->d_inode) {
-- dprintk("nfsd_create: dentry %s/%s not negative!\n",
-- dentry->d_name.name, dchild->d_name.name);
-- goto out;
-+ if ( error == -EOPNOTSUPP) {
-+ if (dchild->d_inode) {
-+ dprintk("nfsd_create: dentry %s/%s not negative!\n",
-+ dentry->d_name.name, dchild->d_name.name);
-+ goto out;
-+ }
- }
-
- if (!(iap->ia_valid & ATTR_MODE))
-@@ -888,16 +1052,19 @@
- err = nfserr_perm;
- switch (type) {
- case S_IFREG:
-- err = vfs_create(dirp, dchild, iap->ia_mode);
-+ if (error == -EOPNOTSUPP)
-+ err = vfs_create(dirp, dchild, iap->ia_mode);
- break;
- case S_IFDIR:
-- err = vfs_mkdir(dirp, dchild, iap->ia_mode);
-+ if (error == -EOPNOTSUPP)
-+ err = vfs_mkdir(dirp, dchild, iap->ia_mode);
- break;
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
-- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
-+ if (error == -EOPNOTSUPP)
-+ err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
- break;
- default:
- printk("nfsd: bad file type %o in nfsd_create\n", type);
-@@ -966,7 +1133,13 @@
- /* Get all the sanity checks out of the way before
- * we lock the parent. */
- err = nfserr_notdir;
-- if(!dirp->i_op || !dirp->i_op->lookup)
-+ if (dirp->i_op->mknod_raw) {
-+ err = mknod_raw(dentry, fname, flen, iap->ia_mode, 0);
-+ if (err && err != -EOPNOTSUPP)
-+ goto out;
-+ }
-+
-+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it))
- goto out;
- fh_lock(fhp);
-
-@@ -1017,6 +1190,8 @@
- case NFS3_CREATE_GUARDED:
- err = nfserr_exist;
- }
-+ if(dirp->i_op->mknod_raw)
-+ err = 0;
- goto out;
- }
-
-@@ -1123,7 +1298,7 @@
- struct iattr *iap)
- {
- struct dentry *dentry, *dnew;
-- int err, cerr;
-+ int err, cerr, error = -EOPNOTSUPP;
-
- err = nfserr_noent;
- if (!flen || !plen)
-@@ -1137,12 +1312,18 @@
- goto out;
- fh_lock(fhp);
- dentry = fhp->fh_dentry;
-+
-+ if (dentry->d_inode->i_op->symlink_raw)
-+ error = symlink_raw(dentry, fname, flen, path);
-+
- dnew = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(dnew);
- if (IS_ERR(dnew))
- goto out_nfserr;
-
-- err = vfs_symlink(dentry->d_inode, dnew, path);
-+ err = error;
-+ if (err == -EOPNOTSUPP || !dentry->d_inode->i_op->symlink_raw)
-+ err = vfs_symlink(dentry->d_inode, dnew, path);
- if (!err) {
- if (EX_ISSYNC(fhp->fh_export))
- nfsd_sync_dir(dentry);
-@@ -1152,7 +1333,10 @@
- iap->ia_valid |= ATTR_CTIME;
- iap->ia_mode = (iap->ia_mode&S_IALLUGO)
- | S_IFLNK;
-- err = notify_change(dnew, iap);
-+ if (dnew->d_inode->i_op && dnew->d_inode->i_op->setattr_raw)
-+ err = setattr_raw(dnew->d_inode, iap);
-+ else
-+ err = notify_change(dnew, iap);
- if (!err && EX_ISSYNC(fhp->fh_export))
- write_inode_now(dentry->d_inode, 1);
- }
-@@ -1210,7 +1394,10 @@
- dold = tfhp->fh_dentry;
- dest = dold->d_inode;
-
-- err = vfs_link(dold, dirp, dnew);
-+ if (dirp->i_op->link_raw)
-+ err = link_raw(dold, ddir, dnew);
-+ else
-+ err = vfs_link(dold, dirp, dnew);
- if (!err) {
- if (EX_ISSYNC(ffhp->fh_export)) {
- nfsd_sync_dir(ddir);
-@@ -1295,7 +1482,10 @@
- err = nfserr_perm;
- } else
- #endif
-- err = vfs_rename(fdir, odentry, tdir, ndentry);
-+ if(fdir->i_op->rename_raw)
-+ err = rename_raw(fdentry, tdentry, odentry, ndentry);
-+ else
-+ err = vfs_rename(fdir, odentry, tdir, ndentry);
- if (!err && EX_ISSYNC(tfhp->fh_export)) {
- nfsd_sync_dir(tdentry);
- nfsd_sync_dir(fdentry);
-@@ -1316,7 +1506,7 @@
- fill_post_wcc(tfhp);
- double_up(&tdir->i_sem, &fdir->i_sem);
- ffhp->fh_locked = tfhp->fh_locked = 0;
--
-+
- out:
- return err;
- }
-@@ -1362,9 +1552,15 @@
- err = nfserr_perm;
- } else
- #endif
-- err = vfs_unlink(dirp, rdentry);
-+ if (dirp->i_op->unlink_raw)
-+ err = unlink_raw(dentry, fname, flen, rdentry);
-+ else
-+ err = vfs_unlink(dirp, rdentry);
- } else { /* It's RMDIR */
-- err = vfs_rmdir(dirp, rdentry);
-+ if (dirp->i_op->rmdir_raw)
-+ err = rmdir_raw(dentry, fname, flen, rdentry);
-+ else
-+ err = vfs_rmdir(dirp, rdentry);
- }
-
- dput(rdentry);
-Index: linux-bgl/fs/nfsd/nfsfh.c
-===================================================================
---- linux-bgl.orig/fs/nfsd/nfsfh.c 2003-07-02 08:44:08.000000000 -0700
-+++ linux-bgl/fs/nfsd/nfsfh.c 2004-12-28 17:13:59.942919514 -0800
-@@ -36,6 +36,15 @@
- int sequence; /* sequence counter */
- };
-
-+static struct dentry *lookup_it(struct inode *inode, struct dentry * dentry)
-+{
-+ if (inode->i_op->lookup_it)
-+ return inode->i_op->lookup_it(inode, dentry, NULL, 0);
-+ else
-+ return inode->i_op->lookup(inode, dentry);
-+
-+}
-+
- /*
- * A rather strange filldir function to capture
- * the name matching the specified inode number.
-@@ -75,6 +84,8 @@
- int error;
- struct file file;
- struct nfsd_getdents_callback buffer;
-+ struct lookup_intent it;
-+ struct file *filp = NULL;
-
- error = -ENOTDIR;
- if (!dir || !S_ISDIR(dir->i_mode))
-@@ -85,9 +96,37 @@
- /*
- * Open the directory ...
- */
-- error = init_private_file(&file, dentry, FMODE_READ);
-- if (error)
-+ if (dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ if ((dentry->d_flags & DCACHE_NFSD_DISCONNECTED) &&
-+ (dentry->d_parent == dentry) ) {
-+ it.it_op_release = NULL;
-+ /*
-+ * XXX Temporary Hack: Simulating init_private_file without
-+ * f_op->open for disconnected dentry Since we don't have actual
-+ * dentry->d_name to revalidate in revalidate_it()
-+ */
-+ filp = &file;
-+ memset(filp, 0, sizeof(*filp));
-+ filp->f_mode = FMODE_READ;
-+ atomic_set(&filp->f_count, 1);
-+ filp->f_dentry = dentry;
-+ filp->f_uid = current->fsuid;
-+ filp->f_gid = current->fsgid;
-+ filp->f_op = dentry->d_inode->i_fop;
-+ error = 0;
-+ } else {
-+ intent_init(&it, IT_OPEN, 0);
-+ error = revalidate_it(dentry, &it);
-+ if (error)
-+ goto out;
-+ error = init_private_file_it(&file, dentry, FMODE_READ, &it);
-+ }
-+ } else {
-+ error = init_private_file_it(&file, dentry, FMODE_READ, NULL);
-+ }
-+ if (error)
- goto out;
-+
- error = -EINVAL;
- if (!file.f_op->readdir)
- goto out_close;
-@@ -113,9 +152,13 @@
- }
-
- out_close:
-- if (file.f_op->release)
-+ if (file.f_op->release && !filp)
- file.f_op->release(dir, &file);
- out:
-+ if (dentry->d_op &&
-+ dentry->d_op->d_revalidate_it &&
-+ it.it_op_release && !filp)
-+ intent_release(&it);
- return error;
- }
-
-@@ -273,7 +316,7 @@
- /* I'm going to assume that if the returned dentry is different, then
- * it is well connected. But nobody returns different dentrys do they?
- */
-- pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry);
-+ pdentry = lookup_it(child->d_inode, tdentry);
- d_drop(tdentry); /* we never want ".." hashed */
- if (!pdentry && tdentry->d_inode == NULL) {
- /* File system cannot find ".." ... sad but possible */
-@@ -304,6 +347,8 @@
- igrab(tdentry->d_inode);
- pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
- }
-+ if (child->d_op && child->d_op->d_revalidate_it)
-+ pdentry->d_op = child->d_op;
- }
- if (pdentry == NULL)
- pdentry = ERR_PTR(-ENOMEM);
-@@ -461,6 +506,8 @@
- struct dentry *pdentry;
- struct inode *parent;
-
-+ if (result->d_op && result->d_op->d_revalidate_it)
-+ dentry->d_op = result->d_op;
- pdentry = nfsd_findparent(dentry);
- err = PTR_ERR(pdentry);
- if (IS_ERR(pdentry))
-@@ -648,6 +695,11 @@
-
- inode = dentry->d_inode;
-
-+ /* cache coherency for non-device filesystems */
-+ if (inode->i_op && inode->i_op->revalidate_it) {
-+ inode->i_op->revalidate_it(dentry, NULL);
-+ }
-+
- /* Type check. The correct error return for type mismatches
- * does not seem to be generally agreed upon. SunOS seems to
- * use EISDIR if file isn't S_IFREG; a comment in the NFSv3
-@@ -878,8 +930,9 @@
- dentry->d_parent->d_name.name, dentry->d_name.name);
- goto out;
- out_uptodate:
-- printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
-- dentry->d_parent->d_name.name, dentry->d_name.name);
-+ if(!dentry->d_parent->d_inode->i_op->mkdir_raw)
-+ printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
-+ dentry->d_parent->d_name.name, dentry->d_name.name);
- goto out;
- }
-
-Index: linux-bgl/fs/Makefile
-===================================================================
---- linux-bgl.orig/fs/Makefile 2004-12-28 17:13:56.898868625 -0800
-+++ linux-bgl/fs/Makefile 2004-12-28 17:13:59.943919356 -0800
-@@ -7,7 +7,8 @@
-
- O_TARGET := fs.o
-
--export-objs := filesystems.o open.o dcache.o buffer.o inode.o
-+export-objs := filesystems.o open.o dcache.o buffer.o inode.o namei.o \
-+ file_table.o
- mod-subdirs := nls
-
- obj-y := open.o read_write.o devices.o file_table.o buffer.o \
-Index: linux-bgl/fs/namei.c
-===================================================================
---- linux-bgl.orig/fs/namei.c 2004-12-28 17:13:56.265835195 -0800
-+++ linux-bgl/fs/namei.c 2004-12-28 17:13:59.947918720 -0800
-@@ -22,6 +22,7 @@
- #include <linux/dnotify.h>
- #include <linux/smp_lock.h>
- #include <linux/personality.h>
-+#include <linux/module.h>
-
- #include <asm/namei.h>
- #include <asm/uaccess.h>
-@@ -100,6 +101,7 @@
- it->it_op_release(it);
-
- }
-+EXPORT_SYMBOL(intent_release);
-
- /* In order to reduce some races, while at the same time doing additional
- * checking and hopefully speeding things up, we copy filenames to the
-@@ -889,7 +891,8 @@
-
-
- /* SMP-safe */
--struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
-+struct dentry * lookup_one_len_it(const char * name, struct dentry * base,
-+ int len, struct lookup_intent *it)
- {
- unsigned long hash;
- struct qstr this;
-@@ -909,11 +912,16 @@
- }
- this.hash = end_name_hash(hash);
-
-- return lookup_hash_it(&this, base, NULL);
-+ return lookup_hash_it(&this, base, it);
- access:
- return ERR_PTR(-EACCES);
- }
-
-+struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
-+{
-+ return lookup_one_len_it(name, base, len, NULL);
-+}
-+
- /*
- * namei()
- *
-Index: linux-bgl/fs/file_table.c
-===================================================================
---- linux-bgl.orig/fs/file_table.c 2003-07-02 08:44:42.000000000 -0700
-+++ linux-bgl/fs/file_table.c 2004-12-28 17:13:59.948918562 -0800
-@@ -82,7 +82,8 @@
- * and call the open function (if any). The caller must verify that
- * inode->i_fop is not NULL.
- */
--int init_private_file(struct file *filp, struct dentry *dentry, int mode)
-+int init_private_file_it(struct file *filp, struct dentry *dentry, int mode,
-+ struct lookup_intent *it)
- {
- memset(filp, 0, sizeof(*filp));
- filp->f_mode = mode;
-@@ -90,12 +91,20 @@
- filp->f_dentry = dentry;
- filp->f_uid = current->fsuid;
- filp->f_gid = current->fsgid;
-+ if (it)
-+ filp->f_it = it;
- filp->f_op = dentry->d_inode->i_fop;
- if (filp->f_op->open)
- return filp->f_op->open(dentry->d_inode, filp);
- else
- return 0;
- }
-+EXPORT_SYMBOL(init_private_file_it);
-+
-+int init_private_file(struct file *filp, struct dentry *dentry, int mode)
-+{
-+ return init_private_file_it(filp, dentry, mode, NULL);
-+}
-
- void fput(struct file * file)
- {
-Index: linux-bgl/fs/inode.c
-===================================================================
---- linux-bgl.orig/fs/inode.c 2004-12-28 17:13:56.635910389 -0800
-+++ linux-bgl/fs/inode.c 2004-12-28 17:13:59.950918244 -0800
-@@ -971,9 +971,10 @@
- }
-
-
--struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque)
-+static inline struct inode *ifind(struct super_block *sb, unsigned long ino,
-+ struct list_head *head,
-+ find_inode_t find_actor, void *opaque)
- {
-- struct list_head * head = inode_hashtable + hash(sb,ino);
- struct inode * inode;
-
- spin_lock(&inode_lock);
-@@ -986,6 +987,24 @@
- }
- spin_unlock(&inode_lock);
-
-+ return NULL;
-+}
-+
-+struct inode *ilookup4(struct super_block *sb, unsigned long ino,
-+ find_inode_t find_actor, void *opaque)
-+{
-+ struct list_head * head = inode_hashtable + hash(sb,ino);
-+ return ifind(sb, ino, head, find_actor, opaque);
-+}
-+
-+struct inode *iget4(struct super_block *sb, unsigned long ino,
-+ find_inode_t find_actor, void *opaque)
-+{
-+ struct list_head * head = inode_hashtable + hash(sb,ino);
-+ struct inode *inode = ifind(sb, ino, head, find_actor, opaque);
-+ if (inode)
-+ return inode;
-+
- /*
- * get_new_inode() will do the right thing, re-trying the search
- * in case it had to block at any point.
-Index: linux-bgl/kernel/ksyms.c
-===================================================================
---- linux-bgl.orig/kernel/ksyms.c 2004-12-28 17:13:56.978855920 -0800
-+++ linux-bgl/kernel/ksyms.c 2004-12-28 17:13:59.951918085 -0800
-@@ -142,6 +142,7 @@
- EXPORT_SYMBOL(igrab);
- EXPORT_SYMBOL(iunique);
- EXPORT_SYMBOL(iget4);
-+EXPORT_SYMBOL(ilookup4);
- EXPORT_SYMBOL(iput);
- EXPORT_SYMBOL(force_delete);
- EXPORT_SYMBOL(follow_up);
-@@ -152,6 +153,7 @@
- EXPORT_SYMBOL(path_release);
- EXPORT_SYMBOL(__user_walk);
- EXPORT_SYMBOL(lookup_one_len);
-+EXPORT_SYMBOL(lookup_one_len_it);
- EXPORT_SYMBOL(lookup_hash);
- EXPORT_SYMBOL(sys_close);
- EXPORT_SYMBOL(dcache_lock);
-Index: linux-bgl/include/linux/fs.h
-===================================================================
---- linux-bgl.orig/include/linux/fs.h 2004-12-28 17:13:59.471860200 -0800
-+++ linux-bgl/include/linux/fs.h 2004-12-28 17:13:59.955917450 -0800
-@@ -93,6 +93,9 @@
- #define FS_SINGLE 8 /* Filesystem that can have only one superblock */
- #define FS_NOMOUNT 16 /* Never mount from userland */
- #define FS_LITTER 32 /* Keeps the tree in dcache */
-+#define FS_NFSEXP_FSID 64 /* Use file system specific fsid for
-+ * exporting non device filesystems.
-+ */
- #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon
- * as nfs_rename() will be cleaned up
- */
-@@ -1149,6 +1152,9 @@
- struct nameidata *nd, struct lookup_intent *it);
- extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
- int flags, struct lookup_intent *it);
-+extern int revalidate_it(struct dentry *dentry, struct lookup_intent *it);
-+extern int init_private_file_it(struct file *, struct dentry *dentry, int mode,
-+ struct lookup_intent *it);
- extern int filp_close(struct file *, fl_owner_t id);
- extern char * getname(const char *);
-
-@@ -1418,6 +1424,8 @@
- extern int follow_down(struct vfsmount **, struct dentry **);
- extern int follow_up(struct vfsmount **, struct dentry **);
- extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
-+extern struct dentry * lookup_one_len_it(const char *, struct dentry *, int,
-+ struct lookup_intent *);
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-@@ -1431,6 +1439,8 @@
-
- typedef int (*find_inode_t)(struct inode *, unsigned long, void *);
- extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *);
-+extern struct inode * ilookup4(struct super_block *, unsigned long,
-+ find_inode_t, void *);
- static inline struct inode *iget(struct super_block *sb, unsigned long ino)
- {
- return iget4(sb, ino, NULL, NULL);
+++ /dev/null
- include/linux/fs.h | 1 +
- mm/filemap.c | 3 +++
- 2 files changed, 4 insertions(+)
-
-Index: linux-2.4.19.SuSE/include/linux/fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sun Nov 16 00:40:59 2003
-+++ linux-2.4.19.SuSE/include/linux/fs.h Sun Nov 16 01:38:06 2003
-@@ -428,6 +428,7 @@
- int (*releasepage) (struct page *, int);
- #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
- int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int);
-+ void (*removepage)(struct page *); /* called when page gets removed from the inode */
- };
-
- struct address_space {
-Index: linux-2.4.19.SuSE/mm/filemap.c
-===================================================================
---- linux-2.4.19.SuSE.orig/mm/filemap.c Sat Nov 15 18:02:15 2003
-+++ linux-2.4.19.SuSE/mm/filemap.c Sun Nov 16 01:37:11 2003
-@@ -97,6 +97,9 @@
- {
- struct address_space * mapping = page->mapping;
-
-+ if (mapping->a_ops->removepage)
-+ mapping->a_ops->removepage(page);
-+
- mapping->nrpages--;
- list_del(&page->list);
- page->mapping = NULL;
+++ /dev/null
-Index: linux-2.4.19-pre1/include/linux/sched.h
-===================================================================
---- linux-2.4.19-pre1.orig/include/linux/sched.h 2003-11-21 04:05:05.000000000 +0300
-+++ linux-2.4.19-pre1/include/linux/sched.h 2003-11-21 04:10:29.000000000 +0300
-@@ -927,6 +927,11 @@
- return res;
- }
-
-+static inline int need_resched(void)
-+{
-+ return (unlikely(current->need_resched));
-+}
-+
- #endif /* __KERNEL__ */
-
- #endif
+++ /dev/null
- include/linux/socket.h | 4 ++++
- net/netsyms.c | 2 ++
- net/socket.c | 2 +-
- 3 files changed, 7 insertions(+), 1 deletion(-)
-
-Index: linux-DRV401/include/linux/socket.h
-===================================================================
---- linux-DRV401.orig/include/linux/socket.h 2004-10-15 10:26:20.000000000 -0700
-+++ linux-DRV401/include/linux/socket.h 2004-10-15 11:11:09.000000000 -0700
-@@ -260,6 +260,10 @@
- extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen);
- extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr);
- extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
-+struct socket;
-+extern int sock_map_fd(struct socket *sock);
-+extern struct socket *sockfd_lookup(int fd, int *err);
-+
- #endif
- #endif /* not kernel and not glibc */
- #endif /* _LINUX_SOCKET_H */
-Index: linux-DRV401/net/netsyms.c
-===================================================================
---- linux-DRV401.orig/net/netsyms.c 2004-10-15 11:10:52.000000000 -0700
-+++ linux-DRV401/net/netsyms.c 2004-10-15 11:11:09.000000000 -0700
-@@ -159,6 +159,8 @@
- EXPORT_SYMBOL(put_cmsg);
- EXPORT_SYMBOL(sock_kmalloc);
- EXPORT_SYMBOL(sock_kfree_s);
-+EXPORT_SYMBOL(sockfd_lookup);
-+EXPORT_SYMBOL(sock_map_fd);
-
- #ifdef CONFIG_FILTER
- EXPORT_SYMBOL(sk_run_filter);
-Index: linux-DRV401/net/socket.c
-===================================================================
---- linux-DRV401.orig/net/socket.c 2004-10-15 10:24:16.000000000 -0700
-+++ linux-DRV401/net/socket.c 2004-10-15 11:11:09.000000000 -0700
-@@ -326,7 +326,7 @@
- * but we take care of internal coherence yet.
- */
-
--static int sock_map_fd(struct socket *sock)
-+int sock_map_fd(struct socket *sock)
- {
- int fd;
- struct qstr this;
+++ /dev/null
-Index: linux-2.4.19-pre1/include/linux/skbuff.h
-===================================================================
---- linux-2.4.19-pre1.orig/include/linux/skbuff.h 2001-11-22 22:46:26.000000000 +0300
-+++ linux-2.4.19-pre1/include/linux/skbuff.h 2004-01-14 01:15:13.000000000 +0300
-@@ -116,6 +116,30 @@
- __u16 size;
- };
-
-+/* Support for callback when skb data has been released */
-+typedef struct zccd /* Zero Copy Callback Descriptor */
-+{ /* (embed as first member of custom struct) */
-+ atomic_t zccd_count; /* reference count */
-+ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
-+} zccd_t;
-+
-+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
-+{
-+ atomic_set (&d->zccd_count, 1);
-+ d->zccd_destructor = callback;
-+}
-+
-+static inline void zccd_get (zccd_t *d) /* take a reference */
-+{
-+ atomic_inc (&d->zccd_count);
-+}
-+
-+static inline void zccd_put (zccd_t *d) /* release a reference */
-+{
-+ if (atomic_dec_and_test (&d->zccd_count))
-+ (d->zccd_destructor)(d);
-+}
-+
- /* This data is invariant across clones and lives at
- * the end of the header data, ie. at skb->end.
- */
-@@ -123,6 +147,12 @@
- atomic_t dataref;
- unsigned int nr_frags;
- struct sk_buff *frag_list;
-+ zccd_t *zccd; /* zero copy descriptor */
-+ zccd_t *zccd2; /* 2nd zero copy descriptor */
-+ /* NB we expect zero-copy data to be at least 1 packet, so
-+ * having 2 zccds means we don't unneccessarily split the packet
-+ * where consecutive zero-copy sends abutt.
-+ */
- skb_frag_t frags[MAX_SKB_FRAGS];
- };
-
-Index: linux-2.4.19-pre1/include/net/tcp.h
-===================================================================
---- linux-2.4.19-pre1.orig/include/net/tcp.h 2001-11-22 22:47:22.000000000 +0300
-+++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300
-@@ -640,6 +640,8 @@
-
- extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
- extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
-+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
-+ int flags, zccd_t *zccd);
-
- extern int tcp_ioctl(struct sock *sk,
- int cmd,
-@@ -733,6 +735,9 @@
- struct msghdr *msg,
- int len, int nonblock,
- int flags, int *addr_len);
-+extern int tcp_recvpackets(struct sock *sk,
-+ struct sk_buff_head *packets,
-+ int len, int nonblock);
-
- extern int tcp_listen_start(struct sock *sk);
-
-Index: linux-2.4.19-pre1/net/netsyms.c
-===================================================================
---- linux-2.4.19-pre1.orig/net/netsyms.c 2004-01-14 01:10:37.000000000 +0300
-+++ linux-2.4.19-pre1/net/netsyms.c 2004-01-14 01:15:54.000000000 +0300
-@@ -409,6 +409,9 @@
-
- #endif
-
-+EXPORT_SYMBOL(tcp_sendpage_zccd);
-+EXPORT_SYMBOL(tcp_recvpackets);
-+
- EXPORT_SYMBOL(netlink_set_err);
- EXPORT_SYMBOL(netlink_broadcast);
- EXPORT_SYMBOL(netlink_unicast);
-Index: linux-2.4.19-pre1/net/core/skbuff.c
-===================================================================
---- linux-2.4.19-pre1.orig/net/core/skbuff.c 2001-12-21 20:42:05.000000000 +0300
-+++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300
-@@ -208,6 +208,8 @@
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->frag_list = NULL;
-+ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
-+ skb_shinfo(skb)->zccd2 = NULL;
- return skb;
-
- nodata:
-@@ -276,6 +278,10 @@
- {
- if (!skb->cloned ||
- atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
-+ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
-+ zccd_put (skb_shinfo(skb)->zccd); /* release hold */
-+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
-+ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
- if (skb_shinfo(skb)->nr_frags) {
- int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-@@ -532,6 +538,8 @@
- atomic_set(&(skb_shinfo(skb)->dataref), 1);
- skb_shinfo(skb)->nr_frags = 0;
- skb_shinfo(skb)->frag_list = NULL;
-+ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */
-+ skb_shinfo(skb)->zccd2 = NULL;
-
- /* We are no longer a clone, even if we were. */
- skb->cloned = 0;
-@@ -578,6 +586,14 @@
- n->data_len = skb->data_len;
- n->len = skb->len;
-
-+ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
-+ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
-+ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
-+
-+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
-+ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
-+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
-+
- if (skb_shinfo(skb)->nr_frags) {
- int i;
-
-@@ -620,6 +636,8 @@
- u8 *data;
- int size = nhead + (skb->end - skb->head) + ntail;
- long off;
-+ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
-+ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
-
- if (skb_shared(skb))
- BUG();
-@@ -641,6 +659,11 @@
- if (skb_shinfo(skb)->frag_list)
- skb_clone_fraglist(skb);
-
-+ if (zccd != NULL) /* user zero copy descriptor? */
-+ zccd_get (zccd); /* extra ref (pages are shared) */
-+ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
-+ zccd_get (zccd2); /* extra ref (pages are shared) */
-+
- skb_release_data(skb);
-
- off = (data+nhead) - skb->head;
-@@ -655,6 +678,8 @@
- skb->nh.raw += off;
- skb->cloned = 0;
- atomic_set(&skb_shinfo(skb)->dataref, 1);
-+ skb_shinfo(skb)->zccd = zccd;
-+ skb_shinfo(skb)->zccd2 = zccd2;
- return 0;
-
- nodata:
-Index: linux-2.4.19-pre1/net/ipv4/tcp.c
-===================================================================
---- linux-2.4.19-pre1.orig/net/ipv4/tcp.c 2001-12-21 20:42:05.000000000 +0300
-+++ linux-2.4.19-pre1/net/ipv4/tcp.c 2004-01-14 01:15:13.000000000 +0300
-@@ -744,7 +744,7 @@
- goto out;
- }
-
--ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
-+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
-
- static inline int
- can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
-@@ -823,7 +823,8 @@
- return err;
- }
-
--ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
-+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
-+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
- {
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int mss_now;
-@@ -871,6 +872,17 @@
- copy = size;
-
- i = skb_shinfo(skb)->nr_frags;
-+
-+ if (zccd != NULL && /* this is a zcc I/O */
-+ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
-+ skb_shinfo(skb)->zccd2 != NULL &&
-+ skb_shinfo(skb)->zccd != zccd && /* not the same one */
-+ skb_shinfo(skb)->zccd2 != zccd)
-+ {
-+ tcp_mark_push (tp, skb);
-+ goto new_segment;
-+ }
-+
- if (can_coalesce(skb, i, page, offset)) {
- skb_shinfo(skb)->frags[i-1].size += copy;
- } else if (i < MAX_SKB_FRAGS) {
-@@ -881,6 +893,20 @@
- goto new_segment;
- }
-
-+ if (zccd != NULL && /* this is a zcc I/O */
-+ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
-+ skb_shinfo(skb)->zccd2 != zccd)
-+ {
-+ zccd_get (zccd); /* bump ref count */
-+
-+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
-+
-+ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
-+ skb_shinfo(skb)->zccd = zccd;
-+ else
-+ skb_shinfo(skb)->zccd2 = zccd;
-+ }
-+
- skb->len += copy;
- skb->data_len += copy;
- skb->ip_summed = CHECKSUM_HW;
-@@ -944,7 +970,31 @@
-
- lock_sock(sk);
- TCP_CHECK_TIMER(sk);
-- res = do_tcp_sendpages(sk, &page, offset, size, flags);
-+ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
-+ TCP_CHECK_TIMER(sk);
-+ release_sock(sk);
-+ return res;
-+}
-+
-+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
-+ int flags, zccd_t *zccd)
-+{
-+ ssize_t res;
-+ struct sock *sk = sock->sk;
-+
-+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
-+
-+ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
-+ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
-+ BUG ();
-+
-+#undef TCP_ZC_CSUM_FLAGS
-+
-+ lock_sock(sk);
-+ TCP_CHECK_TIMER(sk);
-+
-+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
-+
- TCP_CHECK_TIMER(sk);
- release_sock(sk);
- return res;
-@@ -1683,6 +1733,202 @@
- goto out;
- }
-
-+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
-+ int len, int nonblock)
-+{
-+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-+ int copied;
-+ long timeo;
-+
-+ BUG_TRAP (len > 0);
-+ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
-+
-+ lock_sock(sk);
-+
-+ TCP_CHECK_TIMER(sk);
-+
-+ copied = -ENOTCONN;
-+ if (sk->state == TCP_LISTEN)
-+ goto out;
-+
-+ copied = 0;
-+ timeo = sock_rcvtimeo(sk, nonblock);
-+
-+ do {
-+ struct sk_buff * skb;
-+ u32 offset;
-+ unsigned long used;
-+ int exhausted;
-+ int eaten;
-+
-+ /* Are we at urgent data? Stop if we have read anything. */
-+ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
-+ break;
-+
-+ /* We need to check signals first, to get correct SIGURG
-+ * handling. FIXME: Need to check this doesnt impact 1003.1g
-+ * and move it down to the bottom of the loop
-+ */
-+ if (signal_pending(current)) {
-+ if (copied)
-+ break;
-+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
-+ break;
-+ }
-+
-+ /* Next get a buffer. */
-+
-+ skb = skb_peek(&sk->receive_queue);
-+
-+ if (skb == NULL) /* nothing ready */
-+ {
-+ if (copied) {
-+ if (sk->err ||
-+ sk->state == TCP_CLOSE ||
-+ (sk->shutdown & RCV_SHUTDOWN) ||
-+ !timeo ||
-+ (0))
-+ break;
-+ } else {
-+ if (sk->done)
-+ break;
-+
-+ if (sk->err) {
-+ copied = sock_error(sk);
-+ break;
-+ }
-+
-+ if (sk->shutdown & RCV_SHUTDOWN)
-+ break;
-+
-+ if (sk->state == TCP_CLOSE) {
-+ if (!sk->done) {
-+ /* This occurs when user tries to read
-+ * from never connected socket.
-+ */
-+ copied = -ENOTCONN;
-+ break;
-+ }
-+ break;
-+ }
-+
-+ if (!timeo) {
-+ copied = -EAGAIN;
-+ break;
-+ }
-+ }
-+
-+ cleanup_rbuf(sk, copied);
-+ timeo = tcp_data_wait(sk, timeo);
-+ continue;
-+ }
-+
-+ BUG_TRAP (atomic_read (&skb->users) == 1);
-+
-+ exhausted = eaten = 0;
-+
-+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
-+ if (skb->h.th->syn)
-+ offset--;
-+
-+ used = skb->len - offset;
-+
-+ if (tp->urg_data) {
-+ u32 urg_offset = tp->urg_seq - tp->copied_seq;
-+ if (urg_offset < used) {
-+ if (!urg_offset) { /* at urgent date */
-+ if (!sk->urginline) {
-+ tp->copied_seq++; /* discard the single byte of urgent data */
-+ offset++;
-+ used--;
-+ }
-+ } else /* truncate read */
-+ used = urg_offset;
-+ }
-+ }
-+
-+ BUG_TRAP (used >= 0);
-+ if (len < used)
-+ used = len;
-+
-+ if (used == 0)
-+ exhausted = 1;
-+ else
-+ {
-+ if (skb_is_nonlinear (skb))
-+ {
-+ int rc = skb_linearize (skb, GFP_KERNEL);
-+
-+ printk ("tcp_recvpackets(): linearising: %d\n", rc);
-+
-+ if (rc)
-+ {
-+ if (!copied)
-+ copied = rc;
-+ break;
-+ }
-+ }
-+
-+ if ((offset + used) == skb->len) /* consuming the whole packet */
-+ {
-+ __skb_unlink (skb, &sk->receive_queue);
-+ dst_release (skb->dst);
-+ skb_orphan (skb);
-+ __skb_pull (skb, offset);
-+ __skb_queue_tail (packets, skb);
-+ exhausted = eaten = 1;
-+ }
-+ else /* consuming only part of the packet */
-+ {
-+ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
-+
-+ if (skb2 == NULL)
-+ {
-+ if (!copied)
-+ copied = -ENOMEM;
-+ break;
-+ }
-+
-+ dst_release (skb2->dst);
-+ __skb_pull (skb2, offset);
-+ __skb_trim (skb2, used);
-+ __skb_queue_tail (packets, skb2);
-+ }
-+
-+ tp->copied_seq += used;
-+ copied += used;
-+ len -= used;
-+ }
-+
-+ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
-+ tp->urg_data = 0;
-+ tcp_fast_path_check(sk, tp);
-+ }
-+
-+ if (!exhausted)
-+ continue;
-+
-+ if (skb->h.th->fin)
-+ {
-+ tp->copied_seq++;
-+ if (!eaten)
-+ tcp_eat_skb (sk, skb);
-+ break;
-+ }
-+
-+ if (!eaten)
-+ tcp_eat_skb (sk, skb);
-+
-+ } while (len > 0);
-+
-+ out:
-+ /* Clean up data we have read: This will do ACK frames. */
-+ cleanup_rbuf(sk, copied);
-+ TCP_CHECK_TIMER(sk);
-+ release_sock(sk);
-+ return copied;
-+}
-+
- /*
- * State processing on a close. This implements the state shift for
- * sending our FIN frame. Note that we only send a FIN for some
+++ /dev/null
- fs/dcache.c | 19 ++
- fs/exec.c | 17 +-
- fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++-------
- fs/namespace.c | 28 +++-
- fs/open.c | 172 +++++++++++++++++++-------
- fs/stat.c | 52 +++++---
- include/linux/dcache.h | 60 +++++++++
- include/linux/fs.h | 32 ++++
- include/linux/fs_struct.h | 4
- kernel/exit.c | 3
- kernel/fork.c | 3
- kernel/ksyms.c | 1
- 12 files changed, 558 insertions(+), 128 deletions(-)
-
-Index: linux.mcp2/fs/dcache.c
-===================================================================
---- linux.mcp2.orig/fs/dcache.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/dcache.c 2004-05-05 14:19:59.000000000 -0700
-@@ -181,6 +181,13 @@
- spin_unlock(&dcache_lock);
- return 0;
- }
-+
-+ /* network invalidation by Lustre */
-+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
-+ spin_unlock(&dcache_lock);
-+ return 0;
-+ }
-+
- /*
- * Check whether to do a partial shrink_dcache
- * to get rid of unused child entries.
-@@ -830,13 +837,19 @@
- * Adds a dentry to the hash according to its name.
- */
-
--void d_rehash(struct dentry * entry)
-+void __d_rehash(struct dentry * entry, int lock)
- {
- struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
- if (!list_empty(&entry->d_hash)) BUG();
-- spin_lock(&dcache_lock);
-+ if (lock) spin_lock(&dcache_lock);
- list_add(&entry->d_hash, list);
-- spin_unlock(&dcache_lock);
-+ if (lock) spin_unlock(&dcache_lock);
-+}
-+EXPORT_SYMBOL(__d_rehash);
-+
-+void d_rehash(struct dentry * entry)
-+{
-+ __d_rehash(entry, 1);
- }
-
- #define do_switch(x,y) do { \
-Index: linux.mcp2/fs/exec.c
-===================================================================
---- linux.mcp2.orig/fs/exec.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/exec.c 2004-05-05 14:19:59.000000000 -0700
-@@ -107,8 +107,10 @@
- struct file * file;
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_OPEN,
-+ .it_flags = FMODE_READ|FMODE_EXEC };
-
-- error = user_path_walk(library, &nd);
-+ error = user_path_walk_it(library, &nd, &it);
- if (error)
- goto out;
-
-@@ -120,7 +122,8 @@
- if (error)
- goto exit;
-
-- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+ intent_release(&it);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-@@ -342,9 +345,11 @@
- struct inode *inode;
- struct file *file;
- int err = 0;
-+ struct lookup_intent it = { .it_op = IT_OPEN,
-+ .it_flags = FMODE_READ|FMODE_EXEC };
-
- if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-- err = path_walk(name, &nd);
-+ err = path_walk_it(name, &nd, &it);
- file = ERR_PTR(err);
- if (!err) {
- inode = nd.dentry->d_inode;
-@@ -356,7 +361,8 @@
- err = -EACCES;
- file = ERR_PTR(err);
- if (!err) {
-- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+ intent_release(&it);
- if (!IS_ERR(file)) {
- err = deny_write_access(file);
- if (err) {
-@@ -368,6 +374,7 @@
- return file;
- }
- }
-+ intent_release(&it);
- path_release(&nd);
- }
- goto out;
-@@ -969,7 +976,7 @@
- goto close_fail;
- if (!file->f_op->write)
- goto close_fail;
-- if (do_truncate(file->f_dentry, 0) != 0)
-+ if (do_truncate(file->f_dentry, 0, 0) != 0)
- goto close_fail;
-
- retval = binfmt->core_dump(signr, regs, file);
-Index: linux.mcp2/fs/namei.c
-===================================================================
---- linux.mcp2.orig/fs/namei.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/namei.c 2004-05-05 14:28:26.000000000 -0700
-@@ -94,6 +94,13 @@
- * XEmacs seems to be relying on it...
- */
-
-+void intent_release(struct lookup_intent *it)
-+{
-+ if (it && it->it_op_release)
-+ it->it_op_release(it);
-+
-+}
-+
- /* In order to reduce some races, while at the same time doing additional
- * checking and hopefully speeding things up, we copy filenames to the
- * kernel data space before using them..
-@@ -260,10 +267,19 @@
- * Internal lookup() using the new generic dcache.
- * SMP-safe
- */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
-+ int flags, struct lookup_intent *it)
- {
- struct dentry * dentry = d_lookup(parent, name);
-
-+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) &&
-+ !d_invalidate(dentry)) {
-+ dput(dentry);
-+ dentry = NULL;
-+ }
-+ return dentry;
-+ } else
- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
- if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
- dput(dentry);
-@@ -281,11 +297,15 @@
- * make sure that nobody added the entry to the dcache in the meantime..
- * SMP-safe
- */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
-+ int flags, struct lookup_intent *it)
- {
- struct dentry * result;
- struct inode *dir = parent->d_inode;
-+ int counter = 0;
-
-+again:
-+ counter++;
- down(&dir->i_sem);
- /*
- * First re-do the cached lookup just in case it was created
-@@ -300,6 +320,9 @@
- result = ERR_PTR(-ENOMEM);
- if (dentry) {
- lock_kernel();
-+ if (dir->i_op->lookup_it)
-+ result = dir->i_op->lookup_it(dir, dentry, it, flags);
-+ else
- result = dir->i_op->lookup(dir, dentry);
- unlock_kernel();
- if (result)
-@@ -321,6 +344,15 @@
- dput(result);
- result = ERR_PTR(-ENOENT);
- }
-+ } else if (result->d_op && result->d_op->d_revalidate_it) {
-+ if (!result->d_op->d_revalidate_it(result, flags, it) &&
-+ !d_invalidate(result)) {
-+ dput(result);
-+ if (counter > 10)
-+ result = ERR_PTR(-ESTALE);
-+ if (!IS_ERR(result))
-+ goto again;
-+ }
- }
- return result;
- }
-@@ -332,7 +364,8 @@
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
--static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
-+ struct lookup_intent *it)
- {
- int err;
- if (current->link_count >= 5)
-@@ -346,10 +379,12 @@
- current->link_count++;
- current->total_link_count++;
- UPDATE_ATIME(dentry->d_inode);
-+ nd->intent = it;
- err = dentry->d_inode->i_op->follow_link(dentry, nd);
- current->link_count--;
- return err;
- loop:
-+ intent_release(it);
- path_release(nd);
- return -ELOOP;
- }
-@@ -447,7 +482,8 @@
- *
- * We expect 'base' to be positive and a directory.
- */
--int link_path_walk(const char * name, struct nameidata *nd)
-+int link_path_walk_it(const char *name, struct nameidata *nd,
-+ struct lookup_intent *it)
- {
- struct dentry *dentry;
- struct inode *inode;
-@@ -520,9 +556,10 @@
- break;
- }
- /* This does the actual lookups.. */
-- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
- if (!dentry) {
-- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE,
-+ NULL);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- break;
-@@ -540,7 +577,7 @@
- goto out_dput;
-
- if (inode->i_op->follow_link) {
-- err = do_follow_link(dentry, nd);
-+ err = do_follow_link(dentry, nd, NULL);
- dput(dentry);
- if (err)
- goto return_err;
-@@ -556,7 +593,7 @@
- nd->dentry = dentry;
- }
- err = -ENOTDIR;
-- if (!inode->i_op->lookup)
-+ if (!inode->i_op->lookup && !inode->i_op->lookup_it)
- break;
- continue;
- /* here ends the main loop */
-@@ -583,9 +620,9 @@
- if (err < 0)
- break;
- }
-- dentry = cached_lookup(nd->dentry, &this, 0);
-+ dentry = cached_lookup(nd->dentry, &this, 0, it);
- if (!dentry) {
-- dentry = real_lookup(nd->dentry, &this, 0);
-+ dentry = real_lookup(nd->dentry, &this, 0, it);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- break;
-@@ -595,7 +632,7 @@
- inode = dentry->d_inode;
- if ((lookup_flags & LOOKUP_FOLLOW)
- && inode && inode->i_op && inode->i_op->follow_link) {
-- err = do_follow_link(dentry, nd);
-+ err = do_follow_link(dentry, nd, it);
- dput(dentry);
- if (err)
- goto return_err;
-@@ -609,7 +646,8 @@
- goto no_inode;
- if (lookup_flags & LOOKUP_DIRECTORY) {
- err = -ENOTDIR;
-- if (!inode->i_op || !inode->i_op->lookup)
-+ if (!inode->i_op ||
-+ (!inode->i_op->lookup && !inode->i_op->lookup_it))
- break;
- }
- goto return_base;
-@@ -633,6 +671,34 @@
- * Check the cached dentry for staleness.
- */
- dentry = nd->dentry;
-+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ err = -ESTALE;
-+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) {
-+ struct dentry *new;
-+ err = permission(dentry->d_parent->d_inode,
-+ MAY_EXEC);
-+ if (err)
-+ break;
-+ new = real_lookup(dentry->d_parent,
-+ &dentry->d_name, 0, NULL);
-+ if (IS_ERR(new)) {
-+ err = PTR_ERR(new);
-+ break;
-+ }
-+ d_invalidate(dentry);
-+ dput(dentry);
-+ nd->dentry = new;
-+ }
-+ if (!nd->dentry->d_inode)
-+ goto no_inode;
-+ if (lookup_flags & LOOKUP_DIRECTORY) {
-+ err = -ENOTDIR;
-+ if (!nd->dentry->d_inode->i_op ||
-+ (!nd->dentry->d_inode->i_op->lookup &&
-+ !nd->dentry->d_inode->i_op->lookup_it))
-+ break;
-+ }
-+ } else
- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
- err = -ESTALE;
- if (!dentry->d_op->d_revalidate(dentry, 0)) {
-@@ -646,15 +703,28 @@
- dput(dentry);
- break;
- }
-+ if (err)
-+ intent_release(it);
- path_release(nd);
- return_err:
- return err;
- }
-
-+int link_path_walk(const char * name, struct nameidata *nd)
-+{
-+ return link_path_walk_it(name, nd, NULL);
-+}
-+
-+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
-+{
-+ current->total_link_count = 0;
-+ return link_path_walk_it(name, nd, it);
-+}
-+
- int path_walk(const char * name, struct nameidata *nd)
- {
- current->total_link_count = 0;
-- return link_path_walk(name, nd);
-+ return link_path_walk_it(name, nd, NULL);
- }
-
- /* SMP-safe */
-@@ -743,6 +813,7 @@
- {
- nd->last_type = LAST_ROOT; /* if there are only slashes... */
- nd->flags = flags;
-+ nd->intent = NULL;
- if (*name=='/')
- return walk_init_root(name,nd);
- read_lock(&current->fs->lock);
-@@ -757,7 +828,8 @@
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
-+ struct lookup_intent *it)
- {
- struct dentry * dentry;
- struct inode *inode;
-@@ -780,13 +852,16 @@
- goto out;
- }
-
-- dentry = cached_lookup(base, name, 0);
-+ dentry = cached_lookup(base, name, 0, it);
- if (!dentry) {
- struct dentry *new = d_alloc(base, name);
- dentry = ERR_PTR(-ENOMEM);
- if (!new)
- goto out;
- lock_kernel();
-+ if (inode->i_op->lookup_it)
-+ dentry = inode->i_op->lookup_it(inode, new, it, 0);
-+ else
- dentry = inode->i_op->lookup(inode, new);
- unlock_kernel();
- if (!dentry)
-@@ -798,6 +873,12 @@
- return dentry;
- }
-
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+{
-+ return lookup_hash_it(name, base, NULL);
-+}
-+
-+
- /* SMP-safe */
- struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
- {
-@@ -819,7 +900,7 @@
- }
- this.hash = end_name_hash(hash);
-
-- return lookup_hash(&this, base);
-+ return lookup_hash_it(&this, base, NULL);
- access:
- return ERR_PTR(-EACCES);
- }
-@@ -851,6 +932,23 @@
- return err;
- }
-
-+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
-+ struct lookup_intent *it)
-+{
-+ char *tmp;
-+ int err;
-+
-+ tmp = getname(name);
-+ err = PTR_ERR(tmp);
-+ if (!IS_ERR(tmp)) {
-+ err = 0;
-+ if (path_init(tmp, flags, nd))
-+ err = path_walk_it(tmp, nd, it);
-+ putname(tmp);
-+ }
-+ return err;
-+}
-+
- /*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
-@@ -946,7 +1044,8 @@
- return retval;
- }
-
--int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
-+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode,
-+ struct lookup_intent *it)
- {
- int error;
-
-@@ -959,12 +1058,15 @@
- goto exit_lock;
-
- error = -EACCES; /* shouldn't it be ENOSYS? */
-- if (!dir->i_op || !dir->i_op->create)
-+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it))
- goto exit_lock;
-
- DQUOT_INIT(dir);
- lock_kernel();
-- error = dir->i_op->create(dir, dentry, mode);
-+ if (dir->i_op->create_it)
-+ error = dir->i_op->create_it(dir, dentry, mode, it);
-+ else
-+ error = dir->i_op->create(dir, dentry, mode);
- unlock_kernel();
- exit_lock:
- up(&dir->i_zombie);
-@@ -973,6 +1075,11 @@
- return error;
- }
-
-+int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
-+{
-+ return vfs_create_it(dir, dentry, mode, NULL);
-+}
-+
- /*
- * open_namei()
- *
-@@ -987,7 +1094,8 @@
- * for symlinks (where the permissions are checked later).
- * SMP-safe
- */
--int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
-+int open_namei_it(const char *pathname, int flag, int mode,
-+ struct nameidata *nd, struct lookup_intent *it)
- {
- int acc_mode, error = 0;
- struct inode *inode;
-@@ -997,12 +1105,14 @@
-
- acc_mode = ACC_MODE(flag);
-
-+ if (it)
-+ it->it_flags = flag;
- /*
- * The simplest case - just a plain lookup.
- */
- if (!(flag & O_CREAT)) {
- if (path_init(pathname, lookup_flags(flag), nd))
-- error = path_walk(pathname, nd);
-+ error = path_walk_it(pathname, nd, it);
- if (error)
- return error;
- dentry = nd->dentry;
-@@ -1012,6 +1122,10 @@
- /*
- * Create - we need to know the parent.
- */
-+ if (it) {
-+ it->it_create_mode = mode;
-+ it->it_op |= IT_CREAT;
-+ }
- if (path_init(pathname, LOOKUP_PARENT, nd))
- error = path_walk(pathname, nd);
- if (error)
-@@ -1028,7 +1142,7 @@
-
- dir = nd->dentry;
- down(&dir->d_inode->i_sem);
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-
- do_last:
- error = PTR_ERR(dentry);
-@@ -1037,10 +1151,11 @@
- goto exit;
- }
-
-+ it->it_create_mode = mode;
- /* Negative dentry, just create the file */
- if (!dentry->d_inode) {
-- error = vfs_create(dir->d_inode, dentry,
-- mode & ~current->fs->umask);
-+ error = vfs_create_it(dir->d_inode, dentry,
-+ mode & ~current->fs->umask, it);
- up(&dir->d_inode->i_sem);
- dput(nd->dentry);
- nd->dentry = dentry;
-@@ -1144,7 +1259,7 @@
- if (!error) {
- DQUOT_INIT(inode);
-
-- error = do_truncate(dentry, 0);
-+ error = do_truncate(dentry, 0, 1);
- }
- put_write_access(inode);
- if (error)
-@@ -1156,8 +1271,10 @@
- return 0;
-
- exit_dput:
-+ intent_release(it);
- dput(dentry);
- exit:
-+ intent_release(it);
- path_release(nd);
- return error;
-
-@@ -1176,7 +1293,10 @@
- * are done. Procfs-like symlinks just set LAST_BIND.
- */
- UPDATE_ATIME(dentry->d_inode);
-+ nd->intent = it;
- error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+ if (error)
-+ intent_release(it);
- dput(dentry);
- if (error)
- return error;
-@@ -1198,13 +1318,20 @@
- }
- dir = nd->dentry;
- down(&dir->d_inode->i_sem);
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- putname(nd->last.name);
- goto do_last;
- }
-
-+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
-+{
-+ return open_namei_it(pathname, flag, mode, nd, NULL);
-+}
-+
-+
- /* SMP-safe */
--static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
-+static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
-+ struct lookup_intent *it)
- {
- struct dentry *dentry;
-
-@@ -1212,7 +1339,7 @@
- dentry = ERR_PTR(-EEXIST);
- if (nd->last_type != LAST_NORM)
- goto fail;
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- if (IS_ERR(dentry))
- goto fail;
- if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1269,7 +1396,20 @@
- error = path_walk(tmp, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 0);
-+
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->mknod_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->mknod_raw(&nd, mode, dev);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+
-+ dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(dentry);
-
- mode &= ~current->fs->umask;
-@@ -1290,6 +1426,7 @@
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+out2:
- path_release(&nd);
- out:
- putname(tmp);
-@@ -1338,7 +1475,18 @@
- error = path_walk(tmp, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 1);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->mkdir_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->mkdir_raw(&nd, mode);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+ dentry = lookup_create(&nd, 1, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- error = vfs_mkdir(nd.dentry->d_inode, dentry,
-@@ -1346,6 +1490,7 @@
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+out2:
- path_release(&nd);
- out:
- putname(tmp);
-@@ -1447,8 +1592,16 @@
- error = -EBUSY;
- goto exit1;
- }
-+ if (nd.dentry->d_inode->i_op->rmdir_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ error = op->rmdir_raw(&nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit1;
-+ }
- down(&nd.dentry->d_inode->i_sem);
-- dentry = lookup_hash(&nd.last, nd.dentry);
-+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1507,8 +1660,15 @@
- error = -EISDIR;
- if (nd.last_type != LAST_NORM)
- goto exit1;
-+ if (nd.dentry->d_inode->i_op->unlink_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->unlink_raw(&nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit1;
-+ }
- down(&nd.dentry->d_inode->i_sem);
-- dentry = lookup_hash(&nd.last, nd.dentry);
-+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- /* Why not before? Because we want correct error value */
-@@ -1576,15 +1736,27 @@
- error = path_walk(to, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 0);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->symlink_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->symlink_raw(&nd, from);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+ dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- error = vfs_symlink(nd.dentry->d_inode, dentry, from);
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+ out2:
- path_release(&nd);
--out:
-+ out:
- putname(to);
- }
- putname(from);
-@@ -1667,7 +1835,18 @@
- error = -EXDEV;
- if (old_nd.mnt != nd.mnt)
- goto out_release;
-- new_dentry = lookup_create(&nd, 0);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out_release;
-+ }
-+ if (nd.dentry->d_inode->i_op->link_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->link_raw(&old_nd, &nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out_release;
-+ }
-+ new_dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(new_dentry);
- if (!IS_ERR(new_dentry)) {
- error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1713,7 +1888,7 @@
- * locking].
- */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- int error;
- struct inode *target;
-@@ -1792,7 +1967,7 @@
- }
-
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- int error;
-
-@@ -1883,9 +2058,18 @@
- if (newnd.last_type != LAST_NORM)
- goto exit2;
-
-+ if (old_dir->d_inode->i_op->rename_raw) {
-+ lock_kernel();
-+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd);
-+ unlock_kernel();
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit2;
-+ }
-+
- double_lock(new_dir, old_dir);
-
-- old_dentry = lookup_hash(&oldnd.last, old_dir);
-+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL);
- error = PTR_ERR(old_dentry);
- if (IS_ERR(old_dentry))
- goto exit3;
-@@ -1901,16 +2085,16 @@
- if (newnd.last.name[newnd.last.len])
- goto exit4;
- }
-- new_dentry = lookup_hash(&newnd.last, new_dir);
-+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
-
-+
- lock_kernel();
- error = vfs_rename(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry);
- unlock_kernel();
--
- dput(new_dentry);
- exit4:
- dput(old_dentry);
-@@ -1961,20 +2145,26 @@
- }
-
- static inline int
--__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link,
-+ struct lookup_intent *it)
- {
- int res = 0;
- char *name;
- if (IS_ERR(link))
- goto fail;
-
-+ if (it == NULL)
-+ it = nd->intent;
-+ else if (it != nd->intent)
-+ printk("it != nd->intent: tell phil@clusterfs.com\n");
-+
- if (*link == '/') {
- path_release(nd);
- if (!walk_init_root(link, nd))
- /* weird __emul_prefix() stuff did it */
- goto out;
- }
-- res = link_path_walk(link, nd);
-+ res = link_path_walk_it(link, nd, it);
- out:
- if (current->link_count || res || nd->last_type!=LAST_NORM)
- return res;
-@@ -1996,7 +2186,13 @@
-
- int vfs_follow_link(struct nameidata *nd, const char *link)
- {
-- return __vfs_follow_link(nd, link);
-+ return __vfs_follow_link(nd, link, NULL);
-+}
-+
-+int vfs_follow_link_it(struct nameidata *nd, const char *link,
-+ struct lookup_intent *it)
-+{
-+ return __vfs_follow_link(nd, link, it);
- }
-
- /* get the link contents into pagecache */
-@@ -2038,7 +2234,7 @@
- {
- struct page *page = NULL;
- char *s = page_getlink(dentry, &page);
-- int res = __vfs_follow_link(nd, s);
-+ int res = __vfs_follow_link(nd, s, NULL);
- if (page) {
- kunmap(page);
- page_cache_release(page);
-Index: linux.mcp2/fs/namespace.c
-===================================================================
---- linux.mcp2.orig/fs/namespace.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/namespace.c 2004-05-05 14:22:06.000000000 -0700
-@@ -97,6 +97,7 @@
- {
- old_nd->dentry = mnt->mnt_mountpoint;
- old_nd->mnt = mnt->mnt_parent;
-+ UNPIN(old_nd->dentry, old_nd->mnt, 1);
- mnt->mnt_parent = mnt;
- mnt->mnt_mountpoint = mnt->mnt_root;
- list_del_init(&mnt->mnt_child);
-@@ -108,6 +109,7 @@
- {
- mnt->mnt_parent = mntget(nd->mnt);
- mnt->mnt_mountpoint = dget(nd->dentry);
-+ PIN(nd->dentry, nd->mnt, 1);
- list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
- list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
- nd->dentry->d_mounted++;
-@@ -491,15 +493,18 @@
- {
- struct nameidata old_nd;
- struct vfsmount *mnt = NULL;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int err = mount_is_safe(nd);
- if (err)
- return err;
- if (!old_name || !*old_name)
- return -EINVAL;
- if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd))
-- err = path_walk(old_name, &old_nd);
-- if (err)
-+ err = path_walk_it(old_name, &old_nd, &it);
-+ if (err) {
-+ intent_release(&it);
- return err;
-+ }
-
- down_write(&current->namespace->sem);
- err = -EINVAL;
-@@ -522,6 +527,7 @@
- }
-
- up_write(&current->namespace->sem);
-+ intent_release(&it);
- path_release(&old_nd);
- return err;
- }
-@@ -706,6 +712,7 @@
- unsigned long flags, void *data_page)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int retval = 0;
- int mnt_flags = 0;
-
-@@ -731,9 +738,11 @@
-
- /* ... and get the mountpoint */
- if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-- retval = path_walk(dir_name, &nd);
-- if (retval)
-+ retval = path_walk_it(dir_name, &nd, &it);
-+ if (retval) {
-+ intent_release(&it);
- return retval;
-+ }
-
- if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
-@@ -745,6 +754,8 @@
- else
- retval = do_add_mount(&nd, type_page, flags, mnt_flags,
- dev_name, data_page);
-+
-+ intent_release(&it);
- path_release(&nd);
- return retval;
- }
-@@ -910,6 +921,8 @@
- {
- struct vfsmount *tmp;
- struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
-+ struct lookup_intent new_it = { .it_op = IT_GETATTR };
-+ struct lookup_intent old_it = { .it_op = IT_GETATTR };
- char *name;
- int error;
-
-@@ -924,7 +937,7 @@
- goto out0;
- error = 0;
- if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
-- error = path_walk(name, &new_nd);
-+ error = path_walk_it(name, &new_nd, &new_it);
- putname(name);
- if (error)
- goto out0;
-@@ -938,7 +951,7 @@
- goto out1;
- error = 0;
- if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
-- error = path_walk(name, &old_nd);
-+ error = path_walk_it(name, &old_nd, &old_it);
- putname(name);
- if (error)
- goto out1;
-@@ -994,8 +1007,10 @@
- up(&old_nd.dentry->d_inode->i_zombie);
- up_write(&current->namespace->sem);
- path_release(&user_nd);
-+ intent_release(&old_it);
- path_release(&old_nd);
- out1:
-+ intent_release(&new_it);
- path_release(&new_nd);
- out0:
- unlock_kernel();
-Index: linux.mcp2/fs/open.c
-===================================================================
---- linux.mcp2.orig/fs/open.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/open.c 2004-05-05 14:30:34.000000000 -0700
-@@ -19,6 +19,8 @@
- #include <asm/uaccess.h>
-
- #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
-+extern int path_walk_it(const char *name, struct nameidata *nd,
-+ struct lookup_intent *it);
-
- int vfs_statfs(struct super_block *sb, struct statfs *buf)
- {
-@@ -95,9 +97,10 @@
- write_unlock(&files->file_lock);
- }
-
--int do_truncate(struct dentry *dentry, loff_t length)
-+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
- {
- struct inode *inode = dentry->d_inode;
-+ struct inode_operations *op = dentry->d_inode->i_op;
- int error;
- struct iattr newattrs;
-
-@@ -108,7 +111,13 @@
- down(&inode->i_sem);
- newattrs.ia_size = length;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-- error = notify_change(dentry, &newattrs);
-+ if (called_from_open)
-+ newattrs.ia_valid |= ATTR_FROM_OPEN;
-+ if (op->setattr_raw) {
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ } else
-+ error = notify_change(dentry, &newattrs);
- up(&inode->i_sem);
- return error;
- }
-@@ -118,12 +127,13 @@
- struct nameidata nd;
- struct inode * inode;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- error = -EINVAL;
- if (length < 0) /* sorry, but loff_t says... */
- goto out;
-
-- error = user_path_walk(path, &nd);
-+ error = user_path_walk_it(path, &nd, &it);
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-@@ -163,11 +173,13 @@
- error = locks_verify_truncate(inode, NULL, length);
- if (!error) {
- DQUOT_INIT(inode);
-- error = do_truncate(nd.dentry, length);
-+ intent_release(&it);
-+ error = do_truncate(nd.dentry, length, 0);
- }
- put_write_access(inode);
-
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
-@@ -215,7 +227,7 @@
-
- error = locks_verify_truncate(inode, file, length);
- if (!error)
-- error = do_truncate(dentry, length);
-+ error = do_truncate(dentry, length, 0);
- out_putf:
- fput(file);
- out:
-@@ -260,11 +272,13 @@
- struct inode * inode;
- struct iattr newattrs;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, NULL);
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-
-+ /* this is safe without a Lustre lock because it only depends
-+ on the super block */
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
-@@ -279,11 +293,25 @@
- goto dput_and_out;
-
- newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-- } else {
-+ }
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto dput_and_out;
-+ }
-+
-+ error = -EPERM;
-+ if (!times) {
- if (current->fsuid != inode->i_uid &&
- (error = permission(inode,MAY_WRITE)) != 0)
- goto dput_and_out;
- }
-+
- error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
- path_release(&nd);
-@@ -304,12 +332,14 @@
- struct inode * inode;
- struct iattr newattrs;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, NULL);
-
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-
-+ /* this is safe without a Lustre lock because it only depends
-+ on the super block */
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
-@@ -324,7 +354,20 @@
- newattrs.ia_atime = times[0].tv_sec;
- newattrs.ia_mtime = times[1].tv_sec;
- newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-- } else {
-+ }
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto dput_and_out;
-+ }
-+
-+ error = -EPERM;
-+ if (!utimes) {
- if (current->fsuid != inode->i_uid &&
- (error = permission(inode,MAY_WRITE)) != 0)
- goto dput_and_out;
-@@ -347,6 +390,7 @@
- int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
- int res;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
- return -EINVAL;
-@@ -364,13 +408,14 @@
- else
- current->cap_effective = current->cap_permitted;
-
-- res = user_path_walk(filename, &nd);
-+ res = user_path_walk_it(filename, &nd, &it);
- if (!res) {
- res = permission(nd.dentry->d_inode, mode);
- /* SuS v2 requires we report a read only fs too */
- if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
- && !special_file(nd.dentry->d_inode->i_mode))
- res = -EROFS;
-+ intent_release(&it);
- path_release(&nd);
- }
-
-@@ -386,6 +431,7 @@
- int error;
- struct nameidata nd;
- char *name;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- name = getname(filename);
- error = PTR_ERR(name);
-@@ -394,7 +440,7 @@
-
- error = 0;
- if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
-- error = path_walk(name, &nd);
-+ error = path_walk_it(name, &nd, &it);
- putname(name);
- if (error)
- goto out;
-@@ -406,6 +452,7 @@
- set_fs_pwd(current->fs, nd.mnt, nd.dentry);
-
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
-@@ -446,6 +493,7 @@
- int error;
- struct nameidata nd;
- char *name;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- name = getname(filename);
- error = PTR_ERR(name);
-@@ -454,7 +502,7 @@
-
- path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
-- error = path_walk(name, &nd);
-+ error = path_walk_it(name, &nd, &it);
- putname(name);
- if (error)
- goto out;
-@@ -471,39 +519,56 @@
- set_fs_altroot();
- error = 0;
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
- }
-
--asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
-+int chmod_common(struct dentry *dentry, mode_t mode)
- {
-- struct inode * inode;
-- struct dentry * dentry;
-- struct file * file;
-- int err = -EBADF;
-+ struct inode *inode = dentry->d_inode;
- struct iattr newattrs;
-+ int err = -EROFS;
-
-- file = fget(fd);
-- if (!file)
-+ if (IS_RDONLY(inode))
- goto out;
-
-- dentry = file->f_dentry;
-- inode = dentry->d_inode;
-+ if (inode->i_op->setattr_raw) {
-+ newattrs.ia_mode = mode;
-+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-+ newattrs.ia_valid |= ATTR_RAW;
-+ err = inode->i_op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (err != -EOPNOTSUPP)
-+ goto out;
-+ }
-
-- err = -EROFS;
-- if (IS_RDONLY(inode))
-- goto out_putf;
- err = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-- goto out_putf;
-+ goto out;
-+
- if (mode == (mode_t) -1)
- mode = inode->i_mode;
- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs);
-
--out_putf:
-+out:
-+ return err;
-+}
-+
-+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
-+{
-+ struct file * file;
-+ int err = -EBADF;
-+
-+ file = fget(fd);
-+ if (!file)
-+ goto out;
-+
-+ err = chmod_common(file->f_dentry, mode);
-+
- fput(file);
- out:
- return err;
-@@ -512,30 +577,14 @@
- asmlinkage long sys_chmod(const char * filename, mode_t mode)
- {
- struct nameidata nd;
-- struct inode * inode;
- int error;
-- struct iattr newattrs;
-
- error = user_path_walk(filename, &nd);
- if (error)
- goto out;
-- inode = nd.dentry->d_inode;
--
-- error = -EROFS;
-- if (IS_RDONLY(inode))
-- goto dput_and_out;
-
-- error = -EPERM;
-- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-- goto dput_and_out;
-+ error = chmod_common(nd.dentry, mode);
-
-- if (mode == (mode_t) -1)
-- mode = inode->i_mode;
-- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
-- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-- error = notify_change(nd.dentry, &newattrs);
--
--dput_and_out:
- path_release(&nd);
- out:
- return error;
-@@ -555,6 +604,20 @@
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto out;
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = dentry->d_inode->i_op;
-+
-+ newattrs.ia_uid = user;
-+ newattrs.ia_gid = group;
-+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME;
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ return error;
-+ }
-+
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
-@@ -659,6 +722,7 @@
- {
- int namei_flags, error;
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_OPEN };
-
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
-@@ -666,14 +730,15 @@
- if (namei_flags & O_TRUNC)
- namei_flags |= 2;
-
-- error = open_namei(filename, namei_flags, mode, &nd);
-- if (!error)
-- return dentry_open(nd.dentry, nd.mnt, flags);
-+ error = open_namei_it(filename, namei_flags, mode, &nd, &it);
-+ if (error)
-+ return ERR_PTR(error);
-
-- return ERR_PTR(error);
-+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
- }
-
--struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+ int flags, struct lookup_intent *it)
- {
- struct file * f;
- struct inode *inode;
-@@ -710,12 +775,15 @@
- }
-
- if (f->f_op && f->f_op->open) {
-+ f->f_it = it;
- error = f->f_op->open(inode,f);
-+ f->f_it = NULL;
- if (error)
- goto cleanup_all;
- }
- f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
-
-+ intent_release(it);
- return f;
-
- cleanup_all:
-@@ -730,11 +798,17 @@
- cleanup_file:
- put_filp(f);
- cleanup_dentry:
-+ intent_release(it);
- dput(dentry);
- mntput(mnt);
- return ERR_PTR(error);
- }
-
-+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+{
-+ return dentry_open_it(dentry, mnt, flags, NULL);
-+}
-+
- /*
- * Find an empty file descriptor entry, and mark it busy.
- */
-Index: linux.mcp2/fs/stat.c
-===================================================================
---- linux.mcp2.orig/fs/stat.c 2004-01-19 07:49:43.000000000 -0800
-+++ linux.mcp2/fs/stat.c 2004-05-05 14:19:59.000000000 -0700
-@@ -17,10 +17,12 @@
- * Revalidate the inode. This is required for proper NFS attribute caching.
- */
- static __inline__ int
--do_revalidate(struct dentry *dentry)
-+do_revalidate(struct dentry *dentry, struct lookup_intent *it)
- {
- struct inode * inode = dentry->d_inode;
-- if (inode->i_op && inode->i_op->revalidate)
-+ if (inode->i_op && inode->i_op->revalidate_it)
-+ return inode->i_op->revalidate_it(dentry, it);
-+ else if (inode->i_op && inode->i_op->revalidate)
- return inode->i_op->revalidate(dentry);
- return 0;
- }
-@@ -135,13 +139,15 @@
- asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -151,13 +157,15 @@
- asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -172,13 +180,15 @@
- asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -189,13 +199,15 @@
- asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -216,7 +228,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_old_stat(dentry->d_inode, statbuf);
- fput(f);
-@@ -235,7 +247,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_new_stat(dentry->d_inode, statbuf);
- fput(f);
-@@ -257,7 +269,7 @@
-
- error = -EINVAL;
- if (inode->i_op && inode->i_op->readlink &&
-- !(error = do_revalidate(nd.dentry))) {
-+ !(error = do_revalidate(nd.dentry, NULL))) {
- UPDATE_ATIME(inode);
- error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
- }
-@@ -333,12 +345,14 @@
- {
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -348,12 +362,14 @@
- {
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -368,7 +384,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_new_stat64(dentry->d_inode, statbuf);
- fput(f);
-Index: linux.mcp2/include/linux/dcache.h
-===================================================================
---- linux.mcp2.orig/include/linux/dcache.h 2004-04-23 16:52:28.000000000 -0700
-+++ linux.mcp2/include/linux/dcache.h 2004-05-05 14:19:59.000000000 -0700
-@@ -5,6 +5,51 @@
-
- #include <asm/atomic.h>
- #include <linux/mount.h>
-+#include <linux/string.h>
-+
-+#define IT_OPEN 0x0001
-+#define IT_CREAT 0x0002
-+#define IT_READDIR 0x0004
-+#define IT_GETATTR 0x0008
-+#define IT_LOOKUP 0x0010
-+#define IT_UNLINK 0x0020
-+#define IT_GETXATTR 0x0040
-+#define IT_EXEC 0x0080
-+#define IT_PIN 0x0100
-+
-+#define IT_FL_LOCKED 0x0001
-+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */
-+
-+#define INTENT_MAGIC 0x19620323
-+
-+
-+struct lustre_intent_data {
-+ int it_disposition;
-+ int it_status;
-+ __u64 it_lock_handle;
-+ void *it_data;
-+ int it_lock_mode;
-+ int it_int_flags;
-+};
-+struct lookup_intent {
-+ int it_magic;
-+ void (*it_op_release)(struct lookup_intent *);
-+ int it_op;
-+ int it_flags;
-+ int it_create_mode;
-+ union {
-+ struct lustre_intent_data lustre;
-+ } d;
-+};
-+
-+static inline void intent_init(struct lookup_intent *it, int op, int flags)
-+{
-+ memset(it, 0, sizeof(*it));
-+ it->it_magic = INTENT_MAGIC;
-+ it->it_op = op;
-+ it->it_flags = flags;
-+}
-+
-
- /*
- * linux/include/linux/dcache.h
-@@ -90,8 +135,22 @@
- int (*d_delete)(struct dentry *);
- void (*d_release)(struct dentry *);
- void (*d_iput)(struct dentry *, struct inode *);
-+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *);
-+ void (*d_pin)(struct dentry *, struct vfsmount * , int);
-+ void (*d_unpin)(struct dentry *, struct vfsmount *, int);
- };
-
-+#define PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \
-+ de->d_op->d_pin(de, mnt, flag);
-+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \
-+ de->d_op->d_unpin(de, mnt, flag);
-+
-+
-+/* defined in fs/namei.c */
-+extern void intent_release(struct lookup_intent *it);
-+/* defined in fs/dcache.c */
-+extern void __d_rehash(struct dentry * entry, int lock);
-+
- /* the dentry parameter passed to d_hash and d_compare is the parent
- * directory of the entries to be compared. It is used in case these
- * functions need any directory specific information for determining
-@@ -123,6 +182,7 @@
- * s_nfsd_free_path semaphore will be down
- */
- #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
-+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */
-
- extern spinlock_t dcache_lock;
-
-Index: linux.mcp2/include/linux/fs.h
-===================================================================
---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:12:28.000000000 -0700
-+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:19:59.000000000 -0700
-@@ -73,6 +73,7 @@
-
- #define FMODE_READ 1
- #define FMODE_WRITE 2
-+#define FMODE_EXEC 4
-
- #define READ 0
- #define WRITE 1
-@@ -335,6 +336,9 @@
- #define ATTR_MTIME_SET 256
- #define ATTR_FORCE 512 /* Not a change, but a change it */
- #define ATTR_ATTR_FLAG 1024
-+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */
-+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */
-+#define ATTR_CTIME_SET 0x2000
-
- /*
- * This is the Inode Attributes structure, used for notify_change(). It
-@@ -470,6 +474,7 @@
- struct pipe_inode_info *i_pipe;
- struct block_device *i_bdev;
- struct char_device *i_cdev;
-+ void *i_filterdata;
-
- unsigned long i_dnotify_mask; /* Directory notify events */
- struct dnotify_struct *i_dnotify; /* for directory notifications */
-@@ -574,6 +579,7 @@
-
- /* needed for tty driver, and maybe others */
- void *private_data;
-+ struct lookup_intent *f_it;
-
- /* preallocated helper kiobuf to speedup O_DIRECT */
- struct kiobuf *f_iobuf;
-@@ -692,6 +698,7 @@
- struct qstr last;
- unsigned int flags;
- int last_type;
-+ struct lookup_intent *intent;
- };
-
- #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */
-@@ -840,7 +847,8 @@
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-+ struct inode *new_dir, struct dentry *new_dentry);
-
- /*
- * File types
-@@ -900,21 +908,32 @@
-
- struct inode_operations {
- int (*create) (struct inode *,struct dentry *,int);
-+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *);
- struct dentry * (*lookup) (struct inode *,struct dentry *);
-+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags);
- int (*link) (struct dentry *,struct inode *,struct dentry *);
-+ int (*link_raw) (struct nameidata *,struct nameidata *);
- int (*unlink) (struct inode *,struct dentry *);
-+ int (*unlink_raw) (struct nameidata *);
- int (*symlink) (struct inode *,struct dentry *,const char *);
-+ int (*symlink_raw) (struct nameidata *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,int);
-+ int (*mkdir_raw) (struct nameidata *,int);
- int (*rmdir) (struct inode *,struct dentry *);
-+ int (*rmdir_raw) (struct nameidata *);
- int (*mknod) (struct inode *,struct dentry *,int,int);
-+ int (*mknod_raw) (struct nameidata *,int,dev_t);
- int (*rename) (struct inode *, struct dentry *,
- struct inode *, struct dentry *);
-+ int (*rename_raw) (struct nameidata *, struct nameidata *);
- int (*readlink) (struct dentry *, char *,int);
- int (*follow_link) (struct dentry *, struct nameidata *);
- void (*truncate) (struct inode *);
- int (*permission) (struct inode *, int);
- int (*revalidate) (struct dentry *);
-+ int (*revalidate_it) (struct dentry *, struct lookup_intent *);
- int (*setattr) (struct dentry *, struct iattr *);
-+ int (*setattr_raw) (struct inode *, struct iattr *);
- int (*getattr) (struct dentry *, struct iattr *);
- };
-
-@@ -1115,10 +1134,14 @@
-
- asmlinkage long sys_open(const char *, int, int);
- asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */
--extern int do_truncate(struct dentry *, loff_t start);
-+extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
-
- extern struct file *filp_open(const char *, int, int);
- extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+ struct nameidata *nd, struct lookup_intent *it);
-+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+ int flags, struct lookup_intent *it);
- extern int filp_close(struct file *, fl_owner_t id);
- extern char * getname(const char *);
-
-@@ -1380,6 +1403,7 @@
- extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
-
- extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
-+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
- extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
- extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1390,6 +1414,8 @@
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
-+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
-
- extern void iput(struct inode *);
- extern void force_delete(struct inode *);
-@@ -1499,6 +1525,8 @@
-
- extern int vfs_readlink(struct dentry *, char *, int, const char *);
- extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *,
-+ struct lookup_intent *it);
- extern int page_readlink(struct dentry *, char *, int);
- extern int page_follow_link(struct dentry *, struct nameidata *);
- extern struct inode_operations page_symlink_inode_operations;
-Index: linux.mcp2/include/linux/fs_struct.h
-===================================================================
---- linux.mcp2.orig/include/linux/fs_struct.h 2004-01-19 07:49:42.000000000 -0800
-+++ linux.mcp2/include/linux/fs_struct.h 2004-05-05 14:19:59.000000000 -0700
-@@ -34,10 +34,12 @@
- write_lock(&fs->lock);
- old_root = fs->root;
- old_rootmnt = fs->rootmnt;
-+ PIN(dentry, mnt, 1);
- fs->rootmnt = mntget(mnt);
- fs->root = dget(dentry);
- write_unlock(&fs->lock);
- if (old_root) {
-+ UNPIN(old_root, old_rootmnt, 1);
- dput(old_root);
- mntput(old_rootmnt);
- }
-@@ -57,10 +59,12 @@
- write_lock(&fs->lock);
- old_pwd = fs->pwd;
- old_pwdmnt = fs->pwdmnt;
-+ PIN(dentry, mnt, 0);
- fs->pwdmnt = mntget(mnt);
- fs->pwd = dget(dentry);
- write_unlock(&fs->lock);
- if (old_pwd) {
-+ UNPIN(old_pwd, old_pwdmnt, 0);
- dput(old_pwd);
- mntput(old_pwdmnt);
- }
-Index: linux.mcp2/kernel/exit.c
-===================================================================
---- linux.mcp2.orig/kernel/exit.c 2004-01-19 07:49:44.000000000 -0800
-+++ linux.mcp2/kernel/exit.c 2004-05-05 14:19:59.000000000 -0700
-@@ -252,11 +252,14 @@
- {
- /* No need to hold fs->lock if we are killing it */
- if (atomic_dec_and_test(&fs->count)) {
-+ UNPIN(fs->pwd, fs->pwdmnt, 0);
-+ UNPIN(fs->root, fs->rootmnt, 1);
- dput(fs->root);
- mntput(fs->rootmnt);
- dput(fs->pwd);
- mntput(fs->pwdmnt);
- if (fs->altroot) {
-+ UNPIN(fs->altroot, fs->altrootmnt, 1);
- dput(fs->altroot);
- mntput(fs->altrootmnt);
- }
-Index: linux.mcp2/kernel/fork.c
-===================================================================
---- linux.mcp2.orig/kernel/fork.c 2004-01-19 07:49:44.000000000 -0800
-+++ linux.mcp2/kernel/fork.c 2004-05-05 14:19:59.000000000 -0700
-@@ -384,10 +384,13 @@
- fs->umask = old->umask;
- read_lock(&old->lock);
- fs->rootmnt = mntget(old->rootmnt);
-+ PIN(old->pwd, old->pwdmnt, 0);
-+ PIN(old->root, old->rootmnt, 1);
- fs->root = dget(old->root);
- fs->pwdmnt = mntget(old->pwdmnt);
- fs->pwd = dget(old->pwd);
- if (old->altroot) {
-+ PIN(old->altroot, old->altrootmnt, 1);
- fs->altrootmnt = mntget(old->altrootmnt);
- fs->altroot = dget(old->altroot);
- } else {
-Index: linux.mcp2/kernel/ksyms.c
-===================================================================
---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:12:28.000000000 -0700
-+++ linux.mcp2/kernel/ksyms.c 2004-05-05 14:19:59.000000000 -0700
-@@ -264,6 +264,7 @@
- EXPORT_SYMBOL(set_page_dirty);
- EXPORT_SYMBOL(vfs_readlink);
- EXPORT_SYMBOL(vfs_follow_link);
-+EXPORT_SYMBOL(vfs_follow_link_it);
- EXPORT_SYMBOL(page_readlink);
- EXPORT_SYMBOL(page_follow_link);
- EXPORT_SYMBOL(page_symlink_inode_operations);
+++ /dev/null
- fs/dcache.c | 19 ++
- fs/exec.c | 17 +-
- fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++-------
- fs/namespace.c | 28 +++-
- fs/open.c | 172 +++++++++++++++++++-------
- fs/stat.c | 52 +++++---
- include/linux/dcache.h | 60 +++++++++
- include/linux/fs.h | 32 ++++
- include/linux/fs_struct.h | 4
- kernel/exit.c | 3
- kernel/fork.c | 3
- kernel/ksyms.c | 1
- 12 files changed, 558 insertions(+), 128 deletions(-)
-
-Index: linux-2.4.19.SuSE/fs/dcache.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/dcache.c Mon Jan 27 05:08:04 2003
-+++ linux-2.4.19.SuSE/fs/dcache.c Sat Nov 15 17:29:03 2003
-@@ -186,6 +186,13 @@
- spin_unlock(&dcache_lock);
- return 0;
- }
-+
-+ /* network invalidation by Lustre */
-+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
-+ spin_unlock(&dcache_lock);
-+ return 0;
-+ }
-+
- /*
- * Check whether to do a partial shrink_dcache
- * to get rid of unused child entries.
-@@ -838,13 +845,19 @@
- * Adds a dentry to the hash according to its name.
- */
-
--void d_rehash(struct dentry * entry)
-+void __d_rehash(struct dentry * entry, int lock)
- {
- struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
- if (!list_empty(&entry->d_hash)) BUG();
-- spin_lock(&dcache_lock);
-+ if (lock) spin_lock(&dcache_lock);
- list_add(&entry->d_hash, list);
-- spin_unlock(&dcache_lock);
-+ if (lock) spin_unlock(&dcache_lock);
-+}
-+EXPORT_SYMBOL(__d_rehash);
-+
-+void d_rehash(struct dentry * entry)
-+{
-+ __d_rehash(entry, 1);
- }
-
- #define do_switch(x,y) do { \
-Index: linux-2.4.19.SuSE/fs/exec.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/exec.c Mon Jan 27 05:08:35 2003
-+++ linux-2.4.19.SuSE/fs/exec.c Sat Nov 15 17:34:06 2003
-@@ -107,8 +107,10 @@
- struct file * file;
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_OPEN,
-+ .it_flags = FMODE_READ|FMODE_EXEC };
-
-- error = user_path_walk(library, &nd);
-+ error = user_path_walk_it(library, &nd, &it);
- if (error)
- goto out;
-
-@@ -120,7 +122,8 @@
- if (error)
- goto exit;
-
-- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+ intent_release(&it);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-@@ -346,9 +349,11 @@
- struct inode *inode;
- struct file *file;
- int err = 0;
-+ struct lookup_intent it = { .it_op = IT_OPEN,
-+ .it_flags = FMODE_READ|FMODE_EXEC };
-
- if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-- err = path_walk(name, &nd);
-+ err = path_walk_it(name, &nd, &it);
- file = ERR_PTR(err);
- if (!err) {
- inode = nd.dentry->d_inode;
-@@ -360,7 +365,8 @@
- err = -EACCES;
- file = ERR_PTR(err);
- if (!err) {
-- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+ intent_release(&it);
- if (!IS_ERR(file)) {
- err = deny_write_access(file);
- if (err) {
-@@ -372,6 +378,7 @@
- return file;
- }
- }
-+ intent_release(&it);
- path_release(&nd);
- }
- goto out;
-@@ -981,7 +988,7 @@
- goto close_fail;
- if (!file->f_op->write)
- goto close_fail;
-- if (do_truncate(file->f_dentry, 0) != 0)
-+ if (do_truncate(file->f_dentry, 0, 0) != 0)
- goto close_fail;
-
- retval = binfmt->core_dump(signr, regs, file);
-Index: linux-2.4.19.SuSE/fs/namei.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/namei.c Mon Jan 27 05:08:07 2003
-+++ linux-2.4.19.SuSE/fs/namei.c Sat Nov 15 17:52:03 2003
-@@ -94,6 +94,13 @@
- * XEmacs seems to be relying on it...
- */
-
-+void intent_release(struct lookup_intent *it)
-+{
-+ if (it && it->it_op_release)
-+ it->it_op_release(it);
-+
-+}
-+
- /* In order to reduce some races, while at the same time doing additional
- * checking and hopefully speeding things up, we copy filenames to the
- * kernel data space before using them..
-@@ -260,10 +267,19 @@
- * Internal lookup() using the new generic dcache.
- * SMP-safe
- */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
-+ int flags, struct lookup_intent *it)
- {
- struct dentry * dentry = d_lookup(parent, name);
-
-+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) &&
-+ !d_invalidate(dentry)) {
-+ dput(dentry);
-+ dentry = NULL;
-+ }
-+ return dentry;
-+ } else
- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
- if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
- dput(dentry);
-@@ -281,11 +297,15 @@
- * make sure that nobody added the entry to the dcache in the meantime..
- * SMP-safe
- */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
-+ int flags, struct lookup_intent *it)
- {
- struct dentry * result;
- struct inode *dir = parent->d_inode;
-+ int counter = 0;
-
-+again:
-+ counter++;
- down(&dir->i_sem);
- /*
- * First re-do the cached lookup just in case it was created
-@@ -300,6 +320,9 @@
- result = ERR_PTR(-ENOMEM);
- if (dentry) {
- lock_kernel();
-+ if (dir->i_op->lookup_it)
-+ result = dir->i_op->lookup_it(dir, dentry, it, flags);
-+ else
- result = dir->i_op->lookup(dir, dentry);
- unlock_kernel();
- if (result)
-@@ -321,6 +344,15 @@
- dput(result);
- result = ERR_PTR(-ENOENT);
- }
-+ } else if (result->d_op && result->d_op->d_revalidate_it) {
-+ if (!result->d_op->d_revalidate_it(result, flags, it) &&
-+ !d_invalidate(result)) {
-+ dput(result);
-+ if (counter > 10)
-+ result = ERR_PTR(-ESTALE);
-+ if (!IS_ERR(result))
-+ goto again;
-+ }
- }
- return result;
- }
-@@ -332,7 +364,8 @@
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
--static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
-+ struct lookup_intent *it)
- {
- int err;
- if (current->link_count >= 8)
-@@ -346,10 +379,12 @@
- current->link_count++;
- current->total_link_count++;
- UPDATE_ATIME(dentry->d_inode);
-+ nd->intent = it;
- err = dentry->d_inode->i_op->follow_link(dentry, nd);
- current->link_count--;
- return err;
- loop:
-+ intent_release(it);
- path_release(nd);
- return -ELOOP;
- }
-@@ -447,7 +482,8 @@
- *
- * We expect 'base' to be positive and a directory.
- */
--int link_path_walk(const char * name, struct nameidata *nd)
-+int link_path_walk_it(const char *name, struct nameidata *nd,
-+ struct lookup_intent *it)
- {
- struct dentry *dentry;
- struct inode *inode;
-@@ -524,12 +560,13 @@
- break;
- }
- /* This does the actual lookups.. */
-- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
- if (!dentry) {
- err = -EWOULDBLOCKIO;
- if (atomic)
- break;
-- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE,
-+ NULL);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- break;
-@@ -547,7 +584,7 @@
- goto out_dput;
-
- if (inode->i_op->follow_link) {
-- err = do_follow_link(dentry, nd);
-+ err = do_follow_link(dentry, nd, NULL);
- dput(dentry);
- if (err)
- goto return_err;
-@@ -563,7 +600,7 @@
- nd->dentry = dentry;
- }
- err = -ENOTDIR;
-- if (!inode->i_op->lookup)
-+ if (!inode->i_op->lookup && !inode->i_op->lookup_it)
- break;
- continue;
- /* here ends the main loop */
-@@ -590,12 +627,12 @@
- if (err < 0)
- break;
- }
-- dentry = cached_lookup(nd->dentry, &this, 0);
-+ dentry = cached_lookup(nd->dentry, &this, 0, it);
- if (!dentry) {
- err = -EWOULDBLOCKIO;
- if (atomic)
- break;
-- dentry = real_lookup(nd->dentry, &this, 0);
-+ dentry = real_lookup(nd->dentry, &this, 0, it);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- break;
-@@ -605,7 +642,7 @@
- inode = dentry->d_inode;
- if ((lookup_flags & LOOKUP_FOLLOW)
- && inode && inode->i_op && inode->i_op->follow_link) {
-- err = do_follow_link(dentry, nd);
-+ err = do_follow_link(dentry, nd, it);
- dput(dentry);
- if (err)
- goto return_err;
-@@ -619,7 +656,8 @@
- goto no_inode;
- if (lookup_flags & LOOKUP_DIRECTORY) {
- err = -ENOTDIR;
-- if (!inode->i_op || !inode->i_op->lookup)
-+ if (!inode->i_op ||
-+ (!inode->i_op->lookup && !inode->i_op->lookup_it))
- break;
- }
- goto return_base;
-@@ -643,6 +681,32 @@
- * Check the cached dentry for staleness.
- */
- dentry = nd->dentry;
-+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
-+ err = -ESTALE;
-+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) {
-+ struct dentry *new;
-+ err = permission(dentry->d_parent->d_inode,
-+ MAY_EXEC);
-+ if (err)
-+ break;
-+ new = real_lookup(dentry->d_parent,
-+ &dentry->d_name, 0, it);
-+ if (IS_ERR(new)) {
-+ err = PTR_ERR(new);
-+ break;
-+ }
-+ d_invalidate(dentry);
-+ dput(dentry);
-+ nd->dentry = new;
-+ }
-+ if (!nd->dentry->d_inode)
-+ goto no_inode;
-+ if (lookup_flags & LOOKUP_DIRECTORY) {
-+ err = -ENOTDIR;
-+ if (!nd->dentry->d_inode->i_op ||
-+ (!nd->dentry->d_inode->i_op->lookup &&
-+ !nd->dentry->d_inode->i_op->lookup_it))
-+ break;
-+ }
-+ } else
- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
- err = -ESTALE;
- if (!dentry->d_op->d_revalidate(dentry, lookup_flags & LOOKUP_PARENT)) {
-@@ -656,15 +713,28 @@
- dput(dentry);
- break;
- }
-+ if (err)
-+ intent_release(it);
- path_release(nd);
- return_err:
- return err;
- }
-
-+int link_path_walk(const char * name, struct nameidata *nd)
-+{
-+ return link_path_walk_it(name, nd, NULL);
-+}
-+
-+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
-+{
-+ current->total_link_count = 0;
-+ return link_path_walk_it(name, nd, it);
-+}
-+
- int path_walk(const char * name, struct nameidata *nd)
- {
- current->total_link_count = 0;
-- return link_path_walk(name, nd);
-+ return link_path_walk_it(name, nd, NULL);
- }
-
- /* SMP-safe */
-@@ -753,6 +823,7 @@
- {
- nd->last_type = LAST_ROOT; /* if there are only slashes... */
- nd->flags = flags;
-+ nd->intent = NULL;
- if (*name=='/')
- return walk_init_root(name,nd);
- read_lock(¤t->fs->lock);
-@@ -767,7 +838,8 @@
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
-+ struct lookup_intent *it)
- {
- struct dentry * dentry;
- struct inode *inode;
-@@ -790,13 +862,16 @@
- goto out;
- }
-
-- dentry = cached_lookup(base, name, 0);
-+ dentry = cached_lookup(base, name, 0, it);
- if (!dentry) {
- struct dentry *new = d_alloc(base, name);
- dentry = ERR_PTR(-ENOMEM);
- if (!new)
- goto out;
- lock_kernel();
-+ if (inode->i_op->lookup_it)
-+ dentry = inode->i_op->lookup_it(inode, new, it, 0);
-+ else
- dentry = inode->i_op->lookup(inode, new);
- unlock_kernel();
- if (!dentry)
-@@ -808,6 +883,12 @@
- return dentry;
- }
-
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+{
-+ return lookup_hash_it(name, base, NULL);
-+}
-+
-+
- /* SMP-safe */
- struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
- {
-@@ -829,7 +910,7 @@
- }
- this.hash = end_name_hash(hash);
-
-- return lookup_hash(&this, base);
-+ return lookup_hash_it(&this, base, NULL);
- access:
- return ERR_PTR(-EACCES);
- }
-@@ -861,6 +942,23 @@
- return err;
- }
-
-+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
-+ struct lookup_intent *it)
-+{
-+ char *tmp;
-+ int err;
-+
-+ tmp = getname(name);
-+ err = PTR_ERR(tmp);
-+ if (!IS_ERR(tmp)) {
-+ err = 0;
-+ if (path_init(tmp, flags, nd))
-+ err = path_walk_it(tmp, nd, it);
-+ putname(tmp);
-+ }
-+ return err;
-+}
-+
- /*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
-@@ -958,7 +1056,8 @@
- return retval;
- }
-
--int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
-+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode,
-+ struct lookup_intent *it)
- {
- int error;
-
-@@ -971,12 +1070,15 @@
- goto exit_lock;
-
- error = -EACCES; /* shouldn't it be ENOSYS? */
-- if (!dir->i_op || !dir->i_op->create)
-+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it))
- goto exit_lock;
-
- DQUOT_INIT(dir);
- lock_kernel();
-- error = dir->i_op->create(dir, dentry, mode);
-+ if (dir->i_op->create_it)
-+ error = dir->i_op->create_it(dir, dentry, mode, it);
-+ else
-+ error = dir->i_op->create(dir, dentry, mode);
- unlock_kernel();
- exit_lock:
- up(&dir->i_zombie);
-@@ -985,6 +1087,11 @@
- return error;
- }
-
-+int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
-+{
-+ return vfs_create_it(dir, dentry, mode, NULL);
-+}
-+
- /*
- * open_namei()
- *
-@@ -999,7 +1106,8 @@
- * for symlinks (where the permissions are checked later).
- * SMP-safe
- */
--int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
-+int open_namei_it(const char *pathname, int flag, int mode,
-+ struct nameidata *nd, struct lookup_intent *it)
- {
- int acc_mode, error = 0;
- struct inode *inode;
-@@ -1009,12 +1117,14 @@
-
- acc_mode = ACC_MODE(flag);
-
-+ if (it)
-+ it->it_flags = flag;
- /*
- * The simplest case - just a plain lookup.
- */
- if (!(flag & O_CREAT)) {
- if (path_init(pathname, lookup_flags(flag), nd))
-- error = path_walk(pathname, nd);
-+ error = path_walk_it(pathname, nd, it);
- if (error)
- return error;
- dentry = nd->dentry;
-@@ -1024,6 +1134,10 @@
- /*
- * Create - we need to know the parent.
- */
-+ if (it) {
-+ it->it_create_mode = mode;
-+ it->it_op |= IT_CREAT;
-+ }
- if (path_init(pathname, LOOKUP_PARENT, nd))
- error = path_walk(pathname, nd);
- if (error)
-@@ -1040,7 +1154,7 @@
-
- dir = nd->dentry;
- down(&dir->d_inode->i_sem);
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-
- do_last:
- error = PTR_ERR(dentry);
-@@ -1049,11 +1163,13 @@
- goto exit;
- }
-
-+ it->it_create_mode = mode;
- /* Negative dentry, just create the file */
- if (!dentry->d_inode) {
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current->fs->umask;
-- error = vfs_create(dir->d_inode, dentry, mode);
-+ error = vfs_create_it(dir->d_inode, dentry,
-+ mode & ~current->fs->umask, it);
- up(&dir->d_inode->i_sem);
- #ifndef DENTRY_WASTE_RAM
- if (error)
-@@ -1161,7 +1277,7 @@
- if (!error) {
- DQUOT_INIT(inode);
-
-- error = do_truncate(dentry, 0);
-+ error = do_truncate(dentry, 0, 1);
- }
- put_write_access(inode);
- if (error)
-@@ -1173,8 +1289,10 @@
- return 0;
-
- exit_dput:
-+ intent_release(it);
- dput(dentry);
- exit:
-+ intent_release(it);
- path_release(nd);
- return error;
-
-@@ -1193,7 +1311,10 @@
- * are done. Procfs-like symlinks just set LAST_BIND.
- */
- UPDATE_ATIME(dentry->d_inode);
-+ nd->intent = it;
- error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+ if (error)
-+ intent_release(it);
- dput(dentry);
- if (error)
- return error;
-@@ -1215,13 +1336,20 @@
- }
- dir = nd->dentry;
- down(&dir->d_inode->i_sem);
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- putname(nd->last.name);
- goto do_last;
- }
-
-+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
-+{
-+ return open_namei_it(pathname, flag, mode, nd, NULL);
-+}
-+
-+
- /* SMP-safe */
--static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
-+static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
-+ struct lookup_intent *it)
- {
- struct dentry *dentry;
-
-@@ -1229,7 +1357,7 @@
- dentry = ERR_PTR(-EEXIST);
- if (nd->last_type != LAST_NORM)
- goto fail;
-- dentry = lookup_hash(&nd->last, nd->dentry);
-+ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- if (IS_ERR(dentry))
- goto fail;
- if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1286,7 +1414,20 @@
- error = path_walk(tmp, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 0);
-+
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->mknod_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->mknod_raw(&nd, mode, dev);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+
-+ dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(dentry);
-
- if (!IS_POSIXACL(nd.dentry->d_inode))
-@@ -1308,6 +1445,7 @@
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+out2:
- path_release(&nd);
- out:
- putname(tmp);
-@@ -1356,7 +1494,18 @@
- error = path_walk(tmp, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 1);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->mkdir_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->mkdir_raw(&nd, mode);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+ dentry = lookup_create(&nd, 1, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- if (!IS_POSIXACL(nd.dentry->d_inode))
-@@ -1365,6 +1510,7 @@
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+out2:
- path_release(&nd);
- out:
- putname(tmp);
-@@ -1466,8 +1612,16 @@
- error = -EBUSY;
- goto exit1;
- }
-+ if (nd.dentry->d_inode->i_op->rmdir_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ error = op->rmdir_raw(&nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit1;
-+ }
- down(&nd.dentry->d_inode->i_sem);
-- dentry = lookup_hash(&nd.last, nd.dentry);
-+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1526,8 +1680,15 @@
- error = -EISDIR;
- if (nd.last_type != LAST_NORM)
- goto exit1;
-+ if (nd.dentry->d_inode->i_op->unlink_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->unlink_raw(&nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit1;
-+ }
- down(&nd.dentry->d_inode->i_sem);
-- dentry = lookup_hash(&nd.last, nd.dentry);
-+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- /* Why not before? Because we want correct error value */
-@@ -1595,15 +1756,27 @@
- error = path_walk(to, &nd);
- if (error)
- goto out;
-- dentry = lookup_create(&nd, 0);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out2;
-+ }
-+ if (nd.dentry->d_inode->i_op->symlink_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->symlink_raw(&nd, from);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out2;
-+ }
-+ dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
- error = vfs_symlink(nd.dentry->d_inode, dentry, from);
- dput(dentry);
- }
- up(&nd.dentry->d_inode->i_sem);
-+ out2:
- path_release(&nd);
--out:
-+ out:
- putname(to);
- }
- putname(from);
-@@ -1686,7 +1855,14 @@
- error = -EXDEV;
- if (old_nd.mnt != nd.mnt)
- goto out_release;
-- new_dentry = lookup_create(&nd, 0);
-+ if (nd.last_type != LAST_NORM) {
-+ error = -EEXIST;
-+ goto out_release;
-+ }
-+ if (nd.dentry->d_inode->i_op->link_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->link_raw(&old_nd, &nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out_release;
-+ }
-+ new_dentry = lookup_create(&nd, 0, NULL);
- error = PTR_ERR(new_dentry);
- if (!IS_ERR(new_dentry)) {
- error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1732,7 +1908,7 @@
- * locking].
- */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- int error;
- struct inode *target;
-@@ -1811,7 +1987,7 @@
- }
-
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- int error;
-
-@@ -1902,9 +2078,18 @@
- if (newnd.last_type != LAST_NORM)
- goto exit2;
-
-+ if (old_dir->d_inode->i_op->rename_raw) {
-+ lock_kernel();
-+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd);
-+ unlock_kernel();
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto exit2;
-+ }
-+
- double_lock(new_dir, old_dir);
-
-- old_dentry = lookup_hash(&oldnd.last, old_dir);
-+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL);
- error = PTR_ERR(old_dentry);
- if (IS_ERR(old_dentry))
- goto exit3;
-@@ -1920,16 +2105,16 @@
- if (newnd.last.name[newnd.last.len])
- goto exit4;
- }
-- new_dentry = lookup_hash(&newnd.last, new_dir);
-+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
-
-+
- lock_kernel();
- error = vfs_rename(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry);
- unlock_kernel();
--
- dput(new_dentry);
- exit4:
- dput(old_dentry);
-@@ -1980,20 +2165,26 @@
- }
-
- static inline int
--__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link,
-+ struct lookup_intent *it)
- {
- int res = 0;
- char *name;
- if (IS_ERR(link))
- goto fail;
-
-+ if (it == NULL)
-+ it = nd->intent;
-+ else if (it != nd->intent)
-+ printk("it != nd->intent: tell phil@clusterfs.com\n");
-+
- if (*link == '/') {
- path_release(nd);
- if (!walk_init_root(link, nd))
- /* weird __emul_prefix() stuff did it */
- goto out;
- }
-- res = link_path_walk(link, nd);
-+ res = link_path_walk_it(link, nd, it);
- out:
- if (current->link_count || res || nd->last_type!=LAST_NORM)
- return res;
-@@ -2015,7 +2206,13 @@
-
- int vfs_follow_link(struct nameidata *nd, const char *link)
- {
-- return __vfs_follow_link(nd, link);
-+ return __vfs_follow_link(nd, link, NULL);
-+}
-+
-+int vfs_follow_link_it(struct nameidata *nd, const char *link,
-+ struct lookup_intent *it)
-+{
-+ return __vfs_follow_link(nd, link, it);
- }
-
- /* get the link contents into pagecache */
-@@ -2057,7 +2254,7 @@
- {
- struct page *page = NULL;
- char *s = page_getlink(dentry, &page);
-- int res = __vfs_follow_link(nd, s);
-+ int res = __vfs_follow_link(nd, s, NULL);
- if (page) {
- kunmap(page);
- page_cache_release(page);
-Index: linux-2.4.19.SuSE/fs/namespace.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/namespace.c Mon Jan 27 05:08:07 2003
-+++ linux-2.4.19.SuSE/fs/namespace.c Sat Nov 15 17:56:42 2003
-@@ -97,6 +97,7 @@
- {
- old_nd->dentry = mnt->mnt_mountpoint;
- old_nd->mnt = mnt->mnt_parent;
-+ UNPIN(old_nd->dentry, old_nd->mnt, 1);
- mnt->mnt_parent = mnt;
- mnt->mnt_mountpoint = mnt->mnt_root;
- list_del_init(&mnt->mnt_child);
-@@ -108,6 +109,7 @@
- {
- mnt->mnt_parent = mntget(nd->mnt);
- mnt->mnt_mountpoint = dget(nd->dentry);
-+ PIN(nd->dentry, nd->mnt, 1);
- list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
- list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
- nd->dentry->d_mounted++;
-@@ -491,15 +493,18 @@
- {
- struct nameidata old_nd;
- struct vfsmount *mnt = NULL;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int err = mount_is_safe(nd);
- if (err)
- return err;
- if (!old_name || !*old_name)
- return -EINVAL;
- if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd))
-- err = path_walk(old_name, &old_nd);
-- if (err)
-+ err = path_walk_it(old_name, &old_nd, &it);
-+ if (err) {
-+ intent_release(&it);
- return err;
-+ }
-
- down_write(¤t->namespace->sem);
- err = -EINVAL;
-@@ -522,6 +527,7 @@
- }
-
- up_write(¤t->namespace->sem);
-+ intent_release(&it);
- path_release(&old_nd);
- return err;
- }
-@@ -725,6 +731,7 @@
- unsigned long flags, void *data_page)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int retval = 0;
- int mnt_flags = 0;
-
-@@ -750,9 +757,11 @@
-
- /* ... and get the mountpoint */
- if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
-- retval = path_walk(dir_name, &nd);
-- if (retval)
-+ retval = path_walk_it(dir_name, &nd, &it);
-+ if (retval) {
-+ intent_release(&it);
- return retval;
-+ }
-
- if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
-@@ -764,6 +773,8 @@
- else
- retval = do_add_mount(&nd, type_page, flags, mnt_flags,
- dev_name, data_page);
-+
-+ intent_release(&it);
- path_release(&nd);
- return retval;
- }
-@@ -929,6 +940,8 @@
- {
- struct vfsmount *tmp;
- struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
-+ struct lookup_intent new_it = { .it_op = IT_GETATTR };
-+ struct lookup_intent old_it = { .it_op = IT_GETATTR };
- char *name;
- int error;
-
-@@ -943,7 +956,7 @@
- goto out0;
- error = 0;
- if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd))
-- error = path_walk(name, &new_nd);
-+ error = path_walk_it(name, &new_nd, &new_it);
- putname(name);
- if (error)
- goto out0;
-@@ -957,7 +970,7 @@
- goto out1;
- error = 0;
- if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd))
-- error = path_walk(name, &old_nd);
-+ error = path_walk_it(name, &old_nd, &old_it);
- putname(name);
- if (error)
- goto out1;
-@@ -1013,8 +1026,10 @@
- up(&old_nd.dentry->d_inode->i_zombie);
- up_write(¤t->namespace->sem);
- path_release(&user_nd);
-+ intent_release(&old_it);
- path_release(&old_nd);
- out1:
-+ intent_release(&new_it);
- path_release(&new_nd);
- out0:
- unlock_kernel();
-Index: linux-2.4.19.SuSE/fs/open.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/open.c Mon Jan 27 05:08:00 2003
-+++ linux-2.4.19.SuSE/fs/open.c Sat Nov 15 17:43:27 2003
-@@ -19,6 +19,8 @@
- #include <asm/uaccess.h>
-
- #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
-+extern int path_walk_it(const char *name, struct nameidata *nd,
-+ struct lookup_intent *it);
-
- int vfs_statfs(struct super_block *sb, struct statfs *buf)
- {
-@@ -95,9 +97,10 @@
- write_unlock(&files->file_lock);
- }
-
--int do_truncate(struct dentry *dentry, loff_t length)
-+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
- {
- struct inode *inode = dentry->d_inode;
-+ struct inode_operations *op = dentry->d_inode->i_op;
- int error;
- struct iattr newattrs;
-
-@@ -108,7 +111,13 @@
- down(&inode->i_sem);
- newattrs.ia_size = length;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-- error = notify_change(dentry, &newattrs);
-+ if (called_from_open)
-+ newattrs.ia_valid |= ATTR_FROM_OPEN;
-+ if (op->setattr_raw) {
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ } else
-+ error = notify_change(dentry, &newattrs);
- up(&inode->i_sem);
- return error;
- }
-@@ -118,12 +127,13 @@
- struct nameidata nd;
- struct inode * inode;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- error = -EINVAL;
- if (length < 0) /* sorry, but loff_t says... */
- goto out;
-
-- error = user_path_walk(path, &nd);
-+ error = user_path_walk_it(path, &nd, &it);
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-@@ -163,11 +173,13 @@
- error = locks_verify_truncate(inode, NULL, length);
- if (!error) {
- DQUOT_INIT(inode);
-- error = do_truncate(nd.dentry, length);
-+ intent_release(&it);
-+ error = do_truncate(nd.dentry, length, 0);
- }
- put_write_access(inode);
-
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
-@@ -215,7 +227,7 @@
-
- error = locks_verify_truncate(inode, file, length);
- if (!error)
-- error = do_truncate(dentry, length);
-+ error = do_truncate(dentry, length, 0);
- out_putf:
- fput(file);
- out:
-@@ -260,11 +272,13 @@
- struct inode * inode;
- struct iattr newattrs;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, NULL);
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-
-+ /* this is safe without a Lustre lock because it only depends
-+ on the super block */
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
-@@ -279,11 +293,25 @@
- goto dput_and_out;
-
- newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-- } else {
-+ }
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto dput_and_out;
-+ }
-+
-+ error = -EPERM;
-+ if (!times) {
- if (current->fsuid != inode->i_uid &&
- (error = permission(inode,MAY_WRITE)) != 0)
- goto dput_and_out;
- }
-+
- error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
- path_release(&nd);
-@@ -304,12 +332,14 @@
- struct inode * inode;
- struct iattr newattrs;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, NULL);
-
- if (error)
- goto out;
- inode = nd.dentry->d_inode;
-
-+ /* this is safe without a Lustre lock because it only depends
-+ on the super block */
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
-@@ -324,7 +354,20 @@
- newattrs.ia_atime = times[0].tv_sec;
- newattrs.ia_mtime = times[1].tv_sec;
- newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-- } else {
-+ }
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto dput_and_out;
-+ }
-+
-+ error = -EPERM;
-+ if (!utimes) {
- if (current->fsuid != inode->i_uid &&
- (error = permission(inode,MAY_WRITE)) != 0)
- goto dput_and_out;
-@@ -347,6 +390,7 @@
- int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
- int res;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
- return -EINVAL;
-@@ -364,13 +408,14 @@
- else
- current->cap_effective = current->cap_permitted;
-
-- res = user_path_walk(filename, &nd);
-+ res = user_path_walk_it(filename, &nd, &it);
- if (!res) {
- res = permission(nd.dentry->d_inode, mode);
- /* SuS v2 requires we report a read only fs too */
- if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
- && !special_file(nd.dentry->d_inode->i_mode))
- res = -EROFS;
-+ intent_release(&it);
- path_release(&nd);
- }
-
-@@ -386,6 +431,7 @@
- int error;
- struct nameidata nd;
- char *name;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- name = getname(filename);
- error = PTR_ERR(name);
-@@ -394,7 +440,7 @@
-
- error = 0;
- if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
-- error = path_walk(name, &nd);
-+ error = path_walk_it(name, &nd, &it);
- putname(name);
- if (error)
- goto out;
-@@ -406,6 +452,7 @@
- set_fs_pwd(current->fs, nd.mnt, nd.dentry);
-
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
-@@ -446,6 +493,7 @@
- int error;
- struct nameidata nd;
- char *name;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
- name = getname(filename);
- error = PTR_ERR(name);
-@@ -454,7 +502,7 @@
-
- path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
-- error = path_walk(name, &nd);
-+ error = path_walk_it(name, &nd, &it);
- putname(name);
- if (error)
- goto out;
-@@ -471,39 +519,56 @@
- set_fs_altroot();
- error = 0;
- dput_and_out:
-+ intent_release(&it);
- path_release(&nd);
- out:
- return error;
- }
-
--asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
-+int chmod_common(struct dentry *dentry, mode_t mode)
- {
-- struct inode * inode;
-- struct dentry * dentry;
-- struct file * file;
-- int err = -EBADF;
-+ struct inode *inode = dentry->d_inode;
- struct iattr newattrs;
-+ int err = -EROFS;
-
-- file = fget(fd);
-- if (!file)
-+ if (IS_RDONLY(inode))
- goto out;
-
-- dentry = file->f_dentry;
-- inode = dentry->d_inode;
-+ if (inode->i_op->setattr_raw) {
-+ newattrs.ia_mode = mode;
-+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-+ newattrs.ia_valid |= ATTR_RAW;
-+ err = inode->i_op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (err != -EOPNOTSUPP)
-+ goto out;
-+ }
-
-- err = -EROFS;
-- if (IS_RDONLY(inode))
-- goto out_putf;
- err = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-- goto out_putf;
-+ goto out;
-+
- if (mode == (mode_t) -1)
- mode = inode->i_mode;
- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs);
-
--out_putf:
-+out:
-+ return err;
-+}
-+
-+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
-+{
-+ struct file * file;
-+ int err = -EBADF;
-+
-+ file = fget(fd);
-+ if (!file)
-+ goto out;
-+
-+ err = chmod_common(file->f_dentry, mode);
-+
- fput(file);
- out:
- return err;
-@@ -512,30 +577,14 @@
- asmlinkage long sys_chmod(const char * filename, mode_t mode)
- {
- struct nameidata nd;
-- struct inode * inode;
- int error;
-- struct iattr newattrs;
-
- error = user_path_walk(filename, &nd);
- if (error)
- goto out;
-- inode = nd.dentry->d_inode;
--
-- error = -EROFS;
-- if (IS_RDONLY(inode))
-- goto dput_and_out;
-
-- error = -EPERM;
-- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-- goto dput_and_out;
-+ error = chmod_common(nd.dentry, mode);
-
-- if (mode == (mode_t) -1)
-- mode = inode->i_mode;
-- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
-- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-- error = notify_change(nd.dentry, &newattrs);
--
--dput_and_out:
- path_release(&nd);
- out:
- return error;
-@@ -555,6 +604,20 @@
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto out;
-+
-+ if (inode->i_op->setattr_raw) {
-+ struct inode_operations *op = dentry->d_inode->i_op;
-+
-+ newattrs.ia_uid = user;
-+ newattrs.ia_gid = group;
-+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME;
-+ newattrs.ia_valid |= ATTR_RAW;
-+ error = op->setattr_raw(inode, &newattrs);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ return error;
-+ }
-+
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
-@@ -659,6 +722,7 @@
- {
- int namei_flags, error;
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_OPEN };
-
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
-@@ -666,14 +730,15 @@
- if (namei_flags & O_TRUNC)
- namei_flags |= 2;
-
-- error = open_namei(filename, namei_flags, mode, &nd);
-- if (!error)
-- return dentry_open(nd.dentry, nd.mnt, flags);
-+ error = open_namei_it(filename, namei_flags, mode, &nd, &it);
-+ if (error)
-+ return ERR_PTR(error);
-
-- return ERR_PTR(error);
-+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
- }
-
--struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+ int flags, struct lookup_intent *it)
- {
- struct file * f;
- struct inode *inode;
-@@ -710,7 +775,9 @@
- }
-
- if (f->f_op && f->f_op->open) {
-+ f->f_it = it;
- error = f->f_op->open(inode,f);
-+ f->f_it = NULL;
- if (error)
- goto cleanup_all;
- }
-@@ -722,6 +789,7 @@
- !inode->i_mapping->a_ops->direct_IO))
- goto cleanup_all;
-
-+ intent_release(it);
- return f;
-
- cleanup_all:
-@@ -736,11 +804,17 @@
- cleanup_file:
- put_filp(f);
- cleanup_dentry:
-+ intent_release(it);
- dput(dentry);
- mntput(mnt);
- return ERR_PTR(error);
- }
-
-+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+{
-+ return dentry_open_it(dentry, mnt, flags, NULL);
-+}
-+
- /*
- * Find an empty file descriptor entry, and mark it busy.
- */
-Index: linux-2.4.19.SuSE/fs/stat.c
-===================================================================
---- linux-2.4.19.SuSE.orig/fs/stat.c Mon Jan 27 05:08:00 2003
-+++ linux-2.4.19.SuSE/fs/stat.c Sat Nov 15 17:29:03 2003
-@@ -17,10 +17,16 @@
- * Revalidate the inode. This is required for proper NFS attribute caching.
- */
- static __inline__ int
--do_revalidate(struct dentry *dentry)
-+do_revalidate(struct dentry *dentry, struct lookup_intent *it)
- {
- struct inode * inode = dentry->d_inode;
-- if (inode->i_op && inode->i_op->revalidate)
-+ if (inode->i_op && inode->i_op->revalidate_it)
-+ return inode->i_op->revalidate_it(dentry, it);
-+ else if (inode->i_op && inode->i_op->revalidate)
- return inode->i_op->revalidate(dentry);
- return 0;
- }
-@@ -141,13 +145,15 @@
- asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -157,13 +163,15 @@
- asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -178,13 +186,15 @@
- asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -195,13 +205,15 @@
- asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
- {
- struct nameidata nd;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
- int error;
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -222,7 +234,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_old_stat(dentry->d_inode, statbuf);
- fput(f);
-@@ -241,7 +253,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_new_stat(dentry->d_inode, statbuf);
- fput(f);
-@@ -263,7 +275,7 @@
-
- error = -EINVAL;
- if (inode->i_op && inode->i_op->readlink &&
-- !(error = do_revalidate(nd.dentry))) {
-+ !(error = do_revalidate(nd.dentry, NULL))) {
- UPDATE_ATIME(inode);
- error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
- }
-@@ -339,12 +351,14 @@
- {
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
-- error = user_path_walk(filename, &nd);
-+ error = user_path_walk_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -354,12 +368,14 @@
- {
- struct nameidata nd;
- int error;
-+ struct lookup_intent it = { .it_op = IT_GETATTR };
-
-- error = user_path_walk_link(filename, &nd);
-+ error = user_path_walk_link_it(filename, &nd, &it);
- if (!error) {
-- error = do_revalidate(nd.dentry);
-+ error = do_revalidate(nd.dentry, &it);
- if (!error)
- error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+ intent_release(&it);
- path_release(&nd);
- }
- return error;
-@@ -374,7 +390,7 @@
- if (f) {
- struct dentry * dentry = f->f_dentry;
-
-- err = do_revalidate(dentry);
-+ err = do_revalidate(dentry, NULL);
- if (!err)
- err = cp_new_stat64(dentry->d_inode, statbuf);
- fput(f);
-Index: linux-2.4.19.SuSE/include/linux/dcache.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Mon Jan 27 05:13:15 2003
-+++ linux-2.4.19.SuSE/include/linux/dcache.h Sat Nov 15 17:35:46 2003
-@@ -5,6 +5,51 @@
-
- #include <asm/atomic.h>
- #include <linux/mount.h>
-+#include <linux/string.h>
-+
-+#define IT_OPEN 0x0001
-+#define IT_CREAT 0x0002
-+#define IT_READDIR 0x0004
-+#define IT_GETATTR 0x0008
-+#define IT_LOOKUP 0x0010
-+#define IT_UNLINK 0x0020
-+#define IT_GETXATTR 0x0040
-+#define IT_EXEC 0x0080
-+#define IT_PIN 0x0100
-+
-+#define IT_FL_LOCKED 0x0001
-+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */
-+
-+#define INTENT_MAGIC 0x19620323
-+
-+
-+struct lustre_intent_data {
-+ int it_disposition;
-+ int it_status;
-+ __u64 it_lock_handle;
-+ void *it_data;
-+ int it_lock_mode;
-+ int it_int_flags;
-+};
-+struct lookup_intent {
-+ int it_magic;
-+ void (*it_op_release)(struct lookup_intent *);
-+ int it_op;
-+ int it_flags;
-+ int it_create_mode;
-+ union {
-+ struct lustre_intent_data lustre;
-+ } d;
-+};
-+
-+static inline void intent_init(struct lookup_intent *it, int op, int flags)
-+{
-+ memset(it, 0, sizeof(*it));
-+ it->it_magic = INTENT_MAGIC;
-+ it->it_op = op;
-+ it->it_flags = flags;
-+}
-+
-
- /*
- * linux/include/linux/dcache.h
-@@ -92,8 +137,22 @@
- int (*d_delete)(struct dentry *);
- void (*d_release)(struct dentry *);
- void (*d_iput)(struct dentry *, struct inode *);
-+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *);
-+ void (*d_pin)(struct dentry *, struct vfsmount * , int);
-+ void (*d_unpin)(struct dentry *, struct vfsmount *, int);
- };
-
-+#define PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \
-+ de->d_op->d_pin(de, mnt, flag);
-+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \
-+ de->d_op->d_unpin(de, mnt, flag);
-+
-+
-+/* defined in fs/namei.c */
-+extern void intent_release(struct lookup_intent *it);
-+/* defined in fs/dcache.c */
-+extern void __d_rehash(struct dentry * entry, int lock);
-+
- /* the dentry parameter passed to d_hash and d_compare is the parent
- * directory of the entries to be compared. It is used in case these
- * functions need any directory specific information for determining
-@@ -125,6 +184,7 @@
- * s_nfsd_free_path semaphore will be down
- */
- #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
-+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */
-
- extern spinlock_t dcache_lock;
-
-Index: linux-2.4.19.SuSE/include/linux/fs.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sat Nov 15 17:25:06 2003
-+++ linux-2.4.19.SuSE/include/linux/fs.h Sat Nov 15 17:29:03 2003
-@@ -73,6 +73,7 @@
-
- #define FMODE_READ 1
- #define FMODE_WRITE 2
-+#define FMODE_EXEC 4
-
- #define READ 0
- #define WRITE 1
-@@ -363,6 +364,9 @@
- #define ATTR_MTIME_SET 256
- #define ATTR_FORCE 512 /* Not a change, but a change it */
- #define ATTR_ATTR_FLAG 1024
-+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */
-+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */
-+#define ATTR_CTIME_SET 0x2000
-
- /*
- * This is the Inode Attributes structure, used for notify_change(). It
-@@ -507,6 +511,7 @@
- struct pipe_inode_info *i_pipe;
- struct block_device *i_bdev;
- struct char_device *i_cdev;
-+ void *i_filterdata;
-
- unsigned long i_dnotify_mask; /* Directory notify events */
- struct dnotify_struct *i_dnotify; /* for directory notifications */
-@@ -669,6 +674,7 @@
-
- /* needed for tty driver, and maybe others */
- void *private_data;
-+ struct lookup_intent *f_it;
-
- /* preallocated helper kiobuf to speedup O_DIRECT */
- struct kiobuf *f_iobuf;
-@@ -799,6 +805,7 @@
- struct qstr last;
- unsigned int flags;
- int last_type;
-+ struct lookup_intent *intent;
- };
-
- #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */
-@@ -947,7 +954,8 @@
- extern int __vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-+ struct inode *new_dir, struct dentry *new_dentry);
-
- /*
- * File types
-@@ -1020,21 +1028,32 @@
-
- struct inode_operations {
- int (*create) (struct inode *,struct dentry *,int);
-+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *);
- struct dentry * (*lookup) (struct inode *,struct dentry *);
-+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags);
- int (*link) (struct dentry *,struct inode *,struct dentry *);
-+ int (*link_raw) (struct nameidata *,struct nameidata *);
- int (*unlink) (struct inode *,struct dentry *);
-+ int (*unlink_raw) (struct nameidata *);
- int (*symlink) (struct inode *,struct dentry *,const char *);
-+ int (*symlink_raw) (struct nameidata *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,int);
-+ int (*mkdir_raw) (struct nameidata *,int);
- int (*rmdir) (struct inode *,struct dentry *);
-+ int (*rmdir_raw) (struct nameidata *);
- int (*mknod) (struct inode *,struct dentry *,int,int);
-+ int (*mknod_raw) (struct nameidata *,int,dev_t);
- int (*rename) (struct inode *, struct dentry *,
- struct inode *, struct dentry *);
-+ int (*rename_raw) (struct nameidata *, struct nameidata *);
- int (*readlink) (struct dentry *, char *,int);
- int (*follow_link) (struct dentry *, struct nameidata *);
- void (*truncate) (struct inode *);
- int (*permission) (struct inode *, int);
- int (*revalidate) (struct dentry *);
-+ int (*revalidate_it) (struct dentry *, struct lookup_intent *);
- int (*setattr) (struct dentry *, struct iattr *);
-+ int (*setattr_raw) (struct inode *, struct iattr *);
- int (*getattr) (struct dentry *, struct iattr *);
- int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
- ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
-@@ -1244,10 +1263,14 @@
-
- asmlinkage long sys_open(const char *, int, int);
- asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */
--extern int do_truncate(struct dentry *, loff_t start);
-+extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
-
- extern struct file *filp_open(const char *, int, int);
- extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+ struct nameidata *nd, struct lookup_intent *it);
-+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+ int flags, struct lookup_intent *it);
- extern int filp_close(struct file *, fl_owner_t id);
- extern char * getname(const char *);
-
-@@ -1515,6 +1538,7 @@
- extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
-
- extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
-+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
- extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
- extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1526,6 +1550,8 @@
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
-+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
-
- extern void iput(struct inode *);
- extern void force_delete(struct inode *);
-@@ -1646,6 +1672,8 @@
-
- extern int vfs_readlink(struct dentry *, char *, int, const char *);
- extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *,
-+ struct lookup_intent *it);
- extern int page_readlink(struct dentry *, char *, int);
- extern int page_follow_link(struct dentry *, struct nameidata *);
- extern struct inode_operations page_symlink_inode_operations;
-Index: linux-2.4.19.SuSE/include/linux/fs_struct.h
-===================================================================
---- linux-2.4.19.SuSE.orig/include/linux/fs_struct.h Fri Jul 13 15:10:44 2001
-+++ linux-2.4.19.SuSE/include/linux/fs_struct.h Sat Nov 15 17:29:03 2003
-@@ -34,10 +34,12 @@
- write_lock(&fs->lock);
- old_root = fs->root;
- old_rootmnt = fs->rootmnt;
-+ PIN(dentry, mnt, 1);
- fs->rootmnt = mntget(mnt);
- fs->root = dget(dentry);
- write_unlock(&fs->lock);
- if (old_root) {
-+ UNPIN(old_root, old_rootmnt, 1);
- dput(old_root);
- mntput(old_rootmnt);
- }
-@@ -57,10 +59,12 @@
- write_lock(&fs->lock);
- old_pwd = fs->pwd;
- old_pwdmnt = fs->pwdmnt;
-+ PIN(dentry, mnt, 0);
- fs->pwdmnt = mntget(mnt);
- fs->pwd = dget(dentry);
- write_unlock(&fs->lock);
- if (old_pwd) {
-+ UNPIN(old_pwd, old_pwdmnt, 0);
- dput(old_pwd);
- mntput(old_pwdmnt);
- }
-Index: linux-2.4.19.SuSE/kernel/exit.c
-===================================================================
---- linux-2.4.19.SuSE.orig/kernel/exit.c Mon Jan 27 05:08:16 2003
-+++ linux-2.4.19.SuSE/kernel/exit.c Sat Nov 15 17:29:03 2003
-@@ -288,11 +288,14 @@
- {
- /* No need to hold fs->lock if we are killing it */
- if (atomic_dec_and_test(&fs->count)) {
-+ UNPIN(fs->pwd, fs->pwdmnt, 0);
-+ UNPIN(fs->root, fs->rootmnt, 1);
- dput(fs->root);
- mntput(fs->rootmnt);
- dput(fs->pwd);
- mntput(fs->pwdmnt);
- if (fs->altroot) {
-+ UNPIN(fs->altroot, fs->altrootmnt, 1);
- dput(fs->altroot);
- mntput(fs->altrootmnt);
- }
-Index: linux-2.4.19.SuSE/kernel/fork.c
-===================================================================
---- linux-2.4.19.SuSE.orig/kernel/fork.c Mon Jan 27 05:08:56 2003
-+++ linux-2.4.19.SuSE/kernel/fork.c Sat Nov 15 17:29:03 2003
-@@ -454,10 +454,13 @@
- fs->umask = old->umask;
- read_lock(&old->lock);
- fs->rootmnt = mntget(old->rootmnt);
-+ PIN(old->pwd, old->pwdmnt, 0);
-+ PIN(old->root, old->rootmnt, 1);
- fs->root = dget(old->root);
- fs->pwdmnt = mntget(old->pwdmnt);
- fs->pwd = dget(old->pwd);
- if (old->altroot) {
-+ PIN(old->altroot, old->altrootmnt, 1);
- fs->altrootmnt = mntget(old->altrootmnt);
- fs->altroot = dget(old->altroot);
- } else {
-Index: linux-2.4.19.SuSE/kernel/ksyms.c
-===================================================================
---- linux-2.4.19.SuSE.orig/kernel/ksyms.c Sat Nov 15 17:24:46 2003
-+++ linux-2.4.19.SuSE/kernel/ksyms.c Sat Nov 15 17:29:03 2003
-@@ -315,6 +315,7 @@
- EXPORT_SYMBOL(set_page_dirty);
- EXPORT_SYMBOL(vfs_readlink);
- EXPORT_SYMBOL(vfs_follow_link);
-+EXPORT_SYMBOL(vfs_follow_link_it);
- EXPORT_SYMBOL(page_readlink);
- EXPORT_SYMBOL(page_follow_link);
- EXPORT_SYMBOL(page_symlink_inode_operations);
+++ /dev/null
-Index: linux.mcp2/kernel/ksyms.c
-===================================================================
---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:57:48.000000000 -0700
-+++ linux.mcp2/kernel/ksyms.c 2004-05-05 15:32:44.000000000 -0700
-@@ -108,6 +108,7 @@
- EXPORT_SYMBOL(kfree);
- EXPORT_SYMBOL(vfree);
- EXPORT_SYMBOL(__vmalloc);
-+extern struct page * vmalloc_to_page(void *addr);
- EXPORT_SYMBOL(vmalloc_to_page);
- EXPORT_SYMBOL(mem_map);
- EXPORT_SYMBOL(remap_page_range);
+++ /dev/null
-dev_read_only_2.4.20-rh.patch
-exports_2.4.19-bgl.patch
-lustre_version.patch
-vfs_intent-2.4.19-bgl.patch
-invalidate_show-2.4.19-bgl.patch
-export-truncate-bgl.patch
-iod-stock-24-exports-2.4.19-bgl.patch
-ext3-htree-2.4.19-bgl.patch
-linux-2.4.19-bgl-xattr-0.8.54.patch
-ext3-2.4.20-fixes.patch
-ext3-2.4-ino_t.patch
-ext3-largefile.patch
-ext3-truncate_blocks.patch
-ext3-unmount_sync.patch
-ext3-use-after-free-2.4.19-pre1.patch
-ext3-orphan_lock.patch
-ext3-noread-2.4.20.patch
-ext3-delete_thread-2.4.20.patch
-extN-wantedi.patch
-ext3-san-2.4.20.patch
-ext3-map_inode_page.patch
-ext3-error-export.patch
-iopen-2.4.19-bgl.patch
-tcp-zero-copy-2.4.19-pre1.patch
-jbd-dont-account-blocks-twice.patch
-jbd-commit-tricks.patch
-ext3-no-write-super.patch
-add_page_private-2.4.19-bgl.patch
-socket-exports-2.4.19-bgl.patch
-removepage-2.4.20.patch
-jbd-ctx_switch.patch
-jbd-flushtime-2.4.19-suse.patch
-jbd-get_write_access.patch
-nfs_export_kernel-2.4.19-bgl.patch
-ext3-raw-lookup.patch
-ext3-ea-in-inode-2.4.20.patch
-listman-2.4.19-bgl.patch
-ext3-trusted_ea-2.4.20.patch
-jbd-2.4.19-pre1-jcberr.patch
-resched-2.4.19-pre1.patch
-ext3-xattr-ptr-arith-fix.patch
-vmalloc_to_page-2.4.19-bgl.patch
-procfs-ndynamic-2.4.patch
-ext3-truncate-buffer-head.patch
-kallsyms-2.4-bgl.patch
-kksymoops-2.4-bgl.patch
-export-show_task-2.4-bgl.patch