From ed1792ed91ebd96acf00fa99a61b3bb32c64d32f Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 6 Apr 2005 16:05:44 +0000 Subject: [PATCH] Branch: HEAD add FC3 kernel patches --- .../patches/dev_read_only-2.6.10-fc3.patch | 81 + .../patches/dynamic-locks-2.6.10-fc3.patch | 278 + .../patches/export-ext3-2.6.10-fc3.patch | 33 + .../patches/export-fedro-2.6.10.patch | 84 + .../patches/export_symbols-ext3-2.6.10-fc3.patch | 17 + .../patches/ext3-extents-2.6.10-fc3.patch | 2846 +++++ .../patches/ext3-extents-in-ea-2.6.10-fc3.patch | 361 + .../ext3-extents-in-ea-ioctl-2.6.10-fc3.patch | 230 + .../patches/ext3-mds-num-2.6.10-fc3.patch | 281 + .../patches/ext3-pdirops-2.6.10-fc3.patch | 1202 +++ .../patches/ext3-wantedi-2.6.10-fc3.patch | 192 + .../patches/hostfs_readdir_large.patch | 32 + .../kernel_patches/patches/iopen-2.6.10-fc3.patch | 476 + .../kernel_patches/patches/jbd-2.6.10-jcberr.patch | 222 + .../patches/jbd-buffer-release-2.6.10-fc3.patch | 399 + lustre/kernel_patches/patches/kgdb-ga.patch | 6358 +++++++++++ .../patches/linux-2.6.10-CITI_NFS4_ALL-1.patch | 10703 +++++++++++++++++++ .../patches/linux-2.6.10-fc3-left.patch | 1477 +++ .../patches/linux-2.6.10-fc3-lkcd.patch | 10676 ++++++++++++++++++ lustre/kernel_patches/patches/uml-2.6.10-fc3.patch | 3746 +++++++ .../vfs-dcache_locking-vanilla-2.6.10-fc3.patch | 113 + .../vfs-gns_export_doumount-2.6.10-fc3.patch | 34 + .../vfs-intent_api-vanilla-2.6.10-fc3.patch | 557 + .../vfs-lookup_last-vanilla-2.6.10-fc3.patch | 78 + .../patches/vfs-pdirops-2.6.10-fc3.patch | 274 + .../patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch | 235 + .../patches/vfs_gns-2.6.10-fc3.patch | 22 + 27 files changed, 41007 insertions(+) create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/export-fedro-2.6.10.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/hostfs_readdir_large.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch create mode 100644 lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch create mode 100644 lustre/kernel_patches/patches/uml-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch create mode 100644 
lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch new file mode 100644 index 0000000..1aec6f6 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch @@ -0,0 +1,81 @@ + drivers/block/ll_rw_blk.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/blkdev.h | 1 + 2 files changed, 50 insertions(+) + +Index: linux-2.6.10/drivers/block/ll_rw_blk.c +=================================================================== +--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c 2004-12-25 05:33:59.000000000 +0800 ++++ linux-2.6.10/drivers/block/ll_rw_blk.c 2005-04-05 15:42:58.075467024 +0800 +@@ -2679,6 +2679,13 @@ + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) + goto end_io; + ++ /* this is cfs's dev_rdonly check */ ++ if (bio->bi_rw == WRITE && ++ dev_check_rdonly(bio->bi_bdev->bd_dev)) { ++ bio_endio(bio, bio->bi_size, 0); ++ break; ++ } ++ + block_wait_queue_running(q); + + /* +@@ -3287,6 +3294,58 @@ + return queue_var_show(max_hw_sectors_kb, (page)); + } + ++#define MAX_RDONLY_DEVS 16 ++ ++static dev_t rdonly_devs[MAX_RDONLY_DEVS] = {0, }; ++ ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. ++ */ ++void dev_set_rdonly(struct block_device *bdev, int no_write) ++{ ++ if (no_write >= MAX_RDONLY_DEVS) { ++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n", ++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS); ++ return; ++ } ++ ++ if (bdev) { ++ printk(KERN_WARNING "Turning device %s read-only at %d\n", ++ bdev->bd_disk ? 
bdev->bd_disk->disk_name : "?", ++ no_write); ++ rdonly_devs[no_write] = bdev->bd_dev; ++ } ++} ++ ++void dev_clear_rdonly(int no_write) ++{ ++ if (no_write >= MAX_RDONLY_DEVS) { ++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n", ++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS); ++ return; ++ } ++ ++ if (rdonly_devs[no_write] == 0) ++ return; ++ ++ printk(KERN_WARNING "Clearing read-only at %d\n", no_write); ++ rdonly_devs[no_write] = 0; ++} ++ ++int dev_check_rdonly(dev_t dev) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_RDONLY_DEVS; i++) ++ if (rdonly_devs[i] == dev) ++ return 1; ++ return 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); + + static struct queue_sysfs_entry queue_requests_entry = { + .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch b/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch new file mode 100644 index 0000000..166deb6 --- /dev/null +++ b/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch @@ -0,0 +1,278 @@ + include/linux/dynlocks.h | 33 ++++++++++ + lib/Makefile | 4 - + lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 187 insertions(+), 2 deletions(-) + +Index: linux-2.6.10/lib/dynlocks.c +=================================================================== +--- linux-2.6.10.orig/lib/dynlocks.c 2005-03-31 16:59:29.399768040 +0800 ++++ linux-2.6.10/lib/dynlocks.c 2005-03-31 18:02:41.470646856 +0800 +@@ -0,0 +1,187 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++static kmem_cache_t * dynlock_cachep = NULL; ++ ++void __init dynlock_cache_init(void) ++{ ++ printk(KERN_INFO "init dynlocks cache\n"); ++ dynlock_cachep = kmem_cache_create("dynlock_cache", ++ sizeof(struct dynlock_member), ++ 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL, NULL); ++ if (dynlock_cachep == NULL) ++ panic("Can't create dynlock cache"); ++} ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++ dl->dl_magic = DYNLOCK_LIST_MAGIC; ++} ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. 
this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp) ++{ ++ struct dynlock_member *nhl = NULL; ++ struct dynlock_member *hl; ++ struct list_head *cur; ++ int num = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each(cur, &dl->dl_list) { ++ BUG_ON(cur->next == NULL); ++ BUG_ON(cur->prev == NULL); ++ hl = list_entry(cur, struct dynlock_member, dl_list); ++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC); ++ if (hl->dl_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated the ++ * lock we didn't find, so we drop ++ * the one we created ++ */ ++ kmem_cache_free(dynlock_cachep, nhl); ++ nhl = NULL; ++ } ++ hl->dl_refcount++; ++ goto found; ++ } ++ num++; ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we have already allocated a lock. use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dl_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated a lock yet. allocate it */ ++ nhl = kmem_cache_alloc(dynlock_cachep, gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dl_refcount = 1; ++ nhl->dl_value = value; ++ nhl->dl_readers = 0; ++ nhl->dl_writers = 0; ++ nhl->dl_magic = DYNLOCK_MAGIC; ++ init_waitqueue_head(&nhl->dl_wait); ++ ++ /* while the lock is being allocated, someone else may allocate it ++ * and put it onto the list. check for this situation ++ */ ++ goto repeat; ++ ++found: ++ if (rw) { ++ /* exclusive lock: the user doesn't want to share the lock at all ++ * NOTE: one process may take the same lock several times; ++ * this functionality is useful for rename operations */ ++ while ((hl->dl_writers && hl->dl_pid != current->pid) || ++ hl->dl_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, ++ hl->dl_writers == 0 && hl->dl_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_writers++; ++ } else { ++ /* shared lock: the user does not want to share the lock with a writer */ ++ while (hl->dl_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, hl->dl_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_readers++; ++ } ++ hl->dl_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * the user has to specify the lockspace (dl) and the pointer to the lock ++ * structure returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, void *lock) ++{ ++ struct dynlock_member *hl = lock; ++ int wakeup = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(hl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC); ++ BUG_ON(current->pid != hl->dl_pid); ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dl_writers) { ++ BUG_ON(hl->dl_readers > 0 || hl->dl_readers < 0); ++ hl->dl_writers--; ++ if (hl->dl_writers == 0) ++ wakeup = 1; ++ } else if (hl->dl_readers) { ++ hl->dl_readers--; ++ if (hl->dl_readers == 0) ++ wakeup = 1; ++ } else { ++ BUG_ON(1); ++ } ++ if (wakeup) { ++ hl->dl_pid = 0; ++ wake_up(&hl->dl_wait); ++ } ++ if (--(hl->dl_refcount) == 0) { ++ hl->dl_magic = DYNLOCK_MAGIC2; ++ list_del(&hl->dl_list); ++ kmem_cache_free(dynlock_cachep, hl); ++ } ++ spin_unlock(&dl->dl_list_lock); ++} ++ ++EXPORT_SYMBOL(dynlock_init); ++EXPORT_SYMBOL(dynlock_lock); ++EXPORT_SYMBOL(dynlock_unlock); ++
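The dynamic-locks patch above exports a small value-keyed locking API (dynlock_init/dynlock_lock/dynlock_unlock); later patches in this series key it by directory-entry hash through the per-inode i_htree_lock field visible in the ext3 hunks below. A minimal usage sketch follows; it is illustrative only and not part of the patch, and my_object/my_op are hypothetical names:

/* illustrative sketch only -- not part of the patch series */
#include <linux/dynlocks.h>

struct my_object {
	struct dynlock lockspace;	/* one lockspace, many keyed locks */
};

static void my_object_init(struct my_object *obj)
{
	dynlock_init(&obj->lockspace);
}

static int my_op(struct my_object *obj, unsigned long hash)
{
	void *lock;

	/* rw=1 takes the lock exclusive, rw=0 shared; the GFP flags cover
	 * the per-value dynlock_member allocation */
	lock = dynlock_lock(&obj->lockspace, hash, 1, GFP_NOFS);
	if (lock == NULL)
		return -ENOMEM;
	/* ... critical section for this hash value ... */
	dynlock_unlock(&obj->lockspace, lock);
	return 0;
}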
+Index: linux-2.6.10/lib/Makefile +=================================================================== +--- linux-2.6.10.orig/lib/Makefile 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/lib/Makefile 2005-03-31 18:03:16.727287032 +0800 +@@ -5,7 +5,7 @@ + lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ + bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ + kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \ +- bitmap.o extable.o kobject_uevent.o ++ bitmap.o extable.o kobject_uevent.o dynlocks.o + + ifeq ($(CONFIG_DEBUG_KOBJECT),y) + CFLAGS_kobject.o += -DDEBUG +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 17:02:41.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 18:02:41.474646248 +0800 +@@ -1655,6 +1655,7 @@ + + extern void bdev_cache_init(void); + extern void chrdev_init(void); ++extern void dynlock_cache_init(void); + + void __init vfs_caches_init_early(void) + { +@@ -1684,6 +1685,7 @@ + mnt_init(mempages); + bdev_cache_init(); + chrdev_init(); ++ dynlock_cache_init(); + } + + EXPORT_SYMBOL(d_alloc); +Index: linux-2.6.10/include/linux/dynlocks.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dynlocks.h 2005-03-31 16:59:29.399768040 +0800 ++++ linux-2.6.10/include/linux/dynlocks.h 2005-03-31 18:02:41.469647008 +0800 +@@ -0,0 +1,43 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++#define DYNLOCK_MAGIC 0xd19a10c ++#define DYNLOCK_MAGIC2 0xd1956ee ++ ++struct dynlock; ++ ++struct dynlock_member { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ unsigned long dl_value; /* lock value */ ++ int dl_refcount; /* number of users */ ++ int dl_readers; ++ int dl_writers; ++ int dl_pid; /* holder of the lock */ ++ wait_queue_head_t dl_wait; ++}; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++ ++#define DYNLOCK_LIST_MAGIC 0x11ee91e6 ++ ++struct dynlock { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++void dynlock_init(struct dynlock *dl); ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp); ++void dynlock_unlock(struct dynlock *dl, void *lock); ++ ++ ++#endif ++ diff --git a/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch b/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch new file mode 100644 index 0000000..449c4b9 --- /dev/null +++ b/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch @@ -0,0 +1,33 @@ +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 18:44:38.935933960 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 18:46:03.008153040 +0800 +@@ -123,6 +123,8 @@ + journal_abort_handle(handle); + } + ++EXPORT_SYMBOL(ext3_journal_abort_handle); ++ + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. 
+ * +@@ -2016,6 +2018,8 @@ + return ret; + } + ++EXPORT_SYMBOL(ext3_force_commit); ++ + /* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this +@@ -2447,6 +2451,10 @@ + unsigned long *blocks, int *created, int create); + EXPORT_SYMBOL(ext3_map_inode_page); + ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_set_handle); ++EXPORT_SYMBOL(ext3_bread); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); diff --git a/lustre/kernel_patches/patches/export-fedro-2.6.10.patch b/lustre/kernel_patches/patches/export-fedro-2.6.10.patch new file mode 100644 index 0000000..d724d6f --- /dev/null +++ b/lustre/kernel_patches/patches/export-fedro-2.6.10.patch @@ -0,0 +1,84 @@ +Index: linux-2.6.10/net/core/sock.c +=================================================================== +--- linux-2.6.10.orig/net/core/sock.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/core/sock.c 2005-03-31 20:42:01.084364672 +0800 +@@ -1359,6 +1359,7 @@ + EXPORT_SYMBOL(sk_alloc); + EXPORT_SYMBOL(sk_free); + EXPORT_SYMBOL(sk_send_sigurg); ++EXPORT_SYMBOL(sock_getsockopt); + EXPORT_SYMBOL(sock_alloc_send_pskb); + EXPORT_SYMBOL(sock_alloc_send_skb); + EXPORT_SYMBOL(sock_init_data); +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 19:44:53.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 22:02:08.130582568 +0800 +@@ -1691,6 +1691,7 @@ + + EXPORT_SYMBOL(d_alloc); + EXPORT_SYMBOL(d_alloc_anon); ++EXPORT_SYMBOL(is_subdir); + EXPORT_SYMBOL(d_alloc_root); + EXPORT_SYMBOL(d_delete); + EXPORT_SYMBOL(d_find_alias); +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 19:44:54.000000000 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 22:03:44.906870336 +0800 +@@ -1239,6 +1239,7 @@ + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6.10/fs/file_table.c +=================================================================== +--- linux-2.6.10.orig/fs/file_table.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/fs/file_table.c 2005-03-31 20:44:40.924065344 +0800 +@@ -196,6 +196,7 @@ + file_free(file); + } + } ++EXPORT_SYMBOL(put_filp); + + void file_move(struct file *file, struct list_head *list) + { +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-03-31 22:00:30.616406976 +0800 +@@ -2942,6 +2942,19 @@ + + EXPORT_SYMBOL(sleep_on_timeout); + ++void fastcall __sched sleep_on(wait_queue_head_t *q) ++{ ++ SLEEP_ON_VAR ++ ++ current->state = TASK_UNINTERRUPTIBLE; ++ ++ SLEEP_ON_HEAD ++ schedule(); ++ SLEEP_ON_TAIL ++} ++ ++EXPORT_SYMBOL(sleep_on); ++ + void set_user_nice(task_t *p, long nice) + { + unsigned long flags; +Index: linux-2.6.10/kernel/exit.c +=================================================================== +--- linux-2.6.10.orig/kernel/exit.c 2005-03-31 19:44:52.509587264 +0800 ++++ linux-2.6.10/kernel/exit.c 2005-03-31 20:47:18.034180976 +0800 +@@ -515,6 +515,7 @@ + { + 
__exit_mm(tsk); + } ++EXPORT_SYMBOL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { diff --git a/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch b/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch new file mode 100644 index 0000000..d09fd6a --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch @@ -0,0 +1,17 @@ +Index: linux-2.6.10/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800 +@@ -19,9 +19,12 @@ + #ifdef __KERNEL__ + #include + #include ++#ifndef EXT_INCLUDE ++#define EXT_INCLUDE + #include + #include + #endif ++#endif + #include + + /* diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch new file mode 100644 index 0000000..90064a2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch @@ -0,0 +1,2846 @@ +%patch +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:26:19.494124024 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:26:25.474214912 +0800 +@@ -186,6 +186,7 @@ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +@@ -238,7 +239,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) +- ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long) + /* + * Structure of an inode on the disk + */ +@@ -361,6 +364,8 @@ + #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x100000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -549,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +766,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, 
int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -839,6 +847,14 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.10/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2005-04-05 12:26:19.377141808 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-04-05 12:26:25.473215064 +0800 +@@ -134,6 +134,8 @@ + struct dynlock i_htree_lock; + struct semaphore i_append_sem; + struct semaphore i_rename_sem; ++ ++ __u32 i_cached_extent[3]; + }; + + #endif /* _LINUX_EXT3_FS_I */ +Index: linux-2.6.10/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_extents.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/ext3_extents.h 2005-04-05 12:26:25.476214608 +0800 +@@ -0,0 +1,238 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. 
+ sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet; all extents are in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bits of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level: a leaf or the next index could be here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even the inode-stored one, has a header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of the store in entries */ ++ __u16 eh_depth; /* does the tree have real underlying blocks? */ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * an array of ext3_ext_path contains the path to some extent; ++ * creation/lookup routines use it for traversal/splitting/etc., ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode the tree belongs to */ ++ void *root; /* ptr to the data where the top of the tree resides */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_extent *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++#define EXT_CACHE_MARK 0xffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct
ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ee_len = 0; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ ++ +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:26:19.367143328 +0800 ++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 12:26:25.462216736 +0800 +@@ -796,6 +796,17 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -806,8 +817,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -851,8 +862,8 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 0); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 0); 
+ bh_result->b_size = (1 << inode->i_blkbits); + return ret; + } +@@ -871,7 +882,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1591,7 +1602,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2089,6 +2100,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2817,6 +2831,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-04-05 12:25:13.631136720 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-04-05 12:26:25.471215368 +0800 +@@ -245,6 +245,10 @@ + return err; + } + ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + + default: + return -ENOTTY; +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:26:19.438132536 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:26:25.471215368 +0800 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -463,6 +464,9 @@ + dynlock_init(&ei->i_htree_lock); + sema_init(&ei->i_rename_sem, 1); + sema_init(&ei->i_append_sem, 1); ++ ei->i_cached_extent[0] = 0; ++ ei->i_cached_extent[1] = 0; ++ ei->i_cached_extent[2] = 0; + return &ei->vfs_inode; + } + +@@ -595,6 +599,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -647,6 +652,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -950,6 +957,12 @@ + match_int(&args[0], &option); + *n_blocks_count = option; + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1635,6 +1648,8 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); ++ + return 0; + + cantfind_ext3: +Index: linux-2.6.10/fs/ext3/extents.c 
+=================================================================== +--- linux-2.6.10.orig/fs/ext3/extents.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/extents.c 2005-04-05 12:26:25.468215824 +0800 +@@ -0,0 +1,2306 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = 
EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = 
EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(i == 0 || eh->eh_entries > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) 
++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? 
*/ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = 
border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate e_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh 
= EXT_ROOT_HDR(tree);
++	fidx = EXT_FIRST_INDEX(neh);
++	ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++		  neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++	neh->eh_depth = path->p_depth + 1;
++	err = ext3_ext_dirty(handle, tree, curp);
++out:
++	brelse(bh);
++
++	return err;
++}
++
++/*
++ * routine finds an empty index and adds a new leaf. if no free
++ * index is found, it requests in-depth growing of the tree
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++				    struct ext3_extents_tree *tree,
++				    struct ext3_ext_path *path,
++				    struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp;
++	int depth, i, err = 0;
++
++repeat:
++	i = depth = EXT_DEPTH(tree);
++
++	/* walk up the tree looking for a free index entry */
++	curp = path + depth;
++	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++		i--;
++		curp--;
++	}
++
++	/* we use the already allocated block for the index block,
++	 * so subsequent data blocks should be contiguous */
++	if (EXT_HAS_FREE_INDEX(curp)) {
++		/* if we found an index with a free entry, then use that
++		 * entry: create all needed subtree and add new leaf */
++		err = ext3_ext_split(handle, tree, path, newext, i);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++	} else {
++		/* tree is full, time to grow in depth */
++		err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++
++		/*
++		 * only the first grow (depth 0 -> 1) produces free space;
++		 * in all other cases we have to split the grown tree
++		 */
++		depth = EXT_DEPTH(tree);
++		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++			/* now we need split */
++			goto repeat;
++		}
++	}
++
++	if (err)
++		return err;
++
++	return 0;
++}
++
++/*
++ * returns the allocated block in the subsequent extent, or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from an index entry to be an
++ * allocated block. thus, index entries have to be consistent
++ * with the leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	if (depth == 0 && path->p_ext == NULL)
++		return EXT_MAX_BLOCK;
++
++	/* FIXME: what if index isn't full ?! */
++	while (depth >= 0) {
++		if (depth == path->p_depth) {
++			/* leaf */
++			if (path[depth].p_ext !=
++			    EXT_LAST_EXTENT(path[depth].p_hdr))
++				return path[depth].p_ext[1].ee_block;
++		} else {
++			/* index */
++			if (path[depth].p_idx !=
++			    EXT_LAST_INDEX(path[depth].p_hdr))
++				return path[depth].p_idx[1].ei_block;
++		}
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns the first allocated block of the next leaf, or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++					 struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	/* zero-tree has no leaf blocks at all */
++	if (depth == 0)
++		return EXT_MAX_BLOCK;
++
++	/* go to index block */
++	depth--;
++
++	while (depth >= 0) {
++		if (path[depth].p_idx !=
++		    EXT_LAST_INDEX(path[depth].p_hdr))
++			return path[depth].p_idx[1].ei_block;
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
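++/*
++ * Editorial note, for orientation (this comment is not part of the
++ * original submission): the on-disk records the walkers above
++ * manipulate look roughly as follows, reconstructed from the fields
++ * this code uses -- the authoritative layout is the one in the
++ * include/linux/ext3_extents.h hunk of this patch:
++ *
++ *	struct ext3_extent {		-- leaf entry
++ *		__u32	ee_block;	-- first logical block covered
++ *		__u16	ee_len;		-- number of blocks covered
++ *		__u16	ee_start_hi;	-- high bits, unused ("FIXME: large fs")
++ *		__u32	ee_start;	-- first physical block
++ *	};
++ *
++ *	struct ext3_extent_idx {	-- interior-node entry
++ *		__u32	ei_block;	-- covers logical blocks from here
++ *		__u32	ei_leaf;	-- physical block of the next level
++ *	};
++ *
++ * interior nodes hold ext3_extent_idx entries and leaves hold
++ * ext3_extent entries; the two walkers above rely on exactly that.
++ */
++
++/*
++ * if leaf gets modified and the modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?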
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? 
*/ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) ++ * sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ee_block = start; ++ cbex.ee_len = end - start; ++ cbex.ee_start = 0; ++ } else ++ cbex = *ex; ++ ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex, exists); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. 
we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ee_block + cbex.ee_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex) ++{ ++ if (tree->cex) { ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_len); ++ tree->cex->ee_block = ex->ee_block; ++ tree->cex->ee_start = ex->ee_start; ++ tree->cex->ee_len = ex->ee_len; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex, gex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ gex.ee_block = 0; ++ gex.ee_len = EXT_CACHE_MARK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->ee_block) { ++ gex.ee_block = block; ++ gex.ee_len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ gex.ee_block = ex->ee_block + ex->ee_len; ++ gex.ee_len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(gex.ee_len > gex.ee_block); ++ gex.ee_len = gex.ee_len - gex.ee_block; ++ } else { ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block, ++ (unsigned long) gex.ee_len); ++ gex.ee_start = EXT_CACHE_MARK; ++ ext3_ext_put_in_cache(tree, &gex); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_extent *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return 0; ++ ++ /* has cache valid data? */ ++ if (cex->ee_len == 0) ++ return 0; ++ ++ if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) { ++ ex->ee_block = cex->ee_block; ++ ex->ee_start = cex->ee_start; ++ ex->ee_len = cex->ee_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return 1; ++ } ++ ++ /* not in cache */ ++ return 0; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? ++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! 
*/
++			block = ex->ee_block;
++			num = 0;
++			EXT_ASSERT(a == ex->ee_block &&
++				   b == ex->ee_block + ex->ee_len - 1);
++		}
++
++		if (ex == EXT_FIRST_EXTENT(eh))
++			correct_index = 1;
++
++		credits = 1;
++		if (correct_index)
++			credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++		if (tree->ops->remove_extent_credits)
++			credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++		handle = ext3_ext_journal_restart(handle, credits);
++		if (IS_ERR(handle)) {
++			err = PTR_ERR(handle);
++			goto out;
++		}
++
++		err = ext3_ext_get_access(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		if (tree->ops->remove_extent)
++			err = tree->ops->remove_extent(tree, ex, a, b);
++		if (err)
++			goto out;
++
++		if (num == 0) {
++			/* this extent is removed entirely; mark the slot unused */
++			ex->ee_start = 0;
++			eh->eh_entries--;
++			fu = ex;
++		}
++
++		ex->ee_block = block;
++		ex->ee_len = num;
++
++		err = ext3_ext_dirty(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		ext_debug(tree, "new extent: %u:%u:%u\n",
++			  ex->ee_block, ex->ee_len, ex->ee_start);
++		ex--;
++	}
++
++	if (fu) {
++		/* reuse unused slots */
++		while (lu < le) {
++			if (lu->ee_start) {
++				*fu = *lu;
++				lu->ee_start = 0;
++				fu++;
++			}
++			lu++;
++		}
++	}
++
++	if (correct_index && eh->eh_entries)
++		err = ext3_ext_correct_indexes(handle, tree, path);
++
++	/* if this leaf is free, then we should
++	 * remove it from the index block above */
++	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++		err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++	return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++	struct ext3_extent_idx *ix;
++
++	ix = EXT_LAST_INDEX(hdr);
++	while (ix != EXT_FIRST_INDEX(hdr)) {
++		if (ix->ei_block <= block)
++			break;
++		ix--;
++	}
++	return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * if a truncate on a deeper level happened, it wasn't partial,
++	 * so we have to consider the current index for truncation
++	 */
++	if (path->p_hdr->eh_entries == path->p_block)
++		return 0;
++	return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++			  unsigned long start, unsigned long end)
++{
++	struct inode *inode = tree->inode;
++	struct super_block *sb = inode->i_sb;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_ext_path *path;
++	handle_t *handle;
++	int i = 0, err = 0;
++
++	ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++	/* probably the first extent we're gonna free will be last in block */
++	handle = ext3_journal_start(inode, depth + 1);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	ext3_ext_invalidate_cache(tree);
++
++	/*
++	 * we start scanning from the right side, freeing all the blocks
++	 * after i_size and walking into the deep
++	 */
++	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++	if (path == NULL) {
++		ext3_error(sb, "ext3_ext_remove_space",
++			   "Can't allocate path array");
++		ext3_journal_stop(handle);
++		return -ENOMEM;
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++	while (i >= 0 && err == 0) {
++		if (i == depth) {
++			/* this is a leaf block */
++			err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			continue;
++		}
++
++		/* this is an index block */
++		if (!path[i].p_hdr) {
++			ext_debug(tree, "initialize header\n");
++			path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++		}
++
++		EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++		EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++		if (!path[i].p_idx) {
++			/* this level hasn't been touched yet */
++			path[i].p_idx =
++				ext3_ext_last_covered(path[i].p_hdr, end);
++			path[i].p_block = path[i].p_hdr->eh_entries + 1;
++			ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++				  path[i].p_hdr, path[i].p_hdr->eh_entries);
++		} else {
++			/* we've already been here; look at the next index */
++			path[i].p_idx--;
++		}
++
++		ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++			  i, EXT_FIRST_INDEX(path[i].p_hdr),
++			  path[i].p_idx);
++		if (ext3_ext_more_to_rm(path + i)) {
++			/* go to the next level */
++			ext_debug(tree, "move to level %d (block %d)\n",
++				  i + 1, path[i].p_idx->ei_leaf);
++			memset(path + i + 1, 0, sizeof(*path));
++			path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++			if (!path[i+1].p_bh) {
++				/* should we reset i_size? */
++				err = -EIO;
++				break;
++			}
++			/* record the actual number of indexes so we can tell
++			 * whether this number changed at the next iteration */
++			path[i].p_block = path[i].p_hdr->eh_entries;
++			i++;
++		} else {
++			/* we've finished processing this index; go up */
++			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++				/* index is empty, remove it;
++				 * the handle must already be prepared by
++				 * truncatei_leaf() */
++				err = ext3_ext_rm_idx(handle, tree, path + i);
++			}
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			ext_debug(tree, "return to level %d\n", i);
++		}
++	}
++
++	/* TODO: flexible tree reduction should be here */
++	if (path->p_hdr->eh_entries == 0) {
++		/*
++		 * truncate to zero freed the whole tree,
++		 * so we need to correct eh_depth
++		 */
++		err = ext3_ext_get_access(handle, tree, path);
++		if (err == 0) {
++			EXT_ROOT_HDR(tree)->eh_depth = 0;
++			EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++			err = ext3_ext_dirty(handle, tree, path);
++		}
++	}
++	ext3_ext_tree_changed(tree);
++
++	kfree(path);
++	ext3_journal_stop(handle);
++
++	return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++	int lcap, icap, rcap, leafs, idxs, num;
++
++	rcap = ext3_ext_space_root(tree);
++	if (blocks <= rcap) {
++		/* all extents fit into the root */
++		return 0;
++	}
++
++	rcap = ext3_ext_space_root_idx(tree);
++	lcap = ext3_ext_space_block(tree);
++	icap = ext3_ext_space_block_idx(tree);
++
++	num = leafs = (blocks + lcap - 1) / lcap;
++	if (leafs <= rcap) {
++		/* all pointers to the leaves fit into the root */
++		return leafs;
++	}
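++
++	/*
++	 * Editorial worked example (the numbers are illustrative only;
++	 * the real capacities come from the ext3_ext_space_* helpers
++	 * and depend on the block size): with lcap = 340 extents per
++	 * leaf and blocks = 100000, we need
++	 * leafs = ceil(100000 / 340) = 295 leaf blocks.  If the root
++	 * holds only rcap = 4 index entries, those 295 pointers do not
++	 * fit, so with icap = 340 one extra index block suffices:
++	 * idxs = ceil(295 / 340) = 1, and 1 <= rcap terminates the
++	 * loop below, giving num = 295 + 1 = 296 metadata blocks.
++	 */
++
++	/* ok.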
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */
++	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++	colour = (current->pid % 16) *
++		(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++	return bg_start + colour + block;
++}
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++			     struct ext3_ext_path *path,
++			     struct ext3_extent *ex, int *err)
++{
++	struct inode *inode = tree->inode;
++	int newblock, goal;
++
++	EXT_ASSERT(path);
++	EXT_ASSERT(ex);
++	EXT_ASSERT(ex->ee_start);
++	EXT_ASSERT(ex->ee_len);
++
++	/* reuse block from the extent to order data/metadata */
++	newblock = ex->ee_start++;
++	ex->ee_len--;
++	if (ex->ee_len == 0) {
++		ex->ee_len = 1;
++		/* allocate new block for the extent */
++		goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++		ex->ee_start = ext3_new_block(handle, inode, goal, err);
++		if (ex->ee_start == 0) {
++			/* error occurred: restore old extent */
++			ex->ee_start = newblock;
++			return 0;
++		}
++	}
++	return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++	.get_write_access	= ext3_get_inode_write_access,
++	.mark_buffer_dirty	= ext3_mark_buffer_dirty,
++	.mergable		= ext3_ext_mergable,
++	.new_block		= ext3_new_block_cb,
++	.remove_extent		= ext3_remove_blocks,
++	.remove_extent_credits	= ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++			 struct inode *inode)
++{
++	tree->inode = inode;
++	tree->root = (void *) EXT3_I(inode)->i_data;
++	tree->buffer = (void *) inode;
++	tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++	tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
++	tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++		       long iblock, struct buffer_head *bh_result,
++		       int create, int extend_disksize)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_extent newex;
++	struct ext3_extent *ex;
++	int goal, newblock, err = 0, depth;
++	struct ext3_extents_tree tree;
++
++	clear_buffer_new(bh_result);
++	ext3_init_tree_desc(&tree, inode);
++	ext_debug(&tree, "block %d requested for inode %u\n",
++		  (int) iblock, (unsigned) inode->i_ino);
++	down(&EXT3_I(inode)->truncate_sem);
++
++	/* check in cache */
++	if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++		if (newex.ee_start == EXT_CACHE_MARK) {
++			/* this is a cached gap */
++			if (!create) {
++				/* block isn't allocated yet and
++				 * the user doesn't want to allocate it */
++				goto out2;
++			}
++			/* we should allocate requested block */
++		} else if (newex.ee_start) {
++			/* block is already allocated */
++			newblock = iblock - newex.ee_block + newex.ee_start;
++			goto out;
++		}
++	}
++
++	/* find extent for this block */
++	path = ext3_ext_find_extent(&tree, iblock, NULL);
++	if (IS_ERR(path)) {
++		err = PTR_ERR(path);
++		path = NULL;
++		goto out2;
++	}
++
++	depth = EXT_DEPTH(&tree);
++
++	/*
++	 * a consistent leaf must not be empty;
++	 * this situation is possible, though, _during_ tree modification;
++	 * this is why the assert can't be put in ext3_ext_find_extent()
++	 */
++	EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++	if ((ex = path[depth].p_ext)) {
++		/* if the found extent covers the block, simply return it */
++		if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++			newblock = iblock - ex->ee_block + ex->ee_start;
++			ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++				  (int) iblock, ex->ee_block, ex->ee_len,
++				  newblock);
++			ext3_ext_put_in_cache(&tree, ex);
++			goto out;
++		}
++	}
++
++	/*
++	 * the requested block isn't allocated yet;
++	 * we can't create the block if the create flag is zero
++	 */
++	if (!create) {
++		/* put the just-found gap into the cache to speed up
++		 * subsequent requests */
++		ext3_ext_put_gap_in_cache(&tree, path, iblock);
++		goto out2;
++	}
++
++	/* allocate new block */
++	goal = ext3_ext_find_goal(inode, path, iblock);
++	newblock = ext3_new_block(handle, inode, goal, &err);
++	if (!newblock)
++		goto out2;
++	ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++		  goal, newblock);
++
++	/* try to insert new extent into found leaf and return */
++	newex.ee_block = iblock;
++	newex.ee_start = newblock;
++	newex.ee_len = 1;
++	err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++	if (err)
++		goto out2;
++
++	if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++		EXT3_I(inode)->i_disksize = inode->i_size;
++
++	/* previous routine could have used the block we just allocated */
++	newblock = newex.ee_start;
++	set_buffer_new(bh_result);
++
++	ext3_ext_put_in_cache(&tree, &newex);
++out:
++	ext3_ext_show_leaf(&tree, path);
++	map_bh(bh_result, inode->i_sb, newblock);
++out2:
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++	up(&EXT3_I(inode)->truncate_sem);
++
++	return err;
++}
++
++void ext3_ext_truncate(struct inode *inode, struct page *page)
++{
++	struct address_space *mapping = inode->i_mapping;
++	struct super_block *sb = inode->i_sb;
++	struct ext3_extents_tree tree;
++	unsigned long last_block;
++	handle_t *handle;
++	int err = 0;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	/*
++	 * probably the first extent we're gonna free will be last in block
++	 */
++	err = ext3_writepage_trans_blocks(inode) + 3;
++	handle = ext3_journal_start(inode, err);
++	if (IS_ERR(handle)) {
++		if (page) {
++			clear_highpage(page);
++			flush_dcache_page(page);
++			unlock_page(page);
++			page_cache_release(page);
++		}
++		return;
++	}
++
++	if (page)
++		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++	down(&EXT3_I(inode)->truncate_sem);
++	ext3_ext_invalidate_cache(&tree);
++
++	/*
++	 * TODO: optimization is possible here;
++	 * probably we don't need any scanning at all,
++	 * because page truncation is enough
++	 */
++	if (ext3_orphan_add(handle, inode))
++		goto out_stop;
++
++	/* we have to know where to truncate from in crash case */
++	EXT3_I(inode)->i_disksize = inode->i_size;
++	ext3_mark_inode_dirty(handle, inode);
++
++	last_block = (inode->i_size + sb->s_blocksize - 1)
++			>> EXT3_BLOCK_SIZE_BITS(sb);
++	err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++	/* In a multi-transaction truncate, we only make the final
++	 * transaction synchronous */
++	if (IS_SYNC(inode))
++		handle->h_sync = 1;
++
++out_stop:
++	/*
++	 * If this was a simple ftruncate(), and the file will remain alive
++	 * then we need to clear up the orphan record which we created above.
++	 * However, if this was a real unlink then we were called by
++	 * ext3_delete_inode(), and we allow that function to clean up the
++	 * orphan info for us.
++	 */
++	if (inode->i_nlink)
++		ext3_orphan_del(handle, inode);
++
++	up(&EXT3_I(inode)->truncate_sem);
++	ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++	struct ext3_extents_tree tree;
++	int needed;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++	/* caller wants to allocate num blocks */
++	needed *= num;
++
++#ifdef CONFIG_QUOTA
++	/*
++	 * FIXME: the real calculation should be here;
++	 * it depends on the blockmap format of the quota file
++	 */
++	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++	return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++	struct ext3_extents_tree tree;
++
++	ext3_init_tree_desc(&tree, inode);
++	ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++	struct ext3_extents_tree tree;
++
++	ext3_init_tree_desc(&tree, inode);
++	return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++			 struct ext3_ext_path *path,
++			 struct ext3_extent *newex, int exist)
++{
++	struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++	if (!exist)
++		return EXT_CONTINUE;
++	if (buf->err < 0)
++		return EXT_BREAK;
++	if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++		return EXT_BREAK;
++
++	if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++		buf->err++;
++		buf->cur += sizeof(*newex);
++	} else {
++		buf->err = -EFAULT;
++		return EXT_BREAK;
++	}
++	return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path,
++			  struct ext3_extent *ex, int exist)
++{
++	struct ext3_extent_tree_stats *buf =
++		(struct ext3_extent_tree_stats *) tree->private;
++	int depth;
++
++	if (!exist)
++		return EXT_CONTINUE;
++
++	depth = EXT_DEPTH(tree);
++	buf->extents_num++;
++	if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++		buf->leaf_num++;
++	return EXT_CONTINUE;
++}
++
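++/*
++ * Editorial usage sketch (not part of the original submission): a
++ * userspace tool would dump the extent list of a file roughly as
++ * follows; struct ext3_extent_buf comes from the
++ * include/linux/ext3_extents.h hunk of this patch:
++ *
++ *	struct ext3_extent list[256];
++ *	struct ext3_extent_buf buf;
++ *
++ *	buf.start = 0;			-- first logical block to report
++ *	buf.buffer = (void *) list;	-- destination for the records
++ *	buf.buflen = sizeof(list);
++ *	err = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ *
++ * On success the handler below returns buf.err, i.e. the number of
++ * extents copied out by ext3_ext_store_extent_cb(); buf.cur and
++ * buf.err are (re)initialized by the handler, so userspace only
++ * fills in start/buffer/buflen.
++ */
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++		   unsigned long arg)
++{
++	int err = 0;
++
++	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++		return -EINVAL;
++
++	if (cmd == EXT3_IOC_GET_EXTENTS) {
++		struct ext3_extent_buf buf;
++		struct ext3_extents_tree tree;
++
++		if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++			return -EFAULT;
++
++		ext3_init_tree_desc(&tree, inode);
++		buf.cur = buf.buffer;
++		buf.err = 0;
++		tree.private = &buf;
++		down(&EXT3_I(inode)->truncate_sem);
++		err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++					  ext3_ext_store_extent_cb);
++		up(&EXT3_I(inode)->truncate_sem);
++		if (err == 0)
++			err = buf.err;
++	} else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++		struct ext3_extent_tree_stats buf;
++		struct ext3_extents_tree tree;
++
++		ext3_init_tree_desc(&tree, inode);
++		down(&EXT3_I(inode)->truncate_sem);
++		buf.depth = EXT_DEPTH(&tree);
++		buf.extents_num = 0;
++		buf.leaf_num = 0;
++		tree.private = &buf;
++		err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++					  ext3_ext_collect_stats_cb);
++		up(&EXT3_I(inode)->truncate_sem);
++		if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++			err = -EFAULT;
++	} else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++		struct ext3_extents_tree tree;
++		ext3_init_tree_desc(&tree, inode);
++ 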
down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); ++ +Index: linux-2.6.10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-04-05 12:26:19.368143176 +0800 ++++ linux-2.6.10/fs/ext3/ialloc.c 2005-04-05 12:26:25.464216432 +0800 +@@ -644,6 +644,17 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800 +@@ -5,8 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o +- ++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ ++ extents.o + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o + ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o + +%diffstat + fs/ext3/Makefile | 4 + fs/ext3/extents.c | 2306 +++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 11 + fs/ext3/inode.c | 29 + fs/ext3/ioctl.c | 4 + fs/ext3/super.c | 15 + include/linux/ext3_extents.h | 238 ++++ + include/linux/ext3_fs.h | 20 + include/linux/ext3_fs_i.h | 2 + 9 files changed, 2619 insertions(+), 10 deletions(-) + diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch new file mode 100755 index 0000000..a400fb3 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch @@ -0,0 +1,361 @@ +Index: linux-2.6.10/fs/ext3/xattr.h +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.h 2005-04-05 12:26:19.376141960 +0800 ++++ linux-2.6.10/fs/ext3/xattr.h 2005-04-05 12:27:55.527524728 +0800 +@@ -70,6 +70,7 @@ + extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + ++extern int ext3_xattr_get_ea_loc(struct inode *, int, const char *, struct buffer_head **, int *, int *); + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); + +Index: linux-2.6.10/fs/ext3/extents-in-ea.c +=================================================================== +--- 
linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-04-05 12:27:55.524525184 +0800 +@@ -0,0 +1,224 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ext3_get_ea_write_access(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ return ext3_journal_get_write_access(handle, bh); ++} ++ ++static int ext3_mark_ea_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ ext3_journal_dirty_metadata(handle, bh); ++ return 0; ++} ++ ++static struct ext3_extents_helpers ext3_ea_helpers = { ++ .get_write_access = ext3_get_ea_write_access, ++ .mark_buffer_dirty = ext3_mark_ea_buffer_dirty, ++ .mergable = NULL, ++ .new_block = NULL, ++ .remove_extent = NULL, ++ .remove_extent_credits = NULL, ++}; ++ ++int ext3_init_tree_in_ea_desc(struct ext3_extents_tree *tree, ++ struct inode *inode, int name_index, ++ const char *eaname) ++{ ++ struct buffer_head *bh; ++ int offset, err, size; ++ ++ err = ext3_xattr_get_ea_loc(inode, name_index, eaname, ++ &bh, &offset, &size); ++ if (err) ++ return err; ++ ++ EXT_ASSERT(bh); ++ EXT_ASSERT(size >= sizeof(struct ext3_extent_header) ++ + sizeof(struct ext3_extent)); ++ tree->inode = inode; ++ tree->root = (void *) bh->b_data + offset; ++ tree->buffer_len = size; ++ tree->buffer = (void *) bh; ++ tree->ops = &ext3_ea_helpers; ++ tree->cex = NULL; /* FIXME: add cache store later */ ++ return 0; ++} ++ ++void ext3_release_tree_in_ea_desc(struct ext3_extents_tree *tree) ++{ ++ struct buffer_head *bh; ++ ++ bh = (struct buffer_head *) tree->buffer; ++ EXT_ASSERT(bh); ++ brelse(bh); ++} ++ ++int ext3_init_tree_in_ea(struct inode *inode, int name_index, ++ const char *eaname, int size) ++{ ++ struct ext3_extents_tree tree; ++ handle_t *handle; ++ char *root; ++ int err; ++ ++ root = kmalloc(size, GFP_USER); ++ if (!root) ++ return -ENOMEM; ++ memset(root, 0, size); ++ ++ /* first, create ea to store root of the tree */ ++ handle = ext3_journal_start(inode, EXT3_ALLOC_NEEDED + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if ((err = ext3_xattr_set(inode, name_index, ++ eaname, root, size, 0))) ++ goto out; ++ if ((err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname))) ++ goto out; ++ err = ext3_extent_tree_init(handle, &tree); ++ ext3_release_tree_in_ea_desc(&tree); ++out: ++ ext3_journal_stop(handle, inode); ++ kfree(root); ++ return err; ++} ++ ++static int ++ext3_ext_in_ea_new_extent(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ 
struct inode *inode = tree->inode; ++ handle_t *handle; ++ int needed, err; ++ unsigned long tgen; ++ ++ if (exist) ++ return EXT_CONTINUE; ++ ++ tgen = EXT_GENERATION(tree); ++ needed = ext3_ext_calc_credits_for_insert(tree, path); ++ up(&EXT3_I(inode)->truncate_sem); ++ handle = ext3_journal_start(tree->inode, needed + 10); ++ if (IS_ERR(handle)) { ++ down_write(&EXT3_I(inode)->truncate_sem); ++ return PTR_ERR(handle); ++ } ++ ++ if (tgen != EXT_GENERATION(tree)) { ++ /* the tree has changed. so path can be invalid at moment */ ++ ext3_journal_stop(handle, inode); ++ down_write(&EXT3_I(inode)->truncate_sem); ++ return EXT_REPEAT; ++ } ++ ++ down_write(&EXT3_I(inode)->truncate_sem); ++ ++ /* insert new extent */ ++ newex->ee_start = 0; ++ err = ext3_ext_insert_extent(handle, tree, path, newex); ++ if (!err) ++ ext3_journal_stop(handle, tree->inode); ++ ++ return err; ++} ++ ++int ext3_ext_in_ea_alloc_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ down_write(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, from, num, ++ ext3_ext_in_ea_new_extent); ++ ext3_release_tree_in_ea_desc(&tree); ++ up_write(&EXT3_I(inode)->truncate_sem); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_remove_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ err = ext3_ext_remove_space(&tree, from, num); ++ ext3_release_tree_in_ea_desc(&tree); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_presence(struct inode *inode, int name_index, ++ const char *eaname, unsigned long block) ++{ ++ struct ext3_extents_tree tree; ++ struct ext3_ext_path *path; ++ struct ext3_extent *ex; ++ int err, depth; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err) ++ return err; ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, block, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ex = path[depth].p_ext; ++ if (!ex) { ++ /* there is no extent yet */ ++ goto out; ++ } ++ ++ if (block >= ex->ee_block && block < ex->ee_block + ex->ee_len) ++ err = 1; ++out: ++ ext3_release_tree_in_ea_desc(&tree); ++ return err; ++} ++ +Index: linux-2.6.10/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-04-05 12:26:19.370142872 +0800 ++++ linux-2.6.10/fs/ext3/xattr.c 2005-04-05 12:27:55.527524728 +0800 +@@ -590,7 +590,8 @@ + */ + int + ext3_xattr_ibody_find(struct inode *inode, int name_index, +- const char *name, struct ext3_xattr_entry *rentry, int *free) ++ const char *name, struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **bh, int *offset) + { + struct ext3_xattr_entry *last; + struct ext3_inode *raw_inode; +@@ -637,6 +638,15 @@ + name_len == last->e_name_len && + !memcmp(name, last->e_name, name_len)) { + memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ if (offset) { ++ void *voff; ++ voff = start + le16_to_cpu(last->e_value_offs); ++ *offset = voff - (void *) iloc.bh->b_data; ++ } ++ if (bh) { ++ get_bh(iloc.bh); ++ *bh = iloc.bh; ++ } + ret = 0; + } else { + *free -= EXT3_XATTR_LEN(last->e_name_len); +@@ 
-657,7 +667,8 @@ + */ + int + ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, +- struct ext3_xattr_entry *rentry, int *free) ++ struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **tbh, int *offset) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -700,6 +711,12 @@ + memcmp(name, entry->e_name, name_len) == 0) { + memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); + error = 0; ++ if (offset) ++ *offset = le16_to_cpu(entry->e_value_offs); ++ if (tbh) { ++ get_bh(bh); ++ *tbh = bh; ++ } + } else { + *free -= EXT3_XATTR_LEN(entry->e_name_len); + *free -= le32_to_cpu(entry->e_value_size); +@@ -894,7 +911,8 @@ + down_write(&EXT3_I(inode)->xattr_sem); + + /* try to find attribute in inode body */ +- err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, NULL, NULL); + if (err == 0) { + /* found EA in inode */ + found = 1; +@@ -903,7 +921,7 @@ + /* there is no such attribute in inode body */ + /* try to find attribute in dedicated block */ + err = ext3_xattr_block_find(inode, name_index, name, +- &entry, &free2); ++ &entry, &free2, NULL, NULL); + if (err != 0 && err != -ENOENT) { + /* not found EA in block */ + goto finish; +@@ -960,6 +978,35 @@ + return err; + } + ++int ext3_xattr_get_ea_loc(struct inode *inode, int name_index, ++ const char *name, struct buffer_head **bh, ++ int *offset, int *size) ++{ ++ int free1 = -1, free2 = -1, err, name_len; ++ struct ext3_xattr_entry entry; ++ ++ ea_idebug(inode, "name=%d.%s", name_index, name); ++ ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255) ++ return -ERANGE; ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, bh, offset); ++ if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2, bh, offset); ++ } ++ if (err == 0 && size) ++ *size = le32_to_cpu(entry.e_value_size); ++ return err; ++} ++ + /* + * ext3_xattr_block_set() + * +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:28:26.989741744 +0800 +@@ -7,6 +7,6 @@ + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ + extents.o +-ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ++ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o + ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch new file mode 100755 index 0000000..b39fb93 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch @@ -0,0 +1,230 @@ +Index: linux-2.6.10/fs/ext3/extents-in-ea.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.471494208 +0800 ++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.580477640 +0800 +@@ -27,7 +27,7 @@ + #include + #include + #include +-#include ++#include 
"xattr.h" + #include + #include + +@@ -111,7 +111,7 @@ + err = ext3_extent_tree_init(handle, &tree); + ext3_release_tree_in_ea_desc(&tree); + out: +- ext3_journal_stop(handle, inode); ++ ext3_journal_stop(handle); + kfree(root); + return err; + } +@@ -134,24 +134,24 @@ + up(&EXT3_I(inode)->truncate_sem); + handle = ext3_journal_start(tree->inode, needed + 10); + if (IS_ERR(handle)) { +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + return PTR_ERR(handle); + } + + if (tgen != EXT_GENERATION(tree)) { + /* the tree has changed. so path can be invalid at moment */ +- ext3_journal_stop(handle, inode); +- down_write(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++ down(&EXT3_I(inode)->truncate_sem); + return EXT_REPEAT; + } + +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + + /* insert new extent */ + newex->ee_start = 0; + err = ext3_ext_insert_extent(handle, tree, path, newex); + if (!err) +- ext3_journal_stop(handle, tree->inode); ++ ext3_journal_stop(handle); + + return err; + } +@@ -165,11 +165,11 @@ + + err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); + if (err == 0) { +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + err = ext3_ext_walk_space(&tree, from, num, + ext3_ext_in_ea_new_extent); + ext3_release_tree_in_ea_desc(&tree); +- up_write(&EXT3_I(inode)->truncate_sem); ++ up(&EXT3_I(inode)->truncate_sem); + } + return err; + } +@@ -222,3 +222,112 @@ + return err; + } + ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++struct ea_tree_desc { ++ int name_index; ++ char eaname[256]; ++}; ++ ++int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ ++ if (cmd == EXT3_IOC_EA_TREE_INIT) { ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&desc, (void *) arg, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea(inode, desc.name_index, ++ desc.eaname, 64); ++ } else if (cmd == EXT3_IOC_GET_EA_EXTENTS) { ++ struct ext3_extents_tree tree; ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea_desc(&tree, inode, ++ desc.name_index, desc.eaname); ++ if (err) ++ goto out; ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ 
ext3_ext_store_extent_cb); ++ if (err == 0) ++ err = buf.err; ++ ext3_release_tree_in_ea_desc(&tree); ++ } else if (cmd == EXT3_IOC_EA_TREE_ALLOCATE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_alloc_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } else if (cmd == EXT3_IOC_EA_TREE_REMOVE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_remove_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } ++ ++out: ++ return err; ++} ++ +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-03-31 19:41:09.365510320 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 19:41:09.580477640 +0800 +@@ -249,7 +249,13 @@ + case EXT3_IOC_GET_TREE_STATS: + case EXT3_IOC_GET_TREE_DEPTH: + return ext3_ext_ioctl(inode, filp, cmd, arg); +- ++ case EXT3_IOC_GET_EA_EXTENTS: ++ case EXT3_IOC_GET_EA_TREE_DEPTH: ++ case EXT3_IOC_GET_EA_TREE_STATS: ++ case EXT3_IOC_EA_TREE_INIT: ++ case EXT3_IOC_EA_TREE_ALLOCATE: ++ case EXT3_IOC_EA_TREE_REMOVE: ++ return ext3_ext_in_ea_ioctl(inode, filp, cmd, arg); + default: + return -ENOTTY; + } +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 19:41:09.366510168 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:43:30.539048680 +0800 +@@ -242,6 +242,15 @@ + #define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long) + #define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long) + #define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long) ++ ++#define EXT3_IOC_GET_EA_EXTENTS _IOR('f', 13, long) ++#define EXT3_IOC_GET_EA_TREE_DEPTH _IOR('f', 14, long) ++#define EXT3_IOC_GET_EA_TREE_STATS _IOR('f', 15, long) ++#define EXT3_IOC_EA_TREE_INIT _IOW('f', 16, long) ++#define EXT3_IOC_EA_TREE_ALLOCATE _IOW('f', 17, long) ++#define EXT3_IOC_EA_TREE_REMOVE _IOW('f', 18, long) ++ ++ + /* + * Structure of an inode on the disk + */ +@@ -788,7 +797,10 @@ + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); +- ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg); ++extern int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); diff --git a/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch new file mode 100755 index 0000000..973d02f --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch @@ -0,0 +1,281 @@ +Index: linux-2.6.10/fs/ext3/dir.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/dir.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/fs/ext3/dir.c 2005-03-31 18:56:02.961946200 +0800 +@@ -53,6 +53,9 @@ + + static unsigned char get_dtype(struct super_block *sb, int filetype) + { ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM)) ++ 
return DT_UNKNOWN; ++ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; +@@ -79,7 +82,8 @@ + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) +- error_msg = "inode out of bounds"; ++ if (de->file_type != 128) ++ error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext3_error (dir->i_sb, function, +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:41:15.880803032 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:56:02.960946352 +0800 +@@ -24,6 +24,7 @@ + * Theodore Ts'o, 2002 + */ + ++#include + #include + #include + #include +@@ -1148,6 +1149,23 @@ + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ unsigned type = de->file_type; ++ __u32 *mds; ++ mds = (__u32 *)((char *) de + EXT3_DIR_REC_LEN(de->name_len)); ++ if ((type & 128) && EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb, ++ EXT3_FEATURE_INCOMPAT_MDSNUM) && ++ mds[0] != EXT3_SB(dir->i_sb)->s_mdsnum) { ++ struct ext3_super_block *es; ++ es = EXT3_SB(dir->i_sb)->s_es; ++ brelse (bh); ++ dentry->d_flags |= DCACHE_CROSS_REF; ++ dentry->d_generation = mds[1]; ++ dentry->d_mdsnum = mds[0]; ++ dentry->d_inum = ino; ++ ext3_unlock_htree(dir, lock); ++ d_add(dentry, NULL); ++ return NULL; ++ } + ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); +@@ -1221,7 +1239,7 @@ + while (count--) { + struct ext3_dir_entry_2 *de = + (struct ext3_dir_entry_2 *) (from + map->offs); +- rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ rec_len = EXT3_DIR_REC_LEN_DE(de); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + cpu_to_le16(rec_len); +@@ -1243,7 +1261,7 @@ + next = (struct ext3_dir_entry_2 *) ((char *) de + + le16_to_cpu(de->rec_len)); + if (de->inode && de->name_len) { +- rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ rec_len = EXT3_DIR_REC_LEN_DE(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = cpu_to_le16(rec_len); +@@ -1359,6 +1377,7 @@ + struct buffer_head * bh) + { + struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block *sb = dir->i_sb; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; +@@ -1367,6 +1386,10 @@ + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM) ++ && (dentry->d_flags & DCACHE_CROSS_REF) ++ && (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum)) ++ reclen += 8; /* we need space to store mds num */ + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; +@@ -1380,7 +1403,7 @@ + brelse (bh); + return -EEXIST; + } +- nlen = EXT3_DIR_REC_LEN(de->name_len); ++ nlen = EXT3_DIR_REC_LEN_DE(de); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? 
rlen - nlen: rlen) >= reclen) + break; +@@ -1399,7 +1422,7 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); ++ nlen = EXT3_DIR_REC_LEN_DE(de); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = +@@ -1411,8 +1434,20 @@ + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else ++ ext3_set_de_type(sb, de, inode->i_mode); ++ } else if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM) ++ && (dentry->d_flags & DCACHE_CROSS_REF)) { ++ if (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum) { ++ __u32 *mds; ++ mds = (__u32 *)((char *)de + EXT3_DIR_REC_LEN(namelen)); ++ mds[0] = cpu_to_le32(dentry->d_mdsnum); ++ mds[1] = cpu_to_le32(dentry->d_generation); ++ de->inode = cpu_to_le32(dentry->d_inum); ++ de->file_type = 128; ++ } else { ++ de->inode = cpu_to_le32(dentry->d_inum); ++ } ++ } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); +@@ -2737,6 +2772,81 @@ + } + + /* ++ * caller has to make sure directory is protected ++ */ ++int ext3_add_dir_entry(struct dentry *dentry) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ handle_t *handle; ++ int err; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { ++ return PTR_ERR(handle); ++ } ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ err = ext3_add_entry(handle, dentry, NULL); ++ ext3_journal_stop(handle); ++ return err; ++} ++EXPORT_SYMBOL(ext3_add_dir_entry); ++/* ++ * caller has to make sure directory is protected ++ */ ++int ext3_del_dir_entry(struct dentry *dentry) ++{ ++ struct inode * inode; ++ struct inode * dir = dentry->d_parent->d_inode; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de; ++ handle_t *handle; ++ int retval; ++ void *lock = NULL; ++ ++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { ++ return PTR_ERR(handle); ++ } ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ retval = -ENOENT; ++ bh = ext3_find_entry (dentry, &de, 1, &lock); ++ ext3_unlock_htree(dir, lock); ++ if (!bh) ++ goto end_unlink; ++ ++ inode = dentry->d_inode; ++ if (inode) ++ DQUOT_INIT(inode); ++ ++ retval = ext3_delete_entry(handle, dir, de, bh); ++ if (retval) ++ goto end_unlink; ++ dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ if (inode) { ++ inode->i_ctime = dir->i_ctime; ++ ext3_mark_inode_dirty(handle, inode); ++ if (S_ISDIR(inode->i_mode)) ++ dir->i_nlink--; ++ } ++ ext3_mark_inode_dirty(handle, dir); ++ retval = 0; ++ ++end_unlink: ++ ext3_journal_stop(handle); ++ brelse (bh); ++ return retval; ++} ++ ++EXPORT_SYMBOL(ext3_del_dir_entry); ++/* + * directories can handle most operations... 
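++ * Note: ext3_add_dir_entry()/ext3_del_dir_entry() above are exported
++ * so that an external user of ext3 (e.g. a metadata server stacked on
++ * top of it) can insert and remove raw directory entries; as their
++ * comments say, the caller itself must keep the directory protected.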
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:54:32.497698856 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:56:41.955018352 +0800 +@@ -483,7 +483,8 @@ + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ +- __u32 s_reserved[190]; /* Padding to the end of the block */ ++ __u32 s_mdsnum; ++ __u32 s_reserved[189]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -563,12 +564,14 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_MDSNUM 0x0020 /* direntry has mdsnum */ + #define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_MDSNUM| \ + EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -643,6 +646,9 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++#define EXT3_DIR_REC_LEN_DE(de) (EXT3_DIR_REC_LEN((de)->name_len) + \ ++ (((de)->file_type & 128) ? 8 : 0)) ++ + /* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 +@@ -868,6 +874,9 @@ + extern void ext3_ext_release(struct super_block *); + extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); + ++extern int ext3_add_dir_entry(struct dentry *dentry); ++ ++extern int ext3_del_dir_entry(struct dentry *dentry); + #endif /* __KERNEL__ */ + + #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) +Index: linux-2.6.10/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:56:02.964945744 +0800 +@@ -81,6 +81,7 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ u32 s_mdsnum; + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch new file mode 100644 index 0000000..022b8d0 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch @@ -0,0 +1,1202 @@ + fs/ext3/ialloc.c | 3 + fs/ext3/inode.c | 3 + fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 14 + + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 500 insertions(+), 109 deletions(-) + +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 19:44:54.251322480 +0800 +@@ -458,6 +458,9 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ dynlock_init(&ei->i_htree_lock); ++ 
sema_init(&ei->i_rename_sem, 1); ++ sema_init(&ei->i_append_sem, 1); + return &ei->vfs_inode; + } + +@@ -588,7 +591,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -637,6 +640,7 @@ + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, ++ {Opt_pdirops, "pdirops"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 19:48:53.958881392 +0800 +@@ -53,6 +53,9 @@ + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -60,6 +63,8 @@ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -133,6 +138,8 @@ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -141,6 +148,30 @@ + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -152,7 +183,7 @@ + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -164,15 +195,18 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); ++static void *ext3_lock_htree(struct inode *, unsigned long, int); ++static void 
ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -316,6 +350,94 @@ + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ ++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? */ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -325,19 +447,20 @@ + * back to userspace. 
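++ * With pdirops each frame additionally records frame->leaf and
++ * frame->curidx; after taking the per-block htree lock a caller
++ * re-validates the path with dx_check_path()/dx_check_full_path()
++ * and repeats the probe if an index split moved the hash it wants.
++ * The path must always be released again with dx_release().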
+ */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -353,8 +476,8 @@ + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -366,7 +489,19 @@ + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -374,56 +509,46 @@ + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 
0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from begining if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ bh = frame->bh; ++ indirect++; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -437,8 +562,7 @@ + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ if (frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -479,8 +603,10 @@ + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -506,13 +632,17 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, &err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -673,7 +803,8 @@ + count++; + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -706,7 +837,8 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -718,6 +850,7 @@ + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -798,7 +931,8 @@ + * to brelse() it when appropriate. + */ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -814,6 +948,7 @@ + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -822,9 +957,10 @@ + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -833,8 +969,14 @@ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -907,12 +1049,17 @@ + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -927,11 +1074,21 @@ + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get splitted ++ * so, we need another leaf. check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -966,6 +1123,8 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -976,14 +1135,16 @@ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -1005,17 +1166,19 @@ + struct dentry dotdot; + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; ++ void *lock = NULL; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_parent = child; /* confusing, isn't it! 
*/ + +- bh = ext3_find_entry(&dotdot, &de); ++ bh = ext3_find_entry(&dotdot, &de, 0, &lock); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); ++ ext3_unlock_htree(child->d_inode, lock); + inode = iget(child->d_inode->i_sb, ino); + + if (!inode) +@@ -1054,7 +1217,8 @@ + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = +@@ -1068,7 +1232,8 @@ + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1090,7 +1255,8 @@ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void **target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1137,23 +1303,30 @@ + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
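++	 * It is the block covering hinfo->hash: when that is the newly
++	 * allocated block it is locked here, and the lock is handed back
++	 * to the caller through *target so that add_dirent_to_buf() can
++	 * run under it.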
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1227,7 +1400,8 @@ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1286,6 +1460,7 @@ + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; ++ void *lock, *new_lock; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); +@@ -1305,6 +1480,8 @@ + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); +@@ -1334,13 +1511,25 @@ + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1369,11 +1558,13 @@ + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1384,30 +1575,52 @@ + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) +- return retval; ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + if (retval != -ENOSPC) + return retval; + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } 
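++	/* Recap of the retry protocol used at the top of this function
++	 * (it only has teeth when pdirops is enabled):
++	 *
++	 *	lock = ext3_lock_htree(dir, 0, 1);
++	 *	if (is_dx(dir)) {
++	 *		ext3_unlock_htree(dir, lock);
++	 *		goto repeat;	(someone else built the htree)
++	 *	}
++	 */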
+ #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) +- return retval; +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = cpu_to_le16(rlen = blocksize); +- nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +@@ -1425,15 +1638,27 @@ + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; +- +- frame = dx_probe(dentry, NULL, &hinfo, frames, &err); ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; ++ ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; +- +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) +- goto cleanup; ++ /* we're going to chage leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get splitted ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); ++ goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1446,6 +1671,35 @@ + goto cleanup; + } + ++ /* our leaf has no enough space. hence, we have to ++ * split it. so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? */ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ /* FIXME: error handling here */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. 
we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1457,7 +1711,8 @@ + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1468,6 +1723,7 @@ + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1479,27 +1735,73 @@ + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ void *ri_lock; + +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's importand that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1508,38 +1810,60 @@ + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root. it's already initialized */ ++ /* we my unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index splitted */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = NULL; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1989,6 +2313,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ +@@ -1998,7 +2323,7 @@ + return PTR_ERR(handle); + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -2008,14 +2333,19 @@ + inode = dentry->d_inode; + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -2048,6 +2378,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ +@@ -2060,15 +2391,17 @@ + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -2077,6 +2410,7 @@ + inode->i_nlink = 1; + } + 
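++	/* the leaf lock taken by ext3_find_entry() above is still held
++	 * here, so the entry cannot move under us; it is dropped right
++	 * after ext3_delete_entry() below */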
retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2196,6 +2530,7 @@ + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2211,7 +2546,10 @@ + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process +@@ -2224,7 +2562,7 @@ + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2288,7 +2626,7 @@ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2331,6 +2669,14 @@ + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2339,6 +2685,29 @@ + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
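++ * When EXT3_MOUNT_PDIROPS (mount -o pdirops) is set, the operations
++ * above protect individual htree blocks through the primitives just
++ * defined, rather than relying on a single per-directory lock; the
++ * typical pattern, sketched:
++ *
++ *	void *lock = ext3_lock_htree(dir, block_nr, rwlock);
++ *	... read or modify that leaf/index block ...
++ *	ext3_unlock_htree(dir, lock);
++ *
++ * On a mount without pdirops ext3_lock_htree() returns NULL and both
++ * calls are no-ops.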
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.6.10/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-03-31 19:44:54.254322024 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct ext3_reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -125,6 +126,11 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:44:54.254322024 +0800 +@@ -355,6 +355,7 @@ + #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch new file mode 100644 index 0000000..d5de424 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch @@ -0,0 +1,192 @@ + fs/ext3/ialloc.c | 35 ++++++++++++++++++++++++++++++++++- + fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ + fs/ext3/namei.c | 21 +++++++++++++++++---- + include/linux/dcache.h | 5 +++++ + include/linux/ext3_fs.h | 5 ++++- + 5 files changed, 85 insertions(+), 6 deletions(-) + +Index: linux-2.6.10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-03-31 18:19:50.911148112 +0800 ++++ linux-2.6.10/fs/ext3/ialloc.c 2005-03-31 18:39:48.578075064 +0800 +@@ -419,7 +419,8 @@ + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) ++struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode, ++ unsigned long goal) + { + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; +@@ -447,6 +448,38 @@ + + sbi = EXT3_SB(sb); + es = sbi->s_es; ++ if (goal) { ++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); ++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); ++ gdp = ext3_get_group_desc(sb, group, &bh2); ++ ++ err = -EIO; ++ bitmap_bh = read_inode_bitmap (sb, group); ++ if (!bitmap_bh) ++ goto fail; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), ++ ino, bitmap_bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto continue_allocation; ++ } ++ ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. 
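++	 * The wanted inode number arrives through dentry->d_fsdata (see
++	 * EXT3_IOC_CREATE_INUM in ioctl.c and ext3_new_inode_wantedi()
++	 * in namei.c); a goal of 0 simply takes the regular allocation
++	 * path below.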
++ */ ++ goto got; ++ } ++ ++continue_allocation: + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 18:39:48.579074912 +0800 +@@ -9,6 +9,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -25,6 +26,31 @@ + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { ++ case EXT3_IOC_CREATE_INUM: { ++ char name[32]; ++ struct dentry *dchild, *dparent; ++ int rc = 0; ++ ++ dparent = list_entry(inode->i_dentry.next, struct dentry, ++ d_alias); ++ snprintf(name, sizeof name, "%lu", arg); ++ dchild = lookup_one_len(name, dparent, strlen(name)); ++ if (dchild->d_inode) { ++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", ++ dparent->d_name.len, dparent->d_name.name, arg, ++ dchild->d_inode->i_ino); ++ rc = -EEXIST; ++ } else { ++ dchild->d_fsdata = (void *)arg; ++ rc = vfs_create(inode, dchild, 0644, NULL); ++ if (rc) ++ printk(KERN_ERR "vfs_create: %d\n", rc); ++ else if (dchild->d_inode->i_ino != arg) ++ rc = -EEXIST; ++ } ++ dput(dchild); ++ return rc; ++ } + case EXT3_IOC_GETFLAGS: + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:36:12.177972880 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:39:48.582074456 +0800 +@@ -1940,6 +1940,19 @@ + return err; + } + ++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, ++ int mode, struct dentry *dentry) ++{ ++ unsigned long inum = 0; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct dentry_params *param = ++ (struct dentry_params *) dentry->d_fsdata; ++ inum = param->p_inum; ++ } ++ return ext3_new_inode(handle, dir, mode, inum); ++} ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1965,7 +1978,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; +@@ -1999,7 +2012,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +@@ -2035,7 +2048,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2450,7 +2463,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:38:11.720799608 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:40:36.630769944 +0800 +@@ -230,6 +230,7 @@ + #define 
EXT3_IOC_SETVERSION _IOW('f', 4, long) + #define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) + #define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) ++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ + #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG +@@ -742,7 +743,8 @@ + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); ++extern struct inode * ext3_new_inode (handle_t *, struct inode *, int, ++ unsigned long); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext3_count_free_inodes (struct super_block *); +@@ -834,4 +836,5 @@ + + #endif /* __KERNEL__ */ + ++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) + #endif /* _LINUX_EXT3_FS_H */ diff --git a/lustre/kernel_patches/patches/hostfs_readdir_large.patch b/lustre/kernel_patches/patches/hostfs_readdir_large.patch new file mode 100644 index 0000000..6ca6afd --- /dev/null +++ b/lustre/kernel_patches/patches/hostfs_readdir_large.patch @@ -0,0 +1,32 @@ +Index: linux-2.6.10/fs/hostfs/hostfs_user.c +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs_user.c 2004-12-25 05:35:15.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs_user.c 2005-03-31 19:26:03.810175656 +0800 +@@ -121,13 +121,26 @@ + { + DIR *dir = stream; + struct dirent *ent; ++ off_t off = 0; ++ off_t after_seek = 0; ++ off_t after_readdir = 0; ++ off_t after_readdir2 = 0; + + seekdir(dir, *pos); ++ after_seek = telldir(dir); + ent = readdir(dir); ++ after_readdir = telldir(dir); ++ if ( after_seek != after_readdir ) { ++ off = after_readdir; ++ } else { ++ readdir(dir); ++ after_readdir2 = telldir(dir); ++ off = after_readdir2; ++ } + if(ent == NULL) return(NULL); + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; +- *pos = telldir(dir); ++ *pos = off; + return(ent->d_name); + } + diff --git a/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch b/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch new file mode 100644 index 0000000..afbd4d9 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch @@ -0,0 +1,476 @@ + fs/ext3/inode.c | 3 + fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 15 ++ + fs/ext3/namei.c | 13 ++ + fs/ext3/super.c | 17 ++ + include/linux/ext3_fs.h | 2 + 7 files changed, 304 insertions(+), 1 deletion(-) + +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:25:13.635136112 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:25:13.801110880 +0800 +@@ -357,6 +357,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ ++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:25:13.726122280 +0800 ++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 
12:25:13.794111944 +0800 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -2411,6 +2412,9 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if (ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; + bh = iloc.bh; +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:25:13.728121976 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:25:13.797111488 +0800 +@@ -592,6 +592,7 @@ + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -641,6 +642,9 @@ + {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_pdirops, "pdirops"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.10/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/iopen.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/iopen.c 2005-04-05 12:25:13.791112400 +0800 +@@ -0,0 +1,274 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
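++ * The name is parsed with simple_strtoul(), so -- with the filesystem
++ * mounted -o iopen, and /mnt standing in for the mount point -- a call
++ * like
++ *
++ *	fd = open("/mnt/__iopen__/1234", O_RDONLY);
++ *
++ * resolves straight to inode 1234.  "." names the __iopen__ directory
++ * itself and ".." maps to EXT3_ROOT_INO.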
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++				   struct nameidata *nd)
++{
++	struct inode *inode;
++	unsigned long ino;
++	struct list_head *lp;
++	struct dentry *alternate;
++	char buf[IOPEN_NAME_LEN];
++
++	if (dentry->d_name.len >= IOPEN_NAME_LEN)
++		return ERR_PTR(-ENAMETOOLONG);
++
++	memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++	buf[dentry->d_name.len] = 0;
++
++	if (strcmp(buf, ".") == 0)
++		ino = dir->i_ino;
++	else if (strcmp(buf, "..") == 0)
++		ino = EXT3_ROOT_INO;
++	else
++		ino = simple_strtoul(buf, 0, 0);
++
++	if ((ino != EXT3_ROOT_INO &&
++	     //ino != EXT3_ACL_IDX_INO &&
++	     //ino != EXT3_ACL_DATA_INO &&
++	     ino < EXT3_FIRST_INO(dir->i_sb)) ||
++	    ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
++		return ERR_PTR(-ENOENT);
++
++	inode = iget(dir->i_sb, ino);
++	if (!inode)
++		return ERR_PTR(-EACCES);
++	if (is_bad_inode(inode)) {
++		iput(inode);
++		return ERR_PTR(-ENOENT);
++	}
++
++	assert(list_empty(&dentry->d_alias));		/* d_instantiate */
++	assert(d_unhashed(dentry));			/* d_rehash */
++
++	/* preferably return a connected dentry */
++	spin_lock(&dcache_lock);
++	list_for_each(lp, &inode->i_dentry) {
++		alternate = list_entry(lp, struct dentry, d_alias);
++		assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++	}
++
++	if (!list_empty(&inode->i_dentry)) {
++		alternate = list_entry(inode->i_dentry.next,
++				       struct dentry, d_alias);
++		dget_locked(alternate);
++		spin_lock(&alternate->d_lock);
++		alternate->d_flags |= DCACHE_REFERENCED;
++		spin_unlock(&alternate->d_lock);
++		iput(inode);
++		spin_unlock(&dcache_lock);
++		return alternate;
++	}
++	dentry->d_flags |= DCACHE_DISCONNECTED;
++
++	/* d_add(), but don't drop dcache_lock before adding dentry to inode */
++	list_add(&dentry->d_alias, &inode->i_dentry);	/* d_instantiate */
++	dentry->d_inode = inode;
++
++	__d_rehash(dentry);				/* d_rehash */
++	spin_unlock(&dcache_lock);
++
++	return NULL;
++}
++
++#define do_switch(x,y) do { \
++	__typeof__ (x) __tmp = x; \
++	x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++	const unsigned char *old_name, *new_name;
++
++	memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN);
++	old_name = target->d_name.name;
++	new_name = dentry->d_name.name;
++	if (old_name == target->d_iname)
++		old_name = dentry->d_iname;
++	if (new_name == dentry->d_iname)
++		new_name = target->d_iname;
++	target->d_name.name = new_name;
++	dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
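++ *
++ * This is called with rehash=1 from ext3_lookup() and with rehash=0
++ * from ext3_add_link() in namei.c; it returns the reused (formerly
++ * disconnected) alias, or NULL once the plain dentry has been
++ * instantiated.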
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++				    int rehash)
++{
++	struct dentry *tmp, *goal = NULL;
++	struct list_head *lp;
++
++	/* verify this dentry is really new */
++	assert(dentry->d_inode == NULL);
++	assert(list_empty(&dentry->d_alias));		/* d_instantiate */
++	if (rehash)
++		assert(d_unhashed(dentry));		/* d_rehash */
++	assert(list_empty(&dentry->d_subdirs));
++
++	spin_lock(&dcache_lock);
++	if (!inode)
++		goto do_rehash;
++
++	/* preferably return a connected dentry */
++	list_for_each(lp, &inode->i_dentry) {
++		tmp = list_entry(lp, struct dentry, d_alias);
++		if (tmp->d_flags & DCACHE_DISCONNECTED) {
++			assert(tmp->d_alias.next == &inode->i_dentry);
++			assert(tmp->d_alias.prev == &inode->i_dentry);
++			goal = tmp;
++			dget_locked(goal);
++			break;
++		}
++	}
++
++	if (!goal)
++		goto do_instantiate;
++
++	/* Move the goal to the dentry hash queue */
++	goal->d_flags &= ~ DCACHE_DISCONNECTED;
++	security_d_instantiate(goal, inode);
++	__d_rehash(dentry);
++	__d_move(goal, dentry);
++	spin_unlock(&dcache_lock);
++	iput(inode);
++
++	return goal;
++
++	/* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++	list_add(&dentry->d_alias, &inode->i_dentry);	/* d_instantiate */
++	dentry->d_inode = inode;
++do_rehash:
++	if (rehash)
++		__d_rehash(dentry);			/* d_rehash */
++	spin_unlock(&dcache_lock);
++
++	return NULL;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++	lookup:		iopen_lookup,		/* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++	read:		generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++	int	len;
++
++	len = strlen(name);
++	if (dentry->d_name.len != len)
++		return 0;
++	if (strncmp(dentry->d_name.name, name, len))
++		return 0;
++	return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and the dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++	struct inode *inode;
++
++	if (dir->i_ino != EXT3_ROOT_INO ||
++	    !test_opt(dir->i_sb, IOPEN) ||
++	    !match_dentry(dentry, "__iopen__"))
++		return 0;
++
++	inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++	if (!inode)
++		return 0;
++	d_add(dentry, inode);
++	return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
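++ *
++ * The pseudo directory borrows EXT3_BAD_INO as its inode number, so
++ * it can never collide with a real on-disk inode.  It is root-owned
++ * and mode 0500 by default; mounting with iopen_nopriv widens that
++ * to 0777.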
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.10/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.10.orig/fs/ext3/iopen.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/iopen.h 2005-04-05 12:25:13.792112248 +0800 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2004-12-25 05:33:52.000000000 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-04-05 12:25:13.633136416 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-04-05 12:25:13.799111184 +0800 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -1140,6 +1141,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { +@@ -1151,10 +1155,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2367,10 +2369,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/
+-	inode->i_size = 0;
+ 	ext3_orphan_add(handle, inode);
+ 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);
+@@ -2497,6 +2495,23 @@
+ 	return err;
+ }
+ 
++/* Like ext3_add_nondir() except for the call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++			 struct inode *inode)
++{
++	int err = ext3_add_entry(handle, dentry, inode);
++	if (!err) {
++		err = ext3_mark_inode_dirty(handle, inode);
++		if (err == 0) {
++			dput(iopen_connect_dentry(dentry, inode, 0));
++			return 0;
++		}
++	}
++	ext3_dec_count(handle, inode);
++	iput(inode);
++	return err;
++}
++
+ static int ext3_link (struct dentry * old_dentry,
+ 		struct inode * dir, struct dentry *dentry)
+ {
+@@ -2520,7 +2535,8 @@
+ 	ext3_inc_count(handle, inode);
+ 	atomic_inc(&inode->i_count);
+ 
+-	err = ext3_add_nondir(handle, dentry, inode);
++	err = ext3_add_link(handle, dentry, inode);
++	ext3_orphan_del(handle,inode);
+ 	ext3_journal_stop(handle);
+ 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ 		goto retry;
diff --git a/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch
new file mode 100644
index 0000000..64085b9
--- /dev/null
+++ b/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch
@@ -0,0 +1,222 @@
+--- 1.46/include/linux/jbd.h	2004-10-19 03:40:17 -06:00
++++ 1.47/include/linux/jbd.h	2004-11-07 19:13:24 -07:00
+@@ -352,6 +352,27 @@
+ 	bit_spin_unlock(BH_JournalHead, &bh->b_state);
+ }
+ 
++#define HAVE_JOURNAL_CALLBACK_STATUS
++/**
++ * struct journal_callback - Base structure for callback information.
++ * @jcb_list: list information for other callbacks attached to the same handle.
++ * @jcb_func: Function to call with this callback structure.
++ *
++ * This struct is a 'seed' structure for use with your own callback
++ * structs.  If you are using callbacks, you must allocate one of these
++ * or another struct of your own definition which has this struct
++ * as its first element and pass it to journal_callback_set().
++ *
++ * This is used internally by jbd to maintain callback information.
++ *
++ * See journal_callback_set for more information.
++ **/
++struct journal_callback {
++	struct list_head jcb_list;	/* t_jcb_lock */
++	void (*jcb_func)(struct journal_callback *jcb, int error);
++	/* user data goes here */
++};
++
+ struct jbd_revoke_table_s;
+ 
+ /**
+@@ -360,6 +381,7 @@
+  * @h_transaction: Which compound transaction is this update a part of?
+  * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+  * @h_ref: Reference count on this handle
++ * @h_jcb: List of application registered callbacks for this handle.
+  * @h_err: Field for caller's use to track errors through large fs operations
+  * @h_sync: flag for sync-on-close
+  * @h_jdata: flag to force data journaling
+@@ -385,6 +407,13 @@
+ 	/* operations */
+ 	int			h_err;
+ 
++	/*
++	 * List of application registered callbacks for this handle. The
++	 * function(s) will be called after the transaction that this handle is
++	 * part of has been committed to disk.
[t_jcb_lock] ++ */ ++ struct list_head h_jcb; ++ + /* Flags [no locking] */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -426,6 +455,8 @@ + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * ++ * t_handle_lock ++ * ->t_jcb_lock + */ + + struct transaction_s +@@ -549,6 +580,15 @@ + */ + int t_handle_count; + ++ /* ++ * Protects the callback list ++ */ ++ spinlock_t t_jcb_lock; ++ /* ++ * List of registered callback functions for this transaction. ++ * Called when the transaction is committed. [t_jcb_lock] ++ */ ++ struct list_head t_jcb; + }; + + /** +@@ -881,6 +921,10 @@ + extern int journal_try_to_free_buffers(journal_t *, struct page *, int); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); ++extern void journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); ++ + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); + +--- 1.23/fs/jbd/checkpoint.c 2003-07-10 23:23:54 -06:00 ++++ 1.24/fs/jbd/checkpoint.c 2004-11-07 19:13:24 -07:00 +@@ -616,6 +616,7 @@ + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_updates == 0); ++ J_ASSERT(list_empty(&transaction->t_jcb)); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + +--- 1.53/fs/jbd/commit.c 2004-10-19 03:40:17 -06:00 ++++ 1.54/fs/jbd/commit.c 2004-11-07 19:13:24 -07:00 +@@ -686,6 +686,30 @@ + if (err) + __journal_abort_hard(journal); + ++ /* ++ * Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. ++ * ++ * The spinlocking (t_jcb_lock) here is surely unnecessary... 
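++	 * by the time we get here t_updates has dropped to zero, so no
++	 * handle can still be attached and splicing new entries onto t_jcb
++	 * in journal_stop().  We keep it anyway, since each jcb_func() is
++	 * invoked with the lock dropped and the list is walked with
++	 * list_for_each_safe().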
++ */ ++ spin_lock(&commit_transaction->t_jcb_lock); ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ jcb->jcb_func(jcb, error); ++ spin_lock(&commit_transaction->t_jcb_lock); ++ } ++ } ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + +--- 1.77/fs/jbd/journal.c 2004-09-21 20:58:08 -06:00 ++++ 1.78/fs/jbd/journal.c 2004-11-07 19:13:24 -07:00 +@@ -55,6 +55,7 @@ + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); ++EXPORT_SYMBOL(journal_callback_set); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); +@@ -78,6 +79,7 @@ + EXPORT_SYMBOL(journal_blocks_per_page); + EXPORT_SYMBOL(journal_invalidatepage); + EXPORT_SYMBOL(journal_try_to_free_buffers); ++EXPORT_SYMBOL(journal_bmap); + EXPORT_SYMBOL(journal_force_commit); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); + +--- 1.89/fs/jbd/transaction.c 2004-10-19 03:40:17 -06:00 ++++ 1.90/fs/jbd/transaction.c 2004-11-07 19:13:24 -07:00 +@@ -50,7 +50,9 @@ + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + spin_lock_init(&transaction->t_handle_lock); ++ spin_lock_init(&transaction->t_jcb_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer->expires = transaction->t_expires; +@@ -241,6 +243,7 @@ + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); + + return handle; + } +@@ -1274,6 +1277,36 @@ + } + + /** ++ * void journal_callback_set() - Register a callback function for this handle. ++ * @handle: handle to attach the callback to. ++ * @func: function to callback. ++ * @jcb: structure with additional information required by func() , and ++ * some space for jbd internal information. ++ * ++ * The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. ++ */ ++void journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ spin_lock(&handle->h_transaction->t_jcb_lock); ++ list_add_tail(&jcb->jcb_list, &handle->h_jcb); ++ spin_unlock(&handle->h_transaction->t_jcb_lock); ++ jcb->jcb_func = func; ++} ++ ++/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. 
+ *
+@@ -1338,6 +1371,11 @@
+ 		if (journal->j_barrier_count)
+ 			wake_up(&journal->j_wait_transaction_locked);
+ 	}
++
++	/* Move callbacks from the handle to the transaction. */
++	spin_lock(&transaction->t_jcb_lock);
++	list_splice(&handle->h_jcb, &transaction->t_jcb);
++	spin_unlock(&transaction->t_jcb_lock);
+ 
+ 	/*
+ 	 * If the handle is marked SYNC, we need to set another commit
+
diff --git a/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch b/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch
new file mode 100644
index 0000000..1ac66bc
--- /dev/null
+++ b/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch
@@ -0,0 +1,399 @@
+fix against credits leak in journal_release_buffer()
+
+The idea is to charge a buffer at the time of modification
+(journal_dirty_metadata()), not at the time of access
+(journal_get_*_access()).  Each buffer has a flag that the first call to
+journal_dirty_metadata() sets on the buffer.
+
+Signed-off-by: Alex Tomas
+
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c	2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/fs/ext3/ialloc.c	2005-03-31 18:11:10.672236448 +0800
+@@ -474,11 +474,9 @@
+ 	ino = ext3_find_next_zero_bit((unsigned long *)
+ 			bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
+ 	if (ino < EXT3_INODES_PER_GROUP(sb)) {
+-		int credits = 0;
+ 
+ 		BUFFER_TRACE(bitmap_bh, "get_write_access");
+-		err = ext3_journal_get_write_access_credits(handle,
+-						bitmap_bh, &credits);
++		err = ext3_journal_get_write_access(handle, bitmap_bh);
+ 		if (err)
+ 			goto fail;
+ 
+@@ -494,7 +492,7 @@
+ 			goto got;
+ 		}
+ 		/* we lost it */
+-		journal_release_buffer(handle, bitmap_bh, credits);
++		journal_release_buffer(handle, bitmap_bh);
+ 
+ 		if (++ino < EXT3_INODES_PER_GROUP(sb))
+ 			goto repeat_in_this_group;
+Index: linux-2.6.10/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.c	2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/ext3/xattr.c	2005-03-31 18:11:10.675235992 +0800
+@@ -507,8 +507,7 @@
+ 			goto skip_get_write_access;
+ 		/* ext3_journal_get_write_access() requires an unlocked bh,
+ 		   which complicates things here. */
+-		error = ext3_journal_get_write_access_credits(handle, bh,
+-							      &credits);
++		error = ext3_journal_get_write_access(handle, bh);
+ 		if (error)
+ 			goto cleanup;
+ 		ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
+@@ -525,7 +524,7 @@
+ 			if (ce)
+ 				mb_cache_entry_release(ce);
+ 			unlock_buffer(bh);
+-			journal_release_buffer(handle, bh, credits);
++			journal_release_buffer(handle, bh);
+ 		skip_get_write_access:
+ 			ea_bdebug(bh, "cloning");
+ 			header = kmalloc(bh->b_size, GFP_KERNEL);
+@@ -669,8 +668,7 @@
+ 			error = -EDQUOT;
+ 			if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+ 				unlock_buffer(new_bh);
+-				journal_release_buffer(handle, new_bh,
+-						       credits);
++				journal_release_buffer(handle, new_bh);
+ 				goto cleanup;
+ 			}
+ 			HDR(new_bh)->h_refcount = cpu_to_le32(1 +
+@@ -986,8 +984,7 @@
+ 		ext3_error(inode->i_sb, "ext3_xattr_cache_find",
+ 			   "inode %ld: block %ld read error",
+ 			   inode->i_ino, (unsigned long) ce->e_block);
+-	} else if (ext3_journal_get_write_access_credits(
+-						handle, bh, credits) == 0) {
++	} else if (ext3_journal_get_write_access(handle, bh) == 0) {
+ 		/* ext3_journal_get_write_access() requires an unlocked
+ 		 * bh, which complicates things here. */
+ 		lock_buffer(bh);
+@@ -1003,7 +1000,7 @@
+ 			return bh;
+ 		}
+ 		unlock_buffer(bh);
+-		journal_release_buffer(handle, bh, *credits);
++		journal_release_buffer(handle, bh);
+ 		*credits = 0;
+ 		brelse(bh);
+ 	}
+Index: linux-2.6.10/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/balloc.c	2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/fs/ext3/balloc.c	2005-03-31 18:14:05.705627328 +0800
+@@ -342,7 +342,7 @@
+ 	 */
+ 	/* @@@ check errors */
+ 	BUFFER_TRACE(bitmap_bh, "getting undo access");
+-	err = ext3_journal_get_undo_access(handle, bitmap_bh, NULL);
++	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ 	if (err)
+ 		goto error_return;
+ 
+@@ -986,7 +986,6 @@
+ 	unsigned long group_first_block;
+ 	int ret = 0;
+ 	int fatal;
+-	int credits = 0;
+ 
+ 	*errp = 0;
+ 
+@@ -996,7 +995,7 @@
+ 	 * if the buffer is in BJ_Forget state in the committing transaction.
+ 	 */
+ 	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+-	fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
++	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+ 	if (fatal) {
+ 		*errp = fatal;
+ 		return -1;
+@@ -1087,7 +1086,7 @@
+ 	}
+ 
+ 	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+-	ext3_journal_release_buffer(handle, bitmap_bh, credits);
++	ext3_journal_release_buffer(handle, bitmap_bh);
+ 	return ret;
+ }
+ 
+Index: linux-2.6.10/fs/jbd/commit.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/commit.c	2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/fs/jbd/commit.c	2005-03-31 18:11:10.668237056 +0800
+@@ -204,6 +204,19 @@
+ 	}
+ 
+ 	/*
++	 * First, drop the modified flag: all accesses to the buffers
++	 * will be tracked for a new transaction only -bzzz
++	 */
++	if (commit_transaction->t_buffers) {
++		new_jh = jh = commit_transaction->t_buffers->b_tnext;
++		do {
++			J_ASSERT_JH(new_jh, new_jh->b_modified == 1);
++			new_jh->b_modified = 0;
++			new_jh = new_jh->b_tnext;
++		} while (new_jh != jh);
++	}
++
++	/*
+ 	 * Now try to drop any written-back buffers from the journal's
+ 	 * checkpoint lists.  We do this *before* commit because it potentially
+ 	 * frees some memory
+Index: linux-2.6.10/fs/jbd/transaction.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/transaction.c	2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/jbd/transaction.c	2005-03-31 18:11:10.666237360 +0800
+@@ -522,7 +522,7 @@
+  */
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh,
+-			int force_copy, int *credits)
++			int force_copy)
+ {
+ 	struct buffer_head *bh;
+ 	transaction_t *transaction;
+@@ -604,11 +604,6 @@
+ 			JBUFFER_TRACE(jh, "has frozen data");
+ 			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 			jh->b_next_transaction = transaction;
+-
+-			J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+-			handle->h_buffer_credits--;
+-			if (credits)
+-				(*credits)++;
+ 			goto done;
+ 		}
+ 
+@@ -688,10 +683,6 @@
+ 		jh->b_next_transaction = transaction;
+ 	}
+ 
+-	J_ASSERT(handle->h_buffer_credits > 0);
+-	handle->h_buffer_credits--;
+-	if (credits)
+-		(*credits)++;
+ 
+ 	/*
+ 	 * Finally, if the buffer is not journaled right now, we need to make
+@@ -749,8 +740,7 @@
+  * because we're write()ing a buffer which is also part of a shared mapping.
+ */ + +-int journal_get_write_access(handle_t *handle, +- struct buffer_head *bh, int *credits) ++int journal_get_write_access(handle_t *handle, struct buffer_head *bh) + { + struct journal_head *jh = journal_add_journal_head(bh); + int rc; +@@ -758,7 +748,7 @@ + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ +- rc = do_get_write_access(handle, jh, 0, credits); ++ rc = do_get_write_access(handle, jh, 0); + journal_put_journal_head(jh); + return rc; + } +@@ -814,9 +804,6 @@ + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + +- J_ASSERT_JH(jh, handle->h_buffer_credits > 0); +- handle->h_buffer_credits--; +- + if (jh->b_transaction == NULL) { + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); +@@ -869,8 +856,7 @@ + * + * Returns error number or 0 on success. + */ +-int journal_get_undo_access(handle_t *handle, struct buffer_head *bh, +- int *credits) ++int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) + { + int err; + struct journal_head *jh = journal_add_journal_head(bh); +@@ -883,7 +869,7 @@ + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ +- err = do_get_write_access(handle, jh, 1, credits); ++ err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +@@ -1111,6 +1097,17 @@ + + jbd_lock_bh_state(bh); + ++ if (jh->b_modified == 0) { ++ /* ++ * This buffer's got modified and becoming part ++ * of the transaction. This needs to be done ++ * once a transaction -bzzz ++ */ ++ jh->b_modified = 1; ++ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); ++ handle->h_buffer_credits--; ++ } ++ + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. +@@ -1161,24 +1158,11 @@ + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * +- * The caller passes in the number of credits which should be put back for +- * this buffer (zero or one). +- * +- * We leave the buffer attached to t_reserved_list because even though this +- * handle doesn't want it, some other concurrent handle may want to journal +- * this buffer. If that handle is curently in between get_write_access() and +- * journal_dirty_metadata() then it expects the buffer to be reserved. If +- * we were to rip it off t_reserved_list here, the other handle will explode +- * when journal_dirty_metadata is presented with a non-reserved buffer. +- * +- * If nobody really wants to journal this buffer then it will be thrown +- * away at the start of commit. 
+ */ + void +-journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits) ++journal_release_buffer(handle_t *handle, struct buffer_head *bh) + { + BUFFER_TRACE(bh, "entry"); +- handle->h_buffer_credits += credits; + } + + /** +@@ -1222,6 +1206,12 @@ + goto not_jbd; + } + ++ /* ++ * The buffer's going from the transaction, we must drop ++ * all references -bzzz ++ */ ++ jh->b_modified = 0; ++ + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + +@@ -2015,7 +2005,10 @@ + __journal_unfile_buffer(jh); + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; +- __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); ++ if (jh->b_modified == 1) ++ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); ++ else ++ __journal_file_buffer(jh, jh->b_transaction, BJ_Reserved); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) +Index: linux-2.6.10/include/linux/journal-head.h +=================================================================== +--- linux-2.6.10.orig/include/linux/journal-head.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/journal-head.h 2005-03-31 18:11:10.658238576 +0800 +@@ -32,6 +32,13 @@ + unsigned b_jlist; + + /* ++ * This flag signals the buffer has been modified by ++ * the currently running transaction ++ * [jbd_lock_bh_state()] ++ */ ++ unsigned b_modified; ++ ++ /* + * Copy of the buffer data frozen for writing to the log. + * [jbd_lock_bh_state()] + */ +Index: linux-2.6.10/include/linux/jbd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/jbd.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/jbd.h 2005-03-31 18:12:52.504755552 +0800 +@@ -867,15 +867,12 @@ + extern handle_t *journal_start(journal_t *, int nblocks); + extern int journal_restart (handle_t *, int nblocks); + extern int journal_extend (handle_t *, int nblocks); +-extern int journal_get_write_access(handle_t *, struct buffer_head *, +- int *credits); ++extern int journal_get_write_access(handle_t *, struct buffer_head *); + extern int journal_get_create_access (handle_t *, struct buffer_head *); +-extern int journal_get_undo_access(handle_t *, struct buffer_head *, +- int *credits); ++extern int journal_get_undo_access(handle_t *, struct buffer_head *); + extern int journal_dirty_data (handle_t *, struct buffer_head *); + extern int journal_dirty_metadata (handle_t *, struct buffer_head *); +-extern void journal_release_buffer (handle_t *, struct buffer_head *, +- int credits); ++extern void journal_release_buffer (handle_t *, struct buffer_head *); + extern int journal_forget (handle_t *, struct buffer_head *); + extern void journal_sync_buffer (struct buffer_head *); + extern int journal_invalidatepage(journal_t *, +Index: linux-2.6.10/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_jbd.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_jbd.h 2005-03-31 18:11:10.660238272 +0800 +@@ -113,9 +113,9 @@ + + static inline int + __ext3_journal_get_undo_access(const char *where, handle_t *handle, +- struct buffer_head *bh, int *credits) ++ struct buffer_head *bh) + { +- int err = journal_get_undo_access(handle, bh, credits); ++ int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +@@ -123,19 +123,18 @@ + + static 
inline int
+ __ext3_journal_get_write_access(const char *where, handle_t *handle,
+-				struct buffer_head *bh, int *credits)
++				struct buffer_head *bh)
+ {
+-	int err = journal_get_write_access(handle, bh, credits);
++	int err = journal_get_write_access(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline void
+-ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh,
+-			    int credits)
++ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+ {
+-	journal_release_buffer(handle, bh, credits);
++	journal_release_buffer(handle, bh);
+ }
+ 
+ static inline int
+@@ -178,12 +177,10 @@
+ }
+ 
+ 
+-#define ext3_journal_get_undo_access(handle, bh, credits) \
+-	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh), (credits))
++#define ext3_journal_get_undo_access(handle, bh) \
++	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+-	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), NULL)
+-#define ext3_journal_get_write_access_credits(handle, bh, credits) \
+-	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), (credits))
++	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+ 	__ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
diff --git a/lustre/kernel_patches/patches/kgdb-ga.patch b/lustre/kernel_patches/patches/kgdb-ga.patch
new file mode 100644
index 0000000..679853f
--- /dev/null
+++ b/lustre/kernel_patches/patches/kgdb-ga.patch
@@ -0,0 +1,6358 @@
+
+
+This kgdb will get called and will trap almost any kernel
+fault WITHOUT BEING ARMED.
+
+It is entered at boot time via "kgdb" in the boot string,
+not "gdb".  This entry occurs when the first setup on the
+boot string is called, not sometime later.  You will not
+find a "waiting for gdb" on your console, as the console has
+not yet been enabled at this time.  (Note, this early stuff
+is a bit fragile as the full trap table has yet to be
+loaded, something I might address, sometime...  So don't try
+to look at memory that can not be reached, for example.
+Once the full trap table is loaded this restriction goes
+away.)
+
+If you hard code it, you can put a breakpoint() as the FIRST
+LINE OF C CODE.
+
+It does NOT use the serial driver, but if the serial driver
+is loaded, it tells it to release the port to avoid
+conflict.
+
+The threads stuff is not configurable, does not require
+redirection of schedule() calls and does backtrack to the
+first non-schedule() caller on the info threads command.  If
+you switch to the thread, however, it will show it in the
+switch code (as it should).
+
+It is MUCH more aggressive and paranoid about grabbing the
+other cpus on entry.  It issues a "send_nmi_all_but_self()"
+rather than depending on them to interrupt or hit an NMI
+sometime in the distant future.  If a cpu does not come to
+the party, it will continue without it so all is not lost.
+
+It does not have anything to do with IOCTL calls, but does
+do the control-C thing.
+
+There is a LOT of info in the patch which ends up in
+.../Documentation/i386/kgdb/*
+
+There is a nifty little thing called kgdb_ts() (kgdb time
+stamp) which is a function you can code calls to and which
+puts some useful stuff in a circular buffer which can be
+examined with the supplied gdb macros.
+
+It also allows you to do "p foobar(...)", i.e.
+to call a function from gdb, just like gdb allows in program
+debugging.
+
+In an SMP system, you can choose to "hold" any given set of
+cpus.  It also defaults to holding other cpus on single step
+(this can be overridden).
+
+This said, you can imagine my consternation when I found it
+"lost it" on continues on 2.5.  I found and fixed this early
+this pm; it was a hold-cpu-on-exit goof on my part.
+
+Oh, and a final point, the configure options are more
+extensive (the serial port is set up here, for example, since
+we can not wait for a command line to do this).  There is one
+to do system call exit tests.  This is VERY new and causes the
+kernel to hit a hard "int 3" if a system call attempts to
+exit with preempt count other than zero.  This is a fault,
+of course, but the current 2.5 is full of them so I don't
+recommend turning this on.
+
+
+DESC
+kgdb: warning fix
+EDESC
+From: Ingo Molnar
+
+this patch fixes a deprecated use of asm input operands.  (and shuts up a
+gcc 3.3 warning.)
+
+DESC
+kgdb buffer overflow fix
+EDESC
+From: George Anzinger
+
+
+DESC
+kgdb: CONFIG_DEBUG_INFO fix
+EDESC
+From: Thomas Schlichter
+
+that patch sets DEBUG_INFO to y by default, even if neither DEBUG_KERNEL nor
+KGDB is enabled.  The attached patch changes this to enable DEBUG_INFO by
+default only if KGDB is enabled.
+
+DESC
+x86_64 fixes
+EDESC
+From: Andi Kleen
+
+Fix x86_64 for kgdb.  We forget why.
+DESC
+correct kgdb.txt Documentation link (against 2.6.1-rc1-mm2)
+EDESC
+From: Jesper Juhl
+
+The help text for "config KGDB" in arch/i386/Kconfig refers to
+Documentation/i386/kgdb.txt - the actual location is
+Documentation/i386/kgdb/kgdb.txt - patch below to fix that.
+
+DESC
+kgdb: fix for recent gcc
+EDESC
+
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+
+DESC
+kgdb warning fixes
+EDESC
+
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'time' might be used uninitialized in this function
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'dum' might be used uninitialized in this function
+DESC
+THREAD_SIZE fixes for kgdb
+EDESC
+From: Matt Mackall
+
+Noticed the THREAD_SIZE clean-ups are in -mm now.  Here are the missing
+bits for kgdb, tested in -tiny with 4k stacks.
+DESC
+Fix stack overflow test for non-8k stacks
+EDESC
+From: Matt Mackall
+
+This is needed to work properly with 4k and 16k stacks.
+DESC
+kgdb-ga.patch fix for i386 single-step into sysenter
+EDESC
+From: Roland McGrath
+
+Using kgdb-ga.patch from -mm, if userland single-steps (PTRACE_SINGLESTEP)
+into the `sysenter' instruction, kgdb reports a bogus trap:
+
+	Program received signal SIGTRAP, Trace/breakpoint trap.
+	sysenter_past_esp () at arch/i386/kernel/entry.S:249
+	1: x/i $pc  0xc0106023 : sti
+	(gdb)
+
+The hackery in the "FIX_STACK" macro in entry.S changes the saved PC for
+the spurious kernel-mode debug trap when TF was set on user-mode execution
+of `sysenter', so sysenter_past_esp is where it actually lies in this case.
+The following patch removes the kgdb hiccup when userland
+PTRACE_SINGLESTEP's into sysenter.
+DESC
+fix TRAP_BAD_SYSCALL_EXITS on i386
+EDESC
+From: Andy Whitcroft
+
+We are not using the right offset name, nor the right address when checking
+for a non-zero preempt count.  Move to TI_preempt_count(%ebp).
+
+Signed-off-by: Andy Whitcroft
+DESC
+add TRAP_BAD_SYSCALL_EXITS config for i386
+EDESC
+From: Andy Whitcroft
+
+There seems to be code recently added to -bk and thereby -mm which supports
+extra debug for preempt on system call exit.  Oddly there don't seem to
+be configuration options to enable them.  Below is a possible patch to
+allow enabling this on i386.  Sadly the most obvious menu to add this to is
+the Kernel Hacking menu, but that is defined in architecture-specific
+configuration.  If this makes sense I could patch the other arches?
+
+Add a configuration option to the Kernel Hacking menu to allow enabling
+TRAP_BAD_SYSCALL_EXITS.
+
+Signed-off-by: Andy Whitcroft
+DESC
+kgdb-is-incompatible-with-kprobes
+EDESC
+DESC
+kgdb-ga-build-fix
+EDESC
+DESC
+kgdb-ga-fixes
+EDESC
+Signed-off-by: Andrew Morton
+Index: linux-2.6.10/include/asm-i386/kgdb_local.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kgdb_local.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/kgdb_local.h	2005-04-05 12:48:05.371600472 +0800
+@@ -0,0 +1,102 @@
++#ifndef __KGDB_LOCAL
++#define __KGDB_LOCAL
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#define PORT 0x3f8
++#ifdef CONFIG_KGDB_PORT
++#undef PORT
++#define PORT CONFIG_KGDB_PORT
++#endif
++#define IRQ 4
++#ifdef CONFIG_KGDB_IRQ
++#undef IRQ
++#define IRQ CONFIG_KGDB_IRQ
++#endif
++#define SB_CLOCK 1843200
++#define SB_BASE (SB_CLOCK/16)
++#define SB_BAUD9600 SB_BASE/9600
++#define SB_BAUD192 SB_BASE/19200
++#define SB_BAUD384 SB_BASE/38400
++#define SB_BAUD576 SB_BASE/57600
++#define SB_BAUD1152 SB_BASE/115200
++#ifdef CONFIG_KGDB_9600BAUD
++#define SB_BAUD SB_BAUD9600
++#endif
++#ifdef CONFIG_KGDB_19200BAUD
++#define SB_BAUD SB_BAUD192
++#endif
++#ifdef CONFIG_KGDB_38400BAUD
++#define SB_BAUD SB_BAUD384
++#endif
++#ifdef CONFIG_KGDB_57600BAUD
++#define SB_BAUD SB_BAUD576
++#endif
++#ifdef CONFIG_KGDB_115200BAUD
++#define SB_BAUD SB_BAUD1152
++#endif
++#ifndef SB_BAUD
++#define SB_BAUD SB_BAUD1152	/* Start with this if not given */
++#endif
++
++#ifndef CONFIG_X86_TSC
++#undef rdtsc
++#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;}
++#undef rdtscll
++#define rdtscll(s) s++
++#endif
++
++#ifdef _raw_read_unlock	/* must use a name that is "define"ed, not an inline */
++#undef spin_lock
++#undef spin_trylock
++#undef spin_unlock
++#define spin_lock	_raw_spin_lock
++#define spin_trylock	_raw_spin_trylock
++#define spin_unlock	_raw_spin_unlock
++#else
++#endif
++#undef spin_unlock_wait
++#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \
++				while(spin_is_locked(x))
++
++#define SB_IER 1
++#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS
++
++#define FLAGS 0
++#define SB_STATE { \
++     magic: SSTATE_MAGIC, \
++     baud_base: SB_BASE, \
++     port: PORT, \
++     irq: IRQ, \
++     flags: FLAGS, \
++     
custom_divisor:SB_BAUD} ++#define SB_INFO { \ ++ magic: SERIAL_MAGIC, \ ++ port: PORT,0,FLAGS, \ ++ state: &state, \ ++ tty: (struct tty_struct *)&state, \ ++ IER: SB_IER, \ ++ MCR: SB_MCR} ++extern void putDebugChar(int); ++/* RTAI support needs us to really stop/start interrupts */ ++ ++#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") ++#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") ++#define kgdb_local_save_flags(x) __asm__ __volatile__(\ ++ "pushfl ; popl %0":"=g" (x): /* no input */) ++#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ ++ "pushl %0 ; popfl": \ ++ /* no output */ :"g" (x):"memory", "cc") ++#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() ++ ++#ifdef CONFIG_SERIAL ++extern void shutdown_for_kgdb(struct async_struct *info); ++#endif ++#define INIT_KDEBUG putDebugChar("+"); ++#endif /* __KGDB_LOCAL */ +Index: linux-2.6.10/include/asm-i386/kgdb.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kgdb.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/kgdb.h 2005-04-05 12:48:05.399596216 +0800 +@@ -0,0 +1,59 @@ ++#ifndef __KGDB ++#define __KGDB ++ ++/* ++ * This file should not include ANY others. This makes it usable ++ * most anywhere without the fear of include order or inclusion. ++ * Make it so! ++ * ++ * This file may be included all the time. It is only active if ++ * CONFIG_KGDB is defined, otherwise it stubs out all the macros ++ * and entry points. ++ */ ++#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) ++ ++extern void breakpoint(void); ++#define INIT_KGDB_INTS kgdb_enable_ints() ++ ++#ifndef BREAKPOINT ++#define BREAKPOINT asm(" int $3") ++#endif ++/* ++ * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' ++ * pointer to its routine and it will be entered as the first thing ++ * when a trap occurs. ++ * ++ * Return values are, at present, undefined. ++ * ++ * The debug hook routine does not necessarily return to its caller. ++ * It has the register image and thus may choose to resume execution ++ * anywhere it pleases. ++ */ ++struct pt_regs; ++ ++extern int kgdb_handle_exception(int trapno, ++ int signo, int err_code, struct pt_regs *regs); ++extern int in_kgdb(struct pt_regs *regs); ++ ++#ifdef CONFIG_KGDB_TS ++void kgdb_tstamp(int line, char *source, int data0, int data1); ++/* ++ * This is the time stamp function. The macro adds the source info and ++ * does a cast on the data to allow most any 32-bit value. ++ */ ++ ++#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) ++#else ++#define kgdb_ts(data0,data1) ++#endif ++#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ ++#ifndef BREAKPOINT ++#define BREAKPOINT ++#endif ++#define kgdb_ts(data0,data1) ++#define in_kgdb ++#define kgdb_handle_exception ++#define breakpoint ++#define INIT_KGDB_INTS ++#endif ++#endif /* __KGDB */ +Index: linux-2.6.10/include/asm-i386/bugs.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/bugs.h 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/bugs.h 2005-04-05 12:48:05.398596368 +0800 +@@ -1,11 +1,11 @@ + /* + * include/asm-i386/bugs.h + * +- * Copyright (C) 1994 Linus Torvalds ++ * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), +- * ++ * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). 
+ *
+@@ -25,7 +25,20 @@
+ #include
+ #include
+ #include
+-
++#ifdef CONFIG_KGDB
++/*
++ * Provide the command line "gdb" initial break
++ */
++int __init kgdb_initial_break(char * str)
++{
++	if (*str == '\0'){
++		breakpoint();
++		return 1;
++	}
++	return 0;
++}
++__setup("gdb",kgdb_initial_break);
++#endif
+ static int __init no_halt(char *s)
+ {
+ 	boot_cpu_data.hlt_works_ok = 0;
+@@ -140,7 +153,7 @@
+ 		: "ecx", "edi" );
+ 	/* If this fails, it means that any user program may lock the CPU hard. Too bad. */
+ 	if (res != 12345678) printk( "Buggy.\n" );
+-	else printk( "OK.\n" );
++	else printk( "OK.\n" );
+ #endif
+ }
+ 
+Index: linux-2.6.10/include/linux/serial_core.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/serial_core.h	2004-12-25 05:34:00.000000000 +0800
++++ linux-2.6.10/include/linux/serial_core.h	2005-04-05 12:48:05.367601080 +0800
+@@ -184,7 +184,6 @@
+ 	unsigned char	x_char;			/* xon/xoff char */
+ 	unsigned char	regshift;		/* reg offset shift */
+ 	unsigned char	iotype;			/* io access style */
+-
+ #define UPIO_PORT	(0)
+ #define UPIO_HUB6	(1)
+ #define UPIO_MEM	(2)
+Index: linux-2.6.10/include/linux/dwarf2.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dwarf2.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dwarf2.h	2005-04-05 12:48:05.369600776 +0800
+@@ -0,0 +1,738 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++   debugging information format.
++   Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002
++   Free Software Foundation, Inc.
++
++   Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
++   Office (AJPO), Florida State Unviversity and Silicon Graphics Inc.
++   provided support for this effort -- June 21, 1995.
++
++   Derived from the DWARF 1 implementation written by Ron Guilmette
++   (rfg@netcom.com), November 1990.
++
++   This file is part of GCC.
++
++   GCC is free software; you can redistribute it and/or modify it under
++   the terms of the GNU General Public License as published by the Free
++   Software Foundation; either version 2, or (at your option) any later
++   version.
++
++   GCC is distributed in the hope that it will be useful, but WITHOUT
++   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
++   License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with GCC; see the file COPYING.  If not, write to the Free
++   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++   02111-1307, USA.  */
++
++/* This file is derived from the DWARF specification (a public document)
++   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++   Programming Languages Special Interest Group (UI/PLSIG) and distributed
++   by UNIX International.  Copies of this specification are available from
++   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++   This file also now contains definitions from the DWARF 3 specification.  */
++
++/* This file is shared between GCC and GDB, and should not contain
++   prototypes.  */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.
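++   Each DWARF2_External_* struct below mirrors the raw on-disk byte
++   encoding as arrays of unsigned char; the matching DWARF2_Internal_*
++   struct carries the same fields decoded into host types.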
*/ ++#ifndef __ASSEMBLY__ ++typedef struct ++{ ++ unsigned char li_length [4]; ++ unsigned char li_version [2]; ++ unsigned char li_prologue_length [4]; ++ unsigned char li_min_insn_length [1]; ++ unsigned char li_default_is_stmt [1]; ++ unsigned char li_line_base [1]; ++ unsigned char li_line_range [1]; ++ unsigned char li_opcode_base [1]; ++} ++DWARF2_External_LineInfo; ++ ++typedef struct ++{ ++ unsigned long li_length; ++ unsigned short li_version; ++ unsigned int li_prologue_length; ++ unsigned char li_min_insn_length; ++ unsigned char li_default_is_stmt; ++ int li_line_base; ++ unsigned char li_line_range; ++ unsigned char li_opcode_base; ++} ++DWARF2_Internal_LineInfo; ++ ++/* Structure found in .debug_pubnames section. */ ++typedef struct ++{ ++ unsigned char pn_length [4]; ++ unsigned char pn_version [2]; ++ unsigned char pn_offset [4]; ++ unsigned char pn_size [4]; ++} ++DWARF2_External_PubNames; ++ ++typedef struct ++{ ++ unsigned long pn_length; ++ unsigned short pn_version; ++ unsigned long pn_offset; ++ unsigned long pn_size; ++} ++DWARF2_Internal_PubNames; ++ ++/* Structure found in .debug_info section. */ ++typedef struct ++{ ++ unsigned char cu_length [4]; ++ unsigned char cu_version [2]; ++ unsigned char cu_abbrev_offset [4]; ++ unsigned char cu_pointer_size [1]; ++} ++DWARF2_External_CompUnit; ++ ++typedef struct ++{ ++ unsigned long cu_length; ++ unsigned short cu_version; ++ unsigned long cu_abbrev_offset; ++ unsigned char cu_pointer_size; ++} ++DWARF2_Internal_CompUnit; ++ ++typedef struct ++{ ++ unsigned char ar_length [4]; ++ unsigned char ar_version [2]; ++ unsigned char ar_info_offset [4]; ++ unsigned char ar_pointer_size [1]; ++ unsigned char ar_segment_size [1]; ++} ++DWARF2_External_ARange; ++ ++typedef struct ++{ ++ unsigned long ar_length; ++ unsigned short ar_version; ++ unsigned long ar_info_offset; ++ unsigned char ar_pointer_size; ++ unsigned char ar_segment_size; ++} ++DWARF2_Internal_ARange; ++ ++#define ENUM(name) enum name { ++#define IF_NOT_ASM(a) a ++#define COMMA , ++#else ++#define ENUM(name) ++#define IF_NOT_ASM(a) ++#define COMMA ++ ++#endif ++ ++/* Tag names and codes. 
*/ ++ENUM(dwarf_tag) ++ ++ DW_TAG_padding = 0x00 COMMA ++ DW_TAG_array_type = 0x01 COMMA ++ DW_TAG_class_type = 0x02 COMMA ++ DW_TAG_entry_point = 0x03 COMMA ++ DW_TAG_enumeration_type = 0x04 COMMA ++ DW_TAG_formal_parameter = 0x05 COMMA ++ DW_TAG_imported_declaration = 0x08 COMMA ++ DW_TAG_label = 0x0a COMMA ++ DW_TAG_lexical_block = 0x0b COMMA ++ DW_TAG_member = 0x0d COMMA ++ DW_TAG_pointer_type = 0x0f COMMA ++ DW_TAG_reference_type = 0x10 COMMA ++ DW_TAG_compile_unit = 0x11 COMMA ++ DW_TAG_string_type = 0x12 COMMA ++ DW_TAG_structure_type = 0x13 COMMA ++ DW_TAG_subroutine_type = 0x15 COMMA ++ DW_TAG_typedef = 0x16 COMMA ++ DW_TAG_union_type = 0x17 COMMA ++ DW_TAG_unspecified_parameters = 0x18 COMMA ++ DW_TAG_variant = 0x19 COMMA ++ DW_TAG_common_block = 0x1a COMMA ++ DW_TAG_common_inclusion = 0x1b COMMA ++ DW_TAG_inheritance = 0x1c COMMA ++ DW_TAG_inlined_subroutine = 0x1d COMMA ++ DW_TAG_module = 0x1e COMMA ++ DW_TAG_ptr_to_member_type = 0x1f COMMA ++ DW_TAG_set_type = 0x20 COMMA ++ DW_TAG_subrange_type = 0x21 COMMA ++ DW_TAG_with_stmt = 0x22 COMMA ++ DW_TAG_access_declaration = 0x23 COMMA ++ DW_TAG_base_type = 0x24 COMMA ++ DW_TAG_catch_block = 0x25 COMMA ++ DW_TAG_const_type = 0x26 COMMA ++ DW_TAG_constant = 0x27 COMMA ++ DW_TAG_enumerator = 0x28 COMMA ++ DW_TAG_file_type = 0x29 COMMA ++ DW_TAG_friend = 0x2a COMMA ++ DW_TAG_namelist = 0x2b COMMA ++ DW_TAG_namelist_item = 0x2c COMMA ++ DW_TAG_packed_type = 0x2d COMMA ++ DW_TAG_subprogram = 0x2e COMMA ++ DW_TAG_template_type_param = 0x2f COMMA ++ DW_TAG_template_value_param = 0x30 COMMA ++ DW_TAG_thrown_type = 0x31 COMMA ++ DW_TAG_try_block = 0x32 COMMA ++ DW_TAG_variant_part = 0x33 COMMA ++ DW_TAG_variable = 0x34 COMMA ++ DW_TAG_volatile_type = 0x35 COMMA ++ /* DWARF 3. */ ++ DW_TAG_dwarf_procedure = 0x36 COMMA ++ DW_TAG_restrict_type = 0x37 COMMA ++ DW_TAG_interface_type = 0x38 COMMA ++ DW_TAG_namespace = 0x39 COMMA ++ DW_TAG_imported_module = 0x3a COMMA ++ DW_TAG_unspecified_type = 0x3b COMMA ++ DW_TAG_partial_unit = 0x3c COMMA ++ DW_TAG_imported_unit = 0x3d COMMA ++ /* SGI/MIPS Extensions. */ ++ DW_TAG_MIPS_loop = 0x4081 COMMA ++ /* GNU extensions. */ ++ DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ ++ DW_TAG_function_template = 0x4102 COMMA /* For C++. */ ++ DW_TAG_class_template = 0x4103 COMMA /* For C++. */ ++ DW_TAG_GNU_BINCL = 0x4104 COMMA ++ DW_TAG_GNU_EINCL = 0x4105 COMMA ++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ ++ DW_TAG_upc_shared_type = 0x8765 COMMA ++ DW_TAG_upc_strict_type = 0x8766 COMMA ++ DW_TAG_upc_relaxed_type = 0x8767 ++IF_NOT_ASM(};) ++ ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++ ++/* Flag that tells whether entry has a child or not. */ ++#define DW_children_no 0 ++#define DW_children_yes 1 ++ ++/* Form names and codes. */ ++ENUM(dwarf_form) ++ ++ DW_FORM_addr = 0x01 COMMA ++ DW_FORM_block2 = 0x03 COMMA ++ DW_FORM_block4 = 0x04 COMMA ++ DW_FORM_data2 = 0x05 COMMA ++ DW_FORM_data4 = 0x06 COMMA ++ DW_FORM_data8 = 0x07 COMMA ++ DW_FORM_string = 0x08 COMMA ++ DW_FORM_block = 0x09 COMMA ++ DW_FORM_block1 = 0x0a COMMA ++ DW_FORM_data1 = 0x0b COMMA ++ DW_FORM_flag = 0x0c COMMA ++ DW_FORM_sdata = 0x0d COMMA ++ DW_FORM_strp = 0x0e COMMA ++ DW_FORM_udata = 0x0f COMMA ++ DW_FORM_ref_addr = 0x10 COMMA ++ DW_FORM_ref1 = 0x11 COMMA ++ DW_FORM_ref2 = 0x12 COMMA ++ DW_FORM_ref4 = 0x13 COMMA ++ DW_FORM_ref8 = 0x14 COMMA ++ DW_FORM_ref_udata = 0x15 COMMA ++ DW_FORM_indirect = 0x16 ++IF_NOT_ASM(};) ++ ++/* Attribute names and codes. 
*/ ++ ++ENUM(dwarf_attribute) ++ ++ DW_AT_sibling = 0x01 COMMA ++ DW_AT_location = 0x02 COMMA ++ DW_AT_name = 0x03 COMMA ++ DW_AT_ordering = 0x09 COMMA ++ DW_AT_subscr_data = 0x0a COMMA ++ DW_AT_byte_size = 0x0b COMMA ++ DW_AT_bit_offset = 0x0c COMMA ++ DW_AT_bit_size = 0x0d COMMA ++ DW_AT_element_list = 0x0f COMMA ++ DW_AT_stmt_list = 0x10 COMMA ++ DW_AT_low_pc = 0x11 COMMA ++ DW_AT_high_pc = 0x12 COMMA ++ DW_AT_language = 0x13 COMMA ++ DW_AT_member = 0x14 COMMA ++ DW_AT_discr = 0x15 COMMA ++ DW_AT_discr_value = 0x16 COMMA ++ DW_AT_visibility = 0x17 COMMA ++ DW_AT_import = 0x18 COMMA ++ DW_AT_string_length = 0x19 COMMA ++ DW_AT_common_reference = 0x1a COMMA ++ DW_AT_comp_dir = 0x1b COMMA ++ DW_AT_const_value = 0x1c COMMA ++ DW_AT_containing_type = 0x1d COMMA ++ DW_AT_default_value = 0x1e COMMA ++ DW_AT_inline = 0x20 COMMA ++ DW_AT_is_optional = 0x21 COMMA ++ DW_AT_lower_bound = 0x22 COMMA ++ DW_AT_producer = 0x25 COMMA ++ DW_AT_prototyped = 0x27 COMMA ++ DW_AT_return_addr = 0x2a COMMA ++ DW_AT_start_scope = 0x2c COMMA ++ DW_AT_stride_size = 0x2e COMMA ++ DW_AT_upper_bound = 0x2f COMMA ++ DW_AT_abstract_origin = 0x31 COMMA ++ DW_AT_accessibility = 0x32 COMMA ++ DW_AT_address_class = 0x33 COMMA ++ DW_AT_artificial = 0x34 COMMA ++ DW_AT_base_types = 0x35 COMMA ++ DW_AT_calling_convention = 0x36 COMMA ++ DW_AT_count = 0x37 COMMA ++ DW_AT_data_member_location = 0x38 COMMA ++ DW_AT_decl_column = 0x39 COMMA ++ DW_AT_decl_file = 0x3a COMMA ++ DW_AT_decl_line = 0x3b COMMA ++ DW_AT_declaration = 0x3c COMMA ++ DW_AT_discr_list = 0x3d COMMA ++ DW_AT_encoding = 0x3e COMMA ++ DW_AT_external = 0x3f COMMA ++ DW_AT_frame_base = 0x40 COMMA ++ DW_AT_friend = 0x41 COMMA ++ DW_AT_identifier_case = 0x42 COMMA ++ DW_AT_macro_info = 0x43 COMMA ++ DW_AT_namelist_items = 0x44 COMMA ++ DW_AT_priority = 0x45 COMMA ++ DW_AT_segment = 0x46 COMMA ++ DW_AT_specification = 0x47 COMMA ++ DW_AT_static_link = 0x48 COMMA ++ DW_AT_type = 0x49 COMMA ++ DW_AT_use_location = 0x4a COMMA ++ DW_AT_variable_parameter = 0x4b COMMA ++ DW_AT_virtuality = 0x4c COMMA ++ DW_AT_vtable_elem_location = 0x4d COMMA ++ /* DWARF 3 values. */ ++ DW_AT_allocated = 0x4e COMMA ++ DW_AT_associated = 0x4f COMMA ++ DW_AT_data_location = 0x50 COMMA ++ DW_AT_stride = 0x51 COMMA ++ DW_AT_entry_pc = 0x52 COMMA ++ DW_AT_use_UTF8 = 0x53 COMMA ++ DW_AT_extension = 0x54 COMMA ++ DW_AT_ranges = 0x55 COMMA ++ DW_AT_trampoline = 0x56 COMMA ++ DW_AT_call_column = 0x57 COMMA ++ DW_AT_call_file = 0x58 COMMA ++ DW_AT_call_line = 0x59 COMMA ++ /* SGI/MIPS extensions. */ ++ DW_AT_MIPS_fde = 0x2001 COMMA ++ DW_AT_MIPS_loop_begin = 0x2002 COMMA ++ DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA ++ DW_AT_MIPS_epilog_begin = 0x2004 COMMA ++ DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA ++ DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA ++ DW_AT_MIPS_linkage_name = 0x2007 COMMA ++ DW_AT_MIPS_stride = 0x2008 COMMA ++ DW_AT_MIPS_abstract_name = 0x2009 COMMA ++ DW_AT_MIPS_clone_origin = 0x200a COMMA ++ DW_AT_MIPS_has_inlines = 0x200b COMMA ++ /* GNU extensions. */ ++ DW_AT_sf_names = 0x2101 COMMA ++ DW_AT_src_info = 0x2102 COMMA ++ DW_AT_mac_info = 0x2103 COMMA ++ DW_AT_src_coords = 0x2104 COMMA ++ DW_AT_body_begin = 0x2105 COMMA ++ DW_AT_body_end = 0x2106 COMMA ++ DW_AT_GNU_vector = 0x2107 COMMA ++ /* VMS extensions. */ ++ DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA ++ /* UPC extension. */ ++ DW_AT_upc_threads_scaled = 0x3210 ++IF_NOT_ASM(};) ++ ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. 
*/ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++ ++/* Location atom names and codes. */ ++ENUM(dwarf_location_atom) ++ ++ DW_OP_addr = 0x03 COMMA ++ DW_OP_deref = 0x06 COMMA ++ DW_OP_const1u = 0x08 COMMA ++ DW_OP_const1s = 0x09 COMMA ++ DW_OP_const2u = 0x0a COMMA ++ DW_OP_const2s = 0x0b COMMA ++ DW_OP_const4u = 0x0c COMMA ++ DW_OP_const4s = 0x0d COMMA ++ DW_OP_const8u = 0x0e COMMA ++ DW_OP_const8s = 0x0f COMMA ++ DW_OP_constu = 0x10 COMMA ++ DW_OP_consts = 0x11 COMMA ++ DW_OP_dup = 0x12 COMMA ++ DW_OP_drop = 0x13 COMMA ++ DW_OP_over = 0x14 COMMA ++ DW_OP_pick = 0x15 COMMA ++ DW_OP_swap = 0x16 COMMA ++ DW_OP_rot = 0x17 COMMA ++ DW_OP_xderef = 0x18 COMMA ++ DW_OP_abs = 0x19 COMMA ++ DW_OP_and = 0x1a COMMA ++ DW_OP_div = 0x1b COMMA ++ DW_OP_minus = 0x1c COMMA ++ DW_OP_mod = 0x1d COMMA ++ DW_OP_mul = 0x1e COMMA ++ DW_OP_neg = 0x1f COMMA ++ DW_OP_not = 0x20 COMMA ++ DW_OP_or = 0x21 COMMA ++ DW_OP_plus = 0x22 COMMA ++ DW_OP_plus_uconst = 0x23 COMMA ++ DW_OP_shl = 0x24 COMMA ++ DW_OP_shr = 0x25 COMMA ++ DW_OP_shra = 0x26 COMMA ++ DW_OP_xor = 0x27 COMMA ++ DW_OP_bra = 0x28 COMMA ++ DW_OP_eq = 0x29 COMMA ++ DW_OP_ge = 0x2a COMMA ++ DW_OP_gt = 0x2b COMMA ++ DW_OP_le = 0x2c COMMA ++ DW_OP_lt = 0x2d COMMA ++ DW_OP_ne = 0x2e COMMA ++ DW_OP_skip = 0x2f COMMA ++ DW_OP_lit0 = 0x30 COMMA ++ DW_OP_lit1 = 0x31 COMMA ++ DW_OP_lit2 = 0x32 COMMA ++ DW_OP_lit3 = 0x33 COMMA ++ DW_OP_lit4 = 0x34 COMMA ++ DW_OP_lit5 = 0x35 COMMA ++ DW_OP_lit6 = 0x36 COMMA ++ DW_OP_lit7 = 0x37 COMMA ++ DW_OP_lit8 = 0x38 COMMA ++ DW_OP_lit9 = 0x39 COMMA ++ DW_OP_lit10 = 0x3a COMMA ++ DW_OP_lit11 = 0x3b COMMA ++ DW_OP_lit12 = 0x3c COMMA ++ DW_OP_lit13 = 0x3d COMMA ++ DW_OP_lit14 = 0x3e COMMA ++ DW_OP_lit15 = 0x3f COMMA ++ DW_OP_lit16 = 0x40 COMMA ++ DW_OP_lit17 = 0x41 COMMA ++ DW_OP_lit18 = 0x42 COMMA ++ DW_OP_lit19 = 0x43 COMMA ++ DW_OP_lit20 = 0x44 COMMA ++ DW_OP_lit21 = 0x45 COMMA ++ DW_OP_lit22 = 0x46 COMMA ++ DW_OP_lit23 = 0x47 COMMA ++ DW_OP_lit24 = 0x48 COMMA ++ DW_OP_lit25 = 0x49 COMMA ++ DW_OP_lit26 = 0x4a COMMA ++ DW_OP_lit27 = 0x4b COMMA ++ DW_OP_lit28 = 0x4c COMMA ++ DW_OP_lit29 = 0x4d COMMA ++ DW_OP_lit30 = 0x4e COMMA ++ DW_OP_lit31 = 0x4f COMMA ++ DW_OP_reg0 = 0x50 COMMA ++ DW_OP_reg1 = 0x51 COMMA ++ DW_OP_reg2 = 0x52 COMMA ++ DW_OP_reg3 = 0x53 COMMA ++ DW_OP_reg4 = 0x54 COMMA ++ DW_OP_reg5 = 0x55 COMMA ++ DW_OP_reg6 = 0x56 COMMA ++ DW_OP_reg7 = 0x57 COMMA ++ DW_OP_reg8 = 0x58 COMMA ++ DW_OP_reg9 = 0x59 COMMA ++ DW_OP_reg10 = 0x5a COMMA ++ DW_OP_reg11 = 0x5b COMMA ++ DW_OP_reg12 = 0x5c COMMA ++ DW_OP_reg13 = 0x5d COMMA ++ DW_OP_reg14 = 0x5e COMMA ++ DW_OP_reg15 = 0x5f COMMA ++ DW_OP_reg16 = 0x60 COMMA ++ DW_OP_reg17 = 0x61 COMMA ++ DW_OP_reg18 = 0x62 COMMA ++ DW_OP_reg19 = 0x63 COMMA ++ DW_OP_reg20 = 0x64 COMMA ++ DW_OP_reg21 = 0x65 COMMA ++ DW_OP_reg22 = 0x66 COMMA ++ DW_OP_reg23 = 0x67 COMMA ++ DW_OP_reg24 = 0x68 COMMA ++ DW_OP_reg25 = 0x69 COMMA ++ DW_OP_reg26 = 0x6a COMMA ++ DW_OP_reg27 = 0x6b COMMA ++ DW_OP_reg28 = 0x6c COMMA ++ DW_OP_reg29 = 0x6d COMMA ++ DW_OP_reg30 = 0x6e COMMA ++ DW_OP_reg31 = 0x6f COMMA ++ DW_OP_breg0 = 0x70 COMMA ++ DW_OP_breg1 = 0x71 COMMA ++ DW_OP_breg2 = 0x72 COMMA ++ DW_OP_breg3 = 0x73 COMMA ++ DW_OP_breg4 = 0x74 COMMA ++ DW_OP_breg5 = 0x75 COMMA ++ DW_OP_breg6 = 0x76 COMMA ++ DW_OP_breg7 = 0x77 COMMA ++ DW_OP_breg8 = 0x78 COMMA ++ DW_OP_breg9 = 0x79 COMMA ++ DW_OP_breg10 = 0x7a COMMA ++ DW_OP_breg11 = 0x7b COMMA ++ DW_OP_breg12 = 0x7c COMMA ++ DW_OP_breg13 = 0x7d COMMA ++ DW_OP_breg14 = 0x7e COMMA ++ DW_OP_breg15 = 0x7f COMMA ++ DW_OP_breg16 = 0x80 COMMA ++ 
DW_OP_breg17 = 0x81 COMMA ++ DW_OP_breg18 = 0x82 COMMA ++ DW_OP_breg19 = 0x83 COMMA ++ DW_OP_breg20 = 0x84 COMMA ++ DW_OP_breg21 = 0x85 COMMA ++ DW_OP_breg22 = 0x86 COMMA ++ DW_OP_breg23 = 0x87 COMMA ++ DW_OP_breg24 = 0x88 COMMA ++ DW_OP_breg25 = 0x89 COMMA ++ DW_OP_breg26 = 0x8a COMMA ++ DW_OP_breg27 = 0x8b COMMA ++ DW_OP_breg28 = 0x8c COMMA ++ DW_OP_breg29 = 0x8d COMMA ++ DW_OP_breg30 = 0x8e COMMA ++ DW_OP_breg31 = 0x8f COMMA ++ DW_OP_regx = 0x90 COMMA ++ DW_OP_fbreg = 0x91 COMMA ++ DW_OP_bregx = 0x92 COMMA ++ DW_OP_piece = 0x93 COMMA ++ DW_OP_deref_size = 0x94 COMMA ++ DW_OP_xderef_size = 0x95 COMMA ++ DW_OP_nop = 0x96 COMMA ++ /* DWARF 3 extensions. */ ++ DW_OP_push_object_address = 0x97 COMMA ++ DW_OP_call2 = 0x98 COMMA ++ DW_OP_call4 = 0x99 COMMA ++ DW_OP_call_ref = 0x9a COMMA ++ /* GNU extensions. */ ++ DW_OP_GNU_push_tls_address = 0xe0 ++IF_NOT_ASM(};) ++ ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ ++ ++/* Type encodings. */ ++ENUM(dwarf_type) ++ ++ DW_ATE_void = 0x0 COMMA ++ DW_ATE_address = 0x1 COMMA ++ DW_ATE_boolean = 0x2 COMMA ++ DW_ATE_complex_float = 0x3 COMMA ++ DW_ATE_float = 0x4 COMMA ++ DW_ATE_signed = 0x5 COMMA ++ DW_ATE_signed_char = 0x6 COMMA ++ DW_ATE_unsigned = 0x7 COMMA ++ DW_ATE_unsigned_char = 0x8 COMMA ++ /* DWARF 3. */ ++ DW_ATE_imaginary_float = 0x9 ++IF_NOT_ASM(};) ++ ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++ ++/* Array ordering names and codes. */ ++ENUM(dwarf_array_dim_ordering) ++ ++ DW_ORD_row_major = 0 COMMA ++ DW_ORD_col_major = 1 ++IF_NOT_ASM(};) ++ ++/* Access attribute. */ ++ENUM(dwarf_access_attribute) ++ ++ DW_ACCESS_public = 1 COMMA ++ DW_ACCESS_protected = 2 COMMA ++ DW_ACCESS_private = 3 ++IF_NOT_ASM(};) ++ ++/* Visibility. */ ++ENUM(dwarf_visibility_attribute) ++ ++ DW_VIS_local = 1 COMMA ++ DW_VIS_exported = 2 COMMA ++ DW_VIS_qualified = 3 ++IF_NOT_ASM(};) ++ ++/* Virtuality. */ ++ENUM(dwarf_virtuality_attribute) ++ ++ DW_VIRTUALITY_none = 0 COMMA ++ DW_VIRTUALITY_virtual = 1 COMMA ++ DW_VIRTUALITY_pure_virtual = 2 ++IF_NOT_ASM(};) ++ ++/* Case sensitivity. */ ++ENUM(dwarf_id_case) ++ ++ DW_ID_case_sensitive = 0 COMMA ++ DW_ID_up_case = 1 COMMA ++ DW_ID_down_case = 2 COMMA ++ DW_ID_case_insensitive = 3 ++IF_NOT_ASM(};) ++ ++/* Calling convention. */ ++ENUM(dwarf_calling_convention) ++ ++ DW_CC_normal = 0x1 COMMA ++ DW_CC_program = 0x2 COMMA ++ DW_CC_nocall = 0x3 ++IF_NOT_ASM(};) ++ ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++ ++/* Inline attribute. */ ++ENUM(dwarf_inline_attribute) ++ ++ DW_INL_not_inlined = 0 COMMA ++ DW_INL_inlined = 1 COMMA ++ DW_INL_declared_not_inlined = 2 COMMA ++ DW_INL_declared_inlined = 3 ++IF_NOT_ASM(};) ++ ++/* Discriminant lists. */ ++ENUM(dwarf_discrim_list) ++ ++ DW_DSC_label = 0 COMMA ++ DW_DSC_range = 1 ++IF_NOT_ASM(};) ++ ++/* Line number opcodes. */ ++ENUM(dwarf_line_number_ops) ++ ++ DW_LNS_extended_op = 0 COMMA ++ DW_LNS_copy = 1 COMMA ++ DW_LNS_advance_pc = 2 COMMA ++ DW_LNS_advance_line = 3 COMMA ++ DW_LNS_set_file = 4 COMMA ++ DW_LNS_set_column = 5 COMMA ++ DW_LNS_negate_stmt = 6 COMMA ++ DW_LNS_set_basic_block = 7 COMMA ++ DW_LNS_const_add_pc = 8 COMMA ++ DW_LNS_fixed_advance_pc = 9 COMMA ++ /* DWARF 3. */ ++ DW_LNS_set_prologue_end = 10 COMMA ++ DW_LNS_set_epilogue_begin = 11 COMMA ++ DW_LNS_set_isa = 12 ++IF_NOT_ASM(};) ++ ++/* Line number extended opcodes. 
*/ ++ENUM(dwarf_line_number_x_ops) ++ ++ DW_LNE_end_sequence = 1 COMMA ++ DW_LNE_set_address = 2 COMMA ++ DW_LNE_define_file = 3 ++IF_NOT_ASM(};) ++ ++/* Call frame information. */ ++ENUM(dwarf_call_frame_info) ++ ++ DW_CFA_advance_loc = 0x40 COMMA ++ DW_CFA_offset = 0x80 COMMA ++ DW_CFA_restore = 0xc0 COMMA ++ DW_CFA_nop = 0x00 COMMA ++ DW_CFA_set_loc = 0x01 COMMA ++ DW_CFA_advance_loc1 = 0x02 COMMA ++ DW_CFA_advance_loc2 = 0x03 COMMA ++ DW_CFA_advance_loc4 = 0x04 COMMA ++ DW_CFA_offset_extended = 0x05 COMMA ++ DW_CFA_restore_extended = 0x06 COMMA ++ DW_CFA_undefined = 0x07 COMMA ++ DW_CFA_same_value = 0x08 COMMA ++ DW_CFA_register = 0x09 COMMA ++ DW_CFA_remember_state = 0x0a COMMA ++ DW_CFA_restore_state = 0x0b COMMA ++ DW_CFA_def_cfa = 0x0c COMMA ++ DW_CFA_def_cfa_register = 0x0d COMMA ++ DW_CFA_def_cfa_offset = 0x0e COMMA ++ ++ /* DWARF 3. */ ++ DW_CFA_def_cfa_expression = 0x0f COMMA ++ DW_CFA_expression = 0x10 COMMA ++ DW_CFA_offset_extended_sf = 0x11 COMMA ++ DW_CFA_def_cfa_sf = 0x12 COMMA ++ DW_CFA_def_cfa_offset_sf = 0x13 COMMA ++ ++ /* SGI/MIPS specific. */ ++ DW_CFA_MIPS_advance_loc8 = 0x1d COMMA ++ ++ /* GNU extensions. */ ++ DW_CFA_GNU_window_save = 0x2d COMMA ++ DW_CFA_GNU_args_size = 0x2e COMMA ++ DW_CFA_GNU_negative_offset_extended = 0x2f ++IF_NOT_ASM(};) ++ ++#define DW_CIE_ID 0xffffffff ++#define DW_CIE_VERSION 1 ++ ++#define DW_CFA_extended 0 ++#define DW_CFA_lo_user 0x1c ++#define DW_CFA_hi_user 0x3f ++ ++#define DW_CHILDREN_no 0x00 ++#define DW_CHILDREN_yes 0x01 ++ ++#define DW_ADDR_none 0 ++ ++/* Source language names and codes. */ ++ENUM(dwarf_source_language) ++ ++ DW_LANG_C89 = 0x0001 COMMA ++ DW_LANG_C = 0x0002 COMMA ++ DW_LANG_Ada83 = 0x0003 COMMA ++ DW_LANG_C_plus_plus = 0x0004 COMMA ++ DW_LANG_Cobol74 = 0x0005 COMMA ++ DW_LANG_Cobol85 = 0x0006 COMMA ++ DW_LANG_Fortran77 = 0x0007 COMMA ++ DW_LANG_Fortran90 = 0x0008 COMMA ++ DW_LANG_Pascal83 = 0x0009 COMMA ++ DW_LANG_Modula2 = 0x000a COMMA ++ DW_LANG_Java = 0x000b COMMA ++ /* DWARF 3. */ ++ DW_LANG_C99 = 0x000c COMMA ++ DW_LANG_Ada95 = 0x000d COMMA ++ DW_LANG_Fortran95 = 0x000e COMMA ++ /* MIPS. */ ++ DW_LANG_Mips_Assembler = 0x8001 COMMA ++ /* UPC. */ ++ DW_LANG_Upc = 0x8765 ++IF_NOT_ASM(};) ++ ++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ ++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */ ++ ++/* Names and codes for macro information. */ ++ENUM(dwarf_macinfo_record_type) ++ ++ DW_MACINFO_define = 1 COMMA ++ DW_MACINFO_undef = 2 COMMA ++ DW_MACINFO_start_file = 3 COMMA ++ DW_MACINFO_end_file = 4 COMMA ++ DW_MACINFO_vendor_ext = 255 ++IF_NOT_ASM(};) ++ ++/* @@@ For use with GNU frame unwind information.
*/ ++ ++#define DW_EH_PE_absptr 0x00 ++#define DW_EH_PE_omit 0xff ++ ++#define DW_EH_PE_uleb128 0x01 ++#define DW_EH_PE_udata2 0x02 ++#define DW_EH_PE_udata4 0x03 ++#define DW_EH_PE_udata8 0x04 ++#define DW_EH_PE_sleb128 0x09 ++#define DW_EH_PE_sdata2 0x0A ++#define DW_EH_PE_sdata4 0x0B ++#define DW_EH_PE_sdata8 0x0C ++#define DW_EH_PE_signed 0x08 ++ ++#define DW_EH_PE_pcrel 0x10 ++#define DW_EH_PE_textrel 0x20 ++#define DW_EH_PE_datarel 0x30 ++#define DW_EH_PE_funcrel 0x40 ++#define DW_EH_PE_aligned 0x50 ++ ++#define DW_EH_PE_indirect 0x80 ++ ++#endif /* _ELF_DWARF2_H */ +Index: linux-2.6.10/include/linux/spinlock.h +=================================================================== +--- linux-2.6.10.orig/include/linux/spinlock.h 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/include/linux/spinlock.h 2005-04-05 12:48:05.365601384 +0800 +@@ -15,6 +15,12 @@ + + #include <asm/processor.h> /* for cpu relax */ + #include <asm/system.h> ++#ifdef CONFIG_KGDB ++#include ++#define SET_WHO(x, him) (x)->who = him; ++#else ++#define SET_WHO(x, him) ++#endif + + /* + * Must define these before including other files, inline functions need them +@@ -94,6 +100,9 @@ + const char *module; + char *owner; + int oline; ++#ifdef CONFIG_KGDB ++ struct task_struct *who; ++#endif + } spinlock_t; + #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} + +@@ -105,6 +114,7 @@ + (x)->module = __FILE__; \ + (x)->owner = NULL; \ + (x)->oline = 0; \ ++ SET_WHO(x, NULL) \ + } while (0) + + #define CHECK_LOCK(x) \ +@@ -129,6 +139,7 @@ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + } while (0) + + /* without debugging, spin_is_locked on UP always says +@@ -159,6 +170,7 @@ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + 1; \ + }) + +Index: linux-2.6.10/include/linux/config.h +=================================================================== +--- linux-2.6.10.orig/include/linux/config.h 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/include/linux/config.h 2005-04-05 12:48:42.303985896 +0800 +@@ -2,6 +2,10 @@ + #define _LINUX_CONFIG_H + + #include <linux/autoconf.h> ++#if defined(__i386__) && !defined(IN_BOOTLOADER) && defined(CONFIG_KGDB) ++#include <asm/kgdb.h> ++#endif ++ + #if !defined (__KERNEL__) && !defined(__KERNGLUE__) + #error including kernel header in userspace; use the glibc headers instead! + #endif +Index: linux-2.6.10/include/linux/dwarf2-lang.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dwarf2-lang.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dwarf2-lang.h 2005-04-05 12:48:05.370600624 +0800 +@@ -0,0 +1,132 @@ ++#ifndef DWARF2_LANG ++#define DWARF2_LANG ++#include <linux/dwarf2.h> ++ ++/* ++ * This is free software; you can redistribute it and/or modify it under ++ * the terms of the GNU General Public License as published by the Free ++ * Software Foundation; either version 2, or (at your option) any later ++ * version. ++ */ ++/* ++ * This file defines macros that allow generation of DWARF debug records ++ * for asm files. This file is platform independent. Register numbers ++ * (which are about the only thing that is platform dependent) are to be ++ * supplied by a platform defined file. ++ */ ++#define DWARF_preamble() .section .debug_frame,"",@progbits ++/* ++ * This macro starts a debug frame section. The debug_frame describes ++ * where to find the registers that the enclosing function saved on ++ * entry.
++ * ++ * ORD is used by the label generator and should be the same as what is ++ * passed to CFI_postamble. ++ * ++ * pc, pc register gdb ordinal. ++ * ++ * code_align this is the factor used to define locations or regions ++ * where the given definitions apply. If you use labels to define these ++ * this should be 1. ++ * ++ * data_align this is the factor used to define register offsets. If ++ * you use struct offset, this should be the size of the register in ++ * bytes or the negative of that. This is how it is used: you will ++ * define a register as the reference register, say the stack pointer, ++ * then you will say where a register is located relative to this ++ * reference register's value, say 40 for register 3 (the gdb register ++ * number). The <40> will be multiplied by <data_align> to define the ++ * byte offset of the given register (3, in this example). So if your ++ * <40> is the byte offset and the reference register points at the ++ * beginning, you would want 1 for the data_align. If <40> was the 40th ++ * 4-byte element in that structure you would want 4. And if your ++ * reference register points at the end of the structure you would want ++ * a negative data_align value (and you would have to do other math as ++ * well). ++ */ ++ ++#define CFI_preamble(ORD, pc, code_align, data_align) \ ++.section .debug_frame,"",@progbits ; \ ++frame/**/_/**/ORD: \ ++ .long end/**/_/**/ORD-start/**/_/**/ORD; \ ++start/**/_/**/ORD: \ ++ .long DW_CIE_ID; \ ++ .byte DW_CIE_VERSION; \ ++ .byte 0 ; \ ++ .uleb128 code_align; \ ++ .sleb128 data_align; \ ++ .byte pc; ++ ++/* ++ * After the above macro and prior to the CFI_postamble, you need to ++ * define the initial state. This starts with defining the reference ++ * register and, usually, the pc. Here are some helper macros: ++ */ ++ ++#define CFA_define_reference(reg, offset) \ ++ .byte DW_CFA_def_cfa; \ ++ .uleb128 reg; \ ++ .uleb128 (offset); ++ ++#define CFA_define_offset(reg, offset) \ ++ .byte (DW_CFA_offset + reg); \ ++ .uleb128 (offset); ++ ++#define CFI_postamble(ORD) \ ++ .align 4; \ ++end/**/_/**/ORD: ++/* ++ * So now your code pushes stuff on the stack, you need a new location ++ * and the rules for what to do. This starts a running description of ++ * the call frame. You need to describe what changes with respect to ++ * the call registers as the location of the pc moves through the code. ++ * The following builds an FDE (frame descriptor entry?). Like the ++ * above, it has a preamble and a postamble. It also is tied to the CFI ++ * above. ++ * The first entry after the preamble must be the location in the code ++ * that the call frame is being described for. ++ */ ++#define FDE_preamble(ORD, fde_no, initial_address, length) \ ++ .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ ++FDE_start/**/_/**/fde_no: \ ++ .long frame/**/_/**/ORD; \ ++ .long initial_address; \ ++ .long length; ++ ++#define FDE_postamble(fde_no) \ ++ .align 4; \ ++FDE_end/**/_/**/fde_no: ++/* ++ * That done, you can now add registers, subtract registers, move the ++ * reference and even change the reference. You can also define a new ++ * area of code the info applies to. For discontinuous bits you should ++ * start a new FDE. You may have as many as you like. ++ */ ++ ++/* ++ * To advance the address by <bytes> ++ */ ++ ++#define FDE_advance(bytes) \ ++ .byte DW_CFA_advance_loc4; \ ++ .long bytes ++ ++ ++ ++/* ++ * With the above you can define all the register locations. But ++ * suppose the reference register moves... Takes the new offset NOT an ++ * increment.
This is how esp is tracked if it is not saved. ++ */ ++ ++#define CFA_define_cfa_offset(offset) \ ++ .byte DW_CFA_def_cfa_offset; \ ++ .uleb128 (offset); ++/* ++ * Or suppose you want to use a different reference register... ++ */ ++#define CFA_define_cfa_register(reg) \ ++ .byte DW_CFA_def_cfa_register; \ ++ .uleb128 reg; ++ ++#endif +Index: linux-2.6.10/kernel/pid.c +=================================================================== +--- linux-2.6.10.orig/kernel/pid.c 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/kernel/pid.c 2005-04-05 12:48:05.363601688 +0800 +@@ -252,6 +252,9 @@ + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or + * more. + */ ++#ifdef CONFIG_KGDB ++int kgdb_pid_init_done; /* so we don't call prior to... */ ++#endif + void __init pidhash_init(void) + { + int i, j, pidhash_size; +@@ -273,6 +276,9 @@ + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } ++#ifdef CONFIG_KGDB ++ kgdb_pid_init_done++; ++#endif + } + + void __init pidmap_init(void) +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-04-05 12:48:05.362601840 +0800 +@@ -2991,6 +2991,13 @@ + + EXPORT_SYMBOL(set_user_nice); + ++#ifdef CONFIG_KGDB ++struct task_struct *kgdb_get_idle(int this_cpu) ++{ ++ return cpu_rq(this_cpu)->idle; ++} ++#endif ++ + #ifdef __ARCH_WANT_SYS_NICE + + /* +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit 2005-04-05 12:48:05.263616888 +0800 +@@ -0,0 +1,14 @@ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 ++target remote /dev/ttyS0 ++define si ++stepi ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip ++end ++define ni ++nexti ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip +Index: linux-2.6.10/Documentation/i386/kgdb/kgdb.txt +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/kgdb.txt 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/kgdb.txt 2005-04-05 12:48:05.271615672 +0800 +@@ -0,0 +1,775 @@ ++Last edit: <20030806.1637.12> ++This file has information specific to the i386 kgdb option. Other ++platforms with the kgdb option may behave in a similar fashion. ++ ++New features: ++============ ++20030806.1557.37 ++This version was made against the 2.6.0-test2 kernel. We have made the ++following changes: ++ ++- The getthread() code in the stub calls find_task_by_pid(). It fails ++ if we are early in the bring up such that the pid arrays have yet to ++ be allocated. We have added a line to kernel/pid.c to make ++ "kgdb_pid_init_done" true once the arrays are allocated. This way the ++ getthread() code knows not to call it. This is only used by the thread ++ debugging stuff and threads will not yet exist at this point in the ++ boot. ++ ++- For some reason, gdb was not asking for a new thread list when the ++ "info thread" command was given. We changed to the newer version of ++ the thread info command and gdb now seems to ask when needed.
Result, ++ we now get all threads in the thread list. ++ ++- We now respond to the ThreadExtraInfo request from gdb with the thread ++ name from task_struct.comm. This then appears in the thread list. ++ Thoughts on additional options for this are welcome. Things such as ++ "has BKL" and "Preempted" come to mind. I think we could have a flag ++ word that could enable different bits of info here. ++ ++- We now honor, sort of, the C and S commands. These are continue and ++ single step after delivering a signal. We ignore the signal and do the ++ requested action. This only happens when we told gdb that a signal ++ was the reason for entry, which is only done on memory faults. The ++ result is that you can now continue into the Oops. ++ ++- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, ++ but it is more exact on what language to use. ++ ++- We added two dwarf2 include files and a bit of code at the end of ++ entry.S. This does not yet work, so it is disabled. Still we want to ++ keep track of the code and "maybe" someone out there can fix it. ++ ++- Randy Dunlap sent some fix ups for this file which are now merged. ++ ++- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a ++ compiler warning if CONFIG_KGDB is off (now who would do that :). ++ ++- Andrew Morton sent a fix for the serial driver which is now merged. ++ ++- Andrew also sent a change to the stub around the cpu management code ++ which is also merged. ++ ++- Andrew also sent a patch to make "f" as well as "g" work as SysRq ++ commands to enter kgdb, merged. ++ ++- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a ++ "who" field to the spinlock data struct. This is filled with ++ "current" whenever the spinlock succeeds. Useful if you want to know ++ who has the lock. ++ ++- And last, but not least, we fixed the "get_cu" macro to properly get ++ the current value of "current". ++ ++New features: ++============ ++20030505.1827.27 ++We are starting to align with the sourceforge version, at least in ++commands. To this end, the boot command string to start kgdb at ++boot time has been changed from "kgdb" to "gdb". ++ ++Andrew Morton sent a couple of patches which are now included as follows: ++1.) We now return a flag to the interrupt handler. ++2.) We no longer use smp_num_cpus (a conflict with the lock meter). ++3.) And, from William Lee Irwin III, code to make ++ sure high-mem is set up before we attempt to register our interrupt ++ handler. ++We now include asm/kgdb.h from config.h so you will most likely never ++have to include it. It also 'NULLS' the kgdb macros you might have in ++your code when CONFIG_KGDB is not defined. This allows you to just ++turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. ++This include is conditioned on the machine being an x86 so as to not ++mess with other archs. ++ ++20020801.1129.03 ++This is currently the version for the 2.4.18 (and beyond?) kernel. ++ ++We have several new "features" beginning with this version: ++ ++1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more ++ waiting and it will pull that guy out of an IRQ off spin lock :) ++ ++2.) We doctored up the code that tells where a task is waiting and ++ included it so that the "info thread" command will show a bit more ++ than "schedule()". Try it... ++ ++3.) Added the ability to call a function from gdb. All the standard gdb ++ issues apply, i.e. if you hit a breakpoint in the function, you are ++ not allowed to call another (gdb limitation, not kgdb).
To help ++ this capability we added a memory allocation function. Gdb does not ++ return this memory (it is used for strings that you pass to that function ++ you are calling from gdb) so we fixed up a way to allow you to ++ manually return the memory (see below). ++ ++4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the ++ interrupt flag to now also include the preemption count and the ++ "in_interrupt" info. The flag is now called "with_pif" to indicate ++ the order, preempt_count, in_interrupt, flag. The preempt_count is ++ shifted left by 4 bits so you can read the count in hex by dropping ++ the low order digit. In_interrupt is in bit 1, and the flag is in ++ bit 0. ++ ++5.) The command: "p kgdb_info" is now expanded and prints something ++ like: ++(gdb) p kgdb_info ++$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, ++ errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, ++ cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, ++ regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} ++ ++ Things to note here: a.) used_malloc is the amount of memory that ++ has been malloc'ed to do calls from gdb. You can reclaim this ++ memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) ++ cpus_waiting is now "sized" by the number of CPUs you enter at ++ configure time in the kgdb configure section. This is NOT used ++ anywhere else in the system, but it is "nice" here. c.) The task's ++ "pid" is now in the structure. This is the pid you will need to use ++ to decode to the thread id to get gdb to look at that thread. ++ Remember that the "info thread" command prints a list of threads ++ wherein it numbers each thread with its reference number followed ++ by the thread's pid. Note that the per-CPU idle threads actually ++ have pids of 0 (yes, there is more than one pid 0 in an SMP system). ++ To avoid confusion, kgdb numbers these threads with numbers beyond ++ the MAX_PID. That is why you see 32768 and above. ++ ++6.) A subtle change, we now provide the complete register set for tasks ++ that are active on the other CPUs. This allows better trace back on ++ those tasks. ++ ++ And, let's mention what we could not fix. Back-trace from all but the ++ thread that we trapped will, most likely, have a bogus entry in it. ++ The problem is that gdb does not recognize the entry code for ++ functions that use "current" near (at all?) the entry. The compiler ++ is putting the "current" decode as the first two instructions of the ++ function where gdb expects to find %ebp changing code. Back trace ++ also has trouble with interrupt frames. I am talking with Daniel ++ Jacobowitz about some way to fix this, but don't hold your breath. ++ ++20011220.0050.35 ++Major enhancement with this version is the ability to hold one or more ++CPUs in an SMP system while allowing the others to continue. Also, by ++default only the current CPU is enabled on single-step commands (please ++note that gdb issues single-step commands at times other than when you ++use the si command). ++ ++Another change is to collect some useful information in ++a global structure called "kgdb_info". You should be able to just: ++ ++p kgdb_info ++ ++although I have seen cases where the first time this is done gdb just ++prints the first member but prints the whole structure if you then enter ++CR (carriage return or enter). 
This also works: ++ ++p *&kgdb_info ++ ++Here is a sample: ++(gdb) p kgdb_info ++$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, ++ vector = 3, print_debug_info = 0} ++ ++"Called_from" is the return address from the current entry into kgdb. ++Sometimes it is useful to know why you are in kgdb, for example, was ++it an NMI or a real breakpoint? The simple way to interrogate this ++return address is: ++ ++l *0xc010732c ++ ++which will print the surrounding few lines of source code. ++ ++"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the ++kgdb_ts entries). ++ ++"errcode" and "vector" are other entry parameters which may be helpful on ++some traps. ++ ++"print_debug_info" is the internal debugging kgdb print enable flag. Yes, ++you can modify it. ++ ++In SMP systems kgdb_info also includes the "cpus_waiting" structure and ++"hold_on_sstep": ++ ++(gdb) p kgdb_info ++$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, ++ vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ ++ task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, ++ regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}}} ++ ++"Cpus_waiting" has an entry for each CPU other than the current one that ++has been stopped. Each entry contains the task_struct address for that ++CPU, the address of the regs for that task and a hold flag. All these ++have the proper typing so that, for example: ++ ++p *kgdb_info.cpus_waiting[1].regs ++ ++will print the registers for CPU 1. ++ ++"Hold_on_sstep" is a new feature with this version and comes up set or ++true. What this means is that whenever kgdb is asked to single-step all ++other CPUs are held (i.e. not allowed to execute). The flag applies to ++all but the current CPU and, again, can be changed: ++ ++p kgdb_info.hold_on_sstep=0 ++ ++restores the old behavior of letting all CPUs run during single-stepping. ++ ++Likewise, each CPU has a "hold" flag, which if set, locks that CPU out ++of execution. Note that this has some risk in cases where the CPUs need ++to communicate with each other. If kgdb finds no CPU available on exit, ++it will push a message thru gdb and stay in kgdb. Note that it is legal ++to hold the current CPU as long as at least one CPU can execute. ++ ++20010621.1117.09 ++This version implements an event queue. Events are signaled by calling ++a function in the kgdb stub and may be examined from gdb. See EVENTS ++below for details. This version also tightens up the interrupt and SMP ++handling to not allow interrupts on the way to kgdb from a breakpoint ++trap. It is fine to allow these interrupts for user code, but not ++system debugging. ++ ++Version ++======= ++ ++This version of the kgdb package was developed and tested on ++kernel version 2.4.16. It will not install on any earlier kernels. ++It is possible that it will continue to work on later versions ++of 2.4 and then versions of 2.5 (I hope). ++ ++ ++Debugging Setup ++=============== ++ ++Designate one machine as the "development" machine. This is the ++machine on which you run your compiles and which has your source ++code for the kernel. Designate a second machine as the "target" ++machine. This is the machine that will run your experimental ++kernel.
++ ++The two machines will be connected together via a serial line out ++one or the other of the COM ports of the PC. You will need the ++appropriate modem eliminator (null modem) cable(s) for this. ++ ++Decide on which tty port you want the machines to communicate, then ++connect them up back-to-back using the null modem cable. COM1 is ++/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection ++with the two machines prior to trying to debug a kernel. Once you ++have it working, on the TARGET machine, enter: ++ ++setserial /dev/ttyS0 (or whatever tty you are using) ++ ++and record the port address and the IRQ number. ++ ++On the DEVELOPMENT machine you need to apply the patch for the kgdb ++hooks. You have probably already done that if you are reading this ++file. ++ ++On your DEVELOPMENT machine, go to your kernel source directory and do ++"make Xconfig" where X is one of "x", "menu", or "". If you are ++configuring in the standard serial driver, it must not be a module. ++Either yes or no is ok, but making the serial driver a module means it ++will initialize after kgdb has set up the UART interrupt code and may ++cause a failure of the control-C option discussed below. The configure ++question for the serial driver is under the "Character devices" heading ++and is: ++ ++"Standard/generic (8250/16550 and compatible UARTs) serial support" ++ ++Go down to the kernel debugging menu item and open it up. Enable the ++kernel kgdb stub code by selecting that item. You can also choose to ++turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler ++to put more debug info (like local symbols) in the object file. On the ++i386 -g and -ggdb are the same so this option just reduces to "-O1". The ++-O1 reduces the optimization level. This may be helpful in some cases, ++be aware, however, that this may also mask the problem you are looking ++for. ++ ++The baud rate. Default is 115200. Whatever you choose, be sure that ++the host machine is set to the same speed. I recommend the default. ++ ++The port. This is the I/O address of the serial UART that you should ++have gotten using setserial as described above. The standard COM1 port ++(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ ++3. ++ ++The port IRQ (see above). ++ ++Stack overflow test. This option makes a minor change in the trap, ++system call and interrupt code to detect stack overflow and transfer ++control to kgdb if it happens. (Some platforms have this in the ++baseline code, but the i386 does not.) ++ ++You can also configure the system to recognize the boot option ++"console=kgdb" which if given will cause all console output during ++booting to be put thru gdb as well as other consoles. This option ++requires that gdb and kgdb be connected prior to sending console output ++so, if they are not, a breakpoint is executed to force the connection. ++This will happen before any kernel output (it is going thru gdb, right), ++and will stall the boot until the connection is made. ++ ++You can also configure in a patch to SysRq to enable the kGdb SysRq. ++This request generates a breakpoint. Since the serial port IRQ line is ++set up after any serial drivers, it is possible that this command will ++work when the control-C will not. ++ ++Save and exit the Xconfig program. Then do "make clean", "make dep" ++and "make bzImage" (or whatever target you want to make). This gets the ++kernel compiled with the "-g" option set -- necessary for debugging.
++ ++You have just built the kernel on your DEVELOPMENT machine that you ++intend to run on your TARGET machine. ++ ++To install this new kernel, use the following installation procedure. ++Remember, you are on the DEVELOPMENT machine patching the kernel source ++for the kernel that you intend to run on the TARGET machine. ++ ++Copy this kernel to your target machine using your usual procedures. I ++usually arrange to copy development: ++/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine ++via a LAN based NFS access. That is, I run the cp command on the target ++and copy from the development machine via the LAN. Run Lilo (see "man ++lilo" for details on how to set this up) on the new kernel on the target ++machine so that it will boot! Then boot the kernel on the target ++machine. ++ ++On the DEVELOPMENT machine, create a file called .gdbinit in the ++directory /usr/src/linux. An example .gdbinit file looks like this: ++ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 (or whatever speed you have chosen) ++target remote /dev/ttyS0 ++ ++ ++Change the "echo" and "target" definition so that it specifies the tty ++port that you intend to use. Change the "remotebaud" definition to ++match the data rate that you are going to use for the com line. ++ ++You are now ready to try it out. ++ ++Boot your target machine with "kgdb" in the boot command i.e. something ++like: ++ ++lilo> test kgdb ++ ++or if you also want console output thru gdb: ++ ++lilo> test kgdb console=kgdb ++ ++You should see the lilo message saying it has loaded the kernel and then ++all output stops. The kgdb stub is trying to connect with gdb. Start ++gdb something like this: ++ ++ ++On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". ++When gdb gets the symbols loaded it will read your .gdbinit file and, if ++everything is working correctly, you should see gdb print out a few ++lines indicating that a breakpoint has been taken. It will actually ++show a line of code in the target kernel inside the kgdb activation ++code. ++ ++The gdb interaction should look something like this: ++ ++ linux-dev:/usr/src/linux# gdb vmlinux ++ GDB is free software and you are welcome to distribute copies of it ++ under certain conditions; type "show copying" to see the conditions. ++ There is absolutely no warranty for GDB; type "show warranty" for details. ++ GDB 4.15.1 (i486-slackware-linux), ++ Copyright 1995 Free Software Foundation, Inc... ++ breakpoint () at i386-stub.c:750 ++ 750 } ++ (gdb) ++ ++You can now use whatever gdb commands you like to set breakpoints. ++Enter "continue" to start your target machine executing again. At this ++point the target system will run at full speed until it encounters ++your breakpoint or gets a segment violation in the kernel, or whatever. ++ ++If you have the kgdb console enabled when you continue, gdb will print ++out all the console messages. ++ ++The above example caused a breakpoint relatively early in the boot ++process. For the i386 kgdb it is possible to code a break instruction ++as the first C-language point in init/main.c, i.e. as the first instruction ++in start_kernel(). This could be done as follows: ++ ++#include <asm/kgdb.h> ++ breakpoint(); ++ ++This breakpoint() is really a function that sets up the breakpoint and ++single-step hardware trap cells and then executes a breakpoint. Any ++early hard coded breakpoint will need to use this function.
Once the ++trap cells are set up they need not be set again, but doing it again ++does not hurt anything, so you don't need to be concerned about which ++breakpoint is hit first. Once the trap cells are set up (and the kernel ++sets them up in due course even if breakpoint() is never called) the ++macro: ++ ++BREAKPOINT; ++ ++will generate an inline breakpoint. This may be more useful as it stops ++the processor at the instruction instead of in a function a step removed ++from the location of interest. In either case must be ++included to define both breakpoint() and BREAKPOINT. ++ ++Triggering kgdbstub at other times ++================================== ++ ++Often you don't need to enter the debugger until much later in the boot ++or even after the machine has been running for some time. Once the ++kernel is booted and interrupts are on, you can force the system to ++enter the debugger by sending a control-C to the debug port. This is ++what the first line of the recommended .gdbinit file does. This allows ++you to start gdb any time after the system is up as well as when the ++system is already at a breakpoint. (In the case where the system is ++already at a breakpoint the control-C is not needed, however, it will ++be ignored by the target so no harm is done. Also note the the echo ++command assumes that the port speed is already set. This will be true ++once gdb has connected, but it is best to set the port speed before you ++run gdb.) ++ ++Another simple way to do this is to put the following file in you ~/bin ++directory: ++ ++#!/bin/bash ++echo -e "\003" > /dev/ttyS0 ++ ++Here, the ttyS0 should be replaced with what ever port you are using. ++The "\003" is control-C. Once you are connected with gdb, you can enter ++control-C at the command prompt. ++ ++An alternative way to get control to the debugger is to enable the kGdb ++SysRq command. Then you would enter Alt-SysRq-g (all three keys at the ++same time, but push them down in the order given). To refresh your ++memory of the available SysRq commands try Alt-SysRq-=. Actually any ++undefined command could replace the "=", but I like to KNOW that what I ++am pushing will never be defined. ++ ++Debugging hints ++=============== ++ ++You can break into the target machine at any time from the development ++machine by typing ^C (see above paragraph). If the target machine has ++interrupts enabled this will stop it in the kernel and enter the ++debugger. ++ ++There is unfortunately no way of breaking into the kernel if it is ++in a loop with interrupts disabled, so if this happens to you then ++you need to place exploratory breakpoints or printk's into the kernel ++to find out where it is looping. The exploratory breakpoints can be ++entered either thru gdb or hard coded into the source. This is very ++handy if you do something like: ++ ++if () BREAKPOINT; ++ ++ ++There is a copy of an e-mail in the Documentation/i386/kgdb/ directory ++(debug-nmi.txt) which describes how to create an NMI on an ISA bus ++machine using a paper clip. I have a sophisticated version of this made ++by wiring a push button switch into a PC104/ISA bus adapter card. The ++adapter card nicely furnishes wire wrap pins for all the ISA bus ++signals. ++ ++When you are done debugging the kernel on the target machine it is a ++good idea to leave it in a running state. This makes reboots faster, ++bypassing the fsck. So do a gdb "continue" as the last gdb command if ++this is possible. 
To terminate gdb itself on the development machine ++and leave the target machine running, first clear all breakpoints and ++continue, then type ^Z to suspend gdb and then kill it with "kill %1" or ++something similar. ++ ++If gdbstub Does Not Work ++======================== ++ ++If it doesn't work, you will have to troubleshoot it. Do the easy ++things first like double checking your cabling and data rates. You ++might try some non-kernel based programs to see if the back-to-back ++connection works properly. Just something simple like cat /etc/hosts ++>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you ++if you can send data from one machine to the other. Make sure it works ++in both directions. There is no point in tearing out your hair in the ++kernel if the line doesn't work. ++ ++All of the real action takes place in the file ++/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target ++machine that interacts with gdb on the development machine. In gdb you can ++turn on a debug switch with the following command: ++ ++ set remotedebug ++ ++This will print out the protocol messages that gdb is exchanging with ++the target machine. ++ ++Another place to look is /usr/src/linux/arch/i386/lib/kgdb_serial.c. This is ++the code that talks to the serial port on the target side. There might ++be a problem there. In particular there is a section of this code that ++tests the UART which will tell you what UART you have if you define ++"PRNT" (just remove "_off" from the #define PRNT_off). To view this ++report you will need to boot the system without any breakpoints. This ++allows the kernel to run to the point where it calls kgdb to set up ++interrupts. At this time kgdb will test the UART and print out the type ++it finds. (You need to wait so that the printks are actually being ++printed. Early in the boot they are cached, waiting for the console to ++be enabled. Also, if kgdb is entered thru a breakpoint it is possible ++to cause a deadlock by calling printk when the console is locked. The ++stub thus avoids doing printks from breakpoints, especially in the ++serial code.) At this time, if the UART fails to do the expected thing, ++kgdb will print out (using printk) information on what failed. (These ++messages will be buried in all the other boot up messages. Look for ++lines that start with "gdb_hook_interrupt:". You may want to use dmesg ++once the system is up to view the log.) If this fails or if you still ++don't connect, review your answers for the port address. Use: ++ ++setserial /dev/ttyS0 ++ ++to get the current port and IRQ information. This command will also ++tell you what the system found for the UART type. The stub recognizes ++the following UART types: ++ ++16450, 16550, and 16550A ++ ++If you are really desperate you can use printk debugging in the ++kgdbstub code in the target kernel until you get it working. In particular, ++there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c ++named "remote_debug". Compile your kernel with this set to 1, rather ++than 0, and the debug stub will print out lots of stuff as it does ++what it does. Likewise there are debug printks in the kgdb_serial.c ++code that can be turned on with simple changes in the macro defines.
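++
++The pattern behind these switches is just a global flag gating printk
++calls.  A minimal sketch of the idea (the macro name dbprintk is made
++up for this illustration; kgdb_stub.c may spell it differently):
++
++int remote_debug = 0;	/* set to 1 for verbose stub tracing */
++
++#define dbprintk(args...)	do {		\
++	if (remote_debug)			\
++		printk(args);			\
++} while (0)
++
++/* typical use: dbprintk("kgdb: sent packet '%s'\n", buf); */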
++ ++ ++Debugging Loadable Modules ++========================== ++ ++This technique comes courtesy of Edouard Parmelan ++ ++ ++When you run gdb, enter the command ++ ++source gdbinit-modules ++ ++This will read in a file of gdb macros that was installed in your ++kernel source directory when kgdb was installed. This file implements ++the following commands: ++ ++mod-list ++ Lists the loaded modules in the form <module-address> <module-name> ++ ++mod-print-symbols <module-address> ++ Prints all the symbols in the indicated module. ++ ++mod-add-symbols <module-address> <object-file> ++ Loads the symbols from the object file and associates them ++ with the indicated module. ++ ++After you have loaded the module that you want to debug, use the command ++mod-list to find the <module-address> of your module. Then use that ++address in the mod-add-symbols command to load your module's symbols. ++From that point onward you can debug your module as if it were a part ++of the kernel. ++ ++The file gdbinit-modules also contains a command named mod-add-lis as ++an example of how to construct a command of your own to load your ++favorite module. The idea is to "can" the pathname of the module ++in the command so you don't have to type so much. ++ ++Threads ++======= ++ ++Each process in a target machine is seen as a gdb thread. gdb thread ++related commands (info threads, thread n) can be used. ++ ++ia-32 hardware breakpoints ++========================== ++ ++kgdb stub contains support for hardware breakpoints using debugging features ++of ia-32(x86) processors. These breakpoints do not need code modification. ++They use debugging registers. 4 hardware breakpoints are available in ia-32 ++processors. ++ ++Each hardware breakpoint can be of one of the following three types. ++ ++1. Execution breakpoint - An Execution breakpoint is triggered when code ++ at the breakpoint address is executed. ++ ++ As a limited number of hardware breakpoints is available, it is ++ advisable to use software breakpoints ( break command ) instead ++ of execution hardware breakpoints, unless modification of code ++ is to be avoided. ++ ++2. Write breakpoint - A write breakpoint is triggered when memory ++ location at the breakpoint address is written. ++ ++ A write breakpoint can be placed for data of variable length. Length of ++ a write breakpoint indicates length of the datatype to be ++ watched. Length is 1 for 1 byte data, 2 for 2 byte data, 3 for ++ 4 byte data. ++ ++3. Access breakpoint - An access breakpoint is triggered when memory ++ location at the breakpoint address is either read or written. ++ ++ Access breakpoints also have lengths similar to write breakpoints. ++ ++IO breakpoints in ia-32 are not supported. ++ ++Since gdb stub at present does not use the protocol used by gdb for hardware ++breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros ++for hardware breakpoints are described below. ++ ++hwebrk - Places an execution breakpoint ++ hwebrk breakpointno address ++hwwbrk - Places a write breakpoint ++ hwwbrk breakpointno length address ++hwabrk - Places an access breakpoint ++ hwabrk breakpointno length address ++hwrmbrk - Removes a breakpoint ++ hwrmbrk breakpointno ++exinfo - Tells whether a software or hardware breakpoint has occurred. ++ Prints the number of the hardware breakpoint if a hardware breakpoint has ++ occurred.
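++
++Under the hood, these macros program the ia-32 debug registers: DR0-DR3
++hold the breakpoint addresses and DR7 holds the enable, type and length
++bits.  The fragment below is only an illustration of that register
++layout, not the actual kgdb stub code -- set_hw_break() is a made-up
++name, and a real stub must also map the length codes 1-3 used above
++onto the raw DR7 LEN encodings shown here:
++
++/* n: slot 0-3; type: 0 = execute, 1 = write, 3 = read/write;
++   len: 0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes (raw DR7 encodings;
++   execute breakpoints must use len 0) */
++static void set_hw_break(int n, int type, int len, unsigned long addr)
++{
++	unsigned long dr7;
++
++	switch (n) {	/* DR0-DR3 hold the linear addresses */
++	case 0: asm volatile("movl %0,%%db0" : : "r" (addr)); break;
++	case 1: asm volatile("movl %0,%%db1" : : "r" (addr)); break;
++	case 2: asm volatile("movl %0,%%db2" : : "r" (addr)); break;
++	case 3: asm volatile("movl %0,%%db3" : : "r" (addr)); break;
++	}
++	asm volatile("movl %%db7,%0" : "=r" (dr7));
++	dr7 &= ~(0xfUL << (16 + 4 * n));	/* clear old type/len bits */
++	dr7 |= (unsigned long)((len << 2) | type) << (16 + 4 * n);
++	dr7 |= 2UL << (2 * n);			/* Gn bit: enable slot n */
++	asm volatile("movl %0,%%db7" : : "r" (dr7));
++}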
++ ++Arguments required by these commands are as follows: ++breakpointno - 0 to 3 ++length - 1 to 3 ++address - Memory location in hex digits ( without 0x ) e.g c015e9bc ++ ++SMP support ++=========== ++ ++When a breakpoint occurs or the user issues a break ( Ctrl + C ) to gdb ++client, all the processors are forced to enter the debugger. The current ++thread corresponds to the thread running on the processor where the ++breakpoint occurred. Threads running on other processor(s) appear ++similar to other non-running threads in the 'info threads' output. ++Within the kgdb stub there is a structure "cpus_waiting" in which kgdb ++records the values of "current" and "regs" for each CPU other than the ++one that hit the breakpoint. "current" is a pointer to the task ++structure for the task that CPU is running, while "regs" points to the ++saved registers for the task. This structure can be examined with the ++gdb "p" command. ++ ++ia-32 hardware debugging registers on all processors are set to the same ++values. Hence any hardware breakpoint may occur on any processor. ++ ++gdb troubleshooting ++=================== ++ ++1. gdb hangs ++Kill it. Restart gdb. Connect to the target machine. ++ ++2. gdb cannot connect to target machine (after killing a gdb and ++restarting another) If the target machine was not inside the debugger when ++you killed gdb, gdb cannot connect because the target machine won't ++respond. In this case echo "Ctrl+C" (ASCII 3) to the serial line. ++e.g. echo -e "\003" > /dev/ttyS1 ++This forces the target machine into the debugger, after which you ++can connect. ++ ++3. gdb cannot connect even after echoing Ctrl+C into serial line ++Try changing serial line settings min to 1 and time to 0 ++e.g. stty min 1 time 0 < /dev/ttyS1 ++Try echoing again. ++ ++Check serial line speed and set it to the correct value if required ++e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 ++ ++EVENTS ++====== ++ ++Ever want to know the order of things happening? Which CPU did what and ++when? How did the spinlock get the way it is? Then events are for ++you. Events are defined by calls to an event collection interface and ++saved for later examination. In this case, kgdb events are saved by a ++very fast bit of code in kgdb which is fully SMP and interrupt protected ++and they are examined by using gdb to display them. Kgdb keeps only ++the last N events, where N must be a power of two and is defined at ++configure time. ++ ++ ++Events are signaled to kgdb by calling: ++ ++kgdb_ts(data0,data1) ++ ++kgdb records each call in an array along with other info. ++Here is the array definition: ++ ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ int data0; ++ int data1; ++}; ++ ++For SMP machines the CPU is recorded, for all machines the TSC is ++recorded (gets a time stamp) as well as the line number and source file ++the call was made from. The address of the caller (from), the "if" (interrupt ++flag) and the two data items are also recorded. The macro kgdb_ts casts ++the types to int, so you can put any 32-bit values here. There is a ++configure option to select the number of events you want to keep. A ++nice number might be 128, but you can keep up to 1024 if you want. The ++number must be a power of two. An "andthen" macro library is provided ++for gdb to help you look at these events. It is also possible to define ++a different structure for the event storage and cast the data to this ++structure.
For example, the following structure is defined in kgdb: ++ ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++ ++If you use this for display, the data elements will be displayed as ++pointers to task_struct entries. You may want to define your own ++structure to use in casting. You should only change the last two items ++and you must keep the structure size the same. Kgdb will handle these ++as 32-bit ints, but within that constraint you can define a structure to ++cast to any 32-bit quantity. This need only be available to gdb and is ++only used for casting in the display code. ++ ++Final Items ++=========== ++ ++I picked up this code from Amit S. Kale and enhanced it. ++ ++If you make some really cool modification to this stuff, or if you ++fix a bug, please let me know. ++ ++George Anzinger ++ ++ ++Amit S. Kale ++ ++ ++(First kgdb by David Grothe ) ++ ++(modified by Tigran Aivazian ) ++ Putting gdbstub into the kernel config menu. ++ ++(modified by Scott Foehner ) ++ Hooks for entering gdbstub at boot time. ++ ++(modified by Amit S. Kale ) ++ Threads, ia-32 hw debugging, mp support, console support, ++ nmi watchdog handling. ++ ++(modified by George Anzinger ) ++ Extended threads to include the idle threads. ++ Enhancements to allow breakpoint() at first C code. ++ Use of module_init() and __setup() to automate the configure. ++ Enhanced the cpu "collection" code to work in early bring-up. ++ Added ability to call functions from gdb. ++ Print info thread stuff without going back to schedule(). ++ Now collect the "other" cpus with an IPI/NMI. +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 12:48:05.273615368 +0800 +@@ -0,0 +1,117 @@ ++ ++#Using ia-32 hardware breakpoints. ++# ++#4 hardware breakpoints are available in ia-32 processors. These breakpoints ++#do not need code modification. They are set using debug registers. ++# ++#Each hardware breakpoint can be of one of the ++#three types: execution, write, access. ++#1. An Execution breakpoint is triggered when code at the breakpoint address is ++#executed. ++#2. A write breakpoint ( aka watchpoints ) is triggered when memory location ++#at the breakpoint address is written. ++#3. An access breakpoint is triggered when memory location at the breakpoint ++#address is either read or written. ++# ++#As hardware breakpoints are available in limited number, use software ++#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. ++# ++#Length of an access or a write breakpoint defines length of the datatype to ++#be watched. Length is 1 for char, 2 for short, 3 for int. ++# ++#For placing execution, write and access breakpoints, use commands ++#hwebrk, hwwbrk, hwabrk ++#To remove a breakpoint use hwrmbrk command. ++# ++#These commands take the following types of arguments. For arguments associated ++#with each command, use help command. ++#1. breakpointno: 0 to 3 ++#2. length: 1 to 3 ++#3. address: Memory location in hex ( without 0x ) e.g c015e9bc ++# ++#Use the command exinfo to find which hardware breakpoint occurred.
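++#
++#Example (hypothetical session; c015e9bc is the sample address from above):
++#	hwwbrk 0 3 c015e9bc	- watch 4-byte writes at 0xc015e9bc
++#	exinfo			- after the trap, shows which breakpoint fired
++#	hwrmbrk 0		- free breakpoint slot 0 again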
++ ++#hwebrk breakpointno address ++define hwebrk ++ maintenance packet Y$arg0,0,0,$arg1 ++end ++document hwebrk ++ hwebrk <breakpointno> <address>
++ Places a hardware execution breakpoint ++ <breakpointno> = 0 - 3 ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwwbrk breakpointno length address ++define hwwbrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwwbrk ++ hwwbrk <breakpointno> <length> <address>
++ Places a hardware write breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwabrk breakpointno length address ++define hwabrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwabrk ++ hwabrk <breakpointno> <length> <address>
++ Places a hardware access breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwrmbrk breakpointno ++define hwrmbrk ++ maintenance packet y$arg0 ++end ++document hwrmbrk ++ hwrmbrk <breakpointno> ++ <breakpointno> = 0 - 3 ++ Removes a hardware breakpoint ++end ++ ++define reboot ++ maintenance packet r ++end ++#exinfo ++define exinfo ++ maintenance packet qE ++end ++document exinfo ++ exinfo ++ Gives information about a breakpoint. ++end ++define get_th ++ p $th=(struct thread_info *)((int)$esp & ~8191) ++end ++document get_th ++ get_th ++ Gets and prints the current thread_info pointer. Defines $th to be it. ++end ++define get_cu ++ p $cu=((struct thread_info *)((int)$esp & ~8191))->task ++end ++document get_cu ++ get_cu ++ Gets and prints the "current" value. Defines $cu to be it. ++end ++define int_off ++ set var $flags=$eflags ++ set $eflags=$eflags&~0x200 ++ end ++define int_on ++ set var $eflags|=$flags&0x200 ++ end ++document int_off ++ Saves the current interrupt state and clears the processor interrupt ++ flag. Use int_on to restore the saved flag. ++end ++document int_on ++ Restores the interrupt flag saved by int_off. ++end +Index: linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 12:48:05.260617344 +0800 +@@ -0,0 +1,71 @@ ++Sender: akale@veritas.com ++Date: Fri, 23 Jun 2000 19:26:35 +0530 ++From: "Amit S. Kale" ++Organization: Veritas Software (India) ++To: Dave Grothe , linux-kernel@vger.rutgers.edu ++CC: David Milburn , ++ "Edouard G. Parmelan" , ++ ezannoni@cygnus.com, Keith Owens ++Subject: Re: Module debugging using kgdb ++ ++Dave Grothe wrote: ++> ++> Amit: ++> ++> There is a 2.4.0 version of kgdb on our ftp site: ++> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb ++> and loadmodule.sh there. ++> ++> Have a look at the README file and see if I got it right. If not, send ++> me some corrections and I will update it. ++> ++> Does your version of gdb solve the global variable problem? ++ ++Yes. ++Thanks to Elena Zanoni, gdb (development version) can now correctly ++calculate addresses of dynamically loaded object files. I have not been ++following gdb development for some time and am not sure when the symbol ++address calculation fix is going to appear in a gdb stable version. ++ ++Elena, any idea when the fix will make it to a prebuilt gdb from a ++redhat release? ++ ++For the time being I have built a gdb development version. It can be ++used for module debugging with the loadmodule.sh script. ++ ++The problem with calculating module addresses with previous versions ++of gdb was as follows: ++gdb did not use the base address of a section while calculating the address of ++a symbol in the section in an object file loaded via 'add-symbol-file'. ++It used the address of the .text segment instead. Due to this, addresses of ++symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. ++ ++The above-mentioned fix allows gdb to use the base address of a segment while ++calculating the address of a symbol in it. It adds a parameter '-s' to the ++'add-symbol-file' command for specifying the base address of a segment. ++ ++The loadmodule.sh script works as follows. ++ ++1. Copy a module file to the target machine. ++2. Load the module on the target machine using insmod with the -m parameter.
++insmod produces a module load map which contains base addresses of all ++sections in the module and addresses of symbols in the module file. ++3. Find all sections and their base addresses in the module from ++the module map. ++4. Generate a script that loads the module file. The script uses ++'add-symbol-file' and specifies the address of the text segment followed by ++the addresses of all other segments in the module. ++ ++Here is an example gdb script produced by the loadmodule.sh script. ++ ++add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 ++-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 ++-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 ++ ++With this command gdb can calculate addresses of symbols in ANY segment ++in a module file. ++ ++Regards. ++-- ++Amit Kale ++Veritas Software ( http://www.veritas.com ) +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 12:48:05.262617040 +0800 +@@ -0,0 +1,146 @@ ++# ++# Useful GDB user-commands to debug Linux Kernel Modules with gdbstub. ++# ++# These don't work for Linux-2.0 or older. ++# ++# Author Edouard G. Parmelan ++# ++# ++# Fri Apr 30 20:33:29 CEST 1999 ++# First public release. ++# ++# Major cleanup after experimenting with the Linux-2.0 kernel without success. ++# Symbols of a module are not in the correct order, I can't explain ++# why :( ++# ++# Fri Mar 19 15:41:40 CET 1999 ++# Initial version. ++# ++# Thu Jan 6 16:29:03 CST 2000 ++# A little fixing by Dave Grothe ++# ++# Mon Jun 19 09:33:13 CDT 2000 ++# Alignment changes from Edouard Parmelan ++# ++# The basic idea is to find where insmod loads the module and inform ++# GDB to load the symbol table of the module with the GDB command ++# ``add-symbol-file <object-file> <address>
''.
++#
++# The Linux kernel holds the list of all loaded modules in module_list,
++# this list ends with &kernel_module (exactly with module->next == NULL,
++# but the last module is not a real module).
++#
++# Insmod allocates the struct module before the object file.  Since
++# Linux-2.1, this structure contains its size.  The real address of
++# the object file is then (char*)module + module->size_of_struct.
++#
++# You can use three user functions ``mod-list'', ``mod-print-symbols''
++# and ``mod-add-symbols''.
++#
++# mod-list list all loaded modules with the format:
++#     <module-address> <module-name>
++#
++# As soon as you have found the address of your module, you can
++# print its exported symbols (mod-print-symbols) or inform GDB to add
++# symbols from your module file (mod-add-symbols).
++#
++# The argument that you give to mod-print-symbols or mod-add-symbols
++# is the <module-address> from the mod-list command.
++#
++# When using the mod-add-symbols command you must also give the full
++# pathname of the module's object code file.
++#
++# The command mod-add-lis is an example of how to make this easier.
++# You can edit this macro to contain the path name of your own
++# favorite module and then use it as a shorthand to load it.  You
++# still need the module-address, however.
++#
++# The internal function ``mod-validate'' sets the GDB variable $mod
++# to a ``struct module*'' if the kernel knows the module, otherwise
++# $mod is set to NULL.  This ensures that symbols are not added for a
++# wrong address.
++#
++# Have a nice hacking day !
++#
++#
++define mod-list
++    set $mod = (struct module*)module_list
++    # the last module is the kernel, ignore it
++    while $mod != &kernel_module
++        printf "%p\t%s\n", (long)$mod, ($mod)->name
++        set $mod = $mod->next
++    end
++end
++document mod-list
++List all modules in the form: <module-address> <module-name>
++Use the <module-address> as the argument for the other
++mod-commands: mod-print-symbols, mod-add-symbols.
++end
++
++define mod-validate
++    set $mod = (struct module*)module_list
++    while ($mod != $arg0) && ($mod != &kernel_module)
++        set $mod = $mod->next
++    end
++    if $mod == &kernel_module
++        set $mod = 0
++        printf "%p is not a module\n", $arg0
++    end
++end
++document mod-validate
++mod-validate <module-address>
++Internal user-command used to validate the module parameter.
++If <module-address> is a real loaded module, set $mod to it, otherwise set $mod to 0.
++end
++
++
++define mod-print-symbols
++    mod-validate $arg0
++    if $mod != 0
++        set $i = 0
++        while $i < $mod->nsyms
++            set $sym = $mod->syms[$i]
++            printf "%p\t%s\n", $sym->value, $sym->name
++            set $i = $i + 1
++        end
++    end
++end
++document mod-print-symbols
++mod-print-symbols <module-address>
++Print all exported symbols of the module.  See mod-list.
++end
++
++
++define mod-add-symbols-align
++    mod-validate $arg0
++    if $mod != 0
++        set $mod_base = ($mod->size_of_struct + (long)$mod)
++        if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0)
++            set $mod_base = ($mod_base | ($arg2 - 1)) + 1
++        end
++        add-symbol-file $arg1 $mod_base
++    end
++end
++document mod-add-symbols-align
++mod-add-symbols-align <module-address> <object-file> <alignment>
++Load the symbols table of the module <module-address> from the object file
++<object-file>, where the first section alignment is <alignment>.
++To retrieve the alignment, use `objdump -h <object-file>'.
++end
++
++define mod-add-symbols
++    mod-add-symbols-align $arg0 $arg1 sizeof(long)
++end
++document mod-add-symbols
++mod-add-symbols <module-address> <object-file>
++Load the symbols table of the module <module-address> from the object file.
++Default alignment is 4.  See mod-add-symbols-align.
++end
++
++define mod-add-lis
++    mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16
++end
++document mod-add-lis
++mod-add-lis <module-address>
++Does mod-add-symbols <module-address> /usr/src/LiS/streams.o
++end
+Index: linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/debug-nmi.txt	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt	2005-04-05 12:48:05.261617192 +0800
+@@ -0,0 +1,37 @@
++Subject: Debugging with NMI
++Date: Mon, 12 Jul 1999 11:28:31 -0500
++From: David Grothe
++Organization: Gcom, Inc
++To: David Grothe
++
++Kernel hackers:
++
++Maybe this is old hat, but it is new to me --
++
++On an ISA bus machine, if you short out the A1 and B1 pins of an ISA
++slot you will generate an NMI to the CPU.  This interrupts even a
++machine that is hung in a loop with interrupts disabled.  Used in
++conjunction with kgdb <
++ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can
++gain debugger control of a machine that is hung in the kernel!  Even
++without kgdb the kernel will print a stack trace so you can find out
++where it was hung.
++
++The A1/B1 pins are directly opposite one another and the farthest pins
++towards the bracket end of the ISA bus socket.  You can stick a paper
++clip or multi-meter probe between them to short them out.
++
++I had a spare ISA bus to PC104 bus adapter around.  The PC104 end of the
++board consists of two rows of wire wrap pins.  So I wired a push button
++between the A1/B1 pins and now have an ISA board that I can stick into
++any ISA bus slot for debugger entry.
++
++Microsoft has a circuit diagram of a PCI card at
++http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM.  If you want to
++build one you will have to mail them and ask for the PAL equations.
++Nobody makes one commercially.
++
++[THIS TIP COMES WITH NO WARRANTY WHATSOEVER.  It works for me, but if
++your machine catches fire, it is your problem, not mine.]
++
++-- Dave (the kgdb guy)
+Index: linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/loadmodule.sh	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh	2005-04-05 12:48:05.274615216 +0800
+@@ -0,0 +1,78 @@
++#!/bin/sh
++# This script loads a module on a target machine and generates a gdb script.
++# Source the generated gdb script to load the module file at the appropriate
++# addresses in gdb.
++#
++# Usage:
++# Loading the module on the target machine and generating the gdb script:
++#	[foo]$ loadmodule.sh <modulefile>
++#
++# Loading the module file into gdb:
++#	(gdb) source <gdbscript>
++#
++# Modify the following variables according to your setup.
++#	TESTMACHINE - Name of the target machine
++#	GDBSCRIPTS - The directory where a gdb script will be generated
++#
++# Author: Amit S. Kale (akale@veritas.com).
++#
++# If you run into problems, please check the files pointed to by the
++# following variables.
++#	ERRFILE - /tmp/<modulefile>.errs contains stderr output of insmod
++#	MAPFILE - /tmp/<modulefile>.map contains stdout output of insmod
++#	GDBSCRIPT - $GDBSCRIPTS/load<modulefile> gdb script.
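[As a cross-check of the arithmetic that the mod-add-symbols-align macro in
gdbinit-modules above performs, the same base-address computation can be
written out in C.  This is an illustration only; the function name and the
example values are hypothetical and not part of the patch.

/* Where insmod placed the module's object file: the struct module sits
 * first, then the object, rounded up to the first section alignment. */
unsigned long module_object_base(unsigned long mod_addr,
				 unsigned long size_of_struct,
				 unsigned long align)
{
	unsigned long base = mod_addr + size_of_struct;

	/* same rounding as: set $mod_base = ($mod_base | ($arg2 - 1)) + 1 */
	if (align != 0 && (base & (align - 1)) != 0)
		base = (base | (align - 1)) + 1;
	return base;
}

For instance, a hypothetical module at 0xd082c000 with a 0x54-byte struct
module and 16-byte alignment yields 0xd082c060, the kind of .text base seen
in the example add-symbol-file script above.]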
++ ++TESTMACHINE=foo ++GDBSCRIPTS=/home/bar ++ ++if [ $# -lt 1 ] ; then { ++ echo Usage: $0 modulefile ++ exit ++} ; fi ++ ++MODULEFILE=$1 ++MODULEFILEBASENAME=`basename $1` ++ ++if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { ++ MODULEFILE=`pwd`/$MODULEFILE ++} fi ++ ++ERRFILE=/tmp/$MODULEFILEBASENAME.errs ++MAPFILE=/tmp/$MODULEFILEBASENAME.map ++GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME ++ ++function findaddr() { ++ local ADDR=0x$(echo "$SEGMENTS" | \ ++ grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ ++ sed 's/[ ]*[^ ]*$//') ++ echo $ADDR ++} ++ ++function checkerrs() { ++ if [ "`cat $ERRFILE`" != "" ] ; then { ++ cat $ERRFILE ++ exit ++ } fi ++} ++ ++#load the module ++echo Copying $MODULEFILE to $TESTMACHINE ++rcp $MODULEFILE root@${TESTMACHINE}: ++ ++echo Loading module $MODULEFILE ++rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ ++ > $MAPFILE 2> $ERRFILE ++checkerrs ++ ++SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` ++TEXTADDR=$(findaddr "\\.text[^.]") ++LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" ++SEGADDRS=`echo "$SEGMENTS" | awk '//{ ++ if ($1 != ".text" && $1 != ".this" && ++ $1 != ".kstrtab" && $1 != ".kmodtab") { ++ print " -s " $1 " 0x" $3 " " ++ } ++}'` ++LOADSTRING="$LOADSTRING $SEGADDRS" ++echo Generating script $GDBSCRIPT ++echo $LOADSTRING > $GDBSCRIPT +Index: linux-2.6.10/Documentation/i386/kgdb/andthen +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/andthen 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/andthen 2005-04-05 12:48:05.272615520 +0800 +@@ -0,0 +1,100 @@ ++ ++define set_andthen ++ set var $thp=0 ++ set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] ++ set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) ++ set var $at_oc=kgdb_and_then_count ++ set var $at_cc=$at_oc ++end ++ ++define andthen_next ++ set var $at_cc=$arg0 ++end ++ ++define andthen ++ andthen_set_edge ++ if ($at_cc >= $at_oc) ++ printf "Outside window. Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc ++ output *($thp+($at_cc++ % $at_size )) ++ printf "\n" ++ end ++end ++define andthen_set_edge ++ set var $at_oc=kgdb_and_then_count ++ set var $at_low = $at_oc - $at_size ++ if ($at_low < 0 ) ++ set var $at_low = 0 ++ end ++ if (( $at_cc > $at_oc) || ($at_cc < $at_low)) ++ printf "Count outside of window, setting count to " ++ if ($at_cc >= $at_oc) ++ set var $at_cc = $at_oc ++ else ++ set var $at_cc = $at_low ++ end ++ printf "%d\n",$at_cc ++ end ++end ++ ++define beforethat ++ andthen_set_edge ++ if ($at_cc <= $at_low) ++ printf "Outside window. Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc-1 ++ output *($thp+(--$at_cc % $at_size )) ++ printf "\n" ++ end ++end ++ ++document andthen_next ++ andthen_next ++ . sets the number of the event to display next. If this event ++ . is not in the event pool, either andthen or beforethat will ++ . correct it to the nearest event pool edge. The event pool ++ . ends at the last event recorded and begins ++ . prior to that. If beforethat is used next, it will display ++ . event -1. ++. ++ andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++ ++document andthen ++ andthen ++. displays the next event in the list. sets up to display ++. the oldest saved event first. ++. (optional) count of the event to display. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to andthen the index will change ++. 
but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document set_andthen ++ set_andthen ++. sets up to use the and commands. ++. if you have defined your own struct, use the above and ++. then enter the following: ++. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] ++. where is the name of your structure. ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document beforethat ++ beforethat ++. displays the next prior event in the list. sets up to ++. display the last occuring event first. ++. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to beforethat the index will change ++. but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end +Index: linux-2.6.10/arch/i386/lib/kgdb_serial.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/lib/kgdb_serial.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/lib/kgdb_serial.c 2005-04-05 12:48:05.193627528 +0800 +@@ -0,0 +1,485 @@ ++/* ++ * Serial interface GDB stub ++ * ++ * Written (hacked together) by David Grothe (dave@gcom.com) ++ * Modified to allow invokation early in boot see also ++ * kgdb.h for instructions by George Anzinger(george@mvista.com) ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KGDB_USER_CONSOLE ++extern void kgdb_console_finit(void); ++#endif ++#define PRNT_off ++#define TEST_EXISTANCE ++#ifdef PRNT ++#define dbprintk(s) printk s ++#else ++#define dbprintk(s) ++#endif ++#define TEST_INTERRUPT_off ++#ifdef TEST_INTERRUPT ++#define intprintk(s) printk s ++#else ++#define intprintk(s) ++#endif ++ ++#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++static char gdb_buf[GDB_BUF_SIZE]; ++static int gdb_buf_in_inx; ++static atomic_t gdb_buf_in_cnt; ++static int gdb_buf_out_inx; ++ ++struct async_struct *gdb_async_info; ++static int gdb_async_irq; ++ ++#define outb_px(a,b) outb_p(b,a) ++ ++static void program_uart(struct async_struct *info); ++static void write_char(struct async_struct *info, int chr); ++/* ++ * Get a byte from the hardware data buffer and return it ++ */ ++static int ++read_data_bfr(struct async_struct *info) ++{ ++ char it = inb_p(info->port + UART_LSR); ++ ++ if (it & UART_LSR_DR) ++ return (inb_p(info->port + UART_RX)); ++ /* ++ * If we have a framing error assume somebody messed with ++ * our uart. Reprogram it and send '-' both ways... ++ */ ++ if (it & 0xc) { ++ program_uart(info); ++ write_char(info, '-'); ++ return ('-'); ++ } ++ return (-1); ++ ++} /* read_data_bfr */ ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ ++ * Locking here is a bit of a problem. We MUST not lock out communication ++ * if we are trying to talk to gdb about a kgdb entry. ON the other hand ++ * we can loose chars in the console pass thru if we don't lock. 
It is also ++ * possible that we could hold the lock or be waiting for it when kgdb ++ * NEEDS to talk. Since kgdb locks down the world, it does not need locks. ++ * We do, of course have possible issues with interrupting a uart operation, ++ * but we will just depend on the uart status to help keep that straight. ++ ++ */ ++static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; ++#ifdef CONFIG_SMP ++extern spinlock_t kgdb_spinlock; ++#endif ++ ++static int ++read_char(struct async_struct *info) ++{ ++ int chr; ++ unsigned long flags; ++ local_irq_save(flags); ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_lock(&uart_interrupt_lock); ++ } ++#endif ++ if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ ++ chr = gdb_buf[gdb_buf_out_inx++]; ++ gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); ++ atomic_dec(&gdb_buf_in_cnt); ++ } else { ++ chr = read_data_bfr(info); ++ } ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_unlock(&uart_interrupt_lock); ++ } ++#endif ++ local_irq_restore(flags); ++ return (chr); ++} ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void ++write_char(struct async_struct *info, int chr) ++{ ++ while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; ++ ++ outb_p(chr, info->port + UART_TX); ++ ++} /* write_char */ ++ ++/* ++ * Mostly we don't need a spinlock, but since the console goes ++ * thru here with interrutps on, well, we need to catch those ++ * chars. ++ */ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * It will receive a limited number of characters of input ++ * from the gdb host machine and save them up in a buffer. ++ * ++ * When the gdb stub routine getDebugChar() is called it ++ * draws characters out of the buffer until it is empty and ++ * then reads directly from the serial port. ++ * ++ * We do not attempt to write chars from the interrupt routine ++ * since the stubs do all of that via putDebugChar() which ++ * writes one byte after waiting for the interface to become ++ * ready. ++ * ++ * The debug stubs like to run with interrupts disabled since, ++ * after all, they run as a consequence of a breakpoint in ++ * the kernel. ++ * ++ * Perhaps someone who knows more about the tty driver than I ++ * care to learn can make this work for any low level serial ++ * driver. ++ */ ++static irqreturn_t ++gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct async_struct *info; ++ unsigned long flags; ++ ++ info = gdb_async_info; ++ if (!info || !info->tty || irq != gdb_async_irq) ++ return IRQ_NONE; ++ ++ local_irq_save(flags); ++ spin_lock(&uart_interrupt_lock); ++ do { ++ int chr = read_data_bfr(info); ++ intprintk(("Debug char on int: %x hex\n", chr)); ++ if (chr < 0) ++ continue; ++ ++ if (chr == 3) { /* Ctrl-C means remote interrupt */ ++ BREAKPOINT; ++ continue; ++ } ++ ++ if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { ++ /* buffer overflow tosses early char */ ++ read_char(info); ++ } ++ gdb_buf[gdb_buf_in_inx++] = chr; ++ gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); ++ } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); ++ spin_unlock(&uart_interrupt_lock); ++ local_irq_restore(flags); ++ return IRQ_HANDLED; ++} /* gdb_interrupt */ ++ ++/* ++ * Just a NULL routine for testing. 
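[A note on the buffer arithmetic above: GDB_BUF_SIZE must be a power of two
("power of 2, please") precisely so that masking with GDB_BUF_SIZE - 1 wraps
the ring index without a divide.  The scheme in isolation, as a sketch rather
than patch code:

#define BUF_SIZE 512			/* must be a power of two */

static char buf[BUF_SIZE];
static int in_inx, out_inx, count;	/* count stands in for the atomic_t */

/* producer side, as in gdb_interrupt() */
static void put_byte(char c)
{
	if (count >= BUF_SIZE)
		return;			/* full; the driver drops the oldest char instead */
	buf[in_inx++] = c;
	in_inx &= (BUF_SIZE - 1);	/* wrap: only valid for power-of-two sizes */
	count++;
}

/* consumer side, as in read_char() */
static int get_byte(void)
{
	int c;

	if (count == 0)
		return -1;		/* empty: fall back to reading the UART */
	c = buf[out_inx++];
	out_inx &= (BUF_SIZE - 1);
	count--;
	return c;
}]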
++ */ ++void ++gdb_null(void) ++{ ++} /* gdb_null */ ++ ++/* These structure are filled in with values defined in asm/kgdb_local.h ++ */ ++static struct serial_state state = SB_STATE; ++static struct async_struct local_info = SB_INFO; ++static int ok_to_enable_ints = 0; ++static void kgdb_enable_ints_now(void); ++ ++extern char *kgdb_version; ++/* ++ * Hook an IRQ for KGDB. ++ * ++ * This routine is called from putDebugChar, below. ++ */ ++static int ints_disabled = 1; ++int ++gdb_hook_interrupt(struct async_struct *info, int verb) ++{ ++ struct serial_state *state = info->state; ++ unsigned long flags; ++ int port; ++#ifdef TEST_EXISTANCE ++ int scratch, scratch2; ++#endif ++ ++ /* The above fails if memory managment is not set up yet. ++ * Rather than fail the set up, just keep track of the fact ++ * and pick up the interrupt thing later. ++ */ ++ gdb_async_info = info; ++ port = gdb_async_info->port; ++ gdb_async_irq = state->irq; ++ if (verb) { ++ printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", ++ kgdb_version, ++ port, ++ gdb_async_irq, gdb_async_info->state->custom_divisor); ++ } ++ local_irq_save(flags); ++#ifdef TEST_EXISTANCE ++ /* Existance test */ ++ /* Should not need all this, but just in case.... */ ++ ++ scratch = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, 0); ++ outb_px(0xff, 0x080); ++ scratch2 = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, scratch); ++ if (scratch2) { ++ printk ++ ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); ++ local_irq_restore(flags); ++ return 1; /* We failed; there's nothing here */ ++ } ++ scratch2 = inb_p(port + UART_LCR); ++ outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ ++ outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ ++ outb_px(port + UART_LCR, 0); ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); ++ scratch = inb_p(port + UART_IIR) >> 6; ++ if (scratch == 1) { ++ printk("gdb_hook_interrupt: Undefined UART type!" ++ " Not a UART! \n"); ++ local_irq_restore(flags); ++ return 1; ++ } else { ++ dbprintk(("gdb_hook_interrupt: UART type " ++ "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); ++ } ++ scratch = inb_p(port + UART_MCR); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); ++ scratch2 = inb_p(port + UART_MSR) & 0xF0; ++ outb_px(port + UART_MCR, scratch); ++ if (scratch2 != 0x90) { ++ printk("gdb_hook_interrupt: " ++ "Loop back test failed! 
Not a UART!\n"); ++ local_irq_restore(flags); ++ return scratch2 + 1000; /* force 0 to fail */ ++ } ++#endif /* test existance */ ++ program_uart(info); ++ local_irq_restore(flags); ++ ++ return (0); ++ ++} /* gdb_hook_interrupt */ ++ ++static void ++program_uart(struct async_struct *info) ++{ ++ int port = info->port; ++ ++ (void) inb_p(port + UART_RX); ++ outb_px(port + UART_IER, 0); ++ ++ (void) inb_p(port + UART_RX); /* serial driver comments say */ ++ (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ ++ (void) inb_p(port + UART_MSR); ++ outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); ++ outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ ++ outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ ++ outb_px(port + UART_MCR, info->MCR); ++ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ ++ outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ return; ++} ++ ++/* ++ * getDebugChar ++ * ++ * This is a GDB stub routine. It waits for a character from the ++ * serial interface and then returns it. If there is no serial ++ * interface connection then it returns a bogus value which will ++ * almost certainly cause the system to hang. In the ++ */ ++int kgdb_in_isr = 0; ++int kgdb_in_lsr = 0; ++extern spinlock_t kgdb_spinlock; ++ ++/* Caller takes needed protections */ ++ ++int ++getDebugChar(void) ++{ ++ volatile int chr, dum, time, end_time; ++ ++ dbprintk(("getDebugChar(port %x): ", gdb_async_info->port)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ /* ++ * This trick says if we wait a very long time and get ++ * no char, return the -1 and let the upper level deal ++ * with it. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; ++ while (((chr = read_char(gdb_async_info)) == -1) && ++ (end_time - time) > 0) { ++ rdtsc(dum, time); ++ }; ++ /* ++ * This covers our butts if some other code messes with ++ * our uart, hay, it happens :o) ++ */ ++ if (chr == -1) ++ program_uart(gdb_async_info); ++ ++ dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); ++ return (chr); ++ ++} /* getDebugChar */ ++ ++static int count = 3; ++static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; ++ ++static int __init ++kgdb_enable_ints(void) ++{ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 1); ++ } ++ ok_to_enable_ints = 1; ++ kgdb_enable_ints_now(); ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ kgdb_console_finit(); ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_SERIAL_8250 ++void shutdown_for_kgdb(struct async_struct *gdb_async_info); ++#endif ++ ++#ifdef CONFIG_DISCONTIGMEM ++static inline int kgdb_mem_init_done(void) ++{ ++ return highmem_start_page != NULL; ++} ++#else ++static inline int kgdb_mem_init_done(void) ++{ ++ return max_mapnr != 0; ++} ++#endif ++ ++static void ++kgdb_enable_ints_now(void) ++{ ++ if (!spin_trylock(&one_at_atime)) ++ return; ++ if (!ints_disabled) ++ goto exit; ++ if (kgdb_mem_init_done() && ++ ints_disabled) { /* don't try till mem init */ ++#ifdef CONFIG_SERIAL_8250 ++ /* ++ * The ifdef here allows the system to be configured ++ * without the serial driver. 
++ * Don't make it a module, however, it will steal the port ++ */ ++ shutdown_for_kgdb(gdb_async_info); ++#endif ++ ints_disabled = request_irq(gdb_async_info->state->irq, ++ gdb_interrupt, ++ IRQ_T(gdb_async_info), ++ "KGDB-stub", NULL); ++ intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); ++ } ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ exit: ++ spin_unlock(&one_at_atime); ++} ++ ++/* ++ * putDebugChar ++ * ++ * This is a GDB stub routine. It waits until the interface is ready ++ * to transmit a char and then sends it. If there is no serial ++ * interface connection then it simply returns to its caller, having ++ * pretended to send the char. Caller takes needed protections. ++ */ ++void ++putDebugChar(int chr) ++{ ++ dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", ++ gdb_async_info->port, ++ chr, ++ chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ ++ write_char(gdb_async_info, chr); /* this routine will wait */ ++ count = (chr == '#') ? 0 : count + 1; ++ if ((count == 2)) { /* try to enable after */ ++ if (ints_disabled & ok_to_enable_ints) ++ kgdb_enable_ints_now(); /* try to enable after */ ++ ++ /* We do this a lot because, well we really want to get these ++ * interrupts. The serial driver will clear these bits when it ++ * initializes the chip. Every thing else it does is ok, ++ * but this. ++ */ ++ if (!ints_disabled) { ++ outb_px(gdb_async_info->port + UART_IER, ++ gdb_async_info->IER); ++ } ++ } ++ ++} /* putDebugChar */ ++ ++module_init(kgdb_enable_ints); +Index: linux-2.6.10/arch/i386/lib/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/lib/Makefile 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/i386/lib/Makefile 2005-04-05 12:48:05.194627376 +0800 +@@ -8,3 +8,4 @@ + + lib-$(CONFIG_X86_USE_3DNOW) += mmx.o + lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o ++lib-$(CONFIG_KGDB) += kgdb_serial.o +Index: linux-2.6.10/arch/i386/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 12:48:05.204625856 +0800 +@@ -65,4 +65,6 @@ + depends on X86_LOCAL_APIC && !X86_VISWS + default y + ++source "arch/i386/Kconfig.kgdb" ++ + endmenu +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 12:48:03.413898088 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:48:05.244619776 +0800 +@@ -48,6 +48,18 @@ + #include + #include + #include "irq_vectors.h" ++ /* We do not recover from a stack overflow, but at least ++ * we know it happened and should be able to track it down. 
++ */ ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#define STACK_OVERFLOW_TEST \ ++ testl $(THREAD_SIZE - 512),%esp; \ ++ jnz 10f; \ ++ call stack_overflow; \ ++10: ++#else ++#define STACK_OVERFLOW_TEST ++#endif + + #define nr_syscalls ((syscall_table_size)/4) + +@@ -94,7 +106,8 @@ + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ +- movl %edx, %es; ++ movl %edx, %es; \ ++ STACK_OVERFLOW_TEST + + #define RESTORE_INT_REGS \ + popl %ebx; \ +@@ -198,6 +211,7 @@ + # sysenter call handler stub + ENTRY(sysenter_entry) + movl TSS_sysenter_esp0(%esp),%esp ++ .globl sysenter_past_esp + sysenter_past_esp: + sti + pushl $(__USER_DS) +@@ -261,6 +275,19 @@ + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + restore_all: ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++ movl EFLAGS(%esp), %eax # mix EFLAGS and CS ++ movb CS(%esp), %al ++ testl $(VM_MASK | 3), %eax ++ jz resume_kernelX # returning to kernel or vm86-space ++ ++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? ++ jz resume_kernelX ++ ++ int $3 ++ ++resume_kernelX: ++#endif + RESTORE_ALL + + # perform work that needs to be done immediately before resumption +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-03-31 16:20:09.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 12:48:05.221623272 +0800 +@@ -105,6 +105,39 @@ + return err; + } + ++#ifdef CONFIG_KGDB ++extern void sysenter_past_esp(void); ++#include ++#include ++void set_intr_gate(unsigned int n, void *addr); ++static void set_intr_usr_gate(unsigned int n, void *addr); ++/* ++ * Should be able to call this breakpoint() very early in ++ * bring up. Just hard code the call where needed. ++ * The breakpoint() code is here because set_?_gate() functions ++ * are local (static) to trap.c. They need be done only once, ++ * but it does not hurt to do them over. ++ */ ++void breakpoint(void) ++{ ++ set_intr_usr_gate(3,&int3); /* disable ints on trap */ ++ set_intr_gate(1,&debug); ++ set_intr_gate(14,&page_fault); ++ ++ BREAKPOINT; ++} ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ ++ { \ ++ if (!user_mode(regs) ) \ ++ { \ ++ kgdb_handle_exception(trapnr, signr, error_code, regs); \ ++ after; \ ++ } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ ++ } ++#else ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) ++#endif ++ + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) + { + return p > (void *)tinfo && +@@ -332,6 +365,15 @@ + #endif + if (nl) + printk("\n"); ++#ifdef CONFIG_KGDB ++ /* This is about the only place we want to go to kgdb even if in ++ * user mode. But we must go in via a trap so within kgdb we will ++ * always be in kernel mode. 
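[The STACK_OVERFLOW_TEST macro added to entry.S above has a compact C
reading: with THREAD_SIZE = 8192, the mask THREAD_SIZE - 512 covers %esp bits
that are nonzero whenever the stack pointer is more than 512 bytes above the
bottom of the thread stack; when they all read zero, execution is inside the
last 512 bytes and stack_overflow() is called.  A sketch of the equivalent
check, assuming 8KB i386 kernel stacks:

#define THREAD_SIZE 8192		/* 8KB stacks assumed, as on i386 here */

extern void stack_overflow(void);	/* the reporting hook the macro calls */

/* C equivalent of: testl $(THREAD_SIZE - 512),%esp ; jnz 10f ; call stack_overflow */
static inline void check_stack_overflow(unsigned long esp)
{
	if ((esp & (THREAD_SIZE - 512)) == 0)
		stack_overflow();
}]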
++ */ ++ if (user_mode(regs)) ++ BREAKPOINT; ++#endif ++ CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + } else +@@ -397,6 +439,7 @@ + #define DO_ERROR(trapnr, signr, str, name) \ + fastcall void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -420,6 +463,7 @@ + #define DO_VM86_ERROR(trapnr, signr, str, name) \ + fastcall void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -503,6 +547,7 @@ + + gp_in_kernel: + if (!fixup_exception(regs)) { ++ CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; +@@ -716,12 +761,35 @@ + * allowing programs to debug themselves without the ptrace() + * interface. + */ +- if ((regs->xcs & 3) == 0) +- goto clear_TF_reenable; ++#ifdef CONFIG_KGDB ++ /* ++ * I think this is the only "real" case of a TF in the kernel ++ * that really belongs to user space. Others are ++ * "Ours all ours!" ++ */ ++ if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_past_esp)) ++ goto clear_TF_reenable; ++#else ++ if ((regs->xcs & 3) == 0) ++ goto clear_TF_reenable; ++#endif + if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) + goto clear_TF; + } + ++#ifdef CONFIG_KGDB ++ /* ++ * If this is a kernel mode trap, we need to reset db7 to allow us ++ * to continue sanely ALSO skip the signal delivery ++ */ ++ if ((regs->xcs & 3) == 0) ++ goto clear_dr7; ++ ++ /* if not kernel, allow ints but only if they were on */ ++ if (regs->eflags & 0x200) ++ local_irq_enable(); ++#endif ++ + /* Ok, finally something we can handle */ + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; +@@ -743,6 +811,7 @@ + __asm__("movl %0,%%db7" + : /* no output */ + : "r" (0)); ++ CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) + return; + + debug_vm86: +@@ -999,6 +1068,12 @@ + { + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + } ++#ifdef CONFIG_KGDB ++void set_intr_usr_gate(unsigned int n, void *addr) ++{ ++ _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); ++} ++#endif + + + void __init trap_init(void) +@@ -1016,7 +1091,11 @@ + set_trap_gate(0,÷_error); + set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); ++#ifndef CONFIG_KGDB + set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ ++#else ++ set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ ++#endif + set_system_gate(4,&overflow); + set_system_gate(5,&bounds); + set_trap_gate(6,&invalid_op); +Index: linux-2.6.10/arch/i386/kernel/nmi.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/nmi.c 2005-03-31 15:57:19.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/nmi.c 2005-04-05 12:48:05.222623120 +0800 +@@ -34,7 +34,17 @@ + + #include "mach_traps.h" + ++#ifdef CONFIG_KGDB ++#include ++#ifdef CONFIG_SMP ++unsigned int nmi_watchdog = NMI_IO_APIC; ++#else ++unsigned int nmi_watchdog = NMI_LOCAL_APIC; ++#endif ++#else + unsigned int nmi_watchdog = NMI_NONE; ++#endif ++ + extern int unknown_nmi_panic; + static unsigned int nmi_hz = HZ; + static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +@@ -466,6 +476,9 @@ + for (i = 0; 
i < NR_CPUS; i++) + alert_counter[i] = 0; + } ++#ifdef CONFIG_KGDB ++int tune_watchdog = 5*HZ; ++#endif + + extern void die_nmi(struct pt_regs *, const char *msg); + +@@ -480,14 +493,25 @@ + int sum, cpu = smp_processor_id(); + + sum = irq_stat[cpu].apic_timer_irqs; +- +- if (last_irq_sums[cpu] == sum) { ++#ifdef CONFIG_KGDB ++ if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) { ++ ++#else ++ if (last_irq_sums[cpu] == sum) { ++#endif + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; +- if (alert_counter[cpu] == 30*nmi_hz) ++#ifdef CONFIG_KGDB ++ if (alert_counter[cpu] == tune_watchdog) { ++ kgdb_handle_exception(2, SIGPWR, 0, regs); ++ last_irq_sums[cpu] = sum; ++ alert_counter[cpu] = 0; ++ } ++#endif ++ if (alert_counter[cpu] == 5*nmi_hz) + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } else { + last_irq_sums[cpu] = sum; +Index: linux-2.6.10/arch/i386/kernel/kgdb_stub.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/kgdb_stub.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/kernel/kgdb_stub.c 2005-04-05 12:48:05.242620080 +0800 +@@ -0,0 +1,2330 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (c) 2000 VERITAS Software Corporation. ++ * ++ */ ++/**************************************************************************** ++ * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ ++ * ++ * Module name: remcom.c $ ++ * Revision: 1.34 $ ++ * Date: 91/03/09 12:29:49 $ ++ * Contributor: Lake Stevens Instrument Division$ ++ * ++ * Description: low level support for gdb debugger. $ ++ * ++ * Considerations: only works on target hardware $ ++ * ++ * Written by: Glenn Engel $ ++ * Updated by: David Grothe ++ * ModuleState: Experimental $ ++ * ++ * NOTES: See Below $ ++ * ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Compatibility with 2.1.xx kernel by David Grothe ++ * ++ * Changes to allow auto initilization. All that is needed is that it ++ * be linked with the kernel and a break point (int 3) be executed. ++ * The header file defines BREAKPOINT to allow one to do ++ * this. It should also be possible, once the interrupt system is up, to ++ * call putDebugChar("+"). Once this is done, the remote debugger should ++ * get our attention by sending a ^C in a packet. George Anzinger ++ * ++ * Integrated into 2.2.5 kernel by Tigran Aivazian ++ * Added thread support, support for multiple processors, ++ * support for ia-32(x86) hardware debugging. ++ * Amit S. Kale ( akale@veritas.com ) ++ * ++ * ++ * To enable debugger support, two things need to happen. One, a ++ * call to set_debug_traps() is necessary in order to allow any breakpoints ++ * or error conditions to be properly intercepted and reported to gdb. ++ * Two, a breakpoint needs to be generated to begin communication. This ++ * is most easily accomplished by a call to breakpoint(). Breakpoint() ++ * simulates a breakpoint by executing an int 3. 
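[The two-step enablement the header above describes looks like this from the
caller's side.  A minimal sketch using only the entry points the comment
names; the wrapper function itself is hypothetical:

extern void set_debug_traps(void);	/* route traps/breakpoints to the stub */
extern void breakpoint(void);		/* executes int 3 to open the session */

/* hypothetical call site, early enough that the serial port is usable */
static void kgdb_hello(void)
{
	set_debug_traps();
	breakpoint();	/* gdb on the other end of the serial line takes over */
}]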
++ * ++ ************* ++ * ++ * The following gdb commands are supported: ++ * ++ * command function Return value ++ * ++ * g return the value of the CPU registers hex data or ENN ++ * G set the value of the CPU registers OK or ENN ++ * ++ * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN ++ * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN ++ * ++ * c Resume at current address SNN ( signal NN) ++ * cAA..AA Continue at address AA..AA SNN ++ * ++ * s Step one instruction SNN ++ * sAA..AA Step one instruction from AA..AA SNN ++ * ++ * k kill ++ * ++ * ? What was the last sigval ? SNN (signal NN) ++ * ++ * All commands and responses are sent with a packet which includes a ++ * checksum. A packet consists of ++ * ++ * $#. ++ * ++ * where ++ * :: ++ * :: < two hex digits computed as modulo 256 sum of > ++ * ++ * When a packet is received, it is first acknowledged with either '+' or '-'. ++ * '+' indicates a successful transfer. '-' indicates a failed transfer. ++ * ++ * Example: ++ * ++ * Host: Reply: ++ * $m0,10#2a +$00010203040506070809101112131415#42 ++ * ++ ****************************************************************************/ ++#define KGDB_VERSION "<20030915.1651.33>" ++#include ++#include ++#include /* for strcpy */ ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/************************************************************************ ++ * ++ * external low-level support routines ++ */ ++typedef void (*Function) (void); /* pointer to a function */ ++ ++/* Thread reference */ ++typedef unsigned char threadref[8]; ++ ++extern void putDebugChar(int); /* write a single character */ ++extern int getDebugChar(void); /* read and return a single char */ ++ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 400 ++ ++char *kgdb_version = KGDB_VERSION; ++ ++/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ ++int debug_regs = 0; /* set to non-zero to print registers */ ++ ++/* filled in by an external module */ ++char *gdb_module_offsets; ++ ++static const char hexchars[] = "0123456789abcdef"; ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES 64 ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++enum regnames { _EAX, /* 0 */ ++ _ECX, /* 1 */ ++ _EDX, /* 2 */ ++ _EBX, /* 3 */ ++ _ESP, /* 4 */ ++ _EBP, /* 5 */ ++ _ESI, /* 6 */ ++ _EDI, /* 7 */ ++ _PC /* 8 also known as eip */ , ++ _PS /* 9 also known as eflags */ , ++ _CS, /* 10 */ ++ _SS, /* 11 */ ++ _DS, /* 12 */ ++ _ES, /* 13 */ ++ _FS, /* 14 */ ++ _GS /* 15 */ ++}; ++ ++/*************************** ASSEMBLY CODE MACROS *************************/ ++/* ++ * Put the error code here just in case the user cares. ++ * Likewise, the vector number here (since GDB only gets the signal ++ * number through the usual means, and that's not very specific). ++ * The called_from is the return address so he can tell how we entered kgdb. ++ * This will allow him to seperate out the various possible entries. 
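[The checksum in the packet format described above is a plain modulo-256 sum
of the payload bytes, sent as two hex digits.  A standalone sketch that
reproduces the documented example exchange ($m0,10#2a):

#include <stdio.h>

/* modulo-256 sum of the packet payload, per the description above */
static unsigned char gdb_checksum(const char *s)
{
	unsigned char sum = 0;

	while (*s)
		sum += (unsigned char)*s++;
	return sum;
}

int main(void)
{
	printf("$%s#%02x\n", "m0,10", gdb_checksum("m0,10")); /* prints $m0,10#2a */
	return 0;
}]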
++ */ ++#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ ++ ++#define PID_MAX PID_MAX_DEFAULT ++ ++#ifdef CONFIG_SMP ++void smp_send_nmi_allbutself(void); ++#define IF_SMP(x) x ++#undef MAX_NO_CPUS ++#ifndef CONFIG_NO_KGDB_CPUS ++#define CONFIG_NO_KGDB_CPUS 2 ++#endif ++#if CONFIG_NO_KGDB_CPUS > NR_CPUS ++#define MAX_NO_CPUS NR_CPUS ++#else ++#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS ++#endif ++#define hold_init hold_on_sstep: 1, ++#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) ++#define NUM_CPUS num_online_cpus() ++#else ++#define IF_SMP(x) ++#define hold_init ++#undef MAX_NO_CPUS ++#define MAX_NO_CPUS 1 ++#define NUM_CPUS 1 ++#endif ++#define NOCPU (struct task_struct *)0xbad1fbad ++/* *INDENT-OFF* */ ++struct kgdb_info { ++ int used_malloc; ++ void *called_from; ++ long long entry_tsc; ++ int errcode; ++ int vector; ++ int print_debug_info; ++#ifdef CONFIG_SMP ++ int hold_on_sstep; ++ struct { ++ volatile struct task_struct *task; ++ int pid; ++ int hold; ++ struct pt_regs *regs; ++ } cpus_waiting[MAX_NO_CPUS]; ++#endif ++} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; ++ ++/* *INDENT-ON* */ ++ ++#define used_m kgdb_info.used_malloc ++/* ++ * This is little area we set aside to contain the stack we ++ * need to build to allow gdb to call functions. We use one ++ * per cpu to avoid locking issues. We will do all this work ++ * with interrupts off so that should take care of the protection ++ * issues. ++ */ ++#define LOOKASIDE_SIZE 200 /* should be more than enough */ ++#define MALLOC_MAX 200 /* Max malloc size */ ++struct { ++ unsigned int esp; ++ int array[LOOKASIDE_SIZE]; ++} fn_call_lookaside[MAX_NO_CPUS]; ++ ++static int trap_cpu; ++static unsigned int OLD_esp; ++ ++#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] ++#define IF_BIT 0x200 ++#define TF_BIT 0x100 ++ ++#define MALLOC_ROUND 8-1 ++ ++static char malloc_array[MALLOC_MAX]; ++IF_SMP(static void to_gdb(const char *mess)); ++void * ++malloc(int size) ++{ ++ ++ if (size <= (MALLOC_MAX - used_m)) { ++ int old_used = used_m; ++ used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); ++ return &malloc_array[old_used]; ++ } else { ++ return NULL; ++ } ++} ++ ++/* ++ * Gdb calls functions by pushing agruments, including a return address ++ * on the stack and the adjusting EIP to point to the function. The ++ * whole assumption in GDB is that we are on a different stack than the ++ * one the "user" i.e. code that hit the break point, is on. This, of ++ * course is not true in the kernel. Thus various dodges are needed to ++ * do the call without directly messing with EIP (which we can not change ++ * as it is just a location and not a register. To adjust it would then ++ * require that we move every thing below EIP up or down as needed. This ++ * will not work as we may well have stack relative pointer on the stack ++ * (such as the pointer to regs, for example). ++ ++ * So here is what we do: ++ * We detect gdb attempting to store into the stack area and instead, store ++ * into the fn_call_lookaside.array at the same relative location as if it ++ * were the area ESP pointed at. We also trap ESP modifications ++ * and uses these to adjust fn_call_lookaside.esp. On entry ++ * fn_call_lookaside.esp will be set to point at the last entry in ++ * fn_call_lookaside.array. This allows us to check if it has changed, and ++ * if so, on exit, we add the registers we will use to do the move and a ++ * trap/ interrupt return exit sequence. 
We then adjust the eflags in the ++ * regs array (remember we now have a copy in the fn_call_lookaside.array) to ++ * kill the interrupt bit, AND we change EIP to point at our set up stub. ++ * As part of the register set up we preset the registers to point at the ++ * begining and end of the fn_call_lookaside.array, so all the stub needs to ++ * do is move words from the array to the stack until ESP= the desired value ++ * then do the rti. This will then transfer to the desired function with ++ * all the correct registers. Nifty huh? ++ */ ++extern asmlinkage void fn_call_stub(void); ++extern asmlinkage void fn_rtn_stub(void); ++/* *INDENT-OFF* */ ++__asm__("fn_rtn_stub:\n\t" ++ "movl %eax,%esp\n\t" ++ "fn_call_stub:\n\t" ++ "1:\n\t" ++ "addl $-4,%ebx\n\t" ++ "movl (%ebx), %eax\n\t" ++ "pushl %eax\n\t" ++ "cmpl %esp,%ecx\n\t" ++ "jne 1b\n\t" ++ "popl %eax\n\t" ++ "popl %ebx\n\t" ++ "popl %ecx\n\t" ++ "iret \n\t"); ++/* *INDENT-ON* */ ++#define gdb_i386vector kgdb_info.vector ++#define gdb_i386errcode kgdb_info.errcode ++#define waiting_cpus kgdb_info.cpus_waiting ++#define remote_debug kgdb_info.print_debug_info ++#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold ++/* gdb locks */ ++ ++#ifdef CONFIG_SMP ++static int in_kgdb_called; ++static spinlock_t waitlocks[MAX_NO_CPUS] = ++ {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; ++/* ++ * The following array has the thread pointer of each of the "other" ++ * cpus. We make it global so it can be seen by gdb. ++ */ ++volatile int in_kgdb_entry_log[MAX_NO_CPUS]; ++volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; ++/* ++static spinlock_t continuelocks[MAX_NO_CPUS]; ++*/ ++spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; ++/* waiters on our spinlock plus us */ ++static atomic_t spinlock_waiters = ATOMIC_INIT(1); ++static int spinlock_count = 0; ++static int spinlock_cpu = 0; ++/* ++ * Note we use nested spin locks to account for the case where a break ++ * point is encountered when calling a function by user direction from ++ * kgdb. Also there is the memory exception recursion to account for. ++ * Well, yes, but this lets other cpus thru too. Lets add a ++ * cpu id to the lock. ++ */ ++#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ ++ spinlock_cpu != smp_processor_id()){\ ++ atomic_inc(&spinlock_waiters); \ ++ while (! 
spin_trylock(x)) {\ ++ in_kgdb(®s);\ ++ }\ ++ atomic_dec(&spinlock_waiters); \ ++ spinlock_count = 1; \ ++ spinlock_cpu = smp_processor_id(); \ ++ }else{ \ ++ spinlock_count++; \ ++ } ++#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) ++#else ++unsigned kgdb_spinlock = 0; ++#define KGDB_SPIN_LOCK(x) --*x ++#define KGDB_SPIN_UNLOCK(x) ++*x ++#endif ++ ++int ++hex(char ch) ++{ ++ if ((ch >= 'a') && (ch <= 'f')) ++ return (ch - 'a' + 10); ++ if ((ch >= '0') && (ch <= '9')) ++ return (ch - '0'); ++ if ((ch >= 'A') && (ch <= 'F')) ++ return (ch - 'A' + 10); ++ return (-1); ++} ++ ++/* scan for the sequence $# */ ++void ++getpacket(char *buffer) ++{ ++ unsigned char checksum; ++ unsigned char xmitcsum; ++ int i; ++ int count; ++ char ch; ++ ++ do { ++ /* wait around for the start character, ignore all other characters */ ++ while ((ch = (getDebugChar() & 0x7f)) != '$') ; ++ checksum = 0; ++ xmitcsum = -1; ++ ++ count = 0; ++ ++ /* now, read until a # or end of buffer is found */ ++ while (count < BUFMAX) { ++ ch = getDebugChar() & 0x7f; ++ if (ch == '#') ++ break; ++ checksum = checksum + ch; ++ buffer[count] = ch; ++ count = count + 1; ++ } ++ buffer[count] = 0; ++ ++ if (ch == '#') { ++ xmitcsum = hex(getDebugChar() & 0x7f) << 4; ++ xmitcsum += hex(getDebugChar() & 0x7f); ++ if ((remote_debug) && (checksum != xmitcsum)) { ++ printk ++ ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", ++ checksum, xmitcsum, buffer); ++ } ++ ++ if (checksum != xmitcsum) ++ putDebugChar('-'); /* failed checksum */ ++ else { ++ putDebugChar('+'); /* successful transfer */ ++ /* if a sequence char is present, reply the sequence ID */ ++ if (buffer[2] == ':') { ++ putDebugChar(buffer[0]); ++ putDebugChar(buffer[1]); ++ /* remove sequence chars from buffer */ ++ count = strlen(buffer); ++ for (i = 3; i <= count; i++) ++ buffer[i - 3] = buffer[i]; ++ } ++ } ++ } ++ } while (checksum != xmitcsum); ++ ++ if (remote_debug) ++ printk("R:%s\n", buffer); ++} ++ ++/* send the packet in buffer. */ ++ ++void ++putpacket(char *buffer) ++{ ++ unsigned char checksum; ++ int count; ++ char ch; ++ ++ /* $#. 
*/ ++ do { ++ if (remote_debug) ++ printk("T:%s\n", buffer); ++ putDebugChar('$'); ++ checksum = 0; ++ count = 0; ++ ++ while ((ch = buffer[count])) { ++ putDebugChar(ch); ++ checksum += ch; ++ count += 1; ++ } ++ ++ putDebugChar('#'); ++ putDebugChar(hexchars[checksum >> 4]); ++ putDebugChar(hexchars[checksum % 16]); ++ ++ } while ((getDebugChar() & 0x7f) != '+'); ++ ++} ++ ++static char remcomInBuffer[BUFMAX]; ++static char remcomOutBuffer[BUFMAX]; ++static short error; ++ ++void ++debug_error(char *format, char *parm) ++{ ++ if (remote_debug) ++ printk(format, parm); ++} ++ ++static void ++print_regs(struct pt_regs *regs) ++{ ++ printk("EAX=%08lx ", regs->eax); ++ printk("EBX=%08lx ", regs->ebx); ++ printk("ECX=%08lx ", regs->ecx); ++ printk("EDX=%08lx ", regs->edx); ++ printk("\n"); ++ printk("ESI=%08lx ", regs->esi); ++ printk("EDI=%08lx ", regs->edi); ++ printk("EBP=%08lx ", regs->ebp); ++ printk("ESP=%08lx ", (long) ®s->esp); ++ printk("\n"); ++ printk(" DS=%08x ", regs->xds); ++ printk(" ES=%08x ", regs->xes); ++ printk(" SS=%08x ", __KERNEL_DS); ++ printk(" FL=%08lx ", regs->eflags); ++ printk("\n"); ++ printk(" CS=%08x ", regs->xcs); ++ printk(" IP=%08lx ", regs->eip); ++#if 0 ++ printk(" FS=%08x ", regs->fs); ++ printk(" GS=%08x ", regs->gs); ++#endif ++ printk("\n"); ++ ++} /* print_regs */ ++ ++#define NEW_esp fn_call_lookaside[trap_cpu].esp ++ ++static void ++regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) ++{ ++ gdb_regs[_EAX] = regs->eax; ++ gdb_regs[_EBX] = regs->ebx; ++ gdb_regs[_ECX] = regs->ecx; ++ gdb_regs[_EDX] = regs->edx; ++ gdb_regs[_ESI] = regs->esi; ++ gdb_regs[_EDI] = regs->edi; ++ gdb_regs[_EBP] = regs->ebp; ++ gdb_regs[_DS] = regs->xds; ++ gdb_regs[_ES] = regs->xes; ++ gdb_regs[_PS] = regs->eflags; ++ gdb_regs[_CS] = regs->xcs; ++ gdb_regs[_PC] = regs->eip; ++ /* Note, as we are a debugging the kernel, we will always ++ * trap in kernel code, this means no priviledge change, ++ * and so the pt_regs structure is not completely valid. In a non ++ * privilege change trap, only EFLAGS, CS and EIP are put on the stack, ++ * SS and ESP are not stacked, this means that the last 2 elements of ++ * pt_regs is not valid (they would normally refer to the user stack) ++ * also, using regs+1 is no good because you end up will a value that is ++ * 2 longs (8) too high. This used to cause stepping over functions ++ * to fail, so my fix is to use the address of regs->esp, which ++ * should point at the end of the stack frame. Note I have ignored ++ * completely exceptions that cause an error code to be stacked, such ++ * as double fault. Stuart Hughes, Zentropix. ++ * original code: gdb_regs[_ESP] = (int) (regs + 1) ; ++ ++ * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
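[The pt_regs caveat above is worth restating in code: on a trap with no
privilege change the CPU pushes only eflags, cs and eip, so the esp/xss
members of struct pt_regs were never written, and the pre-trap stack pointer
is the address of the esp member itself.  A sketch of the idiom the fix
relies on; the function name is illustrative:

#include <asm/ptrace.h>		/* struct pt_regs, 2.6.10 i386 layout */

/* For a kernel-mode trap the frame ends where 'esp' would have been
 * stored, so the member's address equals the value %esp had at trap time. */
static inline unsigned long kernel_trap_esp(struct pt_regs *regs)
{
	return (unsigned long)&regs->esp;	/* not regs->esp, which is junk here */
}]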
++ */ ++ gdb_regs[_ESP] = NEW_esp; ++ gdb_regs[_SS] = __KERNEL_DS; ++ gdb_regs[_FS] = 0xFFFF; ++ gdb_regs[_GS] = 0xFFFF; ++} /* regs_to_gdb_regs */ ++ ++static void ++gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) ++{ ++ regs->eax = gdb_regs[_EAX]; ++ regs->ebx = gdb_regs[_EBX]; ++ regs->ecx = gdb_regs[_ECX]; ++ regs->edx = gdb_regs[_EDX]; ++ regs->esi = gdb_regs[_ESI]; ++ regs->edi = gdb_regs[_EDI]; ++ regs->ebp = gdb_regs[_EBP]; ++ regs->xds = gdb_regs[_DS]; ++ regs->xes = gdb_regs[_ES]; ++ regs->eflags = gdb_regs[_PS]; ++ regs->xcs = gdb_regs[_CS]; ++ regs->eip = gdb_regs[_PC]; ++ NEW_esp = gdb_regs[_ESP]; /* keep the value */ ++#if 0 /* can't change these */ ++ regs->esp = gdb_regs[_ESP]; ++ regs->xss = gdb_regs[_SS]; ++ regs->fs = gdb_regs[_FS]; ++ regs->gs = gdb_regs[_GS]; ++#endif ++ ++} /* gdb_regs_to_regs */ ++ ++int thread_list = 0; ++ ++void ++get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) ++{ ++ unsigned long stack_page; ++ int count = 0; ++ IF_SMP(int i); ++ if (!p || p == current) { ++ regs_to_gdb_regs(gdb_regs, regs); ++ return; ++ } ++#ifdef CONFIG_SMP ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (p == kgdb_info.cpus_waiting[i].task) { ++ regs_to_gdb_regs(gdb_regs, ++ kgdb_info.cpus_waiting[i].regs); ++ gdb_regs[_ESP] = ++ (int) &kgdb_info.cpus_waiting[i].regs->esp; ++ ++ return; ++ } ++ } ++#endif ++ memset(gdb_regs, 0, NUMREGBYTES); ++ gdb_regs[_ESP] = p->thread.esp; ++ gdb_regs[_PC] = p->thread.eip; ++ gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; ++ gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); ++ gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); ++ ++/* ++ * This code is to give a more informative notion of where a process ++ * is waiting. It is used only when the user asks for a thread info ++ * list. If he then switches to the thread, s/he will find the task ++ * is in schedule, but a back trace should show the same info we come ++ * up with. This code was shamelessly purloined from process.c. It was ++ * then enhanced to provide more registers than simply the program ++ * counter. ++ */ ++ ++ if (!thread_list) { ++ return; ++ } ++ ++ if (p->state == TASK_RUNNING) ++ return; ++ stack_page = (unsigned long) p->thread_info; ++ if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > ++ THREAD_SIZE - sizeof(long) + stack_page) ++ return; ++ /* include/asm-i386/system.h:switch_to() pushes ebp last. */ ++ do { ++ if (gdb_regs[_EBP] < stack_page || ++ gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) ++ return; ++ gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); ++ gdb_regs[_ESP] = gdb_regs[_EBP] + 8; ++ gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; ++ if (!in_sched_functions(gdb_regs[_PC])) ++ return; ++ } while (count++ < 16); ++ return; ++} ++ ++/* Indicate to caller of mem2hex or hex2mem that there has been an ++ error. 
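[The loop above is standard frame-pointer unwinding: switch_to() pushes ebp
last, so each frame stores the caller's ebp at *ebp with the return address
just above it at ebp + 4.  One step of the walk, as a sketch with the
stack-page bounds checks stripped out (the real code validates each ebp
against the thread's stack page):

/* one iteration of the ebp-chain walk in get_gdb_regs() */
static void unwind_one_frame(unsigned long *pc, unsigned long *esp,
			     unsigned long *ebp)
{
	*pc  = *(unsigned long *)(*ebp + 4);	/* saved return address */
	*esp = *ebp + 8;			/* stack just above saved ebp + ret */
	*ebp = *(unsigned long *)*ebp;		/* follow to the caller's frame */
}]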
*/ ++static volatile int mem_err = 0; ++static volatile int mem_err_expected = 0; ++static volatile int mem_err_cnt = 0; ++static int garbage_loc = -1; ++ ++int ++get_char(char *addr) ++{ ++ return *addr; ++} ++ ++void ++set_char(char *addr, int val, int may_fault) ++{ ++ /* ++ * This code traps references to the area mapped to the kernel ++ * stack as given by the regs and, instead, stores to the ++ * fn_call_lookaside[cpu].array ++ */ ++ if (may_fault && ++ (unsigned int) addr < OLD_esp && ++ ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { ++ addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); ++ } ++ *addr = val; ++} ++ ++/* convert the memory pointed to by mem into hex, placing result in buf */ ++/* return a pointer to the last char put in buf (null) */ ++/* If MAY_FAULT is non-zero, then we should set mem_err in response to ++ a fault; if zero treat a fault like any other fault in the stub. */ ++char * ++mem2hex(char *mem, char *buf, int count, int may_fault) ++{ ++ int i; ++ unsigned char ch; ++ ++ if (may_fault) { ++ mem_err_expected = 1; ++ mem_err = 0; ++ } ++ for (i = 0; i < count; i++) { ++ /* printk("%lx = ", mem) ; */ ++ ++ ch = get_char(mem++); ++ ++ /* printk("%02x\n", ch & 0xFF) ; */ ++ if (may_fault && mem_err) { ++ if (remote_debug) ++ printk("Mem fault fetching from addr %lx\n", ++ (long) (mem - 1)); ++ *buf = 0; /* truncate buffer */ ++ return (buf); ++ } ++ *buf++ = hexchars[ch >> 4]; ++ *buf++ = hexchars[ch % 16]; ++ } ++ *buf = 0; ++ if (may_fault) ++ mem_err_expected = 0; ++ return (buf); ++} ++ ++/* convert the hex array pointed to by buf into binary to be placed in mem */ ++/* return a pointer to the character AFTER the last byte written */ ++/* NOTE: We use the may fault flag to also indicate if the write is to ++ * the registers (0) or "other" memory (!=0) ++ */ ++char * ++hex2mem(char *buf, char *mem, int count, int may_fault) ++{ ++ int i; ++ unsigned char ch; ++ ++ if (may_fault) { ++ mem_err_expected = 1; ++ mem_err = 0; ++ } ++ for (i = 0; i < count; i++) { ++ ch = hex(*buf++) << 4; ++ ch = ch + hex(*buf++); ++ set_char(mem++, ch, may_fault); ++ ++ if (may_fault && mem_err) { ++ if (remote_debug) ++ printk("Mem fault storing to addr %lx\n", ++ (long) (mem - 1)); ++ return (mem); ++ } ++ } ++ if (may_fault) ++ mem_err_expected = 0; ++ return (mem); ++} ++ ++/**********************************************/ ++/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ ++/* RETURN NUMBER OF CHARS PROCESSED */ ++/**********************************************/ ++int ++hexToInt(char **ptr, int *intValue) ++{ ++ int numChars = 0; ++ int hexValue; ++ ++ *intValue = 0; ++ ++ while (**ptr) { ++ hexValue = hex(**ptr); ++ if (hexValue >= 0) { ++ *intValue = (*intValue << 4) | hexValue; ++ numChars++; ++ } else ++ break; ++ ++ (*ptr)++; ++ } ++ ++ return (numChars); ++} ++ ++#define stubhex(h) hex(h) ++#ifdef old_thread_list ++ ++static int ++stub_unpack_int(char *buff, int fieldlength) ++{ ++ int nibble; ++ int retval = 0; ++ ++ while (fieldlength) { ++ nibble = stubhex(*buff++); ++ retval |= nibble; ++ fieldlength--; ++ if (fieldlength) ++ retval = retval << 4; ++ } ++ return retval; ++} ++#endif ++static char * ++pack_hex_byte(char *pkt, int byte) ++{ ++ *pkt++ = hexchars[(byte >> 4) & 0xf]; ++ *pkt++ = hexchars[(byte & 0xf)]; ++ return pkt; ++} ++ ++#define BUF_THREAD_ID_SIZE 16 ++ ++static char * ++pack_threadid(char *pkt, threadref * id) ++{ ++ char *limit; ++ unsigned char *altid; ++ ++ altid = (unsigned char *) id; ++ limit = pkt + 
BUF_THREAD_ID_SIZE; ++ while (pkt < limit) ++ pkt = pack_hex_byte(pkt, *altid++); ++ return pkt; ++} ++ ++#ifdef old_thread_list ++static char * ++unpack_byte(char *buf, int *value) ++{ ++ *value = stub_unpack_int(buf, 2); ++ return buf + 2; ++} ++ ++static char * ++unpack_threadid(char *inbuf, threadref * id) ++{ ++ char *altref; ++ char *limit = inbuf + BUF_THREAD_ID_SIZE; ++ int x, y; ++ ++ altref = (char *) id; ++ ++ while (inbuf < limit) { ++ x = stubhex(*inbuf++); ++ y = stubhex(*inbuf++); ++ *altref++ = (x << 4) | y; ++ } ++ return inbuf; ++} ++#endif ++void ++int_to_threadref(threadref * id, int value) ++{ ++ unsigned char *scan; ++ ++ scan = (unsigned char *) id; ++ { ++ int i = 4; ++ while (i--) ++ *scan++ = 0; ++ } ++ *scan++ = (value >> 24) & 0xff; ++ *scan++ = (value >> 16) & 0xff; ++ *scan++ = (value >> 8) & 0xff; ++ *scan++ = (value & 0xff); ++} ++int ++int_to_hex_v(unsigned char * id, int value) ++{ ++ unsigned char *start = id; ++ int shift; ++ int ch; ++ ++ for (shift = 28; shift >= 0; shift -= 4) { ++ if ((ch = (value >> shift) & 0xf) || (id != start)) { ++ *id = hexchars[ch]; ++ id++; ++ } ++ } ++ if (id == start) ++ *id++ = '0'; ++ return id - start; ++} ++#ifdef old_thread_list ++ ++static int ++threadref_to_int(threadref * ref) ++{ ++ int i, value = 0; ++ unsigned char *scan; ++ ++ scan = (char *) ref; ++ scan += 4; ++ i = 4; ++ while (i-- > 0) ++ value = (value << 8) | ((*scan++) & 0xff); ++ return value; ++} ++#endif ++static int ++cmp_str(char *s1, char *s2, int count) ++{ ++ while (count--) { ++ if (*s1++ != *s2++) ++ return 0; ++ } ++ return 1; ++} ++ ++#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ ++extern struct task_struct *kgdb_get_idle(int cpu); ++#define idle_task(cpu) kgdb_get_idle(cpu) ++#else ++#define idle_task(cpu) init_tasks[cpu] ++#endif ++ ++extern int kgdb_pid_init_done; ++ ++struct task_struct * ++getthread(int pid) ++{ ++ struct task_struct *thread; ++ if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { ++ ++ return idle_task(pid - PID_MAX); ++ } else { ++ /* ++ * find_task_by_pid is relatively safe all the time ++ * Other pid functions require lock downs which imply ++ * that we may be interrupting them (as we get here ++ * in the middle of most any lock down). ++ * Still we don't want to call until the table exists! 
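[int_to_threadref() and pack_threadid() above together turn a pid into the
fixed-width thread id gdb wants on the wire: an 8-byte value, high four bytes
zero, low four bytes the pid in big-endian order, hex-encoded to 16
characters.  A standalone sketch of that encoding with an example pid:

#include <stdio.h>

int main(void)
{
	unsigned char ref[8] = { 0 };	/* threadref, as above */
	int pid = 42;			/* example pid */
	int i;

	ref[4] = (pid >> 24) & 0xff;	/* int_to_threadref() layout */
	ref[5] = (pid >> 16) & 0xff;
	ref[6] = (pid >> 8) & 0xff;
	ref[7] = pid & 0xff;

	for (i = 0; i < 8; i++)		/* pack_threadid(): two hex chars per byte */
		printf("%02x", ref[i]);
	printf("\n");			/* pid 42 -> 000000000000002a */
	return 0;
}]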
++	 */
++	if (kgdb_pid_init_done){
++		thread = find_task_by_pid(pid);
++		if (thread) {
++			return thread;
++		}
++	}
++	}
++	return NULL;
++}
++/* *INDENT-OFF* */
++struct hw_breakpoint {
++	unsigned enabled;
++	unsigned type;
++	unsigned len;
++	unsigned addr;
++} breakinfo[4] = { {enabled:0},
++		   {enabled:0},
++		   {enabled:0},
++		   {enabled:0}};
++/* *INDENT-ON* */
++unsigned hw_breakpoint_status;
++void
++correct_hw_break(void)
++{
++	int breakno;
++	int correctit;
++	int breakbit;
++	unsigned dr7;
++
++	asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++		      :);
++	/* *INDENT-OFF* */
++	do {
++		unsigned addr0, addr1, addr2, addr3;
++		asm volatile ("movl %%db0, %0\n"
++			      "movl %%db1, %1\n"
++			      "movl %%db2, %2\n"
++			      "movl %%db3, %3\n"
++			      :"=r" (addr0), "=r"(addr1),
++			      "=r"(addr2), "=r"(addr3)
++			      :);
++	} while (0);
++	/* *INDENT-ON* */
++	correctit = 0;
++	/* all four debug registers */
++	for (breakno = 0; breakno < 4; breakno++) {
++		breakbit = 2 << (breakno << 1);
++		if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 |= breakbit;
++			dr7 &= ~(0xf0000 << (breakno << 2));
++			dr7 |= (((breakinfo[breakno].len << 2) |
++				 breakinfo[breakno].type) << 16) <<
++				(breakno << 2);
++			switch (breakno) {
++			case 0:
++				asm volatile ("movl %0, %%dr0\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 1:
++				asm volatile ("movl %0, %%dr1\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 2:
++				asm volatile ("movl %0, %%dr2\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 3:
++				asm volatile ("movl %0, %%dr3\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++			}
++		} else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 &= ~breakbit;
++			dr7 &= ~(0xf0000 << (breakno << 2));
++		}
++	}
++	if (correctit) {
++		asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++	}
++}
++
++int
++remove_hw_break(unsigned breakno)
++{
++	if (!breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 0;
++	return 0;
++}
++
++int
++set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr)
++{
++	if (breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 1;
++	breakinfo[breakno].type = type;
++	breakinfo[breakno].len = len;
++	breakinfo[breakno].addr = addr;
++	return 0;
++}
++
++#ifdef CONFIG_SMP
++static int in_kgdb_console = 0;
++
++int
++in_kgdb(struct pt_regs *regs)
++{
++	unsigned flags;
++	int cpu = smp_processor_id();
++	in_kgdb_called = 1;
++	if (!spin_is_locked(&kgdb_spinlock)) {
++		if (in_kgdb_here_log[cpu] ||	/* we are holding this cpu */
++		    in_kgdb_console) {	/* or we are doing slow i/o */
++			return 1;
++		}
++		return 0;
++	}
++
++	/* As I see it the only reason not to let all cpus spin on
++	 * the same spin_lock is to allow selected ones to proceed.
++	 * This would be a good thing, so we leave it this way.
++	 * Maybe someday....  Done!
++
++	 * in_kgdb() is called from an NMI so we don't pretend
++	 * to have any resources, like printk() for example.
++	 */
++
++	kgdb_local_irq_save(flags);	/* only local here, to avoid hanging */
++	/*
++	 * log arrival of this cpu
++	 * The NMI keeps on ticking.  Protect against recurring more
++	 * than once, and ignore the cpu that has the kgdb lock
++	 */
++	in_kgdb_entry_log[cpu]++;
++	in_kgdb_here_log[cpu] = regs;
++	if (cpu == spinlock_cpu || waiting_cpus[cpu].task) {
++		goto exit_in_kgdb;
++	}
++	/*
++	 * For protection of the initialization of the spin locks by kgdb
++	 * it locks the kgdb spinlock before it gets the wait locks set
++	 * up.  We wait here for the wait lock to be taken.
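++	 * (The lock owner trylocks every waitlock right after taking
++	 * kgdb_spinlock, so a held waitlock means entry setup is done.)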
If the ++ * kgdb lock goes away first?? Well, it could be a slow exit ++ * sequence where the wait lock is removed prior to the kgdb lock ++ * so if kgdb gets unlocked, we just exit. ++ */ ++ while (spin_is_locked(&kgdb_spinlock) && ++ !spin_is_locked(waitlocks + cpu)) ; ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ goto exit_in_kgdb; ++ } ++ waiting_cpus[cpu].task = current; ++ waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); ++ waiting_cpus[cpu].regs = regs; ++ ++ spin_unlock_wait(waitlocks + cpu); ++ /* ++ * log departure of this cpu ++ */ ++ waiting_cpus[cpu].task = 0; ++ waiting_cpus[cpu].pid = 0; ++ waiting_cpus[cpu].regs = 0; ++ correct_hw_break(); ++ exit_in_kgdb: ++ in_kgdb_here_log[cpu] = 0; ++ kgdb_local_irq_restore(flags); ++ return 1; ++ /* ++ spin_unlock(continuelocks + smp_processor_id()); ++ */ ++} ++ ++void ++smp__in_kgdb(struct pt_regs regs) ++{ ++ ack_APIC_irq(); ++ in_kgdb(®s); ++} ++#else ++int ++in_kgdb(struct pt_regs *regs) ++{ ++ return (kgdb_spinlock); ++} ++#endif ++ ++void ++printexceptioninfo(int exceptionNo, int errorcode, char *buffer) ++{ ++ unsigned dr6; ++ int i; ++ switch (exceptionNo) { ++ case 1: /* debug exception */ ++ break; ++ case 3: /* breakpoint */ ++ sprintf(buffer, "Software breakpoint"); ++ return; ++ default: ++ sprintf(buffer, "Details not available"); ++ return; ++ } ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6) ++ :); ++ if (dr6 & 0x4000) { ++ sprintf(buffer, "Single step"); ++ return; ++ } ++ for (i = 0; i < 4; ++i) { ++ if (dr6 & (1 << i)) { ++ sprintf(buffer, "Hardware breakpoint %d", i); ++ return; ++ } ++ } ++ sprintf(buffer, "Unknown trap"); ++ return; ++} ++ ++/* ++ * This function does all command procesing for interfacing to gdb. ++ * ++ * NOTE: The INT nn instruction leaves the state of the interrupt ++ * enable flag UNCHANGED. That means that when this routine ++ * is entered via a breakpoint (INT 3) instruction from code ++ * that has interrupts enabled, then interrupts will STILL BE ++ * enabled when this routine is entered. The first thing that ++ * we do here is disable interrupts so as to prevent recursive ++ * entries and bothersome serial interrupts while we are ++ * trying to run the serial port in polled mode. ++ * ++ * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so ++ * it is always necessary to do a restore_flags before returning ++ * so as to let go of that lock. ++ */ ++int ++kgdb_handle_exception(int exceptionVector, ++ int signo, int err_code, struct pt_regs *linux_regs) ++{ ++ struct task_struct *usethread = NULL; ++ struct task_struct *thread_list_start = 0, *thread = NULL; ++ int addr, length; ++ int breakno, breaktype; ++ char *ptr; ++ int newPC; ++ threadref thref; ++ int threadid; ++ int thread_min = PID_MAX + MAX_NO_CPUS; ++#ifdef old_thread_list ++ int maxthreads; ++#endif ++ int nothreads; ++ unsigned long flags; ++ int gdb_regs[NUMREGBYTES / 4]; ++ int dr6; ++ IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ ++#define NO_NMI 1 ++#define NO_SYNC 2 ++#define regs (*linux_regs) ++#define NUMREGS NUMREGBYTES/4 ++ /* ++ * If the entry is not from the kernel then return to the Linux ++ * trap handler and let it process the interrupt normally. 
++ */ ++ if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { ++ printk("ignoring non-kernel exception\n"); ++ print_regs(®s); ++ return (0); ++ } ++ ++ kgdb_local_irq_save(flags); ++ ++ /* Get kgdb spinlock */ ++ ++ KGDB_SPIN_LOCK(&kgdb_spinlock); ++ rdtscll(kgdb_info.entry_tsc); ++ /* ++ * We depend on this spinlock and the NMI watch dog to control the ++ * other cpus. They will arrive at "in_kgdb()" as a result of the ++ * NMI and will wait there for the following spin locks to be ++ * released. ++ */ ++#ifdef CONFIG_SMP ++ ++#if 0 ++ if (cpu_callout_map & ~MAX_CPU_MASK) { ++ printk("kgdb : too many cpus, possibly not mapped" ++ " in contiguous space, change MAX_NO_CPUS" ++ " in kgdb_stub and make new kernel.\n" ++ " cpu_callout_map is %lx\n", cpu_callout_map); ++ goto exit_just_unlock; ++ } ++#endif ++ if (spinlock_count == 1) { ++ int time = 0, end_time, dum = 0; ++ int i; ++ int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) ++ }; ++ if (remote_debug) { ++ printk("kgdb : cpu %d entry, syncing others\n", ++ smp_processor_id()); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ /* ++ * Use trylock as we may already hold the lock if ++ * we are holding the cpu. Net result is all ++ * locked. ++ */ ++ spin_trylock(&waitlocks[i]); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) ++ cpu_logged_in[i] = 0; ++ /* ++ * Wait for their arrival. We know the watch dog is active if ++ * in_kgdb() has ever been called, as it is always called on a ++ * watchdog tick. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; /* Note: we use the High order bits! */ ++ i = 1; ++ if (num_online_cpus() > 1) { ++ int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; ++ smp_send_nmi_allbutself(); ++ while (i < num_online_cpus() && time != end_time) { ++ int j; ++ for (j = 0; j < MAX_NO_CPUS; j++) { ++ if (waiting_cpus[j].task && ++ !cpu_logged_in[j]) { ++ i++; ++ cpu_logged_in[j] = 1; ++ if (remote_debug) { ++ printk ++ ("kgdb : cpu %d arrived at kgdb\n", ++ j); ++ } ++ break; ++ } else if (!waiting_cpus[j].task && ++ !cpu_online(j)) { ++ waiting_cpus[j].task = NOCPU; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].hold = 1; ++ break; ++ } ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ ++ int wait = 100000; ++ while (wait--) ; ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ printk ++ ("kgdb : cpu %d stall" ++ " in in_kgdb\n", ++ j); ++ i++; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].task = ++ (struct task_struct ++ *) 1; ++ } ++ } ++ } ++ ++ if (in_kgdb_entry_log[smp_processor_id()] > ++ (me_in_kgdb + 10)) { ++ break; ++ } ++ ++ rdtsc(dum, time); ++ } ++ if (i < num_online_cpus()) { ++ printk ++ ("kgdb : time out, proceeding without sync\n"); ++#if 0 ++ printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", ++ waiting_cpus[0].task != 0, ++ waiting_cpus[1].task != 0); ++ printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", ++ cpu_logged_in[0], cpu_logged_in[1]); ++ printk ++ ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", ++ in_kgdb_here_log[0] != 0, ++ in_kgdb_here_log[1] != 0); ++#endif ++ entry_state = NO_SYNC; ++ } else { ++#if 0 ++ int ent = ++ in_kgdb_entry_log[smp_processor_id()] - ++ me_in_kgdb; ++ printk("kgdb : sync after %d entries\n", ent); ++#endif ++ } ++ } else { ++ if (remote_debug) { ++ printk ++ ("kgdb : %d cpus, but watchdog not active\n" ++ "proceeding without locking down other cpus\n", ++ num_online_cpus()); ++ entry_state = NO_NMI; ++ } ++ } ++ } ++#endif ++ ++ if (remote_debug) { ++ unsigned long *lp = (unsigned long *) &linux_regs; ++ ++ 
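++		/* lp points at the stack slot holding the linux_regs
++		 * argument, so the Stk: lines below dump the raw kernel
++		 * stack at the trap, not the pt_regs contents. */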
printk("handle_exception(exceptionVector=%d, " ++ "signo=%d, err_code=%d, linux_regs=%p)\n", ++ exceptionVector, signo, err_code, linux_regs); ++ if (debug_regs) { ++ print_regs(®s); ++ printk("Stk: %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[0], lp[1], lp[2], lp[3], ++ lp[4], lp[5], lp[6], lp[7]); ++ printk(" %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[8], lp[9], lp[10], lp[11], ++ lp[12], lp[13], lp[14], lp[15]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[16], lp[17], lp[18], lp[19], ++ lp[20], lp[21], lp[22], lp[23]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[24], lp[25], lp[26], lp[27], ++ lp[28], lp[29], lp[30], lp[31]); ++ } ++ } ++ ++ /* Disable hardware debugging while we are in kgdb */ ++ /* Get the debug register status register */ ++/* *INDENT-OFF* */ ++ __asm__("movl %0,%%db7" ++ : /* no output */ ++ :"r"(0)); ++ ++ asm volatile ("movl %%db6, %0\n" ++ :"=r" (hw_breakpoint_status) ++ :); ++ ++/* *INDENT-ON* */ ++ switch (exceptionVector) { ++ case 0: /* divide error */ ++ case 1: /* debug exception */ ++ case 2: /* NMI */ ++ case 3: /* breakpoint */ ++ case 4: /* overflow */ ++ case 5: /* bounds check */ ++ case 6: /* invalid opcode */ ++ case 7: /* device not available */ ++ case 8: /* double fault (errcode) */ ++ case 10: /* invalid TSS (errcode) */ ++ case 12: /* stack fault (errcode) */ ++ case 16: /* floating point error */ ++ case 17: /* alignment check (errcode) */ ++ default: /* any undocumented */ ++ break; ++ case 11: /* segment not present (errcode) */ ++ case 13: /* general protection (errcode) */ ++ case 14: /* page fault (special errcode) */ ++ case 19: /* cache flush denied */ ++ if (mem_err_expected) { ++ /* ++ * This fault occured because of the ++ * get_char or set_char routines. These ++ * two routines use either eax of edx to ++ * indirectly reference the location in ++ * memory that they are working with. ++ * For a page fault, when we return the ++ * instruction will be retried, so we ++ * have to make sure that these ++ * registers point to valid memory. ++ */ ++ mem_err = 1; /* set mem error flag */ ++ mem_err_expected = 0; ++ mem_err_cnt++; /* helps in debugging */ ++ /* make valid address */ ++ regs.eax = (long) &garbage_loc; ++ /* make valid address */ ++ regs.edx = (long) &garbage_loc; ++ if (remote_debug) ++ printk("Return after memory error: " ++ "mem_err_cnt=%d\n", mem_err_cnt); ++ if (debug_regs) ++ print_regs(®s); ++ goto exit_kgdb; ++ } ++ break; ++ } ++ if (remote_debug) ++ printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); ++ ++ gdb_i386vector = exceptionVector; ++ gdb_i386errcode = err_code; ++ kgdb_info.called_from = __builtin_return_address(0); ++#ifdef CONFIG_SMP ++ /* ++ * OK, we can now communicate, lets tell gdb about the sync. ++ * but only if we had a problem. ++ */ ++ switch (entry_state) { ++ case NO_NMI: ++ to_gdb("NMI not active, other cpus not stopped\n"); ++ break; ++ case NO_SYNC: ++ to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); ++ default:; ++ } ++ ++#endif ++/* ++ * Set up the gdb function call area. 
++ */ ++ trap_cpu = smp_processor_id(); ++ OLD_esp = NEW_esp = (int) (&linux_regs->esp); ++ ++ IF_SMP(once_again:) ++ /* reply to host that an exception has occurred */ ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ ++ putpacket(remcomOutBuffer); ++ ++ while (1 == 1) { ++ error = 0; ++ remcomOutBuffer[0] = 0; ++ getpacket(remcomInBuffer); ++ switch (remcomInBuffer[0]) { ++ case '?': ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ break; ++ case 'd': ++ remote_debug = !(remote_debug); /* toggle debug flag */ ++ printk("Remote debug %s\n", ++ remote_debug ? "on" : "off"); ++ break; ++ case 'g': /* return the value of the CPU registers */ ++ get_gdb_regs(usethread, ®s, gdb_regs); ++ mem2hex((char *) gdb_regs, ++ remcomOutBuffer, NUMREGBYTES, 0); ++ break; ++ case 'G': /* set the value of the CPU registers - return OK */ ++ hex2mem(&remcomInBuffer[1], ++ (char *) gdb_regs, NUMREGBYTES, 0); ++ if (!usethread || usethread == current) { ++ gdb_regs_to_regs(gdb_regs, ®s); ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "E00"); ++ } ++ break; ++ ++ case 'P':{ /* set the value of a single CPU register - ++ return OK */ ++ /* ++ * For some reason, gdb wants to talk about psudo ++ * registers (greater than 15). These may have ++ * meaning for ptrace, but for us it is safe to ++ * ignor them. We do this by dumping them into ++ * _GS which we also ignor, but do have memory for. ++ */ ++ int regno; ++ ++ ptr = &remcomInBuffer[1]; ++ regs_to_gdb_regs(gdb_regs, ®s); ++ if ((!usethread || usethread == current) && ++ hexToInt(&ptr, ®no) && ++ *ptr++ == '=' && (regno >= 0)) { ++ regno = ++ (regno >= NUMREGS ? _GS : regno); ++ hex2mem(ptr, (char *) &gdb_regs[regno], ++ 4, 0); ++ gdb_regs_to_regs(gdb_regs, ®s); ++ strcpy(remcomOutBuffer, "OK"); ++ break; ++ } ++ strcpy(remcomOutBuffer, "E01"); ++ break; ++ } ++ ++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ ++ case 'm': ++ /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { ++ ptr = 0; ++ /* ++ * hex doubles the byte count ++ */ ++ if (length > (BUFMAX / 2)) ++ length = BUFMAX / 2; ++ mem2hex((char *) addr, ++ remcomOutBuffer, length, 1); ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } ++ } ++ ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E01"); ++ debug_error ++ ("malformed read memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ ++ /* MAA..AA,LLLL: ++ Write LLLL bytes at address AA.AA return OK */ ++ case 'M': ++ /* TRY TO READ '%x,%x:'. 
IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && ++ (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { ++ hex2mem(ptr, (char *) addr, length, 1); ++ ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } else { ++ strcpy(remcomOutBuffer, "OK"); ++ } ++ ++ ptr = 0; ++ } ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E02"); ++ debug_error ++ ("malformed write memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ case 'S': ++ remcomInBuffer[0] = 's'; ++ case 'C': ++ /* Csig;AA..AA where ;AA..AA is optional ++ * continue with signal ++ * Since signals are meaning less to us, delete that ++ * part and then fall into the 'c' code. ++ */ ++ ptr = &remcomInBuffer[1]; ++ length = 2; ++ while (*ptr && *ptr != ';') { ++ length++; ++ ptr++; ++ } ++ if (*ptr) { ++ do { ++ ptr++; ++ *(ptr - length++) = *ptr; ++ } while (*ptr); ++ } else { ++ remcomInBuffer[1] = 0; ++ } ++ ++ /* cAA..AA Continue at address AA..AA(optional) */ ++ /* sAA..AA Step one instruction from AA..AA(optional) */ ++ /* D detach, reply OK and then continue */ ++ case 'c': ++ case 's': ++ case 'D': ++ ++ /* try to read optional parameter, ++ pc unchanged if no parm */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr)) { ++ if (remote_debug) ++ printk("Changing EIP to 0x%x\n", addr); ++ ++ regs.eip = addr; ++ } ++ ++ newPC = regs.eip; ++ ++ /* clear the trace bit */ ++ regs.eflags &= 0xfffffeff; ++ ++ /* set the trace bit if we're stepping */ ++ if (remcomInBuffer[0] == 's') ++ regs.eflags |= 0x100; ++ ++ /* detach is a friendly version of continue. Note that ++ debugging is still enabled (e.g hit control C) ++ */ ++ if (remcomInBuffer[0] == 'D') { ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ } ++ ++ if (remote_debug) { ++ printk("Resuming execution\n"); ++ print_regs(®s); ++ } ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6) ++ :); ++ if (!(dr6 & 0x4000)) { ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno) && ++ (breakinfo[breakno].type == 0)) { ++ /* Set restore flag */ ++ regs.eflags |= 0x10000; ++ break; ++ } ++ } ++ } ++ correct_hw_break(); ++ asm volatile ("movl %0, %%db6\n"::"r" (0)); ++ goto exit_kgdb; ++ ++ /* kill the program */ ++ case 'k': /* do nothing */ ++ break; ++ ++ /* query */ ++ case 'q': ++ nothreads = 0; ++ switch (remcomInBuffer[1]) { ++ case 'f': ++ threadid = 1; ++ thread_list = 2; ++ thread_list_start = (usethread ? : current); ++ case 's': ++ if (!cmp_str(&remcomInBuffer[2], ++ "ThreadInfo", 10)) ++ break; ++ ++ remcomOutBuffer[nothreads++] = 'm'; ++ for (; threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ nothreads += int_to_hex_v( ++ &remcomOutBuffer[ ++ nothreads], ++ threadid); ++ if (thread_min > threadid) ++ thread_min = threadid; ++ remcomOutBuffer[ ++ nothreads] = ','; ++ nothreads++; ++ if (nothreads > BUFMAX - 10) ++ break; ++ } ++ } ++ if (remcomOutBuffer[nothreads - 1] == 'm') { ++ remcomOutBuffer[nothreads - 1] = 'l'; ++ } else { ++ nothreads--; ++ } ++ remcomOutBuffer[nothreads] = 0; ++ break; ++ ++#ifdef old_thread_list /* Old thread info request */ ++ case 'L': ++ /* List threads */ ++ thread_list = 2; ++ thread_list_start = (usethread ? 
: current); ++ unpack_byte(remcomInBuffer + 3, &maxthreads); ++ unpack_threadid(remcomInBuffer + 5, &thref); ++ do { ++ int buf_thread_limit = ++ (BUFMAX - 22) / BUF_THREAD_ID_SIZE; ++ if (maxthreads > buf_thread_limit) { ++ maxthreads = buf_thread_limit; ++ } ++ } while (0); ++ remcomOutBuffer[0] = 'q'; ++ remcomOutBuffer[1] = 'M'; ++ remcomOutBuffer[4] = '0'; ++ pack_threadid(remcomOutBuffer + 5, &thref); ++ ++ threadid = threadref_to_int(&thref); ++ for (nothreads = 0; ++ nothreads < maxthreads && ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ int_to_threadref(&thref, ++ threadid); ++ pack_threadid(remcomOutBuffer + ++ 21 + ++ nothreads * 16, ++ &thref); ++ nothreads++; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } ++ } ++ ++ if (threadid == PID_MAX + MAX_NO_CPUS) { ++ remcomOutBuffer[4] = '1'; ++ } ++ pack_hex_byte(remcomOutBuffer + 2, nothreads); ++ remcomOutBuffer[21 + nothreads * 16] = '\0'; ++ break; ++#endif ++ case 'C': ++ /* Current thread id */ ++ remcomOutBuffer[0] = 'Q'; ++ remcomOutBuffer[1] = 'C'; ++ threadid = current->pid; ++ if (!threadid) { ++ /* ++ * idle thread ++ */ ++ for (threadid = PID_MAX; ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ if (current == ++ idle_task(threadid - ++ PID_MAX)) ++ break; ++ } ++ } ++ int_to_threadref(&thref, threadid); ++ pack_threadid(remcomOutBuffer + 2, &thref); ++ remcomOutBuffer[18] = '\0'; ++ break; ++ ++ case 'E': ++ /* Print exception info */ ++ printexceptioninfo(exceptionVector, ++ err_code, remcomOutBuffer); ++ break; ++ case 'T':{ ++ char * nptr; ++ /* Thread extra info */ ++ if (!cmp_str(&remcomInBuffer[2], ++ "hreadExtraInfo,", 15)) { ++ break; ++ } ++ ptr = &remcomInBuffer[17]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ nptr = &thread->comm[0]; ++ length = 0; ++ ptr = &remcomOutBuffer[0]; ++ do { ++ length++; ++ ptr = pack_hex_byte(ptr, *nptr++); ++ } while (*nptr && length < 16); ++ /* ++ * would like that 16 to be the size of ++ * task_struct.comm but don't know the ++ * syntax.. ++ */ ++ *ptr = 0; ++ } ++ } ++ break; ++ ++ /* task related */ ++ case 'H': ++ switch (remcomInBuffer[1]) { ++ case 'g': ++ ptr = &remcomInBuffer[2]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (!thread) { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ break; ++ } ++ /* ++ * Just in case I forget what this is all about, ++ * the "thread info" command to gdb causes it ++ * to ask for a thread list. It then switches ++ * to each thread and asks for the registers. ++ * For this (and only this) usage, we want to ++ * fudge the registers of tasks not on the run ++ * list (i.e. waiting) to show the routine that ++ * called schedule. Also, gdb, is a minimalist ++ * in that if the current thread is the last ++ * it will not re-read the info when done. ++ * This means that in this case we must show ++ * the real registers. So here is how we do it: ++ * Each entry we keep track of the min ++ * thread in the list (the last that gdb will) ++ * get info for. We also keep track of the ++ * starting thread. ++ * "thread_list" is cleared when switching back ++ * to the min thread if it is was current, or ++ * if it was not current, thread_list is set ++ * to 1. When the switch to current comes, ++ * if thread_list is 1, clear it, else do ++ * nothing. 
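++			 * Net effect: every thread gdb reads except the
++			 * last shows its scheduled-out context, and the
++			 * last one (normally "current") reports the live
++			 * trap registers.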
++ */ ++ usethread = thread; ++ if ((thread_list == 1) && ++ (thread == thread_list_start)) { ++ thread_list = 0; ++ } ++ if (thread_list && (threadid == thread_min)) { ++ if (thread == thread_list_start) { ++ thread_list = 0; ++ } else { ++ thread_list = 1; ++ } ++ } ++ /* follow through */ ++ case 'c': ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ break; ++ } ++ break; ++ ++ /* Query thread status */ ++ case 'T': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (thread) { ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } else { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ } ++ break; ++ ++ case 'Y': /* set up a hardware breakpoint */ ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ ptr++; ++ hexToInt(&ptr, &breaktype); ++ ptr++; ++ hexToInt(&ptr, &length); ++ ptr++; ++ hexToInt(&ptr, &addr); ++ if (set_hw_break(breakno & 0x3, ++ breaktype & 0x3, ++ length & 0x3, addr) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ /* Remove hardware breakpoint */ ++ case 'y': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ if (remove_hw_break(breakno & 0x3) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ case 'r': /* reboot */ ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ /*to_gdb("Rebooting\n"); */ ++ /* triplefault no return from here */ ++ { ++ static long no_idt[2]; ++ __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); ++ BREAKPOINT; ++ } ++ ++ } /* switch */ ++ ++ /* reply to the request */ ++ putpacket(remcomOutBuffer); ++ } /* while(1==1) */ ++ /* ++ * reached by goto only. ++ */ ++ exit_kgdb: ++ /* ++ * Here is where we set up to trap a gdb function call. NEW_esp ++ * will be changed if we are trying to do this. We handle both ++ * adding and subtracting, thus allowing gdb to put grung on ++ * the stack which it removes later. ++ */ ++ if (NEW_esp != OLD_esp) { ++ int *ptr = END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) ++ ptr -= (OLD_esp - NEW_esp) / sizeof (int); ++ *--ptr = linux_regs->eflags; ++ *--ptr = linux_regs->xcs; ++ *--ptr = linux_regs->eip; ++ *--ptr = linux_regs->ecx; ++ *--ptr = linux_regs->ebx; ++ *--ptr = linux_regs->eax; ++ linux_regs->ecx = NEW_esp - (sizeof (int) * 6); ++ linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) { ++ linux_regs->eip = (unsigned int) fn_call_stub; ++ } else { ++ linux_regs->eip = (unsigned int) fn_rtn_stub; ++ linux_regs->eax = NEW_esp; ++ } ++ linux_regs->eflags &= ~(IF_BIT | TF_BIT); ++ } ++#ifdef CONFIG_SMP ++ /* ++ * Release gdb wait locks ++ * Sanity check time. Must have at least one cpu to run. Also single ++ * step must not be done if the current cpu is on hold. ++ */ ++ if (spinlock_count == 1) { ++ int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; ++ int cpu_avail = 0; ++ int i; ++ ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (!cpu_online(i)) ++ break; ++ if (!hold_cpu(i)) { ++ cpu_avail = 1; ++ } ++ } ++ /* ++ * Early in the bring up there will be NO cpus on line... 
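++		 * (hence the cpus_empty() test below; with nothing online
++		 * yet there is nobody to unblock and no reason to complain)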
++		 */
++		if (!cpu_avail && !cpus_empty(cpu_online_map)) {
++			to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n");
++			goto once_again;
++		}
++		if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) {
++			to_gdb
++			    ("Current cpu must be unblocked to single step\n");
++			goto once_again;
++		}
++		if (!(ss_hold)) {
++			int i;
++			for (i = 0; i < MAX_NO_CPUS; i++) {
++				if (!hold_cpu(i)) {
++					spin_unlock(&waitlocks[i]);
++				}
++			}
++		} else {
++			spin_unlock(&waitlocks[smp_processor_id()]);
++		}
++		/* Release kgdb spinlock */
++		KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++		/*
++		 * If this cpu is on hold, this is where we
++		 * do it.  Note, the NMI will pull us out of here,
++		 * but will return as the above lock is not held.
++		 * We will stay here till another cpu releases the lock for us.
++		 */
++		spin_unlock_wait(waitlocks + smp_processor_id());
++		kgdb_local_irq_restore(flags);
++		return (0);
++	}
++#if 0
++exit_just_unlock:
++#endif
++#endif
++	/* Release kgdb spinlock */
++	KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++	kgdb_local_irq_restore(flags);
++	return (0);
++}
++
++/* This function is used to set up exception handlers for tracing and
++ * breakpoints.
++ * It is not strictly needed: the static initializer of linux_debug_hook
++ * at the end of this file already does everything required.  We keep it
++ * for backward compatibility...
++ */
++void
++set_debug_traps(void)
++{
++	/*
++	 * linux_debug_hook is defined in traps.c.  We store a pointer
++	 * to our own exception handler into it.
++
++	 * But really folks, ever hear of labeled common, an old Fortran
++	 * concept?  Lots of folks can reference it and it is defined if
++	 * anyone does.  Only one can initialize it at link time.  We do
++	 * this with the hook.  See the statement above.  No need for any
++	 * executable code and it is ready as soon as the kernel is
++	 * loaded.  Very desirable in kernel debugging.
++
++	 linux_debug_hook = handle_exception ;
++	 */
++
++	/* In case GDB is started before us, ack any packets (presumably
++	   "$?#xx") sitting there.
++	 putDebugChar ('+');
++
++	 initialized = 1;
++	 */
++}
++
++/* This function will generate a breakpoint exception.  It is used at the
++   beginning of a program to sync up with a debugger and can be used
++   otherwise as a quick means to stop program execution and "break" into
++   the debugger. */
++/* But really, just use the BREAKPOINT macro.  We will handle the int stuff
++ */
++
++#ifdef later
++/*
++ * possibly we should not go thru the traps.c code at all?  Someday.
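++ * (that would mean hooking the IDT entries directly instead of
++ * riding the traps.c handlers.)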
++ */ ++void ++do_kgdb_int3(struct pt_regs *regs, long error_code) ++{ ++ kgdb_handle_exception(3, 5, error_code, regs); ++ return; ++} ++#endif ++#undef regs ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++asmlinkage void ++bad_sys_call_exit(int stuff) ++{ ++ struct pt_regs *regs = (struct pt_regs *) &stuff; ++ printk("Sys call %d return with %x preempt_count\n", ++ (int) regs->orig_eax, preempt_count()); ++} ++#endif ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#include ++asmlinkage void ++stack_overflow(void) ++{ ++#ifdef BREAKPOINT ++ BREAKPOINT; ++#else ++ printk("Kernel stack overflow, looping forever\n"); ++#endif ++ while (1) { ++ } ++} ++#endif ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) ++char gdbconbuf[BUFMAX]; ++ ++static void ++kgdb_gdb_message(const char *s, unsigned count) ++{ ++ int i; ++ int wcount; ++ char *bufptr; ++ /* ++ * This takes care of NMI while spining out chars to gdb ++ */ ++ IF_SMP(in_kgdb_console = 1); ++ gdbconbuf[0] = 'O'; ++ bufptr = gdbconbuf + 1; ++ while (count > 0) { ++ if ((count << 1) > (BUFMAX - 2)) { ++ wcount = (BUFMAX - 2) >> 1; ++ } else { ++ wcount = count; ++ } ++ count -= wcount; ++ for (i = 0; i < wcount; i++) { ++ bufptr = pack_hex_byte(bufptr, s[i]); ++ } ++ *bufptr = '\0'; ++ s += wcount; ++ ++ putpacket(gdbconbuf); ++ ++ } ++ IF_SMP(in_kgdb_console = 0); ++} ++#endif ++#ifdef CONFIG_SMP ++static void ++to_gdb(const char *s) ++{ ++ int count = 0; ++ while (s[count] && (count++ < BUFMAX)) ; ++ kgdb_gdb_message(s, count); ++} ++#endif ++#ifdef CONFIG_KGDB_CONSOLE ++#include ++#include ++#include ++#include ++#include ++ ++void ++kgdb_console_write(struct console *co, const char *s, unsigned count) ++{ ++ ++ if (gdb_i386vector == -1) { ++ /* ++ * We have not yet talked to gdb. What to do... ++ * lets break, on continue we can do the write. ++ * But first tell him whats up. Uh, well no can do, ++ * as this IS the console. Oh well... ++ * We do need to wait or the messages will be lost. ++ * Other option would be to tell the above code to ++ * ignore this breakpoint and do an auto return, ++ * but that might confuse gdb. Also this happens ++ * early enough in boot up that we don't have the traps ++ * set up yet, so... ++ */ ++ breakpoint(); ++ } ++ kgdb_gdb_message(s, count); ++} ++ ++/* ++ * ------------------------------------------------------------ ++ * Serial KGDB driver ++ * ------------------------------------------------------------ ++ */ ++ ++static struct console kgdbcons = { ++ name:"kgdb", ++ write:kgdb_console_write, ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ device:kgdb_console_device, ++#endif ++ flags:CON_PRINTBUFFER | CON_ENABLED, ++ index:-1, ++}; ++ ++/* ++ * The trick here is that this file gets linked before printk.o ++ * That means we get to peer at the console info in the command ++ * line before it does. If we are up, we register, otherwise, ++ * do nothing. By returning 0, we allow printk to look also. ++ */ ++static int kgdb_console_enabled; ++ ++int __init ++kgdb_console_init(char *str) ++{ ++ if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { ++ register_console(&kgdbcons); ++ kgdb_console_enabled = 1; ++ } ++ return 0; /* let others look at the string */ ++} ++ ++__setup("console=", kgdb_console_init); ++ ++#ifdef CONFIG_KGDB_USER_CONSOLE ++static kdev_t kgdb_console_device(struct console *c); ++/* This stuff sort of works, but it knocks out telnet devices ++ * we are leaving it here in case we (or you) find time to figure it out ++ * better.. 
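++ * (kgdb_console_finit() below unregisters whatever chrdev owns
++ * TTYAUX_MAJOR and takes it over for kgdb, which is what knocks out
++ * the other users of that major.)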
++ */ ++ ++/* ++ * We need a real char device as well for when the console is opened for user ++ * space activities. ++ */ ++ ++static int ++kgdb_consdev_open(struct inode *inode, struct file *file) ++{ ++ return 0; ++} ++ ++static ssize_t ++kgdb_consdev_write(struct file *file, const char *buf, ++ size_t count, loff_t * ppos) ++{ ++ int size, ret = 0; ++ static char kbuf[128]; ++ static DECLARE_MUTEX(sem); ++ ++ /* We are not reentrant... */ ++ if (down_interruptible(&sem)) ++ return -ERESTARTSYS; ++ ++ while (count > 0) { ++ /* need to copy the data from user space */ ++ size = count; ++ if (size > sizeof (kbuf)) ++ size = sizeof (kbuf); ++ if (copy_from_user(kbuf, buf, size)) { ++ ret = -EFAULT; ++ break;; ++ } ++ kgdb_console_write(&kgdbcons, kbuf, size); ++ count -= size; ++ ret += size; ++ buf += size; ++ } ++ ++ up(&sem); ++ ++ return ret; ++} ++ ++struct file_operations kgdb_consdev_fops = { ++ open:kgdb_consdev_open, ++ write:kgdb_consdev_write ++}; ++static kdev_t ++kgdb_console_device(struct console *c) ++{ ++ return MKDEV(TTYAUX_MAJOR, 1); ++} ++ ++/* ++ * This routine gets called from the serial stub in the i386/lib ++ * This is so it is done late in bring up (just before the console open). ++ */ ++void ++kgdb_console_finit(void) ++{ ++ if (kgdb_console_enabled) { ++ char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); ++ char *cp = cptr; ++ while (*cptr && *cptr != '(') ++ cptr++; ++ *cptr = 0; ++ unregister_chrdev(TTYAUX_MAJOR, cp); ++ register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); ++ } ++} ++#endif ++#endif ++#ifdef CONFIG_KGDB_TS ++#include /* time stamp code */ ++#include /* in_interrupt */ ++#ifdef CONFIG_KGDB_TS_64 ++#define DATA_POINTS 64 ++#endif ++#ifdef CONFIG_KGDB_TS_128 ++#define DATA_POINTS 128 ++#endif ++#ifdef CONFIG_KGDB_TS_256 ++#define DATA_POINTS 256 ++#endif ++#ifdef CONFIG_KGDB_TS_512 ++#define DATA_POINTS 512 ++#endif ++#ifdef CONFIG_KGDB_TS_1024 ++#define DATA_POINTS 1024 ++#endif ++#ifndef DATA_POINTS ++#define DATA_POINTS 128 /* must be a power of two */ ++#endif ++#define INDEX_MASK (DATA_POINTS - 1) ++#if (INDEX_MASK & DATA_POINTS) ++#error "CONFIG_KGDB_TS_COUNT must be a power of 2" ++#endif ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ int data0; ++ int data1; ++}; ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; ++ ++struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; ++int kgdb_and_then_count; ++ ++void ++kgdb_tstamp(int line, char *source, int data0, int data1) ++{ ++ static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; ++ int flags; ++ kgdb_local_irq_save(flags); ++ spin_lock(&ts_spin); ++ rdtscll(kgdb_and_then->at_time); ++#ifdef CONFIG_SMP ++ kgdb_and_then->on_cpu = smp_processor_id(); ++#endif ++ kgdb_and_then->task = current; ++ kgdb_and_then->from_ln = line; ++ kgdb_and_then->in_src = source; ++ kgdb_and_then->from = __builtin_return_address(0); ++ kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | ++ (preempt_count() << 8)); ++ kgdb_and_then->data0 = data0; ++ kgdb_and_then->data1 = data1; ++ kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; ++ spin_unlock(&ts_spin); ++ kgdb_local_irq_restore(flags); 
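++	/*
++	 * Sketch of a typical instrumentation site (the caller, not this
++	 * file, chooses what to log):
++	 *
++	 *	kgdb_tstamp(__LINE__, __FILE__, smp_processor_id(), 0);
++	 *
++	 * The ring keeps the last DATA_POINTS events; walk kgdb_data[]
++	 * and kgdb_and_then_count from gdb at any breakpoint to read it.
++	 */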
++#ifdef CONFIG_PREEMPT ++ ++#endif ++ return; ++} ++#endif ++typedef int gdb_debug_hook(int exceptionVector, ++ int signo, int err_code, struct pt_regs *linux_regs); ++gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ +Index: linux-2.6.10/arch/i386/kernel/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/Makefile 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/Makefile 2005-04-05 12:48:05.254618256 +0800 +@@ -14,6 +14,7 @@ + obj-$(CONFIG_ACPI_BOOT) += acpi/ + obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o + obj-$(CONFIG_MCA) += mca.o ++obj-$(CONFIG_KGDB) += kgdb_stub.o + obj-$(CONFIG_X86_MSR) += msr.o + obj-$(CONFIG_X86_CPUID) += cpuid.o + obj-$(CONFIG_MICROCODE) += microcode.o +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-03-31 16:20:11.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 12:48:05.218623728 +0800 +@@ -466,7 +466,17 @@ + { + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + } +- ++#ifdef CONFIG_KGDB ++/* ++ * By using the NMI code instead of a vector we just sneak thru the ++ * word generator coming out with just what we want. AND it does ++ * not matter if clustered_apic_mode is set or not. ++ */ ++void smp_send_nmi_allbutself(void) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++#endif + /* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing +Index: linux-2.6.10/arch/i386/Kconfig.kgdb +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.kgdb 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.kgdb 2005-04-05 12:48:05.205625704 +0800 +@@ -0,0 +1,175 @@ ++config KGDB ++ bool "Include kgdb kernel debugger" ++ depends on DEBUG_KERNEL && !KPROBES ++ help ++ If you say Y here, the system will be compiled with the debug ++ option (-g) and a debugging stub will be included in the ++ kernel. This stub communicates with gdb on another (host) ++ computer via a serial port. The host computer should have ++ access to the kernel binary file (vmlinux) and a serial port ++ that is connected to the target machine. Gdb can be made to ++ configure the serial port or you can use stty and setserial to ++ do this. See the 'target' command in gdb. This option also ++ configures in the ability to request a breakpoint early in the ++ boot process. To request the breakpoint just include 'kgdb' ++ as a boot option when booting the target machine. The system ++ will then break as soon as it looks at the boot options. This ++ option also installs a breakpoint in panic and sends any ++ kernel faults to the debugger. For more information see the ++ Documentation/i386/kgdb/kgdb.txt file. ++ ++choice ++ depends on KGDB ++ prompt "Debug serial port BAUD" ++ default KGDB_115200BAUD ++ help ++ Gdb and the kernel stub need to agree on the baud rate to be ++ used. Some systems (x86 family at this writing) allow this to ++ be configured. 
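++	  The rate chosen here must match the one gdb uses on the host,
++	  e.g. "set remotebaud 115200" given before the "target remote"
++	  command.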
++
++config KGDB_9600BAUD
++	bool "9600"
++
++config KGDB_19200BAUD
++	bool "19200"
++
++config KGDB_38400BAUD
++	bool "38400"
++
++config KGDB_57600BAUD
++	bool "57600"
++
++config KGDB_115200BAUD
++	bool "115200"
++endchoice
++
++config KGDB_PORT
++	hex "hex I/O port address of the debug serial port"
++	depends on KGDB
++	default 3f8
++	help
++	  Some systems (x86 family at this writing) allow the port
++	  address to be configured.  The number entered is assumed to be
++	  hex, don't put 0x in front of it.  The standard addresses are:
++	  COM1 at 3f8, irq 4, and COM2 at 2f8, irq 3.  setserial /dev/ttySx
++	  will tell you what you have.  It is good to test the serial
++	  connection with a live system before trying to debug.
++
++config KGDB_IRQ
++	int "IRQ of the debug serial port"
++	depends on KGDB
++	default 4
++	help
++	  This is the irq for the debug port.  If everything is working
++	  correctly and the kernel has interrupts on, a control-C sent to
++	  the port should cause a break into the kernel debug stub.
++
++config DEBUG_INFO
++	bool
++	depends on KGDB
++	default y
++
++config KGDB_MORE
++	bool "Add any additional compile options"
++	depends on KGDB
++	default n
++	help
++	  Saying yes here turns on the ability to enter additional
++	  compile options.
++
++config KGDB_OPTIONS
++	depends on KGDB_MORE
++	string "Additional compile arguments"
++	default "-O1"
++	help
++	  This option allows you to enter additional compile options for
++	  the whole kernel compile.  Each platform will have a default
++	  that seems right for it.  For example on PPC "-ggdb -O1", and
++	  for i386 "-O1".  Note that by configuring KGDB "-g" is already
++	  turned on.  In addition, on i386 platforms
++	  "-fomit-frame-pointer" is deleted from the standard compile
++	  options.
++
++config NO_KGDB_CPUS
++	int "Number of CPUs"
++	depends on KGDB && SMP
++	default NR_CPUS
++	help
++	  This option sets the number of cpus for kgdb ONLY.  It is used
++	  to prune some internal structures so they look "nice" when
++	  displayed with gdb.  This is to overcome possibly larger
++	  numbers that may have been entered above.  Enter the real
++	  number to get nice clean kgdb_info displays.
++
++config KGDB_TS
++	bool "Enable kgdb time stamp macros?"
++	depends on KGDB
++	default n
++	help
++	  Kgdb event macros allow you to instrument your code with calls
++	  to the kgdb event recording function.  The event log may be
++	  examined with gdb at a break point.  Turning on this
++	  capability also allows you to choose how many events to
++	  keep.  Kgdb always keeps the latest events.
++
++choice
++	depends on KGDB_TS
++	prompt "Max number of time stamps to save?"
++	default KGDB_TS_128
++
++config KGDB_TS_64
++	bool "64"
++
++config KGDB_TS_128
++	bool "128"
++
++config KGDB_TS_256
++	bool "256"
++
++config KGDB_TS_512
++	bool "512"
++
++config KGDB_TS_1024
++	bool "1024"
++
++endchoice
++
++config STACK_OVERFLOW_TEST
++	bool "Turn on kernel stack overflow testing?"
++	depends on KGDB
++	default n
++	help
++	  This option enables code in the front line interrupt handlers
++	  to check for kernel stack overflow on interrupts and system
++	  calls.  This is part of the kgdb code on x86 systems.
++
++config KGDB_CONSOLE
++	bool "Enable serial console thru kgdb port"
++	depends on KGDB
++	default n
++	help
++	  This option enables the command line "console=kgdb" option.
++	  When the system is booted with this option in the command line
++	  all kernel printk output is sent to gdb (as well as to other
++	  consoles).  For this to work gdb must be connected.
For this ++ reason, this command line option will generate a breakpoint if ++ gdb has not yet connected. After the gdb continue command is ++ given all pent up console output will be printed by gdb on the ++ host machine. Neither this option, nor KGDB require the ++ serial driver to be configured. ++ ++config KGDB_SYSRQ ++ bool "Turn on SysRq 'G' command to do a break?" ++ depends on KGDB ++ default y ++ help ++ This option includes an option in the SysRq code that allows ++ you to enter SysRq G which generates a breakpoint to the KGDB ++ stub. This will work if the keyboard is alive and can ++ interrupt the system. Because of constraints on when the ++ serial port interrupt can be enabled, this code may allow you ++ to interrupt the system before the serial port control C is ++ available. Just say yes here. ++ +Index: linux-2.6.10/arch/i386/mm/fault.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/fault.c 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/arch/i386/mm/fault.c 2005-04-05 12:48:05.196627072 +0800 +@@ -430,6 +430,12 @@ + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ ++#ifdef CONFIG_KGDB ++ if (!user_mode(regs)){ ++ kgdb_handle_exception(14,SIGBUS, error_code, regs); ++ return; ++ } ++#endif + + bust_spinlocks(1); + +Index: linux-2.6.10/arch/i386/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig 2005-04-05 12:48:03.417897480 +0800 ++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:48:05.257617800 +0800 +@@ -1196,6 +1196,14 @@ + + source "fs/Kconfig.binfmt" + ++config TRAP_BAD_SYSCALL_EXITS ++ bool "Debug bad system call exits" ++ depends on KGDB ++ help ++ If you say Y here the kernel will check for system calls which ++ return without clearing preempt. ++ default n ++ + endmenu + + source "drivers/Kconfig" +Index: linux-2.6.10/arch/i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/Makefile 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/arch/i386/Makefile 2005-04-05 12:48:05.255618104 +0800 +@@ -99,6 +99,9 @@ + # default subarch .h files + mflags-y += -Iinclude/asm-i386/mach-default + ++mflags-$(CONFIG_KGDB) += -gdwarf-2 ++mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') ++ + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o + + libs-y += arch/i386/lib/ +Index: linux-2.6.10/arch/x86_64/boot/compressed/head.S +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/boot/compressed/head.S 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/boot/compressed/head.S 2005-04-05 12:48:05.258617648 +0800 +@@ -26,6 +26,7 @@ + .code32 + .text + ++#define IN_BOOTLOADER + #include + #include + +Index: linux-2.6.10/arch/x86_64/boot/compressed/misc.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/boot/compressed/misc.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/boot/compressed/misc.c 2005-04-05 12:48:05.259617496 +0800 +@@ -9,6 +9,7 @@ + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 
1996 + */ + ++#define IN_BOOTLOADER + #include "miscsetup.h" + #include + +Index: linux-2.6.10/MAINTAINERS +=================================================================== +--- linux-2.6.10.orig/MAINTAINERS 2005-03-31 15:35:24.000000000 +0800 ++++ linux-2.6.10/MAINTAINERS 2005-04-05 12:48:05.181629352 +0800 +@@ -1245,6 +1245,12 @@ + W: http://developer.osdl.org/rddunlap/kj-patches/ + S: Maintained + ++KGDB FOR I386 PLATFORM ++P: George Anzinger ++M: george@mvista.com ++L: linux-net@vger.kernel.org ++S: Supported ++ + KERNEL NFSD + P: Neil Brown + M: neilb@cse.unsw.edu.au +Index: linux-2.6.10/drivers/char/sysrq.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/sysrq.c 2005-03-31 15:57:20.000000000 +0800 ++++ linux-2.6.10/drivers/char/sysrq.c 2005-04-05 12:48:05.191627832 +0800 +@@ -35,6 +35,25 @@ + #include + + #include ++#ifdef CONFIG_KGDB_SYSRQ ++ ++#define GDB_OP &kgdb_op ++static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) ++{ ++ printk("kgdb sysrq\n"); ++ breakpoint(); ++} ++ ++static struct sysrq_key_op kgdb_op = { ++ .handler = kgdb_sysrq, ++ .help_msg = "kGdb|Fgdb", ++ .action_msg = "Debug breakpoint\n", ++}; ++ ++#else ++#define GDB_OP NULL ++#endif ++ + + extern void reset_vc(unsigned int); + +@@ -249,7 +268,7 @@ + /* d */ NULL, + /* e */ &sysrq_term_op, + /* f */ NULL, +-/* g */ NULL, ++/* g */ GDB_OP, + /* h */ NULL, + /* i */ &sysrq_kill_op, + /* j */ NULL, +Index: linux-2.6.10/drivers/char/keyboard.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/keyboard.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/drivers/char/keyboard.c 2005-04-05 12:48:05.190627984 +0800 +@@ -1078,6 +1078,9 @@ + } + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); ++#ifdef CONFIG_KGDB_SYSRQ ++ sysrq_down = 0; /* in case we miss the "up" event */ ++#endif + return; + } + #endif +Index: linux-2.6.10/drivers/serial/serial_core.c +=================================================================== +--- linux-2.6.10.orig/drivers/serial/serial_core.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/drivers/serial/serial_core.c 2005-04-05 12:48:05.188628288 +0800 +@@ -1924,6 +1924,15 @@ + { + unsigned int flags; + ++#ifdef CONFIG_KGDB ++ { ++ extern int kgdb_irq; ++ ++ if (port->irq == kgdb_irq) ++ return; ++ } ++#endif ++ + /* + * If there isn't a port here, don't do anything further. + */ +Index: linux-2.6.10/drivers/serial/8250.c +=================================================================== +--- linux-2.6.10.orig/drivers/serial/8250.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/drivers/serial/8250.c 2005-04-05 12:48:05.185628744 +0800 +@@ -1350,12 +1350,21 @@ + spin_unlock_irqrestore(&up->port.lock, flags); + } + ++#ifdef CONFIG_KGDB ++int kgdb_irq = -1; ++#endif ++ + static int serial8250_startup(struct uart_port *port) + { + struct uart_8250_port *up = (struct uart_8250_port *)port; + unsigned long flags; + int retval; + ++#ifdef CONFIG_KGDB ++ if (up->port.irq == kgdb_irq) ++ return -EBUSY; ++#endif ++ + up->capabilities = uart_config[up->port.type].flags; + up->mcr = 0; + +@@ -2438,6 +2447,33 @@ + } + EXPORT_SYMBOL(serial8250_unregister_port); + ++#ifdef CONFIG_KGDB ++#include ++ ++/* ++ * Find all the ports using the given irq and shut them down. ++ * Result should be that the irq will be released. 
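++ * The kgdb_irq test added to serial8250_startup() above then keeps
++ * the driver from re-claiming the port while the polled stub owns it.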
++ */ ++void shutdown_for_kgdb(struct async_struct * info) ++{ ++ int irq = info->state->irq; ++ struct uart_8250_port *up; ++ int ttyS; ++ ++ kgdb_irq = irq; /* save for later init */ ++ for (ttyS = 0; ttyS < UART_NR; ttyS++){ ++ up = &serial8250_ports[ttyS]; ++ if (up->port.irq == irq && (irq_lists + irq)->head) { ++#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */ ++ if(up->port.lock.magic != SPINLOCK_MAGIC) ++ spin_lock_init(&up->port.lock); ++#endif ++ serial8250_shutdown(&up->port); ++ } ++ } ++} ++#endif /* CONFIG_KGDB */ ++ + static int __init serial8250_init(void) + { + int ret, i; diff --git a/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch b/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch new file mode 100644 index 0000000..cf91437 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch @@ -0,0 +1,10703 @@ + + +The complete set of citi nfsv4 patches combined into one patch. + +Changes since 2.6.10-rc3-CITI_NFS4_ALL-3 + * minor adjustments to xdr buffer length calculations in fs/nfs4xdr.c + * client acl revisions: pass acls in page array of xdr bufs, removing + arbitrary length restrictions. Temporarily disable acl caching. + +Index: linux-2.6.10/include/linux/nfsd/state.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/state.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/state.h 2005-04-05 14:49:13.465682224 +0800 +@@ -67,6 +67,45 @@ + #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + ++/* Delegation recall states */ ++#define NFS4_NO_RECALL 0x000 ++#define NFS4_RECALL_IN_PROGRESS 0x001 ++#define NFS4_RECALL_COMPLETE 0x002 ++ ++ ++/* Delegation flags */ ++#define NFS4_DELAY_CLOSE 0x001 ++ ++struct nfs4_cb_recall { ++ u32 cbr_ident; ++ int cbr_trunc; ++ stateid_t cbr_stateid; ++ u32 cbr_fhlen; ++ u32 cbr_fhval[NFS4_FHSIZE]; ++ struct nfs4_delegation *cbr_dp; ++}; ++ ++struct nfs4_delegation { ++ struct list_head dl_del_perfile; /* nfs4_file->fi_del_perfile */ ++ struct list_head dl_del_perclnt; /* nfs4_client->cl_del_perclnt*/ ++ struct list_head dl_recall_lru; /* delegation recalled */ ++ atomic_t dl_recall_cnt; /* resend cb_recall only once */ ++ atomic_t dl_count; /* ref count */ ++ atomic_t dl_state; /* recall state */ ++ struct nfs4_client *dl_client; ++ struct nfs4_file *dl_file; ++ struct file_lock *dl_flock; ++ struct nfs4_stateid *dl_stp; ++ u32 dl_flags; ++ u32 dl_type; ++ time_t dl_time; ++ struct nfs4_cb_recall dl_recall; ++}; ++ ++#define dl_stateid dl_recall.cbr_stateid ++#define dl_fhlen dl_recall.cbr_fhlen ++#define dl_fhval dl_recall.cbr_fhval ++ + /* client delegation callback info */ + struct nfs4_callback { + /* SETCLIENTID info */ +@@ -75,9 +114,8 @@ + unsigned short cb_port; + u32 cb_prog; + u32 cb_ident; +- struct xdr_netobj cb_netid; + /* RPC client info */ +- u32 cb_set; /* successful CB_NULL call */ ++ atomic_t cb_set; /* successful CB_NULL call */ + struct rpc_program cb_program; + struct rpc_stat cb_stat; + struct rpc_clnt * cb_client; +@@ -97,6 +135,7 @@ + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_perclient; /* list: stateowners */ ++ struct list_head cl_del_perclnt; /* list: delegations */ + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + 
nfs4_verifier cl_verifier; /* generated by client */ +@@ -106,7 +145,8 @@ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + struct nfs4_callback cl_callback; /* callback info */ +- time_t cl_first_state; /* first state aquisition*/ ++ atomic_t cl_count; /* ref count */ ++ u32 cl_firststate; /* recovery file creation */ + }; + + /* struct nfs4_client_reset +@@ -117,8 +157,6 @@ + struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + struct xdr_netobj cr_name; /* id generated by client */ +- time_t cr_first_state; /* first state aquisition */ +- u32 cr_expired; /* boolean: lease expired? */ + }; + + static inline void +@@ -194,6 +232,7 @@ + struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_perfile; /* list: nfs4_stateid */ ++ struct list_head fi_del_perfile; /* list: nfs4_delegation */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ +@@ -231,8 +270,10 @@ + #define CONFIRM 0x00000002 + #define OPEN_STATE 0x00000004 + #define LOCK_STATE 0x00000008 +-#define RDWR_STATE 0x00000010 +-#define CLOSE_STATE 0x00000020 ++#define RD_STATE 0x00000010 ++#define WR_STATE 0x00000020 ++#define CLOSE_STATE 0x00000040 ++#define DELEG_RET 0x00000080 + + #define seqid_mutating_err(err) \ + (((err) != nfserr_stale_clientid) && \ +@@ -243,14 +284,24 @@ + extern time_t nfs4_laundromat(void); + extern int nfsd4_renew(clientid_t *clid); + extern int nfs4_preprocess_stateid_op(struct svc_fh *current_fh, +- stateid_t *stateid, int flags, struct nfs4_stateid **stpp); ++ stateid_t *stateid, int flags, struct file **filp); + extern int nfs4_share_conflict(struct svc_fh *current_fh, + unsigned int deny_type); + extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); + extern int nfs4_check_open_reclaim(clientid_t *clid); ++extern void put_nfs4_client(struct nfs4_client *clp); + extern void nfs4_free_stateowner(struct kref *kref); ++extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern int nfsd4_cb_recall(struct nfs4_delegation *dp); ++extern int nfsd4_create_clid_file(struct nfs4_client *clp); ++extern void nfsd4_remove_clid_file(struct nfs4_client *clp); ++extern int nfsd4_list_rec_dir(int clear); ++extern void nfsd4_init_rec_dir(char *rec_dirname); ++extern void nfsd4_shutdown_rec_dir(void); ++extern int nfs4_client_to_reclaim(char *name, int namlen); ++ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +Index: linux-2.6.10/include/linux/nfsd/nfsd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/nfsd.h 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/nfsd.h 2005-04-05 14:49:13.464682376 +0800 +@@ -98,8 +98,12 @@ + void nfsd_close(struct file *); + int nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *,int, unsigned long *); ++int nfsd_vfs_read(struct svc_rqst *, struct svc_fh *, struct file *, ++ loff_t, struct kvec *, int, unsigned long *); + int nfsd_write(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *,int, unsigned long, int *); ++int nfsd_vfs_write(struct svc_rqst *, struct svc_fh *,struct file *, ++ loff_t, struct kvec *,int, unsigned long, int *); + int nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); + int nfsd_symlink(struct svc_rqst *, struct svc_fh *, +Index: linux-2.6.10/include/linux/nfsd/xdr4.h 
+=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/xdr4.h 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/xdr4.h 2005-04-05 14:49:13.466682072 +0800 +@@ -44,16 +44,6 @@ + #define NFSD4_MAX_TAGLEN 128 + #define XDR_LEN(n) (((n) + 3) & ~3) + +-typedef u32 delegation_zero_t; +-typedef u32 delegation_boot_t; +-typedef u64 delegation_id_t; +- +-typedef struct { +- delegation_zero_t ds_zero; +- delegation_boot_t ds_boot; +- delegation_id_t ds_id; +-} delegation_stateid_t; +- + struct nfsd4_change_info { + u32 atomic; + u32 before_ctime_sec; +@@ -104,6 +94,10 @@ + #define cr_specdata1 u.dev.specdata1 + #define cr_specdata2 u.dev.specdata2 + ++struct nfsd4_delegreturn { ++ stateid_t dr_stateid; ++}; ++ + struct nfsd4_getattr { + u32 ga_bmval[2]; /* request */ + struct svc_fh *ga_fhp; /* response */ +@@ -202,13 +196,13 @@ + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ +- delegation_stateid_t op_delegate_stateid; /* request - CLAIM_DELEGATE_CUR only */ ++ stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[2]; /* request */ + union { /* request */ +- struct iattr iattr; /* UNCHECKED4,GUARDED4 */ +- nfs4_verifier verf; /* EXCLUSIVE4 */ ++ struct iattr iattr; /* UNCHECKED4,GUARDED4 */ ++ nfs4_verifier verf; /* EXCLUSIVE4 */ + } u; + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ +@@ -247,6 +241,7 @@ + u32 rd_length; /* request */ + struct kvec rd_iov[RPCSVC_MAXPAGES]; + int rd_vlen; ++ struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +@@ -345,6 +340,7 @@ + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; ++ struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; +@@ -456,6 +452,8 @@ + nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_release_lockowner *rlockowner); + extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); ++extern int nfsd4_delegreturn(struct svc_rqst *rqstp, ++ struct svc_fh *current_fh, struct nfsd4_delegreturn *dr); + #endif + + /* +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-04-05 14:49:13.461682832 +0800 +@@ -1185,11 +1185,6 @@ + + extern int vfs_statfs(struct super_block *, struct kstatfs *); + +-/* Return value for VFS lock functions - tells locks.c to lock conventionally +- * REALLY kosha for root NFS and nfs_lock +- */ +-#define LOCK_USE_CLNT 1 +- + #define FLOCK_VERIFY_READ 1 + #define FLOCK_VERIFY_WRITE 2 + +Index: linux-2.6.10/include/linux/dcache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/dcache.h 2005-04-05 14:49:13.460682984 +0800 +@@ -200,6 +200,7 @@ + * These are the low-level FS interfaces to the dcache.. 
+ */ + extern void d_instantiate(struct dentry *, struct inode *); ++extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); + extern void d_delete(struct dentry *); + + /* allocate/de-allocate */ +@@ -244,6 +245,23 @@ + d_rehash(entry); + } + ++/** ++ * d_add_unique - add dentry to hash queues without aliasing ++ * @entry: dentry to add ++ * @inode: The inode to attach to this dentry ++ * ++ * This adds the entry to the hash queues and initializes @inode. ++ * The entry was actually filled in earlier during d_alloc(). ++ */ ++static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *inode) ++{ ++ struct dentry *res; ++ ++ res = d_instantiate_unique(entry, inode); ++ d_rehash(res != NULL ? res : entry); ++ return res; ++} ++ + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); + +Index: linux-2.6.10/include/linux/nfs_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs_fs.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs_fs.h 2005-04-05 14:49:13.463682528 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + /* + * Enable debugging support for nfs client. +@@ -201,6 +202,7 @@ + #define NFS_INO_INVALID_ATTR 0x0008 /* cached attrs are invalid */ + #define NFS_INO_INVALID_DATA 0x0010 /* cached data is invalid */ + #define NFS_INO_INVALID_ATIME 0x0020 /* cached atime is invalid */ ++#define NFS_INO_INVALID_ACCESS 0x0040 /* cached access cred invalid */ + + static inline struct nfs_inode *NFS_I(struct inode *inode) + { +@@ -239,7 +241,7 @@ + static inline void NFS_CACHEINV(struct inode *inode) + { + if (!nfs_caches_unstable(inode)) +- NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR; ++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + } + + static inline int nfs_server_capable(struct inode *inode, int cap) +@@ -424,6 +426,44 @@ + return nfs_wb_page_priority(inode, page, 0); + } + ++/* ++ * Allocate and free nfs_write_data structures ++ */ ++extern mempool_t *nfs_wdata_mempool; ++extern mempool_t *nfs_commit_mempool; ++ ++static inline struct nfs_write_data *nfs_writedata_alloc(void) ++{ ++ struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); ++ if (p) { ++ memset(p, 0, sizeof(*p)); ++ INIT_LIST_HEAD(&p->pages); ++ } ++ return p; ++} ++ ++static inline void nfs_writedata_free(struct nfs_write_data *p) ++{ ++ mempool_free(p, nfs_wdata_mempool); ++} ++ ++extern void nfs_writedata_release(struct rpc_task *task); ++ ++static inline struct nfs_write_data *nfs_commit_alloc(void) ++{ ++ struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); ++ if (p) { ++ memset(p, 0, sizeof(*p)); ++ INIT_LIST_HEAD(&p->pages); ++ } ++ return p; ++} ++ ++static inline void nfs_commit_free(struct nfs_write_data *p) ++{ ++ mempool_free(p, nfs_commit_mempool); ++} ++ + /* Hack for future NFS swap support */ + #ifndef IS_SWAPFILE + # define IS_SWAPFILE(inode) (0) +@@ -439,6 +479,26 @@ + extern void nfs_readpage_result(struct rpc_task *); + + /* ++ * Allocate and free nfs_read_data structures ++ */ ++extern mempool_t *nfs_rdata_mempool; ++ ++static inline struct nfs_read_data *nfs_readdata_alloc(void) ++{ ++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ return p; ++} ++ ++static inline void nfs_readdata_free(struct nfs_read_data *p) ++{ ++ mempool_free(p, nfs_rdata_mempool); ++} ++ ++extern void nfs_readdata_release(struct 
rpc_task *task); ++ ++/* + * linux/fs/mount_clnt.c + * (Used only by nfsroot module) + */ +@@ -644,6 +704,12 @@ + + extern struct dentry_operations nfs4_dentry_operations; + extern struct inode_operations nfs4_dir_inode_operations; ++extern struct inode_operations nfs4_file_inode_operations; ++ ++/* inode.c */ ++extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); ++extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); + + /* nfs4proc.c */ + extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); +@@ -651,13 +717,14 @@ + extern int nfs4_open_reclaim(struct nfs4_state_owner *, struct nfs4_state *); + extern int nfs4_proc_async_renew(struct nfs4_client *); + extern int nfs4_proc_renew(struct nfs4_client *); +-extern int nfs4_do_close(struct inode *, struct nfs4_state *); +-extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); ++extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); + extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); + extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); + extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); + extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request); ++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen); ++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen); + + /* nfs4renewd.c */ + extern void nfs4_schedule_state_renewal(struct nfs4_client *); +Index: linux-2.6.10/include/linux/nfs4.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs4.h 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs4.h 2005-04-05 14:49:13.474680856 +0800 +@@ -28,7 +28,7 @@ + #define NFS4_ACCESS_DELETE 0x0010 + #define NFS4_ACCESS_EXECUTE 0x0020 + +-#define NFS4_FH_PERISTENT 0x0000 ++#define NFS4_FH_PERSISTENT 0x0000 + #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001 + #define NFS4_FH_VOLATILE_ANY 0x0002 + #define NFS4_FH_VOL_MIGRATION 0x0004 +@@ -382,6 +382,8 @@ + NFSPROC4_CLNT_READDIR, + NFSPROC4_CLNT_SERVER_CAPS, + NFSPROC4_CLNT_DELEGRETURN, ++ NFSPROC4_CLNT_GETACL, ++ NFSPROC4_CLNT_SETACL, + }; + + #endif +Index: linux-2.6.10/include/linux/sunrpc/auth.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/auth.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/auth.h 2005-04-05 14:49:13.468681768 +0800 +@@ -51,7 +51,6 @@ + }; + #define RPCAUTH_CRED_LOCKED 0x0001 + #define RPCAUTH_CRED_UPTODATE 0x0002 +-#define RPCAUTH_CRED_DEAD 0x0004 + + #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 + +@@ -133,7 +132,6 @@ + int rpcauth_refreshcred(struct rpc_task *); + void rpcauth_invalcred(struct rpc_task *); + int rpcauth_uptodatecred(struct rpc_task *); +-int rpcauth_deadcred(struct rpc_task *); + void rpcauth_init_credcache(struct rpc_auth *); + void rpcauth_free_credcache(struct rpc_auth *); + +Index: linux-2.6.10/include/linux/sunrpc/svc.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/svc.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/svc.h 2005-04-05 14:49:13.467681920 +0800 +@@ -251,8 
+251,7 @@ + char * pg_name; /* service name */ + char * pg_class; /* class name: services sharing authentication */ + struct svc_stat * pg_stats; /* rpc statistics */ +- /* Override authentication. NULL means use default */ +- int (*pg_authenticate)(struct svc_rqst *, u32 *); ++ int (*pg_authenticate)(struct svc_rqst *); + }; + + /* +Index: linux-2.6.10/include/linux/sunrpc/cache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/cache.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/cache.h 2005-04-05 14:49:13.470681464 +0800 +@@ -128,20 +128,17 @@ + * just like a template in C++, this macro does cache lookup + * for us. + * The function is passed some sort of HANDLE from which a cache_detail +- * structure can be determined (via SETUP, DETAIL), a template ++ * structure can be determined (via DETAIL), a template + * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the + * TEST, the function will try to find a matching cache entry in the cache. + * If "set" == 0 : + * If an entry is found, it is returned + * If no entry is found, a new non-VALID entry is created. +- * If "set" == 1 and INPLACE == 0 : ++ * If "set" == 1: + * If no entry is found a new one is inserted with data from "template" + * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE + * If a CACHE_VALID entry is found, a new entry is swapped in with data + * from "template" +- * If set == 1, and INPLACE == 1 : +- * As above, except that if a CACHE_VALID entry is found, we UPDATE in place +- * instead of swapping in a new entry. + * + * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not + * run but insteead CACHE_NEGATIVE is set in any new item. +@@ -153,25 +150,22 @@ + * MEMBER is the member of the cache which is cache_head, which must be first + * FNAME is the name for the function + * ARGS are arguments to function and must contain RTN *item, int set. May +- * also contain something to be usedby SETUP or DETAIL to find cache_detail. +- * SETUP locates the cache detail and makes it available as... +- * DETAIL identifies the cache detail, possibly set up by SETUP ++ * also contain something to be used by DETAIL to find cache_detail. ++ * DETAIL identifies the cache detail + * HASHFN returns a hash value of the cache entry "item" + * TEST tests if "tmp" matches "item" + * INIT copies key information from "item" to "new" + * UPDATE copies content information from "item" to "tmp" +- * INPLACE is true if updates can happen inplace rather than allocating a new structure + * + * WARNING: any substantial changes to this must be reflected in + * net/sunrpc/svcauth.c(auth_domain_lookup) + * which is a similar routine that is open-coded. 
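The next hunks strip the SETUP and INPLACE parameters from the cache-lookup template: every cache now locates its cache_detail directly, and a CACHE_VALID entry is always updated by swapping in a fresh one. A sketch of the simplified template in use, mirroring the svcauth_unix_set_client() hunk later in this patch (lookup_peer() is an illustrative name only):

/* set==0 asks ip_map_lookup() to find or create a not-yet-valid
 * entry; set==1 would insert/update from the key, always by swap-in
 * now that the INPLACE variant is gone. */
static struct ip_map *lookup_peer(struct svc_rqst *rqstp)
{
	struct ip_map key;

	strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class);
	key.m_addr = rqstp->rq_addr.sin_addr;
	return ip_map_lookup(&key, 0);
}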
+ */ +-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \ ++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE) \ + RTN *FNAME ARGS \ + { \ + RTN *tmp, *new=NULL; \ + struct cache_head **hp, **head; \ +- SETUP; \ + head = &(DETAIL)->hash_table[HASHFN]; \ + retry: \ + if (set||new) write_lock(&(DETAIL)->hash_lock); \ +@@ -180,14 +174,14 @@ + tmp = container_of(*hp, RTN, MEMBER); \ + if (TEST) { /* found a match */ \ + \ +- if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ ++ if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ + break; \ + \ + if (new) \ + {INIT;} \ + cache_get(&tmp->MEMBER); \ + if (set) { \ +- if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ ++ if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ + { /* need to swap in new */ \ + RTN *t2; \ + \ +@@ -209,7 +203,7 @@ + else read_unlock(&(DETAIL)->hash_lock); \ + if (set) \ + cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ +- if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ ++ if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ + if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ + return tmp; \ + } \ +@@ -242,10 +236,10 @@ + return NULL; \ + } + +-#define DefineSimpleCacheLookup(STRUCT,INPLACE) \ +- DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \ ++#define DefineSimpleCacheLookup(STRUCT) \ ++ DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), \ + & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + #define cache_for_each(pos, detail, index, member) \ + for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ +Index: linux-2.6.10/include/linux/sunrpc/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/sched.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/sched.h 2005-04-05 14:49:13.472681160 +0800 +@@ -11,7 +11,9 @@ + + #include + #include ++#include + #include ++#include + #include + + /* +@@ -25,11 +27,18 @@ + struct rpc_cred * rpc_cred; /* Credentials */ + }; + ++struct rpc_wait_queue; ++struct rpc_wait { ++ struct list_head list; /* wait queue links */ ++ struct list_head links; /* Links to related tasks */ ++ wait_queue_head_t waitq; /* sync: sleep on this q */ ++ struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */ ++}; ++ + /* + * This is the RPC task struct + */ + struct rpc_task { +- struct list_head tk_list; /* wait queue links */ + #ifdef RPC_DEBUG + unsigned long tk_magic; /* 0xf00baa */ + #endif +@@ -37,7 +46,6 @@ + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ + int tk_status; /* result of last operation */ +- struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */ + + /* + * RPC call state +@@ -70,13 +78,18 @@ + * you have a pathological interest in kernel oopses. 
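The union introduced just below is safe because a task is never both sitting on an rpc_wait_queue and queued as work: the tk_runstate bits added further down arbitrate which arm is live. An illustrative fragment (sketch_requeue() is hypothetical):

/* RPC_TASK_QUEUED  -> task->u.tk_wait is valid (on a wait queue)
 * RPC_TASK_RUNNING -> task->u.tk_work is valid (queued to a workqueue) */
static void sketch_requeue(struct rpc_task *task)
{
	if (RPC_IS_QUEUED(task))
		list_del(&task->u.tk_wait.list);	/* leave the wait queue */
	else
		queue_work(task->tk_workqueue, &task->u.tk_work);
}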
+ */ + struct timer_list tk_timer; /* kernel timer */ +- wait_queue_head_t tk_wait; /* sync: sleep on this q */ + unsigned long tk_timeout; /* timeout for rpc_sleep() */ + unsigned short tk_flags; /* misc flags */ + unsigned char tk_active : 1;/* Task has been activated */ + unsigned char tk_priority : 2;/* Task priority */ + unsigned long tk_runstate; /* Task run status */ +- struct list_head tk_links; /* links to related tasks */ ++ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could ++ * be any workqueue ++ */ ++ union { ++ struct work_struct tk_work; /* Async task work queue */ ++ struct rpc_wait tk_wait; /* RPC wait */ ++ } u; + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif +@@ -87,11 +100,11 @@ + /* support walking a list of tasks on a wait queue */ + #define task_for_each(task, pos, head) \ + list_for_each(pos, head) \ +- if ((task=list_entry(pos, struct rpc_task, tk_list)),1) ++ if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1) + + #define task_for_first(task, head) \ + if (!list_empty(head) && \ +- ((task=list_entry((head)->next, struct rpc_task, tk_list)),1)) ++ ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1)) + + /* .. and walking list of all tasks */ + #define alltask_for_each(task, pos, head) \ +@@ -126,22 +139,39 @@ + #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) + #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR) + +-#define RPC_TASK_SLEEPING 0 +-#define RPC_TASK_RUNNING 1 +-#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) +-#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define RPC_TASK_RUNNING 0 ++#define RPC_TASK_QUEUED 1 ++#define RPC_TASK_WAKEUP 2 ++#define RPC_TASK_HAS_TIMER 3 + ++#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) + #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +-#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define rpc_test_and_set_running(t) \ ++ (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define rpc_clear_running(t) \ ++ do { \ ++ smp_mb__before_clear_bit(); \ ++ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ ++ smp_mb__after_clear_bit(); \ ++ } while (0) + +-#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) ++#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_clear_queued(t) \ ++ do { \ ++ smp_mb__before_clear_bit(); \ ++ clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ ++ smp_mb__after_clear_bit(); \ ++ } while (0) + +-#define rpc_clear_sleeping(t) \ ++#define rpc_start_wakeup(t) \ ++ (test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0) ++#define rpc_finish_wakeup(t) \ + do { \ + smp_mb__before_clear_bit(); \ +- clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \ ++ clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ +- } while(0) ++ } while (0) + + /* + * Task priorities. 
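Taken together, the QUEUED/WAKEUP/RUNNING bits above allow an atomic claim on waking a task: only the winner of rpc_start_wakeup() may move it back to the run queue. A sketch of the handshake -- the unlink step is an assumption based on the rpc_wait_queue spinlock added in the next hunk, and sketch_wake_up_task() is not from the patch:

static void sketch_wake_up_task(struct rpc_task *task)
{
	if (!RPC_IS_QUEUED(task))
		return;				/* already running */
	if (!rpc_start_wakeup(task))
		return;				/* another CPU is waking it */
	/* assumed: unlink task from task->u.tk_wait.rpc_waitq under
	 * the queue's new spinlock, then: */
	rpc_clear_queued(task);
	if (!rpc_test_and_set_running(task))
		queue_work(task->tk_workqueue, &task->u.tk_work);
	rpc_finish_wakeup(task);
}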
+@@ -157,6 +187,7 @@ + * RPC synchronization objects + */ + struct rpc_wait_queue { ++ spinlock_t lock; + struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ + unsigned long cookie; /* cookie of last task serviced */ + unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ +@@ -177,6 +208,7 @@ + + #ifndef RPC_DEBUG + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -185,6 +217,7 @@ + } + #else + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -209,13 +242,10 @@ + int rpc_execute(struct rpc_task *); + void rpc_run_child(struct rpc_task *parent, struct rpc_task *child, + rpc_action action); +-int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *); +-void rpc_remove_wait_queue(struct rpc_task *); + void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, + rpc_action action, rpc_action timer); +-void rpc_add_timer(struct rpc_task *, rpc_action); + void rpc_wake_up_task(struct rpc_task *); + void rpc_wake_up(struct rpc_wait_queue *); + struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +Index: linux-2.6.10/include/linux/sunrpc/gss_krb5.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/gss_krb5.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/gss_krb5.h 2005-04-05 14:49:13.473681008 +0800 +@@ -53,6 +53,8 @@ + struct xdr_netobj mech_used; + }; + ++extern spinlock_t krb5_seq_lock; ++ + #define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + +@@ -116,18 +118,25 @@ + + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum); ++ int body_offset, struct xdr_netobj *cksum); + + u32 + krb5_make_token(struct krb5_ctx *context_handle, int qop_req, + struct xdr_buf *input_message_buffer, +- struct xdr_netobj *output_message_buffer, int toktype); ++ struct xdr_netobj *output_message_buffer); + + u32 + krb5_read_token(struct krb5_ctx *context_handle, + struct xdr_netobj *input_token_buffer, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype); ++ struct xdr_buf *message_buffer, int *qop_state); ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset, ++ struct xdr_buf *outbuf, struct page **pages); ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset); + + u32 + krb5_encrypt(struct crypto_tfm * key, +@@ -137,6 +146,13 @@ + krb5_decrypt(struct crypto_tfm * key, + void *iv, void *in, void *out, int length); + ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset, ++ struct page **pages); ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset); ++ + s32 + krb5_make_seq_num(struct crypto_tfm * key, + int direction, +Index: linux-2.6.10/include/linux/sunrpc/xdr.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/xdr.h 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/xdr.h 2005-04-05 14:49:13.467681920 +0800 +@@ -192,6 +192,7 @@ + extern void 
xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p); + extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); + extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); ++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.10/include/linux/sunrpc/gss_api.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/gss_api.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/gss_api.h 2005-04-05 14:49:13.471681312 +0800 +@@ -47,6 +47,18 @@ + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++u32 gss_wrap( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page **inpages); ++u32 gss_unwrap( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *inbuf, ++ int *out_offset); + u32 gss_delete_sec_context( + struct gss_ctx **ctx_id); + +@@ -93,6 +105,18 @@ + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++ u32 (*gss_wrap)( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page **inpages); ++ u32 (*gss_unwrap)( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset); + void (*gss_delete_sec_context)( + void *internal_ctx_id); + }; +Index: linux-2.6.10/include/linux/sunrpc/svcauth.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/svcauth.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/svcauth.h 2005-04-05 14:49:13.469681616 +0800 +@@ -26,21 +26,23 @@ + struct svc_rqst; /* forward decl */ + + /* Authentication is done in the context of a domain. +- * For a server, a domain represents a group of clients using ++ * ++ * Currently, the nfs server uses the auth_domain to stand ++ * for the "client" listed in /etc/exports. ++ * ++ * More generally, a domain might represent a group of clients using + * a common mechanism for authentication and having a common mapping + * between local identity (uid) and network identity. All clients + * in a domain have similar general access rights. Each domain can + * contain multiple principals which will have different specific right + * based on normal Discretionary Access Control. + * +- * For a client, a domain represents a number of servers which all +- * use a common authentication mechanism and network identity name space. +- * + * A domain is created by an authentication flavour module based on name + * only. Userspace then fills in detail on demand. + * +- * The creation of a domain typically implies creation of one or +- * more caches for storing domain specific information. ++ * In the case of auth_unix and auth_null, the auth_domain is also ++ * associated with entries in another cache representing the mapping ++ * of ip addresses to the given client. 
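The hunk that follows adds a per-flavour set_client hook and a svc_set_client() entry point, splitting "parse the credential" from "decide which client this is" so each flavour can map peers to an auth_domain its own way. A sketch of the intended dispatch order; the matching svc_process() changes are not part of this header excerpt and sketch_authorize() is a hypothetical wrapper:

static int sketch_authorize(struct svc_rqst *rqstp, u32 *authp)
{
	int rv = svc_authenticate(rqstp, authp);

	if (rv != SVC_OK)
		return rv;
	switch (svc_set_client(rqstp)) {
	case SVC_OK:
		return SVC_OK;
	case SVC_DROP:
		return SVC_DROP;	/* cache upcall pending, retry */
	default:
		*authp = rpc_autherr_badcred;
		return SVC_DENIED;
	}
}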
+ */ + struct auth_domain { + struct cache_head h; +@@ -92,6 +94,7 @@ + int (*accept)(struct svc_rqst *rq, u32 *authp); + int (*release)(struct svc_rqst *rq); + void (*domain_release)(struct auth_domain *); ++ int (*set_client)(struct svc_rqst *rq); + }; + + #define SVC_GARBAGE 1 +@@ -107,6 +110,7 @@ + + extern int svc_authenticate(struct svc_rqst *rqstp, u32 *authp); + extern int svc_authorise(struct svc_rqst *rqstp); ++extern int svc_set_client(struct svc_rqst *rqstp); + extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); + extern void svc_auth_unregister(rpc_authflavor_t flavor); + +Index: linux-2.6.10/include/linux/sunrpc/xprt.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/xprt.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/xprt.h 2005-04-05 14:49:13.471681312 +0800 +@@ -95,7 +95,10 @@ + int rq_cong; /* has incremented xprt->cong */ + int rq_received; /* receive completed */ + u32 rq_seqno; /* gss seq no. used on req. */ +- ++ int rq_enc_pages_num; ++ struct page **rq_enc_pages; /* scratch pages for use by ++ gss privacy code */ ++ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ + struct list_head rq_list; + + struct xdr_buf rq_private_buf; /* The receive buffer +Index: linux-2.6.10/include/linux/nfs_xdr.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs_xdr.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs_xdr.h 2005-04-05 14:49:13.459683136 +0800 +@@ -326,6 +326,20 @@ + const u32 * bitmask; + }; + ++struct nfs_setaclargs { ++ struct nfs_fh * fh; ++ ssize_t acl_len; ++ unsigned int acl_pgbase; ++ struct page ** acl_pages; ++}; ++ ++struct nfs_getaclargs { ++ struct nfs_fh * fh; ++ ssize_t acl_len; ++ unsigned int acl_pgbase; ++ struct page ** acl_pages; ++}; ++ + struct nfs_setattrres { + struct nfs_fattr * fattr; + const struct nfs_server * server; +@@ -666,6 +680,7 @@ + int version; /* Protocol version */ + struct dentry_operations *dentry_ops; + struct inode_operations *dir_inode_ops; ++ struct inode_operations *file_inode_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -681,7 +696,7 @@ + int (*read) (struct nfs_read_data *); + int (*write) (struct nfs_write_data *); + int (*commit) (struct nfs_write_data *); +- struct inode * (*create) (struct inode *, struct qstr *, ++ struct inode * (*create) (struct inode *, struct dentry *, + struct iattr *, int); + int (*remove) (struct inode *, struct qstr *); + int (*unlink_setup) (struct rpc_message *, +Index: linux-2.6.10/net/sunrpc/xprt.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/xprt.c 2004-12-25 05:35:14.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/xprt.c 2005-04-05 14:49:13.393693168 +0800 +@@ -891,7 +891,8 @@ + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; +- dprintk("RPC: reading reply for XID %08x\n", xprt->tcp_xid); ++ dprintk("RPC: reading reply for XID %08x\n", ++ ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); + } + +@@ -911,7 +912,7 @@ + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", +- xprt->tcp_xid); ++ ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } +@@ -1101,7 +1102,7 @@ + goto out; + + spin_lock_bh(&xprt->sock_lock); +- if (xprt->snd_task && xprt->snd_task->tk_rpcwait 
== &xprt->pending) ++ if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); + out: +@@ -1359,8 +1360,9 @@ + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_xid = xprt_alloc_xid(xprt); ++ req->rq_release_snd_buf = NULL; + dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, +- req, req->rq_xid); ++ req, ntohl(req->rq_xid)); + } + + /* +@@ -1384,6 +1386,8 @@ + mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + spin_unlock_bh(&xprt->sock_lock); + task->tk_rqstp = NULL; ++ if (req->rq_release_snd_buf) ++ req->rq_release_snd_buf(req); + memset(req, 0, sizeof(*req)); /* mark unused */ + + dprintk("RPC: %4d release request %p\n", task->tk_pid, req); +Index: linux-2.6.10/net/sunrpc/auth.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth.c 2005-04-05 14:49:13.394693016 +0800 +@@ -214,8 +214,6 @@ + list_for_each_safe(pos, next, &auth->au_credcache[nr]) { + struct rpc_cred *entry; + entry = list_entry(pos, struct rpc_cred, cr_hash); +- if (entry->cr_flags & RPCAUTH_CRED_DEAD) +- continue; + if (rpcauth_prune_expired(entry, &free)) + continue; + if (entry->cr_ops->crmatch(acred, entry, taskflags)) { +@@ -307,9 +305,6 @@ + if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) + return; + +- if ((cred->cr_flags & RPCAUTH_CRED_DEAD) && !list_empty(&cred->cr_hash)) +- list_del_init(&cred->cr_hash); +- + if (list_empty(&cred->cr_hash)) { + spin_unlock(&rpc_credcache_lock); + rpcauth_crdestroy(cred); +@@ -413,10 +408,3 @@ + return !(task->tk_msg.rpc_cred) || + (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE); + } +- +-int +-rpcauth_deadcred(struct rpc_task *task) +-{ +- return !(task->tk_msg.rpc_cred) || +- (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_DEAD); +-} +Index: linux-2.6.10/net/sunrpc/svcauth_unix.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svcauth_unix.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svcauth_unix.c 2005-04-05 14:49:13.395692864 +0800 +@@ -97,7 +97,7 @@ + }; + static struct cache_head *ip_table[IP_HASHMAX]; + +-void ip_map_put(struct cache_head *item, struct cache_detail *cd) ++static void ip_map_put(struct cache_head *item, struct cache_detail *cd) + { + struct ip_map *im = container_of(item, struct ip_map,h); + if (cache_put(item, cd)) { +@@ -258,7 +258,7 @@ + .cache_show = ip_map_show, + }; + +-static DefineSimpleCacheLookup(ip_map, 0) ++static DefineSimpleCacheLookup(ip_map) + + + int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) +@@ -329,14 +329,49 @@ + cache_purge(&auth_domain_cache); + } + ++int ++svcauth_unix_set_client(struct svc_rqst *rqstp) ++{ ++ struct ip_map key, *ipm; ++ ++ rqstp->rq_client = NULL; ++ if (rqstp->rq_proc == 0) ++ return SVC_OK; ++ ++ strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); ++ key.m_addr = rqstp->rq_addr.sin_addr; ++ ++ ipm = ip_map_lookup(&key, 0); ++ ++ if (ipm == NULL) ++ return SVC_DENIED; ++ ++ switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { ++ case -EAGAIN: ++ return SVC_DROP; ++ case -ENOENT: ++ return SVC_DENIED; ++ case 0: ++ rqstp->rq_client = &ipm->m_client->h; ++ cache_get(&rqstp->rq_client->h); ++ ip_map_put(&ipm->h, &ip_map_cache); ++ return SVC_OK; ++ default: ++ BUG(); ++ } ++ /* shut up gcc: */ ++ return -1; ++} + + static int + svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp) + 
{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; +- int rv=0; +- struct ip_map key, *ipm; ++ struct svc_cred *cred = &rqstp->rq_cred; ++ ++ cred->cr_group_info = NULL; ++ rqstp->rq_client = NULL; + + if (argv->iov_len < 3*4) + return SVC_GARBAGE; +@@ -353,45 +388,17 @@ + } + + /* Signal that mapping to nobody uid/gid is required */ +- rqstp->rq_cred.cr_uid = (uid_t) -1; +- rqstp->rq_cred.cr_gid = (gid_t) -1; +- rqstp->rq_cred.cr_group_info = groups_alloc(0); +- if (rqstp->rq_cred.cr_group_info == NULL) ++ cred->cr_uid = (uid_t) -1; ++ cred->cr_gid = (gid_t) -1; ++ cred->cr_group_info = groups_alloc(0); ++ if (cred->cr_group_info == NULL) + return SVC_DROP; /* kmalloc failure - client must retry */ + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + +- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); +- key.m_addr = rqstp->rq_addr.sin_addr; +- +- ipm = ip_map_lookup(&key, 0); +- +- rqstp->rq_client = NULL; +- +- if (ipm) +- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { +- case -EAGAIN: +- rv = SVC_DROP; +- break; +- case -ENOENT: +- rv = SVC_OK; /* rq_client is NULL */ +- break; +- case 0: +- rqstp->rq_client = &ipm->m_client->h; +- cache_get(&rqstp->rq_client->h); +- ip_map_put(&ipm->h, &ip_map_cache); +- rv = SVC_OK; +- break; +- default: BUG(); +- } +- else rv = SVC_DROP; +- +- if (rqstp->rq_client == NULL && rqstp->rq_proc != 0) +- *authp = rpc_autherr_badcred; +- +- return rv; ++ return SVC_OK; + } + + static int +@@ -414,6 +421,7 @@ + .flavour = RPC_AUTH_NULL, + .accept = svcauth_null_accept, + .release = svcauth_null_release, ++ .set_client = svcauth_unix_set_client, + }; + + +@@ -425,8 +433,6 @@ + struct svc_cred *cred = &rqstp->rq_cred; + u32 slen, i; + int len = argv->iov_len; +- int rv=0; +- struct ip_map key, *ipm; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; +@@ -458,39 +464,11 @@ + return SVC_DENIED; + } + +- +- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); +- key.m_addr = rqstp->rq_addr.sin_addr; +- +- +- ipm = ip_map_lookup(&key, 0); +- +- if (ipm) +- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { +- case -EAGAIN: +- rv = SVC_DROP; +- break; +- case -ENOENT: +- rv = SVC_OK; /* rq_client is NULL */ +- break; +- case 0: +- rqstp->rq_client = &ipm->m_client->h; +- cache_get(&rqstp->rq_client->h); +- ip_map_put(&ipm->h, &ip_map_cache); +- rv = SVC_OK; +- break; +- default: BUG(); +- } +- else rv = SVC_DROP; +- +- if (rv == SVC_OK && rqstp->rq_client == NULL && rqstp->rq_proc != 0) +- goto badcred; +- + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + +- return rv; ++ return SVC_OK; + + badcred: + *authp = rpc_autherr_badcred; +@@ -520,5 +498,6 @@ + .accept = svcauth_unix_accept, + .release = svcauth_unix_release, + .domain_release = svcauth_unix_domain_release, ++ .set_client = svcauth_unix_set_client, + }; + +Index: linux-2.6.10/net/sunrpc/clnt.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/clnt.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/clnt.c 2005-04-05 14:49:13.410690584 +0800 +@@ -636,8 +636,14 @@ + rpc_exit(task, -EIO); + return; + } +- if (encode && (status = rpcauth_wrap_req(task, encode, req, p, +- task->tk_msg.rpc_argp)) < 0) { ++ if (encode == NULL) ++ return; ++ ++ status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); ++ if (status == -EAGAIN) { ++ printk("XXXJBF: out of 
memory? Should retry here!!!\n"); ++ } ++ if (status < 0) { + printk(KERN_WARNING "%s: can't encode arguments: %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); +@@ -935,7 +941,7 @@ + task->tk_action = call_reserve; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; +- if (rpcauth_deadcred(task)) { ++ if (status == -EACCES) { + rpc_exit(task, -EACCES); + return; + } +@@ -993,7 +999,7 @@ + goto garbage; + if ((n = ntohl(*p++)) != RPC_AUTH_ERROR) { + printk(KERN_WARNING "call_verify: RPC call rejected: %x\n", n); +- } else if (--len < 0) ++ } else if (--len == 0) + switch ((n = ntohl(*p++))) { + case RPC_AUTH_REJECTEDCRED: + case RPC_AUTH_REJECTEDVERF: +Index: linux-2.6.10/net/sunrpc/svcauth.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svcauth.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svcauth.c 2005-04-05 14:49:13.392693320 +0800 +@@ -59,6 +59,11 @@ + return aops->accept(rqstp, authp); + } + ++int svc_set_client(struct svc_rqst *rqstp) ++{ ++ return rqstp->rq_authop->set_client(rqstp); ++} ++ + /* A request, which was authenticated, has now executed. + * Time to finalise the the credentials and verifier + * and release and resources +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_unseal.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c 2005-04-05 14:49:13.401691952 +0800 +@@ -68,20 +68,13 @@ + #endif + + +-/* message_buffer is an input if toktype is MIC and an output if it is WRAP: +- * If toktype is MIC: read_token is a mic token, and message_buffer is the +- * data that the mic was supposedly taken over. +- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used +- * to return the decrypted data. +- */ ++/* read_token is a mic token, and message_buffer is the data that the mic was ++ * supposedly taken over. */ + +-/* XXX will need to change prototype and/or just split into a separate function +- * when we add privacy (because read_token will be in pages too). */ + u32 + krb5_read_token(struct krb5_ctx *ctx, + struct xdr_netobj *read_token, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype) ++ struct xdr_buf *message_buffer, int *qop_state) + { + int signalg; + int sealalg; +@@ -100,16 +93,12 @@ + read_token->len)) + goto out; + +- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) ++ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || ++ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? 
*/ + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX gone */ +- goto out; +- } +- + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); +@@ -120,14 +109,7 @@ + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + +- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || +- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) +- goto out; +- +- /* in the current spec, there is only one valid seal algorithm per +- key type, so a simple comparison is ok */ +- +- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) ++ if (sealalg != 0xffff) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, +@@ -154,7 +136,7 @@ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, +- message_buffer, &md5cksum); ++ message_buffer, 0, &md5cksum); + if (ret) + goto out; + +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_mech_switch.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c 2005-04-05 14:49:13.408690888 +0800 +@@ -279,6 +279,29 @@ + qstate); + } + ++u32 ++gss_wrap(struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *buf, ++ struct page **inpages) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_wrap(ctx_id, qop, offset, buf, inpages); ++} ++ ++u32 ++gss_unwrap(struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_unwrap(ctx_id, qop, offset, buf, out_offset); ++} ++ ++ + /* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. 
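With gss_wrap()/gss_unwrap() dispatching through the mechanism ops above, the new gss_krb5_wrap.c below supplies the kerberos implementation. A sketch of a privacy-service caller using the rq_enc_pages scratch pages added to struct rpc_rqst earlier in this patch; both helper names and the error values are illustrative, not from the patch:

static int sketch_priv_wrap(struct gss_ctx *ctx, int offset,
			    struct rpc_rqst *req)
{
	/* everything past "offset" is encrypted in place; scratch
	 * pages keep page-cache pages untouched */
	u32 maj = gss_wrap(ctx, GSS_C_QOP_DEFAULT, offset,
			   &req->rq_snd_buf, req->rq_enc_pages);
	return maj == GSS_S_COMPLETE ? 0 : -EIO;
}

static int sketch_priv_unwrap(struct gss_ctx *ctx, int offset,
			      struct rpc_rqst *req, int *data_offset)
{
	/* on success *data_offset says where the plaintext now starts
	 * inside the decrypted receive buffer */
	u32 maj = gss_unwrap(ctx, NULL, offset,
			     &req->rq_rcv_buf, data_offset);
	return maj == GSS_S_COMPLETE ? 0 : -EACCES;
}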
*/ +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 14:49:13.397692560 +0800 +@@ -0,0 +1,337 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++static inline int ++gss_krb5_padding(int blocksize, int length) ++{ ++ /* Most of the code is block-size independent but currently we ++ * use only 8: */ ++ BUG_ON(blocksize != 8); ++ return 8 - (length & 7); ++} ++ ++static inline void ++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) ++{ ++ int padding = gss_krb5_padding(blocksize, buf->len - offset); ++ char *p; ++ struct kvec *iov; ++ ++ if (buf->page_len || buf->tail[0].iov_len) ++ iov = &buf->tail[0]; ++ else ++ iov = &buf->head[0]; ++ p = iov->iov_base + iov->iov_len; ++ iov->iov_len += padding; ++ buf->len += padding; ++ memset(p, padding, padding); ++} ++ ++static inline int ++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) ++{ ++ u8 *ptr; ++ u8 pad; ++ int len = buf->len; ++ ++ if (len <= buf->head[0].iov_len) { ++ pad = *(u8 *)(buf->head[0].iov_base + len - 1); ++ goto out; ++ } else ++ len -= buf->head[0].iov_len; ++ if (len <= buf->page_len) { ++ int last = (buf->page_base + len - 1) ++ >>PAGE_CACHE_SHIFT; ++ int offset = (buf->page_base + len - 1) ++ & (PAGE_CACHE_SIZE - 1); ++ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); ++ pad = *(ptr + offset); ++ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); ++ goto out; ++ } else ++ len -= buf->page_len; ++ BUG_ON(len > buf->tail[0].iov_len); ++ pad = *(u8 *)(buf->tail[0].iov_base + len - 1); ++out: ++ if (pad > blocksize) ++ return -EINVAL; ++ buf->len -= pad; ++ return 0; ++} ++ ++static inline void ++make_confounder(char *p, int blocksize) ++{ ++ /* XXX? Is this OK to do on every packet? */ ++ get_random_bytes(p, blocksize); ++} ++ ++/* Assumptions: the head and tail of inbuf are ours to play with. ++ * The pages, however, may be real pages in the page cache and we replace ++ * them with scratch pages from **pages before writing to them. */ ++/* XXX: obviously the above should be documentation of wrap interface, ++ * and shouldn't be in this kerberos-specific file. */ ++ ++/* XXX factor out common code with seal/unseal. 
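For orientation while reading gss_wrap_kerberos() below, this is the token header it assembles, reconstructed from the stores in the function (offsets relative to krb5_hdr):

/*  +0  TOK_ID     KG_TOK_WRAP_MSG (0x0201)
 *  +2  SGN_ALG    e.g. SGN_ALG_DES_MAC_MD5
 *  +4  SEAL_ALG   kctx->sealalg
 *  +6  filler     0xffff
 *  +8  SND_SEQ    encrypted sequence number (krb5_make_seq_num)
 *  +16 SGN_CKSUM  8-byte encrypted MD5 checksum over header+plaintext
 *  +24 confounder (one cipher block), then the padded payload
 */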
*/ ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ int blocksize = 0, plainlen; ++ unsigned char *ptr, *krb5_hdr, *msg_start; ++ s32 now; ++ int headlen; ++ struct page **tmp_pages; ++ u32 seq_send; ++ ++ dprintk("RPC: gss_wrap_kerberos\n"); ++ ++ now = get_seconds(); ++ ++ if (qop != 0) ++ goto out_err; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" ++ " supported\n", kctx->signalg); ++ goto out_err; ++ } ++ if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { ++ dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", ++ kctx->sealalg); ++ goto out_err; ++ } ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ gss_krb5_add_padding(buf, offset, blocksize); ++ BUG_ON((buf->len - offset) % blocksize); ++ plainlen = blocksize + buf->len - offset; ++ ++ headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - ++ (buf->len - offset); ++ ++ ptr = buf->head[0].iov_base + offset; ++ /* shift data to make room for header. */ ++ /* XXX Would be cleverer to encrypt while copying. */ ++ /* XXX bounds checking, slack, etc. */ ++ memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); ++ buf->head[0].iov_len += headlen; ++ buf->len += headlen; ++ BUG_ON((buf->len - offset - headlen) % blocksize); ++ ++ g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); ++ ++ ++ *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); ++ ++ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr - 2; ++ msg_start = krb5_hdr + 24; ++ /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); ++ ++ *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); ++ memset(krb5_hdr + 4, 0xff, 4); ++ *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); ++ ++ make_confounder(msg_start, blocksize); ++ ++ /* XXXJBF: UGH!: */ ++ tmp_pages = buf->pages; ++ buf->pages = pages; ++ if (make_checksum(checksum_type, krb5_hdr, 8, buf, ++ offset + headlen - blocksize, &md5cksum)) ++ goto out_err; ++ buf->pages = tmp_pages; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len)) ++ goto out_err; ++ memcpy(krb5_hdr + 16, ++ md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, ++ KRB5_CKSUM_LENGTH); ++ ++ dprintk("RPC: make_seal_token: cksum data: \n"); ++ print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); ++ break; ++ default: ++ BUG(); ++ } ++ ++ kfree(md5cksum.data); ++ ++ spin_lock(&krb5_seq_lock); ++ seq_send = kctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ ++ /* XXX would probably be more efficient to compute checksum ++ * and encrypt at the same time: */ ++ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ goto out_err; ++ ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, ++ pages)) ++ goto out_err; ++ ++ return ((kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); ++out_err: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return GSS_S_FAILURE; ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ int signalg; ++ int sealalg; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ s32 now; ++ int direction; ++ s32 seqnum; ++ unsigned char *ptr; ++ int bodysize; ++ u32 ret = GSS_S_DEFECTIVE_TOKEN; ++ u8 *data_start; ++ int blocksize; ++ ++ dprintk("RPC: gss_unwrap_kerberos\n"); ++ ++ ptr = (u8 *)buf->head[0].iov_base + offset; ++ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, ++ buf->len - offset)) ++ goto out; ++ ++ if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || ++ (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) ++ goto out; ++ ++ /* XXX sanity-check bodysize?? */ ++ ++ /* get the sign and seal algorithms */ ++ ++ signalg = ptr[0] + (ptr[1] << 8); ++ sealalg = ptr[2] + (ptr[3] << 8); ++ ++ /* Sanity checks */ ++ ++ if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) ++ goto out; ++ ++ if (sealalg == 0xffff) ++ goto out; ++ ++ /* in the current spec, there is only one valid seal algorithm per ++ key type, so a simple comparison is ok */ ++ ++ if (sealalg != kctx->sealalg) ++ goto out; ++ ++ /* there are several mappings of seal algorithms to sign algorithms, ++ but few enough that we can try them all. */ ++ ++ if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || ++ (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || ++ (kctx->sealalg == SEAL_ALG_DES3KD && ++ signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) ++ goto out; ++ ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base)) ++ goto out; ++ ++ /* compute the checksum of the message */ ++ ++ /* initialize the the cksum */ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ ret = make_checksum(checksum_type, ptr - 2, 8, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); ++ if (ret) ++ goto out; ++ ++ ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len); ++ if (ret) ++ goto out; ++ ++ if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { ++ ret = GSS_S_BAD_SIG; ++ goto out; ++ } ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ /* it got through unscathed. Make sure the context is unexpired */ ++ ++ if (qop) ++ *qop = GSS_C_QOP_DEFAULT; ++ ++ now = get_seconds(); ++ ++ ret = GSS_S_CONTEXT_EXPIRED; ++ if (now > kctx->endtime) ++ goto out; ++ ++ /* do sequencing checks */ ++ ++ ret = GSS_S_BAD_SIG; ++ if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, ++ &seqnum))) ++ goto out; ++ ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ goto out; ++ ++ /* Copy the data back to the right position. XXX: Would probably be ++ * better to copy and encrypt at the same time. 
*/ ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ data_start = ptr + 22 + blocksize; ++ *out_offset = data_start - (u8 *)buf->head[0].iov_base; ++ ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ if (gss_krb5_remove_padding(buf, blocksize)) ++ goto out; ++ ++ ret = GSS_S_COMPLETE; ++out: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return ret; ++} +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_crypto.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c 2005-04-05 14:49:13.398692408 +0800 +@@ -139,17 +139,91 @@ + sg->length = len; + } + ++static int ++process_xdr_buf(struct xdr_buf *buf, int offset, int len, ++ int (*actor)(struct scatterlist *, void *), void *data) ++{ ++ int i, page_len, thislen, page_offset, ret = 0; ++ struct scatterlist sg[1]; ++ ++ if (offset >= buf->head[0].iov_len) { ++ offset -= buf->head[0].iov_len; ++ } else { ++ thislen = buf->head[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ offset = 0; ++ len -= thislen; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset >= buf->page_len) { ++ offset -= buf->page_len; ++ } else { ++ page_len = buf->page_len - offset; ++ if (page_len > len) ++ page_len = len; ++ len -= page_len; ++ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); ++ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; ++ thislen = PAGE_CACHE_SIZE - page_offset; ++ do { ++ if (thislen > page_len) ++ thislen = page_len; ++ sg->page = buf->pages[i]; ++ sg->offset = page_offset; ++ sg->length = thislen; ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ page_len -= thislen; ++ i++; ++ page_offset = 0; ++ thislen = PAGE_CACHE_SIZE; ++ } while (page_len != 0); ++ offset = 0; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset < buf->tail[0].iov_len) { ++ thislen = buf->tail[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ len -= thislen; ++ } ++ if (len != 0) ++ ret = -EINVAL; ++out: ++ return ret; ++} ++ ++static int ++checksummer(struct scatterlist *sg, void *data) ++{ ++ struct crypto_tfm *tfm = (struct crypto_tfm *)data; ++ ++ crypto_digest_update(tfm, sg, 1); ++ ++ return 0; ++} ++ + /* checksum the plaintext data and hdrlen bytes of the token header */ + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum) ++ int body_offset, struct xdr_netobj *cksum) + { + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + u32 code = GSS_S_FAILURE; +- int len, thislen, offset; +- int i; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: +@@ -169,35 +243,8 @@ + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); +- if (body->head[0].iov_len) { +- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } +- +- len = body->page_len; +- if (len != 0) { +- offset = body->page_base & (PAGE_CACHE_SIZE - 1); +- i = body->page_base >> PAGE_CACHE_SHIFT; +- thislen = PAGE_CACHE_SIZE - offset; +- do { +- if (thislen > len) +- thislen = len; +- sg->page = body->pages[i]; +- sg->offset = offset; +- sg->length = thislen; +- kmap(sg->page); /* XXX kmap_atomic? 
*/ +- crypto_digest_update(tfm, sg, 1); +- kunmap(sg->page); +- len -= thislen; +- i++; +- offset = 0; +- thislen = PAGE_CACHE_SIZE; +- } while(len != 0); +- } +- if (body->tail[0].iov_len) { +- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } ++ process_xdr_buf(body, body_offset, body->len - body_offset, ++ checksummer, tfm); + crypto_digest_final(tfm, cksum->data); + code = 0; + out: +@@ -207,3 +254,154 @@ + } + + EXPORT_SYMBOL(make_checksum); ++ ++struct encryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ int pos; ++ struct xdr_buf *outbuf; ++ struct page **pages; ++ struct scatterlist infrags[4]; ++ struct scatterlist outfrags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++encryptor(struct scatterlist *sg, void *data) ++{ ++ struct encryptor_desc *desc = data; ++ struct xdr_buf *outbuf = desc->outbuf; ++ struct page *in_page; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, ret; ++ int page_pos; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. */ ++ BUG_ON(desc->fragno > 3); ++ desc->infrags[desc->fragno] = *sg; ++ desc->outfrags[desc->fragno] = *sg; ++ ++ page_pos = desc->pos - outbuf->head[0].iov_len; ++ if (page_pos >= 0 && page_pos < outbuf->page_len) { ++ /* pages are not in place: */ ++ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; ++ in_page = desc->pages[i]; ++ } else { ++ in_page = sg->page; ++ } ++ desc->infrags[desc->fragno].page = in_page; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ desc->pos += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->outfrags[0].page = sg->page; ++ desc->outfrags[0].offset = sg->offset + sg->length - fraglen; ++ desc->outfrags[0].length = fraglen; ++ desc->infrags[0] = desc->outfrags[0]; ++ desc->infrags[0].page = in_page; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, ++ struct page **pages) ++{ ++ int ret; ++ struct encryptor_desc desc; ++ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.pos = offset; ++ desc.outbuf = buf; ++ desc.pages = pages; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ ++ ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); ++ return ret; ++} ++ ++EXPORT_SYMBOL(gss_encrypt_xdr_buf); ++ ++struct decryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ struct scatterlist frags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++decryptor(struct scatterlist *sg, void *data) ++{ ++ struct decryptor_desc *desc = data; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, ret; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. 
*/ ++ BUG_ON(desc->fragno > 3); ++ desc->frags[desc->fragno] = *sg; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->frags[0].page = sg->page; ++ desc->frags[0].offset = sg->offset + sg->length - fraglen; ++ desc->frags[0].length = fraglen; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) ++{ ++ struct decryptor_desc desc; ++ ++ /* XXXJBF: */ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); ++} ++ ++EXPORT_SYMBOL(gss_decrypt_xdr_buf); +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_seal.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c 2005-04-05 14:49:13.402691800 +0800 +@@ -70,24 +70,17 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-static inline int +-gss_krb5_padding(int blocksize, int length) { +- /* Most of the code is block-size independent but in practice we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); +-} ++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; + + u32 + krb5_make_token(struct krb5_ctx *ctx, int qop_req, +- struct xdr_buf *text, struct xdr_netobj *token, +- int toktype) ++ struct xdr_buf *text, struct xdr_netobj *token) + { + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; +- int blocksize = 0, tmsglen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; ++ u32 seq_send; + + dprintk("RPC: gss_krb5_seal\n"); + +@@ -111,21 +104,13 @@ + goto out_err; + } + +- if (toktype == KG_TOK_WRAP_MSG) { +- blocksize = crypto_tfm_alg_blocksize(ctx->enc); +- tmsglen = blocksize + text->len +- + gss_krb5_padding(blocksize, blocksize + text->len); +- } else { +- tmsglen = 0; +- } +- +- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); ++ token->len = g_token_size(&ctx->mech_used, 22); + + ptr = token->data; +- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); ++ g_make_token_header(&ctx->mech_used, 22, &ptr); + +- *ptr++ = (unsigned char) ((toktype>>8)&0xff); +- *ptr++ = (unsigned char) (toktype&0xff); ++ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; +@@ -133,17 +118,9 @@ + + *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); +- if (toktype == KG_TOK_WRAP_MSG) +- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX removing support for now */ +- goto out_err; +- } else { /* Sign only. 
*/ +- if (make_checksum(checksum_type, krb5_hdr, 8, text, +- &md5cksum)) ++ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) + goto out_err; +- } + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: +@@ -163,12 +140,14 @@ + + kfree(md5cksum.data); + ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, +- ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + +- ctx->seq_send++; +- + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); + out_err: + if (md5cksum.data) kfree(md5cksum.data); +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_pseudoflavors.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,237 +0,0 @@ +-/* +- * linux/net/sunrpc/gss_union.c +- * +- * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic code +- * +- * Copyright (c) 2001 The Regents of the University of Michigan. +- * All rights reserved. +- * +- * Andy Adamson +- * +- */ +- +-/* +- * Copyright 1993 by OpenVision Technologies, Inc. +- * +- * Permission to use, copy, modify, distribute, and sell this software +- * and its documentation for any purpose is hereby granted without fee, +- * provided that the above copyright notice appears in all copies and +- * that both that copyright notice and this permission notice appear in +- * supporting documentation, and that the name of OpenVision not be used +- * in advertising or publicity pertaining to distribution of the software +- * without specific, written prior permission. OpenVision makes no +- * representations about the suitability of this software for any +- * purpose. It is provided "as is" without express or implied warranty. +- * +- * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, +- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO +- * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR +- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF +- * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +- * PERFORMANCE OF THIS SOFTWARE. +- */ +- +-#include +-#include +-#include +-#include +-#include +- +-#ifdef RPC_DEBUG +-# define RPCDBG_FACILITY RPCDBG_AUTH +-#endif +- +-static LIST_HEAD(registered_triples); +-static spinlock_t registered_triples_lock = SPIN_LOCK_UNLOCKED; +- +-/* The following must be called with spinlock held: */ +-static struct sup_sec_triple * +-do_lookup_triple_by_pseudoflavor(u32 pseudoflavor) +-{ +- struct sup_sec_triple *pos, *triple = NULL; +- +- list_for_each_entry(pos, ®istered_triples, triples) { +- if (pos->pseudoflavor == pseudoflavor) { +- triple = pos; +- break; +- } +- } +- return triple; +-} +- +-/* XXX Need to think about reference counting of triples and of mechs. +- * Currently we do no reference counting of triples, and I think that's +- * probably OK given the reference counting on mechs, but there's probably +- * a better way to do all this. 
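The file deleted here kept a global list of (pseudoflavor, mech, qop, service) triples behind a single spinlock, with lookups open-coded under the lock and duplicate registrations refused. The shape of that registry pattern as a self-contained userspace sketch, with a pthread mutex standing in for the spinlock and struct triple pared down to the keys:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct triple {
	unsigned int pseudoflavor, qop, service;
	struct triple *next;
};

static struct triple *registered;
static pthread_mutex_t registered_lock = PTHREAD_MUTEX_INITIALIZER;

/* must be called with registered_lock held, like
 * do_lookup_triple_by_pseudoflavor() above */
static struct triple *do_lookup(unsigned int pseudoflavor)
{
	struct triple *t;

	for (t = registered; t; t = t->next)
		if (t->pseudoflavor == pseudoflavor)
			return t;
	return NULL;
}

static int register_triple(unsigned int pf, unsigned int qop, unsigned int svc)
{
	struct triple *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->pseudoflavor = pf;
	t->qop = qop;
	t->service = svc;
	pthread_mutex_lock(&registered_lock);
	if (do_lookup(pf)) {		/* refuse duplicates, as above */
		pthread_mutex_unlock(&registered_lock);
		free(t);
		return -1;
	}
	t->next = registered;
	registered = t;
	pthread_mutex_unlock(&registered_lock);
	return 0;
}

int main(void)
{
	printf("%d\n", register_triple(390003, 0, 1));	/* 0: registered */
	printf("%d\n", register_triple(390003, 0, 1));	/* -1: duplicate */
	return 0;
}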
*/ +- +-int +-gss_register_triple(u32 pseudoflavor, struct gss_api_mech *mech, +- u32 qop, u32 service) +-{ +- struct sup_sec_triple *triple; +- +- if (!(triple = kmalloc(sizeof(*triple), GFP_KERNEL))) { +- printk("Alloc failed in gss_register_triple"); +- goto err; +- } +- triple->pseudoflavor = pseudoflavor; +- triple->mech = gss_mech_get_by_OID(&mech->gm_oid); +- triple->qop = qop; +- triple->service = service; +- +- spin_lock(®istered_triples_lock); +- if (do_lookup_triple_by_pseudoflavor(pseudoflavor)) { +- printk(KERN_WARNING "RPC: Registered pseudoflavor %d again\n", +- pseudoflavor); +- goto err_unlock; +- } +- list_add(&triple->triples, ®istered_triples); +- spin_unlock(®istered_triples_lock); +- dprintk("RPC: registered pseudoflavor %d\n", pseudoflavor); +- +- return 0; +- +-err_unlock: +- kfree(triple); +- spin_unlock(®istered_triples_lock); +-err: +- return -1; +-} +- +-int +-gss_unregister_triple(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- if (!(triple = do_lookup_triple_by_pseudoflavor(pseudoflavor))) { +- spin_unlock(®istered_triples_lock); +- printk("Can't unregister unregistered pseudoflavor %d\n", +- pseudoflavor); +- return -1; +- } +- list_del(&triple->triples); +- spin_unlock(®istered_triples_lock); +- gss_mech_put(triple->mech); +- kfree(triple); +- return 0; +- +-} +- +-void +-print_sec_triple(struct xdr_netobj *oid,u32 qop,u32 service) +-{ +- dprintk("RPC: print_sec_triple:\n"); +- dprintk(" oid_len %d\n oid :\n",oid->len); +- print_hexl((u32 *)oid->data,oid->len,0); +- dprintk(" qop %d\n",qop); +- dprintk(" service %d\n",service); +-} +- +-/* Function: gss_get_cmp_triples +- * +- * Description: search sec_triples for a matching security triple +- * return pseudoflavor if match, else 0 +- * (Note that 0 is a valid pseudoflavor, but not for any gss pseudoflavor +- * (0 means auth_null), so this shouldn't cause confusion.) +- */ +-u32 +-gss_cmp_triples(u32 oid_len, char *oid_data, u32 qop, u32 service) +-{ +- struct sup_sec_triple *triple; +- u32 pseudoflavor = 0; +- struct xdr_netobj oid; +- +- oid.len = oid_len; +- oid.data = oid_data; +- +- dprintk("RPC: gss_cmp_triples\n"); +- print_sec_triple(&oid,qop,service); +- +- spin_lock(®istered_triples_lock); +- list_for_each_entry(triple, ®istered_triples, triples) { +- if((g_OID_equal(&oid, &triple->mech->gm_oid)) +- && (qop == triple->qop) +- && (service == triple->service)) { +- pseudoflavor = triple->pseudoflavor; +- break; +- } +- } +- spin_unlock(®istered_triples_lock); +- dprintk("RPC: gss_cmp_triples return %d\n", pseudoflavor); +- return pseudoflavor; +-} +- +-u32 +-gss_get_pseudoflavor(struct gss_ctx *ctx, u32 qop, u32 service) +-{ +- return gss_cmp_triples(ctx->mech_type->gm_oid.len, +- ctx->mech_type->gm_oid.data, +- qop, service); +-} +- +-/* Returns nonzero iff the given pseudoflavor is in the supported list. +- * (Note that without incrementing a reference count or anything, this +- * doesn't give any guarantees.) */ +-int +-gss_pseudoflavor_supported(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- return (triple ? 
1 : 0); +-} +- +-u32 +-gss_pseudoflavor_to_service(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- if (!triple) { +- dprintk("RPC: gss_pseudoflavor_to_service called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return 0; +- } +- return triple->service; +-} +- +-struct gss_api_mech * +-gss_pseudoflavor_to_mech(u32 pseudoflavor) { +- struct sup_sec_triple *triple; +- struct gss_api_mech *mech = NULL; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- if (triple) +- mech = gss_mech_get(triple->mech); +- else +- dprintk("RPC: gss_pseudoflavor_to_mech called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return mech; +-} +- +-int +-gss_pseudoflavor_to_mechOID(u32 pseudoflavor, struct xdr_netobj * oid) +-{ +- struct gss_api_mech *mech; +- +- mech = gss_pseudoflavor_to_mech(pseudoflavor); +- if (!mech) { +- dprintk("RPC: gss_pseudoflavor_to_mechOID called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return -1; +- } +- oid->len = mech->gm_oid.len; +- if (!(oid->data = kmalloc(oid->len, GFP_KERNEL))) +- return -1; +- memcpy(oid->data, mech->gm_oid.data, oid->len); +- gss_mech_put(mech); +- return 0; +-} +Index: linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/svcauth_gss.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c 2005-04-05 14:49:13.407691040 +0800 +@@ -37,6 +37,7 @@ + * + */ + ++#include + #include + #include + #include +@@ -78,7 +79,6 @@ + + static struct cache_head *rsi_table[RSI_HASHMAX]; + static struct cache_detail rsi_cache; +-static struct rsi *rsi_lookup(struct rsi *item, int set); + + static void rsi_free(struct rsi *rsii) + { +@@ -125,38 +125,6 @@ + return dup_to_netobj(dst, src->data, src->len); + } + +-static inline void rsi_init(struct rsi *new, struct rsi *item) +-{ +- new->out_handle.data = NULL; +- new->out_handle.len = 0; +- new->out_token.data = NULL; +- new->out_token.len = 0; +- new->in_handle.len = item->in_handle.len; +- item->in_handle.len = 0; +- new->in_token.len = item->in_token.len; +- item->in_token.len = 0; +- new->in_handle.data = item->in_handle.data; +- item->in_handle.data = NULL; +- new->in_token.data = item->in_token.data; +- item->in_token.data = NULL; +-} +- +-static inline void rsi_update(struct rsi *new, struct rsi *item) +-{ +- BUG_ON(new->out_handle.data || new->out_token.data); +- new->out_handle.len = item->out_handle.len; +- item->out_handle.len = 0; +- new->out_token.len = item->out_token.len; +- item->out_token.len = 0; +- new->out_handle.data = item->out_handle.data; +- item->out_handle.data = NULL; +- new->out_token.data = item->out_token.data; +- item->out_token.data = NULL; +- +- new->major_status = item->major_status; +- new->minor_status = item->minor_status; +-} +- + static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +@@ -168,6 +136,75 @@ + (*bpp)[-1] = '\n'; + } + ++static inline int ++gssd_reply(struct rsi *item) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ write_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if 
(rsi_match(tmp, item)) { ++ cache_get(&tmp->h); ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ if (test_bit(CACHE_VALID, &tmp->h.flags)) { ++ write_unlock(&rsi_cache.hash_lock); ++ rsi_put(&tmp->h, &rsi_cache); ++ return -EINVAL; ++ } ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *hp; ++ *hp = &item->h; ++ rsi_cache.entries++; ++ set_bit(CACHE_VALID, &item->h.flags); ++ item->h.last_refresh = get_seconds(); ++ write_unlock(&rsi_cache.hash_lock); ++ cache_fresh(&rsi_cache, &tmp->h, 0); ++ rsi_put(&tmp->h, &rsi_cache); ++ return 0; ++ } ++ } ++ write_unlock(&rsi_cache.hash_lock); ++ return -EINVAL; ++} ++ ++static inline struct rsi * ++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ read_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if (rsi_match(tmp, item)) { ++ if (!test_bit(CACHE_VALID, &tmp->h.flags)) { ++ read_unlock(&rsi_cache.hash_lock); ++ return NULL; ++ } ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ read_unlock(&rsi_cache.hash_lock); ++ return tmp; ++ } ++ } ++ cache_get(&item->h); ++ item->h.next = *head; ++ *head = &item->h; ++ rsi_cache.entries++; ++ read_unlock(&rsi_cache.hash_lock); ++ cache_get(&item->h); ++ if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle)) ++ return NULL; ++ return item; ++} + + static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +@@ -176,17 +213,22 @@ + char *buf = mesg; + char *ep; + int len; +- struct rsi rsii, *rsip = NULL; ++ struct rsi *rsii; + time_t expiry; + int status = -EINVAL; + +- memset(&rsii, 0, sizeof(rsii)); ++ rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); ++ if (!rsii) ++ return -ENOMEM; ++ memset(rsii, 0, sizeof(*rsii)); ++ cache_init(&rsii->h); ++ + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_handle, buf, len)) ++ if (dup_to_netobj(&rsii->in_handle, buf, len)) + goto out; + + /* token */ +@@ -195,10 +237,9 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_token, buf, len)) ++ if (dup_to_netobj(&rsii->in_token, buf, len)) + goto out; + +- rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; +@@ -212,13 +253,13 @@ + if (len == 0) { + goto out; + } else { +- rsii.major_status = simple_strtoul(buf, &ep, 10); ++ rsii->major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; +- rsii.minor_status = simple_strtoul(buf, &ep, 10); ++ rsii->minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + +@@ -227,7 +268,7 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_handle, buf, len)) ++ if (dup_to_netobj(&rsii->out_handle, buf, len)) + goto out; + + /* out_token */ +@@ -236,16 +277,14 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_token, buf, len)) ++ if (dup_to_netobj(&rsii->out_token, buf, len)) + goto out; + } +- rsii.h.expiry_time = expiry; +- rsip = rsi_lookup(&rsii, 1); +- status = 0; ++ rsii->h.expiry_time = expiry; ++ status = gssd_reply(rsii); + out: +- rsi_free(&rsii); +- if (rsip) +- rsi_put(&rsip->h, &rsi_cache); ++ if (rsii) ++ rsi_put(&rsii->h, &rsi_cache); + return status; + } + +@@ -258,8 +297,6 @@ + .cache_parse = 
rsi_parse, + }; + +-static DefineSimpleCacheLookup(rsi, 0) +- + /* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. +@@ -292,7 +329,6 @@ + + static struct cache_head *rsc_table[RSC_HASHMAX]; + static struct cache_detail rsc_cache; +-static struct rsc *rsc_lookup(struct rsc *item, int set); + + static void rsc_free(struct rsc *rsci) + { +@@ -325,26 +361,46 @@ + return netobj_equal(&new->handle, &tmp->handle); + } + +-static inline void +-rsc_init(struct rsc *new, struct rsc *tmp) ++static struct rsc *rsc_lookup(struct rsc *item, int set) + { +- new->handle.len = tmp->handle.len; +- tmp->handle.len = 0; +- new->handle.data = tmp->handle.data; +- tmp->handle.data = NULL; +- new->mechctx = NULL; +- new->cred.cr_group_info = NULL; +-} +- +-static inline void +-rsc_update(struct rsc *new, struct rsc *tmp) +-{ +- new->mechctx = tmp->mechctx; +- tmp->mechctx = NULL; +- memset(&new->seqdata, 0, sizeof(new->seqdata)); +- spin_lock_init(&new->seqdata.sd_lock); +- new->cred = tmp->cred; +- tmp->cred.cr_group_info = NULL; ++ struct rsc *tmp = NULL; ++ struct cache_head **hp, **head; ++ head = &rsc_cache.hash_table[rsc_hash(item)]; ++ ++ if (set) ++ write_lock(&rsc_cache.hash_lock); ++ else ++ read_lock(&rsc_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsc, h); ++ if (!rsc_match(tmp, item)) ++ continue; ++ cache_get(&tmp->h); ++ if (!set) ++ goto out_noset; ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ rsc_put(&tmp->h, &rsc_cache); ++ goto out_set; ++ } ++ /* Didn't find anything */ ++ if (!set) ++ goto out_nada; ++ rsc_cache.entries++; ++out_set: ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *head; ++ *head = &item->h; ++ write_unlock(&rsc_cache.hash_lock); ++ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time); ++ cache_get(&item->h); ++ return item; ++out_nada: ++ tmp = NULL; ++out_noset: ++ read_unlock(&rsc_cache.hash_lock); ++ return tmp; + } + + static int rsc_parse(struct cache_detail *cd, +@@ -353,19 +409,22 @@ + /* contexthandle expiry [ uid gid N mechname ...mechdata... 
] */ + char *buf = mesg; + int len, rv; +- struct rsc rsci, *rscp = NULL; ++ struct rsc *rsci, *res = NULL; + time_t expiry; + int status = -EINVAL; + +- memset(&rsci, 0, sizeof(rsci)); ++ rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); ++ if (!rsci) ++ return -ENOMEM; ++ memset(rsci, 0, sizeof(*rsci)); ++ cache_init(&rsci->h); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsci.handle, buf, len)) ++ if (dup_to_netobj(&rsci->handle, buf, len)) + goto out; + +- rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; +@@ -373,26 +432,26 @@ + goto out; + + /* uid, or NEGATIVE */ +- rv = get_int(&mesg, &rsci.cred.cr_uid); ++ rv = get_int(&mesg, &rsci->cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) +- set_bit(CACHE_NEGATIVE, &rsci.h.flags); ++ set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else { + int N, i; + struct gss_api_mech *gm; + struct xdr_netobj tmp_buf; + + /* gid */ +- if (get_int(&mesg, &rsci.cred.cr_gid)) ++ if (get_int(&mesg, &rsci->cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; +- rsci.cred.cr_group_info = groups_alloc(N); +- if (rsci.cred.cr_group_info == NULL) ++ rsci->cred.cr_group_info = groups_alloc(N); ++ if (rsci->cred.cr_group_info == NULL) + goto out; + + /* gid's */ +@@ -401,7 +460,7 @@ + gid_t gid; + if (get_int(&mesg, &gid)) + goto out; +- GROUP_AT(rsci.cred.cr_group_info, i) = gid; ++ GROUP_AT(rsci->cred.cr_group_info, i) = gid; + } + + /* mech name */ +@@ -422,19 +481,21 @@ + } + tmp_buf.len = len; + tmp_buf.data = buf; +- if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) { ++ if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) { + gss_mech_put(gm); + goto out; + } + gss_mech_put(gm); + } +- rsci.h.expiry_time = expiry; +- rscp = rsc_lookup(&rsci, 1); ++ rsci->h.expiry_time = expiry; ++ spin_lock_init(&rsci->seqdata.sd_lock); ++ res = rsc_lookup(rsci, 1); ++ rsc_put(&res->h, &rsc_cache); ++ rsci = NULL; + status = 0; + out: +- rsc_free(&rsci); +- if (rscp) +- rsc_put(&rscp->h, &rsc_cache); ++ if (rsci) ++ rsc_put(&rsci->h, &rsc_cache); + return status; + } + +@@ -446,19 +507,14 @@ + .cache_parse = rsc_parse, + }; + +-static DefineSimpleCacheLookup(rsc, 0); +- + struct rsc * + gss_svc_searchbyctx(struct xdr_netobj *handle) + { + struct rsc rsci; + struct rsc *found; + +- memset(&rsci, 0, sizeof(rsci)); +- if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) +- return NULL; ++ rsci.handle = *handle; + found = rsc_lookup(&rsci, 0); +- rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) +@@ -721,6 +777,45 @@ + return stat; + } + ++static int ++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) ++{ ++ int stat = -EINVAL; ++ int out_offset; ++ u32 * lenp; ++ u32 priv_len, maj_stat; ++ int saved_len; ++ ++ lenp = buf->head[0].iov_base; ++ priv_len = ntohl(svc_getu32(&buf->head[0])); ++ if (priv_len > buf->len) /* XXXJBF: wrong check */ ++ goto out; ++ /* XXXJBF: bizarre hack: to handle revisits (and not decrypt ++ * twice), the first time through we write an offset ++ * telling us where to skip to find the already-decrypted data */ ++ if (rqstp->rq_deferred) { ++ buf->head[0].iov_base += priv_len; ++ buf->head[0].iov_len -= priv_len; ++ return 0; ++ } ++ saved_len = buf->len; /* XXX HACK */ ++ buf->len = priv_len; ++ maj_stat = gss_unwrap(ctx, NULL, 0, buf, &out_offset); ++ buf->len 
= saved_len; ++ buf->head[0].iov_base += out_offset; ++ buf->head[0].iov_len -= out_offset; ++ BUG_ON(buf->head[0].iov_len <= 0); ++ if (maj_stat != GSS_S_COMPLETE) ++ goto out; ++ if (ntohl(svc_getu32(&buf->head[0])) != seq) ++ goto out; ++ /* XXXJBF: see "bizarre hack", above. */ ++ *lenp = htonl(out_offset + 4); ++ stat = 0; ++out: ++ return stat; ++} ++ + struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; +@@ -730,6 +825,19 @@ + struct rsc *rsci; + }; + ++static int ++svcauth_gss_set_client(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *svcdata = rqstp->rq_auth_data; ++ struct rsc *rsci = svcdata->rsci; ++ struct rpc_gss_wire_cred *gc = &svcdata->clcred; ++ ++ rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); ++ if (rqstp->rq_client == NULL) ++ return SVC_DENIED; ++ return SVC_OK; ++} ++ + /* + * Accept an rpcsec packet. + * If context establishment, punt to user space +@@ -748,7 +856,7 @@ + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; +- struct rsi *rsip, rsikey; ++ struct rsi *rsip, *rsikey = NULL; + u32 *rpcstart; + u32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; +@@ -841,30 +949,23 @@ + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + goto auth_err; +- memset(&rsikey, 0, sizeof(rsikey)); +- if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) ++ rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL); ++ if (!rsikey) ++ goto drop; ++ memset(rsikey, 0, sizeof(*rsikey)); ++ cache_init(&rsikey->h); ++ if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx)) + goto drop; + *authp = rpc_autherr_badverf; +- if (svc_safe_getnetobj(argv, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (svc_safe_getnetobj(argv, &tmpobj)) + goto auth_err; +- } +- if (dup_netobj(&rsikey.in_token, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (dup_netobj(&rsikey->in_token, &tmpobj)) + goto drop; +- } + +- rsip = rsi_lookup(&rsikey, 0); +- rsi_free(&rsikey); +- if (!rsip) { +- goto drop; +- } +- switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { +- case -EAGAIN: ++ rsip = gssd_upcall(rsikey, rqstp); ++ if (!rsip) + goto drop; +- case -ENOENT: +- goto drop; +- case 0: ++ else { + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + goto drop; +@@ -893,11 +994,6 @@ + svc_putu32(resv, rpc_success); + goto complete; + case RPC_GSS_PROC_DATA: +- *authp = rpc_autherr_badcred; +- rqstp->rq_client = +- find_gss_auth_domain(rsci->mechctx, gc->gc_svc); +- if (rqstp->rq_client == NULL) +- goto auth_err; + *authp = rpcsec_gsserr_ctxproblem; + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; +@@ -911,6 +1007,15 @@ + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; ++ /* placeholders for length and seq. number: */ ++ svcdata->body_start = resv->iov_base + resv->iov_len; ++ svc_putu32(resv, 0); ++ svc_putu32(resv, 0); ++ break; ++ case RPC_GSS_SVC_PRIVACY: ++ if (unwrap_priv_data(rqstp, &rqstp->rq_arg, ++ gc->gc_seq, rsci->mechctx)) ++ goto auth_err; + svcdata->rsci = rsci; + cache_get(&rsci->h); + /* placeholders for length and seq. 
number: */ +@@ -918,11 +1023,11 @@ + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; +- case RPC_GSS_SVC_PRIVACY: +- /* currently unsupported */ + default: + goto auth_err; + } ++ svcdata->rsci = rsci; ++ cache_get(&rsci->h); + ret = SVC_OK; + goto out; + } +@@ -937,13 +1042,15 @@ + drop: + ret = SVC_DROP; + out: ++ if (rsikey) ++ rsi_put(&rsikey->h, &rsi_cache); + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return ret; + } + +-static int +-svcauth_gss_release(struct svc_rqst *rqstp) ++static inline int ++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) + { + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; +@@ -955,10 +1062,160 @@ + int integ_offset, integ_len; + int stat = -EINVAL; + ++ p = gsd->body_start; ++ gsd->body_start = NULL; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! */ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; ++ integ_len = resbuf->len - integ_offset; ++ BUG_ON(integ_len % 4); ++ *p++ = htonl(integ_len); ++ *p++ = htonl(gc->gc_seq); ++ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, ++ integ_len)) ++ BUG(); ++ if (resbuf->page_len == 0 ++ && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE ++ < PAGE_SIZE) { ++ BUG_ON(resbuf->tail[0].iov_len); ++ /* Use head for everything */ ++ resv = &resbuf->head[0]; ++ } else if (resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ svc_take_page(rqstp); ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ resv = &resbuf->tail[0]; ++ } else { ++ resv = &resbuf->tail[0]; ++ } ++ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; ++ if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ goto out_err; ++ svc_putu32(resv, htonl(mic.len)); ++ memset(mic.data + mic.len, 0, ++ round_up_to_quad(mic.len) - mic.len); ++ resv->iov_len += XDR_QUADLEN(mic.len) << 2; ++ /* not strictly required: */ ++ resbuf->len += XDR_QUADLEN(mic.len) << 2; ++ BUG_ON(resv->iov_len > PAGE_SIZE); ++out: ++ stat = 0; ++out_err: ++ return stat; ++} ++ ++/* XXXJBF: Look for chances to share code with client */ ++/* XXXJBF: Do we need to preallocate these pages somehow? E.g. see ++ * buffer size calculations in svcsock.c */ ++/* XXXJBF: how does reference counting on pages work? 
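The MIC appended by svcauth_gss_wrap_resp_integ() above is padded out to a four-byte XDR boundary, with the slack zeroed so the wire bytes are deterministic. A worked check of that arithmetic, assuming XDR_QUADLEN(n) is ((n) + 3) >> 2 as in the SunRPC headers and round_up_to_quad() is the equivalent byte count:

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define XDR_QUADLEN(n)      (((n) + 3) >> 2)	/* assumed, per sunrpc/xdr.h */
#define round_up_to_quad(n) (XDR_QUADLEN(n) << 2)

int main(void)
{
	unsigned char buf[32];
	unsigned int mic_len = 5;	/* a 5-byte token, for illustration */

	memset(buf, 0xAA, mic_len);	/* pretend MIC bytes */
	/* zero the slack, as the memset after gss_get_mic() does above: */
	memset(buf + mic_len, 0, round_up_to_quad(mic_len) - mic_len);

	assert(round_up_to_quad(5) == 8);	/* 5 -> next quad is 8 */
	assert(round_up_to_quad(8) == 8);	/* already aligned */
	printf("padded MIC occupies %u bytes\n", round_up_to_quad(mic_len));
	return 0;
}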
*/ ++static struct page ** ++svc_alloc_enc_pages(struct xdr_buf *buf) ++{ ++ struct page **ret; ++ int last, i; ++ ++ if (buf->page_len == 0) ++ return NULL; ++ BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT); ++ last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL); ++ if (!ret) ++ goto out; ++ for (i = 0; i<= last; i++) { ++ ret[i] = alloc_page(GFP_KERNEL); ++ if (ret[i] == NULL) ++ goto out_free; ++ } ++out: ++ return ret; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(ret[i]); ++ } ++ return NULL; ++} ++ ++static inline int ++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ struct page **inpages; ++ u32 *p; ++ int offset, *len; ++ int pad; ++ int stat = -EINVAL; ++ ++ p = gsd->body_start; ++ gsd->body_start = NULL; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! */ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ len = p++; ++ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; ++ *p++ = htonl(gc->gc_seq); ++ stat = -ENOMEM; ++ inpages = resbuf->pages; ++ /* XXXJBF: huge memory leaks here: allocated pages probably aren't ++ * freed, and neither is memory used to hold page array. */ ++ resbuf->pages = svc_alloc_enc_pages(resbuf); ++ if (resbuf->page_len && !resbuf->pages) ++ goto out_err; /* XXX sleep and retry? Reserve ahead of time ++ and BUG_ON? */ ++ if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ {int i = svc_take_page(rqstp); BUG_ON(i); } ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ } ++ /* XXX: Will svc code attempt to free stuff in xdr_buf->pages? ++ * Or can we leave it in any old state on error?? */ ++ stat = -EINVAL; ++ if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset, ++ resbuf, inpages)) ++ goto out_err; ++ *len = htonl(resbuf->len - offset); ++ pad = 3 - ((resbuf->len - offset - 1)&3); ++ p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len); ++ memset(p, 0, pad); ++ resbuf->tail[0].iov_len += pad; ++out: ++ return 0; ++out_err: ++ return stat; ++} ++ ++static int ++svcauth_gss_release(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ int stat = -EINVAL; ++ + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ +- if (gsd->body_start == 0) ++ if (gsd->body_start == NULL) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len +@@ -967,55 +1224,15 @@ + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: +- p = gsd->body_start; +- gsd->body_start = NULL; +- /* move accept_stat to right place: */ +- memcpy(p, p + 2, 4); +- /* don't wrap in failure case: */ +- /* Note: counting on not getting here if call was not even +- * accepted! 
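svc_alloc_enc_pages() just above follows the usual all-or-nothing rollback idiom: allocate the pointer array, then each page, and on a mid-loop failure walk the index back down to free what was already handed out (note the array itself is not freed on that path, matching the leak XXX nearby). The idiom in plain userspace C, with malloc standing in for alloc_page:

#include <stdlib.h>

/* Allocate n fixed-size buffers, all or nothing; on a mid-loop failure,
 * walk i back down and free what was already allocated. */
static void **alloc_bufs(int n, size_t sz)
{
	void **ret;
	int i;

	ret = malloc(n * sizeof(*ret));
	if (!ret)
		return NULL;
	for (i = 0; i < n; i++) {
		ret[i] = malloc(sz);
		if (!ret[i])
			goto out_free;
	}
	return ret;
out_free:
	for (i--; i >= 0; i--)
		free(ret[i]);
	free(ret);	/* unlike the patch's out_free, release the array too */
	return NULL;
}

int main(void)
{
	void **bufs = alloc_bufs(4, 4096);
	int i;

	if (bufs) {
		for (i = 0; i < 4; i++)
			free(bufs[i]);
		free(bufs);
	}
	return 0;
}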
*/ +- if (*p != rpc_success) { +- resbuf->head[0].iov_len -= 2 * 4; +- goto out; +- } +- p++; +- integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; +- integ_len = resbuf->len - integ_offset; +- BUG_ON(integ_len % 4); +- *p++ = htonl(integ_len); +- *p++ = htonl(gc->gc_seq); +- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, +- integ_len)) +- BUG(); +- if (resbuf->page_len == 0 +- && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE +- < PAGE_SIZE) { +- BUG_ON(resbuf->tail[0].iov_len); +- /* Use head for everything */ +- resv = &resbuf->head[0]; +- } else if (resbuf->tail[0].iov_base == NULL) { +- /* copied from nfsd4_encode_read */ +- svc_take_page(rqstp); +- resbuf->tail[0].iov_base = page_address(rqstp +- ->rq_respages[rqstp->rq_resused-1]); +- rqstp->rq_restailpage = rqstp->rq_resused-1; +- resbuf->tail[0].iov_len = 0; +- resv = &resbuf->tail[0]; +- } else { +- resv = &resbuf->tail[0]; +- } +- mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; +- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ stat = svcauth_gss_wrap_resp_integ(rqstp); ++ if (stat) + goto out_err; +- svc_putu32(resv, htonl(mic.len)); +- memset(mic.data + mic.len, 0, +- round_up_to_quad(mic.len) - mic.len); +- resv->iov_len += XDR_QUADLEN(mic.len) << 2; +- /* not strictly required: */ +- resbuf->len += XDR_QUADLEN(mic.len) << 2; +- BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: ++ stat = svcauth_gss_wrap_resp_priv(rqstp); ++ if (stat) ++ goto out_err; ++ break; + default: + goto out_err; + } +@@ -1052,6 +1269,7 @@ + .accept = svcauth_gss_accept, + .release = svcauth_gss_release, + .domain_release = svcauth_gss_domain_release, ++ .set_client = svcauth_gss_set_client, + }; + + int +Index: linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/sunrpcgss_syms.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,37 +0,0 @@ +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +- +-/* svcauth_gss.c: */ +-EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor); +- +-/* registering gss mechanisms to the mech switching code: */ +-EXPORT_SYMBOL(gss_mech_register); +-EXPORT_SYMBOL(gss_mech_unregister); +-EXPORT_SYMBOL(gss_mech_get); +-EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor); +-EXPORT_SYMBOL(gss_mech_get_by_name); +-EXPORT_SYMBOL(gss_mech_put); +-EXPORT_SYMBOL(gss_pseudoflavor_to_service); +-EXPORT_SYMBOL(gss_service_to_auth_domain_name); +- +-/* generic functionality in gss code: */ +-EXPORT_SYMBOL(g_make_token_header); +-EXPORT_SYMBOL(g_verify_token_header); +-EXPORT_SYMBOL(g_token_size); +-EXPORT_SYMBOL(make_checksum); +-EXPORT_SYMBOL(krb5_encrypt); +-EXPORT_SYMBOL(krb5_decrypt); +- +-/* debug */ +-EXPORT_SYMBOL(print_hexl); +Index: linux-2.6.10/net/sunrpc/auth_gss/Makefile +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/Makefile 2004-12-25 05:34:33.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/Makefile 2005-04-05 14:49:13.408690888 +0800 +@@ -10,7 +10,7 @@ + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o + + obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +Index: 
linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_mech.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c 2005-04-05 14:49:13.400692104 +0800 +@@ -182,6 +182,7 @@ + kfree(kctx); + } + ++/* XXX the following wrappers have become pointless; kill them. */ + static u32 + gss_verify_mic_kerberos(struct gss_ctx *ctx, + struct xdr_buf *message, +@@ -191,8 +192,7 @@ + int qop_state; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, +- KG_TOK_MIC_MSG); ++ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); + if (!maj_stat && qop_state) + *qstate = qop_state; + +@@ -208,7 +208,7 @@ + u32 err = 0; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); ++ err = krb5_make_token(kctx, qop, message, mic_token); + + dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); + +@@ -219,6 +219,8 @@ + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, ++ .gss_wrap = gss_wrap_kerberos, ++ .gss_unwrap = gss_unwrap_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + }; + +@@ -233,6 +235,11 @@ + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, ++ [2] = { ++ .pseudoflavor = RPC_AUTH_GSS_KRB5P, ++ .service = RPC_GSS_SVC_PRIVACY, ++ .name = "krb5p", ++ }, + }; + + static struct gss_api_mech gss_kerberos_mech = { +Index: linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/auth_gss.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c 2005-04-05 14:49:13.404691496 +0800 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -480,12 +481,14 @@ + if (!cred) + goto err; + if (gss_err) +- cred->cr_flags |= RPCAUTH_CRED_DEAD; ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else + gss_cred_set_ctx(cred, ctx); + spin_lock(&gss_auth->lock); + gss_msg = __gss_find_upcall(gss_auth, acred.uid); + if (gss_msg) { ++ if (gss_err) ++ gss_msg->msg.errno = -EACCES; + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); +@@ -740,7 +743,9 @@ + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, + &verf_buf, &mic); +- if(maj_stat != 0){ ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) { ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ } else if (maj_stat != 0) { + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; + } +@@ -779,6 +784,7 @@ + struct xdr_netobj mic; + u32 flav,len; + u32 service; ++ u32 maj_stat; + + dprintk("RPC: %4u gss_validate\n", task->tk_pid); + +@@ -794,8 +800,11 @@ + mic.data = (u8 *)p; + mic.len = len; + +- if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state)) +- goto out_bad; ++ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ if (maj_stat) ++ goto out_bad; + service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type, + gss_cred->gc_flavor); + switch (service) { +@@ -807,6 +816,11 @@ + /* verifier data, flavor, length, length, sequence number: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; + break; ++ case 
RPC_GSS_SVC_PRIVACY: ++ /* XXXJBF: Ugh. Going for a wild overestimate. ++ * Need some info from krb5 layer? */ ++ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32; ++ break; + default: + goto out_bad; + } +@@ -821,11 +835,10 @@ + } + + static inline int +-gss_wrap_req_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t encode, void *rqstp, u32 *p, void *obj) ++gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) + { +- struct rpc_rqst *req = (struct rpc_rqst *)rqstp; +- struct xdr_buf *snd_buf = &req->rq_snd_buf; ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + u32 *integ_len = NULL; + struct xdr_netobj mic; +@@ -836,7 +849,7 @@ + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; +- *p++ = htonl(req->rq_seqno); ++ *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) +@@ -848,7 +861,7 @@ + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ +- if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; +@@ -858,7 +871,9 @@ + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, &integ_buf, &mic); + status = -EIO; /* XXX? */ +- if (maj_stat) ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ else if (maj_stat) + return status; + q = xdr_encode_opaque(p, NULL, mic.len); + +@@ -868,6 +883,112 @@ + return 0; + } + ++static void ++priv_release_snd_buf(struct rpc_rqst *rqstp) ++{ ++ int i; ++ ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) ++ __free_page(rqstp->rq_enc_pages[i]); ++ kfree(rqstp->rq_enc_pages); ++} ++ ++static int ++alloc_enc_pages(struct rpc_rqst *rqstp) ++{ ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ int first, last, i; ++ ++ if (snd_buf->page_len == 0) { ++ rqstp->rq_enc_pages_num = 0; ++ return 0; ++ } ++ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ rqstp->rq_enc_pages_num = last - first + 1 + 1; ++ rqstp->rq_enc_pages ++ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), ++ GFP_NOFS); ++ if (!rqstp->rq_enc_pages) ++ goto out; ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) { ++ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); ++ if (rqstp->rq_enc_pages[i] == NULL) ++ goto out_free; ++ } ++ rqstp->rq_release_snd_buf = priv_release_snd_buf; ++ return 0; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(rqstp->rq_enc_pages[i]); ++ } ++out: ++ return -EAGAIN; ++} ++ ++static inline int ++gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) ++{ ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ u32 offset; ++ u32 maj_stat; ++ int status; ++ u32 *opaque_len; ++ struct page **inpages; ++ int first; ++ int pad; ++ struct kvec *iov; ++ char *tmp; ++ ++ opaque_len = p++; ++ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; ++ *p++ = htonl(rqstp->rq_seqno); ++ ++ status = encode(rqstp, p, obj); ++ if (status) ++ return status; ++ ++ status = alloc_enc_pages(rqstp); ++ if (status) ++ return status; ++ /* XXXJBF: Oops! Do we need rq_enc_pages really any more?? 
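alloc_enc_pages() above sizes the replacement page array from byte offsets: first and last are the page indices spanned by [page_base, page_base + page_len), so last - first + 1 pages cover the data, plus one spare page that the privacy path uses to rehome the tail. A worked check of that arithmetic for page_len > 0, assuming 4 KiB pages:

#include <assert.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

static int enc_pages_needed(unsigned int page_base, unsigned int page_len)
{
	int first = page_base >> PAGE_SHIFT;
	int last = (page_base + page_len - 1) >> PAGE_SHIFT;

	return last - first + 1 + 1;	/* +1 spare page for the tail */
}

int main(void)
{
	assert(enc_pages_needed(0, 4096) == 2);		/* one data page + spare */
	assert(enc_pages_needed(100, 4096) == 3);	/* data straddles two pages */
	assert(enc_pages_needed(4000, 200) == 3);	/* offset near a page end */
	return 0;
}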
*/ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ inpages = snd_buf->pages + first; ++ snd_buf->pages = rqstp->rq_enc_pages; ++ snd_buf->page_base -= first << PAGE_CACHE_SHIFT; ++ /* XXX?: tail needs to be separate if we want to be able to expand ++ * the head (since it's often put right after the head). But is ++ * expanding the head safe in any case? */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { ++ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); ++ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); ++ snd_buf->tail[0].iov_base = tmp; ++ } ++ maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, ++ snd_buf, inpages); ++ status = -EIO; /* XXX? */ ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ else if (maj_stat) ++ return status; ++ ++ *opaque_len = htonl(snd_buf->len - offset); ++ /* guess whether we're in the head or the tail: */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ iov = snd_buf->tail; ++ else ++ iov = snd_buf->head; ++ p = iov->iov_base + iov->iov_len; ++ pad = 3 - ((snd_buf->len - offset - 1) & 3); ++ memset(p, 0, pad); ++ iov->iov_len += pad; ++ snd_buf->len += pad; ++ ++ return 0; ++} ++ + static int + gss_wrap_req(struct rpc_task *task, + kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +@@ -894,9 +1015,13 @@ + status = encode(rqstp, p, obj); + goto out; + case RPC_GSS_SVC_INTEGRITY: +- status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj); ++ status = gss_wrap_req_integ(cred, ctx, encode, ++ rqstp, p, obj); + goto out; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_wrap_req_priv(cred, ctx, encode, ++ rqstp, p, obj); ++ goto out; + default: + goto out; + } +@@ -907,11 +1032,10 @@ + } + + static inline int +-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t decode, void *rqstp, u32 **p, void *obj) ++gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ struct rpc_rqst *rqstp, u32 **p) + { +- struct rpc_rqst *req = (struct rpc_rqst *)rqstp; +- struct xdr_buf *rcv_buf = &req->rq_rcv_buf; ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; +@@ -926,7 +1050,7 @@ + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; +- if (ntohl(*(*p)++) != req->rq_seqno) ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, +@@ -938,11 +1062,44 @@ + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, + &mic, NULL); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ if (maj_stat != GSS_S_COMPLETE) ++ return status; ++ return 0; ++} ++ ++static inline int ++gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ struct rpc_rqst *rqstp, u32 **p) ++{ ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; ++ u32 offset, out_offset; ++ u32 opaque_len; ++ u32 maj_stat; ++ int status = -EIO; ++ ++ opaque_len = ntohl(*(*p)++); ++ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; ++ if (offset + opaque_len > rcv_buf->len) ++ return status; ++ /* remove padding: */ ++ rcv_buf->len = offset + opaque_len; ++ ++ maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, ++ offset, rcv_buf, &out_offset); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; ++ *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset); ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) ++ 
return status; ++ + return 0; + } + ++ + static int + gss_unwrap_resp(struct rpc_task *task, + kxdrproc_t decode, void *rqstp, u32 *p, void *obj) +@@ -962,12 +1119,16 @@ + case RPC_GSS_SVC_NONE: + goto out_decode; + case RPC_GSS_SVC_INTEGRITY: +- status = gss_unwrap_resp_integ(ctx, decode, +- rqstp, &p, obj); ++ status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); ++ if (status) ++ goto out; ++ break; ++ + default: + goto out; + } +Index: linux-2.6.10/net/sunrpc/svc.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svc.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svc.c 2005-04-05 14:49:13.409690736 +0800 +@@ -264,6 +264,7 @@ + u32 dir, prog, vers, proc, + auth_stat, rpc_stat; + int auth_res; ++ u32 *accept_statp; + + rpc_stat = rpc_success; + +@@ -299,6 +300,9 @@ + if (vers != 2) /* RPC version number */ + goto err_bad_rpc; + ++ /* Save position in case we later decide to reject: */ ++ accept_statp = resv->iov_base + resv->iov_len; ++ + svc_putu32(resv, xdr_zero); /* ACCEPT */ + + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ +@@ -311,10 +315,12 @@ + * We do this before anything else in order to get a decent + * auth verifier. + */ +- if (progp->pg_authenticate != NULL) +- auth_res = progp->pg_authenticate(rqstp, &auth_stat); +- else +- auth_res = svc_authenticate(rqstp, &auth_stat); ++ auth_res = svc_authenticate(rqstp, &auth_stat); ++ /* Also give the program a chance to reject this call: */ ++ if (auth_res == SVC_OK) { ++ auth_stat = rpc_autherr_badcred; ++ auth_res = progp->pg_authenticate(rqstp); ++ } + switch (auth_res) { + case SVC_OK: + break; +@@ -437,7 +443,8 @@ + err_bad_auth: + dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + serv->sv_stats->rpcbadauth++; +- resv->iov_len -= 4; ++ /* Restore write pointer to location of accept status: */ ++ xdr_ressize_check(rqstp, accept_statp); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ +Index: linux-2.6.10/net/sunrpc/sched.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/sched.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/sched.c 2005-04-05 14:49:13.391693472 +0800 +@@ -41,13 +41,7 @@ + + static void __rpc_default_timer(struct rpc_task *task); + static void rpciod_killall(void); +- +-/* +- * When an asynchronous RPC task is activated within a bottom half +- * handler, or while executing another RPC task, it is put on +- * schedq, and rpciod is woken up. +- */ +-static RPC_WAITQ(schedq, "schedq"); ++static void rpc_async_schedule(void *); + + /* + * RPC tasks that create another task (e.g. for contacting the portmapper) +@@ -68,26 +62,18 @@ + /* + * rpciod-related stuff + */ +-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle); +-static DECLARE_COMPLETION(rpciod_killer); + static DECLARE_MUTEX(rpciod_sema); + static unsigned int rpciod_users; +-static pid_t rpciod_pid; +-static int rpc_inhibit; ++static struct workqueue_struct *rpciod_workqueue; + + /* +- * Spinlock for wait queues. Access to the latter also has to be +- * interrupt-safe in order to allow timers to wake up sleeping tasks. +- */ +-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED; +-/* + * Spinlock for other critical sections of code. 
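The sched.c rework that begins here retires the single global rpc_queue_lock: each rpc_wait_queue carries its own lock (see the spin_lock_init(&queue->lock) added to the queue initializer below), so wakeups and sleeps on unrelated queues no longer serialize against one another. The structural change, sketched with pthreads and illustrative names:

#include <pthread.h>

struct wait_queue {
	pthread_mutex_t lock;	/* was: one global rpc_queue_lock */
	int nr_tasks;
};

static void queue_init(struct wait_queue *q)
{
	pthread_mutex_init(&q->lock, NULL);	/* per-queue, as in the patch */
	q->nr_tasks = 0;
}

static void queue_add(struct wait_queue *q)
{
	pthread_mutex_lock(&q->lock);	/* contends only on this queue */
	q->nr_tasks++;
	pthread_mutex_unlock(&q->lock);
}

int main(void)
{
	struct wait_queue a, b;

	queue_init(&a);
	queue_init(&b);
	queue_add(&a);	/* independent of ... */
	queue_add(&b);	/* ... this one: no lock shared between queues */
	return 0;
}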
+ */ + static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED; + + /* + * Disable the timer for a given RPC task. Should be called with +- * rpc_queue_lock and bh_disabled in order to avoid races within ++ * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). + */ + static inline void +@@ -105,19 +91,19 @@ + * without calling del_timer_sync(). The latter could cause a + * deadlock if called while we're holding spinlocks... + */ +-static void +-rpc_run_timer(struct rpc_task *task) ++static void rpc_run_timer(struct rpc_task *task) + { + void (*callback)(struct rpc_task *); + +- spin_lock_bh(&rpc_queue_lock); + callback = task->tk_timeout_fn; + task->tk_timeout_fn = NULL; +- spin_unlock_bh(&rpc_queue_lock); +- if (callback) { ++ if (callback && RPC_IS_QUEUED(task)) { + dprintk("RPC: %4d running timer\n", task->tk_pid); + callback(task); + } ++ smp_mb__before_clear_bit(); ++ clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); ++ smp_mb__after_clear_bit(); + } + + /* +@@ -136,29 +122,21 @@ + task->tk_timeout_fn = timer; + else + task->tk_timeout_fn = __rpc_default_timer; ++ set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + mod_timer(&task->tk_timer, jiffies + task->tk_timeout); + } + + /* +- * Set up a timer for an already sleeping task. +- */ +-void rpc_add_timer(struct rpc_task *task, rpc_action timer) +-{ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) +- __rpc_add_timer(task, timer); +- spin_unlock_bh(&rpc_queue_lock); +-} +- +-/* + * Delete any timer for the current task. Because we use del_timer_sync(), +- * this function should never be called while holding rpc_queue_lock. ++ * this function should never be called while holding queue->lock. + */ + static inline void + rpc_delete_timer(struct rpc_task *task) + { +- if (del_timer_sync(&task->tk_timer)) ++ if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { ++ del_singleshot_timer_sync(&task->tk_timer); + dprintk("RPC: %4d deleting timer\n", task->tk_pid); ++ } + } + + /* +@@ -169,16 +147,17 @@ + struct list_head *q; + struct rpc_task *t; + ++ INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; +- list_for_each_entry(t, q, tk_list) { ++ list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_cookie == task->tk_cookie) { +- list_add_tail(&task->tk_list, &t->tk_links); ++ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } +- list_add_tail(&task->tk_list, q); ++ list_add_tail(&task->u.tk_wait.list, q); + } + + /* +@@ -189,37 +168,21 @@ + * improve overall performance. + * Everyone else gets appended to the queue to ensure proper FIFO behavior. 
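The timer handling above replaces blanket del_timer_sync() with a flag-guarded del_singleshot_timer_sync(): __rpc_add_timer() sets RPC_TASK_HAS_TIMER, and rpc_delete_timer() performs the synchronous (potentially blocking) deletion only if it wins the test_and_clear, so the timer is torn down exactly once. The flag discipline reduced to C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static atomic_bool has_timer;

static void add_timer_guarded(void)
{
	/* arm the (imaginary) timer, then publish the fact: */
	atomic_store(&has_timer, true);		/* set_bit(RPC_TASK_HAS_TIMER) */
}

static void delete_timer_guarded(void)
{
	/* only the caller that flips true -> false pays for the
	 * synchronous deletion; everyone else sees a no-op: */
	if (atomic_exchange(&has_timer, false))
		printf("deleting timer (exactly once)\n");
}

int main(void)
{
	add_timer_guarded();
	delete_timer_guarded();	/* performs the deletion */
	delete_timer_guarded();	/* no-op: flag already clear */
	return 0;
}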
+ */ +-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) ++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) + { +- if (task->tk_rpcwait == queue) +- return 0; ++ BUG_ON (RPC_IS_QUEUED(task)); + +- if (task->tk_rpcwait) { +- printk(KERN_WARNING "RPC: doubly enqueued task!\n"); +- return -EWOULDBLOCK; +- } + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) +- list_add(&task->tk_list, &queue->tasks[0]); ++ list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else +- list_add_tail(&task->tk_list, &queue->tasks[0]); +- task->tk_rpcwait = queue; ++ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); ++ task->u.tk_wait.rpc_waitq = queue; ++ rpc_set_queued(task); + + dprintk("RPC: %4d added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +- +- return 0; +-} +- +-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) +-{ +- int result; +- +- spin_lock_bh(&rpc_queue_lock); +- result = __rpc_add_wait_queue(q, task); +- spin_unlock_bh(&rpc_queue_lock); +- return result; + } + + /* +@@ -229,12 +192,12 @@ + { + struct rpc_task *t; + +- if (!list_empty(&task->tk_links)) { +- t = list_entry(task->tk_links.next, struct rpc_task, tk_list); +- list_move(&t->tk_list, &task->tk_list); +- list_splice_init(&task->tk_links, &t->tk_links); ++ if (!list_empty(&task->u.tk_wait.links)) { ++ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); ++ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); ++ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } +- list_del(&task->tk_list); ++ list_del(&task->u.tk_wait.list); + } + + /* +@@ -243,31 +206,17 @@ + */ + static void __rpc_remove_wait_queue(struct rpc_task *task) + { +- struct rpc_wait_queue *queue = task->tk_rpcwait; +- +- if (!queue) +- return; ++ struct rpc_wait_queue *queue; ++ queue = task->u.tk_wait.rpc_waitq; + + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else +- list_del(&task->tk_list); +- task->tk_rpcwait = NULL; +- ++ list_del(&task->u.tk_wait.list); + dprintk("RPC: %4d removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); + } + +-void +-rpc_remove_wait_queue(struct rpc_task *task) +-{ +- if (!task->tk_rpcwait) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +-} +- + static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) + { + queue->priority = priority; +@@ -290,6 +239,7 @@ + { + int i; + ++ spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; +@@ -316,34 +266,31 @@ + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. 
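__rpc_add_wait_queue_priority() and __rpc_remove_wait_queue_priority() above group tasks that share a tk_cookie behind a single queue entry, and removal promotes the first grouped task into the leader's place so the batch keeps its position. A simplified userspace sketch of that list-of-lists discipline, using singly linked lists and illustrative names:

#include <stdio.h>

struct task {
	unsigned long cookie;
	struct task *next;	/* position in the queue proper */
	struct task *links;	/* same-cookie tasks grouped behind us */
};

static struct task *queue;

static void add_task(struct task *t)
{
	struct task **pp, *q;

	for (pp = &queue; (q = *pp) != NULL; pp = &q->next) {
		if (q->cookie == t->cookie) {	/* group with its peers */
			struct task **lp = &q->links;

			while (*lp)
				lp = &(*lp)->links;
			*lp = t;
			return;
		}
	}
	*pp = t;	/* new cookie: append for FIFO behavior */
}

static struct task *remove_first(void)
{
	struct task *t = queue;

	if (!t)
		return NULL;
	if (t->links) {		/* promote the first peer into our place */
		t->links->next = t->next;
		queue = t->links;
	} else {
		queue = t->next;
	}
	return t;
}

int main(void)
{
	struct task a = { 1, NULL, NULL }, b = { 2, NULL, NULL };
	struct task c = { 1, NULL, NULL };

	add_task(&a);	/* queue: a */
	add_task(&b);	/* queue: a, b */
	add_task(&c);	/* grouped behind a (same cookie) */
	printf("%lu\n", remove_first()->cookie);	/* 1 (a); c promoted */
	printf("%lu\n", remove_first()->cookie);	/* 1 (c) */
	printf("%lu\n", remove_first()->cookie);	/* 2 (b) */
	return 0;
}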
+ */ +-static inline void +-rpc_make_runnable(struct rpc_task *task) ++static void rpc_make_runnable(struct rpc_task *task) + { +- if (task->tk_timeout_fn) { +- printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n"); ++ int do_ret; ++ ++ BUG_ON(task->tk_timeout_fn); ++ do_ret = rpc_test_and_set_running(task); ++ rpc_clear_queued(task); ++ if (do_ret) + return; +- } +- rpc_set_running(task); + if (RPC_IS_ASYNC(task)) { +- if (RPC_IS_SLEEPING(task)) { +- int status; +- status = __rpc_add_wait_queue(&schedq, task); +- if (status < 0) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- return; +- } +- rpc_clear_sleeping(task); +- wake_up(&rpciod_idle); ++ int status; ++ ++ INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); ++ status = queue_work(task->tk_workqueue, &task->u.tk_work); ++ if (status < 0) { ++ printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); ++ task->tk_status = status; ++ return; + } +- } else { +- rpc_clear_sleeping(task); +- wake_up(&task->tk_wait); +- } ++ } else ++ wake_up(&task->u.tk_wait.waitq); + } + + /* +- * Place a newly initialized task on the schedq. ++ * Place a newly initialized task on the workqueue. + */ + static inline void + rpc_schedule_run(struct rpc_task *task) +@@ -352,33 +299,18 @@ + if (RPC_IS_ACTIVATED(task)) + return; + task->tk_active = 1; +- rpc_set_sleeping(task); + rpc_make_runnable(task); + } + + /* +- * For other people who may need to wake the I/O daemon +- * but should (for now) know nothing about its innards +- */ +-void rpciod_wake_up(void) +-{ +- if(rpciod_pid==0) +- printk(KERN_ERR "rpciod: wot no daemon?\n"); +- wake_up(&rpciod_idle); +-} +- +-/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. + */ +-static void +-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { +- int status; +- + dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, + rpc_qname(q), jiffies); + +@@ -388,49 +320,36 @@ + } + + /* Mark the task as being activated if so needed */ +- if (!RPC_IS_ACTIVATED(task)) { ++ if (!RPC_IS_ACTIVATED(task)) + task->tk_active = 1; +- rpc_set_sleeping(task); +- } + +- status = __rpc_add_wait_queue(q, task); +- if (status) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- } else { +- rpc_clear_running(task); +- if (task->tk_callback) { +- dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid); +- BUG(); +- } +- task->tk_callback = action; +- __rpc_add_timer(task, timer); +- } ++ __rpc_add_wait_queue(q, task); ++ ++ BUG_ON(task->tk_callback != NULL); ++ task->tk_callback = action; ++ __rpc_add_timer(task, timer); + } + +-void +-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { + /* + * Protect the queue operations. 
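rpc_make_runnable() above hinges on an atomic test-and-set of the RUNNING bit: when two wakers race, exactly one sees the bit clear and becomes responsible for queueing the work (or waking the synchronous waiter). The guard reduced to C11 atomics; make_runnable here is a sketch, not the kernel function:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag running = ATOMIC_FLAG_INIT;

/* Returns 1 if this caller won the race and must schedule the task. */
static int make_runnable(void)
{
	if (atomic_flag_test_and_set(&running))
		return 0;	/* RUNNING already set: someone else owns it */
	/* ... queue_work() / wake_up() would happen here, exactly once ... */
	return 1;
}

int main(void)
{
	printf("%d\n", make_runnable());	/* 1: first waker schedules */
	printf("%d\n", make_runnable());	/* 0: second waker backs off */
	return 0;
}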
+ */ +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action, timer); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&q->lock); + } + + /** +- * __rpc_wake_up_task - wake up a single rpc_task ++ * __rpc_do_wake_up_task - wake up a single rpc_task + * @task: task to be woken up + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold queue->lock, and have cleared the task queued flag. + */ +-static void +-__rpc_wake_up_task(struct rpc_task *task) ++static void __rpc_do_wake_up_task(struct rpc_task *task) + { +- dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n", +- task->tk_pid, jiffies, rpc_inhibit); ++ dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); + + #ifdef RPC_DEBUG + if (task->tk_magic != 0xf00baa) { +@@ -445,12 +364,9 @@ + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } +- if (RPC_IS_RUNNING(task)) +- return; + + __rpc_disable_timer(task); +- if (task->tk_rpcwait != &schedq) +- __rpc_remove_wait_queue(task); ++ __rpc_remove_wait_queue(task); + + rpc_make_runnable(task); + +@@ -458,6 +374,18 @@ + } + + /* ++ * Wake up the specified task ++ */ ++static void __rpc_wake_up_task(struct rpc_task *task) ++{ ++ if (rpc_start_wakeup(task)) { ++ if (RPC_IS_QUEUED(task)) ++ __rpc_do_wake_up_task(task); ++ rpc_finish_wakeup(task); ++ } ++} ++ ++/* + * Default timeout handler if none specified by user + */ + static void +@@ -471,14 +399,18 @@ + /* + * Wake up the specified task + */ +-void +-rpc_wake_up_task(struct rpc_task *task) ++void rpc_wake_up_task(struct rpc_task *task) + { +- if (RPC_IS_RUNNING(task)) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_wake_up_task(task); +- spin_unlock_bh(&rpc_queue_lock); ++ if (rpc_start_wakeup(task)) { ++ if (RPC_IS_QUEUED(task)) { ++ struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; ++ ++ spin_lock_bh(&queue->lock); ++ __rpc_do_wake_up_task(task); ++ spin_unlock_bh(&queue->lock); ++ } ++ rpc_finish_wakeup(task); ++ } + } + + /* +@@ -494,11 +426,11 @@ + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; +- list_move_tail(&task->tk_list, q); ++ list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. 
+@@ -516,7 +448,7 @@ + else + q = q - 1; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); +@@ -541,14 +473,14 @@ + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + + return task; + } +@@ -557,25 +489,25 @@ + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up(struct rpc_wait_queue *queue) + { + struct rpc_task *task; + + struct list_head *head; +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /** +@@ -583,18 +515,18 @@ + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) + { + struct list_head *head; + struct rpc_task *task; + +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + task->tk_status = status; + __rpc_wake_up_task(task); + } +@@ -602,7 +534,7 @@ + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /* +@@ -626,22 +558,23 @@ + /* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +-static int +-__rpc_execute(struct rpc_task *task) ++static int __rpc_execute(struct rpc_task *task) + { + int status = 0; + + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + +- if (!RPC_IS_RUNNING(task)) { +- printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n"); +- return 0; +- } ++ BUG_ON(RPC_IS_QUEUED(task)); + + restarted: + while (1) { + /* ++ * Garbage collection of pending timers... ++ */ ++ rpc_delete_timer(task); ++ ++ /* + * Execute any pending callback. + */ + if (RPC_DO_CALLBACK(task)) { +@@ -657,7 +590,9 @@ + */ + save_callback=task->tk_callback; + task->tk_callback=NULL; ++ lock_kernel(); + save_callback(task); ++ unlock_kernel(); + } + + /* +@@ -665,43 +600,35 @@ + * tk_action may be NULL when the task has been killed + * by someone else. + */ +- if (RPC_IS_RUNNING(task)) { +- /* +- * Garbage collection of pending timers... +- */ +- rpc_delete_timer(task); ++ if (!RPC_IS_QUEUED(task)) { + if (!task->tk_action) + break; ++ lock_kernel(); + task->tk_action(task); +- /* micro-optimization to avoid spinlock */ +- if (RPC_IS_RUNNING(task)) +- continue; ++ unlock_kernel(); + } + + /* +- * Check whether task is sleeping. ++ * Lockless check for whether task is sleeping or not. 
+ */ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) { +- rpc_set_sleeping(task); +- if (RPC_IS_ASYNC(task)) { +- spin_unlock_bh(&rpc_queue_lock); ++ if (!RPC_IS_QUEUED(task)) ++ continue; ++ rpc_clear_running(task); ++ if (RPC_IS_ASYNC(task)) { ++ /* Careful! we may have raced... */ ++ if (RPC_IS_QUEUED(task)) + return 0; +- } ++ if (rpc_test_and_set_running(task)) ++ return 0; ++ continue; + } +- spin_unlock_bh(&rpc_queue_lock); + +- if (!RPC_IS_SLEEPING(task)) +- continue; + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); +- if (current->pid == rpciod_pid) +- printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); +- + if (RPC_TASK_UNINTERRUPTIBLE(task)) { +- __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); ++ __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); + } else { +- __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status); ++ __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. In order to catch any callbacks that +@@ -715,11 +642,14 @@ + rpc_wake_up_task(task); + } + } ++ rpc_set_running(task); + dprintk("RPC: %4d sync task resuming\n", task->tk_pid); + } + + if (task->tk_exit) { ++ lock_kernel(); + task->tk_exit(task); ++ unlock_kernel(); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { +@@ -738,7 +668,6 @@ + + /* Release all resources associated with the task */ + rpc_release_task(task); +- + return status; + } + +@@ -754,57 +683,16 @@ + int + rpc_execute(struct rpc_task *task) + { +- int status = -EIO; +- if (rpc_inhibit) { +- printk(KERN_INFO "RPC: execution inhibited!\n"); +- goto out_release; +- } +- +- status = -EWOULDBLOCK; +- if (task->tk_active) { +- printk(KERN_ERR "RPC: active task was run twice!\n"); +- goto out_err; +- } ++ BUG_ON(task->tk_active); + + task->tk_active = 1; + rpc_set_running(task); + return __rpc_execute(task); +- out_release: +- rpc_release_task(task); +- out_err: +- return status; + } + +-/* +- * This is our own little scheduler for async RPC tasks. 
+- */ +-static void +-__rpc_schedule(void) ++static void rpc_async_schedule(void *arg) + { +- struct rpc_task *task; +- int count = 0; +- +- dprintk("RPC: rpc_schedule enter\n"); +- while (1) { +- +- task_for_first(task, &schedq.tasks[0]) { +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +- +- __rpc_execute(task); +- spin_lock_bh(&rpc_queue_lock); +- } else { +- break; +- } +- +- if (++count >= 200 || need_resched()) { +- count = 0; +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- spin_lock_bh(&rpc_queue_lock); +- } +- } +- dprintk("RPC: rpc_schedule leave\n"); ++ __rpc_execute((struct rpc_task *)arg); + } + + /* +@@ -862,7 +750,6 @@ + task->tk_client = clnt; + task->tk_flags = flags; + task->tk_exit = callback; +- init_waitqueue_head(&task->tk_wait); + if (current->uid != current->fsuid || current->gid != current->fsgid) + task->tk_flags |= RPC_TASK_SETUID; + +@@ -873,7 +760,11 @@ + + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; +- INIT_LIST_HEAD(&task->tk_links); ++ ++ /* Initialize workqueue for async tasks */ ++ task->tk_workqueue = rpciod_workqueue; ++ if (!RPC_IS_ASYNC(task)) ++ init_waitqueue_head(&task->u.tk_wait.waitq); + + /* Add to global list of all tasks */ + spin_lock(&rpc_sched_lock); +@@ -944,8 +835,7 @@ + goto out; + } + +-void +-rpc_release_task(struct rpc_task *task) ++void rpc_release_task(struct rpc_task *task) + { + dprintk("RPC: %4d release task\n", task->tk_pid); + +@@ -963,19 +853,9 @@ + list_del(&task->tk_task); + spin_unlock(&rpc_sched_lock); + +- /* Protect the execution below. */ +- spin_lock_bh(&rpc_queue_lock); +- +- /* Disable timer to prevent zombie wakeup */ +- __rpc_disable_timer(task); +- +- /* Remove from any wait queue we're still on */ +- __rpc_remove_wait_queue(task); +- ++ BUG_ON (RPC_IS_QUEUED(task)); + task->tk_active = 0; + +- spin_unlock_bh(&rpc_queue_lock); +- + /* Synchronously delete any running timer */ + rpc_delete_timer(task); + +@@ -1005,10 +885,9 @@ + * queue 'childq'. If so returns a pointer to the parent. + * Upon failure returns NULL. + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold childq.lock + */ +-static inline struct rpc_task * +-rpc_find_parent(struct rpc_task *child) ++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) + { + struct rpc_task *task, *parent; + struct list_head *le; +@@ -1021,17 +900,16 @@ + return NULL; + } + +-static void +-rpc_child_exit(struct rpc_task *child) ++static void rpc_child_exit(struct rpc_task *child) + { + struct rpc_task *parent; + +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + if ((parent = rpc_find_parent(child)) != NULL) { + parent->tk_status = child->tk_status; + __rpc_wake_up_task(parent); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* +@@ -1054,22 +932,20 @@ + return NULL; + } + +-void +-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) ++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) + { +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + /* N.B. Is it possible for the child to have already finished? */ + __rpc_sleep_on(&childq, task, func, NULL); + rpc_schedule_run(child); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? 
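For orientation, the whole async path now reduces to a workqueue handoff: rpc_make_runnable() packages the task into its work_struct and rpc_async_schedule() above simply re-enters the state machine from rpciod's worker thread. Condensed, using the 2.6-era three-argument INIT_WORK:

    /* waker side, from rpc_make_runnable() */
    INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
    queue_work(task->tk_workqueue, &task->u.tk_work);	/* rpciod */

    /* worker side, in rpciod context */
    static void rpc_async_schedule(void *arg)
    {
    	__rpc_execute((struct rpc_task *)arg);
    }

Sync tasks keep a classic waitqueue in the same union (u.tk_wait.waitq), which is why rpc_new_task() above initializes it only in the !RPC_IS_ASYNC case.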
+ */ +-void +-rpc_killall_tasks(struct rpc_clnt *clnt) ++void rpc_killall_tasks(struct rpc_clnt *clnt) + { + struct rpc_task *rovr; + struct list_head *le; +@@ -1091,93 +967,14 @@ + + static DECLARE_MUTEX_LOCKED(rpciod_running); + +-static inline int +-rpciod_task_pending(void) +-{ +- return !list_empty(&schedq.tasks[0]); +-} +- +- +-/* +- * This is the rpciod kernel thread +- */ +-static int +-rpciod(void *ptr) +-{ +- int rounds = 0; +- +- lock_kernel(); +- /* +- * Let our maker know we're running ... +- */ +- rpciod_pid = current->pid; +- up(&rpciod_running); +- +- daemonize("rpciod"); +- allow_signal(SIGKILL); +- +- dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); +- spin_lock_bh(&rpc_queue_lock); +- while (rpciod_users) { +- DEFINE_WAIT(wait); +- if (signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- rpciod_killall(); +- flush_signals(current); +- spin_lock_bh(&rpc_queue_lock); +- } +- __rpc_schedule(); +- if (current->flags & PF_FREEZE) { +- spin_unlock_bh(&rpc_queue_lock); +- refrigerator(PF_FREEZE); +- spin_lock_bh(&rpc_queue_lock); +- } +- +- if (++rounds >= 64) { /* safeguard */ +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- +- dprintk("RPC: rpciod back to sleep\n"); +- prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE); +- if (!rpciod_task_pending() && !signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- finish_wait(&rpciod_idle, &wait); +- dprintk("RPC: switch to rpciod\n"); +- } +- spin_unlock_bh(&rpc_queue_lock); +- +- dprintk("RPC: rpciod shutdown commences\n"); +- if (!list_empty(&all_tasks)) { +- printk(KERN_ERR "rpciod: active tasks at shutdown?!\n"); +- rpciod_killall(); +- } +- +- dprintk("RPC: rpciod exiting\n"); +- unlock_kernel(); +- +- rpciod_pid = 0; +- complete_and_exit(&rpciod_killer, 0); +- return 0; +-} +- +-static void +-rpciod_killall(void) ++static void rpciod_killall(void) + { + unsigned long flags; + + while (!list_empty(&all_tasks)) { + clear_thread_flag(TIF_SIGPENDING); + rpc_killall_tasks(NULL); +- spin_lock_bh(&rpc_queue_lock); +- __rpc_schedule(); +- spin_unlock_bh(&rpc_queue_lock); ++ flush_workqueue(rpciod_workqueue); + if (!list_empty(&all_tasks)) { + dprintk("rpciod_killall: waiting for tasks to exit\n"); + yield(); +@@ -1195,28 +992,30 @@ + int + rpciod_up(void) + { ++ struct workqueue_struct *wq; + int error = 0; + + down(&rpciod_sema); +- dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_up: users %d\n", rpciod_users); + rpciod_users++; +- if (rpciod_pid) ++ if (rpciod_workqueue) + goto out; + /* + * If there's no pid, we should be the first user. + */ + if (rpciod_users > 1) +- printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users); ++ printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); + /* + * Create the rpciod thread and wait for it to start. 
+ */ +- error = kernel_thread(rpciod, NULL, 0); +- if (error < 0) { +- printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error); ++ error = -ENOMEM; ++ wq = create_workqueue("rpciod"); ++ if (wq == NULL) { ++ printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); + rpciod_users--; + goto out; + } +- down(&rpciod_running); ++ rpciod_workqueue = wq; + error = 0; + out: + up(&rpciod_sema); +@@ -1227,20 +1026,21 @@ + rpciod_down(void) + { + down(&rpciod_sema); +- dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_down sema %d\n", rpciod_users); + if (rpciod_users) { + if (--rpciod_users) + goto out; + } else +- printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid); ++ printk(KERN_WARNING "rpciod_down: no users??\n"); + +- if (!rpciod_pid) { ++ if (!rpciod_workqueue) { + dprintk("rpciod_down: Nothing to do!\n"); + goto out; + } ++ rpciod_killall(); + +- kill_proc(rpciod_pid, SIGKILL, 1); +- wait_for_completion(&rpciod_killer); ++ destroy_workqueue(rpciod_workqueue); ++ rpciod_workqueue = NULL; + out: + up(&rpciod_sema); + } +@@ -1258,7 +1058,12 @@ + } + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); +- alltask_for_each(t, le, &all_tasks) ++ alltask_for_each(t, le, &all_tasks) { ++ const char *rpc_waitq = "none"; ++ ++ if (RPC_IS_QUEUED(t)) ++ rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); ++ + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, + (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), +@@ -1266,8 +1071,9 @@ + t->tk_client, + (t->tk_client ? t->tk_client->cl_prog : 0), + t->tk_rqstp, t->tk_timeout, +- rpc_qname(t->tk_rpcwait), ++ rpc_waitq, + t->tk_action, t->tk_exit); ++ } + spin_unlock(&rpc_sched_lock); + } + #endif +Index: linux-2.6.10/net/sunrpc/sunrpc_syms.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/sunrpc_syms.c 2004-12-25 05:35:25.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/sunrpc_syms.c 2005-04-05 14:49:13.411690432 +0800 +@@ -58,6 +58,9 @@ + EXPORT_SYMBOL(rpc_wake_up); + EXPORT_SYMBOL(rpc_queue_upcall); + EXPORT_SYMBOL(rpc_mkpipe); ++EXPORT_SYMBOL(rpc_mkdir); ++EXPORT_SYMBOL(rpc_rmdir); ++ + + /* Client transport */ + EXPORT_SYMBOL(xprt_create_proto); +@@ -90,6 +93,7 @@ + EXPORT_SYMBOL(svc_auth_register); + EXPORT_SYMBOL(auth_domain_lookup); + EXPORT_SYMBOL(svc_authenticate); ++EXPORT_SYMBOL(svc_set_client); + + /* RPC statistics */ + #ifdef CONFIG_PROC_FS +Index: linux-2.6.10/kernel/exit.c +=================================================================== +--- linux-2.6.10.orig/kernel/exit.c 2005-04-05 14:48:52.534864192 +0800 ++++ linux-2.6.10/kernel/exit.c 2005-04-05 14:50:57.737830448 +0800 +@@ -848,6 +848,8 @@ + for (;;) ; + } + ++EXPORT_SYMBOL(do_exit); ++ + NORET_TYPE void complete_and_exit(struct completion *comp, long code) + { + if (comp) +Index: linux-2.6.10/fs/locks.c +=================================================================== +--- linux-2.6.10.orig/fs/locks.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/fs/locks.c 2005-04-05 14:49:13.434686936 +0800 +@@ -1096,15 +1096,13 @@ + */ + void remove_lease(struct file_lock *fl) + { +- if (!IS_LEASE(fl)) +- return; +- + lock_kernel(); +- ++ if (!fl || !IS_LEASE(fl)) ++ goto out; + fl->fl_type = F_UNLCK | F_INPROGRESS; + fl->fl_break_time = jiffies - 10; + time_out_leases(fl->fl_file->f_dentry->d_inode); +- ++out: + unlock_kernel(); + } + +@@ -1563,9 +1561,6 @@ + error = 
filp->f_op->lock(filp, F_GETLK, &file_lock); + if (error < 0) + goto out; +- else if (error == LOCK_USE_CLNT) +- /* Bypass for NFS with no locking - 2.0.36 compat */ +- fl = posix_test_lock(filp, &file_lock); + else + fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); + } else { +@@ -1708,9 +1703,6 @@ + error = filp->f_op->lock(filp, F_GETLK, &file_lock); + if (error < 0) + goto out; +- else if (error == LOCK_USE_CLNT) +- /* Bypass for NFS with no locking - 2.0.36 compat */ +- fl = posix_test_lock(filp, &file_lock); + else + fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); + } else { +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-04-05 14:49:13.413690128 +0800 +@@ -789,6 +789,54 @@ + } + + /** ++ * d_instantiate_unique - instantiate a non-aliased dentry ++ * @entry: dentry to instantiate ++ * @inode: inode to attach to this dentry ++ * ++ * Fill in inode information in the entry. On success, it returns NULL. ++ * If an unhashed alias of "entry" already exists, then we return the ++ * aliased dentry instead. ++ * ++ * Note that in order to avoid conflicts with rename() etc, the caller ++ * had better be holding the parent directory semaphore. ++ */ ++struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) ++{ ++ struct dentry *alias; ++ int len = entry->d_name.len; ++ const char *name = entry->d_name.name; ++ unsigned int hash = entry->d_name.hash; ++ ++ BUG_ON(!list_empty(&entry->d_alias)); ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_negative; ++ list_for_each_entry(alias, &inode->i_dentry, d_alias) { ++ struct qstr *qstr = &alias->d_name; ++ ++ if (qstr->hash != hash) ++ continue; ++ if (alias->d_parent != entry->d_parent) ++ continue; ++ if (qstr->len != len) ++ continue; ++ if (memcmp(qstr->name, name, len)) ++ continue; ++ dget_locked(alias); ++ spin_unlock(&dcache_lock); ++ BUG_ON(!d_unhashed(alias)); ++ return alias; ++ } ++ list_add(&entry->d_alias, &inode->i_dentry); ++do_negative: ++ entry->d_inode = inode; ++ spin_unlock(&dcache_lock); ++ security_d_instantiate(entry, inode); ++ return NULL; ++} ++EXPORT_SYMBOL(d_instantiate_unique); ++ ++/** + * d_alloc_root - allocate root dentry + * @root_inode: inode to allocate the root for + * +Index: linux-2.6.10/fs/lockd/svc.c +=================================================================== +--- linux-2.6.10.orig/fs/lockd/svc.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/lockd/svc.c 2005-04-05 14:49:13.458683288 +0800 +@@ -418,6 +418,38 @@ + return 0; \ + } + ++static inline int is_callback(u32 proc) ++{ ++ return proc == NLMPROC_GRANTED ++ || proc == NLMPROC_GRANTED_MSG ++ || proc == NLMPROC_TEST_RES ++ || proc == NLMPROC_LOCK_RES ++ || proc == NLMPROC_CANCEL_RES ++ || proc == NLMPROC_UNLOCK_RES ++ || proc == NLMPROC_NSM_NOTIFY; ++} ++ ++ ++static int lockd_authenticate(struct svc_rqst *rqstp) ++{ ++ rqstp->rq_client = NULL; ++ switch (rqstp->rq_authop->flavour) { ++ case RPC_AUTH_NULL: ++ case RPC_AUTH_UNIX: ++ if (rqstp->rq_proc == 0) ++ return SVC_OK; ++ if (is_callback(rqstp->rq_proc)) { ++ /* Leave it to individual procedures to ++ * call nlmsvc_lookup_host(rqstp) ++ */ ++ return SVC_OK; ++ } ++ return svc_set_client(rqstp); ++ } ++ return SVC_DENIED; ++} ++ ++ + param_set_min_max(port, int, simple_strtol, 0, 65535) + param_set_min_max(grace_period, unsigned long, simple_strtoul, + nlm_grace_period_min, 
nlm_grace_period_max) +@@ -498,4 +530,5 @@ + .pg_name = "lockd", /* service name */ + .pg_class = "nfsd", /* share authentication with nfsd */ + .pg_stats = &nlmsvc_stats, /* stats table */ ++ .pg_authenticate = &lockd_authenticate /* export authentication */ + }; +Index: linux-2.6.10/fs/nfsd/nfs4xdr.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4xdr.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4xdr.c 2005-04-05 14:49:13.425688304 +0800 +@@ -60,121 +60,6 @@ + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +-static const char utf8_byte_len[256] = { +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +- 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0 +-}; +- +-static inline int +-is_legal_utf8_sequence(unsigned char *source, int length) +-{ +- unsigned char *ptr; +- unsigned char c; +- +- if (length==1) return 1; +- +- /* Check for overlong sequence, and check second byte */ +- c = *(source + 1); +- switch (*source) { +- case 0xE0: /* 3 bytes */ +- if ( c < 0xA0 ) return 0; +- break; +- case 0xF0: /* 4 bytes */ +- if ( c < 0x90 ) return 0; +- break; +- case 0xF8: /* 5 bytes */ +- if ( c < 0xC8 ) return 0; +- break; +- case 0xFC: /* 6 bytes */ +- if ( c < 0x84 ) return 0; +- break; +- default: +- if ( (c & 0xC0) != 0x80) return 0; +- } +- +- /* Check that trailing bytes look like 10xxxxxx */ +- for (ptr = source++ + length - 1; ptr>source; ptr--) +- if ( ((*ptr) & 0xC0) != 0x80 ) return 0; +- return 1; +-} +- +-/* This does some screening on disallowed unicode characters. It is NOT +- * comprehensive. +- */ +-static int +-is_allowed_utf8_char(unsigned char *source, int length) +-{ +- /* We assume length and source point to a valid utf8 sequence */ +- unsigned char c; +- +- /* Disallow F0000 and up (in utf8, F3B08080) */ +- if (*source > 0xF3 ) return 0; +- c = *(source + 1); +- switch (*source) { +- case 0xF3: +- if (c >= 0xB0) return 0; +- break; +- /* Disallow D800-F8FF (in utf8, EDA080-EFA3BF */ +- case 0xED: +- if (c >= 0xA0) return 0; +- break; +- case 0xEE: +- return 0; +- break; +- case 0xEF: +- if (c <= 0xA3) return 0; +- /* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */ +- if (c==0xBF) +- /* Don't need to check <=0xBF, since valid utf8 */ +- if ( *(source+2) >= 0xB9) return 0; +- break; +- } +- return 1; +-} +- +-/* This routine should really check to see that the proper stringprep +- * mappings have been applied. Instead, we do a simple screen of some +- * of the more obvious illegal values by calling is_allowed_utf8_char. +- * This will allow many illegal strings through, but if a client behaves, +- * it will get full functionality. The other option (apart from full +- * stringprep checking) is to limit everything to an easily handled subset, +- * such as 7-bit ascii. +- * +- * Note - currently calling routines ignore return value except as boolean. 
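With the stringprep screen removed below, NFSv4 name checking reduces to rejecting '/' and zero-length names; byte sequences the dropped table used to catch, such as the overlong two-byte encoding of NUL, now pass through to the filesystem. A small userspace illustration of the dropped first-byte rule (the helper condenses the removed utf8_byte_len[] table; nothing here is a remaining kernel interface):

    #include <stdio.h>

    /* condensed form of the removed utf8_byte_len[] table */
    static int utf8_len(unsigned char c)
    {
    	if (c < 0x80) return 1;	/* ASCII */
    	if (c < 0xC2) return 0;	/* continuation byte or overlong lead */
    	if (c < 0xE0) return 2;
    	if (c < 0xF0) return 3;
    	if (c < 0xF8) return 4;
    	if (c < 0xFC) return 5;
    	if (c < 0xFE) return 6;
    	return 0;		/* 0xFE/0xFF: never valid */
    }

    int main(void)
    {
    	/* 0xC0 0x80: "modified UTF-8" NUL, rejected by the old code */
    	printf("lead 0xC0 -> len %d (0 means reject)\n", utf8_len(0xC0));
    	return 0;
    }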
+- */ +-static int +-check_utf8(char *str, int len) +-{ +- unsigned char *chunk, *sourceend; +- int chunklen; +- +- chunk = str; +- sourceend = str + len; +- +- while (chunk < sourceend) { +- chunklen = utf8_byte_len[*chunk]; +- if (!chunklen) +- return nfserr_inval; +- if (chunk + chunklen > sourceend) +- return nfserr_inval; +- if (!is_legal_utf8_sequence(chunk, chunklen)) +- return nfserr_inval; +- if (!is_allowed_utf8_char(chunk, chunklen)) +- return nfserr_inval; +- if ( (chunklen==1) && (!*chunk) ) +- return nfserr_inval; /* Disallow embedded nulls */ +- chunk += chunklen; +- } +- +- return 0; +-} +- + static int + check_filename(char *str, int len, int err) + { +@@ -187,7 +72,7 @@ + for (i = 0; i < len; i++) + if (str[i] == '/') + return err; +- return check_utf8(str, len); ++ return 0; + } + + /* +@@ -403,8 +288,6 @@ + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + ace.whotype = nfs4_acl_get_whotype(buf, dummy32); + status = 0; + if (ace.whotype != NFS4_ACL_WHO_NAMED) +@@ -439,8 +322,6 @@ + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + goto out_nfserr; + iattr->ia_valid |= ATTR_UID; +@@ -452,8 +333,6 @@ + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + goto out_nfserr; + iattr->ia_valid |= ATTR_GID; +@@ -525,7 +404,7 @@ + } + } + if (len != expected_len) +- goto xdr_error; ++ printk("nfsd: funky nfs4 client sent extra bytes in setattr\n"); + + DECODE_TAIL; + +@@ -585,8 +464,6 @@ + READ32(create->cr_linklen); + READ_BUF(create->cr_linklen); + SAVEMEM(create->cr_linkname, create->cr_linklen); +- if (check_utf8(create->cr_linkname, create->cr_linklen)) +- return nfserr_inval; + break; + case NF4BLK: + case NF4CHR: +@@ -615,6 +492,18 @@ + } + + static inline int ++nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(sizeof(stateid_t)); ++ READ32(dr->dr_stateid.si_generation); ++ COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ ++ DECODE_TAIL; ++} ++ ++static inline int + nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) + { + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +@@ -790,8 +679,8 @@ + READ32(open->op_delegate_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: +- READ_BUF(sizeof(delegation_stateid_t) + 4); +- COPYMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ READ_BUF(sizeof(stateid_t) + 4); ++ COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); +@@ -825,7 +714,7 @@ + DECODE_HEAD; + + open_down->od_stateowner = NULL; +- READ_BUF(4 + sizeof(stateid_t)); ++ READ_BUF(12 + sizeof(stateid_t)); + READ32(open_down->od_stateid.si_generation); + COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + READ32(open_down->od_seqid); +@@ -1170,6 +1059,9 @@ + case OP_CREATE: + op->status = nfsd4_decode_create(argp, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn); ++ break; + case OP_GETATTR: + op->status = 
nfsd4_decode_getattr(argp, &op->u.getattr); + break; +@@ -1425,7 +1317,7 @@ + if (status) + goto out_nfserr; + } +- if ((bmval0 & FATTR4_WORD0_FILEHANDLE) && !fhp) { ++ if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + fh_init(&tempfh, NFS4_FHSIZE); + status = fh_compose(&tempfh, exp, dentry, NULL); + if (status) +@@ -1471,7 +1363,10 @@ + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32( NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME ); ++ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ++ WRITE32(NFS4_FH_PERSISTENT); ++ else ++ WRITE32(NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + /* +@@ -1508,10 +1403,15 @@ + if (bmval0 & FATTR4_WORD0_FSID) { + if ((buflen -= 16) < 0) + goto out_resource; +- WRITE32(0); +- WRITE32(MAJOR(stat.dev)); +- WRITE32(0); +- WRITE32(MINOR(stat.dev)); ++ if (is_fsid(fhp, rqstp->rq_reffh)) { ++ WRITE64((u64)exp->ex_fsid); ++ WRITE64((u64)0); ++ } else { ++ WRITE32(0); ++ WRITE32(MAJOR(stat.dev)); ++ WRITE32(0); ++ WRITE32(MINOR(stat.dev)); ++ } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + if ((buflen -= 4) < 0) +@@ -1765,17 +1665,65 @@ + } + + static int ++nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, ++ const char *name, int namlen, u32 *p, int *buflen) ++{ ++ struct svc_export *exp = cd->rd_fhp->fh_export; ++ struct dentry *dentry; ++ int nfserr; ++ ++ dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); ++ if (IS_ERR(dentry)) ++ return nfserrno(PTR_ERR(dentry)); ++ ++ exp_get(exp); ++ if (d_mountpoint(dentry)) { ++ if (nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp)) { ++ /* ++ * -EAGAIN is the only error returned from ++ * nfsd_cross_mnt() and it indicates that an ++ * up-call has been initiated to fill in the export ++ * options on exp. When the answer comes back, ++ * this call will be retried. ++ */ ++ nfserr = nfserr_dropit; ++ goto out_put; ++ } ++ ++ } ++ nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, ++ cd->rd_rqstp); ++out_put: ++ dput(dentry); ++ exp_put(exp); ++ return nfserr; ++} ++ ++static u32 * ++nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr) ++{ ++ u32 *attrlenp; ++ ++ if (buflen < 6) ++ return NULL; ++ *p++ = htonl(2); ++ *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ ++ *p++ = htonl(0); /* bmval1 */ ++ ++ attrlenp = p++; ++ *p++ = nfserr; /* no htonl */ ++ *attrlenp = htonl((char *)p - (char *)attrlenp - 4); ++ return p; ++} ++ ++static int + nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, ino_t ino, unsigned int d_type) + { + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + int buflen; + u32 *p = cd->buffer; +- u32 *attrlenp; +- struct dentry *dentry; +- struct svc_export *exp = cd->rd_fhp->fh_export; +- u32 bmval0, bmval1; +- int nfserr = 0; ++ int nfserr = nfserr_toosmall; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { +@@ -1788,106 +1736,44 @@ + + buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); + if (buflen < 0) +- goto nospc; ++ goto fail; + + *p++ = xdr_one; /* mark entry present */ + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + +- /* +- * Now we come to the ugly part: writing the fattr for this entry. 
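For reference, nfsd4_encode_rdattr_error() above emits a self-contained fattr4 of exactly five 32-bit words (the buflen < 6 guard keeps one word spare):

    /*
     * word 0: 2                          bitmap length
     * word 1: FATTR4_WORD0_RDATTR_ERROR  bitmap word 0
     * word 2: 0                          bitmap word 1
     * word 3: 4                          attr data length, backfilled last
     * word 4: nfserr                     already big-endian, hence no htonl
     */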
+- */ +- bmval0 = cd->rd_bmval[0]; +- bmval1 = cd->rd_bmval[1]; +- if ((bmval0 & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_FILEID)) || bmval1) { +- /* +- * "Heavyweight" case: we have no choice except to +- * call nfsd4_encode_fattr(). +- */ +- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); +- if (IS_ERR(dentry)) { +- nfserr = nfserrno(PTR_ERR(dentry)); +- goto error; +- } +- +- exp_get(exp); +- if (d_mountpoint(dentry)) { +- if ((nfserr = nfsd_cross_mnt(cd->rd_rqstp, &dentry, +- &exp))) { +- /* +- * -EAGAIN is the only error returned from +- * nfsd_cross_mnt() and it indicates that an +- * up-call has been initiated to fill in the export +- * options on exp. When the answer comes back, +- * this call will be retried. +- */ +- dput(dentry); +- exp_put(exp); +- nfserr = nfserr_dropit; +- goto error; +- } +- +- } +- +- nfserr = nfsd4_encode_fattr(NULL, exp, +- dentry, p, &buflen, cd->rd_bmval, +- cd->rd_rqstp); +- dput(dentry); +- exp_put(exp); +- if (!nfserr) { +- p += buflen; +- goto out; +- } +- if (nfserr == nfserr_resource) +- goto nospc; +- +-error: ++ nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); ++ switch (nfserr) { ++ case nfs_ok: ++ p += buflen; ++ break; ++ case nfserr_resource: ++ nfserr = nfserr_toosmall; ++ goto fail; ++ case nfserr_dropit: ++ goto fail; ++ default: + /* +- * If we get here, we experienced a miscellaneous +- * failure while writing the attributes. If the +- * client requested the RDATTR_ERROR attribute, ++ * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ +- if (!(bmval0 & FATTR4_WORD0_RDATTR_ERROR)) { +- cd->common.err = nfserr; +- return -EINVAL; +- } +- +- bmval0 = FATTR4_WORD0_RDATTR_ERROR; +- bmval1 = 0; +- /* falling through here will do the right thing... */ ++ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) ++ goto fail; ++ nfserr = nfserr_toosmall; ++ p = nfsd4_encode_rdattr_error(p, buflen, nfserr); ++ if (p == NULL) ++ goto fail; + } +- +- /* +- * In the common "lightweight" case, we avoid +- * the overhead of nfsd4_encode_fattr() by assembling +- * a small fattr by hand. 
+- */ +- if (buflen < 6) +- goto nospc; +- *p++ = htonl(2); +- *p++ = htonl(bmval0); +- *p++ = htonl(bmval1); +- +- attrlenp = p++; +- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) +- *p++ = nfserr; /* no htonl */ +- if (bmval0 & FATTR4_WORD0_FILEID) +- p = xdr_encode_hyper(p, (u64)ino); +- *attrlenp = htonl((char *)p - (char *)attrlenp - 4); +- +-out: + cd->buflen -= (p - cd->buffer); + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +- +-nospc: +- cd->common.err = nfserr_toosmall; ++fail: ++ cd->common.err = nfserr; + return -EINVAL; + } + +@@ -2081,8 +1967,8 @@ + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: +- RESERVE_SPACE(20 + sizeof(delegation_stateid_t)); +- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ RESERVE_SPACE(20 + sizeof(stateid_t)); ++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(0); + + /* +@@ -2095,8 +1981,8 @@ + ADJUST_ARGS(); + break; + case NFS4_OPEN_DELEGATE_WRITE: +- RESERVE_SPACE(32 + sizeof(delegation_stateid_t)); +- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ RESERVE_SPACE(32 + sizeof(stateid_t)); ++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(0); + + /* +@@ -2185,10 +2071,17 @@ + } + read->rd_vlen = v; + +- nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, +- read->rd_offset, +- read->rd_iov, read->rd_vlen, +- &maxcount); ++ if (read->rd_filp) ++ nfserr = nfsd_vfs_read(read->rd_rqstp, read->rd_fhp, ++ read->rd_filp, read->rd_offset, ++ read->rd_iov, read->rd_vlen, ++ &maxcount); ++ else ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, ++ read->rd_offset, ++ read->rd_iov, read->rd_vlen, ++ &maxcount); ++ + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; + if (nfserr) +@@ -2460,6 +2353,8 @@ + case OP_CREATE: + nfsd4_encode_create(resp, op->status, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ break; + case OP_GETATTR: + op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr); + break; +Index: linux-2.6.10/fs/nfsd/nfs4state.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4state.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4state.c 2005-04-05 14:49:13.421688912 +0800 +@@ -44,6 +44,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -56,9 +57,11 @@ + static u32 nfs4_reclaim_init = 0; + time_t boot_time; + static time_t grace_end = 0; ++static u32 first_run = 1; /* laundromat threads first run */ + static u32 current_clientid = 1; +-static u32 current_ownerid; +-static u32 current_fileid; ++static u32 current_ownerid = 1; ++static u32 current_fileid = 1; ++static u32 current_delegid = 1; + static u32 nfs4_init; + stateid_t zerostateid; /* bits all 0 */ + stateid_t onestateid; /* bits all 1 */ +@@ -70,14 +73,17 @@ + u32 del_perclient = 0; + u32 alloc_file = 0; + u32 free_file = 0; +-u32 alloc_sowner = 0; +-u32 free_sowner = 0; + u32 vfsopen = 0; + u32 vfsclose = 0; +-u32 alloc_lsowner= 0; ++u32 alloc_delegation= 0; ++u32 free_delegation= 0; + + /* forward declarations */ + struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); ++static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); ++static void release_delegation(struct nfs4_delegation *dp); ++static void release_stateid_lockowner(struct nfs4_stateid *open_stp); ++extern char recovery_dirname[]; + + /* Locking: + * +@@ -117,6 +123,112 @@ + static void release_stateid(struct nfs4_stateid *stp, int flags); + static 
void release_file(struct nfs4_file *fp); + ++/* ++ * Delegation state ++ */ ++ ++/* recall_lock protects the del_recall_lru */ ++spinlock_t recall_lock; ++static struct list_head del_recall_lru; ++ ++static struct nfs4_delegation * ++alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_file *fp = stp->st_file; ++ ++ dprintk("NFSD alloc_init_deleg\n"); ++ if ((dp = kmalloc(sizeof(struct nfs4_delegation), ++ GFP_KERNEL)) == NULL) ++ return dp; ++ INIT_LIST_HEAD(&dp->dl_del_perfile); ++ INIT_LIST_HEAD(&dp->dl_del_perclnt); ++ INIT_LIST_HEAD(&dp->dl_recall_lru); ++ dp->dl_client = clp; ++ dp->dl_file = fp; ++ dp->dl_flock = NULL; ++ dp->dl_stp = stp; ++ dp->dl_flags = 0; ++ dp->dl_type = type; ++ dp->dl_recall.cbr_dp = NULL; ++ dp->dl_recall.cbr_ident = 0; ++ dp->dl_recall.cbr_trunc = 0; ++ dp->dl_stateid.si_boot = boot_time; ++ dp->dl_stateid.si_stateownerid = current_delegid++; ++ dp->dl_stateid.si_fileid = 0; ++ dp->dl_stateid.si_generation = 0; ++ dp->dl_fhlen = current_fh->fh_handle.fh_size; ++ memcpy(dp->dl_fhval, ¤t_fh->fh_handle.fh_base, ++ current_fh->fh_handle.fh_size); ++ dp->dl_time = 0; ++ atomic_set(&dp->dl_state, NFS4_NO_RECALL); ++ atomic_set(&dp->dl_count, 1); ++ atomic_set(&dp->dl_recall_cnt, 0); ++ list_add(&dp->dl_del_perfile, &fp->fi_del_perfile); ++ list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt); ++ alloc_delegation++; ++ return dp; ++} ++ ++/* ++ * Free the delegation structure. ++ */ ++static void ++nfs4_free_delegation(struct nfs4_delegation *dp) ++{ ++ dprintk("NFSD: nfs4_free_delegation freeing dp %p\n",dp); ++ list_del(&dp->dl_recall_lru); ++ kfree(dp); ++ free_delegation++; ++} ++ ++/* release_delegation: ++ * ++ * lease_modify() is called to remove the FS_LEASE file_lock from ++ * the i_flock list, eventually calling nfsd's lock_manager ++ * fl_release_callback. ++ * ++ * call either: ++ * nfsd_close : if last close, locks_remove_flock calls lease_modify. ++ * otherwise, recalled state set to NFS4_RECALL_COMPLETE ++ * so that it will be reaped by the laundromat service. ++ * or ++ * remove_lease (calls time_out_lease which calls lease_modify). ++ * and nfs4_free_delegation. ++ * ++ * lock_kernel() protects dp->dl_flock which is set under the kernel lock ++ * by nfsd_copy_lock_deleg_callback and nfsd_release_deleg_callback. ++ * ++ */ ++ ++static void ++release_delegation(struct nfs4_delegation *dp) ++{ ++ /* delayed nfsd_close */ ++ if (dp->dl_flags && NFS4_DELAY_CLOSE) { ++ struct file *filp = dp->dl_stp->st_vfs_file; ++ ++ dprintk("NFSD: release_delegation CLOSE\n"); ++ release_stateid_lockowner(dp->dl_stp); ++ kfree(dp->dl_stp); ++ dp->dl_flags &= ~NFS4_DELAY_CLOSE; ++ dp->dl_stp = NULL; ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ nfsd_close(filp); ++ vfsclose++; ++ } else { ++ dprintk("NFSD: release_delegation remove lease dl_flock %p\n", ++ dp->dl_flock); ++ remove_lease(dp->dl_flock); ++ list_del_init(&dp->dl_del_perfile); ++ list_del_init(&dp->dl_del_perclnt); ++ /* dl_count > 0 => outstanding recall rpc */ ++ dprintk("NFSD: release_delegation free deleg dl_count %d\n", ++ atomic_read(&dp->dl_count)); ++ if (atomic_dec_and_test(&dp->dl_count)) ++ nfs4_free_delegation(dp); ++ } ++} + + /* + * SETCLIENTID state +@@ -148,7 +260,7 @@ + * for last close replay. 
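The delegation lifetime above follows a simple counting rule: alloc_init_deleg() hands the structure out with dl_count == 1, do_recall() (further down in this patch) takes a second reference for the callback RPC, and nfs4_free_delegation() runs only when the last reference is dropped inside release_delegation(). In outline, assuming the NFS4_DELAY_CLOSE test is meant as a bitwise flag check:

    dp = alloc_init_deleg(clp, stp, fh, type);	/* dl_count = 1 */

    atomic_inc(&dp->dl_count);			/* recall thread's reference */
    nfsd4_cb_recall(dp);

    if (dp->dl_flags & NFS4_DELAY_CLOSE) {	/* deferred nfsd_close() case */
    	/* close the file now; delegation reaped later by the laundromat */
    } else if (atomic_dec_and_test(&dp->dl_count)) {
    	nfs4_free_delegation(dp);		/* last reference gone */
    }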
+ */ + static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; +-static int reclaim_str_hashtbl_size; ++static int reclaim_str_hashtbl_size = 0; + static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; +@@ -213,12 +325,38 @@ + kfree(clp); + } + ++void ++put_nfs4_client(struct nfs4_client *clp) ++{ ++ if (atomic_dec_and_test(&clp->cl_count)) ++ free_client(clp); ++} ++ + static void + expire_client(struct nfs4_client *clp) + { + struct nfs4_stateowner *sop; ++ struct nfs4_delegation *dp; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ struct rpc_clnt *clnt = clp->cl_callback.cb_client; ++ ++ dprintk("NFSD: expire_client cl_count %d\n", ++ atomic_read(&clp->cl_count)); + +- dprintk("NFSD: expire_client\n"); ++ /* shutdown rpc client, ending any outstanding recall rpcs */ ++ if (atomic_read(&cb->cb_set) == 1 && clnt) { ++ rpc_shutdown_client(clnt); ++ clnt = clp->cl_callback.cb_client = NULL; ++ } ++ while (!list_empty(&clp->cl_del_perclnt)) { ++ dp = list_entry(clp->cl_del_perclnt.next, struct nfs4_delegation, dl_del_perclnt); ++ dprintk("NFSD: expire client. dp %p, dl_state %d, fp %p\n", ++ dp, atomic_read(&dp->dl_state), dp->dl_flock); ++ ++ /* force release of delegation. */ ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + list_del(&clp->cl_lru); +@@ -226,7 +364,7 @@ + sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient); + release_stateowner(sop); + } +- free_client(clp); ++ put_nfs4_client(clp); + } + + static struct nfs4_client * +@@ -235,9 +373,13 @@ + + if (!(clp = alloc_client(name))) + goto out; ++ atomic_set(&clp->cl_count, 1); ++ atomic_set(&clp->cl_callback.cb_set, 0); ++ clp->cl_callback.cb_parsed = 0; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_perclient); ++ INIT_LIST_HEAD(&clp->cl_del_perclnt); + INIT_LIST_HEAD(&clp->cl_lru); + out: + return clp; +@@ -420,17 +562,24 @@ + { + struct nfs4_callback *cb = &clp->cl_callback; + ++ /* Currently, we only support tcp for the callback channel */ ++ if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) ++ goto out_err; ++ + if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, +- &cb->cb_addr, &cb->cb_port))) { +- printk(KERN_INFO "NFSD: BAD callback address. 
client will not receive delegations\n"); +- cb->cb_parsed = 0; +- return; +- } +- cb->cb_netid.len = se->se_callback_netid_len; +- cb->cb_netid.data = se->se_callback_netid_val; ++ &cb->cb_addr, &cb->cb_port))) ++ goto out_err; + cb->cb_prog = se->se_callback_prog; + cb->cb_ident = se->se_callback_ident; + cb->cb_parsed = 1; ++ return; ++out_err: ++ printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " ++ "will not receive delegations\n", ++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); ++ ++ cb->cb_parsed = 0; ++ return; + } + + /* +@@ -707,6 +856,7 @@ + status = nfserr_clid_inuse; + else { + expire_client(conf); ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + status = nfs_ok; + } +@@ -724,6 +874,7 @@ + if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) { + status = nfserr_clid_inuse; + } else { ++ clp = conf; + status = nfs_ok; + } + goto out; +@@ -738,6 +889,7 @@ + status = nfserr_clid_inuse; + } else { + status = nfs_ok; ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + } + goto out; +@@ -757,7 +909,8 @@ + status = nfserr_inval; + goto out; + out: +- /* XXX if status == nfs_ok, probe callback path */ ++ if (!status) ++ nfsd4_probe_callback(clp); + nfs4_unlock_state(); + return status; + } +@@ -803,6 +956,7 @@ + if ((fp = kmalloc(sizeof(struct nfs4_file),GFP_KERNEL))) { + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_perfile); ++ INIT_LIST_HEAD(&fp->fi_del_perfile); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; +@@ -822,7 +976,7 @@ + while (!list_empty(&file_hashtbl[i])) { + fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash); + /* this should never be more than once... */ +- if (!list_empty(&fp->fi_perfile)) { ++ if (!list_empty(&fp->fi_perfile) || !list_empty(&fp->fi_del_perfile)) { + printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp); + } + release_file(fp); +@@ -830,15 +984,36 @@ + } + } + +-/* should use a slab cache */ ++kmem_cache_t *stateowner_slab = NULL; ++ ++int ++nfsd4_init_slabs(void) ++{ ++ stateowner_slab = kmem_cache_create("nfsd4_stateowners", ++ sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL); ++ if (stateowner_slab == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++int ++nfsd4_free_slabs(void) ++{ ++ int status = 0; ++ ++ if (stateowner_slab) ++ status = kmem_cache_destroy(stateowner_slab); ++ stateowner_slab = NULL; ++ return status; ++} ++ + void + nfs4_free_stateowner(struct kref *kref) + { + struct nfs4_stateowner *sop = + container_of(kref, struct nfs4_stateowner, so_ref); + kfree(sop->so_owner.data); +- kfree(sop); +- free_sowner++; ++ kmem_cache_free(stateowner_slab, sop); + } + + static inline struct nfs4_stateowner * +@@ -846,14 +1021,14 @@ + { + struct nfs4_stateowner *sop; + +- if ((sop = kmalloc(sizeof(struct nfs4_stateowner),GFP_KERNEL))) { ++ if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { + if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { + memcpy(sop->so_owner.data, owner->data, owner->len); + sop->so_owner.len = owner->len; + kref_init(&sop->so_ref); + return sop; + } +- kfree(sop); ++ kmem_cache_free(stateowner_slab, sop); + } + return NULL; + } +@@ -887,7 +1062,6 @@ + rp->rp_status = NFSERR_SERVERFAULT; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; +- alloc_sowner++; + return sop; + } + +@@ -957,14 +1131,29 @@ + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + } + ++/* ++* Because nfsd_close() can call locks_remove_flock() which removes leases, ++* delay nfsd_close() 
for delegations from the nfsd_open() clientid ++* until the delegation is reaped. ++*/ + static void +-release_stateid(struct nfs4_stateid *stp, int flags) { ++release_stateid(struct nfs4_stateid *stp, int flags) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_file *fp = stp->st_file; + + list_del(&stp->st_hash); + list_del_perfile++; + list_del(&stp->st_perfile); + list_del(&stp->st_perfilestate); + if ((stp->st_vfs_set) && (flags & OPEN_STATE)) { ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ if(cmp_clid(&dp->dl_client->cl_clientid, ++ &stp->st_stateowner->so_client->cl_clientid)) { ++ dp->dl_flags |= NFS4_DELAY_CLOSE; ++ return; ++ } ++ } + release_stateid_lockowner(stp); + nfsd_close(stp->st_vfs_file); + vfsclose++; +@@ -1013,7 +1202,7 @@ + if (sop->so_confirmed && list_empty(&sop->so_perfilestate)) + move_to_close_lru(sop); + /* unused nfs4_file's are releseed. XXX slab cache? */ +- if (list_empty(&fp->fi_perfile)) { ++ if (list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) { + release_file(fp); + } + } +@@ -1141,6 +1330,100 @@ + } + } + ++/* ++ * Recall a delegation ++ */ ++static int ++do_recall(void *__dp) ++{ ++ struct nfs4_delegation *dp = __dp; ++ ++ atomic_inc(&dp->dl_count); ++ nfsd4_cb_recall(dp); ++ do_exit(0); ++ return 0; ++} ++ ++/* ++ * Spawn a thread to perform a recall on the delegation represented ++ * by the lease (file_lock) ++ * ++ * Called from break_lease() with lock_kernel() held, ++ * ++ */ ++static ++void nfsd_break_deleg_cb(struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp= (struct nfs4_delegation *)fl->fl_owner; ++ struct task_struct *t; ++ ++ dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); ++ if (!dp) ++ return; ++ ++ /* schedule delegation for recall */ ++ spin_lock(&recall_lock); ++ atomic_set(&dp->dl_state, NFS4_RECALL_IN_PROGRESS); ++ list_add_tail(&dp->dl_recall_lru, &del_recall_lru); ++ spin_unlock(&recall_lock); ++ ++ /* only place dl_time is set. protected by lock_kernel*/ ++ dp->dl_time = get_seconds(); ++ ++ /* XXX need to merge NFSD_LEASE_TIME with fs/locks.c:lease_break_time */ ++ fl->fl_break_time = jiffies + NFSD_LEASE_TIME * HZ; ++ ++ t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); ++ if (IS_ERR(t)) { ++ struct nfs4_client *clp = dp->dl_client; ++ ++ printk(KERN_INFO "NFSD: Callback thread failed for " ++ "for client (clientid %08x/%08x)\n", ++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); ++ } ++} ++ ++/* ++ * The file_lock is being reapd. ++ * ++ * Called by locks_free_lock() with lock_kernel() held. ++ */ ++static ++void nfsd_release_deleg_cb(struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; ++ ++ dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d, dl_state %d\n", fl,dp, atomic_read(&dp->dl_count), atomic_read(&dp->dl_state)); ++ ++ if (!(fl->fl_flags & FL_LEASE) || !dp) ++ return; ++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE); ++ dp->dl_flock = NULL; ++} ++ ++/* ++ * Set the delegation file_lock back pointer. ++ * ++ * Called from __setlease() with lock_kernel() held. 
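These three callbacks (break, release, copy, wired into nfsd_lease_mng_ops just below) sit on the same lease machinery that userspace drives through fcntl(F_SETLEASE): a conflicting open() breaks the lease and invokes fl_break, just as it would raise SIGIO for a userspace lease holder. A minimal, runnable userspace counterpart (the file path is arbitrary):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static volatile sig_atomic_t broken;

    static void on_break(int sig)
    {
    	broken = 1;			/* kernel is recalling the lease */
    }

    int main(void)
    {
    	int fd = open("/tmp/leased", O_RDONLY | O_CREAT, 0644);

    	if (fd < 0)
    		return 1;
    	signal(SIGIO, on_break);	/* default lease-break notification */
    	if (fcntl(fd, F_SETLEASE, F_RDLCK) < 0) {
    		perror("F_SETLEASE");
    		return 1;
    	}
    	/* a conflicting open(O_WRONLY) elsewhere fires SIGIO here */
    	while (!broken)
    		pause();
    	fcntl(fd, F_SETLEASE, F_UNLCK);	/* surrender, like a finished recall */
    	close(fd);
    	return 0;
    }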
++ */ ++static ++void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner; ++ ++ dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp); ++ if (!dp) ++ return; ++ dp->dl_flock = new; ++} ++ ++struct lock_manager_operations nfsd_lease_mng_ops = { ++ .fl_break = nfsd_break_deleg_cb, ++ .fl_release_private = nfsd_release_deleg_cb, ++ .fl_copy_lock = nfsd_copy_lock_deleg_cb, ++}; ++ ++ + + /* + * nfsd4_process_open1() +@@ -1238,6 +1521,43 @@ + } + + static int ++nfs4_deleg_conflict(u32 share, u32 dtype) ++{ ++ return (((share & NFS4_SHARE_ACCESS_WRITE) && ++ dtype == NFS4_OPEN_DELEGATE_READ) || ++ ((share & NFS4_SHARE_ACCESS_READ) && ++ dtype == NFS4_OPEN_DELEGATE_WRITE)); ++} ++ ++#define DONT_DELEGATE 8 ++ ++/* ++ * nfs4_check_deleg_recall() ++ * ++ * Test any delegation that is currently within an incompleted recalled ++ * state, and return NFSERR_DELAY for conflicting open share. ++ * flag is set to DONT_DELEGATE for shares that match the deleg type. ++ */ ++static int ++nfs4_check_deleg_recall(struct nfs4_file *fp, struct nfsd4_open *op, int *flag) ++{ ++ struct nfs4_delegation *dp; ++ int status = 0; ++ ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ dprintk("NFSD: found delegation %p with dl_state %d\n", ++ dp, atomic_read(&dp->dl_state)); ++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_IN_PROGRESS) { ++ if(nfs4_deleg_conflict(op->op_share_access, dp->dl_type)) ++ status = nfserr_jukebox; ++ else ++ *flag = DONT_DELEGATE; ++ } ++ } ++ return status; ++} ++ ++static int + nfs4_check_open(struct nfs4_file *fp, struct nfs4_stateowner *sop, struct nfsd4_open *open, struct nfs4_stateid **stpp) + { + struct nfs4_stateid *local; +@@ -1339,6 +1659,65 @@ + } + + /* ++ * Attempt to hand out a delegation. ++ */ ++static void ++nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp, int *flag) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_stateowner *sop = stp->st_stateowner; ++ struct nfs4_callback *cb = &sop->so_client->cl_callback; ++ struct file_lock fl, *flp = &fl; ++ int status; ++ ++ if (*flag == DONT_DELEGATE) { ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ return; ++ } ++ ++ /* set flag */ ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ if (open->op_claim_type != NFS4_OPEN_CLAIM_NULL ++ || !atomic_read(&cb->cb_set) || !sop->so_confirmed) ++ return; ++ ++ if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) ++ *flag = NFS4_OPEN_DELEGATE_READ; ++ ++ else if (!(open->op_share_access & NFS4_SHARE_ACCESS_READ)) ++ *flag = NFS4_OPEN_DELEGATE_WRITE; ++ ++ if (!(dp = alloc_init_deleg(sop->so_client, stp, fh, *flag))) ++ return; ++ locks_init_lock(&fl); ++ fl.fl_lmops = &nfsd_lease_mng_ops; ++ fl.fl_flags = FL_LEASE; ++ fl.fl_end = OFFSET_MAX; ++ fl.fl_owner = (fl_owner_t)dp; ++ fl.fl_file = stp->st_vfs_file; ++ fl.fl_pid = current->tgid; ++ ++ if ((status = setlease(stp->st_vfs_file, ++ *flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK, &flp))) { ++ dprintk("NFSD: setlease failed [%d], no delegation\n", status); ++ list_del(&dp->dl_del_perfile); ++ list_del(&dp->dl_del_perclnt); ++ kfree(dp); ++ free_delegation++; ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ return; ++ } ++ ++ memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); ++ ++ dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", ++ dp->dl_stateid.si_boot, ++ dp->dl_stateid.si_stateownerid, ++ dp->dl_stateid.si_fileid, ++ dp->dl_stateid.si_generation); ++} ++ ++/* + * called with nfs4_lock_state() held. + */ + int +@@ -1346,28 +1725,24 @@ + { + struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_file *fp = NULL; +- struct inode *ino; ++ struct inode *ino = current_fh->fh_dentry->d_inode; + unsigned int fi_hashval; + struct nfs4_stateid *stp = NULL; +- int status; +- +- status = nfserr_resource; +- if (!sop) +- return status; +- +- ino = current_fh->fh_dentry->d_inode; ++ int status, delegflag = 0; + + status = nfserr_inval; + if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny)) + goto out; + /* +- * Lookup file; if found, lookup stateid and check open request; +- * not found, create ++ * Lookup file; if found, lookup stateid and check open request, ++ * and check for delegations in the process of being recalled. ++ * If not found, create the nfs4_file struct + */ + fi_hashval = file_hashval(ino); + if (find_file(fi_hashval, ino, &fp)) { +- status = nfs4_check_open(fp, sop, open, &stp); +- if (status) ++ if ((status = nfs4_check_open(fp, sop, open, &stp))) ++ goto out; ++ if ((status = nfs4_check_deleg_recall(fp, open, &delegflag))) + goto out; + } else { + status = nfserr_resource; +@@ -1407,14 +1782,20 @@ + } + } + } +- dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", +- stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, +- stp->st_stateid.si_fileid, stp->st_stateid.si_generation); +- + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + +- open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; ++ /* ++ * Attempt to hand out a delegation. No error return, because the ++ * OPEN succeeds even if we fail. 
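The share-access test in nfs4_open_delegation() above condenses to a small pure decision, shown here as a hypothetical helper (no such function exists in the patch; read+write opens simply leave the flag at NONE):

    static int deleg_type(u32 share_access)
    {
    	if (!(share_access & NFS4_SHARE_ACCESS_WRITE))
    		return NFS4_OPEN_DELEGATE_READ;		/* read-only open */
    	if (!(share_access & NFS4_SHARE_ACCESS_READ))
    		return NFS4_OPEN_DELEGATE_WRITE;	/* write-only open */
    	return NFS4_OPEN_DELEGATE_NONE;			/* read+write open */
    }

The matching lease type follows directly: F_RDLCK for a read delegation, F_WRLCK otherwise, which is the ?: expression passed to setlease() above.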
++ */ ++ nfs4_open_delegation(current_fh, open, stp, &delegflag); ++ open->op_delegate_type = delegflag; ++ + status = nfs_ok; ++ ++ dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", ++ stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, ++ stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + out: + /* take the opportunity to clean up unused state */ + if (fp && list_empty(&fp->fi_perfile)) +@@ -1480,14 +1861,26 @@ + { + struct nfs4_client *clp; + struct nfs4_stateowner *sop; ++ struct nfs4_delegation *dp; + struct list_head *pos, *next; + time_t cutoff = get_seconds() - NFSD_LEASE_TIME; + time_t t, clientid_val = NFSD_LEASE_TIME; +- time_t u, close_val = NFSD_LEASE_TIME; ++ time_t u, test_val = NFSD_LEASE_TIME; + + nfs4_lock_state(); + +- dprintk("NFSD: laundromat service - starting, examining clients\n"); ++ dprintk("NFSD: laundromat service - starting\n"); ++ /* Remove clientid's from recovery directory */ ++ if (first_run) { ++ int status; ++ ++ dprintk("NFSD: laundromat service - FIRST_RUN\n"); ++ status = nfsd4_list_rec_dir(1); ++ if (status < 0) ++ printk("NFSD: error clearing recovery directory %s\n", ++ recovery_dirname); ++ first_run = 0; ++ } + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { +@@ -1498,14 +1891,34 @@ + } + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); ++ if (clp->cl_firststate) ++ nfsd4_remove_clid_file(clp); + expire_client(clp); + } ++ spin_lock(&recall_lock); ++ list_for_each_safe(pos, next, &del_recall_lru) { ++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); ++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_COMPLETE) ++ goto reap; ++ if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { ++ u = dp->dl_time - cutoff; ++ if (test_val > u) ++ test_val = u; ++ break; ++ } ++reap: ++ dprintk("NFSD: purging unused delegation dp %p, fp %p\n", ++ dp, dp->dl_flock); ++ release_delegation(dp); ++ } ++ spin_unlock(&recall_lock); ++ test_val = NFSD_LEASE_TIME; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { + u = sop->so_time - cutoff; +- if (close_val > u) +- close_val = u; ++ if (test_val > u) ++ test_val = u; + break; + } + dprintk("NFSD: purging unused open stateowner (so_id %d)\n", +@@ -1564,21 +1977,81 @@ + return 1; + } + ++static inline int ++access_permit_read(unsigned long access_bmap) ++{ ++ return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || ++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); ++} ++ ++static inline int ++access_permit_write(unsigned long access_bmap) ++{ ++ return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || ++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); ++} ++ ++static ++int nfs4_check_openmode(struct nfs4_stateid *stp, int flags) ++{ ++ int status = nfserr_openmode; ++ ++ if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) ++ goto out; ++ if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) ++ goto out; ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static int ++nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) ++{ ++ int status = nfserr_openmode; ++ ++ if ((flags & WR_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) ++ goto out; ++ if ((flags & RD_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_WRITE)) ++ goto out; ++ status = nfs_ok; 
++out: ++ return status; ++} ++ ++static int ++nfs4_rw_grace(int flags) ++{ ++ return (nfs4_in_grace() && ((flags & RD_STATE) || (flags & WR_STATE))); ++} ++ ++/* ++ * Allow READ/WRITE during grace period on recovered state only for files ++ * that are not able to provide mandatory locking. ++ */ ++static int ++nfs4_check_rw_grace(umode_t mode, int flags) ++{ ++ return (nfs4_rw_grace(flags) && ((mode & S_IXGRP) && (mode & S_ISGID))); ++} + + /* + * Checks for stateid operations + */ + int +-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) ++nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) + { +- struct nfs4_stateid *stp; ++ struct nfs4_stateid *stp = NULL; ++ struct nfs4_delegation *dp = NULL; ++ stateid_t *stidp; ++ struct inode *ino = current_fh->fh_dentry->d_inode; + int status; + + dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", + stateid->si_boot, stateid->si_stateownerid, + stateid->si_fileid, stateid->si_generation); +- +- *stpp = NULL; ++ if (filpp) ++ *filpp = NULL; + + /* STALE STATEID */ + status = nfserr_stale_stateid; +@@ -1587,33 +2060,58 @@ + + /* BAD STATEID */ + status = nfserr_bad_stateid; +- if (!(stp = find_stateid(stateid, flags))) { +- dprintk("NFSD: preprocess_stateid_op: no open stateid!\n"); +- goto out; +- } +- if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { +- dprintk("NFSD: preprocess_stateid_op: fh-stateid mismatch!\n"); +- stp->st_vfs_set = 0; +- goto out; +- } +- if (!stp->st_stateowner->so_confirmed) { +- dprintk("preprocess_stateid_op: lockowner not confirmed yet!\n"); +- goto out; ++ if (!stateid->si_fileid) { /* delegation stateid */ ++ ++ if(!(dp = find_delegation_stateid(ino, stateid))) { ++ dprintk("NFSD: delegation stateid not found\n"); ++ if (nfs4_rw_grace(flags)) ++ status = nfserr_grace; ++ goto out; ++ } ++ stidp = &dp->dl_stateid; ++ } else { /* open or lock stateid */ ++ if (!(stp = find_stateid(stateid, flags))) { ++ dprintk("NFSD: open or lock stateid not found\n"); ++ if (nfs4_rw_grace(flags)) ++ status = nfserr_grace; ++ goto out; ++ } ++ if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) ++ goto out; ++ if (!stp->st_stateowner->so_confirmed) ++ goto out; ++ stidp = &stp->st_stateid; + } +- if (stateid->si_generation > stp->st_stateid.si_generation) { +- dprintk("preprocess_stateid_op: future stateid?!\n"); ++ if (stateid->si_generation > stidp->si_generation) + goto out; +- } + + /* OLD STATEID */ + status = nfserr_old_stateid; +- if (stateid->si_generation < stp->st_stateid.si_generation) { +- dprintk("preprocess_stateid_op: old stateid!\n"); ++ if (stateid->si_generation < stidp->si_generation) + goto out; ++ ++ status = nfserr_grace; ++ if (nfs4_check_rw_grace(ino->i_mode, flags)) ++ goto out; ++ ++ if (stp) { ++ renew_client(stp->st_stateowner->so_client); ++ if ((status = nfs4_check_openmode(stp,flags))) ++ goto out; ++ if (filpp) ++ *filpp = stp->st_vfs_file; ++ } else if (dp) { ++ renew_client(dp->dl_client); ++ if ((status = nfs4_check_delegmode(dp, flags))) ++ goto out; ++ if (flags & DELEG_RET) { ++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } ++ if (filpp && dp && dp->dl_stp) ++ *filpp = dp->dl_stp->st_vfs_file; + } +- *stpp = stp; + status = nfs_ok; +- renew_client(stp->st_stateowner->so_client); + out: + return status; + } +@@ -1750,17 +2248,6 @@ + goto out; + } + +-/* +- * eventually, this will perform an upcall to the 'state 
daemon' as well as +- * set the cl_first_state field. +- */ +-void +-first_state(struct nfs4_client *clp) +-{ +- if (!clp->cl_first_state) +- clp->cl_first_state = get_seconds(); +-} +- + int + nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc) + { +@@ -1793,8 +2280,16 @@ + stp->st_stateid.si_stateownerid, + stp->st_stateid.si_fileid, + stp->st_stateid.si_generation); +- status = nfs_ok; +- first_state(sop->so_client); ++ ++ if (!sop->so_client->cl_firststate) { ++ int err = nfsd4_create_clid_file(sop->so_client); ++ if (!err) { ++ sop->so_client->cl_firststate = 1; ++ dprintk("NFSD: OPEN_CONFIRM firststate set [%.*s]\n", ++ sop->so_client->cl_name.len, ++ sop->so_client->cl_name.data); ++ } ++ } + out: + if (oc->oc_stateowner) + nfs4_get_stateowner(oc->oc_stateowner); +@@ -1912,6 +2407,22 @@ + return status; + } + ++int ++nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr) ++{ ++ int status; ++ ++ if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) ++ goto out; ++ ++ nfs4_lock_state(); ++ status = nfs4_preprocess_stateid_op(current_fh, &dr->dr_stateid, DELEG_RET, NULL); ++ nfs4_unlock_state(); ++out: ++ return status; ++} ++ ++ + /* + * Lock owner state (byte-range locks) + */ +@@ -1938,7 +2449,7 @@ + unsigned int hashval; + + dprintk("NFSD: find_stateid flags 0x%x\n",flags); +- if ((flags & LOCK_STATE) || (flags & RDWR_STATE)) { ++ if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && +@@ -1946,7 +2457,7 @@ + return local; + } + } +- if ((flags & OPEN_STATE) || (flags & RDWR_STATE)) { ++ if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && +@@ -1958,6 +2469,30 @@ + return NULL; + } + ++static struct nfs4_delegation * ++find_delegation_stateid(struct inode *ino, stateid_t *stid) ++{ ++ struct nfs4_delegation *dp = NULL; ++ struct nfs4_file *fp = NULL; ++ u32 st_id; ++ unsigned int fi_hashval; ++ ++ dprintk("NFSD:find_delegation_stateid ino %p, stid %p\n",ino,stid); ++ ++ if(!ino || !stid) ++ return NULL; ++ st_id = stid->si_stateownerid; ++ fi_hashval = file_hashval(ino); ++ if (find_file(fi_hashval, ino, &fp)) { ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ if(dp->dl_stateid.si_stateownerid == st_id) { ++ dprintk("NFSD: find_delegation dp %p\n",dp); ++ return dp; ++ } ++ } ++ } ++ return NULL; ++} + + /* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that +@@ -2085,7 +2620,6 @@ + rp->rp_status = NFSERR_SERVERFAULT; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; +- alloc_lsowner++; + return sop; + } + +@@ -2558,22 +3092,22 @@ + /* + * failure => all reset bets are off, nfserr_no_grace... 
+ */ +-static int +-nfs4_client_to_reclaim(struct nfs4_client *clp) ++int ++nfs4_client_to_reclaim(char *name, int namlen) + { + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + +- crp = alloc_reclaim(clp->cl_name.len); ++ dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", namlen, name); ++ crp = alloc_reclaim(namlen); + if (!crp) + return 0; +- strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len); ++ strhashval = clientstr_hashval(name, namlen); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); +- memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len); +- crp->cr_name.len = clp->cl_name.len; +- crp->cr_first_state = clp->cl_first_state; +- crp->cr_expired = 0; ++ memcpy(crp->cr_name.data, name, namlen); ++ crp->cr_name.len = namlen; ++ reclaim_str_hashtbl_size++; + return 1; + } + +@@ -2618,6 +3152,9 @@ + if (!client) + return NULL; + ++ dprintk("NFSD: nfs4_find_reclaim_client for %.*s\n", ++ clp->cl_name.len, clp->cl_name.data); ++ + /* find clp->cl_name in reclaim_str_hashtbl */ + strhashval = clientstr_hashval(client->cl_name.data, + client->cl_name.len); +@@ -2639,8 +3176,6 @@ + + if ((crp = nfs4_find_reclaim_client(clid)) == NULL) + return nfserr_reclaim_bad; +- if (crp->cr_expired) +- return nfserr_no_grace; + return nfs_ok; + } + +@@ -2657,10 +3192,18 @@ + + if (nfs4_init) + return; ++ if (nfsd4_init_slabs()) ++ BUG(); /* XXXXXX!!! */ + if (!nfs4_reclaim_init) { ++ int status; ++ + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); + reclaim_str_hashtbl_size = 0; ++ nfsd4_init_rec_dir(recovery_dirname); ++ status = nfsd4_list_rec_dir(0); ++ if (status) ++ printk("NFSD: Failure in reading recovery data\n"); + nfs4_reclaim_init = 1; + } + for (i = 0; i < CLIENT_HASH_SIZE; i++) { +@@ -2689,6 +3232,8 @@ + + INIT_LIST_HEAD(&close_lru); + INIT_LIST_HEAD(&client_lru); ++ INIT_LIST_HEAD(&del_recall_lru); ++ spin_lock_init(&recall_lock); + boot_time = get_seconds(); + grace_time = max(old_lease_time, lease_time); + if (reclaim_str_hashtbl_size == 0) +@@ -2725,6 +3270,15 @@ + { + int i; + struct nfs4_client *clp = NULL; ++ struct nfs4_delegation *dp = NULL; ++ struct nfs4_stateowner *sop = NULL; ++ struct list_head *pos, *next; ++ ++ list_for_each_safe(pos, next, &close_lru) { ++ sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); ++ list_del(&sop->so_close_lru); ++ nfs4_put_stateowner(sop); ++ } + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&conf_id_hashtbl[i])) { +@@ -2736,20 +3290,31 @@ + expire_client(clp); + } + } ++ spin_lock(&recall_lock); ++ list_for_each_safe(pos, next, &del_recall_lru) { ++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } ++ spin_unlock(&recall_lock); ++ + release_all_files(); + cancel_delayed_work(&laundromat_work); + flush_scheduled_work(); + nfs4_init = 0; ++ nfs4_reclaim_init = 0; + dprintk("NFSD: list_add_perfile %d list_del_perfile %d\n", + list_add_perfile, list_del_perfile); + dprintk("NFSD: add_perclient %d del_perclient %d\n", + add_perclient, del_perclient); + dprintk("NFSD: alloc_file %d free_file %d\n", + alloc_file, free_file); +- dprintk("NFSD: alloc_sowner %d alloc_lsowner %d free_sowner %d\n", +- alloc_sowner, alloc_lsowner, free_sowner); + dprintk("NFSD: vfsopen %d vfsclose %d\n", + vfsopen, vfsclose); ++ dprintk("NFSD: alloc_delegation %d free_delegation %d\n", ++ alloc_delegation, free_delegation); 
++ alloc_delegation, free_delegation);
++ if (nfsd4_free_slabs()) ++ BUG(); /* XXX? */ + } + + void +@@ -2801,11 +3366,10 @@ + /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */ + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) { +- if (!nfs4_client_to_reclaim(clp)) { ++ if (!nfs4_client_to_reclaim(clp->cl_name.data, clp->cl_name.len)) { + nfs4_release_reclaim(); + goto init_state; + } +- reclaim_str_hashtbl_size++; + } + } + init_state: +Index: linux-2.6.10/fs/nfsd/nfsproc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfsproc.c 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfsproc.c 2005-04-05 14:49:13.426688152 +0800 +@@ -586,7 +586,6 @@ + { nfserr_dquot, -EDQUOT }, + #endif + { nfserr_stale, -ESTALE }, +- { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_dropit, -EAGAIN }, + { nfserr_dropit, -ENOMEM }, +Index: linux-2.6.10/fs/nfsd/nfs4acl.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4acl.c 2004-12-25 05:34:29.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4acl.c 2005-04-05 14:49:13.429687696 +0800 +@@ -89,6 +89,8 @@ + return ret; + } + ++/* modify functions to take NFS errors */ ++ + static int + mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) + { +Index: linux-2.6.10/fs/nfsd/nfs4idmap.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4idmap.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4idmap.c 2005-04-05 14:49:13.414689976 +0800 +@@ -78,9 +78,9 @@ + + #define DefineSimpleCacheLookupMap(STRUCT, FUNC) \ + DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ +- (struct STRUCT *item, int set), /*no setup */, \ ++ (struct STRUCT *item, int set), \ + & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + /* Common entry handling */ + +Index: linux-2.6.10/fs/nfsd/vfs.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/vfs.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/vfs.c 2005-04-05 14:49:13.417689520 +0800 +@@ -304,6 +304,8 @@ + * we need to break all leases. + */ + err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); ++ if (err == -EWOULDBLOCK) ++ err = -ETIMEDOUT; + if (err) /* ENOMEM or EWOULDBLOCK */ + goto out_nfserr; + +@@ -678,6 +680,8 @@ + * This may block while leases are broken. + */ + err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? 
FMODE_WRITE : 0)); ++ if (err == -EWOULDBLOCK) ++ err = -ETIMEDOUT; + if (err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + +@@ -822,21 +826,34 @@ + nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *count) + { +- struct raparms *ra; +- mm_segment_t oldfs; + int err; + struct file *file; +- struct inode *inode; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + if (err) + goto out; ++ err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); ++ ++ nfsd_close(file); ++out: ++ return err; ++} ++ ++int ++nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, ++ loff_t offset, struct kvec *vec, int vlen, unsigned long *count) ++{ ++ struct inode *inode; ++ struct raparms *ra; ++ mm_segment_t oldfs; ++ int err; ++ + err = nfserr_perm; + inode = file->f_dentry->d_inode; + #ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_read(inode, offset, *count))) +- goto out_close; ++ goto out; + #endif + + /* Get readahead parameters */ +@@ -872,8 +889,6 @@ + dnotify_parent(file->f_dentry, DN_ACCESS); + } else + err = nfserrno(err); +-out_close: +- nfsd_close(file); + out: + return err; + } +@@ -888,25 +903,40 @@ + struct kvec *vec, int vlen, + unsigned long cnt, int *stablep) + { +- struct svc_export *exp; + struct file *file; +- struct dentry *dentry; +- struct inode *inode; +- mm_segment_t oldfs; + int err = 0; +- int stable = *stablep; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + if (err) + goto out; + if (!cnt) + goto out_close; ++ ++ err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); ++out_close: ++ nfsd_close(file); ++out: ++ return err; ++} ++ ++int ++nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, ++ loff_t offset, struct kvec *vec, int vlen, ++ unsigned long cnt, int *stablep) ++{ ++ struct svc_export *exp; ++ struct dentry *dentry; ++ struct inode *inode; ++ mm_segment_t oldfs; ++ int err = 0; ++ int stable = *stablep; ++ + err = nfserr_perm; + + #ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_write(file->f_dentry->d_inode, offset, cnt))) +- goto out_close; ++ goto out; + #endif + + dentry = file->f_dentry; +@@ -993,13 +1023,10 @@ + err = 0; + else + err = nfserrno(err); +-out_close: +- nfsd_close(file); + out: + return err; + } + +- + #ifdef CONFIG_NFSD_V3 + /* + * Commit all pending writes to stable storage. +Index: linux-2.6.10/fs/nfsd/nfs4callback.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4callback.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4callback.c 2005-04-05 14:49:13.428687848 +0800 +@@ -0,0 +1,589 @@ ++/* ++ * linux/fs/nfsd/nfs4callback.c ++ * ++ * Copyright (c) 2001 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Kendrick Smith ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inet.h>
++#include <linux/errno.h>
++#include <linux/delay.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfsd/state.h>
++#include <linux/sunrpc/sched.h>
++#include <linux/nfs4.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++#define NFSPROC4_CB_NULL 0
++#define NFSPROC4_CB_COMPOUND 1
++
++/* declarations */
++static void nfs4_cb_null(struct rpc_task *task);
++extern spinlock_t recall_lock;
++
++/* Index of predefined Linux callback client operations */
++
++enum {
++ NFSPROC4_CLNT_CB_NULL = 0,
++ NFSPROC4_CLNT_CB_RECALL,
++};
++
++enum nfs_cb_opnum4 {
++ OP_CB_RECALL = 4,
++};
++
++#define NFS4_MAXTAGLEN 20
++
++#define NFS4_enc_cb_null_sz 0
++#define NFS4_dec_cb_null_sz 0
++#define cb_compound_enc_hdr_sz 4
++#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
++#define op_enc_sz 1
++#define op_dec_sz 2
++#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
++#define enc_stateid_sz 16
++#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
++ 1 + enc_stateid_sz + \
++ enc_nfs4_fh_sz)
++
++#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
++ op_dec_sz)
++
++/*
++* Generic encode routines from fs/nfs/nfs4xdr.c
++*/
++static inline u32 *
++xdr_writemem(u32 *p, const void *ptr, int nbytes)
++{
++ int tmp = XDR_QUADLEN(nbytes);
++ if (!tmp)
++ return p;
++ p[tmp-1] = 0;
++ memcpy(p, ptr, nbytes);
++ return p + tmp;
++}
++
++#define WRITE32(n) *p++ = htonl(n)
++#define WRITEMEM(ptr,nbytes) do { \
++ p = xdr_writemem(p, ptr, nbytes); \
++} while (0)
++#define RESERVE_SPACE(nbytes) do { \
++ p = xdr_reserve_space(xdr, nbytes); \
++ if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
++ BUG_ON(!p); \
++} while (0)
++
++/*
++ * Generic decode routines from fs/nfs/nfs4xdr.c
++ */
++#define DECODE_TAIL \
++ status = 0; \
++out: \
++ return status; \
++xdr_error: \
++ dprintk("NFSD: xdr error!
(%s:%d)\n", __FILE__, __LINE__); \ ++ status = -EIO; \ ++ goto out ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define READTIME(x) do { \ ++ p++; \ ++ (x.tv_sec) = ntohl(*p++); \ ++ (x.tv_nsec) = ntohl(*p++); \ ++} while (0) ++#define READ_BUF(nbytes) do { \ ++ p = xdr_inline_decode(xdr, nbytes); \ ++ if (!p) { \ ++ dprintk("NFSD: %s: reply buffer overflowed in line %d.", \ ++ __FUNCTION__, __LINE__); \ ++ return -EIO; \ ++ } \ ++} while (0) ++ ++struct nfs4_cb_compound_hdr { ++ int status; ++ u32 ident; ++ u32 nops; ++ u32 taglen; ++ char * tag; ++}; ++ ++static struct { ++int stat; ++int errno; ++} nfs_cb_errtbl[] = { ++ { NFS4_OK, 0 }, ++ { NFS4ERR_PERM, EPERM }, ++ { NFS4ERR_NOENT, ENOENT }, ++ { NFS4ERR_IO, EIO }, ++ { NFS4ERR_NXIO, ENXIO }, ++ { NFS4ERR_ACCESS, EACCES }, ++ { NFS4ERR_EXIST, EEXIST }, ++ { NFS4ERR_XDEV, EXDEV }, ++ { NFS4ERR_NOTDIR, ENOTDIR }, ++ { NFS4ERR_ISDIR, EISDIR }, ++ { NFS4ERR_INVAL, EINVAL }, ++ { NFS4ERR_FBIG, EFBIG }, ++ { NFS4ERR_NOSPC, ENOSPC }, ++ { NFS4ERR_ROFS, EROFS }, ++ { NFS4ERR_MLINK, EMLINK }, ++ { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, ++ { NFS4ERR_NOTEMPTY, ENOTEMPTY }, ++ { NFS4ERR_DQUOT, EDQUOT }, ++ { NFS4ERR_STALE, ESTALE }, ++ { NFS4ERR_BADHANDLE, EBADHANDLE }, ++ { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, ++ { NFS4ERR_NOTSUPP, ENOTSUPP }, ++ { NFS4ERR_TOOSMALL, ETOOSMALL }, ++ { NFS4ERR_SERVERFAULT, ESERVERFAULT }, ++ { NFS4ERR_BADTYPE, EBADTYPE }, ++ { NFS4ERR_LOCKED, EAGAIN }, ++ { NFS4ERR_RESOURCE, EREMOTEIO }, ++ { NFS4ERR_SYMLINK, ELOOP }, ++ { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, ++ { NFS4ERR_DEADLOCK, EDEADLK }, ++ { -1, EIO } ++}; ++ ++static int ++nfs_cb_stat_to_errno(int stat) ++{ ++ int i; ++ for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { ++ if (nfs_cb_errtbl[i].stat == stat) ++ return nfs_cb_errtbl[i].errno; ++ } ++ /* If we cannot translate the error, the recovery routines should ++ * handle it. ++ * Note: remaining NFSv4 error codes have values > 10000, so should ++ * not conflict with native Linux error codes. 
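++ * For example, nfs_cb_stat_to_errno(NFS4ERR_BADHANDLE) maps through
++ * the table above to EBADHANDLE (decode_cb_op_hdr() then returns it
++ * negated), while an unlisted status such as NFS4ERR_BAD_SEQID falls
++ * off the end of the table and is handed back unchanged.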
++ */ ++ return stat; ++} ++ ++/* ++ * XDR encode ++ */ ++ ++static int ++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 * p; ++ ++ RESERVE_SPACE(16); ++ WRITE32(0); /* tag length is always 0 */ ++ WRITE32(NFS4_MINOR_VERSION); ++ WRITE32(hdr->ident); ++ WRITE32(hdr->nops); ++ return 0; ++} ++ ++static int ++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) ++{ ++ u32 *p; ++ int len = cb_rec->cbr_fhlen; ++ ++ RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); ++ WRITE32(OP_CB_RECALL); ++ WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); ++ WRITE32(cb_rec->cbr_trunc); ++ WRITE32(len); ++ WRITEMEM(cb_rec->cbr_fhval, len); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p) ++{ ++ struct xdr_stream xdrs, *xdr = &xdrs; ++ ++ xdr_init_encode(&xdrs, &req->rq_snd_buf, p); ++ RESERVE_SPACE(0); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr = { ++ .nops = 1, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ return (encode_cb_recall(&xdr, args)); ++} ++ ++ ++static int ++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ ++ u32 *p; ++ ++ READ_BUF(8); ++ READ32(hdr->status); ++ READ32(hdr->taglen); ++ READ_BUF(hdr->taglen + 4); ++ hdr->tag = (char *)p; ++ p += XDR_QUADLEN(hdr->taglen); ++ READ32(hdr->nops); ++ return 0; ++} ++ ++static int ++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) ++{ ++ u32 *p; ++ u32 op; ++ int32_t nfserr; ++ ++ READ_BUF(8); ++ READ32(op); ++ if (op != expected) { ++ dprintk("NFSD: decode_cb_op_hdr: Callback server returned " ++ " operation %d but we issued a request for %d\n", ++ op, expected); ++ return -EIO; ++ } ++ READ32(nfserr); ++ if (nfserr != NFS_OK) ++ return -nfs_cb_stat_to_errno(nfserr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p) ++{ ++ return 0; ++} ++ ++static int ++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); ++out : ++ return status; ++} ++ ++/* ++ * RPC procedure tables ++ */ ++#ifndef MAX ++# define MAX(a, b) (((a) > (b))? 
(a) : (b)) ++#endif ++ ++#define PROC(proc, call, argtype, restype) \ ++[NFSPROC4_CLNT_##proc] = { \ ++ .p_proc = NFSPROC4_CB_##call, \ ++ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ ++ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ ++ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ ++} ++ ++struct rpc_procinfo nfs4_cb_procedures[] = { ++ PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), ++ PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++}; ++ ++struct rpc_version nfs_cb_version4 = { ++ .number = 1, ++ .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]), ++ .procs = nfs4_cb_procedures ++}; ++ ++static struct rpc_version * nfs_cb_version[] = { ++ NULL, ++ &nfs_cb_version4, ++}; ++ ++/* ++ * Use the SETCLIENTID credential ++ */ ++struct rpc_cred * ++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) ++{ ++ struct auth_cred acred; ++ struct rpc_clnt *clnt = clp->cl_callback.cb_client; ++ struct rpc_cred *ret = NULL; ++ ++ if (!clnt) ++ goto out; ++ get_group_info(clp->cl_cred.cr_group_info); ++ acred.uid = clp->cl_cred.cr_uid; ++ acred.gid = clp->cl_cred.cr_gid; ++ acred.group_info = clp->cl_cred.cr_group_info; ++ ++ dprintk("NFSD: looking up %s cred\n", ++ clnt->cl_auth->au_ops->au_name); ++ ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags); ++ put_group_info(clp->cl_cred.cr_group_info); ++out: ++ return ret; ++} ++ ++/* ++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... ++ */ ++void ++nfsd4_probe_callback(struct nfs4_client *clp) ++{ ++ struct sockaddr_in addr; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ struct rpc_timeout timeparms; ++ struct rpc_xprt * xprt; ++ struct rpc_program * program = &cb->cb_program; ++ struct rpc_stat * stat = &cb->cb_stat; ++ struct rpc_clnt * clnt; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], ++ .rpc_argp = clp, ++ }; ++ char hostname[32]; ++ int status; ++ ++ dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n", ++ cb->cb_parsed, atomic_read(&cb->cb_set)); ++ if (!cb->cb_parsed || atomic_read(&cb->cb_set)) ++ return; ++ ++ /* Initialize address */ ++ memset(&addr, 0, sizeof(addr)); ++ addr.sin_family = AF_INET; ++ addr.sin_port = htons(cb->cb_port); ++ addr.sin_addr.s_addr = htonl(cb->cb_addr); ++ ++ /* Initialize timeout */ ++ timeparms.to_initval = (NFSD_LEASE_TIME/4) * HZ; ++ timeparms.to_retries = 5; ++ timeparms.to_maxval = (NFSD_LEASE_TIME/2) * HZ; ++ timeparms.to_exponential = 1; ++ ++ /* Create RPC transport */ ++ if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) { ++ dprintk("NFSD: couldn't create callback transport!\n"); ++ goto out_err; ++ } ++ ++ /* Initialize rpc_program */ ++ program->name = "nfs4_cb"; ++ program->number = cb->cb_prog; ++ program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]); ++ program->version = nfs_cb_version; ++ program->stats = stat; ++ ++ /* Initialize rpc_stat */ ++ memset(stat, 0, sizeof(struct rpc_stat)); ++ stat->program = program; ++ ++ /* Create RPC client ++ * ++ * XXX AUTH_UNIX only - need AUTH_GSS.... ++ */ ++ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr)); ++ if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) { ++ dprintk("NFSD: couldn't create callback client\n"); ++ goto out_xprt; ++ } ++ clnt->cl_intr = 1; ++ clnt->cl_softrtry = 1; ++ clnt->cl_chatty = 1; ++ ++ /* Kick rpciod, put the call on the wire. 
*/ ++ ++ if (rpciod_up() != 0) { ++ dprintk("nfsd: couldn't start rpciod for callbacks!\n"); ++ goto out_clnt; ++ } ++ ++ /* the task holds a reference to the nfs4_client struct */ ++ cb->cb_client = clnt; ++ atomic_inc(&clp->cl_count); ++ ++ msg.rpc_cred = nfsd4_lookupcred(clp,0); ++ status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL); ++ ++ if (status != 0) { ++ dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); ++ goto out_rpciod; ++ } ++ return; ++ ++out_rpciod: ++ atomic_dec(&clp->cl_count); ++ rpciod_down(); ++out_clnt: ++ rpc_shutdown_client(clnt); ++ goto out_err; ++out_xprt: ++ xprt_destroy(xprt); ++out_err: ++ dprintk("NFSD: warning: no callback path to client %.*s\n", ++ clp->cl_name.len, clp->cl_name.data); ++ cb->cb_client = NULL; ++} ++ ++static void ++nfs4_cb_null(struct rpc_task *task) ++{ ++ struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ u32 addr = htonl(cb->cb_addr); ++ ++ dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); ++ ++ if (task->tk_status < 0) { ++ dprintk("NFSD: callback establishment to client %.*s failed\n", ++ clp->cl_name.len, clp->cl_name.data); ++ goto out; ++ } ++ atomic_set(&cb->cb_set, 1); ++ dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr)); ++out: ++ put_nfs4_client(clp); ++} ++ ++/* ++ * Called with dp->dl_count incremented ++ */ ++static void ++nfs4_cb_recall_done(struct rpc_task *task) ++{ ++ struct nfs4_cb_recall *cbr = (struct nfs4_cb_recall *)task->tk_calldata; ++ struct nfs4_delegation *dp = cbr->cbr_dp; ++ int status; ++ ++ spin_lock(&recall_lock); ++ ++ /* all is well... */ ++ if (task->tk_status == 0) ++ goto out; ++ ++ /* network partition, retry nfsd4_cb_recall once. */ ++ if (task->tk_status == -EIO) { ++ if (atomic_read(&dp->dl_recall_cnt) == 0) ++ goto retry; ++ else ++ /* callback channel no longer available */ ++ atomic_set(&dp->dl_client->cl_callback.cb_set, 0); ++ } ++ ++ /* Race: a recall occurred miliseconds after a delegation was granted. ++ * Client may have received recall prior to delegation. retry recall ++ * once. ++ * XXX what about nfserr_bad_stateid? ++ */ ++ if (task->tk_status == -EBADHANDLE) { ++ if (atomic_read(&dp->dl_recall_cnt) == 0) ++ goto retry; ++ } ++ ++ /* nfs4_laundromat will reap delegation */ ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ ++out: ++ atomic_dec(&dp->dl_count); ++ BUG_ON(atomic_read(&dp->dl_count) < 0); ++ spin_unlock(&recall_lock); ++ return; ++ ++retry: ++ atomic_inc(&dp->dl_recall_cnt); ++ spin_unlock(&recall_lock); ++ /* sleep 2 seconds before retrying recall */ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(2*HZ); ++ status = nfsd4_cb_recall(dp); ++ dprintk("NFSD: nfs4_cb_recall_done: retry status: %d dp %p dl_flock %p\n",status,dp, dp->dl_flock); ++} ++ ++/* ++ * called with dp->dl_count inc'ed. ++ * nfs4_lock_state() may or may not have been called. 
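++ * A typical sequence, as assumed here: nfsd4_probe_callback() has set
++ * up cb_client, a conflicting open bumps dp->dl_count and calls
++ * nfsd4_cb_recall(dp); on -EIO or -EBADHANDLE the completion handler
++ * above retries the recall once (tracked in dl_recall_cnt) before
++ * marking the delegation NFS4_RECALL_COMPLETE for the laundromat.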
++ */ ++int ++nfsd4_cb_recall(struct nfs4_delegation *dp) ++{ ++ struct nfs4_client *clp; ++ struct rpc_clnt *clnt; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], ++ }; ++ struct nfs4_cb_recall *cbr = &dp->dl_recall; ++ int status; ++ ++ dprintk("NFSD: nfsd4_cb_recall NFS4_enc_cb_recall_sz %d NFS4_dec_cb_recall_sz %d \n",NFS4_enc_cb_recall_sz,NFS4_dec_cb_recall_sz); ++ ++ clp = dp->dl_client; ++ clnt = clp->cl_callback.cb_client; ++ status = EIO; ++ if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) ++ goto out_free; ++ ++ msg.rpc_argp = cbr; ++ msg.rpc_resp = cbr; ++ msg.rpc_cred = nfsd4_lookupcred(clp,0); ++ ++ cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ ++ cbr->cbr_dp = dp; ++ ++ if ((status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ nfs4_cb_recall_done, cbr ))) { ++ dprintk("NFSD: recall_delegation: rpc_call_async failed %d\n", ++ status); ++ goto out_fail; ++ } ++out: ++ return status; ++out_fail: ++ status = nfserrno(status); ++ out_free: ++ kfree(cbr); ++ goto out; ++} +Index: linux-2.6.10/fs/nfsd/nfs4proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4proc.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4proc.c 2005-04-05 14:49:13.432687240 +0800 +@@ -461,28 +461,12 @@ + } + + static inline int +-access_bits_permit_read(unsigned long access_bmap) +-{ +- return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || +- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +-} +- +-static inline int +-access_bits_permit_write(unsigned long access_bmap) +-{ +- return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || +- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +-} +- +-static inline int + nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) + { +- struct nfs4_stateid *stp; + int status; ++ struct file *filp; + + /* no need to check permission - this will be done in nfsd_read() */ +- if (nfs4_in_grace()) +- return nfserr_grace; + + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; +@@ -508,21 +492,17 @@ + goto out; + } + /* check stateid */ +- if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, ++ CHECK_FH | RD_STATE, &filp))) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } +- status = nfserr_openmode; +- if (!access_bits_permit_read(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_read: file not opened for read!\n"); +- goto out; +- } + status = nfs_ok; + out: + nfs4_unlock_state(); + read->rd_rqstp = rqstp; + read->rd_fhp = current_fh; ++ read->rd_filp = filp; + return status; + } + +@@ -562,6 +542,8 @@ + { + int status; + ++ if (nfs4_in_grace()) ++ return nfserr_grace; + status = nfsd_unlink(rqstp, current_fh, 0, remove->rm_name, remove->rm_namelen); + if (status == nfserr_symlink) + return nfserr_notdir; +@@ -580,6 +562,9 @@ + + if (!save_fh->fh_dentry) + return status; ++ if (nfs4_in_grace() && !(save_fh->fh_export->ex_flags ++ & NFSEXP_NOSUBTREECHECK)) ++ return nfserr_grace; + status = nfsd_rename(rqstp, save_fh, rename->rn_sname, + rename->rn_snamelen, current_fh, + rename->rn_tname, rename->rn_tnamelen); +@@ -605,12 +590,8 @@ + static inline int + nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr) + { +- struct nfs4_stateid *stp; + int status = nfs_ok; + +- if (nfs4_in_grace()) +- 
return nfserr_grace; +- + if (!current_fh->fh_dentry) + return nfserr_nofilehandle; + +@@ -626,15 +607,10 @@ + nfs4_lock_state(); + if ((status = nfs4_preprocess_stateid_op(current_fh, + &setattr->sa_stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ CHECK_FH | WR_STATE, NULL))) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + goto out_unlock; + } +- status = nfserr_openmode; +- if (!access_bits_permit_write(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_setattr: not opened for write!\n"); +- goto out_unlock; +- } + nfs4_unlock_state(); + } + status = nfs_ok; +@@ -654,14 +630,11 @@ + static inline int + nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write) + { +- struct nfs4_stateid *stp; + stateid_t *stateid = &write->wr_stateid; ++ struct file *filp; + u32 *p; + int status = nfs_ok; + +- if (nfs4_in_grace()) +- return nfserr_grace; +- + /* no need to check permission - this will be done in nfsd_write() */ + + if (write->wr_offset >= OFFSET_MAX) +@@ -677,18 +650,13 @@ + goto zero_stateid; + } + if ((status = nfs4_preprocess_stateid_op(current_fh, stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ CHECK_FH | WR_STATE, &filp))) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + goto out; + } + +- status = nfserr_openmode; +- if (!access_bits_permit_write(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_write: file not open for write!\n"); +- goto out; +- } +- + zero_stateid: ++ + nfs4_unlock_state(); + write->wr_bytes_written = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +@@ -696,9 +664,16 @@ + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + +- status = nfsd_write(rqstp, current_fh, write->wr_offset, +- write->wr_vec, write->wr_vlen, write->wr_buflen, +- &write->wr_how_written); ++ if (filp) ++ status = nfsd_vfs_write(rqstp, current_fh, filp, ++ write->wr_offset, write->wr_vec, ++ write->wr_vlen, write->wr_buflen, ++ &write->wr_how_written); ++ else ++ status = nfsd_write(rqstp, current_fh, write->wr_offset, ++ write->wr_vec, write->wr_vlen, write->wr_buflen, ++ &write->wr_how_written); ++ + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +@@ -872,6 +847,9 @@ + case OP_CREATE: + op->status = nfsd4_create(rqstp, current_fh, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ op->status = nfsd4_delegreturn(rqstp, current_fh, &op->u.delegreturn); ++ break; + case OP_GETATTR: + op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr); + break; +Index: linux-2.6.10/fs/nfsd/export.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/export.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/export.c 2005-04-05 14:49:13.415689824 +0800 +@@ -255,7 +255,7 @@ + new->ek_export = item->ek_export; + } + +-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ ++static DefineSimpleCacheLookup(svc_expkey) + + #define EXPORT_HASHBITS 8 + #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) +@@ -492,8 +492,72 @@ + new->ex_fsid = item->ex_fsid; + } + +-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */ ++struct svc_export * ++svc_export_lookup(struct svc_export *item, int set) ++{ ++ struct svc_export *tmp, *new = NULL; ++ struct cache_head **hp, **head; + ++ head = &svc_export_cache.hash_table[svc_export_hash(item)]; ++retry: ++ if (set||new) ++ write_lock(&svc_export_cache.hash_lock); ++ else ++ read_lock(&svc_export_cache.hash_lock); ++ for(hp=head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = 
container_of(*hp, struct svc_export, h); ++ if (svc_export_match(item, tmp)) { /* found a match */ ++ cache_get(&tmp->h); ++ if (set) { ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else { ++ clear_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (set) ++ cache_fresh(&svc_export_cache, &tmp->h, ++ item->h.expiry_time); ++ if (new) ++ svc_export_put(&new->h, &svc_export_cache); ++ return tmp; ++ } ++ } ++ /* Didn't find anything */ ++ if (new) { ++ svc_export_init(new, item); ++ new->h.next = *head; ++ *head = &new->h; ++ set_bit(CACHE_HASHED, &new->h.flags); ++ svc_export_cache.entries++; ++ if (set) { ++ tmp = new; ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (new && set) ++ cache_fresh(&svc_export_cache, &new->h, item->h.expiry_time); ++ if (new) ++ return new; ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new) { ++ cache_init(&new->h); ++ goto retry; ++ } ++ return NULL; ++} + + struct svc_expkey * + exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) +Index: linux-2.6.10/fs/nfsd/nfssvc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfssvc.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfssvc.c 2005-04-05 14:49:13.422688760 +0800 +@@ -378,4 +378,6 @@ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ ++ .pg_authenticate = &svc_set_client, /* export authentication */ ++ + }; +Index: linux-2.6.10/fs/nfsd/nfs4recover.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4recover.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4recover.c 2005-04-05 14:49:13.430687544 +0800 +@@ -0,0 +1,411 @@ ++/* ++* linux/fs/nfsd/nfs4recover.c ++* ++* Copyright (c) 2004 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. ++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++
++
++#include <linux/sunrpc/svc.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfs4.h>
++#include <linux/nfsd/state.h>
++#include <linux/nfsd/xdr4.h>
++#include <linux/param.h>
++#include <linux/file.h>
++#include <linux/namei.h>
++#include <asm/uaccess.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++/* MAX_FILE_LEN/2 = max client id name length due to changing name
++ * into hex
++ */
++#define MAX_FILE_LEN 256
++
++/* Globals */
++char recovery_dirname[] = "/var/lib/nfs/v4recovery";
++static uid_t saveuid;
++static gid_t savegid;
++static struct nameidata nd_rec_init;
++static int rec_dir_init = 0;
++
++void
++nfs4_save_set_user(void)
++{
++ saveuid = current->fsuid;
++ savegid = current->fsgid;
++ current->fsuid = 0;
++ current->fsgid = 0;
++}
++
++void
++nfs4_reset_user(void)
++{
++ current->fsuid = saveuid;
++ current->fsgid = savegid;
++}
++
++void
++nfs4_make_rec_filename(char **filename, struct nfs4_client *clp)
++{
++ char *fname = *filename;
++ int flen = MAX_FILE_LEN;
++
++ memset(fname, 0, flen);
++ qword_addhex(&fname, &flen, clp->cl_name.data, clp->cl_name.len);
++}
++
++/* XXX need to check dput() mntput ?? */
++int
++nfsd4_create_clid_file(struct nfs4_client *clp)
++{
++ struct file *filp = NULL;
++ struct dentry *dentry;
++ mm_segment_t oldfs;
++ loff_t offset = 0;
++ char fbuf[MAX_FILE_LEN], *fname = fbuf;
++ int status;
++
++
++ if (!rec_dir_init)
++ return -EINVAL;
++ nfs4_save_set_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ /* lock the parent */
++ down(&nd_rec_init.dentry->d_inode->i_sem);
++
++ nfs4_make_rec_filename(&fname, clp);
++ /* dentry->d_count will be 1 */
++ dentry = lookup_one_len(fname, nd_rec_init.dentry, strlen(fname));
++ status = PTR_ERR(dentry);
++ if (IS_ERR(dentry))
++ goto out_unlock;
++
++ status = -EEXIST;
++ if (dentry->d_inode){
++ dprintk("NFSD: nfsd4_create_clid_file: FILE EXISTS\n");
++ goto out_unlock;
++ }
++
++ /* nd_rec_init.dentry->d_count is bumped */
++ status = vfs_create(nd_rec_init.dentry->d_inode, dentry, S_IRWXU, NULL);
++ if (status < 0)
++ goto out_unlock;
++
++ up(&nd_rec_init.dentry->d_inode->i_sem);
++
++ filp = dentry_open(dget(dentry), mntget(nd_rec_init.mnt), O_RDWR);
++ status = PTR_ERR(filp);
++ if (IS_ERR(filp))
++ goto out_mnt;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ status = vfs_write(filp, clp->cl_name.data, clp->cl_name.len, &offset);
++ set_fs(oldfs);
++
++ dprintk("NFSD: nfsd4_create_clid_file vfs_write returns %d\n",status);
++ if (status >= 0)
++ status = nfs_ok;
++
++ if (filp->f_op && filp->f_op->flush) {
++ int err = filp->f_op->flush(filp);
++ dprintk("NFSD: nfsd4_create_clid_file called flush\n");
++ if (!status)
++ status = err;
++ }
++ /* dget and mntget in dentry_open call */
++ fput(filp);
++
++ /* dentry->d_count will be 0 */
++ dput(dentry);
++out_mnt:
++ /* dget in vfs_create call */
++ dput(nd_rec_init.dentry);
++
++out:
++ nfs4_reset_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++
atomic_read(&nd_rec_init.mnt->mnt_count)); ++ dprintk("NFSD: nfsd4_create_clid_file returns %d\n",status); ++ ++ return status; ++ ++out_unlock: ++ up(&nd_rec_init.dentry->d_inode->i_sem); ++ goto out; ++} ++ ++/* ++ * called with pdentry->d_inode->i_sem held ? ++ */ ++int ++nfsd4_unlink_rec_file(char *name, int namlen) ++{ ++ struct dentry *dentry; ++ int type, status; ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file. name %.*s\n", namlen, name); ++ ++ dentry = lookup_one_len(name, nd_rec_init.dentry, namlen); ++ dprintk("NFSD: nfsd4_unlink_rec_file POST LOOKUP nd_rec d_count %d\n", ++ atomic_read(&nd_rec_init.dentry->d_count)); ++ status = PTR_ERR(dentry); ++ if (IS_ERR(dentry)) ++ goto out; ++ ++ status = -ENOENT; ++ if (!dentry->d_inode) { ++ dput(dentry); ++ goto out; ++ } ++ ++ /* should only be files here! */ ++ type = dentry->d_inode->i_mode & S_IFMT; ++ status = -EISDIR; ++ if (!(type & S_IFREG)) { ++ dput(dentry); ++ goto out; ++ } ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file PRE VFS UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dentry); ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file POST VFS UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file FILE dentry->d_count %d\n", ++ atomic_read(&dentry->d_count)); ++out: ++ dprintk("NFSD: nfsd4_unlink_rec_file returns %d\n",status); ++ return status; ++} ++ ++void ++nfsd4_remove_clid_file(struct nfs4_client *clp) ++{ ++ char fbuf[MAX_FILE_LEN], *fname = fbuf; ++ int status; ++ ++ if (!rec_dir_init) ++ return; ++ ++ dprintk("NFSD: nfsd4_remove_clid_file client %.*s\n", ++ clp->cl_name.len,clp->cl_name.data); ++ ++ nfs4_save_set_user(); ++ ++ dprintk("NFSD: nfsd4_remove_clid_file IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ nfs4_make_rec_filename(&fname, clp); ++ status = nfsd4_unlink_rec_file(fname, strlen(fname)); ++ nfs4_reset_user(); ++ if (status != nfs_ok) ++ printk("NFSD: Failed to remove expired client state file %.*s from %s\n", strlen(fname), fname, recovery_dirname); ++ ++ dprintk("NFSD: nfsd4_remove_clid_file OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ return; ++} ++ ++struct rec_dirent { ++ int clear; ++}; ++ ++/* ++ * on reboot, stuff the reclaim hash with known client id's. ++ * ++ * the filename may not equal the clid. the clid might be the first ++ * (and so far only) line of data in the file. ++ * ++ * i will probably end up writing data such as the setclientid principal ++ * to each clid file. if i do, i will always put the clid as the ++ * first line of data. 
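++ * for example, a client whose id is the 7-byte string "client1" is
++ * tracked via a file whose name qword_addhex() builds from those
++ * bytes (hex 636c69656e7431), while nfsd4_create_clid_file() writes
++ * the raw id itself as the file's contents.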
++ */ ++ ++int ++nfsd4_get_recdir_dirent(struct rec_dirent *rdirent, const char *name, ++ int namlen, loff_t offset, ino_t ino, unsigned int d_type) ++{ ++ struct dentry *dclid; ++ struct file *filp; ++ mm_segment_t oldfs; ++ int status = nfs_ok; ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent name %.*s, clear %d\n", ++ namlen, name, rdirent->clear); ++ ++ if (name && isdotent(name, namlen)) ++ goto out; ++ ++ dclid = lookup_one_len(name, nd_rec_init.dentry, namlen); ++ status = PTR_ERR(dclid); ++ if(IS_ERR(dclid)) ++ goto out; ++ ++ if (rdirent->clear){ ++ dprintk("NFSD: nfsd4_get_recdir_dirent REMOVE\n"); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent PRE VFS_UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dclid); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent POST VFS_UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ } else { ++ char buf[MAX_FILE_LEN]; ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent READ\n"); ++ ++ filp = dentry_open(dclid, mntget(nd_rec_init.mnt), O_RDWR); ++ if (IS_ERR(filp)) { ++ status = PTR_ERR(filp); ++ goto out; ++ } ++ ++ memset(buf, 0, MAX_FILE_LEN); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ status = vfs_read(filp, buf, MAX_FILE_LEN, &filp->f_pos); ++ set_fs(oldfs); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent vfs_read returns %d\n", ++ status); ++ if (status > 0) ++ status = nfs4_client_to_reclaim(buf, status); ++ fput(filp); ++ } ++out: ++ dprintk("NFSD:nfsd4_get_recdir_dirent OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent returns %d\n",status); ++ return 0; ++} ++ ++int ++nfsd4_list_rec_dir(int clear) ++{ ++ struct file *filp; ++ struct rec_dirent rdirent; ++ int status; ++ ++ if (!rec_dir_init) ++ return -EINVAL; ++ ++ nfs4_save_set_user(); ++ ++ dprintk("NFSD: nfsd4_list_rec_dir IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ /* open directory */ ++ filp = dentry_open(dget(nd_rec_init.dentry), mntget(nd_rec_init.mnt), ++ O_RDWR); ++ status = PTR_ERR(filp); ++ if (IS_ERR(filp)) ++ goto out; ++ rdirent.clear = clear; ++ ++ /* read the directory entries into memory */ ++ status = vfs_readdir(filp, (filldir_t) nfsd4_get_recdir_dirent, ++ (void*)&rdirent); ++ ++ fput(filp); ++out: ++ dprintk("NFSD: nfsd4_list_rec_dir OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_list_rec_dir DONE status: %d\n", status); ++ ++ nfs4_reset_user(); ++ return status; ++} ++ ++ ++/* ++ * Hold reference to the recovery directory. 
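++ * path_lookup() below pins nd_rec_init.dentry and nd_rec_init.mnt for
++ * the life of the server instance; nfsd4_shutdown_rec_dir() drops both
++ * references again via path_release().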
++ */ ++ ++void ++nfsd4_init_rec_dir(char *rec_dirname) ++{ ++ int status; ++ ++ printk("NFSD: Using %s as the NFSv4 state recovery directory\n", ++ rec_dirname); ++ ++ nfs4_save_set_user(); ++ ++ status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &nd_rec_init); ++ ++ printk("NFSD: nfsd4_init_rec_dir INITIAL recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ if (!status) ++ rec_dir_init = 1; ++ nfs4_reset_user(); ++ printk("NFSD: nfsd4_init_rec_dir rec_dir_init %d\n", rec_dir_init); ++} ++ ++void ++nfsd4_shutdown_rec_dir(void) ++{ ++ rec_dir_init = 0; ++ path_release(&nd_rec_init); ++ ++ printk("NFSD: nfsd4_shutdown_rec_dir FINAL recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++} +Index: linux-2.6.10/fs/nfsd/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/Makefile 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/Makefile 2005-04-05 14:49:13.431687392 +0800 +@@ -8,5 +8,5 @@ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ +- nfs4acl.o ++ nfs4acl.o nfs4callback.o nfs4recover.o + nfsd-objs := $(nfsd-y) +Index: linux-2.6.10/fs/nfs/nfs4xdr.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4xdr.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4xdr.c 2005-04-05 14:49:13.452684200 +0800 +@@ -82,12 +82,16 @@ + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define encode_getattr_maxsz (op_encode_hdr_maxsz + 3) ++#define nfs4_fattr_bitmap_maxsz 3 ++#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +-#define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz) +-#define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \ +- nfs4_fattr_bitmap_maxsz) ++/* This is based on getfattr, which uses the most attributes: */ ++#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ ++ 3 + 3 + 3 + 2 * nfs4_name_maxsz)) ++#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ ++ nfs4_fattr_value_maxsz) ++#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) + #define encode_savefh_maxsz (op_encode_hdr_maxsz) + #define decode_savefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +@@ -122,11 +126,11 @@ + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_name_maxsz + \ + nfs4_path_maxsz + \ +- nfs4_fattr_bitmap_maxsz) ++ nfs4_fattr_maxsz) + #define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) + #define encode_create_maxsz (op_encode_hdr_maxsz + \ + 2 + nfs4_name_maxsz + \ +- nfs4_fattr_bitmap_maxsz) ++ nfs4_fattr_maxsz) + #define decode_create_maxsz (op_decode_hdr_maxsz + 8) + #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) + #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +@@ -205,7 +209,7 @@ + #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + op_encode_hdr_maxsz + 4 + \ +- nfs4_fattr_bitmap_maxsz + \ ++ nfs4_fattr_maxsz + \ + encode_getattr_maxsz) + #define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + 
\ +@@ -360,6 +364,20 @@ + encode_delegreturn_maxsz) + #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_delegreturn_maxsz) ++#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 1) ++#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ op_encode_hdr_maxsz + 4 + \ ++ nfs4_fattr_bitmap_maxsz + 1) ++#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + + static struct { + unsigned int mode; +@@ -459,7 +477,7 @@ + * In the worst-case, this would be + * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 36 bytes, plus any contribution from variable-length fields +- * such as owner/group/acl's. ++ * such as owner/group. + */ + len = 16; + +@@ -1083,6 +1101,27 @@ + return 0; + } + ++extern nfs4_stateid zero_stateid; ++ ++static int ++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(4+sizeof(zero_stateid.data)); ++ WRITE32(OP_SETATTR); ++ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); ++ RESERVE_SPACE(2*4); ++ WRITE32(1); ++ WRITE32(FATTR4_WORD0_ACL); ++ if (arg->acl_len % 4) ++ return -EINVAL; ++ RESERVE_SPACE(4); ++ WRITE32(arg->acl_len); ++ xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); ++ return 0; ++} ++ + static int + encode_savefh(struct xdr_stream *xdr) + { +@@ -1627,6 +1666,34 @@ + } + + /* ++ * Encode a GETACL request ++ */ ++static int ++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_getaclargs *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_auth; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int replen, status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, args->fh); ++ if (status) ++ goto out; ++ status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); ++ /* set up reply buffer: */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, ++ args->acl_pages, args->acl_pgbase, args->acl_len); ++out: ++ return status; ++} ++ ++/* + * Encode a WRITE request + */ + static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) +@@ -3122,6 +3189,46 @@ + return decode_op_hdr(xdr, OP_RENEW); + } + ++static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, ++ ssize_t *acl_len) ++{ ++ uint32_t *savep; ++ uint32_t attrlen, ++ bitmap[2] = {0}; ++ struct kvec *iov = req->rq_rcv_buf.head; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto out; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto out; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto out; ++ ++ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) ++ return -EIO; ++ if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { ++ int hdrlen, recvd; ++ ++ /* We ignore &savep and don't do consistency checks on ++ * the attr length. Let userspace figure it out.... 
*/ ++ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; ++ recvd = req->rq_rcv_buf.len - hdrlen; ++ if (attrlen > recvd) { ++ printk(KERN_WARNING "NFS: server cheating in getattr" ++ " acl reply: attrlen %u > recvd %u\n", ++ attrlen, recvd); ++ return -EINVAL; ++ } ++ if (attrlen <= *acl_len) ++ xdr_read_pages(xdr, attrlen); ++ *acl_len = attrlen; ++ } ++ ++out: ++ return status; ++} ++ + static int + decode_savefh(struct xdr_stream *xdr) + { +@@ -3413,6 +3520,71 @@ + + } + ++/* ++ * Encode an SETACL request ++ */ ++static int ++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, args->fh); ++ if (status) ++ goto out; ++ status = encode_setacl(&xdr, args); ++out: ++ return status; ++} ++/* ++ * Decode SETACL response ++ */ ++static int ++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_setattr(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETACL response ++ */ ++static int ++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, ssize_t *acl_len) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_getacl(&xdr, rqstp, acl_len); ++ ++out: ++ return status; ++} + + /* + * Decode CLOSE response +@@ -4009,6 +4181,8 @@ + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), ++ PROC(GETACL, enc_getacl, dec_getacl), ++ PROC(SETACL, enc_setacl, dec_setacl), + }; + + struct rpc_version nfs_version4 = { +Index: linux-2.6.10/fs/nfs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/inode.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/nfs/inode.c 2005-04-05 14:49:13.445685264 +0800 +@@ -486,13 +486,27 @@ + if (error < 0) + goto out_err; + +- buf->f_frsize = server->wtmult; ++ /* ++ * Current versions of glibc do not correctly handle the ++ * case where f_frsize != f_bsize. Eventually we want to ++ * report the value of wtmult in this field. ++ */ ++ buf->f_frsize = sb->s_blocksize; ++ ++ /* ++ * On most *nix systems, f_blocks, f_bfree, and f_bavail ++ * are reported in units of f_frsize. Linux hasn't had ++ * an f_frsize field in its statfs struct until recently, ++ * thus historically Linux's sys_statfs reports these ++ * fields in units of f_bsize. 
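++ * As a worked example: with s_blocksize = 4096 and res.tbytes = 1 GiB,
++ * the arithmetic below yields f_blocks = (2^30 + 4095) >> 12 = 262144
++ * units of f_frsize (= f_bsize = 4096) each.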
++ */ + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; ++ + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + +@@ -565,9 +579,9 @@ + + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) +- nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; ++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + else +- nfsi->flags |= NFS_INO_INVALID_ATTR; ++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + } + + /* +@@ -605,7 +619,7 @@ + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; +- if (is_bad_inode(inode)) ++ if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; + } +@@ -664,7 +678,7 @@ + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ +- inode->i_op = &nfs_file_inode_operations; ++ inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; +@@ -766,13 +780,8 @@ + vmtruncate(inode, attr->ia_size); + } + } +- if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { +- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; +- if (*cred) { +- put_rpccred(*cred); +- *cred = NULL; +- } +- } ++ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) ++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS; + nfs_end_data_update(inode); + unlock_kernel(); + return error; +@@ -949,14 +958,14 @@ + lock_kernel(); + if (!inode || is_bad_inode(inode)) + goto out_nowait; +- if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) ++ if (NFS_STALE(inode)) + goto out_nowait; + + while (NFS_REVALIDATING(inode)) { + status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING); + if (status < 0) + goto out_nowait; +- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOAC) ++ if (NFS_ATTRTIMEO(inode) == 0) + continue; + if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) + continue; +@@ -968,14 +977,14 @@ + /* Protect against RPC races by saving the change attribute */ + verifier = nfs_save_change_attribute(inode); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); +- if (status) { ++ if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { +- NFS_FLAGS(inode) |= NFS_INO_STALE; +- if (inode != inode->i_sb->s_root->d_inode) +- remove_inode_hash(inode); ++ nfs_zap_caches(inode); ++ if (!S_ISDIR(inode->i_mode)) ++ NFS_FLAGS(inode) |= NFS_INO_STALE; + } + goto out; + } +@@ -1014,7 +1023,6 @@ + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + +- NFS_FLAGS(inode) &= ~NFS_INO_STALE; + out: + NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; + wake_up(&nfsi->nfs_i_wait); +@@ -1161,7 +1169,7 @@ + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) + || inode->i_uid != fattr->uid + || inode->i_gid != fattr->gid) +- nfsi->flags |= NFS_INO_INVALID_ATTR; ++ nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + + /* Has the link count changed? 
*/ + if (inode->i_nlink != fattr->nlink) +@@ -1270,7 +1278,7 @@ + #endif + nfsi->change_attr = fattr->change_attr; + if (!data_unstable) +- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; ++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + } + + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); +@@ -1278,14 +1286,8 @@ + + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || + inode->i_uid != fattr->uid || +- inode->i_gid != fattr->gid) { +- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; +- if (*cred) { +- put_rpccred(*cred); +- *cred = NULL; +- } +- invalid |= NFS_INO_INVALID_ATTR; +- } ++ inode->i_gid != fattr->gid) ++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + + inode->i_mode = fattr->mode; + inode->i_nlink = fattr->nlink; +@@ -1335,7 +1337,8 @@ + */ + nfs_invalidate_inode(inode); + out_err: +- return -EIO; ++ NFS_FLAGS(inode) |= NFS_INO_STALE; ++ return -ESTALE; + } + + /* +@@ -1449,8 +1452,6 @@ + + kill_anon_super(s); + +- nfs4_renewd_prepare_shutdown(server); +- + if (server->client != NULL && !IS_ERR(server->client)) + rpc_shutdown_client(server->client); + if (server->client_sys != NULL && !IS_ERR(server->client_sys)) +@@ -1461,8 +1462,6 @@ + + rpciod_down(); /* release rpciod */ + +- destroy_nfsv4_state(server); +- + if (server->hostname != NULL) + kfree(server->hostname); + kfree(server); +@@ -1478,8 +1477,53 @@ + + #ifdef CONFIG_NFS_V4 + +-static void nfs4_clear_inode(struct inode *); ++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" ++ ++int ++nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, ++ size_t buflen, int flags) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EINVAL; ++ ++ if (!S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++ ++ return nfs4_proc_set_acl(inode, buf, buflen); ++} ++ ++/* The getxattr man page suggests returning -ENODATA for unknown attributes, ++ * and that's what we'll do for e.g. user attributes that haven't been set. ++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported ++ * attributes in kernel-managed attribute namespaces. 
*/ ++ssize_t ++nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, ++ size_t buflen) ++{ ++ struct inode *inode = dentry->d_inode; + ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EOPNOTSUPP; ++ ++ return nfs4_proc_get_acl(inode, buf, buflen); ++} ++ ++ssize_t ++nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) ++{ ++ ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; ++ ++ if (buf && buflen < len) ++ return -ERANGE; ++ if (buf) ++ memcpy(buf, XATTR_NAME_NFSV4_ACL, len); ++ return len; ++} ++ ++static void nfs4_clear_inode(struct inode *); + + static struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, +@@ -1543,9 +1587,6 @@ + server->wsize = nfs_block_size(data->wsize, NULL); + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + +- /* NFSv4 doesn't use NLM locking */ +- server->flags |= NFS_MOUNT_NONLM; +- + server->acregmin = data->acregmin*HZ; + server->acregmax = data->acregmax*HZ; + server->acdirmin = data->acdirmin*HZ; +@@ -1790,8 +1831,22 @@ + + static void nfs4_kill_super(struct super_block *sb) + { ++ struct nfs_server *server = NFS_SB(sb); ++ + nfs_return_all_delegations(sb); +- nfs_kill_super(sb); ++ kill_anon_super(sb); ++ ++ nfs4_renewd_prepare_shutdown(server); ++ ++ if (server->client != NULL && !IS_ERR(server->client)) ++ rpc_shutdown_client(server->client); ++ rpciod_down(); /* release rpciod */ ++ ++ destroy_nfsv4_state(server); ++ ++ if (server->hostname != NULL) ++ kfree(server->hostname); ++ kfree(server); + } + + static struct file_system_type nfs4_fs_type = { +@@ -1821,9 +1876,13 @@ + extern int nfs_init_nfspagecache(void); + extern void nfs_destroy_nfspagecache(void); + extern int nfs_init_readpagecache(void); +-extern int nfs_destroy_readpagecache(void); ++extern void nfs_destroy_readpagecache(void); + extern int nfs_init_writepagecache(void); +-extern int nfs_destroy_writepagecache(void); ++extern void nfs_destroy_writepagecache(void); ++#ifdef CONFIG_NFS_DIRECTIO ++extern int nfs_init_directcache(void); ++extern void nfs_destroy_directcache(void); ++#endif + + static kmem_cache_t * nfs_inode_cachep; + +@@ -1904,6 +1963,12 @@ + if (err) + goto out1; + ++#ifdef CONFIG_NFS_DIRECTIO ++ err = nfs_init_directcache(); ++ if (err) ++ goto out0; ++#endif ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1914,8 +1979,14 @@ + goto out; + return 0; + out: ++#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); ++#endif + nfs_destroy_writepagecache(); ++#ifdef CONFIG_NFS_DIRECTIO ++out0: ++ nfs_destroy_directcache(); ++#endif + out1: + nfs_destroy_readpagecache(); + out2: +@@ -1928,6 +1999,9 @@ + + static void __exit exit_nfs_fs(void) + { ++#ifdef CONFIG_NFS_DIRECTIO ++ nfs_destroy_directcache(); ++#endif + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); +Index: linux-2.6.10/fs/nfs/nfs4state.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4state.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4state.c 2005-04-05 14:49:13.446685112 +0800 +@@ -445,7 +445,7 @@ + state->owner = owner; + atomic_inc(&owner->so_count); + list_add(&state->inode_states, &nfsi->open_states); +- state->inode = inode; ++ state->inode = igrab(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); +@@ -471,6 +471,7 @@ + list_del(&state->inode_states); + spin_unlock(&inode->i_lock); + list_del(&state->open_states); ++ iput(inode); + BUG_ON (state->state != 0); + nfs4_free_open_state(state); + 
nfs4_put_state_owner(owner); +@@ -486,7 +487,6 @@ + struct nfs4_state_owner *owner = state->owner; + struct nfs4_client *clp = owner->so_client; + int newstate; +- int status = 0; + + atomic_inc(&owner->so_count); + down_read(&clp->cl_sem); +@@ -508,10 +508,8 @@ + newstate |= FMODE_WRITE; + if (state->state == newstate) + goto out; +- if (newstate != 0) +- status = nfs4_do_downgrade(inode, state, newstate); +- else +- status = nfs4_do_close(inode, state); ++ if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS) ++ return; + } + out: + nfs4_put_open_state(state); +Index: linux-2.6.10/fs/nfs/idmap.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/idmap.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/fs/nfs/idmap.c 2005-04-05 14:49:13.454683896 +0800 +@@ -80,6 +80,7 @@ + static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); + void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); ++static void idmap_pipe_release(struct inode *inode); + + static unsigned int fnvhash32(const void *, size_t); + +@@ -87,6 +88,7 @@ + .upcall = idmap_pipe_upcall, + .downcall = idmap_pipe_downcall, + .destroy_msg = idmap_pipe_destroy_msg, ++ .release_pipe = idmap_pipe_release, + }; + + void +@@ -448,6 +450,19 @@ + up(&idmap->idmap_im_lock); + } + ++static void ++idmap_pipe_release(struct inode *inode) ++{ ++ struct rpc_inode *rpci = RPC_I(inode); ++ struct idmap *idmap = (struct idmap *)rpci->private; ++ struct idmap_msg *im = &idmap->idmap_im; ++ ++ down(&idmap->idmap_im_lock); ++ im->im_status = IDMAP_STATUS_LOOKUPFAIL; ++ wake_up(&idmap->idmap_wq); ++ up(&idmap->idmap_im_lock); ++} ++ + /* + * Fowler/Noll/Vo hash + * http://www.isthe.com/chongo/tech/comp/fnv/ +Index: linux-2.6.10/fs/nfs/dir.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/dir.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/nfs/dir.c 2005-04-05 14:49:13.439686176 +0800 +@@ -40,8 +40,6 @@ + static int nfs_opendir(struct inode *, struct file *); + static int nfs_readdir(struct file *, void *, filldir_t); + static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +-static int nfs_cached_lookup(struct inode *, struct dentry *, +- struct nfs_fh *, struct nfs_fattr *); + static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); + static int nfs_mkdir(struct inode *, struct dentry *, int); + static int nfs_rmdir(struct inode *, struct dentry *); +@@ -92,6 +90,9 @@ + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, ++ .getxattr = nfs4_getxattr, ++ .setxattr = nfs4_setxattr, ++ .listxattr = nfs4_listxattr, + }; + + #endif /* CONFIG_NFS_V4 */ +@@ -294,24 +295,13 @@ + return res; + } + +-static unsigned int nfs_type2dtype[] = { +- DT_UNKNOWN, +- DT_REG, +- DT_DIR, +- DT_BLK, +- DT_CHR, +- DT_LNK, +- DT_SOCK, +- DT_UNKNOWN, +- DT_FIFO +-}; +- +-static inline +-unsigned int nfs_type_to_d_type(enum nfs_ftype type) ++static inline unsigned int dt_type(struct inode *inode) + { +- return nfs_type2dtype[type]; ++ return (inode->i_mode >> 12) & 15; + } + ++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); ++ + /* + * Once we've found the start of the dirent within a page: fill 'er up... 
+ */ +@@ -321,6 +311,7 @@ + { + struct file *file = desc->file; + struct nfs_entry *entry = desc->entry; ++ struct dentry *dentry = NULL; + unsigned long fileid; + int loop_count = 0, + res; +@@ -333,9 +324,16 @@ + * retrieving the current dirent on the server */ + fileid = nfs_fileid_to_ino_t(entry->ino); + ++ /* Get a dentry if we have one */ ++ if (dentry != NULL) ++ dput(dentry); ++ dentry = nfs_readdir_lookup(desc); ++ + /* Use readdirplus info */ +- if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) +- d_type = nfs_type_to_d_type(entry->fattr->type); ++ if (dentry != NULL && dentry->d_inode != NULL) { ++ d_type = dt_type(dentry->d_inode); ++ fileid = dentry->d_inode->i_ino; ++ } + + res = filldir(dirent, entry->name, entry->len, + entry->prev_cookie, fileid, d_type); +@@ -352,7 +350,8 @@ + } + } + dir_page_release(desc); +- ++ if (dentry != NULL) ++ dput(dentry); + dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res); + return res; + } +@@ -615,24 +614,10 @@ + goto out_valid; + } + +- /* +- * Note: we're not holding inode->i_sem and so may be racing with +- * operations that change the directory. We therefore save the +- * change attribute *before* we do the RPC call. +- */ +- verifier = nfs_save_change_attribute(dir); +- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); +- if (!error) { +- if (nfs_compare_fh(NFS_FH(inode), &fhandle)) +- goto out_bad; +- if (nfs_lookup_verify_inode(inode, isopen)) +- goto out_zap_parent; +- goto out_valid_renew; +- } +- + if (NFS_STALE(inode)) + goto out_bad; + ++ verifier = nfs_save_change_attribute(dir); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (error) + goto out_bad; +@@ -641,7 +626,6 @@ + if ((error = nfs_refresh_inode(inode, &fattr)) != 0) + goto out_bad; + +- out_valid_renew: + nfs_renew_times(dentry); + nfs_set_verifier(dentry, verifier); + out_valid: +@@ -723,6 +707,7 @@ + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) + { ++ struct dentry *res; + struct inode *inode = NULL; + int error; + struct nfs_fh fhandle; +@@ -731,11 +716,11 @@ + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + +- error = -ENAMETOOLONG; ++ res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + +- error = -ENOMEM; ++ res = ERR_PTR(-ENOMEM); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + lock_kernel(); +@@ -746,29 +731,27 @@ + if (nfs_is_exclusive_create(dir, nd)) + goto no_entry; + +- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); +- if (error != 0) { +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, +- &fhandle, &fattr); +- if (error == -ENOENT) +- goto no_entry; +- if (error != 0) +- goto out_unlock; ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ if (error == -ENOENT) ++ goto no_entry; ++ if (error < 0) { ++ res = ERR_PTR(error); ++ goto out_unlock; + } +- error = -EACCES; ++ res = ERR_PTR(-EACCES); + inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); + if (!inode) + goto out_unlock; + no_entry: +- error = 0; +- d_add(dentry, inode); ++ res = d_add_unique(dentry, inode); ++ if (res != NULL) ++ dentry = res; + nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_unlock: + unlock_kernel(); + out: +- BUG_ON(error > 0); +- return ERR_PTR(error); ++ return res; + } + + #ifdef CONFIG_NFS_V4 +@@ -798,15 +781,15 @@ + + static struct dentry 
*nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) + { ++ struct dentry *res = NULL; + struct inode *inode = NULL; +- int error = 0; + + /* Check that we are indeed trying to open this file */ + if (!is_atomic_open(dir, nd)) + goto no_open; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { +- error = -ENAMETOOLONG; ++ res = ERR_PTR(-ENAMETOOLONG); + goto out; + } + dentry->d_op = NFS_PROTO(dir)->dentry_ops; +@@ -828,7 +811,7 @@ + inode = nfs4_atomic_open(dir, dentry, nd); + unlock_kernel(); + if (IS_ERR(inode)) { +- error = PTR_ERR(inode); ++ int error = PTR_ERR(inode); + switch (error) { + /* Make a negative dentry */ + case -ENOENT: +@@ -841,16 +824,18 @@ + /* case -EISDIR: */ + /* case -EINVAL: */ + default: ++ res = ERR_PTR(error); + goto out; + } + } + no_entry: +- d_add(dentry, inode); ++ res = d_add_unique(dentry, inode); ++ if (res != NULL) ++ dentry = res; + nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out: +- BUG_ON(error > 0); +- return ERR_PTR(error); ++ return res; + no_open: + return nfs_lookup(dir, dentry, nd); + } +@@ -906,83 +891,51 @@ + } + #endif /* CONFIG_NFSV4 */ + +-static inline +-int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) ++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) + { ++ struct dentry *parent = desc->file->f_dentry; ++ struct inode *dir = parent->d_inode; + struct nfs_entry *entry = desc->entry; +- int status; +- +- while((status = dir_decode(desc)) == 0) { +- if (entry->len != dentry->d_name.len) +- continue; +- if (memcmp(entry->name, dentry->d_name.name, entry->len)) +- continue; +- if (!(entry->fattr->valid & NFS_ATTR_FATTR)) +- continue; +- break; +- } +- return status; +-} +- +-/* +- * Use the cached Readdirplus results in order to avoid a LOOKUP call +- * whenever we believe that the parent directory has not changed. +- * +- * We assume that any file creation/rename changes the directory mtime. +- * As this results in a page cache invalidation whenever it occurs, +- * we don't require any other tests for cache coherency. 
+- */ +-static +-int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, +- struct nfs_fh *fh, struct nfs_fattr *fattr) +-{ +- nfs_readdir_descriptor_t desc; +- struct nfs_server *server; +- struct nfs_entry entry; +- struct page *page; +- unsigned long timestamp; +- int res; +- +- if (!NFS_USE_READDIRPLUS(dir)) +- return -ENOENT; +- server = NFS_SERVER(dir); +- /* Don't use readdirplus unless the cache is stable */ +- if ((server->flags & NFS_MOUNT_NOAC) != 0 +- || nfs_caches_unstable(dir) +- || nfs_attribute_timeout(dir)) +- return -ENOENT; +- if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0) +- return -ENOENT; +- timestamp = NFS_I(dir)->readdir_timestamp; +- +- entry.fh = fh; +- entry.fattr = fattr; +- +- desc.decode = NFS_PROTO(dir)->decode_dirent; +- desc.entry = &entry; +- desc.page_index = 0; +- desc.plus = 1; +- +- for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) { +- +- res = -EIO; +- if (PageUptodate(page)) { +- void * kaddr = kmap_atomic(page, KM_USER0); +- desc.ptr = kaddr; +- res = find_dirent_name(&desc, page, dentry); +- kunmap_atomic(kaddr, KM_USER0); +- } +- page_cache_release(page); ++ struct dentry *dentry, *alias; ++ struct qstr name = { ++ .name = entry->name, ++ .len = entry->len, ++ }; ++ struct inode *inode; + +- if (res == 0) +- goto out_found; +- if (res != -EAGAIN) ++ switch (name.len) { ++ case 2: ++ if (name.name[0] == '.' && name.name[1] == '.') ++ return dget_parent(parent); + break; ++ case 1: ++ if (name.name[0] == '.') ++ return dget(parent); ++ } ++ name.hash = full_name_hash(name.name, name.len); ++ dentry = d_lookup(parent, &name); ++ if (dentry != NULL) ++ return dentry; ++ if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) ++ return NULL; ++ /* Note: caller is already holding the dir->i_sem! */ ++ dentry = d_alloc(parent, &name); ++ if (dentry == NULL) ++ return NULL; ++ dentry->d_op = NFS_PROTO(dir)->dentry_ops; ++ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); ++ if (!inode) { ++ dput(dentry); ++ return NULL; + } +- return -ENOENT; +- out_found: +- fattr->timestamp = timestamp; +- return 0; ++ alias = d_add_unique(dentry, inode); ++ if (alias != NULL) { ++ dput(dentry); ++ dentry = alias; ++ } ++ nfs_renew_times(dentry); ++ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ++ return dentry; + } + + /* +@@ -1045,15 +998,9 @@ + if (nd && (nd->flags & LOOKUP_CREATE)) + open_flags = nd->intent.open.flags; + +- /* +- * The 0 argument passed into the create function should one day +- * contain the O_EXCL flag if requested. This allows NFSv3 to +- * select the appropriate create strategy. Currently open_namei +- * does not pass the create flags. 
+- */ + lock_kernel(); + nfs_begin_data_update(dir); +- inode = NFS_PROTO(dir)->create(dir, &dentry->d_name, &attr, open_flags); ++ inode = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + nfs_end_data_update(dir); + if (!IS_ERR(inode)) { + d_instantiate(dentry, inode); +@@ -1508,7 +1455,7 @@ + + if (cache->cred != cred + || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) +- || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) ++ || (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS)) + return -ENOENT; + memcpy(res, cache, sizeof(*res)); + return 0; +@@ -1522,6 +1469,7 @@ + if (cache->cred) + put_rpccred(cache->cred); + cache->cred = get_rpccred(set->cred); ++ NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS; + } + cache->jiffies = set->jiffies; + cache->mask = set->mask; +Index: linux-2.6.10/fs/nfs/unlink.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/unlink.c 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/fs/nfs/unlink.c 2005-04-05 14:49:13.435686784 +0800 +@@ -215,7 +215,6 @@ + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); +- if (data->task.tk_rpcwait == &nfs_delete_queue) +- rpc_wake_up_task(&data->task); ++ rpc_wake_up_task(&data->task); + nfs_put_unlinkdata(data); + } +Index: linux-2.6.10/fs/nfs/write.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/write.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/write.c 2005-04-05 14:49:13.443685568 +0800 +@@ -61,7 +61,6 @@ + #include + #include + #include +-#include + + #include "delegation.h" + +@@ -83,49 +82,17 @@ + static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); + + static kmem_cache_t *nfs_wdata_cachep; +-static mempool_t *nfs_wdata_mempool; +-static mempool_t *nfs_commit_mempool; ++mempool_t *nfs_wdata_mempool; ++mempool_t *nfs_commit_mempool; + + static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); + +-static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) +-{ +- struct nfs_write_data *p; +- p = (struct nfs_write_data *)mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); +- if (p) { +- memset(p, 0, sizeof(*p)); +- INIT_LIST_HEAD(&p->pages); +- } +- return p; +-} +- +-static __inline__ void nfs_writedata_free(struct nfs_write_data *p) +-{ +- mempool_free(p, nfs_wdata_mempool); +-} +- +-static void nfs_writedata_release(struct rpc_task *task) ++void nfs_writedata_release(struct rpc_task *task) + { + struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; + nfs_writedata_free(wdata); + } + +-static __inline__ struct nfs_write_data *nfs_commit_alloc(void) +-{ +- struct nfs_write_data *p; +- p = (struct nfs_write_data *)mempool_alloc(nfs_commit_mempool, SLAB_NOFS); +- if (p) { +- memset(p, 0, sizeof(*p)); +- INIT_LIST_HEAD(&p->pages); +- } +- return p; +-} +- +-static __inline__ void nfs_commit_free(struct nfs_write_data *p) +-{ +- mempool_free(p, nfs_commit_mempool); +-} +- + /* Adjust the file length if we're writing beyond the end */ + static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) + { +@@ -184,11 +151,10 @@ + int result, written = 0; + struct nfs_write_data *wdata; + +- wdata = kmalloc(sizeof(*wdata), GFP_NOFS); ++ wdata = nfs_writedata_alloc(); + if (!wdata) + return -ENOMEM; + +- memset(wdata, 0, sizeof(*wdata)); + wdata->flags = how; + wdata->cred = ctx->cred; + wdata->inode = inode; +@@ -238,8 +204,7 @@ + + io_error: + nfs_end_data_update_defer(inode); +- +- 
kfree(wdata); ++ nfs_writedata_free(wdata); + return written ? written : result; + } + +@@ -1199,7 +1164,8 @@ + } + if (time_before(complain, jiffies)) { + printk(KERN_WARNING +- "NFS: Server wrote less than requested.\n"); ++ "NFS: Server wrote zero bytes, expected %u.\n", ++ argp->count); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it except throw an error. */ +Index: linux-2.6.10/fs/nfs/proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/proc.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/fs/nfs/proc.c 2005-04-05 14:49:13.440686024 +0800 +@@ -63,12 +63,12 @@ + dprintk("%s: call getattr\n", __FUNCTION__); + fattr->valid = 0; + status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); +- dprintk("%s: reply getattr %d\n", __FUNCTION__, status); ++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __FUNCTION__); + status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0); +- dprintk("%s: reply statfs %d\n", __FUNCTION__, status); ++ dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; +@@ -96,7 +96,7 @@ + fattr->valid = 0; + status = rpc_call(server->client, NFSPROC_GETATTR, + fhandle, fattr, 0); +- dprintk("NFS reply getattr\n"); ++ dprintk("NFS reply getattr: %d\n", status); + return status; + } + +@@ -114,7 +114,7 @@ + dprintk("NFS call setattr\n"); + fattr->valid = 0; + status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); +- dprintk("NFS reply setattr\n"); ++ dprintk("NFS reply setattr: %d\n", status); + return status; + } + +@@ -213,15 +213,15 @@ + } + + static struct inode * +-nfs_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, ++nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) + { + struct nfs_fh fhandle; + struct nfs_fattr fattr; + struct nfs_createargs arg = { + .fh = NFS_FH(dir), +- .name = name->name, +- .len = name->len, ++ .name = dentry->d_name.name, ++ .len = dentry->d_name.len, + .sattr = sattr + }; + struct nfs_diropok res = { +@@ -231,7 +231,7 @@ + int status; + + fattr.valid = 0; +- dprintk("NFS call create %s\n", name->name); ++ dprintk("NFS call create %s\n", dentry->d_name.name); + status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + dprintk("NFS reply create: %d\n", status); + if (status == 0) { +@@ -620,6 +620,7 @@ + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +Index: linux-2.6.10/fs/nfs/callback.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/callback.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/fs/nfs/callback.c 2005-04-05 14:49:13.436686632 +0800 +@@ -139,133 +139,10 @@ + return ret; + } + +-/* +- * AUTH_NULL authentication +- */ +-static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp) +-{ +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- struct kvec *resv = &rqstp->rq_res.head[0]; +- +- if (argv->iov_len < 3*4) +- return SVC_GARBAGE; +- +- if (svc_getu32(argv) != 0) { +- dprintk("svc: bad null cred\n"); +- *authp = rpc_autherr_badcred; +- return SVC_DENIED; +- } +- if (svc_getu32(argv) != RPC_AUTH_NULL || 
svc_getu32(argv) != 0) { +- dprintk("svc: bad null verf\n"); +- *authp = rpc_autherr_badverf; +- return SVC_DENIED; +- } +- +- /* Signal that mapping to nobody uid/gid is required */ +- rqstp->rq_cred.cr_uid = (uid_t) -1; +- rqstp->rq_cred.cr_gid = (gid_t) -1; +- rqstp->rq_cred.cr_group_info = groups_alloc(0); +- if (rqstp->rq_cred.cr_group_info == NULL) +- return SVC_DROP; /* kmalloc failure - client must retry */ +- +- /* Put NULL verifier */ +- svc_putu32(resv, RPC_AUTH_NULL); +- svc_putu32(resv, 0); +- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); +- return SVC_OK; +-} +- +-static int nfs_callback_null_release(struct svc_rqst *rqstp) +-{ +- if (rqstp->rq_cred.cr_group_info) +- put_group_info(rqstp->rq_cred.cr_group_info); +- rqstp->rq_cred.cr_group_info = NULL; +- return 0; /* don't drop */ +-} +- +-static struct auth_ops nfs_callback_auth_null = { +- .name = "null", +- .flavour = RPC_AUTH_NULL, +- .accept = nfs_callback_null_accept, +- .release = nfs_callback_null_release, +-}; +- +-/* +- * AUTH_SYS authentication +- */ +-static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp) +-{ +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- struct kvec *resv = &rqstp->rq_res.head[0]; +- struct svc_cred *cred = &rqstp->rq_cred; +- u32 slen, i; +- int len = argv->iov_len; +- +- dprintk("%s: start\n", __FUNCTION__); +- cred->cr_group_info = NULL; +- rqstp->rq_client = NULL; +- if ((len -= 3*4) < 0) +- return SVC_GARBAGE; +- +- /* Get length, time stamp and machine name */ +- svc_getu32(argv); +- svc_getu32(argv); +- slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); +- if (slen > 64 || (len -= (slen + 3)*4) < 0) +- goto badcred; +- argv->iov_base = (void*)((u32*)argv->iov_base + slen); +- argv->iov_len -= slen*4; +- +- cred->cr_uid = ntohl(svc_getu32(argv)); +- cred->cr_gid = ntohl(svc_getu32(argv)); +- slen = ntohl(svc_getu32(argv)); +- if (slen > 16 || (len -= (slen + 2)*4) < 0) +- goto badcred; +- cred->cr_group_info = groups_alloc(slen); +- if (cred->cr_group_info == NULL) +- return SVC_DROP; +- for (i = 0; i < slen; i++) +- GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv)); +- +- if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { +- *authp = rpc_autherr_badverf; +- return SVC_DENIED; +- } +- /* Put NULL verifier */ +- svc_putu32(resv, RPC_AUTH_NULL); +- svc_putu32(resv, 0); +- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); +- return SVC_OK; +-badcred: +- *authp = rpc_autherr_badcred; +- return SVC_DENIED; +-} +- +-static int nfs_callback_unix_release(struct svc_rqst *rqstp) +-{ +- if (rqstp->rq_cred.cr_group_info) +- put_group_info(rqstp->rq_cred.cr_group_info); +- rqstp->rq_cred.cr_group_info = NULL; +- return 0; +-} +- +-static struct auth_ops nfs_callback_auth_unix = { +- .name = "unix", +- .flavour = RPC_AUTH_UNIX, +- .accept = nfs_callback_unix_accept, +- .release = nfs_callback_unix_release, +-}; +- +-/* +- * Hook the authentication protocol +- */ +-static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp) ++static int nfs_callback_authenticate(struct svc_rqst *rqstp) + { + struct in_addr *addr = &rqstp->rq_addr.sin_addr; + struct nfs4_client *clp; +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- int flavour; +- int retval; + + /* Don't talk to strangers */ + clp = nfs4_find_client(addr); +@@ -273,34 +150,19 @@ + return SVC_DROP; + dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr)); + nfs4_put_client(clp); +- flavour = ntohl(svc_getu32(argv)); +- switch(flavour) { ++ switch 
(rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: +- if (rqstp->rq_proc != CB_NULL) { +- *authp = rpc_autherr_tooweak; +- retval = SVC_DENIED; +- break; +- } +- rqstp->rq_authop = &nfs_callback_auth_null; +- retval = nfs_callback_null_accept(rqstp, authp); ++ if (rqstp->rq_proc != CB_NULL) ++ return SVC_DENIED; + break; + case RPC_AUTH_UNIX: +- /* Eat the authentication flavour */ +- rqstp->rq_authop = &nfs_callback_auth_unix; +- retval = nfs_callback_unix_accept(rqstp, authp); + break; ++ case RPC_AUTH_GSS: ++ /* FIXME: RPCSEC_GSS handling? */ + default: +- /* FIXME: need to add RPCSEC_GSS upcalls */ +-#if 0 +- svc_ungetu32(argv); +- retval = svc_authenticate(rqstp, authp); +-#else +- *authp = rpc_autherr_rejectedcred; +- retval = SVC_DENIED; +-#endif ++ return SVC_DENIED; + } +- dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval); +- return retval; ++ return SVC_OK; + } + + /* +@@ -321,5 +183,5 @@ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, +- .pg_authenticate = nfs_callback_auth, ++ .pg_authenticate = nfs_callback_authenticate, + }; +Index: linux-2.6.10/fs/nfs/file.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/file.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/fs/nfs/file.c 2005-04-05 14:49:13.453684048 +0800 +@@ -67,6 +67,19 @@ + .setattr = nfs_setattr, + }; + ++#ifdef CONFIG_NFS_V4 ++ ++struct inode_operations nfs4_file_inode_operations = { ++ .permission = nfs_permission, ++ .getattr = nfs_getattr, ++ .setattr = nfs_setattr, ++ .getxattr = nfs4_getxattr, ++ .setxattr = nfs4_setxattr, ++ .listxattr = nfs4_listxattr, ++}; ++ ++#endif /* CONFIG_NFS_V4 */ ++ + /* Hack for future NFS swap support */ + #ifndef IS_SWAPFILE + # define IS_SWAPFILE(inode) (0) +@@ -295,10 +308,19 @@ + static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) + { + struct inode *inode = filp->f_mapping->host; +- int status; ++ int status = 0; + + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ else { ++ struct file_lock *cfl = posix_test_lock(filp, fl); ++ if (cfl != NULL) { ++ memcpy(fl, cfl, sizeof(*fl)); ++ fl->fl_type = F_UNLCK; ++ } ++ } + unlock_kernel(); + return status; + } +@@ -325,7 +347,11 @@ + * still need to complete the unlock. + */ + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ else ++ status = posix_lock_file_wait(filp, fl); + rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset); + return status; + } +@@ -351,15 +377,19 @@ + return status; + + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); +- /* If we were signalled we still need to ensure that +- * we clean up any state on the server. We therefore +- * record the lock call as having succeeded in order to +- * ensure that locks_remove_posix() cleans it out when +- * the process exits. 
+- */ +- if (status == -EINTR || status == -ERESTARTSYS) +- posix_lock_file(filp, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* If we were signalled we still need to ensure that ++ * we clean up any state on the server. We therefore ++ * record the lock call as having succeeded in order to ++ * ensure that locks_remove_posix() cleans it out when ++ * the process exits. ++ */ ++ if (status == -EINTR || status == -ERESTARTSYS) ++ posix_lock_file(filp, fl); ++ } else ++ status = posix_lock_file_wait(filp, fl); + unlock_kernel(); + if (status < 0) + return status; +@@ -396,15 +426,6 @@ + if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + +- if (NFS_PROTO(inode)->version != 4) { +- /* Fake OK code if mounted without NLM support */ +- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) { +- if (IS_GETLK(cmd)) +- return LOCK_USE_CLNT; +- return 0; +- } +- } +- + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by +Index: linux-2.6.10/fs/nfs/nfs3proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs3proc.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs3proc.c 2005-04-05 14:49:13.441685872 +0800 +@@ -80,10 +80,10 @@ + dprintk("%s: call fsinfo\n", __FUNCTION__); + info->fattr->valid = 0; + status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); +- dprintk("%s: reply fsinfo %d\n", __FUNCTION__, status); ++ dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); +- dprintk("%s: reply getattr %d\n", __FUNCTION__, status); ++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + } + return status; + } +@@ -101,7 +101,7 @@ + fattr->valid = 0; + status = rpc_call(server->client, NFS3PROC_GETATTR, + fhandle, fattr, 0); +- dprintk("NFS reply getattr\n"); ++ dprintk("NFS reply getattr: %d\n", status); + return status; + } + +@@ -119,7 +119,7 @@ + dprintk("NFS call setattr\n"); + fattr->valid = 0; + status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); +- dprintk("NFS reply setattr\n"); ++ dprintk("NFS reply setattr: %d\n", status); + return status; + } + +@@ -198,7 +198,7 @@ + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } +- dprintk("NFS reply access, status = %d\n", status); ++ dprintk("NFS reply access: %d\n", status); + return status; + } + +@@ -296,7 +296,7 @@ + * For now, we don't implement O_EXCL. 
+ */ + static struct inode * +-nfs3_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, ++nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) + { + struct nfs_fh fhandle; +@@ -304,8 +304,8 @@ + struct nfs_fattr dir_attr; + struct nfs3_createargs arg = { + .fh = NFS_FH(dir), +- .name = name->name, +- .len = name->len, ++ .name = dentry->d_name.name, ++ .len = dentry->d_name.len, + .sattr = sattr, + }; + struct nfs3_diropres res = { +@@ -315,7 +315,7 @@ + }; + int status; + +- dprintk("NFS call create %s\n", name->name); ++ dprintk("NFS call create %s\n", dentry->d_name.name); + arg.createmode = NFS3_CREATE_UNCHECKED; + if (flags & O_EXCL) { + arg.createmode = NFS3_CREATE_EXCLUSIVE; +@@ -353,7 +353,7 @@ + if (status != 0) + goto out; + if (fhandle.size == 0 || !(fattr.valid & NFS_ATTR_FATTR)) { +- status = nfs3_proc_lookup(dir, name, &fhandle, &fattr); ++ status = nfs3_proc_lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (status != 0) + goto out; + } +@@ -838,6 +838,7 @@ + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +Index: linux-2.6.10/fs/nfs/nfs4proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4proc.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4proc.c 2005-04-05 14:49:13.456683592 +0800 +@@ -477,7 +477,7 @@ + /* + * Returns an nfs4_state + an referenced inode + */ +-static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) ++static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) + { + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; +@@ -491,7 +491,7 @@ + struct nfs_openargs o_arg = { + .fh = NFS_FH(dir), + .open_flags = flags, +- .name = name, ++ .name = &dentry->d_name, + .server = server, + .bitmask = server->attr_bitmask, + .claim = NFS4_OPEN_CLAIM_NULL, +@@ -581,14 +581,14 @@ + } + + +-struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred) ++struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred) + { + struct nfs4_exception exception = { }; + struct nfs4_state *res; + int status; + + do { +- status = _nfs4_do_open(dir, name, flags, sattr, cred, &res); ++ status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res); + if (status == 0) + break; + /* NOTE: BAD_SEQID means the server and client disagree about the +@@ -635,6 +635,8 @@ + + fattr->valid = 0; + ++ if (state != NULL) ++ msg.rpc_cred = state->owner->so_cred; + if (sattr->ia_valid & ATTR_SIZE) + nfs4_copy_stateid(&arg.stateid, state, NULL); + else +@@ -658,6 +660,61 @@ + return err; + } + ++struct nfs4_closedata { ++ struct inode *inode; ++ struct nfs4_state *state; ++ struct nfs_closeargs arg; ++ struct nfs_closeres res; ++}; ++ ++static void nfs4_close_done(struct rpc_task *task) ++{ ++ struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; ++ struct nfs4_state *state = calldata->state; ++ struct nfs4_state_owner *sp = state->owner; ++ struct nfs_server *server = NFS_SERVER(calldata->inode); ++ ++ /* hmm. 
we are done with the inode, and in the process of freeing ++ * the state_owner. we keep this around to process errors ++ */ ++ nfs4_increment_seqid(task->tk_status, sp); ++ switch (task->tk_status) { ++ case 0: ++ state->state = calldata->arg.open_flags; ++ memcpy(&state->stateid, &calldata->res.stateid, ++ sizeof(state->stateid)); ++ break; ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ state->state = calldata->arg.open_flags; ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ break; ++ default: ++ if (nfs4_async_handle_error(task, server) == -EAGAIN) { ++ rpc_restart_call(task); ++ return; ++ } ++ } ++ nfs4_put_open_state(state); ++ up(&sp->so_sema); ++ nfs4_put_state_owner(sp); ++ up_read(&server->nfs4_state->cl_sem); ++ kfree(calldata); ++} ++ ++static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], ++ .rpc_argp = &calldata->arg, ++ .rpc_resp = &calldata->res, ++ .rpc_cred = calldata->state->owner->so_cred, ++ }; ++ if (calldata->arg.open_flags != 0) ++ msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; ++ return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata); ++} ++ + /* + * It is possible for data to be read/written from a mem-mapped file + * after the sys_close call (which hits the vfs layer as a flush). +@@ -669,102 +726,34 @@ + * + * NOTE: Caller must be holding the sp->so_owner semaphore! + */ +-static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state) ++int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) + { +- struct nfs4_state_owner *sp = state->owner; +- int status = 0; +- struct nfs_closeargs arg = { +- .fh = NFS_FH(inode), +- }; +- struct nfs_closeres res; +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], +- .rpc_argp = &arg, +- .rpc_resp = &res, +- }; ++ struct nfs4_closedata *calldata; ++ int status; + +- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ /* Tell caller we're done */ ++ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { ++ state->state = mode; + return 0; +- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); ++ } ++ calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL); ++ if (calldata == NULL) ++ return -ENOMEM; ++ calldata->inode = inode; ++ calldata->state = state; ++ calldata->arg.fh = NFS_FH(inode); + /* Serialization for the sequence id */ +- arg.seqid = sp->so_seqid, +- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); +- +- /* hmm. we are done with the inode, and in the process of freeing +- * the state_owner. we keep this around to process errors ++ calldata->arg.seqid = state->owner->so_seqid; ++ calldata->arg.open_flags = mode; ++ memcpy(&calldata->arg.stateid, &state->stateid, ++ sizeof(calldata->arg.stateid)); ++ status = nfs4_close_call(NFS_SERVER(inode)->client, calldata); ++ /* ++ * Return -EINPROGRESS on success in order to indicate to the ++ * caller that an asynchronous RPC call has been launched, and ++ * that it will release the semaphores on completion. 
+ */ +- nfs4_increment_seqid(status, sp); +- if (!status) +- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); +- +- return status; +-} +- +-int nfs4_do_close(struct inode *inode, struct nfs4_state *state) +-{ +- struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; +- int err; +- do { +- err = _nfs4_do_close(inode, state); +- switch (err) { +- case -NFS4ERR_STALE_STATEID: +- case -NFS4ERR_EXPIRED: +- nfs4_schedule_state_recovery(server->nfs4_state); +- err = 0; +- default: +- state->state = 0; +- } +- err = nfs4_handle_exception(server, err, &exception); +- } while (exception.retry); +- return err; +-} +- +-static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) +-{ +- struct nfs4_state_owner *sp = state->owner; +- int status = 0; +- struct nfs_closeargs arg = { +- .fh = NFS_FH(inode), +- .seqid = sp->so_seqid, +- .open_flags = mode, +- }; +- struct nfs_closeres res; +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE], +- .rpc_argp = &arg, +- .rpc_resp = &res, +- }; +- +- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) +- return 0; +- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); +- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); +- nfs4_increment_seqid(status, sp); +- if (!status) +- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); +- +- return status; +-} +- +-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) +-{ +- struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; +- int err; +- do { +- err = _nfs4_do_downgrade(inode, state, mode); +- switch (err) { +- case -NFS4ERR_STALE_STATEID: +- case -NFS4ERR_EXPIRED: +- nfs4_schedule_state_recovery(server->nfs4_state); +- err = 0; +- default: +- state->state = mode; +- } +- err = nfs4_handle_exception(server, err, &exception); +- } while (exception.retry); +- return err; ++ return (status == 0) ? 
-EINPROGRESS : status;
+ }
+
+ struct inode *
+@@ -785,7 +774,7 @@
+ }
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, &dentry->d_name, nd->intent.open.flags, &attr, cred);
++ state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
+ put_rpccred(cred);
+ if (IS_ERR(state))
+ return (struct inode *)state;
+@@ -802,7 +791,7 @@
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+ state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
+ if (IS_ERR(state))
+- state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
++ state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
+ put_rpccred(cred);
+ if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
+ return 1;
+@@ -1026,7 +1015,7 @@
+ FMODE_WRITE, cred);
+ if (IS_ERR(state))
+ state = nfs4_do_open(dentry->d_parent->d_inode,
+- &dentry->d_name, FMODE_WRITE,
++ dentry, FMODE_WRITE,
+ NULL, cred);
+ need_iput = 1;
+ }
+@@ -1327,7 +1316,7 @@
+ */
+
+ static struct inode *
+-nfs4_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr,
++nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+ {
+ struct inode *inode;
+@@ -1335,7 +1324,7 @@
+ struct rpc_cred *cred;
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, name, flags, sattr, cred);
++ state = nfs4_do_open(dir, dentry, flags, sattr, cred);
+ put_rpccred(cred);
+ if (!IS_ERR(state)) {
+ inode = state->inode;
+@@ -2049,6 +2038,86 @@
+ }
+
+ static int
++nfs4_server_supports_acls(struct nfs_server *server)
++{
++ return (server->caps & NFS_CAP_ACLS)
++ && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
++ && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
++}
++
++/* XXX: assuming XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE,
++ * and that it's OK to put sizeof(void *) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE)
++ * bytes on the stack. (Currently probably both true.)
++ */ ++#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) ++ ++static void buf_to_pages(const void *buf, ssize_t buflen, ++ struct page **pages, unsigned int *pgbase) ++{ ++ const void *p = buf; ++ ++ *pgbase = offset_in_page(buf); ++ p -= *pgbase; ++ while (p < buf + buflen) { ++ *(pages++) = virt_to_page(p); ++ p += PAGE_CACHE_SIZE; ++ } ++} ++ ++ssize_t ++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct page *pages[NFS4ACL_MAXPAGES]; ++ struct nfs_getaclargs args = { ++ .fh = NFS_FH(inode), ++ .acl_pages = pages, ++ .acl_len = buflen, ++ }; ++ ssize_t acl_len = buflen; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], ++ .rpc_argp = &args, ++ .rpc_resp = &acl_len, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); ++ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); ++ if (buflen && acl_len > buflen) ++ return -ERANGE; ++ if (ret == 0) ++ ret = acl_len; ++ return ret; ++} ++ ++int ++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct page *pages[NFS4ACL_MAXPAGES]; ++ struct nfs_setaclargs arg = { ++ .fh = NFS_FH(inode), ++ .acl_pages = pages, ++ .acl_len = buflen, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], ++ .rpc_argp = &arg, ++ .rpc_resp = NULL, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ++ ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); ++ return ret; ++} ++ ++static int + nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) + { + struct nfs4_client *clp = server->nfs4_state; +@@ -2589,6 +2658,7 @@ + .version = 4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, ++ .file_inode_ops = &nfs4_file_inode_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +Index: linux-2.6.10/fs/nfs/direct.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/direct.c 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/direct.c 2005-04-05 14:49:13.448684808 +0800 +@@ -33,6 +33,7 @@ + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel ++ * 15 Sep 2004 Parallel async reads --cel + * + */ + +@@ -43,6 +44,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -50,11 +52,27 @@ + + #include + #include ++#include + + #define NFSDBG_FACILITY NFSDBG_VFS +-#define VERF_SIZE (2 * sizeof(__u32)) + #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) + ++static kmem_cache_t *nfs_direct_cachep; ++ ++/* ++ * This represents a set of asynchronous requests that we're waiting on ++ */ ++struct nfs_direct_req { ++ struct kref kref; /* release manager */ ++ struct list_head list; /* nfs_read_data structs */ ++ wait_queue_head_t wait; /* wait for i/o completion */ ++ struct page ** pages; /* pages in our buffer */ ++ unsigned int npages; /* count of pages */ ++ atomic_t complete, /* i/os we're waiting for */ ++ count, /* bytes actually processed */ ++ error; /* any reported error */ ++}; ++ + + /** + * nfs_get_user_pages - find and set up pages 
underlying user's buffer +@@ -71,7 +89,8 @@ + unsigned long page_count; + size_t array_size; + +- /* set an arbitrary limit to prevent arithmetic overflow */ ++ /* set an arbitrary limit to prevent type overflow */ ++ /* XXX: this can probably be as large as INT_MAX */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; +@@ -95,6 +114,8 @@ + /** + * nfs_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer ++ * @npages: number of pages in the array ++ * @do_dirty: dirty the pages as we release them + */ + static void + nfs_free_user_pages(struct page **pages, int npages, int do_dirty) +@@ -109,77 +130,231 @@ + } + + /** +- * nfs_direct_read_seg - Read in one iov segment. Generate separate +- * read RPCs for each "rsize" bytes. ++ * nfs_direct_req_release - release nfs_direct_req structure for direct read ++ * @kref: kref object embedded in an nfs_direct_req structure ++ * ++ */ ++static void nfs_direct_req_release(struct kref *kref) ++{ ++ struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); ++ kmem_cache_free(nfs_direct_cachep, dreq); ++} ++ ++/** ++ * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read ++ * @count: count of bytes for the read request ++ * @rsize: local rsize setting ++ * ++ * Note we also set the number of requests we have in the dreq when we are ++ * done. This prevents races with I/O completion so we will always wait ++ * until all requests have been dispatched and completed. ++ */ ++static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize) ++{ ++ struct list_head *list; ++ struct nfs_direct_req *dreq; ++ unsigned int reads = 0; ++ ++ dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); ++ if (!dreq) ++ return NULL; ++ ++ kref_init(&dreq->kref); ++ init_waitqueue_head(&dreq->wait); ++ INIT_LIST_HEAD(&dreq->list); ++ atomic_set(&dreq->count, 0); ++ atomic_set(&dreq->error, 0); ++ ++ list = &dreq->list; ++ for(;;) { ++ struct nfs_read_data *data = nfs_readdata_alloc(); ++ ++ if (unlikely(!data)) { ++ while (!list_empty(list)) { ++ data = list_entry(list->next, ++ struct nfs_read_data, pages); ++ list_del(&data->pages); ++ nfs_readdata_free(data); ++ } ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ return NULL; ++ } ++ ++ INIT_LIST_HEAD(&data->pages); ++ list_add(&data->pages, list); ++ ++ data->req = (struct nfs_page *) dreq; ++ reads++; ++ if (nbytes <= rsize) ++ break; ++ nbytes -= rsize; ++ } ++ kref_get(&dreq->kref); ++ atomic_set(&dreq->complete, reads); ++ return dreq; ++} ++ ++/** ++ * nfs_direct_read_result - handle a read reply for a direct read request ++ * @data: address of NFS READ operation control block ++ * @status: status of this NFS READ operation ++ * ++ * We must hold a reference to all the pages in this direct read request ++ * until the RPCs complete. This could be long *after* we are woken up in ++ * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server). 
++ */ ++static void nfs_direct_read_result(struct nfs_read_data *data, int status) ++{ ++ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; ++ ++ if (likely(status >= 0)) ++ atomic_add(data->res.count, &dreq->count); ++ else ++ atomic_set(&dreq->error, status); ++ ++ if (unlikely(atomic_dec_and_test(&dreq->complete))) { ++ nfs_free_user_pages(dreq->pages, dreq->npages, 1); ++ wake_up(&dreq->wait); ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ } ++} ++ ++/** ++ * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read ++ * @dreq: address of nfs_direct_req struct for this request + * @inode: target inode + * @ctx: target file open context +- * user_addr: starting address of this segment of user's buffer +- * count: size of this segment +- * file_offset: offset in file to begin the operation +- * @pages: array of addresses of page structs defining user's buffer +- * nr_pages: size of pages array ++ * @user_addr: starting address of this segment of user's buffer ++ * @count: size of this segment ++ * @file_offset: offset in file to begin the operation ++ * ++ * For each nfs_read_data struct that was allocated on the list, dispatch ++ * an NFS READ operation + */ +-static int +-nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx, +- unsigned long user_addr, size_t count, loff_t file_offset, +- struct page **pages, int nr_pages) +-{ +- const unsigned int rsize = NFS_SERVER(inode)->rsize; +- int tot_bytes = 0; +- int curpage = 0; +- struct nfs_read_data rdata = { +- .inode = inode, +- .cred = ctx->cred, +- .args = { +- .fh = NFS_FH(inode), +- .context = ctx, +- }, +- .res = { +- .fattr = &rdata.fattr, +- }, +- }; ++static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, ++ struct inode *inode, struct nfs_open_context *ctx, ++ unsigned long user_addr, size_t count, loff_t file_offset) ++{ ++ struct list_head *list = &dreq->list; ++ struct page **pages = dreq->pages; ++ unsigned int curpage, pgbase; ++ unsigned int rsize = NFS_SERVER(inode)->rsize; + +- rdata.args.pgbase = user_addr & ~PAGE_MASK; +- rdata.args.offset = file_offset; +- do { +- int result; +- +- rdata.args.count = count; +- if (rdata.args.count > rsize) +- rdata.args.count = rsize; +- rdata.args.pages = &pages[curpage]; +- +- dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", +- rdata.args.count, (long long) rdata.args.offset, +- user_addr + tot_bytes, rdata.args.pgbase, curpage); ++ curpage = 0; ++ pgbase = user_addr & ~PAGE_MASK; ++ do { ++ struct nfs_read_data *data; ++ unsigned int bytes; ++ ++ bytes = rsize; ++ if (count < rsize) ++ bytes = count; ++ ++ data = list_entry(list->next, struct nfs_read_data, pages); ++ list_del_init(&data->pages); ++ ++ data->inode = inode; ++ data->cred = ctx->cred; ++ data->args.fh = NFS_FH(inode); ++ data->args.context = ctx; ++ data->args.offset = file_offset; ++ data->args.pgbase = pgbase; ++ data->args.pages = &pages[curpage]; ++ data->args.count = bytes; ++ data->res.fattr = &data->fattr; ++ data->res.eof = 0; ++ data->res.count = bytes; ++ ++ NFS_PROTO(inode)->read_setup(data); ++ ++ data->task.tk_cookie = (unsigned long) inode; ++ data->task.tk_calldata = data; ++ data->task.tk_release = nfs_readdata_release; ++ data->complete = nfs_direct_read_result; + + lock_kernel(); +- result = NFS_PROTO(inode)->read(&rdata); ++ rpc_execute(&data->task); + unlock_kernel(); + +- if (result <= 0) { +- if (tot_bytes > 0) +- break; +- if (result == -EISDIR) +- result = -EINVAL; +- return result; +- } ++ dfprintk(VFS, "NFS: 
%4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ bytes, ++ (unsigned long long)data->args.offset); ++ ++ file_offset += bytes; ++ pgbase += bytes; ++ curpage += pgbase >> PAGE_SHIFT; ++ pgbase &= ~PAGE_MASK; + +- tot_bytes += result; +- if (rdata.res.eof) +- break; +- +- rdata.args.offset += result; +- rdata.args.pgbase += result; +- curpage += rdata.args.pgbase >> PAGE_SHIFT; +- rdata.args.pgbase &= ~PAGE_MASK; +- count -= result; ++ count -= bytes; + } while (count != 0); ++} + +- /* XXX: should we zero the rest of the user's buffer if we +- * hit eof? */ ++/** ++ * nfs_direct_read_wait - wait for I/O completion for direct reads ++ * @dreq: request on which we are to wait ++ * @intr: whether or not this wait can be interrupted ++ * ++ * Collects and returns the final error value/byte-count. ++ */ ++static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr) ++{ ++ int result = 0; + +- return tot_bytes; ++ if (intr) { ++ result = wait_event_interruptible(dreq->wait, ++ (atomic_read(&dreq->complete) == 0)); ++ } else { ++ wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0)); ++ } ++ ++ if (!result) ++ result = atomic_read(&dreq->error); ++ if (!result) ++ result = atomic_read(&dreq->count); ++ ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ return (ssize_t) result; ++} ++ ++/** ++ * nfs_direct_read_seg - Read in one iov segment. Generate separate ++ * read RPCs for each "rsize" bytes. ++ * @inode: target inode ++ * @ctx: target file open context ++ * @user_addr: starting address of this segment of user's buffer ++ * @count: size of this segment ++ * @file_offset: offset in file to begin the operation ++ * @pages: array of addresses of page structs defining user's buffer ++ * @nr_pages: number of pages in the array ++ * ++ */ ++static ssize_t nfs_direct_read_seg(struct inode *inode, ++ struct nfs_open_context *ctx, unsigned long user_addr, ++ size_t count, loff_t file_offset, struct page **pages, ++ unsigned int nr_pages) ++{ ++ ssize_t result; ++ sigset_t oldset; ++ struct rpc_clnt *clnt = NFS_CLIENT(inode); ++ struct nfs_direct_req *dreq; ++ ++ dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); ++ if (!dreq) ++ return -ENOMEM; ++ ++ dreq->pages = pages; ++ dreq->npages = nr_pages; ++ ++ rpc_clnt_sigmask(clnt, &oldset); ++ nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, ++ file_offset); ++ result = nfs_direct_read_wait(dreq, clnt->cl_intr); ++ rpc_clnt_sigunmask(clnt, &oldset); ++ ++ return result; + } + + /** +@@ -191,9 +366,8 @@ + * file_offset: offset in file to begin the operation + * nr_segs: size of iovec array + * +- * generic_file_direct_IO has already pushed out any non-direct +- * writes so that this read will see them when we read from the +- * server. ++ * We've already pushed out any non-direct writes so that this read ++ * will see them when we read from the server. 
+ */
+ static ssize_t
+ nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
+@@ -222,8 +396,6 @@
+ result = nfs_direct_read_seg(inode, ctx, user_addr, size,
+ file_offset, pages, page_count);
+
+- nfs_free_user_pages(pages, page_count, 1);
+-
+ if (result <= 0) {
+ if (tot_bytes > 0)
+ break;
+@@ -249,31 +421,31 @@
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+-static int
+-nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
+- unsigned long user_addr, size_t count, loff_t file_offset,
+- struct page **pages, int nr_pages)
++static ssize_t nfs_direct_write_seg(struct inode *inode,
++ struct nfs_open_context *ctx, unsigned long user_addr,
++ size_t count, loff_t file_offset, struct page **pages,
++ int nr_pages)
+ {
+ const unsigned int wsize = NFS_SERVER(inode)->wsize;
+ size_t request;
+- int curpage, need_commit, result, tot_bytes;
++ int curpage, need_commit;
++ ssize_t result, tot_bytes;
+ struct nfs_writeverf first_verf;
+- struct nfs_write_data wdata = {
+- .inode = inode,
+- .cred = ctx->cred,
+- .args = {
+- .fh = NFS_FH(inode),
+- .context = ctx,
+- },
+- .res = {
+- .fattr = &wdata.fattr,
+- .verf = &wdata.verf,
+- },
+- };
++ struct nfs_write_data *wdata;
+
+- wdata.args.stable = NFS_UNSTABLE;
++ wdata = nfs_writedata_alloc();
++ if (!wdata)
++ return -ENOMEM;
++
++ wdata->inode = inode;
++ wdata->cred = ctx->cred;
++ wdata->args.fh = NFS_FH(inode);
++ wdata->args.context = ctx;
++ wdata->args.stable = NFS_UNSTABLE;
+ if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+- wdata.args.stable = NFS_FILE_SYNC;
++ wdata->args.stable = NFS_FILE_SYNC;
++ wdata->res.fattr = &wdata->fattr;
++ wdata->res.verf = &wdata->verf;
+
+ nfs_begin_data_update(inode);
+ retry:
+@@ -281,20 +453,20 @@
+ tot_bytes = 0;
+ curpage = 0;
+ request = count;
+- wdata.args.pgbase = user_addr & ~PAGE_MASK;
+- wdata.args.offset = file_offset;
+- do {
+- wdata.args.count = request;
+- if (wdata.args.count > wsize)
+- wdata.args.count = wsize;
+- wdata.args.pages = &pages[curpage];
++ wdata->args.pgbase = user_addr & ~PAGE_MASK;
++ wdata->args.offset = file_offset;
++ do {
++ wdata->args.count = request;
++ if (wdata->args.count > wsize)
++ wdata->args.count = wsize;
++ wdata->args.pages = &pages[curpage];
+
+ dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+- wdata.args.count, (long long) wdata.args.offset,
+- user_addr + tot_bytes, wdata.args.pgbase, curpage);
++ wdata->args.count, (long long) wdata->args.offset,
++ user_addr + tot_bytes, wdata->args.pgbase, curpage);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->write(&wdata);
++ result = NFS_PROTO(inode)->write(wdata);
+ unlock_kernel();
+
+ if (result <= 0) {
+@@ -304,20 +476,25 @@
+ }
+
+ if (tot_bytes == 0)
+- memcpy(&first_verf.verifier, &wdata.verf.verifier,
+- VERF_SIZE);
+- if (wdata.verf.committed != NFS_FILE_SYNC) {
++ memcpy(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier));
++ if (wdata->verf.committed != NFS_FILE_SYNC) {
+ need_commit = 1;
+- if (memcmp(&first_verf.verifier,
+- &wdata.verf.verifier, VERF_SIZE))
++ if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier)))
+ goto sync_retry;
+ }
+
+- tot_bytes += result;
+- wdata.args.offset += result;
+- wdata.args.pgbase += result;
+- curpage += wdata.args.pgbase >> PAGE_SHIFT;
+- wdata.args.pgbase &= ~PAGE_MASK;
++ tot_bytes += result;
++
++ /* in case of a short write: stop now, let the app recover */
++ if (result < wdata->args.count) ++ break; ++ ++ wdata->args.offset += result; ++ wdata->args.pgbase += result; ++ curpage += wdata->args.pgbase >> PAGE_SHIFT; ++ wdata->args.pgbase &= ~PAGE_MASK; + request -= result; + } while (request != 0); + +@@ -325,27 +502,27 @@ + * Commit data written so far, even in the event of an error + */ + if (need_commit) { +- wdata.args.count = tot_bytes; +- wdata.args.offset = file_offset; ++ wdata->args.count = tot_bytes; ++ wdata->args.offset = file_offset; + + lock_kernel(); +- result = NFS_PROTO(inode)->commit(&wdata); ++ result = NFS_PROTO(inode)->commit(wdata); + unlock_kernel(); + + if (result < 0 || memcmp(&first_verf.verifier, +- &wdata.verf.verifier, +- VERF_SIZE) != 0) ++ &wdata->verf.verifier, ++ sizeof(first_verf.verifier)) != 0) + goto sync_retry; + } + result = tot_bytes; + + out: + nfs_end_data_update_defer(inode); +- ++ nfs_writedata_free(wdata); + return result; + + sync_retry: +- wdata.args.stable = NFS_FILE_SYNC; ++ wdata->args.stable = NFS_FILE_SYNC; + goto retry; + } + +@@ -362,9 +539,9 @@ + * that non-direct readers might access, so they will pick up these + * writes immediately. + */ +-static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx, +- const struct iovec *iov, loff_t file_offset, +- unsigned long nr_segs) ++static ssize_t nfs_direct_write(struct inode *inode, ++ struct nfs_open_context *ctx, const struct iovec *iov, ++ loff_t file_offset, unsigned long nr_segs) + { + ssize_t tot_bytes = 0; + unsigned long seg = 0; +@@ -504,6 +681,8 @@ + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) ++ retval = nfs_wb_all(inode); ++ if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; +@@ -593,6 +772,8 @@ + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) ++ retval = nfs_wb_all(inode); ++ if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; +@@ -607,3 +788,21 @@ + out: + return retval; + } ++ ++int nfs_init_directcache(void) ++{ ++ nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", ++ sizeof(struct nfs_direct_req), ++ 0, SLAB_RECLAIM_ACCOUNT, ++ NULL, NULL); ++ if (nfs_direct_cachep == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void nfs_destroy_directcache(void) ++{ ++ if (kmem_cache_destroy(nfs_direct_cachep)) ++ printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); ++} +Index: linux-2.6.10/fs/nfs/read.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/read.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/fs/nfs/read.c 2005-04-05 14:49:13.437686480 +0800 +@@ -24,7 +24,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -39,25 +38,11 @@ + static void nfs_readpage_result_full(struct nfs_read_data *, int); + + static kmem_cache_t *nfs_rdata_cachep; +-static mempool_t *nfs_rdata_mempool; ++mempool_t *nfs_rdata_mempool; + + #define MIN_POOL_READ (32) + +-static struct nfs_read_data *nfs_readdata_alloc(void) +-{ +- struct nfs_read_data *p; +- p = (struct nfs_read_data *)mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); +- if (p) +- memset(p, 0, sizeof(*p)); +- return p; +-} +- +-static __inline__ void nfs_readdata_free(struct nfs_read_data *p) +-{ +- mempool_free(p, nfs_rdata_mempool); +-} +- +-static void nfs_readdata_release(struct rpc_task *task) ++void nfs_readdata_release(struct rpc_task *task) + { + struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + 
nfs_readdata_free(data); diff --git a/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch b/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch new file mode 100644 index 0000000..8aa3fd0 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch @@ -0,0 +1,1477 @@ +Index: linux-2.6.10/arch/i386/kernel/asm-offsets.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/asm-offsets.c 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/asm-offsets.c 2005-04-05 16:34:18.173220992 +0800 +@@ -52,6 +52,7 @@ + OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_restart_block, thread_info, restart_block); ++ OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); +Index: linux-2.6.10/arch/i386/kernel/cpu/common.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/cpu/common.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/cpu/common.c 2005-04-05 16:34:18.174220840 +0800 +@@ -384,6 +384,12 @@ + if (disable_pse) + clear_bit(X86_FEATURE_PSE, c->x86_capability); + ++ /* hack: disable SEP for non-NX cpus; SEP breaks Execshield. */ ++ #ifdef CONFIG_HIGHMEM64G ++ if (!test_bit(X86_FEATURE_NX, c->x86_capability)) ++ #endif ++ clear_bit(X86_FEATURE_SEP, c->x86_capability); ++ + /* If the model name is still unset, do table lookup. */ + if ( !c->x86_model_id[0] ) { + char *p; +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 16:29:30.192000792 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 16:34:18.167221904 +0800 +@@ -218,8 +218,12 @@ + pushl %ebp + pushfl + pushl $(__USER_CS) +- pushl $SYSENTER_RETURN +- ++ /* ++ * Push current_thread_info()->sysenter_return to the stack. ++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words ++ * pushed above, and the word being pushed now: ++ */ ++ pushl (TI_sysenter_return-THREAD_SIZE+4*4)(%esp) + /* + * Load the potential sixth argument from user stack. + * Careful about security. +Index: linux-2.6.10/arch/i386/kernel/process.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/process.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/process.c 2005-04-05 16:34:18.173220992 +0800 +@@ -36,6 +36,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -565,6 +567,8 @@ + /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */
+
+ __unlazy_fpu(prev_p);
++ if (next_p->mm)
++ load_user_cs_desc(cpu, next_p->mm);
+
+ /*
+ * Reload esp0, LDT and the page table pointer:
+@@ -812,3 +816,62 @@
+ return 0;
+ }
+
++
++unsigned long arch_align_stack(unsigned long sp)
++{
++ if (current->flags & PF_RELOCEXEC)
++ sp -= ((get_random_int() % 65536) << 4);
++ return sp & ~0xf;
++}
++
++
++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
++{
++ if (limit > mm->context.exec_limit) {
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
++{
++ struct vm_area_struct *vma;
++ unsigned long limit = 0;
++
++ if (old_end == mm->context.exec_limit) {
++ for (vma = mm->mmap; vma; vma = vma->vm_next)
++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
++ limit = vma->vm_end;
++
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_flush_exec_range(struct mm_struct *mm)
++{
++ mm->context.exec_limit = 0;
++ set_user_cs(&mm->context.user_cs, 0);
++}
++
++/*
++ * Generate a random brk address between 128MB and 160MB (if the
++ * layout allows it).
++ */
++void randomize_brk(unsigned long old_brk)
++{
++ unsigned long new_brk, range_start, range_end;
++
++ range_start = 0x08000000;
++ if (current->mm->brk >= range_start)
++ range_start = current->mm->brk;
++ range_end = range_start + 0x02000000;
++ new_brk = randomize_range(range_start, range_end, 0);
++ if (new_brk)
++ current->mm->brk = new_brk;
++}
++
+Index: linux-2.6.10/arch/i386/kernel/signal.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/signal.c 2005-04-05 16:29:23.290050048 +0800
++++ linux-2.6.10/arch/i386/kernel/signal.c 2005-04-05 16:34:18.170221448 +0800
+@@ -390,7 +390,7 @@
+ if (err)
+ goto give_sigsegv;
+
+- restorer = &__kernel_sigreturn;
++ restorer = current->mm->context.vdso + (long)&__kernel_sigreturn;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+
+@@ -487,9 +487,10 @@
+ goto give_sigsegv;
+
+ /* Set up to return from userspace.
*/ +- restorer = &__kernel_rt_sigreturn; ++ restorer = current->mm->context.vdso + (long)&__kernel_rt_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; ++ + err |= __put_user(restorer, &frame->pretcode); + + /* +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:29:30.198999728 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:34:18.172221144 +0800 +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + + /* +@@ -313,6 +314,8 @@ + unsigned long cpu; + + cpu = get_cpu(); ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); + + if (!cpu_isset(cpu, flush_cpumask)) + goto out; +Index: linux-2.6.10/arch/i386/kernel/sysenter.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/sysenter.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/sysenter.c 2005-04-05 16:34:18.171221296 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -41,11 +42,14 @@ + extern const char vsyscall_int80_start, vsyscall_int80_end; + extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; + ++struct page *sysenter_page; ++ + static int __init sysenter_setup(void) + { + void *page = (void *)get_zeroed_page(GFP_ATOMIC); + +- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); ++ __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_KERNEL_RO); ++ sysenter_page = virt_to_page(page); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + memcpy(page, +@@ -59,7 +63,51 @@ + &vsyscall_sysenter_end - &vsyscall_sysenter_start); + + on_each_cpu(enable_sep_cpu, NULL, 1, 1); ++ + return 0; + } + + __initcall(sysenter_setup); ++ ++extern void SYSENTER_RETURN_OFFSET; ++ ++unsigned int vdso_enabled = 0; ++ ++void map_vsyscall(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ struct vm_area_struct *vma; ++ unsigned long addr; ++ ++ if (unlikely(!vdso_enabled)) { ++ current->mm->context.vdso = NULL; ++ return; ++ } ++ ++ /* ++ * Map the vDSO (it will be randomized): ++ */ ++ down_write(¤t->mm->mmap_sem); ++ addr = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, 0); ++ current->mm->context.vdso = (void *)addr; ++ ti->sysenter_return = (void *)addr + (long)&SYSENTER_RETURN_OFFSET; ++ if (addr != -1) { ++ vma = find_vma(current->mm, addr); ++ if (vma) { ++ pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; ++ get_page(sysenter_page); ++ install_page(current->mm, vma, addr, ++ sysenter_page, vma->vm_page_prot); ++ ++ } ++ } ++ up_write(¤t->mm->mmap_sem); ++} ++ ++static int __init vdso_setup(char *str) ++{ ++ vdso_enabled = simple_strtoul(str, NULL, 0); ++ return 1; ++} ++__setup("vdso=", vdso_setup); ++ +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:29:30.193000640 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:43:17.073295728 +0800 +@@ -497,6 +497,10 @@ + DO_ERROR(12, SIGBUS, "stack segment", stack_segment) + DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + ++/* ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
++ */ + fastcall void do_general_protection(struct pt_regs * regs, long error_code) + { + int cpu = get_cpu(); +@@ -535,6 +539,46 @@ + if (!(regs->xcs & 3)) + goto gp_in_kernel; + ++ /* ++ * lazy-check for CS validity on exec-shield binaries: ++ */ ++ if (current->mm) { ++ int cpu = smp_processor_id(); ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit = 0; ++ ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ ++ current->mm->context.exec_limit = limit; ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = per_cpu(cpu_gdt_table, cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we wont hit this branch next time around. ++ */ ++ if (desc1->a != desc2->a || desc1->b != desc2->b) { ++ if (print_fatal_signals >= 2) { ++ printk("#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); ++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx, CPU_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ load_user_cs_desc(cpu, current->mm); ++ return; ++ } ++ } ++ if (print_fatal_signals) { ++ printk("#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); ++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, current->mm->context.user_cs.a, current->mm->context.user_cs.b); ++ } ++ + current->thread.error_code = error_code; + current->thread.trap_no = 13; + force_sig(SIGSEGV, current); +Index: linux-2.6.10/arch/i386/kernel/vsyscall.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/vsyscall.lds.S 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/vsyscall.lds.S 2005-04-05 16:34:18.169221600 +0800 +@@ -7,7 +7,7 @@ + + SECTIONS + { +- . = VSYSCALL_BASE + SIZEOF_HEADERS; ++ . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } +@@ -20,7 +20,7 @@ + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ +- . = VSYSCALL_BASE + 0x400; ++ . = 0x400; + + .text : { *(.text) } :text =0x90909090 + +Index: linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/vsyscall-sysenter.S 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S 2005-04-05 16:34:18.170221448 +0800 +@@ -24,11 +24,11 @@ + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + +- /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ ++ /* 14: System call restart point is here! (SYSENTER_RETURN_OFFSET-2) */ + jmp .Lenter_kernel + /* 16: System call normal return point is here! */ +- .globl SYSENTER_RETURN /* Symbol used by entry.S. 
*/ +-SYSENTER_RETURN: ++ .globl SYSENTER_RETURN_OFFSET /* Symbol used by sysenter.c */ ++SYSENTER_RETURN_OFFSET: + pop %ebp + .Lpop_ebp: + pop %edx +Index: linux-2.6.10/arch/i386/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:29:28.016331544 +0800 ++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:34:18.167221904 +0800 +@@ -518,7 +518,10 @@ + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); ++ else + #endif ++ if (exec_shield) ++ printk("Using x86 segment limits to approximate NX protection\n"); + + pagetable_init(); + +Index: linux-2.6.10/arch/i386/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/mmap.c 2004-12-25 05:34:33.000000000 +0800 ++++ linux-2.6.10/arch/i386/mm/mmap.c 2005-04-05 16:43:44.365146736 +0800 +@@ -26,6 +26,7 @@ + + #include + #include ++#include + + /* + * Top of mmap area (just below the process stack). +@@ -38,13 +39,17 @@ + static inline unsigned long mmap_base(struct mm_struct *mm) + { + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; ++ unsigned long random_factor = 0; ++ ++ if (current->flags & PF_RELOCEXEC) ++ random_factor = get_random_int() % (1024*1024); + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + +- return TASK_SIZE - (gap & PAGE_MASK); ++ return PAGE_ALIGN(TASK_SIZE - gap - random_factor); + } + + /* +@@ -57,15 +62,17 @@ + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ +- if (sysctl_legacy_va_layout || ++ if ((exec_shield != 2) && (sysctl_legacy_va_layout || + (current->personality & ADDR_COMPAT_LAYOUT) || +- current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { ++ current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)){ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(mm); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (current->flags & PF_RELOCEXEC) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +Index: linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/ia32/binfmt_elf32.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c 2005-04-05 16:34:18.174220840 +0800 +@@ -272,7 +272,7 @@ + } + + static unsigned long +-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) ++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused) + { + unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; + +Index: linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/ia32/ia32_binfmt.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c 2005-04-05 16:34:18.175220688 +0800 +@@ -390,7 +390,7 @@ + } + + static unsigned long +-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) ++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused) + { + unsigned long map_addr; + struct task_struct *me = current; 
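The arch/i386/mm/mmap.c hunk above is the core of the exec-shield layout change: for tasks flagged PF_RELOCEXEC it pulls the top-of-mmap base down by up to 1 MB of random jitter below the clamped stack gap, so shared libraries land at a slightly different address on every exec. A stand-alone sketch of the arithmetic in user-space C; TASK_SIZE, MIN_GAP and MAX_GAP are assumed 2.6-era i386 defaults here, and rand() stands in for the kernel's get_random_int():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE	0xC0000000UL		/* 3 GB user space on i386 */
#define MIN_GAP		(128UL * 1024 * 1024)	/* assumed 2.6 defaults */
#define MAX_GAP		(TASK_SIZE / 6 * 5)

/* Mirrors the patched mmap_base(): clamp the stack gap into
 * [MIN_GAP, MAX_GAP], subtract 0..1MB of jitter for PF_RELOCEXEC
 * tasks, and round the result to a page boundary. */
static unsigned long mmap_base(unsigned long stack_rlimit, int relocexec)
{
	unsigned long gap = stack_rlimit;
	unsigned long random_factor = relocexec ? rand() % (1024 * 1024) : 0;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
}

int main(void)
{
	srand(42);
	/* An 8 MB RLIMIT_STACK is clamped up to MIN_GAP; two calls
	 * show the per-exec jitter. */
	printf("%#lx\n", mmap_base(8UL << 20, 1));
	printf("%#lx\n", mmap_base(8UL << 20, 1));
	return 0;
}

The jitter here is deliberately small; the heavier randomization of the shared-library area itself is done by arch_get_unmapped_exec_area() in the mm/mmap.c hunk further down.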
+Index: linux-2.6.10/drivers/char/random.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/random.c 2005-04-05 16:29:24.214909448 +0800 ++++ linux-2.6.10/drivers/char/random.c 2005-04-05 16:34:18.197217344 +0800 +@@ -2469,3 +2469,37 @@ + } + #endif + #endif /* CONFIG_INET */ ++ ++/* ++ * Get a random word: ++ */ ++unsigned int get_random_int(void) ++{ ++ unsigned int val = 0; ++ ++ if (!exec_shield_randomize) ++ return 0; ++ ++#ifdef CONFIG_X86_HAS_TSC ++ rdtscl(val); ++#endif ++ val += current->pid + jiffies + (int)val; ++ ++ /* ++ * Use IP's RNG. It suits our purpose perfectly: it re-keys itself ++ * every second, from the entropy pool (and thus creates a limited ++ * drain on it), and uses halfMD4Transform within the second. We ++ * also spice it with the TSC (if available), jiffies, PID and the ++ * stack address: ++ */ ++ return secure_ip_id(val); ++} ++ ++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len) ++{ ++ unsigned long range = end - len - start; ++ if (end <= start + len) ++ return 0; ++ return PAGE_ALIGN(get_random_int() % range + start); ++} ++ +Index: linux-2.6.10/fs/binfmt_elf.c +=================================================================== +--- linux-2.6.10.orig/fs/binfmt_elf.c 2005-04-05 16:29:24.353888320 +0800 ++++ linux-2.6.10/fs/binfmt_elf.c 2005-04-05 16:39:25.042569760 +0800 +@@ -494,7 +494,7 @@ + unsigned long reloc_func_desc = 0; + char passed_fileno[6]; + struct files_struct *files; +- int have_pt_gnu_stack, executable_stack = EXSTACK_DEFAULT; ++ int have_pt_gnu_stack, relocexec, executable_stack = EXSTACK_DEFAULT; + unsigned long def_flags = 0; + struct { + struct elfhdr elf_ex; +@@ -660,6 +660,24 @@ + } + have_pt_gnu_stack = (i < loc->elf_ex.e_phnum); + ++ relocexec = 0; ++ ++ if (current->personality == PER_LINUX) ++ switch (exec_shield) { ++ case 1: ++ if (executable_stack == EXSTACK_DISABLE_X) { ++ current->flags |= PF_RELOCEXEC; ++ relocexec = PF_RELOCEXEC; ++ } ++ break; ++ ++ case 2: ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RELOCEXEC; ++ relocexec = PF_RELOCEXEC; ++ break; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + interpreter_type = INTERPRETER_ELF | INTERPRETER_AOUT; +@@ -713,6 +731,15 @@ + if (retval) + goto out_free_dentry; + ++ current->flags |= relocexec; ++#ifdef __i386__ ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif + /* Discard our unneeded old files struct */ + if (files) { + steal_locks(files); +@@ -731,7 +758,8 @@ + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. */ + SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); +- if (elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack)) ++ if (exec_shield != 2 && ++ elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack)) + current->personality |= READ_IMPLIES_EXEC; + + arch_pick_mmap_layout(current->mm); +@@ -894,6 +922,14 @@ + + set_binfmt(&elf_format); + ++ /* ++ * Map the vsyscall trampoline. This address is then passed via ++ * AT_SYSINFO. 
++ */ ++#ifdef __HAVE_ARCH_VSYSCALL ++ map_vsyscall(); ++#endif ++ + compute_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT), +Index: linux-2.6.10/fs/exec.c +=================================================================== +--- linux-2.6.10.orig/fs/exec.c 2005-04-05 16:29:30.270988784 +0800 ++++ linux-2.6.10/fs/exec.c 2005-04-05 16:34:18.177220384 +0800 +@@ -396,7 +396,12 @@ + while (i < MAX_ARG_PAGES) + bprm->page[i++] = NULL; + #else ++#ifdef __HAVE_ARCH_ALIGN_STACK ++ stack_base = arch_align_stack(STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE); ++ stack_base = PAGE_ALIGN(stack_base); ++#else + stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; ++#endif + bprm->p += stack_base; + mm->arg_start = bprm->p; + arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start); +@@ -854,6 +859,7 @@ + tcomm[i] = '\0'; + set_task_comm(current, tcomm); + ++ current->flags &= ~PF_RELOCEXEC; + flush_thread(); + + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || +Index: linux-2.6.10/fs/proc/array.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/array.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/fs/proc/array.c 2005-04-05 16:34:18.180219928 +0800 +@@ -373,8 +373,12 @@ + ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; + read_unlock(&tasklist_lock); + +- if (!whole || num_threads<2) +- wchan = get_wchan(task); ++ if (!whole || num_threads<2) { ++ wchan = 0; ++ if (current->uid == task->uid || current->euid == task->uid || ++ capable(CAP_SYS_NICE)) ++ wchan = get_wchan(task); ++ } + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; +Index: linux-2.6.10/fs/proc/base.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/base.c 2005-04-05 16:29:24.361887104 +0800 ++++ linux-2.6.10/fs/proc/base.c 2005-04-05 16:34:18.179220080 +0800 +@@ -117,7 +117,7 @@ + E(PROC_TGID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), + E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO), + E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), +- E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), ++ E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUSR), + E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), + E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), +@@ -142,7 +142,7 @@ + E(PROC_TID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), + E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO), + E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), +- E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), ++ E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUSR), + E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), + E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), +Index: linux-2.6.10/fs/proc/task_mmu.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/task_mmu.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/fs/proc/task_mmu.c 2005-04-05 16:41:11.796340720 +0800 +@@ -14,19 +14,27 @@ + buffer += sprintf(buffer, + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" +- "VmRSS:\t%8lu kB\n" +- "VmData:\t%8lu kB\n" +- "VmStk:\t%8lu kB\n" +- "VmExe:\t%8lu kB\n" +- "VmLib:\t%8lu kB\n" +- "VmPTE:\t%8lu kB\n", +- (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), +- mm->locked_vm << (PAGE_SHIFT-10), +- mm->rss << (PAGE_SHIFT-10), +- data << (PAGE_SHIFT-10), +- mm->stack_vm << (PAGE_SHIFT-10), text, lib, +- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); +- return 
buffer;
++ "VmRSS:\t%8lu kB\n"
++ "VmData:\t%8lu kB\n"
++ "VmStk:\t%8lu kB\n"
++ "VmExe:\t%8lu kB\n"
++ "VmLib:\t%8lu kB\n"
++ "VmPTE:\t%8lu kB\n"
++ "StaBrk:\t%08lx kB\n"
++ "Brk:\t%08lx kB\n"
++ "StaStk:\t%08lx kB\n" ,
++ (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
++ mm->locked_vm << (PAGE_SHIFT-10),
++ mm->rss << (PAGE_SHIFT-10),
++ data << (PAGE_SHIFT-10),
++ mm->stack_vm << (PAGE_SHIFT-10), text, lib,
++ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
++ mm->start_brk, mm->brk, mm->start_stack);
++#if __i386__
++ if (!nx_enabled)
++ buffer += sprintf(buffer,
++ "ExecLim:\t%08lx\n", mm->context.exec_limit);
++#endif
++ return buffer;
+ }
+
+ unsigned long task_vsize(struct mm_struct *mm)
+@@ -47,6 +55,9 @@
+
+ static int show_map(struct seq_file *m, void *v)
+ {
++#ifdef __i386__
++ struct task_struct *task = m->private;
++#endif
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+@@ -65,7 +76,13 @@
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+- flags & VM_EXEC ? 'x' : '-',
++ (flags & VM_EXEC
++#ifdef __i386__
++ || (!nx_enabled &&
++ (map->vm_start < task->mm->context.exec_limit))
++#endif
++ )
++ ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ map->vm_pgoff << PAGE_SHIFT,
+ MAJOR(dev), MINOR(dev), ino, &len);
+Index: linux-2.6.10/include/asm-i386/desc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-04-05 16:29:30.129010368 +0800
++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 16:34:18.188218712 +0800
+@@ -129,6 +129,20 @@
+ extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr,
+ unsigned long bytecount);
+
++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
++{
++ limit = (limit - 1) / PAGE_SIZE;
++ desc->a = limit & 0xffff;
++ desc->b = (limit & 0xf0000) | 0x00c0fb00;
++}
++
++#define load_user_cs_desc(cpu, mm) \
++ per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs
++
++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_flush_exec_range(struct mm_struct *mm);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif
+Index: linux-2.6.10/include/asm-i386/elf.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/elf.h 2004-12-25 05:35:15.000000000 +0800
++++ linux-2.6.10/include/asm-i386/elf.h 2005-04-05 16:34:18.188218712 +0800
+@@ -9,6 +9,7 @@
+ #include
+ #include
+ #include /* for savesegment */
++#include
+
+ #include
+
+@@ -133,15 +134,22 @@
+ #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+ #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
+
+-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
+-#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
+-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+ extern void __kernel_vsyscall;
++#define VSYSCALL_BASE ((unsigned long)current->mm->context.vdso)
++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
++#define VSYSCALL_OFFSET ((unsigned long) &__kernel_vsyscall)
++#define VSYSCALL_ENTRY (VSYSCALL_BASE + VSYSCALL_OFFSET)
+
+-#define ARCH_DLINFO \
+-do { \
+- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
+- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++/* kernel-internal fixmap address: */
++#define __VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
++#define __VSYSCALL_EHDR ((const struct elfhdr *) __VSYSCALL_BASE) ++ ++#define ARCH_DLINFO \ ++do { \ ++ if (VSYSCALL_BASE) { \ ++ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++ } \ + } while (0) + + /* +@@ -152,15 +160,15 @@ + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. + */ +-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) ++#define ELF_CORE_EXTRA_PHDRS (__VSYSCALL_EHDR->e_phnum) + #define ELF_CORE_WRITE_EXTRA_PHDRS \ + do { \ + const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++ (const struct elf_phdr *) (__VSYSCALL_BASE \ ++ + __VSYSCALL_EHDR->e_phoff); \ + int i; \ + Elf32_Off ofs = 0; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \ + struct elf_phdr phdr = vsyscall_phdrs[i]; \ + if (phdr.p_type == PT_LOAD) { \ + BUG_ON(ofs != 0); \ +@@ -178,10 +186,10 @@ + #define ELF_CORE_WRITE_EXTRA_DATA \ + do { \ + const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++ (const struct elf_phdr *) (__VSYSCALL_BASE \ ++ + __VSYSCALL_EHDR->e_phoff); \ + int i; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \ + if (vsyscall_phdrs[i].p_type == PT_LOAD) \ + DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ + PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ +@@ -190,4 +198,10 @@ + + #endif + ++#define __HAVE_ARCH_RANDOMIZE_BRK ++extern void randomize_brk(unsigned long old_brk); ++ ++#define __HAVE_ARCH_VSYSCALL ++extern void map_vsyscall(void); ++ + #endif +Index: linux-2.6.10/include/asm-i386/mmu.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/mmu.h 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/mmu.h 2005-04-05 16:34:18.189218560 +0800 +@@ -7,11 +7,17 @@ + * we put the segment information here. + * + * cpu_vm_mask is used to optimize ldt flushing. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. + */ + typedef struct { + int size; + struct semaphore sem; + void *ldt; ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++ void *vdso; + } mm_context_t; + + #endif +Index: linux-2.6.10/include/asm-i386/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/pgalloc.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/pgalloc.h 2005-04-05 16:34:18.190218408 +0800 +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include /* for struct page */ + +Index: linux-2.6.10/include/asm-i386/processor.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/processor.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/processor.h 2005-04-05 16:34:18.189218560 +0800 +@@ -296,7 +296,10 @@ + /* This decides where the kernel will search for a free chunk of vm + * space during mmap's. 
+ */ +-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) ++#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) ++ ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); + + #define HAVE_ARCH_PICK_MMAP_LAYOUT + +@@ -478,6 +481,7 @@ + regs->xcs = __USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ ++ load_user_cs_desc(smp_processor_id(), current->mm); \ + } while (0) + + /* Forward declaration, a strange C thing */ +Index: linux-2.6.10/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-04-05 16:29:30.127010672 +0800 ++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 16:34:18.190218408 +0800 +@@ -38,6 +38,7 @@ + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ ++ void *sysenter_return; + struct restart_block restart_block; + + unsigned long previous_esp; /* ESP of the previous stack in case +Index: linux-2.6.10/include/asm-ia64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ia64/pgalloc.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/include/asm-ia64/pgalloc.h 2005-04-05 16:34:18.184219320 +0800 +@@ -23,6 +23,10 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + /* + * Very stupidly, we used to get new pgd's and pmd's, init their contents + * to point to the NULL versions of the next level page table, later on +Index: linux-2.6.10/include/asm-ppc64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/pgalloc.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800 +@@ -11,6 +11,11 @@ + + extern kmem_cache_t *zero_cache; + ++/* Dummy functions since we don't support execshield on ppc */ ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + /* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License +Index: linux-2.6.10/include/asm-ppc/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc/pgalloc.h 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc/pgalloc.h 2005-04-05 16:34:18.183219472 +0800 +@@ -40,5 +40,10 @@ + + #define check_pgt_cache() do { } while (0) + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ ++ + #endif /* _PPC_PGALLOC_H */ + #endif /* __KERNEL__ */ +Index: linux-2.6.10/include/asm-s390/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/pgalloc.h 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/include/asm-s390/pgalloc.h 2005-04-05 16:34:18.186219016 +0800 +@@ -19,6 +19,10 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #define check_pgt_cache() do {} while (0) + + extern void diag10(unsigned long 
addr); +Index: linux-2.6.10/include/asm-sparc64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc64/pgalloc.h 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/include/asm-sparc64/pgalloc.h 2005-04-05 16:34:18.187218864 +0800 +@@ -261,4 +261,8 @@ + #define pgd_free(pgd) free_pgd_fast(pgd) + #define pgd_alloc(mm) get_pgd_fast() + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #endif /* _SPARC64_PGALLOC_H */ +Index: linux-2.6.10/include/asm-sparc/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc/pgalloc.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/asm-sparc/pgalloc.h 2005-04-05 16:34:18.191218256 +0800 +@@ -66,4 +66,8 @@ + #define pte_free(pte) BTFIXUP_CALL(pte_free)(pte) + #define __pte_free_tlb(tlb, pte) pte_free(pte) + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #endif /* _SPARC_PGALLOC_H */ +Index: linux-2.6.10/include/asm-x86_64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/pgalloc.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800 +@@ -7,6 +7,11 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ ++ + #define pmd_populate_kernel(mm, pmd, pte) \ + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) + #define pgd_populate(mm, pgd, pmd) \ +Index: linux-2.6.10/include/linux/mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mm.h 2005-04-05 16:29:30.250991824 +0800 ++++ linux-2.6.10/include/linux/mm.h 2005-04-05 16:43:44.366146584 +0800 +@@ -685,7 +685,14 @@ + unsigned long addr, unsigned long len, pgoff_t pgoff); + extern void exit_mmap(struct mm_struct *); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++ ++static inline unsigned long get_unmapped_area(struct file * file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, +Index: linux-2.6.10/include/linux/random.h +=================================================================== +--- linux-2.6.10.orig/include/linux/random.h 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/include/linux/random.h 2005-04-05 16:34:18.183219472 +0800 +@@ -69,6 +69,9 @@ + extern struct file_operations random_fops, urandom_fops; + #endif + ++unsigned int get_random_int(void); ++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); ++ + #endif /* __KERNEL___ */ + + #endif /* _LINUX_RANDOM_H */ +Index: linux-2.6.10/include/linux/resource.h +=================================================================== +--- 
linux-2.6.10.orig/include/linux/resource.h 2004-12-25 05:33:52.000000000 +0800 ++++ linux-2.6.10/include/linux/resource.h 2005-04-05 16:34:18.182219624 +0800 +@@ -52,8 +52,11 @@ + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG wants 32kB of mlocked memory, to make sure pass phrases +Index: linux-2.6.10/include/linux/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:29:27.971338384 +0800 ++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:43:44.367146432 +0800 +@@ -32,6 +32,9 @@ + #include + + struct exec_domain; ++extern int exec_shield; ++extern int exec_shield_randomize; ++extern int print_fatal_signals; + + /* + * cloning flags: +@@ -193,6 +196,10 @@ + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +@@ -208,6 +215,9 @@ + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct vm_area_struct *area); + unsigned long mmap_base; /* base of mmap area */ + unsigned long free_area_cache; /* first hole */ +@@ -720,6 +730,7 @@ + #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ + #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ + #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ ++#define PF_RELOCEXEC 0x00800000 /* relocate shared libraries */ + + #ifdef CONFIG_SMP + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +Index: linux-2.6.10/kernel/signal.c +=================================================================== +--- linux-2.6.10.orig/kernel/signal.c 2005-04-05 16:29:27.951341424 +0800 ++++ linux-2.6.10/kernel/signal.c 2005-04-05 16:43:17.077295120 +0800 +@@ -1608,6 +1608,35 @@ + spin_unlock_irq(¤t->sighand->siglock); + } + ++int print_fatal_signals = 0; ++ ++static void print_fatal_signal(struct pt_regs *regs, int signr) ++{ ++ int i; ++ unsigned char insn; ++ printk("%s/%d: potentially unexpected fatal signal %d.\n", ++ current->comm, current->pid, signr); ++ ++#ifdef __i386__ ++ printk("code at %08lx: ", regs->eip); ++ for (i = 0; i < 16; i++) { ++ __get_user(insn, (unsigned char *)(regs->eip + i)); ++ printk("%02x ", insn); ++ } ++#endif ++ printk("\n"); ++ show_regs(regs); ++} ++ ++static int __init setup_print_fatal_signals(char *str) ++{ ++ get_option (&str, &print_fatal_signals); ++ ++ return 1; ++} ++ ++__setup("print-fatal-signals=", setup_print_fatal_signals); ++ + #ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER + + static void +@@ -1808,6 +1837,12 @@ + if (!signr) + break; /* will return 0 */ + ++ if ((signr == SIGSEGV) && print_fatal_signals) { ++ spin_unlock_irq(¤t->sighand->siglock); ++ print_fatal_signal(regs, signr); ++ spin_lock_irq(¤t->sighand->siglock); ++ } ++ + if 
((current->ptrace & PT_PTRACED) && signr != SIGKILL) { + ptrace_signal_deliver(regs, cookie); + +@@ -1904,6 +1939,8 @@ + * Anything else is fatal, maybe with a core dump. + */ + current->flags |= PF_SIGNALED; ++ if (print_fatal_signals) ++ print_fatal_signal(regs, signr); + if (sig_kernel_coredump(signr)) { + /* + * If it was able to dump core, this kills all +Index: linux-2.6.10/kernel/sysctl.c +=================================================================== +--- linux-2.6.10.orig/kernel/sysctl.c 2005-04-05 16:29:24.394882088 +0800 ++++ linux-2.6.10/kernel/sysctl.c 2005-04-05 16:43:17.078294968 +0800 +@@ -75,6 +75,29 @@ + void __user *, size_t *, loff_t *); + #endif + ++extern unsigned int vdso_enabled; ++ ++int exec_shield = 1; ++int exec_shield_randomize = 1; ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option (&str, &exec_shield); ++ ++ return 1; ++} ++ ++__setup("exec-shield=", setup_exec_shield); ++ ++static int __init setup_exec_shield_randomize(char *str) ++{ ++ get_option (&str, &exec_shield_randomize); ++ ++ return 1; ++} ++ ++__setup("exec-shield-randomize=", setup_exec_shield_randomize); ++ + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; + static int minolduid; +@@ -276,6 +299,40 @@ + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = KERN_PANIC, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "exec-shield-randomize", ++ .data = &exec_shield_randomize, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "print-fatal-signals", ++ .data = &print_fatal_signals, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#if __i386__ ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "vdso", ++ .data = &vdso_enabled, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +Index: linux-2.6.10/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/mm/mmap.c 2005-04-05 16:29:30.134009608 +0800 ++++ linux-2.6.10/mm/mmap.c 2005-04-05 16:43:44.369146128 +0800 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -245,6 +246,8 @@ + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -347,6 +350,8 @@ + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -642,6 +647,8 @@ + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -813,7 +820,7 @@ + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. 
+ */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1207,9 +1214,10 @@ + area->vm_mm->free_area_cache = area->vm_end; + } + ++ + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + if (flags & MAP_FIXED) { + unsigned long ret; +@@ -1241,10 +1249,80 @@ + return file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + +- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); ++ if (exec && current->mm->get_unmapped_exec_area) ++ return current->mm->get_unmapped_exec_area(file, addr, len, pgoff, flags); ++ else ++ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + } + +-EXPORT_SYMBOL(get_unmapped_area); ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++ ++#define SHLIB_BASE 0x00111000 ++ ++unsigned long arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (!addr && !(flags & MAP_FIXED)) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) { ++ return addr; ++ } ++ } ++ ++ addr = SHLIB_BASE; ++ ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) { ++ return -ENOMEM; ++ } ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, mm->brk, len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} ++ ++ + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +@@ -1319,6 +1397,14 @@ + return prev ? prev->vm_next : vma; + } + ++ ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + #ifdef CONFIG_STACK_GROWSUP + /* + * vma is the first one with address > vma->vm_end. Have to extend vma. 
+@@ -1358,7 +1444,7 @@ + return -ENOMEM; + } + +- if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur || ++ if (over_stack_limit(address - vma->vm_start) || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->signal->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); +@@ -1432,7 +1518,7 @@ + return -ENOMEM; + } + +- if (vma->vm_end - address > current->signal->rlim[RLIMIT_STACK].rlim_cur || ++ if (over_stack_limit(vma->vm_end - address) || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->signal->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); +@@ -1668,10 +1754,14 @@ + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -1890,6 +1980,7 @@ + mm->rss = 0; + mm->total_vm = 0; + mm->locked_vm = 0; ++ arch_flush_exec_range(mm); + + spin_unlock(&mm->page_table_lock); + +Index: linux-2.6.10/mm/mprotect.c +=================================================================== +--- linux-2.6.10.orig/mm/mprotect.c 2005-04-05 16:29:30.135009456 +0800 ++++ linux-2.6.10/mm/mprotect.c 2005-04-05 16:34:18.193217952 +0800 +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + +@@ -117,7 +118,7 @@ + struct mm_struct * mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgprot_t newprot; + pgoff_t pgoff; + int error; +@@ -179,8 +180,11 @@ + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. 
+ */ ++ oldflags = vma->vm_flags; + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); + change_protection(vma, start, end, newprot); + __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + __vm_stat_account(mm, newflags, vma->vm_file, nrpages); +Index: linux-2.6.10/mm/mremap.c +=================================================================== +--- linux-2.6.10.orig/mm/mremap.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/mm/mremap.c 2005-04-05 16:43:44.370145976 +0800 +@@ -385,8 +385,8 @@ + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; diff --git a/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch b/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch new file mode 100644 index 0000000..9c0bb12 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch @@ -0,0 +1,10676 @@ +Index: linux-2.6.10/arch/i386/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2005-04-05 16:29:30.191000944 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 16:47:53.904211032 +0800 +@@ -2,6 +2,63 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. ++ + config EARLY_PRINTK + bool "Early printk" if EMBEDDED + default y +@@ -15,8 +72,8 @@ + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. 
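The compression choices declared in the Kconfig entries above are implemented elsewhere in this patch as separate modules. To make the RLE option concrete, here is a toy run-length encoder in plain C; it only illustrates the idea and does not reproduce the actual record format the crash-dump compressor writes:

#include <stdio.h>
#include <string.h>

/* Toy run-length encoder: emits (count, value) byte pairs. Zero-filled
 * pages, which are common in a memory dump, collapse to a few dozen
 * bytes. Returns the compressed length, or 0 when RLE would not shrink
 * the data (the caller would then store the page uncompressed). */
static size_t rle_compress(const unsigned char *in, size_t in_len,
			   unsigned char *out, size_t out_len)
{
	size_t i = 0, o = 0;

	while (i < in_len) {
		unsigned char val = in[i];
		size_t run = 1;

		while (i + run < in_len && in[i + run] == val && run < 255)
			run++;
		if (o + 2 > out_len)
			return 0;
		out[o++] = (unsigned char)run;
		out[o++] = val;
		i += run;
	}
	return o < in_len ? o : 0;
}

int main(void)
{
	unsigned char page[4096], out[8192];

	memset(page, 0, sizeof(page));	/* a typical all-zero page */
	printf("4096 bytes -> %zu bytes\n",
	       rle_compress(page, sizeof(page), out, sizeof(out)));
	return 0;
}

RLE is cheap enough to run from a dying kernel; gzip costs more CPU but compresses non-trivial pages far better, which is presumably why both are offered.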
+ +-config DEBUG_STACKOVERFLOW +- bool "Check for stack overflows" ++config DEBUG_STACKOVERFLOW ++ bool "Check for stack overflows" + depends on DEBUG_KERNEL + + config KPROBES +Index: linux-2.6.10/arch/i386/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:47:05.157621640 +0800 ++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:47:53.909210272 +0800 +@@ -244,6 +244,13 @@ + return 0; + } + ++/* To enable modules to check if a page is in RAM */ ++int pfn_is_ram(unsigned long pfn) ++{ ++ return (page_is_ram(pfn)); ++} ++ ++ + #ifdef CONFIG_HIGHMEM + pte_t *kmap_pte; + pgprot_t kmap_prot; +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:47:05.156621792 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:47:53.906210728 +0800 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_EISA + #include +@@ -382,6 +383,7 @@ + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irq(&die.lock); ++ dump((char *)str, regs); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + +@@ -654,6 +656,7 @@ + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); ++ dump((char *)msg, regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); +Index: linux-2.6.10/arch/i386/kernel/setup.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/setup.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/setup.c 2005-04-05 16:47:53.905210880 +0800 +@@ -662,6 +662,10 @@ + */ + #define LOWMEMSIZE() (0x9f000) + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++unsigned long crashdump_addr = 0xdeadbeef; ++#endif ++ + static void __init parse_cmdline_early (char ** cmdline_p) + { + char c = ' ', *to = command_line, *from = saved_command_line; +@@ -823,6 +827,11 @@ + if (c == ' ' && !memcmp(from, "vmalloc=", 8)) + __VMALLOC_RESERVE = memparse(from+8, &from); + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ if (c == ' ' && !memcmp(from, "crashdump=", 10)) ++ crashdump_addr = memparse(from+10, &from); ++#endif ++ + c = *(from++); + if (!c) + break; +@@ -1288,6 +1297,10 @@ + + static char * __init machine_specific_memory_setup(void); + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void crashdump_reserve(void); ++#endif ++ + /* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures +@@ -1393,6 +1406,10 @@ + #endif + + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ crashdump_reserve(); /* Preserve crash dump state from prev boot */ ++#endif ++ + dmi_scan_machine(); + + #ifdef CONFIG_X86_GENERICARCH +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:47:05.154622096 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:47:53.908210424 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -143,6 +144,13 @@ + */ + cfg = __prepare_ICR(shortcut, vector); + ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } ++ + /* + * Send the IPI. The write to APIC_ICR fires this off. 
+ */ +@@ -220,6 +228,13 @@ + * program the ICR + */ + cfg = __prepare_ICR(0, vector); ++ ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } + + /* + * Send the IPI. The write to APIC_ICR fires this off. +@@ -506,6 +521,11 @@ + + static struct call_data_struct * call_data; + ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(DUMP_VECTOR); ++} ++ + /* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. +@@ -561,7 +581,7 @@ + return 0; + } + +-static void stop_this_cpu (void * dummy) ++void stop_this_cpu (void * dummy) + { + /* + * Remove this CPU: +@@ -622,4 +642,3 @@ + atomic_inc(&call_data->finished); + } + } +- +Index: linux-2.6.10/arch/i386/kernel/i386_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/i386_ksyms.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/i386_ksyms.c 2005-04-05 16:47:53.907210576 +0800 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -31,6 +32,7 @@ + #include + #include + #include ++#include + #include + + extern void dump_thread(struct pt_regs *, struct user *); +@@ -192,3 +194,20 @@ + #endif + + EXPORT_SYMBOL(csum_partial); ++ ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++#endif ++extern int pfn_is_ram(unsigned long); ++EXPORT_SYMBOL(pfn_is_ram); ++#ifdef ARCH_HAS_NMI_WATCHDOG ++EXPORT_SYMBOL(touch_nmi_watchdog); ++#endif ++#endif +Index: linux-2.6.10/arch/s390/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/s390/Kconfig.debug 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/s390/Kconfig.debug 2005-04-05 16:47:53.921208448 +0800 +@@ -2,4 +2,13 @@ + + source "lib/Kconfig.debug" + ++config KERNTYPES ++ bool "Kerntypes debugging information" ++ default y ++ ---help--- ++ Say Y here to save additional kernel debugging information in the ++ file init/kerntypes.o. This information is used by crash analysis ++ tools such as lcrash to assign structures to kernel addresses. 
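Each of the die()/NMI hunks above ends in a call to dump(), which the linux/dump.h hunk later in this patch defines as an indirect call through dump_function_ptr; when no dump module has registered, a crash path pays only a NULL test. A self-contained sketch of the pattern, using userspace stand-ins (printf, an opaque pt_regs) purely for illustration:

    #include <stdio.h>

    struct pt_regs;                 /* opaque here; real layout is per-arch */

    /* set by the dump module on load, presumably cleared on unload */
    static void (*dump_function_ptr)(const char *, const struct pt_regs *);

    static void dump(const char *str, const struct pt_regs *regs)
    {
            if (dump_function_ptr)      /* no module loaded -> no-op */
                    dump_function_ptr(str, regs);
    }

    static void example_dumper(const char *str, const struct pt_regs *regs)
    {
            printf("dumping: %s\n", str);
    }

    int main(void)
    {
            dump("oops before registration", NULL); /* silently ignored */
            dump_function_ptr = example_dumper;     /* "module" registers */
            dump("oops after registration", NULL);  /* now dispatched */
            return 0;
    }

This is why the crash paths can call dump() unconditionally: every call site tests the pointer inside the wrapper, not at the caller.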
++ ++ + endmenu +Index: linux-2.6.10/arch/s390/boot/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/s390/boot/Makefile 2004-12-25 05:35:49.000000000 +0800 ++++ linux-2.6.10/arch/s390/boot/Makefile 2005-04-05 16:47:53.922208296 +0800 +@@ -15,4 +15,4 @@ + + install: $(CONFIGURE) $(obj)/image + sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ +- System.map Kerntypes "$(INSTALL_PATH)" ++ System.map init/Kerntypes "$(INSTALL_PATH)" +Index: linux-2.6.10/arch/s390/boot/install.sh +=================================================================== +--- linux-2.6.10.orig/arch/s390/boot/install.sh 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/s390/boot/install.sh 2005-04-05 16:47:53.921208448 +0800 +@@ -16,7 +16,8 @@ + # $1 - kernel version + # $2 - kernel image file + # $3 - kernel map file +-# $4 - default install path (blank if root directory) ++# $4 - kernel type file ++# $5 - default install path (blank if root directory) + # + + # User may have a custom install script +@@ -26,13 +27,13 @@ + + # Default install - same as make zlilo + +-if [ -f $4/vmlinuz ]; then +- mv $4/vmlinuz $4/vmlinuz.old ++if [ -f $5/vmlinuz ]; then ++ mv $5/vmlinuz $5/vmlinuz.old + fi + +-if [ -f $4/System.map ]; then +- mv $4/System.map $4/System.old ++if [ -f $5/System.map ]; then ++ mv $5/System.map $5/System.old + fi + +-cat $2 > $4/vmlinuz +-cp $3 $4/System.map ++cat $2 > $5/vmlinuz ++cp $3 $5/System.map +Index: linux-2.6.10/arch/ia64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/ia64/Kconfig.debug 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/ia64/Kconfig.debug 2005-04-05 16:47:53.917209056 +0800 +@@ -2,6 +2,65 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. 
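The i386 smp.c hunks above (and the x86_64 ones below) force DUMP_VECTOR IPIs into NMI delivery mode, so CPUs spinning with interrupts disabled still enter the dump handler. The rewrite is plain bit surgery on the prepared ICR value; a sketch, where the APIC field constants are assumptions matching the usual asm/apicdef.h values rather than anything defined in this patch:

    #include <stdio.h>

    #define APIC_VECTOR_MASK 0x000FFu   /* low byte selects the vector */
    #define APIC_DM_NMI      0x00400u   /* delivery-mode field = NMI */
    #define DUMP_VECTOR      0xFAu      /* illustrative vector number */

    static unsigned int force_nmi_delivery(unsigned int cfg, unsigned int vector)
    {
            if (vector == DUMP_VECTOR)
                    /* drop the vector bits, switch delivery mode to NMI */
                    cfg = (cfg & ~APIC_VECTOR_MASK) | APIC_DM_NMI;
            return cfg;
    }

    int main(void)
    {
            /* 0xFA in the vector field becomes pure NMI delivery: 0x400 */
            printf("icr=%#x\n", force_nmi_delivery(DUMP_VECTOR, DUMP_VECTOR));
            return 0;
    }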
++ ++ ++ + choice + prompt "Physical memory granularity" + default IA64_GRANULE_64MB +Index: linux-2.6.10/arch/ia64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/traps.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/traps.c 2005-04-05 16:47:53.918208904 +0800 +@@ -21,6 +21,8 @@ + #include + #include + #include ++#include ++#include + + extern spinlock_t timerlist_lock; + +@@ -89,6 +91,7 @@ + printk("%s[%d]: %s %ld [%d]\n", + current->comm, current->pid, str, err, ++die_counter); + show_regs(regs); ++ dump((char *)str, regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); + +Index: linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:29:27.954340968 +0800 ++++ linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:47:53.917209056 +0800 +@@ -7,7 +7,6 @@ + + #include + #include +- + #include + EXPORT_SYMBOL(memset); + EXPORT_SYMBOL(memchr); +@@ -28,6 +27,9 @@ + EXPORT_SYMBOL(strstr); + EXPORT_SYMBOL(strpbrk); + ++#include ++EXPORT_SYMBOL(sys_ioctl); ++ + #include + EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ + +@@ -125,3 +127,21 @@ + # endif + # endif + #endif ++ ++#include ++ ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t _irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++extern int (*dump_ipi_function_ptr)(struct pt_regs *); ++extern void dump_send_ipi(void); ++EXPORT_SYMBOL(_irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++EXPORT_SYMBOL(dump_ipi_function_ptr); ++#endif ++#endif ++ +Index: linux-2.6.10/arch/ia64/kernel/irq.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/irq.c 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/irq.c 2005-04-05 16:47:53.919208752 +0800 +@@ -933,7 +933,11 @@ + + static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; + ++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) ++cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; ++#else + static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; ++#endif + + static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; + +Index: linux-2.6.10/arch/ia64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/smp.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/smp.c 2005-04-05 16:47:53.920208600 +0800 +@@ -31,6 +31,10 @@ + #include + #include + ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++#include ++#endif ++ + #include + #include + #include +@@ -67,6 +71,11 @@ + #define IPI_CALL_FUNC 0 + #define IPI_CPU_STOP 1 + ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++#define IPI_DUMP_INTERRUPT 4 ++ int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL; ++#endif ++ + /* This needs to be cacheline aligned because it is written to by *other* CPUs. 
*/ + static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; + +@@ -84,7 +93,9 @@ + spin_unlock_irq(&call_lock); + } + +-static void ++ ++/*changed static void stop_this_cpu -> void stop_this_cpu */ ++void + stop_this_cpu (void) + { + /* +@@ -155,6 +166,15 @@ + case IPI_CPU_STOP: + stop_this_cpu(); + break; ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++ case IPI_DUMP_INTERRUPT: ++ if( dump_ipi_function_ptr != NULL ) { ++ if (!dump_ipi_function_ptr(regs)) { ++ printk(KERN_ERR "(*dump_ipi_function_ptr)(): rejected IPI_DUMP_INTERRUPT\n"); ++ } ++ } ++ break; ++#endif + + default: + printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); +@@ -369,9 +389,17 @@ + { + send_IPI_allbutself(IPI_CPU_STOP); + } ++EXPORT_SYMBOL(smp_send_stop); + + int __init + setup_profiling_timer (unsigned int multiplier) + { + return -EINVAL; + } ++ ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(IPI_DUMP_INTERRUPT); ++} ++#endif +Index: linux-2.6.10/arch/ppc64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/Kconfig.debug 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/Kconfig.debug 2005-04-05 16:47:53.922208296 +0800 +@@ -2,6 +2,64 @@ + + source "lib/Kconfig.debug" + ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP ++ tristate "Crash dump support" ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. Need 'kexec' support for this to work. ++ **** Not supported at present **** ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ Need 'kexec' support to use this option. ++ **** Not supported at present **** ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. 
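ia64 has no NMI shortcut comparable to the x86 ICR trick, so the hunks above add a dedicated IPI opcode (IPI_DUMP_INTERRUPT) and dispatch it through the registered dump_ipi_function_ptr, logging when the handler rejects the interrupt. A compact userspace model of that dispatch, with a toy handler standing in for the dump module's real one:

    #include <stdio.h>

    struct pt_regs;

    #define IPI_CALL_FUNC      0
    #define IPI_CPU_STOP       1
    #define IPI_DUMP_INTERRUPT 4

    static int (*dump_ipi_function_ptr)(struct pt_regs *);

    static void handle_ipi(int which, struct pt_regs *regs)
    {
            switch (which) {
            case IPI_DUMP_INTERRUPT:
                    /* only meaningful once a dump module has registered */
                    if (dump_ipi_function_ptr &&
                        !dump_ipi_function_ptr(regs))
                            fprintf(stderr, "dump handler rejected IPI\n");
                    break;
            default:
                    fprintf(stderr, "unknown IPI %d\n", which);
            }
    }

    static int toy_handler(struct pt_regs *regs)
    {
            (void)regs;
            return 1;                       /* accepted */
    }

    int main(void)
    {
            handle_ipi(IPI_DUMP_INTERRUPT, NULL);  /* no handler: ignored */
            dump_ipi_function_ptr = toy_handler;
            handle_ipi(IPI_DUMP_INTERRUPT, NULL);  /* accepted */
            return 0;
    }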
++ + config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL +Index: linux-2.6.10/arch/ppc64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/traps.c 2004-12-25 05:34:47.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/traps.c 2005-04-05 16:47:53.923208144 +0800 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -116,6 +117,7 @@ + if (nl) + printk("\n"); + show_regs(regs); ++ dump((char *)str, regs); + bust_spinlocks(0); + spin_unlock_irq(&die_lock); + +Index: linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/ppc_ksyms.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c 2005-04-05 16:47:53.925207840 +0800 +@@ -159,6 +159,17 @@ + EXPORT_SYMBOL(get_wchan); + EXPORT_SYMBOL(console_drivers); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++extern int dump_page_is_ram(unsigned long); ++EXPORT_SYMBOL(dump_page_is_ram); ++#ifdef CONFIG_SMP ++EXPORT_SYMBOL(irq_affinity); ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++#endif ++#endif ++ + EXPORT_SYMBOL(tb_ticks_per_usec); + EXPORT_SYMBOL(paca); + EXPORT_SYMBOL(cur_cpu_spec); +Index: linux-2.6.10/arch/ppc64/kernel/lmb.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/lmb.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/lmb.c 2005-04-05 16:47:53.924207992 +0800 +@@ -344,3 +344,31 @@ + + return pa; + } ++ ++ ++/* ++ * This is a copy of page_is_ram() (mm/init.c); the difference is that ++ * it identifies all memory holes.
++ */ ++int dump_page_is_ram(unsigned long pfn) ++{ ++ int i; ++ unsigned long paddr = (pfn << PAGE_SHIFT); ++ ++ for (i=0; i < lmb.memory.cnt ;i++) { ++ unsigned long base; ++ ++#ifdef CONFIG_MSCHUNKS ++ base = lmb.memory.region[i].physbase; ++#else ++ base = lmb.memory.region[i].base; ++#endif ++ if ((paddr >= base) && ++ (paddr < (base + lmb.memory.region[i].size))) { ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ +Index: linux-2.6.10/arch/ppc64/kernel/xics.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/xics.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/xics.c 2005-04-05 16:47:53.925207840 +0800 +@@ -421,7 +421,8 @@ + smp_message_recv(PPC_MSG_MIGRATE_TASK, regs); + } + #endif +-#ifdef CONFIG_DEBUGGER ++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \ ++ || defined(CONFIG_CRASH_DUMP_MODULE) + if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, + &xics_ipi_message[cpu].value)) { + mb(); +Index: linux-2.6.10/arch/ppc64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/smp.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/smp.c 2005-04-05 16:47:53.926207688 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -71,6 +72,7 @@ + struct smp_ops_t *smp_ops; + + static volatile unsigned int cpu_callin_map[NR_CPUS]; ++static int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL; + + extern unsigned char stab_array[]; + +@@ -177,9 +179,16 @@ + /* spare */ + break; + #endif +-#ifdef CONFIG_DEBUGGER ++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \ ++ || defined(CONFIG_CRASH_DUMP_MODULE) + case PPC_MSG_DEBUGGER_BREAK: +- debugger_ipi(regs); ++ if (dump_ipi_function_ptr) { ++ dump_ipi_function_ptr(regs); ++ } ++#ifdef CONFIG_DEBUGGER ++ else ++ debugger_ipi(regs); ++#endif + break; + #endif + default: +@@ -201,7 +210,16 @@ + } + #endif + +-static void stop_this_cpu(void *dummy) ++void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)) ++{ ++ dump_ipi_function_ptr = dump_ipi_callback; ++ if (dump_ipi_callback) { ++ mb(); ++ smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK); ++ } ++} ++ ++void stop_this_cpu(void *dummy) + { + local_irq_disable(); + while (1) +Index: linux-2.6.10/arch/x86_64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/Kconfig.debug 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/Kconfig.debug 2005-04-05 16:47:53.909210272 +0800 +@@ -2,6 +2,66 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. 
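dump_page_is_ram() above duplicates page_is_ram() because the dumper must also recognize the holes between lmb memory banks on ppc64; the test is a simple interval check over the bank table. A self-contained sketch of that check, with a made-up two-bank layout:

    #include <stdint.h>
    #include <stdio.h>

    struct mem_bank { uint64_t base, size; };

    /* illustrative layout: two 1G banks with a hole between them */
    static const struct mem_bank banks[] = {
            { 0x00000000ULL, 0x40000000ULL },   /* 0..1G  */
            { 0x80000000ULL, 0x40000000ULL },   /* 2G..3G */
    };

    #define PAGE_SHIFT 12

    static int dump_page_is_ram(unsigned long pfn)
    {
            uint64_t paddr = (uint64_t)pfn << PAGE_SHIFT;
            unsigned int i;

            for (i = 0; i < sizeof(banks) / sizeof(banks[0]); i++)
                    if (paddr >= banks[i].base &&
                        paddr <  banks[i].base + banks[i].size)
                            return 1;
            return 0;   /* a hole: the dumper must not touch it */
    }

    int main(void)
    {
            printf("%d %d\n", dump_page_is_ram(0x100),    /* in bank 0: 1 */
                              dump_page_is_ram(0x50000)); /* in the hole: 0 */
            return 0;
    }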
++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. ++ ++ ++ + # !SMP for now because the context switch early causes GPF in segment reloading + # and the GS base checking does the wrong thing then, causing a hang. + config CHECKING +Index: linux-2.6.10/arch/x86_64/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/mm/init.c 2005-04-05 16:29:30.040023896 +0800 ++++ linux-2.6.10/arch/x86_64/mm/init.c 2005-04-05 16:47:53.916209208 +0800 +@@ -378,7 +378,7 @@ + __flush_tlb_all(); + } + +-static inline int page_is_ram (unsigned long pagenr) ++inline int page_is_ram (unsigned long pagenr) + { + int i; + +Index: linux-2.6.10/arch/x86_64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/traps.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/traps.c 2005-04-05 16:47:53.915209360 +0800 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -369,6 +370,7 @@ + printk("\n"); + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); ++ dump((char *)str, regs); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->rip); +Index: linux-2.6.10/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/setup.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/setup.c 2005-04-05 16:47:53.911209968 +0800 +@@ -221,6 +221,8 @@ + } + } + ++unsigned long crashdump_addr = 0xdeadbeef; ++ + static __init void parse_cmdline_early (char ** cmdline_p) + { + char c = ' ', *to = command_line, *from = COMMAND_LINE; +@@ -311,6 +313,9 @@ + + if (!memcmp(from,"oops=panic", 10)) + panic_on_oops = 1; ++ ++ if (c == ' ' && !memcmp(from, "crashdump=", 10)) ++ crashdump_addr = memparse(from+10, &from); + + next_char: + c = *(from++); +@@ -441,6 +446,10 @@ + reserve_bootmem_generic(addr, PAGE_SIZE); + } + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void crashdump_reserve(void); ++#endif ++ + void __init setup_arch(char **cmdline_p) + { + unsigned long low_mem_size; +@@ -550,6 +559,9 @@ + } + #endif + paging_init(); ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ crashdump_reserve(); /* Preserve crash dump state from prev boot */ ++#endif + + check_ioapic(); + #ifdef CONFIG_ACPI_BOOT +Index: linux-2.6.10/arch/x86_64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/smp.c 2004-12-25
05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/smp.c 2005-04-05 16:47:53.915209360 +0800 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -151,6 +152,13 @@ + if (!mm) + BUG(); + ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } ++ + /* + * I'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. +@@ -253,6 +261,13 @@ + send_IPI_allbutself(KDB_VECTOR); + } + ++ ++/* void dump_send_ipi(int (*dump_ipi_handler)(struct pt_regs *)); */ ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(DUMP_VECTOR); ++} ++ + /* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing +@@ -340,6 +355,18 @@ + return 0; + } + ++void stop_this_cpu(void* dummy) ++{ ++ /* ++ * Remove this CPU: ++ */ ++ cpu_clear(smp_processor_id(), cpu_online_map); ++ local_irq_disable(); ++ disable_local_APIC(); ++ for (;;) ++ asm("hlt"); ++} ++ + void smp_stop_cpu(void) + { + /* +Index: linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/x8664_ksyms.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c 2005-04-05 16:47:53.914209512 +0800 +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + + extern spinlock_t rtc_lock; +@@ -216,6 +217,20 @@ + extern unsigned long __supported_pte_mask; + EXPORT_SYMBOL(__supported_pte_mask); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(dump_send_ipi); ++EXPORT_SYMBOL(stop_this_cpu); ++#endif ++extern int page_is_ram(unsigned long); ++EXPORT_SYMBOL(page_is_ram); ++#endif ++ + #ifdef CONFIG_SMP + EXPORT_SYMBOL(flush_tlb_page); + EXPORT_SYMBOL_GPL(flush_tlb_all); +Index: linux-2.6.10/arch/x86_64/kernel/pci-gart.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/pci-gart.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/pci-gart.c 2005-04-05 16:47:53.913209664 +0800 +@@ -34,7 +34,7 @@ + dma_addr_t bad_dma_address; + + unsigned long iommu_bus_base; /* GART remapping area (physical) */ +-static unsigned long iommu_size; /* size of remapping area bytes */ ++unsigned long iommu_size; /* size of remapping area bytes */ + static unsigned long iommu_pages; /* .. 
and in pages */ + + u32 *iommu_gatt_base; /* Remapping table */ +Index: linux-2.6.10/init/version.c +=================================================================== +--- linux-2.6.10.orig/init/version.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/init/version.c 2005-04-05 16:47:53.896212248 +0800 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #define version(a) Version_ ## a + #define version_string(a) version(a) +@@ -31,3 +32,6 @@ + const char *linux_banner = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; ++ ++const char *LINUX_COMPILE_VERSION_ID = __stringify(LINUX_COMPILE_VERSION_ID); ++LINUX_COMPILE_VERSION_ID_TYPE; +Index: linux-2.6.10/init/kerntypes.c +=================================================================== +--- linux-2.6.10.orig/init/kerntypes.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/init/kerntypes.c 2005-04-05 16:47:53.895212400 +0800 +@@ -0,0 +1,40 @@ ++/* ++ * kerntypes.c ++ * ++ * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and ++ * Matt D. Robinson (yakker@alacritech.com) ++ * ++ * Dummy module that includes headers for all kernel types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef LINUX_COMPILE_VERSION_ID_TYPE ++/* Define version type for version validation of dump and kerntypes */ ++LINUX_COMPILE_VERSION_ID_TYPE; ++#endif ++#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) ++extern struct runqueue runqueues; ++struct runqueue rn; ++#endif ++ ++struct new_utsname *p; ++void ++kerntypes_dummy(void) ++{ ++} +Index: linux-2.6.10/init/main.c +=================================================================== +--- linux-2.6.10.orig/init/main.c 2005-04-05 16:29:30.028025720 +0800 ++++ linux-2.6.10/init/main.c 2005-04-05 16:47:53.897212096 +0800 +@@ -109,6 +109,16 @@ + EXPORT_SYMBOL(system_state); + + /* ++ * The kernel_magic value represents the address of _end, which allows ++ * namelist tools to "match" each other respectively. That way a tool ++ * that looks at /dev/mem can verify that it is using the right System.map ++ * file -- if kernel_magic doesn't equal the namelist value of _end, ++ * something's wrong. ++ */ ++extern unsigned long _end; ++unsigned long *kernel_magic = &_end; ++ ++/* + * Boot command-line arguments + */ + #define MAX_INIT_ARGS 32 +Index: linux-2.6.10/init/Makefile +=================================================================== +--- linux-2.6.10.orig/init/Makefile 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/init/Makefile 2005-04-05 16:47:53.897212096 +0800 +@@ -9,12 +9,20 @@ + mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o + mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o + ++extra-$(CONFIG_KERNTYPES) += kerntypes.o ++#For IA64, compile kerntypes in dwarf-2 format. 
++ifeq ($(CONFIG_IA64),y) ++CFLAGS_kerntypes.o := -gdwarf-2 ++else ++CFLAGS_kerntypes.o := -gstabs ++endif ++ + # files to be removed upon make clean + clean-files := ../include/linux/compile.h + + # dependencies on generated files need to be listed explicitly + +-$(obj)/version.o: include/linux/compile.h ++$(obj)/version.o $(obj)/kerntypes.o: include/linux/compile.h + + # compile.h changes depending on hostname, generation number, etc, + # so we regenerate it always. +@@ -24,3 +32,4 @@ + include/linux/compile.h: FORCE + @echo ' CHK $@' + @$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)" ++ +Index: linux-2.6.10/include/asm-um/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-um/kerntypes.h 2005-04-05 16:47:53.864217112 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-um/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Usermode-Linux-specific header files */ ++#ifndef _UM_KERNTYPES_H ++#define _UM_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _UM_KERNTYPES_H */ +Index: linux-2.6.10/include/linux/sysctl.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sysctl.h 2005-04-05 16:29:27.969338688 +0800 ++++ linux-2.6.10/include/linux/sysctl.h 2005-04-05 16:47:53.894212552 +0800 +@@ -135,6 +135,7 @@ + KERN_HZ_TIMER=65, /* int: hz timer on or off */ + KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SETUID_DUMPABLE=67, /* int: behaviour of dumps for setuid core */ ++ KERN_DUMP=68, /* directory: dump parameters */ + }; + + +Index: linux-2.6.10/include/linux/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:47:05.178618448 +0800 ++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:47:53.891213008 +0800 +@@ -94,6 +94,7 @@ + extern int nr_threads; + extern int last_pid; + DECLARE_PER_CPU(unsigned long, process_counts); ++DECLARE_PER_CPU(struct runqueue, runqueues); + extern int nr_processes(void); + extern unsigned long nr_running(void); + extern unsigned long nr_uninterruptible(void); +@@ -760,6 +761,110 @@ + void yield(void); + + /* ++ * These are the runqueue data structures: ++ */ ++ ++#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) ++ ++typedef struct runqueue runqueue_t; ++ ++struct prio_array { ++ unsigned int nr_active; ++ unsigned long bitmap[BITMAP_SIZE]; ++ struct list_head queue[MAX_PRIO]; ++}; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * ++ * Locking rule: those places that want to lock multiple runqueues ++ * (such as the load balancing or the thread migration code), lock ++ * acquire operations must be ordered by ascending &runqueue. ++ */ ++struct runqueue { ++ spinlock_t lock; ++ ++ /* ++ * nr_running and cpu_load should be in the same cacheline because ++ * remote CPUs use both these fields when doing load calculation. 
++ */ ++ unsigned long nr_running; ++#ifdef CONFIG_SMP ++ unsigned long cpu_load; ++#endif ++ unsigned long long nr_switches; ++ ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ ++ unsigned long expired_timestamp; ++ unsigned long long timestamp_last_tick; ++ task_t *curr, *idle; ++ struct mm_struct *prev_mm; ++ prio_array_t *active, *expired, arrays[2]; ++ int best_expired_prio; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_SMP ++ struct sched_domain *sd; ++ ++ /* For active balancing */ ++ int active_balance; ++ int push_cpu; ++ ++ task_t *migration_thread; ++ struct list_head migration_queue; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ ++ /* sys_sched_yield() stats */ ++ unsigned long yld_exp_empty; ++ unsigned long yld_act_empty; ++ unsigned long yld_both_empty; ++ unsigned long yld_cnt; ++ ++ /* schedule() stats */ ++ unsigned long sched_noswitch; ++ unsigned long sched_switch; ++ unsigned long sched_cnt; ++ unsigned long sched_goidle; ++ ++ /* pull_task() stats */ ++ unsigned long pt_gained[MAX_IDLE_TYPES]; ++ unsigned long pt_lost[MAX_IDLE_TYPES]; ++ ++ /* active_load_balance() stats */ ++ unsigned long alb_cnt; ++ unsigned long alb_lost; ++ unsigned long alb_gained; ++ unsigned long alb_failed; ++ ++ /* try_to_wake_up() stats */ ++ unsigned long ttwu_cnt; ++ unsigned long ttwu_attempts; ++ unsigned long ttwu_moved; ++ ++ /* wake_up_new_task() stats */ ++ unsigned long wunt_cnt; ++ unsigned long wunt_moved; ++ ++ /* sched_migrate_task() stats */ ++ unsigned long smt_cnt; ++ ++ /* sched_balance_exec() stats */ ++ unsigned long sbe_cnt; ++#endif ++}; ++ ++/* + * The default (Linux) execution domain. + */ + extern struct exec_domain default_exec_domain; +Index: linux-2.6.10/include/linux/miscdevice.h +=================================================================== +--- linux-2.6.10.orig/include/linux/miscdevice.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/miscdevice.h 2005-04-05 16:47:53.893212704 +0800 +@@ -25,6 +25,7 @@ + #define MICROCODE_MINOR 184 + #define MWAVE_MINOR 219 /* ACP/Mwave Modem */ + #define MPT_MINOR 220 ++#define CRASH_DUMP_MINOR 230 /* LKCD */ + #define MISC_DYNAMIC_MINOR 255 + + #define TUN_MINOR 200 +Index: linux-2.6.10/include/linux/dump.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dump.h 2005-04-05 16:47:53.893212704 +0800 +@@ -0,0 +1,406 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * ++ * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net) ++ * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * Most of this is the same old stuff from vmdump.h, except now we're ++ * actually a stand-alone driver plugged into the block layer interface, ++ * with the exception that we now allow for compression modes externally ++ * loaded (e.g., someone can come up with their own). ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++/* This header file includes all structure definitions for crash dumps. */ ++#ifndef _DUMP_H ++#define _DUMP_H ++ ++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Predefine default DUMP_PAGE constants, asm header may override. ++ * ++ * On ia64 discontinuous memory systems it's possible for the memory ++ * banks to stop at 2**12 page alignments, the smallest possible page ++ * size. But the system page size, PAGE_SIZE, is in fact larger. ++ */ ++#define DUMP_PAGE_SHIFT PAGE_SHIFT ++#define DUMP_PAGE_MASK PAGE_MASK ++#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr) ++ ++/* ++ * Dump offset changed from 4Kb to 64Kb to support multiple PAGE_SIZE ++ * (kernel page size). Assumption goes that 64K is the highest page size ++ * supported ++ */ ++ ++#define DUMP_HEADER_OFFSET (1ULL << 16) ++ ++#define OLDMINORBITS 8 ++#define OLDMINORMASK ((1U << OLDMINORBITS) -1) ++ ++/* Making DUMP_PAGE_SIZE = PAGE_SIZE, to support dumping on architectures ++ * which support page sizes (PAGE_SIZE) greater than 4KB. ++ * Will it affect ia64 discontinuous memory systems ???? ++ */ ++#define DUMP_PAGE_SIZE PAGE_SIZE ++ ++/* thread_info lies at the bottom of stack, (Except IA64). */ ++#define STACK_START_POSITION(tsk) (tsk->thread_info) ++/* ++ * Predefined default memcpy() to use when copying memory to the dump buffer. ++ * ++ * On ia64 there is a heads up function that can be called to let the prom ++ * machine check monitor know that the current activity is risky and it should ++ * ignore the fault (nofault). In this case the ia64 header will redefine this ++ * macro to __dump_memcpy() and use it's arch specific version. ++ */ ++#define DUMP_memcpy memcpy ++#define bzero(a,b) memset(a, 0, b) ++ ++/* necessary header files */ ++#include /* for architecture-specific header */ ++ ++/* ++ * Size of the buffer that's used to hold: ++ * ++ * 1. the dump header (padded to fill the complete buffer) ++ * 2. 
the possibly compressed page headers and data ++ * ++ * = 256k for page size >= 64k ++ * = 64k for page size < 64k ++ */ ++#if (PAGE_SHIFT >= 16) ++#define DUMP_BUFFER_SIZE (256 * 1024) /* size of dump buffer */ ++#else ++#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */ ++#endif ++ ++#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE ++ ++/* standard header definitions */ ++#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ ++#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */ ++#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ ++#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ ++ ++/* dump levels - type specific stuff added later -- add as necessary */ ++#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */ ++#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */ ++#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */ ++#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */ ++#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */ ++#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */ ++ ++ ++/* dump compression options -- add as necessary */ ++#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ ++#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */ ++#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ ++ ++/* dump flags - any dump-type specific flags -- add as necessary */ ++#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */ ++#define DUMP_FLAGS_SOFTBOOT 0x2 /* 2 stage soft-boot based dump */ ++#define DUMP_FLAGS_NONDISRUPT 0X1 /* non-disruptive dumping */ ++ ++#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */ ++#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */ ++#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */ ++ ++/* dump header flags -- add as necessary */ ++#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) 
*/ ++#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ ++#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ ++#define DUMP_DH_END 0x4 /* end marker on a full dump */ ++#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */ ++#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */ ++#define DUMP_DH_NOT_USED 0x20 /* 1st bit not used in flags */ ++ ++/* names for various dump parameters in /proc/kernel */ ++#define DUMP_ROOT_NAME "sys/dump" ++#define DUMP_DEVICE_NAME "device" ++#define DUMP_COMPRESS_NAME "compress" ++#define DUMP_LEVEL_NAME "level" ++#define DUMP_FLAGS_NAME "flags" ++#define DUMP_ADDR_NAME "addr" ++ ++#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */ ++ ++/* CTL_DUMP names: */ ++enum ++{ ++ CTL_DUMP_DEVICE=1, ++ CTL_DUMP_COMPRESS=3, ++ CTL_DUMP_LEVEL=3, ++ CTL_DUMP_FLAGS=4, ++ CTL_DUMP_ADDR=5, ++ CTL_DUMP_TEST=6, ++}; ++ ++ ++/* page size for gzip compression -- buffered slightly beyond hardware PAGE_SIZE used by DUMP */ ++#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512) ++ ++/* dump ioctl() control options */ ++#define DIOSDUMPDEV _IOW('p', 0xA0, unsigned int) /* set the dump device */ ++#define DIOGDUMPDEV _IOR('p', 0xA1, unsigned int) /* get the dump device */ ++#define DIOSDUMPLEVEL _IOW('p', 0xA2, unsigned int) /* set the dump level */ ++#define DIOGDUMPLEVEL _IOR('p', 0xA3, unsigned int) /* get the dump level */ ++#define DIOSDUMPFLAGS _IOW('p', 0xA4, unsigned int) /* set the dump flag parameters */ ++#define DIOGDUMPFLAGS _IOR('p', 0xA5, unsigned int) /* get the dump flag parameters */ ++#define DIOSDUMPCOMPRESS _IOW('p', 0xA6, unsigned int) /* set the dump compress level */ ++#define DIOGDUMPCOMPRESS _IOR('p', 0xA7, unsigned int) /* get the dump compress level */ ++ ++/* these ioctls are used only by netdump module */ ++#define DIOSTARGETIP _IOW('p', 0xA8, unsigned int) /* set the target m/c's ip */ ++#define DIOGTARGETIP _IOR('p', 0xA9, unsigned int) /* get the target m/c's ip */ ++#define DIOSTARGETPORT _IOW('p', 0xAA, unsigned int) /* set the target m/c's port */ ++#define DIOGTARGETPORT _IOR('p', 0xAB, unsigned int) /* get the target m/c's port */ ++#define DIOSSOURCEPORT _IOW('p', 0xAC, unsigned int) /* set the source m/c's port */ ++#define DIOGSOURCEPORT _IOR('p', 0xAD, unsigned int) /* get the source m/c's port */ ++#define DIOSETHADDR _IOW('p', 0xAE, unsigned int) /* set ethernet address */ ++#define DIOGETHADDR _IOR('p', 0xAF, unsigned int) /* get ethernet address */ ++#define DIOGDUMPOKAY _IOR('p', 0xB0, unsigned int) /* check if dump is configured */ ++#define DIOSDUMPTAKE _IOW('p', 0xB1, unsigned int) /* Take a manual dump */ ++ ++/* ++ * Structure: __dump_header ++ * Function: This is the header dumped at the top of every valid crash ++ * dump. ++ */ ++struct __dump_header { ++ /* the dump magic number -- unique to verify dump is valid */ ++ u64 dh_magic_number; ++ ++ /* the version number of this dump */ ++ u32 dh_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ u32 dh_header_size; ++ ++ /* the level of this dump (just a header?) */ ++ u32 dh_dump_level; ++ ++ /* ++ * We assume dump_page_size to be 4K in every case. ++ * Store here the configurable system page size (4K, 8K, 16K, etc.) 
++ */ ++ u32 dh_page_size; ++ ++ /* the size of all physical memory */ ++ u64 dh_memory_size; ++ ++ /* the start of physical memory */ ++ u64 dh_memory_start; ++ ++ /* the end of physical memory */ ++ u64 dh_memory_end; ++ ++ /* the number of hardware/physical pages in this dump specifically */ ++ u32 dh_num_dump_pages; ++ ++ /* the panic string, if available */ ++ char dh_panic_string[DUMP_PANIC_LEN]; ++ ++ /* timeval depends on architecture, two long values */ ++ struct { ++ u64 tv_sec; ++ u64 tv_usec; ++ } dh_time; /* the time of the system crash */ ++ ++ /* the NEW utsname (uname) information -- in character form */ ++ /* we do this so we don't have to include utsname.h */ ++ /* plus it helps us be more architecture independent */ ++ /* now maybe one day soon they'll make the [65] a #define! */ ++ char dh_utsname_sysname[65]; ++ char dh_utsname_nodename[65]; ++ char dh_utsname_release[65]; ++ char dh_utsname_version[65]; ++ char dh_utsname_machine[65]; ++ char dh_utsname_domainname[65]; ++ ++ /* the address of current task (OLD = void *, NEW = u64) */ ++ u64 dh_current_task; ++ ++ /* what type of compression we're using in this dump (if any) */ ++ u32 dh_dump_compress; ++ ++ /* any additional flags */ ++ u32 dh_dump_flags; ++ ++ /* any additional flags */ ++ u32 dh_dump_device; ++} __attribute__((packed)); ++ ++/* ++ * Structure: __dump_page ++ * Function: To act as the header associated to each physical page of ++ * memory saved in the system crash dump. This allows for ++ * easy reassembly of each crash dump page. The address bits ++ * are split to make things easier for 64-bit/32-bit system ++ * conversions. ++ * ++ * dp_byte_offset and dp_page_index are landmarks that are helpful when ++ * looking at a hex dump of /dev/vmdump, ++ */ ++struct __dump_page { ++ /* the address of this dump page */ ++ u64 dp_address; ++ ++ /* the size of this dump page */ ++ u32 dp_size; ++ ++ /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ ++ u32 dp_flags; ++} __attribute__((packed)); ++ ++/* ++ * Structure: __lkcdinfo ++ * Function: This structure contains information needed for the lkcdutils ++ * package (particularly lcrash) to determine what information is ++ * associated to this kernel, specifically. ++ */ ++struct __lkcdinfo { ++ int arch; ++ int ptrsz; ++ int byte_order; ++ int linux_release; ++ int page_shift; ++ int page_size; ++ u64 page_mask; ++ u64 page_offset; ++ int stack_offset; ++}; ++ ++#ifdef __KERNEL__ ++ ++/* ++ * Structure: __dump_compress ++ * Function: This is what an individual compression mechanism can use ++ * to plug in their own compression techniques. It's always ++ * best to build these as individual modules so that people ++ * can put in whatever they want. ++ */ ++struct __dump_compress { ++ /* the list_head structure for list storage */ ++ struct list_head list; ++ ++ /* the type of compression to use (DUMP_COMPRESS_XXX) */ ++ int compress_type; ++ const char *compress_name; ++ ++ /* the compression function to call */ ++ u32 (*compress_func)(const u8 *, u32, u8 *, u32, unsigned long); ++}; ++ ++/* functions for dump compression registration */ ++extern void dump_register_compression(struct __dump_compress *); ++extern void dump_unregister_compression(int); ++ ++/* ++ * Structure dump_mbank[]: ++ * ++ * For CONFIG_DISCONTIGMEM systems this array specifies the ++ * memory banks/chunks that need to be dumped after a panic. ++ * ++ * For classic systems it specifies a single set of pages from ++ * 0 to max_mapnr. 
++ */ ++struct __dump_mbank { ++ u64 start; ++ u64 end; ++ int type; ++ int pad1; ++ long pad2; ++}; ++ ++#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1 ++#define DUMP_MBANK_TYPE_OTHER 2 ++ ++#define MAXCHUNKS 256 ++extern int dump_mbanks; ++extern struct __dump_mbank dump_mbank[MAXCHUNKS]; ++ ++/* notification event codes */ ++#define DUMP_BEGIN 0x0001 /* dump beginning */ ++#define DUMP_END 0x0002 /* dump ending */ ++ ++/* Scheduler soft spin control. ++ * ++ * 0 - no dump in progress ++ * 1 - cpu0 is dumping, ... ++ */ ++extern unsigned long dump_oncpu; ++extern void dump_execute(const char *, const struct pt_regs *); ++ ++/* ++ * Notifier list for kernel code which wants to be called ++ * at kernel dump. ++ */ ++extern struct notifier_block *dump_notifier_list; ++static inline int register_dump_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_register(&dump_notifier_list, nb); ++} ++static inline int unregister_dump_notifier(struct notifier_block * nb) ++{ ++ return notifier_chain_unregister(&dump_notifier_list, nb); ++} ++ ++extern void (*dump_function_ptr)(const char *, const struct pt_regs *); ++static inline void dump(char * str, struct pt_regs * regs) ++{ ++ if (dump_function_ptr) ++ dump_function_ptr(str, regs); ++} ++ ++/* ++ * Common Arch Specific Functions should be declared here. ++ * This allows the C compiler to detect discrepancies. ++ */ ++extern void __dump_open(void); ++extern void __dump_cleanup(void); ++extern void __dump_clean_irq_state(void); ++extern void __dump_init(u64); ++extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *); ++extern void __dump_save_context(int cpu, const struct pt_regs *, struct task_struct *tsk); ++extern int __dump_configure_header(const struct pt_regs *); ++extern int __dump_irq_enable(void); ++extern void __dump_irq_restore(void); ++extern int __dump_page_valid(unsigned long index); ++#ifdef CONFIG_SMP ++extern void __dump_save_other_cpus(void); ++#else ++#define __dump_save_other_cpus() ++#endif ++ ++extern int manual_handle_crashdump(void); ++ ++/* to track all used (compound + zero order) pages */ ++#define PageInuse(p) (PageCompound(p) || page_count(p)) ++ ++#endif /* __KERNEL__ */ ++ ++#else /* !CONFIG_CRASH_DUMP */ ++ ++/* If not configured then make code disappear! */ ++#define register_dump_watchdog(x) do { } while(0) ++#define unregister_dump_watchdog(x) do { } while(0) ++#define register_dump_notifier(x) do { } while(0) ++#define unregister_dump_notifier(x) do { } while(0) ++#define dump_in_progress() 0 ++#define dump(x, y) do { } while(0) ++ ++#endif /* !CONFIG_CRASH_DUMP */ ++ ++#endif /* _DUMP_H */ +Index: linux-2.6.10/include/linux/dumpdev.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dumpdev.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dumpdev.h 2005-04-05 16:47:53.890213160 +0800 +@@ -0,0 +1,163 @@ ++/* ++ * Generic dump device interfaces for flexible system dump ++ * (Enables variation of dump target types e.g disk, network, memory) ++ * ++ * These interfaces have evolved based on discussions on lkcd-devel. ++ * Eventually the intent is to support primary and secondary or ++ * alternate targets registered at the same time, with scope for ++ * situation based failover or multiple dump devices used for parallel ++ * dump i/o. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) ++ * ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. 
++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#ifndef _LINUX_DUMPDEV_H ++#define _LINUX_DUMPDEV_H ++ ++#include ++#include ++#include ++#include ++ ++/* Determined by the dump target (device) type */ ++ ++struct dump_dev; ++ ++struct dump_dev_ops { ++ int (*open)(struct dump_dev *, unsigned long); /* configure */ ++ int (*release)(struct dump_dev *); /* unconfigure */ ++ int (*silence)(struct dump_dev *); /* when dump starts */ ++ int (*resume)(struct dump_dev *); /* when dump is over */ ++ int (*seek)(struct dump_dev *, loff_t); ++ /* trigger a write (async in nature typically) */ ++ int (*write)(struct dump_dev *, void *, unsigned long); ++ /* not usually used during dump, but option available */ ++ int (*read)(struct dump_dev *, void *, unsigned long); ++ /* use to poll for completion */ ++ int (*ready)(struct dump_dev *, void *); ++ int (*ioctl)(struct dump_dev *, unsigned int, unsigned long); ++}; ++ ++struct dump_dev { ++ char type_name[32]; /* block, net-poll etc */ ++ unsigned long device_id; /* interpreted differently for various types */ ++ struct dump_dev_ops *ops; ++ struct list_head list; ++ loff_t curr_offset; ++ struct netpoll np; ++}; ++ ++/* ++ * dump_dev type variations: ++ */ ++ ++/* block */ ++struct dump_blockdev { ++ struct dump_dev ddev; ++ dev_t dev_id; ++ struct block_device *bdev; ++ struct bio *bio; ++ loff_t start_offset; ++ loff_t limit; ++ int err; ++}; ++ ++static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev) ++{ ++ return container_of(dev, struct dump_blockdev, ddev); ++} ++ ++ ++/* mem - for internal use by soft-boot based dumper */ ++struct dump_memdev { ++ struct dump_dev ddev; ++ unsigned long indirect_map_root; ++ unsigned long nr_free; ++ struct page *curr_page; ++ unsigned long *curr_map; ++ unsigned long curr_map_offset; ++ unsigned long last_offset; ++ unsigned long last_used_offset; ++ unsigned long last_bs_offset; ++}; ++ ++static inline struct dump_memdev *DUMP_MDEV(struct dump_dev *dev) ++{ ++ return container_of(dev, struct dump_memdev, ddev); ++} ++ ++/* Todo/future - meant for raw dedicated interfaces e.g. mini-ide driver */ ++struct dump_rdev { ++ struct dump_dev ddev; ++ char name[32]; ++ int (*reset)(struct dump_rdev *, unsigned int, ++ unsigned long); ++ /* ... to do ... */ ++}; ++ ++/* just to get the size right when saving config across a soft-reboot */ ++struct dump_anydev { ++ union { ++ struct dump_blockdev bddev; ++ /* .. add other types here .. 
*/ ++ }; ++}; ++ ++ ++ ++/* Dump device / target operation wrappers */ ++/* These assume that dump_dev is initiatized to dump_config.dumper->dev */ ++ ++extern struct dump_dev *dump_dev; ++ ++static inline int dump_dev_open(unsigned long arg) ++{ ++ return dump_dev->ops->open(dump_dev, arg); ++} ++ ++static inline int dump_dev_release(void) ++{ ++ return dump_dev->ops->release(dump_dev); ++} ++ ++static inline int dump_dev_silence(void) ++{ ++ return dump_dev->ops->silence(dump_dev); ++} ++ ++static inline int dump_dev_resume(void) ++{ ++ return dump_dev->ops->resume(dump_dev); ++} ++ ++static inline int dump_dev_seek(loff_t offset) ++{ ++ return dump_dev->ops->seek(dump_dev, offset); ++} ++ ++static inline int dump_dev_write(void *buf, unsigned long len) ++{ ++ return dump_dev->ops->write(dump_dev, buf, len); ++} ++ ++static inline int dump_dev_ready(void *buf) ++{ ++ return dump_dev->ops->ready(dump_dev, buf); ++} ++ ++static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg) ++{ ++ if (!dump_dev || !dump_dev->ops->ioctl) ++ return -EINVAL; ++ return dump_dev->ops->ioctl(dump_dev, cmd, arg); ++} ++ ++extern int dump_register_device(struct dump_dev *); ++extern void dump_unregister_device(struct dump_dev *); ++ ++#endif /* _LINUX_DUMPDEV_H */ +Index: linux-2.6.10/include/linux/dump_netdev.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dump_netdev.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dump_netdev.h 2005-04-05 16:47:53.889213312 +0800 +@@ -0,0 +1,80 @@ ++/* ++ * linux/drivers/net/netconsole.h ++ * ++ * Copyright (C) 2001 Ingo Molnar ++ * ++ * This file contains the implementation of an IRQ-safe, crash-safe ++ * kernel console implementation that outputs kernel messages to the ++ * network. ++ * ++ * Modification history: ++ * ++ * 2001-09-17 started by Ingo Molnar. ++ */ ++ ++/**************************************************************** ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2, or (at your option) ++ * any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++ * ++ ****************************************************************/ ++ ++#define NETCONSOLE_VERSION 0x03 ++ ++enum netdump_commands { ++ COMM_NONE = 0, ++ COMM_SEND_MEM = 1, ++ COMM_EXIT = 2, ++ COMM_REBOOT = 3, ++ COMM_HELLO = 4, ++ COMM_GET_NR_PAGES = 5, ++ COMM_GET_PAGE_SIZE = 6, ++ COMM_START_NETDUMP_ACK = 7, ++ COMM_GET_REGS = 8, ++ COMM_GET_MAGIC = 9, ++ COMM_START_WRITE_NETDUMP_ACK = 10, ++}; ++ ++typedef struct netdump_req_s { ++ u64 magic; ++ u32 nr; ++ u32 command; ++ u32 from; ++ u32 to; ++} req_t; ++ ++enum netdump_replies { ++ REPLY_NONE = 0, ++ REPLY_ERROR = 1, ++ REPLY_LOG = 2, ++ REPLY_MEM = 3, ++ REPLY_RESERVED = 4, ++ REPLY_HELLO = 5, ++ REPLY_NR_PAGES = 6, ++ REPLY_PAGE_SIZE = 7, ++ REPLY_START_NETDUMP = 8, ++ REPLY_END_NETDUMP = 9, ++ REPLY_REGS = 10, ++ REPLY_MAGIC = 11, ++ REPLY_START_WRITE_NETDUMP = 12, ++}; ++ ++typedef struct netdump_reply_s { ++ u32 nr; ++ u32 code; ++ u32 info; ++} reply_t; ++ ++#define HEADER_LEN (1 + sizeof(reply_t)) ++ ++ +Index: linux-2.6.10/include/asm-parisc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-parisc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-parisc/kerntypes.h 2005-04-05 16:47:53.870216200 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-parisc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PA-RISC-specific header files */ ++#ifndef _PARISC_KERNTYPES_H ++#define _PARISC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PARISC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-h8300/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-h8300/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-h8300/kerntypes.h 2005-04-05 16:47:53.880214680 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-h8300/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* H8300-specific header files */ ++#ifndef _H8300_KERNTYPES_H ++#define _H8300_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _H8300_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc/kerntypes.h 2005-04-05 16:47:53.882214376 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ppc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. 
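
The req_t / reply_t pair above is the entire wire format of the netdump protocol: the client sends a command from netdump_commands, the kernel answers with a code from netdump_replies, and HEADER_LEN reserves one leading byte plus the packed reply for every outgoing frame. A minimal sketch of the reply side, assuming only the definitions above (the real driver additionally manages sequence windows and appends payload after the header):

/* Sketch: answer a COMM_HELLO request (hypothetical helper). */
static void netdump_fill_hello(const req_t *req, reply_t *reply)
{
	reply->nr   = req->nr;			/* echo the sequence number */
	reply->code = REPLY_HELLO;
	reply->info = NETCONSOLE_VERSION;	/* protocol version 0x03 */
}
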
Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PowerPC-specific header files */ ++#ifndef _PPC_KERNTYPES_H ++#define _PPC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PPC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-alpha/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-alpha/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-alpha/kerntypes.h 2005-04-05 16:47:53.876215288 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-alpha/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Alpha-specific header files */ ++#ifndef _ALPHA_KERNTYPES_H ++#define _ALPHA_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ALPHA_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-arm26/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-arm26/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-arm26/kerntypes.h 2005-04-05 16:47:53.865216960 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-arm26/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* ARM26-specific header files */ ++#ifndef _ARM26_KERNTYPES_H ++#define _ARM26_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ARM26_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sh/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sh/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sh/kerntypes.h 2005-04-05 16:47:53.877215136 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sh/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */
++
++/* Super-H-specific header files */
++#ifndef _SH_KERNTYPES_H
++#define _SH_KERNTYPES_H
++
++/* Use the default */
++#include
++
++#endif /* _SH_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ia64/nmi.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/nmi.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/nmi.h	2005-04-05 16:47:53.883214224 +0800
+@@ -0,0 +1,28 @@
++/*
++ *  linux/include/asm-ia64/nmi.h
++ */
++#ifndef ASM_NMI_H
++#define ASM_NMI_H
++
++#include
++
++struct pt_regs;
++
++typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
++
++/**
++ * set_nmi_callback
++ *
++ * Set a handler for an NMI. Only one handler may be
++ * set. Return 1 if the NMI was handled.
++ */
++void set_nmi_callback(nmi_callback_t callback);
++
++/**
++ * unset_nmi_callback
++ *
++ * Remove the handler previously set.
++ */
++void unset_nmi_callback(void);
++
++#endif /* ASM_NMI_H */
+Index: linux-2.6.10/include/asm-ia64/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/dump.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/dump.h	2005-04-05 16:47:53.884214072 +0800
+@@ -0,0 +1,201 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER	0xdeaddeadULL	/* magic number */
++#define DUMP_ASM_VERSION_NUMBER	0x4	/* version number */
++
++#ifdef __KERNEL__
++#include
++#include
++#include
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern int (*dump_ipi_function_ptr)(struct pt_regs *);
++extern void dump_send_ipi(void);
++#else /* !CONFIG_SMP */
++#define dump_send_ipi() do { } while(0)
++#endif
++
++#else /* !__KERNEL__ */
++/* necessary header files */
++#include	/* for pt_regs */
++#include
++#endif /* __KERNEL__ */
++
++/*
++ * mkswap.c calls getpagesize() to get the system page size,
++ * which is not necessarily the same as the hardware page size.
++ *
++ * For ia64 the kernel PAGE_SIZE can be configured from 4KB ... 16KB.
++ *
++ * The physical memory is laid out in hardware/minimal pages.
++ * This is the size we need to use for dumping physical pages.
++ *
++ * Note the hardware/minimal page size being used in:
++ *	arch/ia64/kernel/efi.c`efi_memmap_walk():
++ *		curr.end = curr.start + (md->num_pages << 12);
++ *
++ * Since the system page size could differ between the kernel we boot
++ * on and the kernel that caused the core dump, we may want to use
++ * something more constant, like the maximum system page size (see
++ * include/asm-ia64/page.h).
++ */
++/* IA64 manages the stack in a different manner from other architectures:
++ * the task_struct lies at the bottom of the stack.
++ */ ++#undef STACK_START_POSITION ++#define STACK_START_POSITION(tsk) (tsk) ++#define DUMP_MIN_PAGE_SHIFT 12 ++#define DUMP_MIN_PAGE_SIZE (1UL << DUMP_MIN_PAGE_SHIFT) ++#define DUMP_MIN_PAGE_MASK (~(DUMP_MIN_PAGE_SIZE - 1)) ++#define DUMP_MIN_PAGE_ALIGN(addr) (((addr) + DUMP_MIN_PAGE_SIZE - 1) & DUMP_MIN_PAGE_MASK) ++ ++#define DUMP_MAX_PAGE_SHIFT 16 ++#define DUMP_MAX_PAGE_SIZE (1UL << DUMP_MAX_PAGE_SHIFT) ++#define DUMP_MAX_PAGE_MASK (~(DUMP_MAX_PAGE_SIZE - 1)) ++#define DUMP_MAX_PAGE_ALIGN(addr) (((addr) + DUMP_MAX_PAGE_SIZE - 1) & DUMP_MAX_PAGE_MASK) ++ ++#define DUMP_EF_PAGE_SHIFT DUMP_MIN_PAGE_SHIFT ++ ++extern int _end,_start; ++ ++/* ++ * Structure: dump_header_asm_t ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. ++ */ ++/*typedef struct _dump_header_asm {*/ ++ ++typedef struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* pointer to pt_regs, (OLD: (struct pt_regs *, NEW: (uint64_t)) */ ++ uint64_t dha_pt_regs; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* the rnat register saved after flushrs */ ++ uint64_t dha_rnat; ++ ++ /* the pfs register saved after flushrs */ ++ uint64_t dha_pfs; ++ ++ /* the bspstore register saved after flushrs */ ++ uint64_t dha_bspstore; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ uint32_t dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++ ++} __attribute__((packed)) dump_header_asm_t; ++ ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ /* ++ * REMIND: Looking at functions/Macros like: ++ * DO_SAVE_SWITCH_STACK ++ * ia64_switch_to() ++ * ia64_save_extra() ++ * switch_to() ++ * to implement this new feature that Matt seem to have added ++ * to panic.c; seems all platforms are now expected to provide ++ * this function to dump the current registers into the pt_regs ++ * structure. 
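
The DUMP_MIN/MAX page macros above are the usual round-up-and-mask idiom; since DUMP_MIN_PAGE_SIZE is 4KB, an address one byte past a page boundary rounds up to the next page. A standalone check, in plain user-space C, duplicating the macros purely for illustration:

#include <assert.h>

#define MIN_PAGE_SIZE        (1UL << 12)
#define MIN_PAGE_MASK        (~(MIN_PAGE_SIZE - 1))
#define MIN_PAGE_ALIGN(addr) (((addr) + MIN_PAGE_SIZE - 1) & MIN_PAGE_MASK)

int main(void)
{
	assert(MIN_PAGE_ALIGN(0x1000UL) == 0x1000UL);	/* already aligned */
	assert(MIN_PAGE_ALIGN(0x1001UL) == 0x2000UL);	/* rounds up */
	assert((0x1fffUL & MIN_PAGE_MASK) == 0x1000UL);	/* mask truncates */
	return 0;
}
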
++ */ ++ volatile unsigned long rsc_value;/*for storing the rsc value*/ ++ volatile unsigned long ic_value; ++ ++ __asm__ __volatile__("mov %0=b6;;":"=r"(regs->b6)); ++ __asm__ __volatile__("mov %0=b7;;":"=r"(regs->b7)); ++ ++ __asm__ __volatile__("mov %0=ar.csd;;":"=r"(regs->ar_csd)); ++ __asm__ __volatile__("mov %0=ar.ssd;;":"=r"(regs->ar_ssd)); ++ __asm__ __volatile__("mov %0=psr;;":"=r"(ic_value)); ++ if(ic_value & 0x1000)/*Within an interrupt*/ ++ { ++ __asm__ __volatile__("mov %0=cr.ipsr;;":"=r"(regs->cr_ipsr)); ++ __asm__ __volatile__("mov %0=cr.iip;;":"=r"(regs->cr_iip)); ++ __asm__ __volatile__("mov %0=cr.ifs;;":"=r"(regs->cr_ifs)); ++ } ++ else ++ { ++ regs->cr_ipsr=regs->cr_iip=regs->cr_ifs=(unsigned long)-1; ++ } ++ __asm__ __volatile__("mov %0=ar.unat;;":"=r"(regs->ar_unat)); ++ __asm__ __volatile__("mov %0=ar.pfs;;":"=r"(regs->ar_pfs)); ++ __asm__ __volatile__("mov %0=ar.rsc;;":"=r"(rsc_value)); ++ regs->ar_rsc = rsc_value; ++ /*loadrs is from 16th bit to 29th bit of rsc*/ ++ regs->loadrs = rsc_value >> 16 & (unsigned long)0x3fff; ++ /*setting the rsc.mode value to 0 (rsc.mode is the last two bits of rsc)*/ ++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value & (unsigned long)(~3))); ++ __asm__ __volatile__("mov %0=ar.rnat;;":"=r"(regs->ar_rnat)); ++ __asm__ __volatile__("mov %0=ar.bspstore;;":"=r"(regs->ar_bspstore)); ++ /*copying the original value back*/ ++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value)); ++ __asm__ __volatile__("mov %0=pr;;":"=r"(regs->pr)); ++ __asm__ __volatile__("mov %0=ar.fpsr;;":"=r"(regs->ar_fpsr)); ++ __asm__ __volatile__("mov %0=ar.ccv;;":"=r"(regs->ar_ccv)); ++ ++ __asm__ __volatile__("mov %0=r2;;":"=r"(regs->r2)); ++ __asm__ __volatile__("mov %0=r3;;":"=r"(regs->r3)); ++ __asm__ __volatile__("mov %0=r8;;":"=r"(regs->r8)); ++ __asm__ __volatile__("mov %0=r9;;":"=r"(regs->r9)); ++ __asm__ __volatile__("mov %0=r10;;":"=r"(regs->r10)); ++ __asm__ __volatile__("mov %0=r11;;":"=r"(regs->r11)); ++ __asm__ __volatile__("mov %0=r12;;":"=r"(regs->r12)); ++ __asm__ __volatile__("mov %0=r13;;":"=r"(regs->r13)); ++ __asm__ __volatile__("mov %0=r14;;":"=r"(regs->r14)); ++ __asm__ __volatile__("mov %0=r15;;":"=r"(regs->r15)); ++ __asm__ __volatile__("mov %0=r16;;":"=r"(regs->r16)); ++ __asm__ __volatile__("mov %0=r17;;":"=r"(regs->r17)); ++ __asm__ __volatile__("mov %0=r18;;":"=r"(regs->r18)); ++ __asm__ __volatile__("mov %0=r19;;":"=r"(regs->r19)); ++ __asm__ __volatile__("mov %0=r20;;":"=r"(regs->r20)); ++ __asm__ __volatile__("mov %0=r21;;":"=r"(regs->r21)); ++ __asm__ __volatile__("mov %0=r22;;":"=r"(regs->r22)); ++ __asm__ __volatile__("mov %0=r23;;":"=r"(regs->r23)); ++ __asm__ __volatile__("mov %0=r24;;":"=r"(regs->r24)); ++ __asm__ __volatile__("mov %0=r25;;":"=r"(regs->r25)); ++ __asm__ __volatile__("mov %0=r26;;":"=r"(regs->r26)); ++ __asm__ __volatile__("mov %0=r27;;":"=r"(regs->r27)); ++ __asm__ __volatile__("mov %0=r28;;":"=r"(regs->r28)); ++ __asm__ __volatile__("mov %0=r29;;":"=r"(regs->r29)); ++ __asm__ __volatile__("mov %0=r30;;":"=r"(regs->r30)); ++ __asm__ __volatile__("mov %0=r31;;":"=r"(regs->r31)); ++} ++ ++/* Perhaps added to Common Arch Specific Functions and moved to dump.h some day */ ++extern void * __dump_memcpy(void *, const void *, size_t); ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-ia64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ia64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ 
linux-2.6.10/include/asm-ia64/kerntypes.h 2005-04-05 16:47:53.884214072 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ia64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* IA64-specific header files */ ++#ifndef _IA64_KERNTYPES_H ++#define _IA64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _IA64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc64/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc64/dump.h 2005-04-05 16:47:53.878214984 +0800 +@@ -0,0 +1,115 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Todd Inglett ++ * ++ * Copyright 2002 - 2004 International Business Machines ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include /* for pt_regs */ ++#include ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x5 /* version number */ ++ ++/* ++ * Structure: __dump_header_asm ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. ++ */ ++struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ int dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ unsigned long tmp1, tmp2; ++ ++ __asm__ __volatile__ ( ++ "std 0,0(%2)\n" ++ "std 1,8(%2)\n" ++ "std 2,16(%2)\n" ++ "std 3,24(%2)\n" ++ "std 4,32(%2)\n" ++ "std 5,40(%2)\n" ++ "std 6,48(%2)\n" ++ "std 7,56(%2)\n" ++ "std 8,64(%2)\n" ++ "std 9,72(%2)\n" ++ "std 10,80(%2)\n" ++ "std 11,88(%2)\n" ++ "std 12,96(%2)\n" ++ "std 13,104(%2)\n" ++ "std 14,112(%2)\n" ++ "std 15,120(%2)\n" ++ "std 16,128(%2)\n" ++ "std 17,136(%2)\n" ++ "std 18,144(%2)\n" ++ "std 19,152(%2)\n" ++ "std 20,160(%2)\n" ++ "std 21,168(%2)\n" ++ "std 22,176(%2)\n" ++ "std 23,184(%2)\n" ++ "std 24,192(%2)\n" ++ "std 25,200(%2)\n" ++ "std 26,208(%2)\n" ++ "std 27,216(%2)\n" ++ "std 28,224(%2)\n" ++ "std 29,232(%2)\n" ++ "std 30,240(%2)\n" ++ "std 31,248(%2)\n" ++ "mfmsr %0\n" ++ "std %0, 264(%2)\n" ++ "mfctr %0\n" ++ "std %0, 280(%2)\n" ++ "mflr %0\n" ++ "std %0, 288(%2)\n" ++ "bl 1f\n" ++ "1: mflr %1\n" ++ "std %1, 256(%2)\n" ++ "mtlr %0\n" ++ "mfxer %0\n" ++ "std %0, 296(%2)\n" ++ : "=&r" (tmp1), "=&r" (tmp2) ++ : "b" (regs)); ++} ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP 
++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-ppc64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc64/kerntypes.h 2005-04-05 16:47:53.879214832 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ppc64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PPC64-specific header files */ ++#ifndef _PPC64_KERNTYPES_H ++#define _PPC64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PPC64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc64/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/kmap_types.h 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/kmap_types.h 2005-04-05 16:47:53.878214984 +0800 +@@ -16,7 +16,8 @@ + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, +- KM_TYPE_NR ++ KM_TYPE_NR, ++ KM_DUMP + }; + + #endif +Index: linux-2.6.10/include/asm-ppc64/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/smp.h 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/smp.h 2005-04-05 16:47:53.877215136 +0800 +@@ -36,7 +36,7 @@ + extern void smp_send_debugger_break(int cpu); + struct pt_regs; + extern void smp_message_recv(int, struct pt_regs *); +- ++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)); + + #define smp_processor_id() (get_paca()->paca_index) + #define hard_smp_processor_id() (get_paca()->hw_cpu_id) +Index: linux-2.6.10/include/asm-cris/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-cris/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-cris/kerntypes.h 2005-04-05 16:47:53.874215592 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-cris/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
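
The ppc64 get_current_regs() above stores straight to numeric offsets inside pt_regs (gpr[0] at 0 through gpr[31] at 248, then nip at 256, msr at 264, ctr at 280, link at 288, xer at 296). Those constants silently depend on the struct layout, so a compile-time guard is cheap insurance. A sketch using the classic negative-array-size trick; the field names assume the standard ppc64 pt_regs layout of this era:

#include <linux/stddef.h>
#include <asm/ptrace.h>

/* Fail the build if pt_regs drifts from the offsets hardcoded above. */
#define DUMP_CHECK_OFF(field, off) \
	typedef char dump_off_##field[ \
		(offsetof(struct pt_regs, field) == (off)) ? 1 : -1]

DUMP_CHECK_OFF(nip,  256);
DUMP_CHECK_OFF(msr,  264);
DUMP_CHECK_OFF(ctr,  280);
DUMP_CHECK_OFF(link, 288);
DUMP_CHECK_OFF(xer,  296);
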
++ */ ++ ++/* CRIS-specific header files */ ++#ifndef _CRIS_KERNTYPES_H ++#define _CRIS_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _CRIS_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-m68knommu/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-m68knommu/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-m68knommu/kerntypes.h 2005-04-05 16:47:53.870216200 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-m68knommu/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* m68k/no-MMU-specific header files */ ++#ifndef _M68KNOMMU_KERNTYPES_H ++#define _M68KNOMMU_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _M68KNOMMU_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-v850/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-v850/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-v850/kerntypes.h 2005-04-05 16:47:53.888213464 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-v850/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* V850-specific header files */ ++#ifndef _V850_KERNTYPES_H ++#define _V850_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _V850_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-x86_64/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-x86_64/dump.h 2005-04-05 16:47:53.868216504 +0800 +@@ -0,0 +1,93 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * x86_64 lkcd port Sachin Sant ( sachinp@in.ibm.com) ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include /* for pt_regs */ ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x2 /* version number */ ++ ++ ++/* ++ * Structure: dump_header_asm_t ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. 
++ */ ++struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ int dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ unsigned seg; ++ __asm__ __volatile__("movq %%r15,%0" : "=m"(regs->r15)); ++ __asm__ __volatile__("movq %%r14,%0" : "=m"(regs->r14)); ++ __asm__ __volatile__("movq %%r13,%0" : "=m"(regs->r13)); ++ __asm__ __volatile__("movq %%r12,%0" : "=m"(regs->r12)); ++ __asm__ __volatile__("movq %%r11,%0" : "=m"(regs->r11)); ++ __asm__ __volatile__("movq %%r10,%0" : "=m"(regs->r10)); ++ __asm__ __volatile__("movq %%r9,%0" : "=m"(regs->r9)); ++ __asm__ __volatile__("movq %%r8,%0" : "=m"(regs->r8)); ++ __asm__ __volatile__("movq %%rbx,%0" : "=m"(regs->rbx)); ++ __asm__ __volatile__("movq %%rcx,%0" : "=m"(regs->rcx)); ++ __asm__ __volatile__("movq %%rdx,%0" : "=m"(regs->rdx)); ++ __asm__ __volatile__("movq %%rsi,%0" : "=m"(regs->rsi)); ++ __asm__ __volatile__("movq %%rdi,%0" : "=m"(regs->rdi)); ++ __asm__ __volatile__("movq %%rbp,%0" : "=m"(regs->rbp)); ++ __asm__ __volatile__("movq %%rax,%0" : "=m"(regs->rax)); ++ __asm__ __volatile__("movq %%rsp,%0" : "=m"(regs->rsp)); ++ __asm__ __volatile__("movl %%ss, %0" :"=r"(seg)); ++ regs->ss = (unsigned long)seg; ++ __asm__ __volatile__("movl %%cs, %0" :"=r"(seg)); ++ regs->cs = (unsigned long)seg; ++ __asm__ __volatile__("pushfq; popq %0" :"=m"(regs->eflags)); ++ regs->rip = (unsigned long)current_text_addr(); ++ ++} ++ ++extern volatile int dump_in_progress; ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP ++ ++ ++extern void dump_send_ipi(void); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-x86_64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-x86_64/kerntypes.h 2005-04-05 16:47:53.869216352 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-x86_64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
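
Note that every per-arch header in this patch opens with the same three fields (dha_magic_number, dha_version, dha_header_size), so a dump reader can decide whether to trust the rest of the header before parsing it. A sketch of that gate, written against the x86_64 layout above:

/* Sketch: validate the arch header before using anything behind it. */
static int dump_header_asm_valid(const struct __dump_header_asm *dha)
{
	if (dha->dha_magic_number != DUMP_ASM_MAGIC_NUMBER)
		return 0;	/* not a dump, or wrong endianness */
	if (dha->dha_version != DUMP_ASM_VERSION_NUMBER)
		return 0;	/* written by a different format revision */
	if (dha->dha_header_size < sizeof(struct __dump_header_asm))
		return 0;	/* truncated or foreign header */
	return 1;
}
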
++ */ ++ ++/* x86_64-specific header files */ ++#ifndef _X86_64_KERNTYPES_H ++#define _X86_64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _X86_64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-x86_64/hw_irq.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/hw_irq.h 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/hw_irq.h 2005-04-05 16:47:53.869216352 +0800 +@@ -34,7 +34,6 @@ + + #define IA32_SYSCALL_VECTOR 0x80 + +- + /* + * Vectors 0x20-0x2f are used for ISA interrupts. + */ +@@ -55,6 +54,7 @@ + #define TASK_MIGRATION_VECTOR 0xfb + #define CALL_FUNCTION_VECTOR 0xfa + #define KDB_VECTOR 0xf9 ++#define DUMP_VECTOR 0xf8 + + #define THERMAL_APIC_VECTOR 0xf0 + +Index: linux-2.6.10/include/asm-x86_64/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/kmap_types.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/kmap_types.h 2005-04-05 16:47:53.868216504 +0800 +@@ -13,7 +13,8 @@ + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, +- KM_TYPE_NR ++ KM_DUMP, ++ KM_TYPE_NR, + }; + + #endif +Index: linux-2.6.10/include/asm-x86_64/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/smp.h 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/smp.h 2005-04-05 16:47:53.867216656 +0800 +@@ -41,6 +41,7 @@ + extern int pic_mode; + extern int smp_num_siblings; + extern void smp_flush_tlb(void); ++extern void dump_send_ipi(void); + extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); + extern void smp_send_reschedule(int cpu); + extern void smp_invalidate_rcv(void); /* Process an NMI */ +Index: linux-2.6.10/include/asm-s390/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-s390/dump.h 2005-04-05 16:47:53.865216960 +0800 +@@ -0,0 +1,10 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ */ ++ ++/* Nothing to be done here, we have proper hardware support */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++#endif ++ +Index: linux-2.6.10/include/asm-s390/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-s390/kerntypes.h 2005-04-05 16:47:53.866216808 +0800 +@@ -0,0 +1,46 @@ ++/* ++ * asm-s390/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* S/390 specific header files */ ++#ifndef _S390_KERNTYPES_H ++#define _S390_KERNTYPES_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* channel subsystem driver */ ++#include "../../drivers/s390/cio/cio.h" ++#include "../../drivers/s390/cio/chsc.h" ++#include "../../drivers/s390/cio/css.h" ++#include "../../drivers/s390/cio/device.h" ++#include "../../drivers/s390/cio/qdio.h" ++ ++/* dasd device driver */ ++#include "../../drivers/s390/block/dasd_int.h" ++#include "../../drivers/s390/block/dasd_diag.h" ++#include "../../drivers/s390/block/dasd_eckd.h" ++#include "../../drivers/s390/block/dasd_fba.h" ++ ++/* networking drivers */ ++#include "../../drivers/s390/net/fsm.h" ++#include "../../drivers/s390/net/iucv.h" ++#include "../../drivers/s390/net/lcs.h" ++ ++/* zfcp device driver */ ++#include "../../drivers/s390/scsi/zfcp_def.h" ++#include "../../drivers/s390/scsi/zfcp_fsf.h" ++ ++#endif /* _S390_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sparc64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sparc64/kerntypes.h 2005-04-05 16:47:53.872215896 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sparc64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* SPARC64-specific header files */ ++#ifndef _SPARC64_KERNTYPES_H ++#define _SPARC64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _SPARC64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-mips/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-mips/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-mips/kerntypes.h 2005-04-05 16:47:53.881214528 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-mips/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* MIPS-specific header files */ ++#ifndef _MIPS_KERNTYPES_H ++#define _MIPS_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _MIPS_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-m68k/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-m68k/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-m68k/kerntypes.h 2005-04-05 16:47:53.875215440 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-m68k/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. 
Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* m68k-specific header files */ ++#ifndef _M68K_KERNTYPES_H ++#define _M68K_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _M68K_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-generic/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-generic/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-generic/kerntypes.h 2005-04-05 16:47:53.871216048 +0800 +@@ -0,0 +1,20 @@ ++/* ++ * asm-generic/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Arch-independent header files */ ++#ifndef _GENERIC_KERNTYPES_H ++#define _GENERIC_KERNTYPES_H ++ ++#include ++ ++#endif /* _GENERIC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-i386/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/dump.h 2005-04-05 16:47:53.886213768 +0800 +@@ -0,0 +1,90 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include ++#include ++#include ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */ ++ ++/* ++ * Structure: __dump_header_asm ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. 
++ */ ++struct __dump_header_asm { ++ /* the dump magic number -- unique to verify dump is valid */ ++ u64 dha_magic_number; ++ ++ /* the version number of this dump */ ++ u32 dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ u32 dha_header_size; ++ ++ /* the esp for i386 systems */ ++ u32 dha_esp; ++ ++ /* the eip for i386 systems */ ++ u32 dha_eip; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ u32 dha_smp_num_cpus; ++ u32 dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ u32 dha_smp_current_task[NR_CPUS]; ++ u32 dha_stack[NR_CPUS]; ++ u32 dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP ++extern cpumask_t irq_affinity[]; ++extern int (*dump_ipi_function_ptr)(struct pt_regs *); ++extern void dump_send_ipi(void); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); ++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); ++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); ++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); ++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); ++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); ++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); ++ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); ++ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); ++ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); ++ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); ++ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); ++ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); ++ regs->eip = (unsigned long)current_text_addr(); ++} ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-i386/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/kerntypes.h 2005-04-05 16:47:53.887213616 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-i386/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* ix86-specific header files */ ++#ifndef _I386_KERNTYPES_H ++#define _I386_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _I386_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-i386/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kmap_types.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/kmap_types.h 2005-04-05 16:47:53.886213768 +0800 +@@ -23,7 +23,8 @@ + D(10) KM_IRQ1, + D(11) KM_SOFTIRQ0, + D(12) KM_SOFTIRQ1, +-D(13) KM_TYPE_NR ++D(13) KM_DUMP, ++D(14) KM_TYPE_NR + }; + + #undef D +Index: linux-2.6.10/include/asm-i386/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/smp.h 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/smp.h 2005-04-05 16:47:53.885213920 +0800 +@@ -37,6 +37,7 @@ + extern cpumask_t cpu_sibling_map[]; + + extern void smp_flush_tlb(void); ++extern void dump_send_ipi(void); + extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); + extern void smp_invalidate_rcv(void); /* Process an NMI */ + extern void (*mtrr_hook) (void); +Index: linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/mach-default/irq_vectors.h 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h 2005-04-05 16:47:53.887213616 +0800 +@@ -48,6 +48,7 @@ + #define INVALIDATE_TLB_VECTOR 0xfd + #define RESCHEDULE_VECTOR 0xfc + #define CALL_FUNCTION_VECTOR 0xfb ++#define DUMP_VECTOR 0xfa + + #define THERMAL_APIC_VECTOR 0xf0 + /* +Index: linux-2.6.10/include/asm-arm/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-arm/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-arm/kerntypes.h 2005-04-05 16:47:53.873215744 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-arm/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* ARM-specific header files */ ++#ifndef _ARM_KERNTYPES_H ++#define _ARM_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ARM_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sparc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sparc/kerntypes.h 2005-04-05 16:47:53.874215592 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sparc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* SPARC-specific header files */ ++#ifndef _SPARC_KERNTYPES_H ++#define _SPARC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _SPARC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-mips64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-mips64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-mips64/kerntypes.h 2005-04-05 16:47:53.881214528 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-mips64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* MIPS64-specific header files */ ++#ifndef _MIPS64_KERNTYPES_H ++#define _MIPS64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _MIPS64_KERNTYPES_H */ +Index: linux-2.6.10/net/Kconfig +=================================================================== +--- linux-2.6.10.orig/net/Kconfig 2005-04-05 16:29:27.896349784 +0800 ++++ linux-2.6.10/net/Kconfig 2005-04-05 16:47:53.895212400 +0800 +@@ -632,7 +632,7 @@ + endmenu + + config NETPOLL +- def_bool NETCONSOLE ++ def_bool NETCONSOLE || CRASH_DUMP_NETDEV + + config NETPOLL_RX + bool "Netpoll support for trapping incoming packets" +Index: linux-2.6.10/scripts/mkcompile_h +=================================================================== +--- linux-2.6.10.orig/scripts/mkcompile_h 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/scripts/mkcompile_h 2005-04-05 16:47:53.950204040 +0800 +@@ -33,7 +33,7 @@ + + UTS_LEN=64 + UTS_TRUNCATE="sed -e s/\(.\{1,$UTS_LEN\}\).*/\1/" +- ++LINUX_COMPILE_VERSION_ID="__linux_compile_version_id__`hostname | tr -c '[0-9A-Za-z\n]' '__'`_`LANG=C date | tr -c '[0-9A-Za-z\n]' '_'`" + # Generate a temporary compile.h + + ( echo /\* This file is auto generated, version $VERSION \*/ +@@ -55,6 +55,8 @@ + fi + + echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | tail -n 1`\" ++ echo \#define LINUX_COMPILE_VERSION_ID $LINUX_COMPILE_VERSION_ID ++ echo \#define LINUX_COMPILE_VERSION_ID_TYPE typedef char* "$LINUX_COMPILE_VERSION_ID""_t" + ) > .tmpcompile + + # Only replace the real compile.h if the new one is different, +Index: linux-2.6.10/mm/bootmem.c +=================================================================== +--- linux-2.6.10.orig/mm/bootmem.c 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/mm/bootmem.c 2005-04-05 16:47:53.903211184 +0800 +@@ -26,6 +26,7 @@ + */ + unsigned long max_low_pfn; + unsigned long min_low_pfn; ++EXPORT_SYMBOL(min_low_pfn); + unsigned long max_pfn; + + EXPORT_SYMBOL(max_pfn); /* This is exported so +@@ -284,6 +285,7 @@ + if (j + 16 < BITS_PER_LONG) + prefetchw(page + j + 16); + __ClearPageReserved(page + j); ++ set_page_count(page + j, 1); + } + __free_pages(page, ffs(BITS_PER_LONG)-1); + i += BITS_PER_LONG; +Index: linux-2.6.10/mm/page_alloc.c +=================================================================== +--- linux-2.6.10.orig/mm/page_alloc.c 2005-04-05 16:29:28.218300840 +0800 ++++ linux-2.6.10/mm/page_alloc.c 2005-04-05 16:47:53.902211336 +0800 +@@ -47,6 +47,11 @@ + EXPORT_SYMBOL(totalram_pages); + EXPORT_SYMBOL(nr_swap_pages); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++/* This symbol has to be exported to use 
'for_each_pgdat' macro by modules. */ ++EXPORT_SYMBOL(pgdat_list); ++#endif ++ + /* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags +@@ -281,8 +286,11 @@ + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); +- for (i = 0 ; i < (1 << order) ; ++i) ++ for (i = 0 ; i < (1 << order) ; ++i){ ++ if (unlikely(i)) ++ __put_page(page + i); + free_pages_check(__FUNCTION__, page + i); ++ } + list_add(&page->lru, &list); + kernel_map_pages(page, 1<mapping || page_mapped(page) || +- (page->flags & ( +- 1 << PG_private | +- 1 << PG_locked | +- 1 << PG_lru | +- 1 << PG_active | +- 1 << PG_dirty | +- 1 << PG_reclaim | +- 1 << PG_swapcache | +- 1 << PG_writeback ))) ++ int i; ++ ++ for(i = 0; i < (1 << order); i++){ ++ struct page *page = _page + i; ++ ++ if (page->mapping || page_mapped(page) || ++ (page->flags & ( ++ 1 << PG_private | ++ 1 << PG_locked | ++ 1 << PG_lru | ++ 1 << PG_active | ++ 1 << PG_dirty | ++ 1 << PG_reclaim | ++ 1 << PG_swapcache | ++ 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + +- page->flags &= ~(1 << PG_uptodate | 1 << PG_error | +- 1 << PG_referenced | 1 << PG_arch_1 | +- 1 << PG_checked | 1 << PG_mappedtodisk); +- page->private = 0; +- set_page_refs(page, order); ++ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | ++ 1 << PG_referenced | 1 << PG_arch_1 | ++ 1 << PG_checked | 1 << PG_mappedtodisk); ++ page->private = 0; ++ set_page_count(page, 1); ++ } + } + + /* +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-04-05 16:29:30.335978904 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-04-05 16:47:53.901211488 +0800 +@@ -54,6 +54,10 @@ + #define cpu_to_node_mask(cpu) (cpu_online_map) + #endif + ++/* used to soft spin in sched while dump is in progress */ ++unsigned long dump_oncpu; ++EXPORT_SYMBOL(dump_oncpu); ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +@@ -184,109 +188,6 @@ + #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +-/* +- * These are the runqueue data structures: +- */ +- +-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +- +-typedef struct runqueue runqueue_t; +- +-struct prio_array { +- unsigned int nr_active; +- unsigned long bitmap[BITMAP_SIZE]; +- struct list_head queue[MAX_PRIO]; +-}; +- +-/* +- * This is the main, per-CPU runqueue data structure. +- * +- * Locking rule: those places that want to lock multiple runqueues +- * (such as the load balancing or the thread migration code), lock +- * acquire operations must be ordered by ascending &runqueue. +- */ +-struct runqueue { +- spinlock_t lock; +- +- /* +- * nr_running and cpu_load should be in the same cacheline because +- * remote CPUs use both these fields when doing load calculation. +- */ +- unsigned long nr_running; +-#ifdef CONFIG_SMP +- unsigned long cpu_load; +-#endif +- unsigned long long nr_switches; +- +- /* +- * This is part of a global counter where only the total sum +- * over all CPUs matters. A task can increase this counter on +- * one CPU and if it got migrated afterwards it may decrease +- * it on another CPU. 
Always updated under the runqueue lock: +- */ +- unsigned long nr_uninterruptible; +- +- unsigned long expired_timestamp; +- unsigned long long timestamp_last_tick; +- task_t *curr, *idle; +- struct mm_struct *prev_mm; +- prio_array_t *active, *expired, arrays[2]; +- int best_expired_prio; +- atomic_t nr_iowait; +- +-#ifdef CONFIG_SMP +- struct sched_domain *sd; +- +- /* For active balancing */ +- int active_balance; +- int push_cpu; +- +- task_t *migration_thread; +- struct list_head migration_queue; +-#endif +- +-#ifdef CONFIG_SCHEDSTATS +- /* latency stats */ +- struct sched_info rq_sched_info; +- +- /* sys_sched_yield() stats */ +- unsigned long yld_exp_empty; +- unsigned long yld_act_empty; +- unsigned long yld_both_empty; +- unsigned long yld_cnt; +- +- /* schedule() stats */ +- unsigned long sched_noswitch; +- unsigned long sched_switch; +- unsigned long sched_cnt; +- unsigned long sched_goidle; +- +- /* pull_task() stats */ +- unsigned long pt_gained[MAX_IDLE_TYPES]; +- unsigned long pt_lost[MAX_IDLE_TYPES]; +- +- /* active_load_balance() stats */ +- unsigned long alb_cnt; +- unsigned long alb_lost; +- unsigned long alb_gained; +- unsigned long alb_failed; +- +- /* try_to_wake_up() stats */ +- unsigned long ttwu_cnt; +- unsigned long ttwu_attempts; +- unsigned long ttwu_moved; +- +- /* wake_up_new_task() stats */ +- unsigned long wunt_cnt; +- unsigned long wunt_moved; +- +- /* sched_migrate_task() stats */ +- unsigned long smt_cnt; +- +- /* sched_balance_exec() stats */ +- unsigned long sbe_cnt; +-#endif +-}; + + static DEFINE_PER_CPU(struct runqueue, runqueues); + +@@ -2535,6 +2436,15 @@ + unsigned long run_time; + int cpu, idx; + ++ /* ++ * If crash dump is in progress, this other cpu's ++ * need to wait until it completes. ++ * NB: this code is optimized away for kernels without ++ * dumping enabled. ++ */ ++ if (unlikely(dump_oncpu)) ++ goto dump_scheduling_disabled; ++ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. +@@ -2698,6 +2608,16 @@ + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; ++ ++ return; ++ ++ dump_scheduling_disabled: ++ /* allow scheduling only if this is the dumping cpu */ ++ if (dump_oncpu != smp_processor_id()+1) { ++ while (dump_oncpu) ++ cpu_relax(); ++ } ++ return; + } + + EXPORT_SYMBOL(schedule); +Index: linux-2.6.10/kernel/panic.c +=================================================================== +--- linux-2.6.10.orig/kernel/panic.c 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/kernel/panic.c 2005-04-05 16:47:53.898211944 +0800 +@@ -18,12 +18,17 @@ + #include + #include + #include ++#ifdef CONFIG_KEXEC ++#include ++#endif + + int panic_timeout; + int panic_on_oops; + int tainted; ++void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0; + + EXPORT_SYMBOL(panic_timeout); ++EXPORT_SYMBOL(dump_function_ptr); + + struct notifier_block *panic_notifier_list; + +@@ -71,11 +76,12 @@ + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + bust_spinlocks(0); + ++ notifier_call_chain(&panic_notifier_list, 0, buf); ++ + #ifdef CONFIG_SMP + smp_send_stop(); + #endif + +- notifier_call_chain(&panic_notifier_list, 0, buf); + + if (!panic_blink) + panic_blink = no_blink; +@@ -87,6 +93,18 @@ + * We can't use the "normal" timers since we just panicked.. 
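
The schedule() changes above only make the other CPUs spin; the convention implied by the test dump_oncpu != smp_processor_id()+1 is that the dumping CPU stores its own id plus one for the duration of the dump, so that CPU 0 remains distinguishable from the idle value 0. A sketch of the bracketing the dump core would perform, inferred from that check:

#include <linux/smp.h>

extern unsigned long dump_oncpu;

static inline void dump_freeze_scheduling(void)
{
	dump_oncpu = smp_processor_id() + 1;	/* 0 means no dump running */
}

static inline void dump_thaw_scheduling(void)
{
	dump_oncpu = 0;		/* releases CPUs spinning in schedule() */
}
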
+ */
+	printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
++#ifdef CONFIG_KEXEC
++{
++	struct kimage *image;
++	image = xchg(&kexec_image, 0);
++	if (image) {
++		printk(KERN_EMERG "by starting a new kernel ..\n");
++		mdelay(panic_timeout*1000);
++		machine_kexec(image);
++	}
++}
++#endif
++
+	for (i = 0; i < panic_timeout*1000; ) {
+		touch_nmi_watchdog();
+		i += panic_blink(i);
+Index: linux-2.6.10/drivers/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c	2005-04-05 16:29:30.310982704 +0800
++++ linux-2.6.10/drivers/block/ll_rw_blk.c	2005-04-05 16:47:53.949204192 +0800
+@@ -28,6 +28,7 @@
+ #include
+ #include
+ #include
++#include
+ 
+ /*
+  * for max sense size
+@@ -2628,7 +2629,8 @@
+ 	sector_t maxsector;
+ 	int ret, nr_sectors = bio_sectors(bio);
+ 
+-	might_sleep();
++	if (likely(!dump_oncpu))
++		might_sleep();
+ 	/* Test device or partition size, when known. */
+ 	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+ 	if (maxsector) {
+Index: linux-2.6.10/drivers/dump/dump_i386.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_i386.c	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_i386.c	2005-04-05 16:47:53.940205560 +0800
+@@ -0,0 +1,372 @@
++/*
++ * Architecture specific (i386) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com)
++ * Copyright 2000 TurboLinux, Inc.  All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file.  Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include "dump_methods.h"
++#include
++
++#include
++#include
++#include
++#include
++
++static __s32	saved_irq_count;	/* saved preempt_count() flags */
++
++static int
++alloc_dha_stack(void)
++{
++	int i;
++	void *ptr;
++
++	if (dump_header_asm.dha_stack[0])
++		return 0;
++
++	ptr = vmalloc(THREAD_SIZE * num_online_cpus());
++	if (!ptr) {
++		printk("vmalloc for dha_stacks failed\n");
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < num_online_cpus(); i++) {
++		dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr +
++				(i * THREAD_SIZE));
++	}
++	return 0;
++}
++
++static int
++free_dha_stack(void)
++{
++	if (dump_header_asm.dha_stack[0]) {
++		vfree((void *)dump_header_asm.dha_stack[0]);
++		dump_header_asm.dha_stack[0] = 0;
++	}
++	return 0;
++}
++
++
++void
++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs)
++{
++	*dest_regs = *regs;
++
++	/* In the case of panic dumps, we collect regs on entry to panic,
++	 * so we shouldn't 'fix' ss/esp here again.  But it is hard to
++	 * tell just looking at regs whether ss/esp need fixing.  We make
++	 * this decision by looking at xss in regs.  If we have better
++	 * means to determine that ss/esp are valid (by some flag which
++	 * tells that we are here due to panic dump), then we can use
++	 * that instead of this kludge.
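
The panic() hunk above does two things for dumping: it exports dump_function_ptr as the hook a dump module fills in at load time, and it runs the panic notifier chain before smp_send_stop() so that the dump code still sees the other CPUs alive. A sketch of the module side, assuming dump_execute() has the matching signature (it is invoked as dump_execute("manual", &regs) later in this patch):

#include <linux/init.h>
#include <asm/ptrace.h>

extern void (*dump_function_ptr)(const char *, const struct pt_regs *);
extern void dump_execute(const char *panic_str, const struct pt_regs *regs);

static int __init dump_hook_init(void)
{
	dump_function_ptr = dump_execute;	/* arm the panic-path hook */
	return 0;
}

static void __exit dump_hook_exit(void)
{
	dump_function_ptr = NULL;		/* disarm before unload */
}
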
++	 */
++	if (!user_mode(regs)) {
++		if ((0xffff & regs->xss) == __KERNEL_DS)
++			/* already fixed up */
++			return;
++		dest_regs->esp = (unsigned long)&(regs->esp);
++		__asm__ __volatile__ ("movw %%ss, %%ax;"
++			:"=a"(dest_regs->xss));
++	}
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++	struct task_struct *tsk)
++{
++	dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++	__dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
++
++	/* take a snapshot of the stack */
++	/* doing this enables us to tolerate slight drifts on this cpu */
++
++	if (dump_header_asm.dha_stack[cpu]) {
++		memcpy((void *)dump_header_asm.dha_stack[cpu],
++				STACK_START_POSITION(tsk),
++				THREAD_SIZE);
++	}
++	dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info);
++}
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern irq_desc_t irq_desc[];
++extern void dump_send_ipi(void);
++
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++static cpumask_t saved_affinity[NR_IRQS];
++
++extern void stop_this_cpu(void *);	/* exported by i386 kernel */
++
++static int
++dump_nmi_callback(struct pt_regs *regs, int cpu)
++{
++	if (!dump_expect_ipi[cpu])
++		return 0;
++
++	dump_expect_ipi[cpu] = 0;
++
++	dump_save_this_cpu(regs);
++	atomic_dec(&waiting_for_dump_ipi);
++
++ level_changed:
++	switch (dump_silence_level) {
++	case DUMP_HARD_SPIN_CPUS:	/* Spin until dump is complete */
++		while (dump_oncpu) {
++			barrier();	/* paranoia */
++			if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++				goto level_changed;
++
++			cpu_relax();	/* kill time nicely */
++		}
++		break;
++
++	case DUMP_HALT_CPUS:		/* Execute halt */
++		stop_this_cpu(NULL);
++		break;
++
++	case DUMP_SOFT_SPIN_CPUS:
++		/* Mark the task so it spins in schedule */
++		set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++		break;
++	}
++
++	return 1;
++}
++
++/* save registers on other processors */
++void
++__dump_save_other_cpus(void)
++{
++	int i, cpu = smp_processor_id();
++	int other_cpus = num_online_cpus()-1;
++
++	if (other_cpus > 0) {
++		atomic_set(&waiting_for_dump_ipi, other_cpus);
++
++		for (i = 0; i < NR_CPUS; i++) {
++			dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++		}
++
++		/* short circuit normal NMI handling temporarily */
++		set_nmi_callback(dump_nmi_callback);
++		wmb();
++
++		dump_send_ipi();
++		/* Maybe we don't need to wait for the NMI to be processed:
++		   just write out the header at the end of dumping; if this
++		   IPI is not processed by then, there probably is a problem
++		   and we just fail to capture the state of the other cpus. */
++		while(atomic_read(&waiting_for_dump_ipi) > 0) {
++			cpu_relax();
++		}
++
++		unset_nmi_callback();
++	}
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++	int i;
++	cpumask_t cpu = CPU_MASK_NONE;
++
++	cpu_set(smp_processor_id(), cpu);
++	memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		irq_affinity[i] = cpu;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++	}
++}
++
++/*
++ * Restore old irq affinities.
++ */ ++static void ++reset_irq_affinity(void) ++{ ++ int i; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]); ++ } ++} ++ ++#else /* !CONFIG_SMP */ ++#define set_irq_affinity() do { } while (0) ++#define reset_irq_affinity() do { } while (0) ++#define save_other_cpu_states() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. ++ * ++ * This is not quite legal of course, as it has the side ++ * effect of making all interrupts & softirqs triggered ++ * while dump is in progress complete before currently ++ * pending softirqs and the currently executing interrupt ++ * code. ++ */ ++static inline void ++irq_bh_save(void) ++{ ++ saved_irq_count = irq_count(); ++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); ++} ++ ++static inline void ++irq_bh_restore(void) ++{ ++ preempt_count() |= saved_irq_count; ++} ++ ++/* ++ * Name: __dump_irq_enable ++ * Func: Reset system so interrupts are enabled. ++ * This is used for dump methods that require interrupts ++ * Eventually, all methods will have interrupts disabled ++ * and this code can be removed. ++ * ++ * Change irq affinities ++ * Re-enable interrupts ++ */ ++int ++__dump_irq_enable(void) ++{ ++ set_irq_affinity(); ++ irq_bh_save(); ++ local_irq_enable(); ++ return 0; ++} ++ ++/* ++ * Name: __dump_irq_restore ++ * Func: Resume the system state in an architecture-specific way. ++ ++ */ ++void ++__dump_irq_restore(void) ++{ ++ local_irq_disable(); ++ reset_irq_affinity(); ++ irq_bh_restore(); ++} ++ ++/* ++ * Name: __dump_configure_header() ++ * Func: Meant to fill in arch specific header fields except per-cpu state ++ * already captured via __dump_save_context for all CPUs. ++ */ ++int ++__dump_configure_header(const struct pt_regs *regs) ++{ ++ return (0); ++} ++ ++/* ++ * Name: __dump_init() ++ * Func: Initialize the dumping routine process. ++ */ ++void ++__dump_init(uint64_t local_memory_start) ++{ ++ return; ++} ++ ++/* ++ * Name: __dump_open() ++ * Func: Open the dump device (architecture specific). ++ */ ++void ++__dump_open(void) ++{ ++ alloc_dha_stack(); ++} ++ ++/* ++ * Name: __dump_cleanup() ++ * Func: Free any architecture specific data structures. This is called ++ * when the dump module is being removed. ++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++} ++ ++extern int pfn_is_ram(unsigned long); ++ ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return pfn_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. 
++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_ia64.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_ia64.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_ia64.c 2005-04-05 16:47:53.928207384 +0800 +@@ -0,0 +1,458 @@ ++/* ++ * Architecture specific (ia64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * Contributions from SGI, IBM, and others. ++ * ++ * 2.4 kernel modifications by: Matt D. Robinson (yakker@alacritech.com) ++ * ia64 kernel modifications by: Piet Delaney (piet@www.piet.net) ++ * ++ * Copyright (C) 2001 - 2002 Matt D. Robinson (yakker@alacritech.com) ++ * Copyright (C) 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static __s32 saved_irq_count; /* saved preempt_count() flags */ ++ ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ { ++ return 0; ++ } ++ ptr = vmalloc(THREAD_SIZE * num_online_cpus()); ++ if (!ptr) { ++ printk("vmalloc for dha_stacks failed\n"); ++ return -ENOMEM; ++ } ++ bzero(ptr,THREAD_SIZE ); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) ++ { ++ vfree((void*)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++ ++/* a structure to get arguments into the following callback routine */ ++struct unw_args { ++ int cpu; ++ struct task_struct *tsk; ++}; ++ ++static void ++do_save_sw(struct unw_frame_info *info, void *arg) ++{ ++ struct unw_args *uwargs = (struct unw_args *)arg; ++ int cpu = uwargs->cpu; ++ struct task_struct *tsk = uwargs->tsk; ++ ++ dump_header_asm.dha_stack_ptr[cpu] = (uint64_t)info->sw; ++ ++ if (tsk && dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ struct unw_args uwargs; ++ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ ++ if (regs) { ++ dump_header_asm.dha_smp_regs[cpu] = *regs; ++ } ++ ++ /* save a snapshot of the stack in a nice state for unwinding */ ++ uwargs.cpu = cpu; ++ uwargs.tsk = tsk; ++ ++ unw_init_running(do_save_sw, (void *)&uwargs); ++} ++ ++#ifdef CONFIG_SMP ++ ++extern cpumask_t irq_affinity[]; ++#define irq_desc _irq_desc ++extern irq_desc_t irq_desc[]; ++extern void dump_send_ipi(void); ++static cpumask_t saved_affinity[NR_IRQS]; ++ ++/* ++ * Routine to save the old irq affinities and change affinities of all irqs to ++ * the dumping cpu. 
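++ * (The point of the rerouting: while the other cpus are quiesced for the
++ * dump, the dumping cpu must still see the disk or network interrupts
++ * that complete dump i/o, so every irq is pointed at it for the duration;
++ * reset_irq_affinity() below puts the saved masks back afterwards.)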
++ */ ++static void ++set_irq_affinity(void) ++{ ++ int i; ++ cpumask_t cpu = CPU_MASK_NONE; ++ ++ cpu_set(smp_processor_id(), cpu); ++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ irq_affinity[i] = cpu; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, irq_affinity[i]); ++ } ++} ++ ++/* ++ * Restore old irq affinities. ++ */ ++static void ++reset_irq_affinity(void) ++{ ++ int i; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]); ++ } ++} ++ ++#else /* !CONFIG_SMP */ ++#define set_irq_affinity() do { } while (0) ++#define reset_irq_affinity() do { } while (0) ++#define save_other_cpu_states() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++#ifdef CONFIG_SMP ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++static int wait_for_dump_ipi = 2000; /* wait 2000 ms for ipi to be handled */ ++extern void (*dump_trace_ptr)(struct pt_regs *); ++ ++ ++extern void stop_this_cpu(void); ++ ++static int ++dump_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ if (!dump_expect_ipi[cpu]) ++ return 0; ++ ++ dump_expect_ipi[cpu] = 0; ++ ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++ level_changed: ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++int IPI_handler(struct pt_regs *regs) ++{ ++ int cpu; ++ cpu = task_cpu(current); ++ return(dump_nmi_callback(regs, cpu)); ++} ++ ++/* save registers on other processors */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus()-1; ++ int wait_time = wait_for_dump_ipi; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ } ++ ++ dump_ipi_function_ptr = IPI_handler; ++ ++ wmb(); ++ ++ dump_send_ipi(); ++ /* may be we dont need to wait for IPI to be processed. ++ * just write out the header at the end of dumping, if ++ * this IPI is not processed until then, there probably ++ * is a problem and we just fail to capture state of ++ * other cpus. */ ++ while(wait_time-- && (atomic_read(&waiting_for_dump_ipi) > 0)) { ++ barrier(); ++ mdelay(1); ++ } ++ if (wait_time <= 0) { ++ printk("dump ipi timeout, proceeding...\n"); ++ } ++ } ++} ++#endif ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. 
++ *
++ * This is not quite legal of course, as it has the side
++ * effect of making all interrupts & softirqs triggered
++ * while dump is in progress complete before currently
++ * pending softirqs and the currently executing interrupt
++ * code.
++ */
++static inline void
++irq_bh_save(void)
++{
++	saved_irq_count = irq_count();
++	preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++	preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++	return (0);
++}
++
++
++#define dim(x) (sizeof(x)/sizeof(*(x)))
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ *	This is used for dump methods that require interrupts
++ *	Eventually, all methods will have interrupts disabled
++ *	and this code can be removed.
++ *
++ *	Change irq affinities
++ *	Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++	set_irq_affinity();
++	irq_bh_save();
++	ia64_srlz_d();
++	/*
++	 * reduce the task priority level
++	 * to get disk interrupts
++	 */
++	ia64_setreg(_IA64_REG_CR_TPR, 0);
++	ia64_srlz_d();
++	local_irq_enable();
++	return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++
++ */
++void
++__dump_irq_restore(void)
++{
++	local_irq_disable();
++	reset_irq_affinity();
++	irq_bh_restore();
++}
++
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++	if (!pfn_valid(index))
++	{
++		return 0;
++	}
++	return 1;
++}
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process. This is in case
++ *	it's necessary in the future.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++	return;
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ *	case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++	alloc_dha_stack();
++	return;
++}
++
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ *	when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++	free_dha_stack();
++
++	return;
++}
++
++
++
++int __dump_memcpy_mc_expected = 0;	/* Doesn't help yet */
++
++/*
++ * An ia64 version of memcpy() that tries to avoid machine checks.
++ *
++ * NB:
++ * By itself __dump_memcpy_mc_expected isn't providing any
++ * protection against Machine Checks. We are looking into the
++ * possibility of adding code to the arch/ia64/kernel/mca.c function
++ * ia64_mca_ucmc_handler() to restore state so that an IA64_MCA_CORRECTED
++ * can be returned to the firmware. Currently it always returns
++ * IA64_MCA_COLD_BOOT and reboots the machine.
++ */
++/*
++void * __dump_memcpy(void * dest, const void *src, size_t count)
++{
++	void *vp;
++
++	if (__dump_memcpy_mc_expected) {
++		ia64_pal_mc_expected((u64) 1, 0);
++	}
++
++	vp = memcpy(dest, src, count);
++
++	if (__dump_memcpy_mc_expected) {
++		ia64_pal_mc_expected((u64) 0, 0);
++	}
++	return(vp);
++}
++*/
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command.
Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ unsigned long saved_tpr; ++ unsigned long TPR_MASK = 0xFFFFFFFFFFFEFF0F; ++ ++ ++ /* Get the processors task priority register */ ++ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); ++ /* clear the mmi and mic bit's of the TPR to unmask interrupts */ ++ saved_tpr = saved_tpr & TPR_MASK; ++ ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); ++ ia64_srlz_d(); ++ ++ /* Tell the processor we're done with the interrupt ++ * that got us here. ++ */ ++ ++ ia64_eoi(); ++ ++ /* local implementation of irq_exit(); */ ++ preempt_count() -= IRQ_EXIT_OFFSET; ++ preempt_enable_no_resched(); ++ ++ return; ++} ++ +Index: linux-2.6.10/drivers/dump/dump_rle.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_rle.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_rle.c 2005-04-05 16:47:53.935206320 +0800 +@@ -0,0 +1,176 @@ ++/* ++ * RLE Compression functions for kernel crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Copyright 2001 Matt D. Robinson. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* header files */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Name: dump_compress_rle() ++ * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more ++ * reasonable, if possible. This is the same routine we use in IRIX. ++ */ ++static u32 ++dump_compress_rle(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ u16 ri, wi, count = 0; ++ u_char value = 0, cur_byte; ++ ++ /* ++ * If the block should happen to "compress" to larger than the ++ * buffer size, allocate a larger one and change cur_buf_size. 
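++ *
++ * Output format, as produced below: a run of identical bytes is escaped
++ * with a zero marker and emitted as the triple <0, count, value>, where
++ * count is the run length minus one -- e.g. seven 0x41 bytes become the
++ * three bytes 0x00 0x06 0x41.  Because 0x00 doubles as the escape marker,
++ * literal zeros get special treatment (<0, 1, 0> for a pair, a doubled
++ * byte for a single zero), and a pair of equal nonzero bytes is cheaper
++ * emitted literally than as a triple.  Any other lone byte passes through
++ * unchanged; a block that would grow instead of shrink is handed back
++ * uncompressed (the early "return oldsize" exits).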
++ */ ++ ++ wi = ri = 0; ++ ++ while (ri < oldsize) { ++ if (!ri) { ++ cur_byte = value = old[ri]; ++ count = 0; ++ } else { ++ if (count == 255) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ value = cur_byte = old[ri]; ++ count = 0; ++ } else { ++ if ((cur_byte = old[ri]) == value) { ++ count++; ++ } else { ++ if (count > 1) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ } else if (count == 1) { ++ if (value == 0) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = 1; ++ new[wi++] = 0; ++ } else { ++ if (wi + 2 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ new[wi++] = value; ++ } ++ } else { /* count == 0 */ ++ if (value == 0) { ++ if (wi + 2 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ new[wi++] = value; ++ } else { ++ if (wi + 1 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ } ++ } /* if count > 1 */ ++ ++ value = cur_byte; ++ count = 0; ++ ++ } /* if byte == value */ ++ ++ } /* if count == 255 */ ++ ++ } /* if ri == 0 */ ++ ri++; ++ ++ } ++ if (count > 1) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ } else if (count == 1) { ++ if (value == 0) { ++ if (wi + 3 > oldsize) ++ return oldsize; ++ new[wi++] = 0; ++ new[wi++] = 1; ++ new[wi++] = 0; ++ } else { ++ if (wi + 2 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ new[wi++] = value; ++ } ++ } else { /* count == 0 */ ++ if (value == 0) { ++ if (wi + 2 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ new[wi++] = value; ++ } else { ++ if (wi + 1 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ } ++ } /* if count > 1 */ ++ ++ value = cur_byte; ++ count = 0; ++ return wi; ++} ++ ++/* setup the rle compression functionality */ ++static struct __dump_compress dump_rle_compression = { ++ .compress_type = DUMP_COMPRESS_RLE, ++ .compress_func = dump_compress_rle, ++ .compress_name = "RLE", ++}; ++ ++/* ++ * Name: dump_compress_rle_init() ++ * Func: Initialize rle compression for dumping. ++ */ ++static int __init ++dump_compress_rle_init(void) ++{ ++ dump_register_compression(&dump_rle_compression); ++ return 0; ++} ++ ++/* ++ * Name: dump_compress_rle_cleanup() ++ * Func: Remove rle compression for dumping. ++ */ ++static void __exit ++dump_compress_rle_cleanup(void) ++{ ++ dump_unregister_compression(DUMP_COMPRESS_RLE); ++} ++ ++/* module initialization */ ++module_init(dump_compress_rle_init); ++module_exit(dump_compress_rle_cleanup); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("RLE compression module for crash dump driver"); +Index: linux-2.6.10/drivers/dump/dump_execute.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_execute.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_execute.c 2005-04-05 16:47:53.943205104 +0800 +@@ -0,0 +1,144 @@ ++/* ++ * The file has the common/generic dump execution code ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote high level dump execute code to make use ++ * of dump method interfaces. ++ * ++ * Derived from original code in dump_base.c created by ++ * Matt Robinson ) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. 
++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * Assumes dumper and dump config settings are in place ++ * (invokes corresponding dumper specific routines as applicable) ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++struct notifier_block *dump_notifier_list; /* dump started/ended callback */ ++ ++extern int panic_timeout; ++ ++/* Dump progress indicator */ ++void ++dump_speedo(int i) ++{ ++ static const char twiddle[4] = { '|', '\\', '-', '/' }; ++ printk("%c\b", twiddle[i&3]); ++} ++ ++/* Make the device ready and write out the header */ ++int dump_begin(void) ++{ ++ int err = 0; ++ ++ /* dump_dev = dump_config.dumper->dev; */ ++ dumper_reset(); ++ if ((err = dump_dev_silence())) { ++ /* quiesce failed, can't risk continuing */ ++ /* Todo/Future: switch to alternate dump scheme if possible */ ++ printk("dump silence dev failed ! error %d\n", err); ++ return err; ++ } ++ ++ pr_debug("Writing dump header\n"); ++ if ((err = dump_update_header())) { ++ printk("dump update header failed ! error %d\n", err); ++ dump_dev_resume(); ++ return err; ++ } ++ ++ dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE; ++ ++ return 0; ++} ++ ++/* ++ * Write the dump terminator, a final header update and let go of ++ * exclusive use of the device for dump. ++ */ ++int dump_complete(void) ++{ ++ int ret = 0; ++ ++ if (dump_config.level != DUMP_LEVEL_HEADER) { ++ if ((ret = dump_update_end_marker())) { ++ printk("dump update end marker error %d\n", ret); ++ } ++ if ((ret = dump_update_header())) { ++ printk("dump update header error %d\n", ret); ++ } ++ } ++ ret = dump_dev_resume(); ++ ++ if ((panic_timeout > 0) && (!(dump_config.flags & (DUMP_FLAGS_SOFTBOOT | DUMP_FLAGS_NONDISRUPT)))) { ++ mdelay(panic_timeout * 1000); ++ machine_restart(NULL); ++ } ++ ++ return ret; ++} ++ ++/* Saves all dump data */ ++int dump_execute_savedump(void) ++{ ++ int ret = 0, err = 0; ++ ++ if ((ret = dump_begin())) { ++ return ret; ++ } ++ ++ if (dump_config.level != DUMP_LEVEL_HEADER) { ++ ret = dump_sequencer(); ++ } ++ if ((err = dump_complete())) { ++ printk("Dump complete failed. Error %d\n", err); ++ } ++ ++ return ret; ++} ++ ++extern void dump_calc_bootmap_pages(void); ++ ++/* Does all the real work: Capture and save state */ ++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs) ++{ ++ int ret = 0; ++ ++#ifdef CONFIG_DISCONTIGMEM ++ printk(KERN_INFO "Reconfiguring memory bank information....\n"); ++ printk(KERN_INFO "This may take a while....\n"); ++ dump_reconfigure_mbanks(); ++#endif ++ ++ if ((ret = dump_configure_header(panic_str, regs))) { ++ printk("dump config header failed ! 
error %d\n", ret); ++ return ret; ++ } ++ ++ dump_calc_bootmap_pages(); ++ /* tell interested parties that a dump is about to start */ ++ notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, ++ &dump_config.dump_device); ++ ++ if (dump_config.level != DUMP_LEVEL_NONE) ++ ret = dump_execute_savedump(); ++ ++ pr_debug("dumped %ld blocks of %d bytes each\n", ++ dump_config.dumper->count, DUMP_BUFFER_SIZE); ++ ++ /* tell interested parties that a dump has completed */ ++ notifier_call_chain(&dump_notifier_list, DUMP_END, ++ &dump_config.dump_device); ++ ++ return ret; ++} +Index: linux-2.6.10/drivers/dump/dump_netdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_netdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_netdev.c 2005-04-05 16:47:53.936206168 +0800 +@@ -0,0 +1,566 @@ ++/* ++ * Implements the dump driver interface for saving a dump via network ++ * interface. ++ * ++ * Some of this code has been taken/adapted from Ingo Molnar's netconsole ++ * code. LKCD team expresses its thanks to Ingo. ++ * ++ * Started: June 2002 - Mohamed Abbas ++ * Adapted netconsole code to implement LKCD dump over the network. ++ * ++ * Nov 2002 - Bharata B. Rao ++ * Innumerable code cleanups, simplification and some fixes. ++ * Netdump configuration done by ioctl instead of using module parameters. ++ * Oct 2003 - Prasanna S Panchamukhi ++ * Netdump code modified to use Netpoll API's. ++ * ++ * Copyright (C) 2001 Ingo Molnar ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static int startup_handshake; ++static int page_counter; ++static unsigned long flags_global; ++static int netdump_in_progress; ++ ++/* ++ * security depends on the trusted path between the netconsole ++ * server and netconsole client, since none of the packets are ++ * encrypted. The random magic number protects the protocol ++ * against spoofing. ++ */ ++static u64 dump_magic; ++ ++/* ++ * We maintain a small pool of fully-sized skbs, ++ * to make sure the message gets out even in ++ * extreme OOM situations. ++ */ ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int size); ++int new_req = 0; ++static req_t req; ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int size) ++{ ++ req_t * __req = (req_t *) msg; ++ /* ++ * First check if were are dumping or doing startup handshake, if ++ * not quickly return. ++ */ ++ ++ if (!netdump_in_progress) ++ return ; ++ ++ if ((ntohl(__req->command) != COMM_GET_MAGIC) && ++ (ntohl(__req->command) != COMM_HELLO) && ++ (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) && ++ (ntohl(__req->command) != COMM_START_NETDUMP_ACK) && ++ (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0)) ++ goto out; ++ ++ req.magic = ntohl(__req->magic); ++ req.command = ntohl(__req->command); ++ req.from = ntohl(__req->from); ++ req.to = ntohl(__req->to); ++ req.nr = ntohl(__req->nr); ++ new_req = 1; ++out: ++ return ; ++} ++static char netdump_membuf[1024 + HEADER_LEN + 1]; ++/* ++ * Fill the netdump_membuf with the header information from reply_t structure ++ * and send it down to netpoll_send_udp() routine. 
++ */ ++static void ++netdump_send_packet(struct netpoll *np, reply_t *reply, size_t data_len) { ++ char *b; ++ ++ b = &netdump_membuf[1]; ++ netdump_membuf[0] = NETCONSOLE_VERSION; ++ put_unaligned(htonl(reply->nr), (u32 *) b); ++ put_unaligned(htonl(reply->code), (u32 *) (b + sizeof(reply->code))); ++ put_unaligned(htonl(reply->info), (u32 *) (b + sizeof(reply->code) + ++ sizeof(reply->info))); ++ netpoll_send_udp(np, netdump_membuf, data_len + HEADER_LEN); ++} ++ ++static void ++dump_send_mem(struct netpoll *np, req_t *req, const char* buff, size_t len) ++{ ++ int i; ++ ++ int nr_chunks = len/1024; ++ reply_t reply; ++ ++ reply.nr = req->nr; ++ reply.code = REPLY_MEM; ++ if ( nr_chunks <= 0) ++ nr_chunks = 1; ++ for (i = 0; i < nr_chunks; i++) { ++ unsigned int offset = i*1024; ++ reply.info = offset; ++ memcpy((netdump_membuf + HEADER_LEN), (buff + offset), 1024); ++ netdump_send_packet(np, &reply, 1024); ++ } ++} ++ ++/* ++ * This function waits for the client to acknowledge the receipt ++ * of the netdump startup reply, with the possibility of packets ++ * getting lost. We resend the startup packet if no ACK is received, ++ * after a 1 second delay. ++ * ++ * (The client can test the success of the handshake via the HELLO ++ * command, and send ACKs until we enter netdump mode.) ++ */ ++static int ++dump_handshake(struct dump_dev *net_dev) ++{ ++ reply_t reply; ++ int i, j; ++ size_t str_len; ++ ++ if (startup_handshake) { ++ sprintf((netdump_membuf + HEADER_LEN), ++ "NETDUMP start, waiting for start-ACK.\n"); ++ reply.code = REPLY_START_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ } else { ++ sprintf((netdump_membuf + HEADER_LEN), ++ "NETDUMP start, waiting for start-ACK.\n"); ++ reply.code = REPLY_START_WRITE_NETDUMP; ++ reply.nr = net_dev->curr_offset; ++ reply.info = net_dev->curr_offset; ++ } ++ str_len = strlen(netdump_membuf + HEADER_LEN); ++ ++ /* send 300 handshake packets before declaring failure */ ++ for (i = 0; i < 300; i++) { ++ netdump_send_packet(&net_dev->np, &reply, str_len); ++ ++ /* wait 1 sec */ ++ for (j = 0; j < 10000; j++) { ++ udelay(100); ++ netpoll_poll(&net_dev->np); ++ if (new_req) ++ break; ++ } ++ ++ /* ++ * if there is no new request, try sending the handshaking ++ * packet again ++ */ ++ if (!new_req) ++ continue; ++ ++ /* ++ * check if the new request is of the expected type, ++ * if so, return, else try sending the handshaking ++ * packet again ++ */ ++ if (startup_handshake) { ++ if (req.command == COMM_HELLO || req.command == ++ COMM_START_NETDUMP_ACK) { ++ return 0; ++ } else { ++ new_req = 0; ++ continue; ++ } ++ } else { ++ if (req.command == COMM_SEND_MEM) { ++ return 0; ++ } else { ++ new_req = 0; ++ continue; ++ } ++ } ++ } ++ return -1; ++} ++ ++static ssize_t ++do_netdump(struct dump_dev *net_dev, const char* buff, size_t len) ++{ ++ reply_t reply; ++ ssize_t ret = 0; ++ int repeatCounter, counter, total_loop; ++ size_t str_len; ++ ++ netdump_in_progress = 1; ++ ++ if (dump_handshake(net_dev) < 0) { ++ printk("network dump failed due to handshake failure\n"); ++ goto out; ++ } ++ ++ /* ++ * Ideally startup handshake should be done during dump configuration, ++ * i.e., in dump_net_open(). This will be done when I figure out ++ * the dependency between startup handshake, subsequent write and ++ * various commands wrt to net-server. 
++ */
++	if (startup_handshake)
++		startup_handshake = 0;
++
++	counter = 0;
++	repeatCounter = 0;
++	total_loop = 0;
++	while (1) {
++		if (!new_req) {
++			netpoll_poll(&net_dev->np);
++		}
++		if (!new_req) {
++			repeatCounter++;
++
++			if (repeatCounter > 5) {
++				counter++;
++				if (counter > 10000) {
++					if (total_loop >= 100000) {
++						printk("Timed out waiting for "
++							"the client, giving up\n");
++						goto out;
++					} else {
++						total_loop++;
++						printk("Try number %d out of "
++							"100000 before timing out\n",
++							total_loop);
++					}
++				}
++				mdelay(1);
++				repeatCounter = 0;
++			}
++			continue;
++		}
++		repeatCounter = 0;
++		counter = 0;
++		total_loop = 0;
++		new_req = 0;
++		switch (req.command) {
++		case COMM_NONE:
++			break;
++
++		case COMM_SEND_MEM:
++			dump_send_mem(&net_dev->np, &req, buff, len);
++			break;
++
++		case COMM_EXIT:
++		case COMM_START_WRITE_NETDUMP_ACK:
++			ret = len;
++			goto out;
++
++		case COMM_HELLO:
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Hello, this is netdump version " "0.%02d\n",
++				NETCONSOLE_VERSION);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			reply.code = REPLY_HELLO;
++			reply.nr = req.nr;
++			reply.info = net_dev->curr_offset;
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_PAGE_SIZE:
++			sprintf((netdump_membuf + HEADER_LEN),
++				"PAGE_SIZE: %ld\n", PAGE_SIZE);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			reply.code = REPLY_PAGE_SIZE;
++			reply.nr = req.nr;
++			reply.info = PAGE_SIZE;
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_NR_PAGES:
++			reply.code = REPLY_NR_PAGES;
++			reply.nr = req.nr;
++			reply.info = num_physpages;
++			reply.info = page_counter;
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Number of pages: %ld\n", num_physpages);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_MAGIC:
++			reply.code = REPLY_MAGIC;
++			reply.nr = req.nr;
++			reply.info = NETCONSOLE_VERSION;
++			/* the magic is raw binary data, not a format string,
++			 * so copy it out verbatim instead of sprintf()ing it */
++			memcpy((netdump_membuf + HEADER_LEN),
++				(char *)&dump_magic, sizeof(dump_magic));
++			str_len = sizeof(dump_magic);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		default:
++			reply.code = REPLY_ERROR;
++			reply.nr = req.nr;
++			reply.info = req.command;
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Got unknown command code %d!\n", req.command);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++		}
++	}
++out:
++	netdump_in_progress = 0;
++	return ret;
++}
++
++static int
++dump_validate_config(struct netpoll *np)
++{
++	if (!np->local_ip) {
++		printk("network device %s has no local address, "
++			"aborting.\n", np->name);
++		return -1;
++	}
++
++#define IP(x) ((unsigned char *)&np->local_ip)[x]
++	printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++	if (!np->local_port) {
++		printk("source_port parameter not specified, aborting.\n");
++		return -1;
++	}
++
++	if (!np->remote_ip) {
++		printk("target_ip parameter not specified, aborting.\n");
++		return -1;
++	}
++
++	np->remote_ip = ntohl(np->remote_ip);
++#define IP(x) ((unsigned char *)&np->remote_ip)[x]
++	printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++	if (!np->remote_port) {
++		printk("target_port parameter not specified, aborting.\n");
++		return -1;
++	}
++	printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x",
++		np->remote_mac[0], np->remote_mac[1], np->remote_mac[2],
++		np->remote_mac[3], np->remote_mac[4], np->remote_mac[5]);
++
++	if
((np->remote_mac[0] & np->remote_mac[1] & np->remote_mac[2] & ++ np->remote_mac[3] & np->remote_mac[4] & np->remote_mac[5]) == 255) ++ printk("(Broadcast)"); ++ printk("\n"); ++ return 0; ++} ++ ++/* ++ * Prepares the dump device so we can take a dump later. ++ * Validates the netdump configuration parameters. ++ * ++ * TODO: Network connectivity check should be done here. ++ */ ++static int ++dump_net_open(struct dump_dev *net_dev, unsigned long arg) ++{ ++ int retval = 0; ++ ++ /* get the interface name */ ++ if (copy_from_user(net_dev->np.dev_name, (void *)arg, IFNAMSIZ)) ++ return -EFAULT; ++ net_dev->np.rx_hook = rx_hook; ++ retval = netpoll_setup(&net_dev->np); ++ ++ dump_validate_config(&net_dev->np); ++ net_dev->curr_offset = 0; ++ printk("Network device %s successfully configured for dumping\n", ++ net_dev->np.dev_name); ++ return retval; ++} ++ ++/* ++ * Close the dump device and release associated resources ++ * Invoked when unconfiguring the dump device. ++ */ ++static int ++dump_net_release(struct dump_dev *net_dev) ++{ ++ netpoll_cleanup(&net_dev->np); ++ return 0; ++} ++ ++/* ++ * Prepare the dump device for use (silence any ongoing activity ++ * and quiesce state) when the system crashes. ++ */ ++static int ++dump_net_silence(struct dump_dev *net_dev) ++{ ++ netpoll_set_trap(1); ++ local_irq_save(flags_global); ++ startup_handshake = 1; ++ net_dev->curr_offset = 0; ++ printk("Dumping to network device %s on CPU %d ...\n", net_dev->np.name, ++ smp_processor_id()); ++ return 0; ++} ++ ++/* ++ * Invoked when dumping is done. This is the time to put things back ++ * (i.e. undo the effects of dump_block_silence) so the device is ++ * available for normal use. ++ */ ++static int ++dump_net_resume(struct dump_dev *net_dev) ++{ ++ int indx; ++ size_t str_len; ++ reply_t reply; ++ ++ sprintf((netdump_membuf + HEADER_LEN), "NETDUMP end.\n"); ++ str_len = strlen(netdump_membuf + HEADER_LEN); ++ for( indx = 0; indx < 6; indx++) { ++ reply.code = REPLY_END_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ netdump_send_packet(&net_dev->np, &reply, str_len); ++ } ++ printk("NETDUMP END!\n"); ++ local_irq_restore(flags_global); ++ netpoll_set_trap(0); ++ startup_handshake = 0; ++ return 0; ++} ++ ++/* ++ * Seek to the specified offset in the dump device. ++ * Makes sure this is a valid offset, otherwise returns an error. 
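++ * (In the network case there is nothing to validate against: the offset
++ * is only recorded in curr_offset and echoed back in the replies, and the
++ * client reassembles the dump by offset, so the routine below can always
++ * succeed.)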
++ */ ++static int ++dump_net_seek(struct dump_dev *net_dev, loff_t off) ++{ ++ net_dev->curr_offset = off; ++ return 0; ++} ++ ++/* ++ * ++ */ ++static int ++dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len) ++{ ++ int cnt, i, off; ++ ssize_t ret; ++ ++ cnt = len/ PAGE_SIZE; ++ ++ for (i = 0; i < cnt; i++) { ++ off = i* PAGE_SIZE; ++ ret = do_netdump(net_dev, buf+off, PAGE_SIZE); ++ if (ret <= 0) ++ return -1; ++ net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE; ++ } ++ return len; ++} ++ ++/* ++ * check if the last dump i/o is over and ready for next request ++ */ ++static int ++dump_net_ready(struct dump_dev *net_dev, void *buf) ++{ ++ return 0; ++} ++ ++/* ++ * ioctl function used for configuring network dump ++ */ ++static int ++dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg) ++{ ++ switch (cmd) { ++ case DIOSTARGETIP: ++ net_dev->np.remote_ip= arg; ++ break; ++ case DIOSTARGETPORT: ++ net_dev->np.remote_port = (u16)arg; ++ break; ++ case DIOSSOURCEPORT: ++ net_dev->np.local_port = (u16)arg; ++ break; ++ case DIOSETHADDR: ++ return copy_from_user(net_dev->np.remote_mac, (void *)arg, 6); ++ break; ++ case DIOGTARGETIP: ++ case DIOGTARGETPORT: ++ case DIOGSOURCEPORT: ++ case DIOGETHADDR: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++struct dump_dev_ops dump_netdev_ops = { ++ .open = dump_net_open, ++ .release = dump_net_release, ++ .silence = dump_net_silence, ++ .resume = dump_net_resume, ++ .seek = dump_net_seek, ++ .write = dump_net_write, ++ /* .read not implemented */ ++ .ready = dump_net_ready, ++ .ioctl = dump_net_ioctl ++}; ++ ++static struct dump_dev default_dump_netdev = { ++ .type_name = "networkdev", ++ .ops = &dump_netdev_ops, ++ .curr_offset = 0, ++ .np.name = "netdump", ++ .np.dev_name = "eth0", ++ .np.rx_hook = rx_hook, ++ .np.local_port = 6688, ++ .np.remote_port = 6688, ++ .np.remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, ++}; ++ ++static int __init ++dump_netdev_init(void) ++{ ++ default_dump_netdev.curr_offset = 0; ++ ++ if (dump_register_device(&default_dump_netdev) < 0) { ++ printk("network dump device driver registration failed\n"); ++ return -1; ++ } ++ printk("network device driver for LKCD registered\n"); ++ ++ get_random_bytes(&dump_magic, sizeof(dump_magic)); ++ return 0; ++} ++ ++static void __exit ++dump_netdev_cleanup(void) ++{ ++ dump_unregister_device(&default_dump_netdev); ++} ++ ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_netdev_init); ++module_exit(dump_netdev_cleanup); +Index: linux-2.6.10/drivers/dump/dump_x8664.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_x8664.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_x8664.c 2005-04-05 16:47:53.932206776 +0800 +@@ -0,0 +1,362 @@ ++/* ++ * Architecture specific (x86-64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) ++ * Copyright 2000 TurboLinux, Inc. All rights reserved. ++ * ++ * x86-64 port Copyright 2002 Andi Kleen, SuSE Labs ++ * x86-64 port Sachin Sant ( sachinp@in.ibm.com ) ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static __s32 saved_irq_count; /* saved preempt_count() flag */ ++ ++void (*dump_trace_ptr)(struct pt_regs *); ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ return 0; ++ ++ ptr = vmalloc(THREAD_SIZE * num_online_cpus()); ++ if (!ptr) { ++ printk("vmalloc for dha_stacks failed\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = ++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) { ++ vfree((void *)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++ ++void ++__dump_save_regs(struct pt_regs* dest_regs, const struct pt_regs* regs) ++{ ++ if (regs) ++ memcpy(dest_regs, regs, sizeof(struct pt_regs)); ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs); ++ ++ /* take a snapshot of the stack */ ++ /* doing this enables us to tolerate slight drifts on this cpu */ ++ ++ if (dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info); ++} ++ ++#ifdef CONFIG_SMP ++extern cpumask_t irq_affinity[]; ++extern irq_desc_t irq_desc[]; ++extern void dump_send_ipi(void); ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++static unsigned long saved_affinity[NR_IRQS]; ++ ++extern void stop_this_cpu(void *); ++ ++static int ++dump_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ if (!dump_expect_ipi[cpu]) { ++ return 0; ++ } ++ ++ dump_expect_ipi[cpu] = 0; ++ ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++level_changed: ++ ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(NULL); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++/* save registers on other processors */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus() - 1; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ ++ for (i = 0; i < NR_CPUS; i++) ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ ++ set_nmi_callback(dump_nmi_callback); ++ wmb(); ++ ++ dump_send_ipi(); ++ ++ /* may be we dont need to wait for NMI to be processed. 
++	   just write out the header at the end of dumping, if
++	   this IPI is not processed until then, there probably
++	   is a problem and we just fail to capture state of
++	   other cpus. */
++	while(atomic_read(&waiting_for_dump_ipi) > 0)
++		cpu_relax();
++
++	unset_nmi_callback();
++	}
++	return;
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++	int i;
++	cpumask_t cpu = CPU_MASK_NONE;
++
++	cpu_set(smp_processor_id(), cpu);
++	memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		irq_affinity[i] = cpu;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++	}
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++reset_irq_affinity(void)
++{
++	int i;
++
++	memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, saved_affinity[i]);
++	}
++}
++
++#else /* !CONFIG_SMP */
++#define set_irq_affinity()	do { } while (0)
++#define reset_irq_affinity()	do { } while (0)
++#define save_other_cpu_states()	do { } while (0)
++#endif /* !CONFIG_SMP */
++
++static inline void
++irq_bh_save(void)
++{
++	saved_irq_count = irq_count();
++	preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++	preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ *	This is used for dump methods that require interrupts
++ *	Eventually, all methods will have interrupts disabled
++ *	and this code can be removed.
++ *
++ *	Change irq affinities
++ *	Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++	set_irq_affinity();
++	irq_bh_save();
++	local_irq_enable();
++	return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ *
++ */
++void
++__dump_irq_restore(void)
++{
++	local_irq_disable();
++	reset_irq_affinity();
++	irq_bh_restore();
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++	/* Dummy function - return */
++	return (0);
++}
++
++static int notify(struct notifier_block *nb, unsigned long code, void *data)
++{
++	if (code == DIE_NMI_IPI && dump_oncpu)
++		return NOTIFY_BAD;
++	return NOTIFY_DONE;
++}
++
++static struct notifier_block dump_notifier = {
++	.notifier_call = notify,
++};
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++	notifier_chain_register(&die_chain, &dump_notifier);
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ *	case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++	alloc_dha_stack();
++	/* return */
++	return;
++}
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ *	when the dump module is being removed.
++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++ notifier_chain_unregister(&die_chain, &dump_notifier); ++ synchronize_kernel(); ++ return; ++} ++ ++extern int page_is_ram(unsigned long); ++ ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return page_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_overlay.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_overlay.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_overlay.c 2005-04-05 16:47:53.934206472 +0800 +@@ -0,0 +1,890 @@ ++/* ++ * Two-stage soft-boot based dump scheme methods (memory overlay ++ * with post soft-boot writeout) ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * ++ * This approach of saving the dump in memory and writing it ++ * out after a softboot without clearing memory is derived from the ++ * Mission Critical Linux dump implementation. Credits and a big ++ * thanks for letting the lkcd project make use of the excellent ++ * piece of work and also for helping with clarifications and ++ * tips along the way are due to: ++ * Dave Winchell (primary author of mcore) ++ * and also to ++ * Jeff Moyer ++ * Josh Huber ++ * ++ * For those familiar with the mcore implementation, the key ++ * differences/extensions here are in allowing entire memory to be ++ * saved (in compressed form) through a careful ordering scheme ++ * on both the way down as well on the way up after boot, the latter ++ * for supporting the LKCD notion of passes in which most critical ++ * data is the first to be saved to the dump device. Also the post ++ * boot writeout happens from within the kernel rather than driven ++ * from userspace. ++ * ++ * The sequence is orchestrated through the abstraction of "dumpers", ++ * one for the first stage which then sets up the dumper for the next ++ * stage, providing for a smooth and flexible reuse of the singlestage ++ * dump scheme methods and a handle to pass dump device configuration ++ * information across the soft boot. ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * Disruptive dumping using the second kernel soft-boot option ++ * for issuing dump i/o operates in 2 stages: ++ * ++ * (1) - Saves the (compressed & formatted) dump in memory using a ++ * carefully ordered overlay scheme designed to capture the ++ * entire physical memory or selective portions depending on ++ * dump config settings, ++ * - Registers the stage 2 dumper and ++ * - Issues a soft reboot w/o clearing memory. 
++ * ++ * The overlay scheme starts with a small bootstrap free area ++ * and follows a reverse ordering of passes wherein it ++ * compresses and saves data starting with the least critical ++ * areas first, thus freeing up the corresponding pages to ++ * serve as destination for subsequent data to be saved, and ++ * so on. With a good compression ratio, this makes it feasible ++ * to capture an entire physical memory dump without significantly ++ * reducing memory available during regular operation. ++ * ++ * (2) Post soft-reboot, runs through the saved memory dump and ++ * writes it out to disk, this time around, taking care to ++ * save the more critical data first (i.e. pages which figure ++ * in early passes for a regular dump). Finally issues a ++ * clean reboot. ++ * ++ * Since the data was saved in memory after selection/filtering ++ * and formatted as per the chosen output dump format, at this ++ * stage the filter and format actions are just dummy (or ++ * passthrough) actions, except for influence on ordering of ++ * passes. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KEXEC ++#include ++#include ++#include ++#endif ++#include "dump_methods.h" ++ ++extern struct list_head dumper_list_head; ++extern struct dump_memdev *dump_memdev; ++extern struct dumper dumper_stage2; ++struct dump_config_block *dump_saved_config = NULL; ++extern struct dump_blockdev *dump_blockdev; ++static struct dump_memdev *saved_dump_memdev = NULL; ++static struct dumper *saved_dumper = NULL; ++ ++#ifdef CONFIG_KEXEC ++extern int panic_timeout; ++#endif ++ ++/* For testing ++extern void dump_display_map(struct dump_memdev *); ++*/ ++ ++struct dumper *dumper_by_name(char *name) ++{ ++#ifdef LATER ++ struct dumper *dumper; ++ list_for_each_entry(dumper, &dumper_list_head, dumper_list) ++ if (!strncmp(dumper->name, name, 32)) ++ return dumper; ++ ++ /* not found */ ++ return NULL; ++#endif ++ /* Temporary proof of concept */ ++ if (!strncmp(dumper_stage2.name, name, 32)) ++ return &dumper_stage2; ++ else ++ return NULL; ++} ++ ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void dump_early_reserve_map(struct dump_memdev *); ++ ++void crashdump_reserve(void) ++{ ++ extern unsigned long crashdump_addr; ++ ++ if (crashdump_addr == 0xdeadbeef) ++ return; ++ ++ /* reserve dump config and saved dump pages */ ++ dump_saved_config = (struct dump_config_block *)crashdump_addr; ++ /* magic verification */ ++ if (dump_saved_config->magic != DUMP_MAGIC_LIVE) { ++ printk("Invalid dump magic. Ignoring dump\n"); ++ dump_saved_config = NULL; ++ return; ++ } ++ ++ printk("Dump may be available from previous boot\n"); ++ ++#ifdef CONFIG_X86_64 ++ reserve_bootmem_node(NODE_DATA(0), ++ virt_to_phys((void *)crashdump_addr), ++ PAGE_ALIGN(sizeof(struct dump_config_block))); ++#else ++ reserve_bootmem(virt_to_phys((void *)crashdump_addr), ++ PAGE_ALIGN(sizeof(struct dump_config_block))); ++#endif ++ dump_early_reserve_map(&dump_saved_config->memdev); ++ ++} ++#endif ++ ++/* ++ * Loads the dump configuration from a memory block saved across soft-boot ++ * The ops vectors need fixing up as the corresp. routines may have ++ * relocated in the new soft-booted kernel. 
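++ * The fixup is done by name: the saved dumper, scheme, filter, format and
++ * device-type names are matched against the current kernel's instances,
++ * and on a match the stale ops/selector pointers in the saved block are
++ * overwritten with the current ones; any mismatch fails with -ENOENT.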
++ */ ++int dump_load_config(struct dump_config_block *config) ++{ ++ struct dumper *dumper; ++ struct dump_data_filter *filter_table, *filter; ++ struct dump_dev *dev; ++ int i; ++ ++ if (config->magic != DUMP_MAGIC_LIVE) ++ return -ENOENT; /* not a valid config */ ++ ++ /* initialize generic config data */ ++ memcpy(&dump_config, &config->config, sizeof(dump_config)); ++ ++ /* initialize dumper state */ ++ if (!(dumper = dumper_by_name(config->dumper.name))) { ++ printk("dumper name mismatch\n"); ++ return -ENOENT; /* dumper mismatch */ ++ } ++ ++ /* verify and fixup schema */ ++ if (strncmp(dumper->scheme->name, config->scheme.name, 32)) { ++ printk("dumper scheme mismatch\n"); ++ return -ENOENT; /* mismatch */ ++ } ++ config->scheme.ops = dumper->scheme->ops; ++ config->dumper.scheme = &config->scheme; ++ ++ /* verify and fixup filter operations */ ++ filter_table = dumper->filter; ++ for (i = 0, filter = config->filter_table; ++ ((i < MAX_PASSES) && filter_table[i].selector); ++ i++, filter++) { ++ if (strncmp(filter_table[i].name, filter->name, 32)) { ++ printk("dump filter mismatch\n"); ++ return -ENOENT; /* filter name mismatch */ ++ } ++ filter->selector = filter_table[i].selector; ++ } ++ config->dumper.filter = config->filter_table; ++ ++ /* fixup format */ ++ if (strncmp(dumper->fmt->name, config->fmt.name, 32)) { ++ printk("dump format mismatch\n"); ++ return -ENOENT; /* mismatch */ ++ } ++ config->fmt.ops = dumper->fmt->ops; ++ config->dumper.fmt = &config->fmt; ++ ++ /* fixup target device */ ++ dev = (struct dump_dev *)(&config->dev[0]); ++ if (dumper->dev == NULL) { ++ pr_debug("Vanilla dumper - assume default\n"); ++ if (dump_dev == NULL) ++ return -ENODEV; ++ dumper->dev = dump_dev; ++ } ++ ++ if (strncmp(dumper->dev->type_name, dev->type_name, 32)) { ++ printk("dump dev type mismatch %s instead of %s\n", ++ dev->type_name, dumper->dev->type_name); ++ return -ENOENT; /* mismatch */ ++ } ++ dev->ops = dumper->dev->ops; ++ config->dumper.dev = dev; ++ ++ /* fixup memory device containing saved dump pages */ ++ /* assume statically init'ed dump_memdev */ ++ config->memdev.ddev.ops = dump_memdev->ddev.ops; ++ /* switch to memdev from prev boot */ ++ saved_dump_memdev = dump_memdev; /* remember current */ ++ dump_memdev = &config->memdev; ++ ++ /* Make this the current primary dumper */ ++ dump_config.dumper = &config->dumper; ++ ++ return 0; ++} ++ ++/* Saves the dump configuration in a memory block for use across a soft-boot */ ++int dump_save_config(struct dump_config_block *config) ++{ ++ printk("saving dump config settings\n"); ++ ++ /* dump config settings */ ++ memcpy(&config->config, &dump_config, sizeof(dump_config)); ++ ++ /* dumper state */ ++ memcpy(&config->dumper, dump_config.dumper, sizeof(struct dumper)); ++ memcpy(&config->scheme, dump_config.dumper->scheme, ++ sizeof(struct dump_scheme)); ++ memcpy(&config->fmt, dump_config.dumper->fmt, sizeof(struct dump_fmt)); ++ memcpy(&config->dev[0], dump_config.dumper->dev, ++ sizeof(struct dump_anydev)); ++ memcpy(&config->filter_table, dump_config.dumper->filter, ++ sizeof(struct dump_data_filter)*MAX_PASSES); ++ ++ /* handle to saved mem pages */ ++ memcpy(&config->memdev, dump_memdev, sizeof(struct dump_memdev)); ++ ++ config->magic = DUMP_MAGIC_LIVE; ++ ++ return 0; ++} ++ ++int dump_init_stage2(struct dump_config_block *saved_config) ++{ ++ int err = 0; ++ ++ pr_debug("dump_init_stage2\n"); ++ /* Check if dump from previous boot exists */ ++ if (saved_config) { ++ printk("loading dumper from previous boot \n"); 
++ /* load and configure dumper from previous boot */ ++ if ((err = dump_load_config(saved_config))) ++ return err; ++ ++ if (!dump_oncpu) { ++ if ((err = dump_configure(dump_config.dump_device))) { ++ printk("Stage 2 dump configure failed\n"); ++ return err; ++ } ++ } ++ ++ dumper_reset(); ++ dump_dev = dump_config.dumper->dev; ++ /* write out the dump */ ++ err = dump_generic_execute(NULL, NULL); ++ ++ dump_saved_config = NULL; ++ ++ if (!dump_oncpu) { ++ dump_unconfigure(); ++ } ++ ++ return err; ++ ++ } else { ++ /* no dump to write out */ ++ printk("no dumper from previous boot \n"); ++ return 0; ++ } ++} ++ ++extern void dump_mem_markpages(struct dump_memdev *); ++ ++int dump_switchover_stage(void) ++{ ++ int ret = 0; ++ ++ /* trigger stage 2 rightaway - in real life would be after soft-boot */ ++ /* dump_saved_config would be a boot param */ ++ saved_dump_memdev = dump_memdev; ++ saved_dumper = dump_config.dumper; ++ ret = dump_init_stage2(dump_saved_config); ++ dump_memdev = saved_dump_memdev; ++ dump_config.dumper = saved_dumper; ++ return ret; ++} ++ ++int dump_activate_softboot(void) ++{ ++ int err = 0; ++#ifdef CONFIG_KEXEC ++ int num_cpus_online = 0; ++ struct kimage *image; ++#endif ++ ++ /* temporary - switchover to writeout previously saved dump */ ++#ifndef CONFIG_KEXEC ++ err = dump_switchover_stage(); /* non-disruptive case */ ++ if (dump_oncpu) ++ dump_config.dumper = &dumper_stage1; /* set things back */ ++ ++ return err; ++#else ++ ++ dump_silence_level = DUMP_HALT_CPUS; ++ /* wait till we become the only cpu */ ++ /* maybe by checking for online cpus ? */ ++ ++ while((num_cpus_online = num_online_cpus()) > 1); ++ ++ /* now call into kexec */ ++ ++ image = xchg(&kexec_image, 0); ++ if (image) { ++ mdelay(panic_timeout*1000); ++ machine_kexec(image); ++ } ++ ++ ++ /* TBD/Fixme: ++ * * should we call reboot notifiers ? inappropriate for panic ? ++ * * what about device_shutdown() ? ++ * * is explicit bus master disabling needed or can we do that ++ * * through driverfs ? ++ * */ ++ return 0; ++#endif ++} ++ ++/* --- DUMP SCHEME ROUTINES --- */ ++ ++static inline int dump_buf_pending(struct dumper *dumper) ++{ ++ return (dumper->curr_buf - dumper->dump_buf); ++} ++ ++/* Invoked during stage 1 of soft-reboot based dumping */ ++int dump_overlay_sequencer(void) ++{ ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ struct dump_data_filter *filter2 = dumper_stage2.filter; ++ int pass = 0, err = 0, save = 0; ++ int (*action)(unsigned long, unsigned long); ++ ++ /* Make sure gzip compression is being used */ ++ if (dump_config.dumper->compress->compress_type != DUMP_COMPRESS_GZIP) { ++ printk(" Please set GZIP compression \n"); ++ return -EINVAL; ++ } ++ ++ /* start filling in dump data right after the header */ ++ dump_config.dumper->curr_offset = ++ PAGE_ALIGN(dump_config.dumper->header_len); ++ ++ /* Locate the last pass */ ++ for (;filter->selector; filter++, pass++); ++ ++ /* ++ * Start from the end backwards: overlay involves a reverse ++ * ordering of passes, since less critical pages are more ++ * likely to be reusable as scratch space once we are through ++ * with them. 
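++ * (Concretely: if passes 0..N are configured, the loop below walks them
++ * as N, N-1, ..., 0, so the least critical pages are compressed and saved
++ * first and the frames they occupied become scratch space for the more
++ * critical passes that follow; stage 2 then writes the passes out in the
++ * usual most-critical-first order using the offsets recorded in filter2.)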
++ */ ++ for (--pass, --filter; pass >= 0; pass--, filter--) ++ { ++ /* Assumes passes are exclusive (even across dumpers) */ ++ /* Requires care when coding the selection functions */ ++ if ((save = filter->level_mask & dump_config.level)) ++ action = dump_save_data; ++ else ++ action = dump_skip_data; ++ ++ /* Remember the offset where this pass started */ ++ /* The second stage dumper would use this */ ++ if (dump_buf_pending(dump_config.dumper) & (PAGE_SIZE - 1)) { ++ pr_debug("Starting pass %d with pending data\n", pass); ++ pr_debug("filling dummy data to page-align it\n"); ++ dump_config.dumper->curr_buf = (void *)PAGE_ALIGN( ++ (unsigned long)dump_config.dumper->curr_buf); ++ } ++ ++ filter2[pass].start[0] = dump_config.dumper->curr_offset ++ + dump_buf_pending(dump_config.dumper); ++ ++ err = dump_iterator(pass, action, filter); ++ ++ filter2[pass].end[0] = dump_config.dumper->curr_offset ++ + dump_buf_pending(dump_config.dumper); ++ filter2[pass].num_mbanks = 1; ++ ++ if (err < 0) { ++ printk("dump_overlay_seq: failure %d in pass %d\n", ++ err, pass); ++ break; ++ } ++ printk("\n %d overlay pages %s of %d each in pass %d\n", ++ err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass); ++ } ++ ++ return err; ++} ++ ++/* from dump_memdev.c */ ++extern struct page *dump_mem_lookup(struct dump_memdev *dev, unsigned long loc); ++extern struct page *dump_mem_next_page(struct dump_memdev *dev); ++ ++static inline struct page *dump_get_saved_page(loff_t loc) ++{ ++ return (dump_mem_lookup(dump_memdev, loc >> PAGE_SHIFT)); ++} ++ ++static inline struct page *dump_next_saved_page(void) ++{ ++ return (dump_mem_next_page(dump_memdev)); ++} ++ ++/* ++ * Iterates over list of saved dump pages. Invoked during second stage of ++ * soft boot dumping ++ * ++ * Observation: If additional selection is desired at this stage then ++ * a different iterator could be written which would advance ++ * to the next page header everytime instead of blindly picking up ++ * the data. In such a case loc would be interpreted differently. ++ * At this moment however a blind pass seems sufficient, cleaner and ++ * faster. ++ */ ++int dump_saved_data_iterator(int pass, int (*action)(unsigned long, ++ unsigned long), struct dump_data_filter *filter) ++{ ++ loff_t loc, end; ++ struct page *page; ++ unsigned long count = 0; ++ int i, err = 0; ++ unsigned long sz; ++ ++ for (i = 0; i < filter->num_mbanks; i++) { ++ loc = filter->start[i]; ++ end = filter->end[i]; ++ printk("pass %d, start off 0x%llx end offset 0x%llx\n", pass, ++ loc, end); ++ ++ /* loc will get treated as logical offset into stage 1 */ ++ page = dump_get_saved_page(loc); ++ ++ for (; loc < end; loc += PAGE_SIZE) { ++ dump_config.dumper->curr_loc = loc; ++ if (!page) { ++ printk("no more saved data for pass %d\n", ++ pass); ++ break; ++ } ++ sz = (loc + PAGE_SIZE > end) ? end - loc : PAGE_SIZE; ++ ++ if (page && filter->selector(pass, (unsigned long)page, ++ PAGE_SIZE)) { ++ pr_debug("mem offset 0x%llx\n", loc); ++ if ((err = action((unsigned long)page, sz))) ++ break; ++ else ++ count++; ++ /* clear the contents of page */ ++ /* fixme: consider using KM_DUMP instead */ ++ clear_highpage(page); ++ ++ } ++ page = dump_next_saved_page(); ++ } ++ } ++ ++ return err ? 
err : count;
++}
++
++static inline int dump_overlay_pages_done(struct page *page, int nr)
++{
++	int ret = 0;
++
++	for (; nr; page++, nr--) {
++		if (dump_check_and_free_page(dump_memdev, page))
++			ret++;
++	}
++	return ret;
++}
++
++int dump_overlay_save_data(unsigned long loc, unsigned long len)
++{
++	int err = 0;
++	struct page *page = (struct page *)loc;
++	static unsigned long cnt = 0;
++
++	if ((err = dump_generic_save_data(loc, len)))
++		return err;
++
++	if (dump_overlay_pages_done(page, len >> PAGE_SHIFT)) {
++		cnt++;
++		if (!(cnt & 0x7f))
++			pr_debug("released page 0x%lx\n", page_to_pfn(page));
++	}
++
++	return err;
++}
++
++int dump_overlay_skip_data(unsigned long loc, unsigned long len)
++{
++	struct page *page = (struct page *)loc;
++
++	dump_overlay_pages_done(page, len >> PAGE_SHIFT);
++	return 0;
++}
++
++int dump_overlay_resume(void)
++{
++	int err = 0;
++
++	/*
++	 * switch to the stage 2 dumper, save the dump_config_block
++	 * and then trigger a soft-boot
++	 */
++	dumper_stage2.header_len = dump_config.dumper->header_len;
++	dump_config.dumper = &dumper_stage2;
++	if ((err = dump_save_config(dump_saved_config)))
++		return err;
++
++	dump_dev = dump_config.dumper->dev;
++
++#ifdef CONFIG_KEXEC
++	/* If we are doing a disruptive dump, activate softboot now */
++	if ((panic_timeout > 0) &&
++	    (!(dump_config.flags & DUMP_FLAGS_NONDISRUPT)))
++		err = dump_activate_softboot();
++#endif
++
++	return err;
++}
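++
++/*
++ * Overlay flow at a glance (all implemented in this file):
++ *
++ *   dump_overlay_configure() - redirect dump_dev to dump_memdev, so
++ *                              the intermediate dump target is memory
++ *   dump_overlay_sequencer() - run the filter passes in reverse,
++ *                              compressing pages into the memdev and
++ *                              releasing source pages as they save
++ *   dump_overlay_resume()    - switch to dumper_stage2, save the
++ *                              config block and kexec/soft-boot
++ *   dump_init_stage2()       - in the rebooted kernel, reload the
++ *                              config and replay the saved pages out
++ *                              to the real dump device
++ */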
++
++int dump_overlay_configure(unsigned long devid)
++{
++	struct dump_dev *dev;
++	struct dump_config_block *saved_config = dump_saved_config;
++	int err = 0;
++
++	/* If there is a previously saved dump, write it out first */
++	if (saved_config) {
++		printk("Processing old dump pending writeout\n");
++		err = dump_switchover_stage();
++		if (err) {
++			printk("failed to writeout saved dump\n");
++			return err;
++		}
++		dump_free_mem(saved_config); /* testing only: not after boot */
++	}
++
++	dev = dumper_stage2.dev = dump_config.dumper->dev;
++	/* From here on the intermediate dump target is memory-only */
++	dump_dev = dump_config.dumper->dev = &dump_memdev->ddev;
++	if ((err = dump_generic_configure(0))) {
++		printk("dump generic configure failed: err %d\n", err);
++		return err;
++	}
++	/* temporary */
++	dumper_stage2.dump_buf = dump_config.dumper->dump_buf;
++
++	/* Sanity check on the actual target dump device */
++	if (!dev || (err = dev->ops->open(dev, devid))) {
++		return err;
++	}
++	/* TBD: should we release the target if this is soft-boot only ? */
++
++	/* alloc a dump config block area to save across reboot */
++	if (!(dump_saved_config = dump_alloc_mem(sizeof(struct
++		dump_config_block)))) {
++		printk("dump config block alloc failed\n");
++		/* undo configure */
++		dump_generic_unconfigure();
++		return -ENOMEM;
++	}
++	dump_config.dump_addr = (unsigned long)dump_saved_config;
++	printk("Dump config block of size %zu set up at 0x%lx\n",
++		sizeof(*dump_saved_config), (unsigned long)dump_saved_config);
++	return 0;
++}
++
++int dump_overlay_unconfigure(void)
++{
++	struct dump_dev *dev = dumper_stage2.dev;
++	int err = 0;
++
++	pr_debug("dump_overlay_unconfigure\n");
++	/* Close the secondary device */
++	dev->ops->release(dev);
++	pr_debug("released secondary device\n");
++
++	err = dump_generic_unconfigure();
++	pr_debug("Unconfigured generic portions\n");
++	dump_free_mem(dump_saved_config);
++	dump_saved_config = NULL;
++	pr_debug("Freed saved config block\n");
++	dump_dev = dump_config.dumper->dev = dumper_stage2.dev;
++
++	printk("Unconfigured overlay dumper\n");
++	return err;
++}
++
++int dump_staged_unconfigure(void)
++{
++	int err = 0;
++	struct dump_config_block *saved_config = dump_saved_config;
++	struct dump_dev *dev;
++
++	pr_debug("dump_staged_unconfigure\n");
++	err = dump_generic_unconfigure();
++
++	/* now check if there is a saved dump waiting to be written out */
++	if (saved_config) {
++		printk("Processing saved dump pending writeout\n");
++		if ((err = dump_switchover_stage())) {
++			printk("Error in committing saved dump at 0x%lx\n",
++				(unsigned long)saved_config);
++			printk("Old dump may hog memory\n");
++		} else {
++			dump_free_mem(saved_config);
++			pr_debug("Freed saved config block\n");
++		}
++		dump_saved_config = NULL;
++	} else {
++		dev = &dump_memdev->ddev;
++		dev->ops->release(dev);
++	}
++	printk("Unconfigured second stage dumper\n");
++
++	return 0;
++}
++
++/* ----- PASSTHRU FILTER ROUTINE --------- */
++
++/* transparent - passes everything through */
++int dump_passthru_filter(int pass, unsigned long loc, unsigned long sz)
++{
++	return 1;
++}
++
++/* ----- PASSTHRU FORMAT ROUTINES ---- */
++
++int dump_passthru_configure_header(const char *panic_str,
++	const struct pt_regs *regs)
++{
++	dump_config.dumper->header_dirty++;
++	return 0;
++}
++
++/* Copies bytes of data from page(s) to the specified buffer */
++int dump_copy_pages(void *buf, struct page *page, unsigned long sz)
++{
++	unsigned long len = 0, bytes;
++	void *addr;
++
++	while (len < sz) {
++		addr = kmap_atomic(page, KM_DUMP);
++		bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len;
++		memcpy(buf, addr, bytes);
++		kunmap_atomic(addr, KM_DUMP);
++		buf += bytes;
++		len += bytes;
++		page++;
++	}
++	/* memset(dump_config.dumper->curr_buf, 0x57, len); temporary */
++
++	return sz - len;
++}
++
++int dump_passthru_update_header(void)
++{
++	long len = dump_config.dumper->header_len;
++	struct page *page;
++	void *buf = dump_config.dumper->dump_buf;
++	int err = 0;
++
++	if (!dump_config.dumper->header_dirty)
++		return 0;
++
++	pr_debug("Copying header of size %ld bytes from memory\n", len);
++	if (len > DUMP_BUFFER_SIZE)
++		return -E2BIG;
++
++	page = dump_mem_lookup(dump_memdev, 0);
++	for (; (len > 0) && page; buf += PAGE_SIZE, len -= PAGE_SIZE) {
++		if ((err = dump_copy_pages(buf, page, PAGE_SIZE)))
++			return err;
++		page = dump_mem_next_page(dump_memdev);
++	}
++	if (len > 0) {
++		printk("Incomplete header saved in mem\n");
++		return -ENOENT;
++	}
++
++	if ((err = dump_dev_seek(0))) {
++		printk("Unable to seek to dump header offset\n");
++		return err;
++	}
++	err = dump_ll_write(dump_config.dumper->dump_buf,
++		buf - dump_config.dumper->dump_buf);
++	if (err < dump_config.dumper->header_len)
++		return (err < 0) ? err : -ENOSPC;
++
++	dump_config.dumper->header_dirty = 0;
++	return 0;
++}
++
++static loff_t next_dph_offset = 0;
++
++static int dph_valid(struct __dump_page *dph)
++{
++	if ((dph->dp_address & (PAGE_SIZE - 1)) || (dph->dp_flags
++		> DUMP_DH_COMPRESSED) || (!dph->dp_flags) ||
++		(dph->dp_size > PAGE_SIZE)) {
++		printk("dp_address = 0x%llx, dp_size = 0x%x, dp_flags = 0x%x\n",
++			dph->dp_address, dph->dp_size, dph->dp_flags);
++		return 0;
++	}
++	return 1;
++}
++
++int dump_verify_lcrash_data(void *buf, unsigned long sz)
++{
++	struct __dump_page *dph;
++
++	/* sanity check for page headers */
++	while (next_dph_offset + sizeof(*dph) < sz) {
++		dph = (struct __dump_page *)(buf + next_dph_offset);
++		if (!dph_valid(dph)) {
++			printk("Invalid page hdr at offset 0x%llx\n",
++				next_dph_offset);
++			return -EINVAL;
++		}
++		next_dph_offset += dph->dp_size + sizeof(*dph);
++	}
++
++	next_dph_offset -= sz;
++	return 0;
++}
++
++/*
++ * TBD/Later: Consider avoiding the copy by using a scatter/gather
++ * vector representation for the dump buffer
++ */
++int dump_passthru_add_data(unsigned long loc, unsigned long sz)
++{
++	struct page *page = (struct page *)loc;
++	void *buf = dump_config.dumper->curr_buf;
++	int err = 0;
++
++	if ((err = dump_copy_pages(buf, page, sz))) {
++		printk("dump_copy_pages failed\n");
++		return err;
++	}
++
++	if ((err = dump_verify_lcrash_data(buf, sz))) {
++		printk("dump_verify_lcrash_data failed\n");
++		printk("Invalid data for pfn 0x%lx\n", page_to_pfn(page));
++		printk("Page flags 0x%lx\n", page->flags);
++		printk("Page count 0x%x\n", page_count(page));
++		return err;
++	}
++
++	dump_config.dumper->curr_buf = buf + sz;
++
++	return 0;
++}
++
++/* Stage 1 dumper: Saves compressed dump in memory and soft-boots system */
++
++/* Scheme to overlay saved data in memory for writeout after a soft-boot */
++struct dump_scheme_ops dump_scheme_overlay_ops = {
++	.configure	= dump_overlay_configure,
++	.unconfigure	= dump_overlay_unconfigure,
++	.sequencer	= dump_overlay_sequencer,
++	.iterator	= dump_page_iterator,
++	.save_data	= dump_overlay_save_data,
++	.skip_data	= dump_overlay_skip_data,
++	.write_buffer	= dump_generic_write_buffer
++};
++
++struct dump_scheme dump_scheme_overlay = {
++	.name	= "overlay",
++	.ops	= &dump_scheme_overlay_ops
++};
++
++/* Stage 1 must use a good compression scheme - default to gzip */
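++/*
++ * Illustrative sketch, not functional patch code: dumper_stage1 below
++ * is wired to dump_none_compression, while dump_overlay_sequencer()
++ * above refuses to run unless the active compressor reports
++ * DUMP_COMPRESS_GZIP. A setup path is therefore expected to swap the
++ * gzip compressor in, roughly:
++ *
++ *	if (dump_gzip_compression.compress_type == DUMP_COMPRESS_GZIP)
++ *		dumper_stage1.compress = &dump_gzip_compression;
++ *	else
++ *		return -EINVAL;		(no gzip support built in)
++ */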
++extern struct __dump_compress dump_gzip_compression; ++ ++struct dumper dumper_stage1 = { ++ .name = "stage1", ++ .scheme = &dump_scheme_overlay, ++ .fmt = &dump_fmt_lcrash, ++ .compress = &dump_none_compression, /* needs to be gzip */ ++ .filter = dump_filter_table, ++ .dev = NULL, ++}; ++ ++/* Stage 2 dumper: Activated after softboot to write out saved dump to device */ ++ ++/* Formatter that transfers data as is (transparent) w/o further conversion */ ++struct dump_fmt_ops dump_fmt_passthru_ops = { ++ .configure_header = dump_passthru_configure_header, ++ .update_header = dump_passthru_update_header, ++ .save_context = NULL, /* unused */ ++ .add_data = dump_passthru_add_data, ++ .update_end_marker = dump_lcrash_update_end_marker ++}; ++ ++struct dump_fmt dump_fmt_passthru = { ++ .name = "passthru", ++ .ops = &dump_fmt_passthru_ops ++}; ++ ++/* Filter that simply passes along any data within the range (transparent)*/ ++/* Note: The start and end ranges in the table are filled in at run-time */ ++ ++extern int dump_filter_none(int pass, unsigned long loc, unsigned long sz); ++ ++struct dump_data_filter dump_passthru_filtertable[MAX_PASSES] = { ++{.name = "passkern", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_KERN }, ++{.name = "passuser", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_USED }, ++{.name = "passunused", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_UNUSED }, ++{.name = "none", .selector = dump_filter_none, ++ .level_mask = DUMP_MASK_REST } ++}; ++ ++ ++/* Scheme to handle data staged / preserved across a soft-boot */ ++struct dump_scheme_ops dump_scheme_staged_ops = { ++ .configure = dump_generic_configure, ++ .unconfigure = dump_staged_unconfigure, ++ .sequencer = dump_generic_sequencer, ++ .iterator = dump_saved_data_iterator, ++ .save_data = dump_generic_save_data, ++ .skip_data = dump_generic_skip_data, ++ .write_buffer = dump_generic_write_buffer ++}; ++ ++struct dump_scheme dump_scheme_staged = { ++ .name = "staged", ++ .ops = &dump_scheme_staged_ops ++}; ++ ++/* The stage 2 dumper comprising all these */ ++struct dumper dumper_stage2 = { ++ .name = "stage2", ++ .scheme = &dump_scheme_staged, ++ .fmt = &dump_fmt_passthru, ++ .compress = &dump_none_compression, ++ .filter = dump_passthru_filtertable, ++ .dev = NULL, ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_memdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_memdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_memdev.c 2005-04-05 16:47:53.947204496 +0800 +@@ -0,0 +1,655 @@ ++/* ++ * Implements the dump driver interface for saving a dump in available ++ * memory areas. The saved pages may be written out to persistent storage ++ * after a soft reboot. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ * ++ * The approach of tracking pages containing saved dump using map pages ++ * allocated as needed has been derived from the Mission Critical Linux ++ * mcore dump implementation. 
++ * ++ * Credits and a big thanks for letting the lkcd project make use of ++ * the excellent piece of work and also helping with clarifications ++ * and tips along the way are due to: ++ * Dave Winchell (primary author of mcore) ++ * Jeff Moyer ++ * Josh Huber ++ * ++ * For those familiar with the mcore code, the main differences worth ++ * noting here (besides the dump device abstraction) result from enabling ++ * "high" memory pages (pages not permanently mapped in the kernel ++ * address space) to be used for saving dump data (because of which a ++ * simple virtual address based linked list cannot be used anymore for ++ * managing free pages), an added level of indirection for faster ++ * lookups during the post-boot stage, and the idea of pages being ++ * made available as they get freed up while dump to memory progresses ++ * rather than one time before starting the dump. The last point enables ++ * a full memory snapshot to be saved starting with an initial set of ++ * bootstrap pages given a good compression ratio. (See dump_overlay.c) ++ * ++ */ ++ ++/* ++ * -----------------MEMORY LAYOUT ------------------ ++ * The memory space consists of a set of discontiguous pages, and ++ * discontiguous map pages as well, rooted in a chain of indirect ++ * map pages (also discontiguous). Except for the indirect maps ++ * (which must be preallocated in advance), the rest of the pages ++ * could be in high memory. ++ * ++ * root ++ * | --------- -------- -------- ++ * --> | . . +|--->| . +|------->| . . | indirect ++ * --|--|--- ---|---- --|-|--- maps ++ * | | | | | ++ * ------ ------ ------- ------ ------- ++ * | . | | . | | . . | | . | | . . | maps ++ * --|--- --|--- --|--|-- --|--- ---|-|-- ++ * page page page page page page page data ++ * pages ++ * ++ * Writes to the dump device happen sequentially in append mode. ++ * The main reason for the existence of the indirect map is ++ * to enable a quick way to lookup a specific logical offset in ++ * the saved data post-soft-boot, e.g. to writeout pages ++ * with more critical data first, even though such pages ++ * would have been compressed and copied last, being the lowest ++ * ranked candidates for reuse due to their criticality. 
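++ *
++ * A worked example of the lookup arithmetic (illustrative, derived
++ * from dump_mem_lookup() below; with 4K pages and 4-byte longs,
++ * DUMP_MAP_SZ is 1024 and DUMP_IND_MAP_SZ is 1023). For a logical
++ * page number loc:
++ *
++ *	index    = loc / DUMP_MAP_SZ	(direct map covering loc)
++ *	chaining : one next_indirect_map() hop per DUMP_IND_MAP_SZ
++ *		   slots, while (i + DUMP_IND_MAP_SZ < index)
++ *	map pfn  = indirect_map[index - i]
++ *	data pfn = direct_map[loc - index * DUMP_MAP_SZ]
++ *
++ * e.g. loc = 5000 gives index = 4: no chaining is needed, the direct
++ * map is the pfn at slot 4 of the root indirect page, and the data
++ * page is the pfn at entry 904 (5000 - 4096) of that direct map.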
++ * (See dump_overlay.c)
++ */
++#include
++#include
++#include
++#include
++#include "dump_methods.h"
++
++#define DUMP_MAP_SZ	(PAGE_SIZE / sizeof(unsigned long)) /* direct map size */
++#define DUMP_IND_MAP_SZ	(DUMP_MAP_SZ - 1) /* indirect map size */
++#define DUMP_NR_BOOTSTRAP	64 /* no of bootstrap pages */
++
++extern int dump_low_page(struct page *);
++
++/* check if the next entry crosses a page boundary */
++static inline int is_last_map_entry(unsigned long *map)
++{
++	unsigned long addr = (unsigned long)(map + 1);
++
++	return (!(addr & (PAGE_SIZE - 1)));
++}
++
++/* Todo: should have some validation checks */
++/* The last entry in the indirect map points to the next indirect map */
++/* Indirect maps are referred to directly by virtual address */
++static inline unsigned long *next_indirect_map(unsigned long *map)
++{
++	return (unsigned long *)map[DUMP_IND_MAP_SZ];
++}
++
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++/* Called during early bootup - fixme: make this __init */
++void dump_early_reserve_map(struct dump_memdev *dev)
++{
++	unsigned long *map1, *map2;
++	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++	int i, j;
++
++	printk("Reserve bootmap space holding previous dump of %lld pages\n",
++		last);
++	map1 = (unsigned long *)dev->indirect_map_root;
++
++	while (map1 && (off < last)) {
++#ifdef CONFIG_X86_64
++		reserve_bootmem_node(NODE_DATA(0), virt_to_phys((void *)map1),
++			PAGE_SIZE);
++#else
++		reserve_bootmem(virt_to_phys((void *)map1), PAGE_SIZE);
++#endif
++		for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++			i++, off += DUMP_MAP_SZ) {
++			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++			if (map1[i] >= max_low_pfn)
++				continue;
++#ifdef CONFIG_X86_64
++			reserve_bootmem_node(NODE_DATA(0),
++				map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#else
++			reserve_bootmem(map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#endif
++			map2 = pfn_to_kaddr(map1[i]);
++			for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
++				(off + j < last); j++) {
++				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++					map2[j]);
++				if (map2[j] < max_low_pfn) {
++#ifdef CONFIG_X86_64
++					reserve_bootmem_node(NODE_DATA(0),
++						map2[j] << PAGE_SHIFT,
++						PAGE_SIZE);
++#else
++					reserve_bootmem(map2[j] << PAGE_SHIFT,
++						PAGE_SIZE);
++#endif
++				}
++			}
++		}
++		map1 = next_indirect_map(map1);
++	}
++	dev->nr_free = 0; /* these pages don't belong to this boot */
++}
++#endif
++
++/* mark dump pages so that they aren't used by this kernel */
++void dump_mark_map(struct dump_memdev *dev)
++{
++	unsigned long *map1, *map2;
++	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++	struct page *page;
++	int i, j;
++
++	printk("Dump: marking pages in use by previous dump\n");
++	map1 = (unsigned long *)dev->indirect_map_root;
++
++	while (map1 && (off < last)) {
++		page = virt_to_page(map1);
++		set_page_count(page, 1);
++		for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++			i++, off += DUMP_MAP_SZ) {
++			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++			page = pfn_to_page(map1[i]);
++			set_page_count(page, 1);
++			map2 = kmap_atomic(page, KM_DUMP);
++			for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
++				(off + j < last); j++) {
++				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++					map2[j]);
++				page = pfn_to_page(map2[j]);
++				set_page_count(page, 1);
++			}
++			kunmap_atomic(map2, KM_DUMP); /* balance kmap above */
++		}
++		map1 = next_indirect_map(map1);
++	}
++}
++
++/*
++ * Given a logical offset into the mem device, look up the
++ * corresponding page
++ * loc is specified in units of pages
++ * Note: affects curr_map (even in the case where lookup fails)
++ */
++struct page
*dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc) ++{ ++ unsigned long *map; ++ unsigned long i, index = loc / DUMP_MAP_SZ; ++ struct page *page = NULL; ++ unsigned long curr_pfn, curr_map, *curr_map_ptr = NULL; ++ ++ map = (unsigned long *)dump_mdev->indirect_map_root; ++ if (!map) ++ return NULL; ++ if (loc > dump_mdev->last_offset >> PAGE_SHIFT) ++ return NULL; ++ ++ /* ++ * first locate the right indirect map ++ * in the chain of indirect maps ++ */ ++ for (i = 0; i + DUMP_IND_MAP_SZ < index ; i += DUMP_IND_MAP_SZ) { ++ if (!(map = next_indirect_map(map))) ++ return NULL; ++ } ++ /* then the right direct map */ ++ /* map entries are referred to by page index */ ++ if ((curr_map = map[index - i])) { ++ page = pfn_to_page(curr_map); ++ /* update the current traversal index */ ++ /* dump_mdev->curr_map = &map[index - i];*/ ++ curr_map_ptr = &map[index - i]; ++ } ++ ++ if (page) ++ map = kmap_atomic(page, KM_DUMP); ++ else ++ return NULL; ++ ++ /* and finally the right entry therein */ ++ /* data pages are referred to by page index */ ++ i = index * DUMP_MAP_SZ; ++ if ((curr_pfn = map[loc - i])) { ++ page = pfn_to_page(curr_pfn); ++ dump_mdev->curr_map = curr_map_ptr; ++ dump_mdev->curr_map_offset = loc - i; ++ dump_mdev->ddev.curr_offset = loc << PAGE_SHIFT; ++ } else { ++ page = NULL; ++ } ++ kunmap_atomic(map, KM_DUMP); ++ ++ return page; ++} ++ ++/* ++ * Retrieves a pointer to the next page in the dump device ++ * Used during the lookup pass post-soft-reboot ++ */ ++struct page *dump_mem_next_page(struct dump_memdev *dev) ++{ ++ unsigned long i; ++ unsigned long *map; ++ struct page *page = NULL; ++ ++ if (dev->ddev.curr_offset + PAGE_SIZE >= dev->last_offset) { ++ return NULL; ++ } ++ ++ if ((i = (unsigned long)(++dev->curr_map_offset)) >= DUMP_MAP_SZ) { ++ /* move to next map */ ++ if (is_last_map_entry(++dev->curr_map)) { ++ /* move to the next indirect map page */ ++ printk("dump_mem_next_page: go to next indirect map\n"); ++ dev->curr_map = (unsigned long *)*dev->curr_map; ++ if (!dev->curr_map) ++ return NULL; ++ } ++ i = dev->curr_map_offset = 0; ++ pr_debug("dump_mem_next_page: next map 0x%lx, entry 0x%lx\n", ++ dev->curr_map, *dev->curr_map); ++ ++ }; ++ ++ if (*dev->curr_map) { ++ map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP); ++ if (map[i]) ++ page = pfn_to_page(map[i]); ++ kunmap_atomic(map, KM_DUMP); ++ dev->ddev.curr_offset += PAGE_SIZE; ++ }; ++ ++ return page; ++} ++ ++/* Copied from dump_filters.c */ ++static inline int kernel_page(struct page *p) ++{ ++ /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */ ++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p)); ++} ++ ++static inline int user_page(struct page *p) ++{ ++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); ++} ++ ++int dump_reused_by_boot(struct page *page) ++{ ++ /* Todo ++ * Checks: ++ * if PageReserved ++ * if < __end + bootmem_bootmap_pages for this boot + allowance ++ * if overwritten by initrd (how to check ?) ++ * Also, add more checks in early boot code ++ * e.g. bootmem bootmap alloc verify not overwriting dump, and if ++ * so then realloc or move the dump pages out accordingly. 
++ */ ++ ++ /* Temporary proof of concept hack, avoid overwriting kern pages */ ++ ++ return (kernel_page(page) || dump_low_page(page) || user_page(page)); ++} ++ ++ ++/* Uses the free page passed in to expand available space */ ++int dump_mem_add_space(struct dump_memdev *dev, struct page *page) ++{ ++ struct page *map_page; ++ unsigned long *map; ++ unsigned long i; ++ ++ if (!dev->curr_map) ++ return -ENOMEM; /* must've exhausted indirect map */ ++ ++ if (!*dev->curr_map || dev->curr_map_offset >= DUMP_MAP_SZ) { ++ /* add map space */ ++ *dev->curr_map = page_to_pfn(page); ++ dev->curr_map_offset = 0; ++ return 0; ++ } ++ ++ /* add data space */ ++ i = dev->curr_map_offset; ++ map_page = pfn_to_page(*dev->curr_map); ++ map = (unsigned long *)kmap_atomic(map_page, KM_DUMP); ++ map[i] = page_to_pfn(page); ++ kunmap_atomic(map, KM_DUMP); ++ dev->curr_map_offset = ++i; ++ dev->last_offset += PAGE_SIZE; ++ if (i >= DUMP_MAP_SZ) { ++ /* move to next map */ ++ if (is_last_map_entry(++dev->curr_map)) { ++ /* move to the next indirect map page */ ++ pr_debug("dump_mem_add_space: using next" ++ "indirect map\n"); ++ dev->curr_map = (unsigned long *)*dev->curr_map; ++ } ++ } ++ return 0; ++} ++ ++ ++/* Caution: making a dest page invalidates existing contents of the page */ ++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page) ++{ ++ int err = 0; ++ ++ /* ++ * the page can be used as a destination only if we are sure ++ * it won't get overwritten by the soft-boot, and is not ++ * critical for us right now. ++ */ ++ if (dump_reused_by_boot(page)) ++ return 0; ++ ++ if ((err = dump_mem_add_space(dev, page))) { ++ printk("Warning: Unable to extend memdev space. Err %d\n", ++ err); ++ return 0; ++ } ++ ++ dev->nr_free++; ++ return 1; ++} ++ ++ ++/* Set up the initial maps and bootstrap space */ ++/* Must be called only after any previous dump is written out */ ++int dump_mem_open(struct dump_dev *dev, unsigned long devid) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ unsigned long nr_maps, *map, *prev_map = &dump_mdev->indirect_map_root; ++ void *addr; ++ struct page *page; ++ unsigned long i = 0; ++ int err = 0; ++ ++ /* Todo: sanity check for unwritten previous dump */ ++ ++ /* allocate pages for indirect map (non highmem area) */ ++ nr_maps = num_physpages / DUMP_MAP_SZ; /* maps to cover entire mem */ ++ for (i = 0; i < nr_maps; i += DUMP_IND_MAP_SZ) { ++ if (!(map = (unsigned long *)dump_alloc_mem(PAGE_SIZE))) { ++ printk("Unable to alloc indirect map %ld\n", ++ i / DUMP_IND_MAP_SZ); ++ return -ENOMEM; ++ } ++ clear_page(map); ++ *prev_map = (unsigned long)map; ++ prev_map = &map[DUMP_IND_MAP_SZ]; ++ }; ++ ++ dump_mdev->curr_map = (unsigned long *)dump_mdev->indirect_map_root; ++ dump_mdev->curr_map_offset = 0; ++ ++ /* ++ * allocate a few bootstrap pages: at least 1 map and 1 data page ++ * plus enough to save the dump header ++ */ ++ i = 0; ++ do { ++ if (!(addr = dump_alloc_mem(PAGE_SIZE))) { ++ printk("Unable to alloc bootstrap page %ld\n", i); ++ return -ENOMEM; ++ } ++ ++ page = virt_to_page(addr); ++ if (dump_low_page(page)) { ++ dump_free_mem(addr); ++ continue; ++ } ++ ++ if (dump_mem_add_space(dump_mdev, page)) { ++ printk("Warning: Unable to extend memdev " ++ "space. 
Err %d\n", err); ++ dump_free_mem(addr); ++ continue; ++ } ++ i++; ++ } while (i < DUMP_NR_BOOTSTRAP); ++ ++ printk("dump memdev init: %ld maps, %ld bootstrap pgs, %ld free pgs\n", ++ nr_maps, i, dump_mdev->last_offset >> PAGE_SHIFT); ++ ++ dump_mdev->last_bs_offset = dump_mdev->last_offset; ++ ++ return 0; ++} ++ ++/* Releases all pre-alloc'd pages */ ++int dump_mem_release(struct dump_dev *dev) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ struct page *page, *map_page; ++ unsigned long *map, *prev_map; ++ void *addr; ++ int i; ++ ++ if (!dump_mdev->nr_free) ++ return 0; ++ ++ pr_debug("dump_mem_release\n"); ++ page = dump_mem_lookup(dump_mdev, 0); ++ for (i = 0; page && (i < DUMP_NR_BOOTSTRAP - 1); i++) { ++ if (PageHighMem(page)) ++ break; ++ addr = page_address(page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", page); ++ break; ++ } ++ pr_debug("Freeing page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ if (dump_mdev->curr_map_offset >= DUMP_MAP_SZ - 1) { ++ map_page = pfn_to_page(*dump_mdev->curr_map); ++ if (PageHighMem(map_page)) ++ break; ++ page = dump_mem_next_page(dump_mdev); ++ addr = page_address(map_page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", ++ map_page); ++ break; ++ } ++ pr_debug("Freeing map page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ i++; ++ } else { ++ page = dump_mem_next_page(dump_mdev); ++ } ++ } ++ ++ /* now for the last used bootstrap page used as a map page */ ++ if ((i < DUMP_NR_BOOTSTRAP) && (*dump_mdev->curr_map)) { ++ map_page = pfn_to_page(*dump_mdev->curr_map); ++ if ((map_page) && !PageHighMem(map_page)) { ++ addr = page_address(map_page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", map_page); ++ } else { ++ pr_debug("Freeing map page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ i++; ++ } ++ } ++ } ++ ++ printk("Freed %d bootstrap pages\n", i); ++ ++ /* free the indirect maps */ ++ map = (unsigned long *)dump_mdev->indirect_map_root; ++ ++ i = 0; ++ while (map) { ++ prev_map = map; ++ map = next_indirect_map(map); ++ dump_free_mem(prev_map); ++ i++; ++ } ++ ++ printk("Freed %d indirect map(s)\n", i); ++ ++ /* Reset the indirect map */ ++ dump_mdev->indirect_map_root = 0; ++ dump_mdev->curr_map = 0; ++ ++ /* Reset the free list */ ++ dump_mdev->nr_free = 0; ++ ++ dump_mdev->last_offset = dump_mdev->ddev.curr_offset = 0; ++ dump_mdev->last_used_offset = 0; ++ dump_mdev->curr_map = NULL; ++ dump_mdev->curr_map_offset = 0; ++ return 0; ++} ++ ++/* ++ * Long term: ++ * It is critical for this to be very strict. Cannot afford ++ * to have anything running and accessing memory while we overwrite ++ * memory (potential risk of data corruption). ++ * If in doubt (e.g if a cpu is hung and not responding) just give ++ * up and refuse to proceed with this scheme. ++ * ++ * Note: I/O will only happen after soft-boot/switchover, so we can ++ * safely disable interrupts and force stop other CPUs if this is ++ * going to be a disruptive dump, no matter what they ++ * are in the middle of. ++ */ ++/* ++ * ATM Most of this is already taken care of in the nmi handler ++ * We may halt the cpus rightaway if we know this is going to be disruptive ++ * For now, since we've limited ourselves to overwriting free pages we ++ * aren't doing much here. 
Eventually, we'd have to wait to make sure other ++ * cpus aren't using memory we could be overwriting ++ */ ++int dump_mem_silence(struct dump_dev *dev) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ ++ if (dump_mdev->last_offset > dump_mdev->last_bs_offset) { ++ /* prefer to run lkcd config & start with a clean slate */ ++ return -EEXIST; ++ } ++ return 0; ++} ++ ++extern int dump_overlay_resume(void); ++ ++/* Trigger the next stage of dumping */ ++int dump_mem_resume(struct dump_dev *dev) ++{ ++ dump_overlay_resume(); ++ return 0; ++} ++ ++/* ++ * Allocate mem dev pages as required and copy buffer contents into it. ++ * Fails if the no free pages are available ++ * Keeping it simple and limited for starters (can modify this over time) ++ * Does not handle holes or a sparse layout ++ * Data must be in multiples of PAGE_SIZE ++ */ ++int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ struct page *page; ++ unsigned long n = 0; ++ void *addr; ++ unsigned long *saved_curr_map, saved_map_offset; ++ int ret = 0; ++ ++ pr_debug("dump_mem_write: offset 0x%llx, size %ld\n", ++ dev->curr_offset, len); ++ ++ if (dev->curr_offset + len > dump_mdev->last_offset) { ++ printk("Out of space to write\n"); ++ return -ENOSPC; ++ } ++ ++ if ((len & (PAGE_SIZE - 1)) || (dev->curr_offset & (PAGE_SIZE - 1))) ++ return -EINVAL; /* not aligned in units of page size */ ++ ++ saved_curr_map = dump_mdev->curr_map; ++ saved_map_offset = dump_mdev->curr_map_offset; ++ page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT); ++ ++ for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) { ++ addr = kmap_atomic(page, KM_DUMP); ++ /* memset(addr, 'x', PAGE_SIZE); */ ++ memcpy(addr, buf, PAGE_SIZE); ++ kunmap_atomic(addr, KM_DUMP); ++ /* dev->curr_offset += PAGE_SIZE; */ ++ page = dump_mem_next_page(dump_mdev); ++ } ++ ++ dump_mdev->curr_map = saved_curr_map; ++ dump_mdev->curr_map_offset = saved_map_offset; ++ ++ if (dump_mdev->last_used_offset < dev->curr_offset) ++ dump_mdev->last_used_offset = dev->curr_offset; ++ ++ return (len - n) ? (len - n) : ret ; ++} ++ ++/* dummy - always ready */ ++int dump_mem_ready(struct dump_dev *dev, void *buf) ++{ ++ return 0; ++} ++ ++/* ++ * Should check for availability of space to write upto the offset ++ * affects only the curr_offset; last_offset untouched ++ * Keep it simple: Only allow multiples of PAGE_SIZE for now ++ */ ++int dump_mem_seek(struct dump_dev *dev, loff_t offset) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ ++ if (offset & (PAGE_SIZE - 1)) ++ return -EINVAL; /* allow page size units only for now */ ++ ++ /* Are we exceeding available space ? 
*/ ++ if (offset > dump_mdev->last_offset) { ++ printk("dump_mem_seek failed for offset 0x%llx\n", ++ offset); ++ return -ENOSPC; ++ } ++ ++ dump_mdev->ddev.curr_offset = offset; ++ return 0; ++} ++ ++struct dump_dev_ops dump_memdev_ops = { ++ .open = dump_mem_open, ++ .release = dump_mem_release, ++ .silence = dump_mem_silence, ++ .resume = dump_mem_resume, ++ .seek = dump_mem_seek, ++ .write = dump_mem_write, ++ .read = NULL, /* not implemented at the moment */ ++ .ready = dump_mem_ready ++}; ++ ++static struct dump_memdev default_dump_memdev = { ++ .ddev = {.type_name = "memdev", .ops = &dump_memdev_ops, ++ .device_id = 0x14} ++ /* assume the rest of the fields are zeroed by default */ ++}; ++ ++/* may be overwritten if a previous dump exists */ ++struct dump_memdev *dump_memdev = &default_dump_memdev; ++ +Index: linux-2.6.10/drivers/dump/dump_blockdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_blockdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_blockdev.c 2005-04-05 16:47:53.945204800 +0800 +@@ -0,0 +1,469 @@ ++/* ++ * Implements the dump driver interface for saving a dump to ++ * a block device through the kernel's generic low level block i/o ++ * routines. ++ * ++ * Started: June 2002 - Mohamed Abbas ++ * Moved original lkcd kiobuf dump i/o code from dump_base.c ++ * to use generic dump device interfaces ++ * ++ * Sept 2002 - Bharata B. Rao ++ * Convert dump i/o to directly use bio instead of kiobuf for 2.5 ++ * ++ * Oct 2002 - Suparna Bhattacharya ++ * Rework to new dumpdev.h structures, implement open/close/ ++ * silence, misc fixes (blocknr removal, bio_add_page usage) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++extern void *dump_page_buf; ++ ++/* The end_io callback for dump i/o completion */ ++static int ++dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) ++{ ++ struct dump_blockdev *dump_bdev; ++ ++ if (bio->bi_size) { ++ /* some bytes still left to transfer */ ++ return 1; /* not complete */ ++ } ++ ++ dump_bdev = (struct dump_blockdev *)bio->bi_private; ++ if (error) { ++ printk("IO error while writing the dump, aborting\n"); ++ } ++ ++ dump_bdev->err = error; ++ ++ /* no wakeup needed, since caller polls for completion */ ++ return 0; ++} ++ ++/* Check if the dump bio is already mapped to the specified buffer */ ++static int ++dump_block_map_valid(struct dump_blockdev *dev, struct page *page, ++ int len) ++{ ++ struct bio *bio = dev->bio; ++ unsigned long bsize = 0; ++ ++ if (!bio->bi_vcnt) ++ return 0; /* first time, not mapped */ ++ ++ ++ if ((bio_page(bio) != page) || (len > bio->bi_vcnt << PAGE_SHIFT)) ++ return 0; /* buffer not mapped */ ++ ++ bsize = bdev_hardsect_size(bio->bi_bdev); ++ if ((len & (PAGE_SIZE - 1)) || (len & bsize)) ++ return 0; /* alignment checks needed */ ++ ++ /* quick check to decide if we need to redo bio_add_page */ ++ if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn) ++ return 0; /* device may have other restrictions */ ++ ++ return 1; /* already mapped */ ++} ++ ++/* ++ * Set up the dump bio for i/o from the specified buffer ++ * Return value indicates whether the full buffer could be mapped or not ++ */ ++static int ++dump_block_map(struct dump_blockdev *dev, void *buf, int len) ++{ ++ struct page *page = virt_to_page(buf); ++ struct bio *bio = dev->bio; ++ unsigned long bsize = 0; ++ ++ bio->bi_bdev = dev->bdev; ++ bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9; ++ bio->bi_idx = 0; /* reset index to the beginning */ ++ ++ if (dump_block_map_valid(dev, page, len)) { ++ /* already mapped and usable rightaway */ ++ bio->bi_size = len; /* reset size to the whole bio */ ++ bio->bi_vcnt = (len + PAGE_SIZE - 1) / PAGE_SIZE; /* Set the proper vector cnt */ ++ } else { ++ /* need to map the bio */ ++ bio->bi_size = 0; ++ bio->bi_vcnt = 0; ++ bsize = bdev_hardsect_size(bio->bi_bdev); ++ ++ /* first a few sanity checks */ ++ if (len < bsize) { ++ printk("map: len less than hardsect size \n"); ++ return -EINVAL; ++ } ++ ++ if ((unsigned long)buf & bsize) { ++ printk("map: not aligned \n"); ++ return -EINVAL; ++ } ++ ++ /* assume contig. page aligned low mem buffer( no vmalloc) */ ++ if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) { ++ printk("map: invalid buffer alignment!\n"); ++ return -EINVAL; ++ } ++ /* finally we can go ahead and map it */ ++ while (bio->bi_size < len) ++ if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0) { ++ break; ++ } ++ ++ bio->bi_end_io = dump_bio_end_io; ++ bio->bi_private = dev; ++ } ++ ++ if (bio->bi_size != len) { ++ printk("map: bio size = %d not enough for len = %d!\n", ++ bio->bi_size, len); ++ return -E2BIG; ++ } ++ return 0; ++} ++ ++static void ++dump_free_bio(struct bio *bio) ++{ ++ if (bio) ++ kfree(bio->bi_io_vec); ++ kfree(bio); ++} ++ ++/* ++ * Prepares the dump device so we can take a dump later. ++ * The caller is expected to have filled up the dev_id field in the ++ * block dump dev structure. 
++ * ++ * At dump time when dump_block_write() is invoked it will be too ++ * late to recover, so as far as possible make sure obvious errors ++ * get caught right here and reported back to the caller. ++ */ ++static int ++dump_block_open(struct dump_dev *dev, unsigned long arg) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ struct block_device *bdev; ++ int retval = 0; ++ struct bio_vec *bvec; ++ ++ /* make sure this is a valid block device */ ++ if (!arg) { ++ retval = -EINVAL; ++ goto err; ++ } ++ ++ /* Convert it to the new dev_t format */ ++ arg = MKDEV((arg >> OLDMINORBITS), (arg & OLDMINORMASK)); ++ ++ /* get a corresponding block_dev struct for this */ ++ bdev = bdget((dev_t)arg); ++ if (!bdev) { ++ retval = -ENODEV; ++ goto err; ++ } ++ ++ /* get the block device opened */ ++ if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0))) { ++ goto err1; ++ } ++ ++ if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL)) ++ == NULL) { ++ printk("Cannot allocate bio\n"); ++ retval = -ENOMEM; ++ goto err2; ++ } ++ ++ bio_init(dump_bdev->bio); ++ ++ if ((bvec = kmalloc(sizeof(struct bio_vec) * ++ (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) { ++ retval = -ENOMEM; ++ goto err3; ++ } ++ ++ /* assign the new dump dev structure */ ++ dump_bdev->dev_id = (dev_t)arg; ++ dump_bdev->bdev = bdev; ++ ++ /* make a note of the limit */ ++ dump_bdev->limit = bdev->bd_inode->i_size; ++ ++ /* now make sure we can map the dump buffer */ ++ dump_bdev->bio->bi_io_vec = bvec; ++ dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT; ++ ++ retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf, ++ DUMP_BUFFER_SIZE); ++ ++ if (retval) { ++ printk("open: dump_block_map failed, ret %d\n", retval); ++ goto err3; ++ } ++ ++ printk("Block device (%d,%d) successfully configured for dumping\n", ++ MAJOR(dump_bdev->dev_id), ++ MINOR(dump_bdev->dev_id)); ++ ++ ++ /* after opening the block device, return */ ++ return retval; ++ ++err3: dump_free_bio(dump_bdev->bio); ++ dump_bdev->bio = NULL; ++err2: if (bdev) blkdev_put(bdev); ++ goto err; ++err1: if (bdev) bdput(bdev); ++ dump_bdev->bdev = NULL; ++err: return retval; ++} ++ ++/* ++ * Close the dump device and release associated resources ++ * Invoked when unconfiguring the dump device. ++ */ ++static int ++dump_block_release(struct dump_dev *dev) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ ++ /* release earlier bdev if present */ ++ if (dump_bdev->bdev) { ++ blkdev_put(dump_bdev->bdev); ++ dump_bdev->bdev = NULL; ++ } ++ ++ dump_free_bio(dump_bdev->bio); ++ dump_bdev->bio = NULL; ++ ++ return 0; ++} ++ ++ ++/* ++ * Prepare the dump device for use (silence any ongoing activity ++ * and quiesce state) when the system crashes. 
++ */ ++static int ++dump_block_silence(struct dump_dev *dev) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ struct request_queue *q = bdev_get_queue(dump_bdev->bdev); ++ int ret; ++ ++ /* If we can't get request queue lock, refuse to take the dump */ ++ if (!spin_trylock(q->queue_lock)) ++ return -EBUSY; ++ ++ ret = elv_queue_empty(q); ++ spin_unlock(q->queue_lock); ++ ++ /* For now we assume we have the device to ourselves */ ++ /* Just a quick sanity check */ ++ if (!ret) { ++ /* Warn the user and move on */ ++ printk(KERN_ALERT "Warning: Non-empty request queue\n"); ++ printk(KERN_ALERT "I/O requests in flight at dump time\n"); ++ } ++ ++ /* ++ * Move to a softer level of silencing where no spin_lock_irqs ++ * are held on other cpus ++ */ ++ dump_silence_level = DUMP_SOFT_SPIN_CPUS; ++ ++ ret = __dump_irq_enable(); ++ if (ret) { ++ return ret; ++ } ++ ++ printk("Dumping to block device (%d,%d) on CPU %d ...\n", ++ MAJOR(dump_bdev->dev_id), MINOR(dump_bdev->dev_id), ++ smp_processor_id()); ++ ++ return 0; ++} ++ ++/* ++ * Invoked when dumping is done. This is the time to put things back ++ * (i.e. undo the effects of dump_block_silence) so the device is ++ * available for normal use. ++ */ ++static int ++dump_block_resume(struct dump_dev *dev) ++{ ++ __dump_irq_restore(); ++ return 0; ++} ++ ++ ++/* ++ * Seek to the specified offset in the dump device. ++ * Makes sure this is a valid offset, otherwise returns an error. ++ */ ++static int ++dump_block_seek(struct dump_dev *dev, loff_t off) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ loff_t offset = off + dump_bdev->start_offset; ++ ++ if (offset & ( PAGE_SIZE - 1)) { ++ printk("seek: non-page aligned\n"); ++ return -EINVAL; ++ } ++ ++ if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) { ++ printk("seek: not sector aligned \n"); ++ return -EINVAL; ++ } ++ ++ if (offset > dump_bdev->limit) { ++ printk("seek: not enough space left on device!\n"); ++ return -ENOSPC; ++ } ++ dev->curr_offset = off; ++ return 0; ++} ++ ++/* ++ * Write out a buffer after checking the device limitations, ++ * sector sizes, etc. Assumes the buffer is in directly mapped ++ * kernel address space (not vmalloc'ed). ++ * ++ * Returns: number of bytes written or -ERRNO. ++ */ ++static int ++dump_block_write(struct dump_dev *dev, void *buf, ++ unsigned long len) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ loff_t offset = dev->curr_offset + dump_bdev->start_offset; ++ int retval = -ENOSPC; ++ ++ if (offset >= dump_bdev->limit) { ++ printk("write: not enough space left on device!\n"); ++ goto out; ++ } ++ ++ /* don't write more blocks than our max limit */ ++ if (offset + len > dump_bdev->limit) ++ len = dump_bdev->limit - offset; ++ ++ ++ retval = dump_block_map(dump_bdev, buf, len); ++ if (retval){ ++ printk("write: dump_block_map failed! err %d\n", retval); ++ goto out; ++ } ++ ++ /* ++ * Write out the data to disk. ++ * Assumes the entire buffer mapped to a single bio, which we can ++ * submit and wait for io completion. In the future, may consider ++ * increasing the dump buffer size and submitting multiple bio s ++ * for better throughput. 
++ */ ++ dump_bdev->err = -EAGAIN; ++ submit_bio(WRITE, dump_bdev->bio); ++ ++ dump_bdev->ddev.curr_offset += len; ++ retval = len; ++ out: ++ return retval; ++} ++ ++/* ++ * Name: dump_block_ready() ++ * Func: check if the last dump i/o is over and ready for next request ++ */ ++static int ++dump_block_ready(struct dump_dev *dev, void *buf) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev); ++ ++ /* check for io completion */ ++ if (dump_bdev->err == -EAGAIN) { ++ q->unplug_fn(q); ++ return -EAGAIN; ++ } ++ ++ if (dump_bdev->err) { ++ printk("dump i/o err\n"); ++ return dump_bdev->err; ++ } ++ ++ return 0; ++} ++ ++ ++struct dump_dev_ops dump_blockdev_ops = { ++ .open = dump_block_open, ++ .release = dump_block_release, ++ .silence = dump_block_silence, ++ .resume = dump_block_resume, ++ .seek = dump_block_seek, ++ .write = dump_block_write, ++ /* .read not implemented */ ++ .ready = dump_block_ready ++}; ++ ++static struct dump_blockdev default_dump_blockdev = { ++ .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops, ++ .curr_offset = 0}, ++ /* ++ * leave enough room for the longest swap header possibly written ++ * written by mkswap (likely the largest page size supported by ++ * the arch ++ */ ++ .start_offset = DUMP_HEADER_OFFSET, ++ .err = 0 ++ /* assume the rest of the fields are zeroed by default */ ++}; ++ ++struct dump_blockdev *dump_blockdev = &default_dump_blockdev; ++ ++static int __init ++dump_blockdev_init(void) ++{ ++ if (dump_register_device(&dump_blockdev->ddev) < 0) { ++ printk("block device driver registration failed\n"); ++ return -1; ++ } ++ ++ printk("block device driver for LKCD registered\n"); ++ return 0; ++} ++ ++static void __exit ++dump_blockdev_cleanup(void) ++{ ++ dump_unregister_device(&dump_blockdev->ddev); ++ printk("block device driver for LKCD unregistered\n"); ++} ++ ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_blockdev_init); ++module_exit(dump_blockdev_cleanup); +Index: linux-2.6.10/drivers/dump/dump_fmt.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_fmt.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_fmt.c 2005-04-05 16:47:53.941205408 +0800 +@@ -0,0 +1,407 @@ ++/* ++ * Implements the routines which handle the format specific ++ * aspects of dump for the default dump format. ++ * ++ * Used in single stage dumping and stage 1 of soft-boot based dumping ++ * Saves data in LKCD (lcrash) format ++ * ++ * Previously a part of dump_base.c ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split off and reshuffled LKCD dump format code around generic ++ * dump method interfaces. ++ * ++ * Derived from original code created by ++ * Matt Robinson ) ++ * ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++/* ++ * SYSTEM DUMP LAYOUT ++ * ++ * System dumps are currently the combination of a dump header and a set ++ * of data pages which contain the system memory. The layout of the dump ++ * (for full dumps) is as follows: ++ * ++ * +-----------------------------+ ++ * | generic dump header | ++ * +-----------------------------+ ++ * | architecture dump header | ++ * +-----------------------------+ ++ * | page header | ++ * +-----------------------------+ ++ * | page data | ++ * +-----------------------------+ ++ * | page header | ++ * +-----------------------------+ ++ * | page data | ++ * +-----------------------------+ ++ * | | | ++ * | | | ++ * | | | ++ * | | | ++ * | V | ++ * +-----------------------------+ ++ * | PAGE_END header | ++ * +-----------------------------+ ++ * ++ * There are two dump headers, the first which is architecture ++ * independent, and the other which is architecture dependent. This ++ * allows different architectures to dump different data structures ++ * which are specific to their chipset, CPU, etc. ++ * ++ * After the dump headers come a succession of dump page headers along ++ * with dump pages. The page header contains information about the page ++ * size, any flags associated with the page (whether it's compressed or ++ * not), and the address of the page. After the page header is the page ++ * data, which is either compressed (or not). Each page of data is ++ * dumped in succession, until the final dump header (PAGE_END) is ++ * placed at the end of the dump, assuming the dump device isn't out ++ * of space. ++ * ++ * This mechanism allows for multiple compression types, different ++ * types of data structures, different page ordering, etc., etc., etc. ++ * It's a very straightforward mechanism for dumping system memory. ++ */ ++ ++struct __dump_header dump_header; /* the primary dump header */ ++struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */ ++ ++/* Replace a runtime sanity check on the DUMP_BUFFER_SIZE with a ++ * compile-time check. The compile_time_assertions routine will not ++ * compile if the assertion is false. ++ * ++ * If you fail this assert you are most likely on a large machine and ++ * should use a special 6.0.0 version of LKCD or a version > 7.0.0. See ++ * the LKCD website for more information. ++ */ ++ ++#define COMPILE_TIME_ASSERT(const_expr) \ ++ switch(0){case 0: case (const_expr):;} ++ ++static inline void compile_time_assertions(void) ++{ ++ COMPILE_TIME_ASSERT((sizeof(struct __dump_header) + ++ sizeof(struct __dump_header_asm)) <= DUMP_BUFFER_SIZE); ++} ++ ++/* ++ * Set up common header fields (mainly the arch indep section) ++ * Per-cpu state is handled by lcrash_save_context ++ * Returns the size of the header in bytes. ++ */ ++static int lcrash_init_dump_header(const char *panic_str) ++{ ++ struct timeval dh_time; ++ u64 temp_memsz = dump_header.dh_memory_size; ++ ++ /* initialize the dump headers to zero */ ++ /* save dha_stack pointer because it may contains pointer for stack! 
*/ ++ memset(&dump_header, 0, sizeof(dump_header)); ++ memset(&dump_header_asm, 0, ++ offsetof(struct __dump_header_asm, dha_stack)); ++ memset(&dump_header_asm.dha_stack+1, 0, ++ sizeof(dump_header_asm) - ++ offsetof(struct __dump_header_asm, dha_stack) - ++ sizeof(dump_header_asm.dha_stack)); ++ dump_header.dh_memory_size = temp_memsz; ++ ++ /* configure dump header values */ ++ dump_header.dh_magic_number = DUMP_MAGIC_NUMBER; ++ dump_header.dh_version = DUMP_VERSION_NUMBER; ++ dump_header.dh_memory_start = PAGE_OFFSET; ++ dump_header.dh_memory_end = DUMP_MAGIC_NUMBER; ++ dump_header.dh_header_size = sizeof(struct __dump_header); ++ dump_header.dh_page_size = PAGE_SIZE; ++ dump_header.dh_dump_level = dump_config.level; ++ dump_header.dh_current_task = (unsigned long) current; ++ dump_header.dh_dump_compress = dump_config.dumper->compress-> ++ compress_type; ++ dump_header.dh_dump_flags = dump_config.flags; ++ dump_header.dh_dump_device = dump_config.dumper->dev->device_id; ++ ++#if DUMP_DEBUG >= 6 ++ dump_header.dh_num_bytes = 0; ++#endif ++ dump_header.dh_num_dump_pages = 0; ++ do_gettimeofday(&dh_time); ++ dump_header.dh_time.tv_sec = dh_time.tv_sec; ++ dump_header.dh_time.tv_usec = dh_time.tv_usec; ++ ++ memcpy((void *)&(dump_header.dh_utsname_sysname), ++ (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_nodename), ++ (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_release), ++ (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_version), ++ (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_machine), ++ (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_domainname), ++ (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1); ++ ++ if (panic_str) { ++ memcpy((void *)&(dump_header.dh_panic_string), ++ (const void *)panic_str, DUMP_PANIC_LEN); ++ } ++ ++ dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER; ++ dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER; ++ dump_header_asm.dha_header_size = sizeof(dump_header_asm); ++#ifdef CONFIG_ARM ++ dump_header_asm.dha_physaddr_start = PHYS_OFFSET; ++#endif ++ ++ dump_header_asm.dha_smp_num_cpus = num_online_cpus(); ++ pr_debug("smp_num_cpus in header %d\n", ++ dump_header_asm.dha_smp_num_cpus); ++ ++ dump_header_asm.dha_dumping_cpu = smp_processor_id(); ++ ++ return sizeof(dump_header) + sizeof(dump_header_asm); ++} ++ ++ ++int dump_lcrash_configure_header(const char *panic_str, ++ const struct pt_regs *regs) ++{ ++ int retval = 0; ++ ++ dump_config.dumper->header_len = lcrash_init_dump_header(panic_str); ++ ++ /* capture register states for all processors */ ++ dump_save_this_cpu(regs); ++ __dump_save_other_cpus(); /* side effect:silence cpus */ ++ ++ /* configure architecture-specific dump header values */ ++ if ((retval = __dump_configure_header(regs))) ++ return retval; ++ ++ dump_config.dumper->header_dirty++; ++ return 0; ++} ++/* save register and task context */ ++void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ /* This level of abstraction might be redundantly redundant */ ++ __dump_save_context(cpu, regs, tsk); ++} ++ ++/* write out the header */ ++int dump_write_header(void) ++{ ++ int retval = 0, size; ++ void *buf = dump_config.dumper->dump_buf; ++ ++ /* accounts for DUMP_HEADER_OFFSET if 
applicable */
++	if ((retval = dump_dev_seek(0))) {
++		printk("Unable to seek to dump header offset: %d\n",
++			retval);
++		return retval;
++	}
++
++	memcpy(buf, (void *)&dump_header, sizeof(dump_header));
++	size = sizeof(dump_header);
++	memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
++	size += sizeof(dump_header_asm);
++	size = PAGE_ALIGN(size);
++	retval = dump_ll_write(buf, size);
++
++	if (retval < size)
++		return (retval >= 0) ? -ENOSPC : retval;
++	return 0;
++}
++
++int dump_generic_update_header(void)
++{
++	int err = 0;
++
++	if (dump_config.dumper->header_dirty) {
++		if ((err = dump_write_header())) {
++			printk("dump write header failed! err %d\n", err);
++		} else {
++			dump_config.dumper->header_dirty = 0;
++		}
++	}
++
++	return err;
++}
++
++static inline int is_curr_stack_page(struct page *page, unsigned long size)
++{
++	unsigned long thread_addr = (unsigned long)current_thread_info();
++	unsigned long addr = (unsigned long)page_address(page);
++
++	return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE)
++		&& (addr + size > thread_addr);
++}
++
++static inline int is_dump_page(struct page *page, unsigned long size)
++{
++	unsigned long addr = (unsigned long)page_address(page);
++	unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf;
++
++	return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE)
++		&& (addr + size > dump_buf);
++}
++
++int dump_allow_compress(struct page *page, unsigned long size)
++{
++	/*
++	 * Don't compress the page if any part of it overlaps
++	 * with the current stack or dump buffer (since the contents
++	 * in these could be changing while compression is going on)
++	 */
++	return !is_curr_stack_page(page, size) && !is_dump_page(page, size);
++}
++
++void lcrash_init_pageheader(struct __dump_page *dp, struct page *page,
++	unsigned long sz)
++{
++	memset(dp, 0, sizeof(struct __dump_page));
++	dp->dp_flags = 0;
++	dp->dp_size = 0;
++	if (sz > 0)
++		dp->dp_address = (loff_t)page_to_pfn(page) << PAGE_SHIFT;
++
++#if DUMP_DEBUG > 6
++	dp->dp_page_index = dump_header.dh_num_dump_pages;
++	dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE
++		+ DUMP_HEADER_OFFSET; /* ?? */
++#endif /* DUMP_DEBUG */
++}
++
++int dump_lcrash_add_data(unsigned long loc, unsigned long len)
++{
++	struct page *page = (struct page *)loc;
++	void *addr, *buf = dump_config.dumper->curr_buf;
++	struct __dump_page *dp = (struct __dump_page *)buf;
++	int bytes, size;
++
++	if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE)
++		return -ENOMEM;
++
++	lcrash_init_pageheader(dp, page, len);
++	buf += sizeof(struct __dump_page);
++
++	while (len) {
++		addr = kmap_atomic(page, KM_DUMP);
++		size = bytes = (len > PAGE_SIZE) ? PAGE_SIZE : len;
++		/* check for compression */
++		if (dump_allow_compress(page, bytes)) {
++			size = dump_compress_data((char *)addr, bytes,
++				(char *)buf, loc);
++		}
++		/* set the compressed flag if the page did compress */
++		if (size && (size < bytes)) {
++			dp->dp_flags |= DUMP_DH_COMPRESSED;
++		} else {
++			/* compression failed -- default to raw mode */
++			dp->dp_flags |= DUMP_DH_RAW;
++			memcpy(buf, addr, bytes);
++			size = bytes;
++		}
++		/* memset(buf, 'A', size); temporary: testing only !!
*/ ++ kunmap_atomic(addr, KM_DUMP); ++ dp->dp_size += size; ++ buf += size; ++ len -= bytes; ++ page++; ++ } ++ ++ /* now update the header */ ++#if DUMP_DEBUG > 6 ++ dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp); ++#endif ++ dump_header.dh_num_dump_pages++; ++ dump_config.dumper->header_dirty++; ++ ++ dump_config.dumper->curr_buf = buf; ++ ++ return len; ++} ++ ++int dump_lcrash_update_end_marker(void) ++{ ++ struct __dump_page *dp = ++ (struct __dump_page *)dump_config.dumper->curr_buf; ++ unsigned long left; ++ int ret = 0; ++ ++ lcrash_init_pageheader(dp, NULL, 0); ++ dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */ ++ ++ /* now update the header */ ++#if DUMP_DEBUG > 6 ++ dump_header.dh_num_bytes += sizeof(*dp); ++#endif ++ dump_config.dumper->curr_buf += sizeof(*dp); ++ left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf; ++ ++ printk("\n"); ++ ++ while (left) { ++ if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) { ++ printk("Seek failed at offset 0x%llx\n", ++ dump_config.dumper->curr_offset); ++ return ret; ++ } ++ ++ if (DUMP_BUFFER_SIZE > left) ++ memset(dump_config.dumper->curr_buf, 'm', ++ DUMP_BUFFER_SIZE - left); ++ ++ if ((ret = dump_ll_write(dump_config.dumper->dump_buf, ++ DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) { ++ return (ret < 0) ? ret : -ENOSPC; ++ } ++ ++ dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE; ++ ++ if (left > DUMP_BUFFER_SIZE) { ++ left -= DUMP_BUFFER_SIZE; ++ memcpy(dump_config.dumper->dump_buf, ++ dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left); ++ dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE; ++ } else { ++ left = 0; ++ } ++ } ++ return 0; ++} ++ ++ ++/* Default Formatter (lcrash) */ ++struct dump_fmt_ops dump_fmt_lcrash_ops = { ++ .configure_header = dump_lcrash_configure_header, ++ .update_header = dump_generic_update_header, ++ .save_context = dump_lcrash_save_context, ++ .add_data = dump_lcrash_add_data, ++ .update_end_marker = dump_lcrash_update_end_marker ++}; ++ ++struct dump_fmt dump_fmt_lcrash = { ++ .name = "lcrash", ++ .ops = &dump_fmt_lcrash_ops ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_setup.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_setup.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_setup.c 2005-04-05 16:47:53.939205712 +0800 +@@ -0,0 +1,923 @@ ++/* ++ * Standard kernel function entry points for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * ----------------------------------------------------------------------- ++ * ++ * DUMP HISTORY ++ * ++ * This dump code goes back to SGI's first attempts at dumping system ++ * memory on SGI systems running IRIX. A few developers at SGI needed ++ * a way to take this system dump and analyze it, and created 'icrash', ++ * or IRIX Crash. The mechanism (the dumps and 'icrash') were used ++ * by support people to generate crash reports when a system failure ++ * occurred. 
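For reference, the lcrash format written above is deliberately simple: after the page-aligned header block, each selected set of pages appears as one struct __dump_page record followed immediately by its raw or compressed data, and a record flagged DUMP_DH_END closes the stream. A minimal reader sketch follows; the record struct is only a stand-in for the real __dump_page layout, and read_at()/process_page() are hypothetical helpers, so treat this as an illustration of the framing, not as lcrash's actual code.

#include <stdint.h>

#define DUMP_DH_COMPRESSED 0x1	/* stand-in values: the real bits live */
#define DUMP_DH_RAW        0x2	/* in the LKCD headers, not this hunk  */
#define DUMP_DH_END        0x4

struct dump_page_rec {		/* stand-in for struct __dump_page */
	uint64_t dp_address;	/* first physical address of the set */
	uint64_t dp_size;	/* bytes of (possibly compressed) data */
	uint32_t dp_flags;	/* DUMP_DH_* bits set by the dumper */
};

extern int read_at(uint64_t off, void *buf, uint64_t len);	/* hypothetical I/O */
extern void process_page(const struct dump_page_rec *r, uint64_t data_off);

int walk_pages(uint64_t off)
{
	struct dump_page_rec r;

	for (;;) {
		if (read_at(off, &r, sizeof(r)) < 0)
			return -1;
		if (r.dp_flags & DUMP_DH_END)	/* end marker: dp_size == 0 */
			return 0;
		process_page(&r, off + sizeof(r));	/* data follows the record */
		off += sizeof(r) + r.dp_size;
	}
}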
This was vital for large system configurations that ++ * couldn't apply patch after patch after fix just to hope that the ++ * problems would go away. So the system memory, along with the crash ++ * dump analyzer, allowed support people to quickly figure out what the ++ * problem was on the system with the crash dump. ++ * ++ * In comes Linux. SGI started moving towards the open source community, ++ * and upon doing so, SGI wanted to take its support utilities into Linux ++ * with the hopes that they would end up in the kernel and user space to ++ * be used by SGI's customers buying SGI Linux systems. One of the first ++ * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash ++ * Dumps. LKCD comprises a patch to the kernel to enable system ++ * dumping, along with 'lcrash', or Linux Crash, to analyze the system ++ * memory dump. A few additional system scripts and kernel modifications ++ * are also included to make the dump mechanism and dump data easier to ++ * process and use. ++ * ++ * As soon as LKCD was released into the open source community, a number ++ * of larger companies started to take advantage of it. Today, there are ++ * many community members who contribute to LKCD, and it continues to ++ * flourish and grow as an open source project. ++ */ ++ ++/* ++ * DUMP TUNABLES (read/write with ioctl, readonly with /proc) ++ * ++ * This is the list of system tunables (via /proc) that are available ++ * for Linux systems. All the read, write, etc., functions are listed ++ * here. Currently, there are a few different tunables for dumps: ++ * ++ * dump_device (used to be dumpdev): ++ * The device for dumping the memory pages out to. This ++ * may be set to the primary swap partition for disruptive dumps, ++ * and must be an unused partition for non-disruptive dumps. ++ * Todo: In the case of network dumps, this may be interpreted ++ * as the IP address of the netdump server to connect to. ++ * ++ * dump_compress (used to be dump_compress_pages): ++ * This is the flag which indicates which compression mechanism ++ * to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.). ++ * This is the current set of values: ++ * ++ * 0: DUMP_COMPRESS_NONE -- Don't compress any pages. ++ * 1: DUMP_COMPRESS_RLE -- This uses RLE compression. ++ * 2: DUMP_COMPRESS_GZIP -- This uses GZIP compression. ++ * ++ * dump_level: ++ * The amount of effort the dump module should make to save ++ * information for post crash analysis. This value is now ++ * a BITMASK value, not an index: ++ * ++ * 0: Do nothing, no dumping. (DUMP_LEVEL_NONE) ++ * ++ * 1: Print out the dump information to the dump header, and ++ * write it out to the dump_device. (DUMP_LEVEL_HEADER) ++ * ++ * 2: Write out the dump header and all kernel memory pages. ++ * (DUMP_LEVEL_KERN) ++ * ++ * 4: Write out the dump header and all kernel and user ++ * memory pages. (DUMP_LEVEL_USED) ++ * ++ * 8: Write out the dump header and all conventional/cached ++ * memory (RAM) pages in the system (kernel, user, free). ++ * (DUMP_LEVEL_ALL_RAM) ++ * ++ * 16: Write out everything, including non-conventional memory ++ * like firmware, proms, I/O registers, uncached memory. ++ * (DUMP_LEVEL_ALL) ++ * ++ * The dump_level will default to 1. ++ * ++ * dump_flags: ++ * These are the flags to use when talking about dumps. There ++ * are lots of possibilities. This is a BITMASK value, not an index.
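Note that while dump_level is stored as a bitmask, the DIOSDUMPLEVEL ioctl below still accepts the historical index values and folds them into cumulative mask bits with a cascading switch. The same conversion, pulled out as a sketch with the fall-throughs made explicit (macro names as used elsewhere in this patch):

/* Sketch: DUMP_LEVEL_* index -> cumulative DUMP_MASK_* bits, mirroring
 * the DIOSDUMPLEVEL case in dump_ioctl() below. */
static unsigned long level_to_mask(int level)
{
	unsigned long mask = 0;

	switch (level) {
	case DUMP_LEVEL_ALL:
	case DUMP_LEVEL_ALL_RAM:
		mask |= DUMP_MASK_UNUSED;	/* fall through */
	case DUMP_LEVEL_USED:
		mask |= DUMP_MASK_USED;		/* fall through */
	case DUMP_LEVEL_KERN:
		mask |= DUMP_MASK_KERN;		/* fall through */
	case DUMP_LEVEL_HEADER:
		mask |= DUMP_MASK_HEADER;	/* fall through */
	case DUMP_LEVEL_NONE:
		break;
	}
	return mask;
}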
++ * ++ * ----------------------------------------------------------------------- ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * ----------------------------------------------------------------------- ++ * V A R I A B L E S ++ * ----------------------------------------------------------------------- ++ */ ++ ++/* Dump tunables */ ++struct dump_config dump_config = { ++ .level = 0, ++ .flags = 0, ++ .dump_device = 0, ++ .dump_addr = 0, ++ .dumper = NULL ++}; ++#ifdef CONFIG_ARM ++static _dump_regs_t all_regs; ++#endif ++ ++/* Global variables used in dump.h */ ++/* degree of system freeze when dumping */ ++enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS; ++ ++/* Other global fields */ ++extern struct __dump_header dump_header; ++struct dump_dev *dump_dev = NULL; /* Active dump device */ ++static int dump_compress = 0; ++ ++static u32 dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc); ++struct __dump_compress dump_none_compression = { ++ .compress_type = DUMP_COMPRESS_NONE, ++ .compress_func = dump_compress_none, ++ .compress_name = "none", ++}; ++ ++/* our device operations and functions */ ++static int dump_ioctl(struct inode *i, struct file *f, ++ unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_COMPAT ++static int dw_long(unsigned int, unsigned int, unsigned long, struct file*); ++#endif ++ ++static struct file_operations dump_fops = { ++ .owner = THIS_MODULE, ++ .ioctl = dump_ioctl, ++}; ++ ++static struct miscdevice dump_miscdev = { ++ .minor = CRASH_DUMP_MINOR, ++ .name = "dump", ++ .fops = &dump_fops, ++}; ++MODULE_ALIAS_MISCDEV(CRASH_DUMP_MINOR); ++ ++/* static variables */ ++static int dump_okay = 0; /* can we dump out to disk? */ ++static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED; ++ ++/* used for dump compressors */ ++static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list); ++ ++/* list of registered dump targets */ ++static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list); ++ ++/* lkcd info structure -- this is used by lcrash for basic system data */ ++struct __lkcdinfo lkcdinfo = { ++ .ptrsz = (sizeof(void *) * 8), ++#if defined(__LITTLE_ENDIAN) ++ .byte_order = __LITTLE_ENDIAN, ++#else ++ .byte_order = __BIG_ENDIAN, ++#endif ++ .page_shift = PAGE_SHIFT, ++ .page_size = PAGE_SIZE, ++ .page_mask = PAGE_MASK, ++ .page_offset = PAGE_OFFSET, ++}; ++ ++/* ++ * ----------------------------------------------------------------------- ++ * / P R O C T U N A B L E F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++static int proc_dump_device(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++static int proc_doulonghex(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++/* ++ * sysctl-tuning infrastructure. 
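The nested ctl_table registration that follows publishes these tunables under /proc/sys/kernel/dump/ ("kernel" from kernel_root, "dump" from dump_root, entry names from the DUMP_*_NAME macros in the LKCD headers). A small userspace sketch of a read-side consumer; the "level" entry name is an assumption here, since the macro values are not part of this hunk:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/sys/kernel/dump/level", O_RDONLY);	/* assumed name */

	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);	/* hex text, e.g. "0x2\n" (see proc_doulonghex) */
	if (n > 0) {
		buf[n] = '\0';
		printf("dump level: %s", buf);
	}
	close(fd);
	return 0;
}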
++ */ ++static ctl_table dump_table[] = { ++ { .ctl_name = CTL_DUMP_LEVEL, ++ .procname = DUMP_LEVEL_NAME, ++ .data = &dump_config.level, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_doulonghex, }, ++ ++ { .ctl_name = CTL_DUMP_FLAGS, ++ .procname = DUMP_FLAGS_NAME, ++ .data = &dump_config.flags, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_doulonghex, }, ++ ++ { .ctl_name = CTL_DUMP_COMPRESS, ++ .procname = DUMP_COMPRESS_NAME, ++ .data = &dump_compress, /* FIXME */ ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_dointvec, }, ++ ++ { .ctl_name = CTL_DUMP_DEVICE, ++ .procname = DUMP_DEVICE_NAME, ++ .mode = 0444, ++ .data = &dump_config.dump_device, /* FIXME */ ++ .maxlen = sizeof(int), ++ .proc_handler = proc_dump_device }, ++ ++#ifdef CONFIG_CRASH_DUMP_MEMDEV ++ { .ctl_name = CTL_DUMP_ADDR, ++ .procname = DUMP_ADDR_NAME, ++ .mode = 0444, ++ .data = &dump_config.dump_addr, ++ .maxlen = sizeof(unsigned long), ++ .proc_handler = proc_doulonghex }, ++#endif ++ ++ { 0, } ++}; ++ ++static ctl_table dump_root[] = { ++ { .ctl_name = KERN_DUMP, ++ .procname = "dump", ++ .mode = 0555, ++ .child = dump_table }, ++ { 0, } ++}; ++ ++static ctl_table kernel_root[] = { ++ { .ctl_name = CTL_KERN, ++ .procname = "kernel", ++ .mode = 0555, ++ .child = dump_root, }, ++ { 0, } ++}; ++ ++static struct ctl_table_header *sysctl_header; ++ ++/* ++ * ----------------------------------------------------------------------- ++ * C O M P R E S S I O N F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++/* ++ * Name: dump_compress_none() ++ * Func: Don't do any compression, period. ++ */ ++static u32 ++dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ /* just return the old size */ ++ return oldsize; ++} ++ ++ ++/* ++ * Name: dump_execute() ++ * Func: Execute the dumping process. This makes sure all the appropriate ++ * fields are updated correctly, and calls dump_execute_memdump(), ++ * which does the real work. ++ */ ++void ++dump_execute(const char *panic_str, const struct pt_regs *regs) ++{ ++ int state = -1; ++ unsigned long flags; ++ ++ /* make sure we can dump */ ++ if (!dump_okay) { ++ pr_info("LKCD not yet configured, can't take dump now\n"); ++ return; ++ } ++ ++ /* Exclude multiple dumps at the same time, ++ * and disable interrupts, some drivers may re-enable ++ * interrupts in with silence() ++ * ++ * Try and acquire spin lock. If successful, leave preempt ++ * and interrupts disabled. See spin_lock_irqsave in spinlock.h ++ */ ++ local_irq_save(flags); ++ if (!spin_trylock(&dump_lock)) { ++ local_irq_restore(flags); ++ pr_info("LKCD dump already in progress\n"); ++ return; ++ } ++ ++ /* What state are interrupts really in? 
*/ ++ if (in_interrupt()){ ++ if(in_irq()) ++ printk(KERN_ALERT "Dumping from interrupt handler!\n"); ++ else ++ printk(KERN_ALERT "Dumping from bottom half!\n"); ++ ++ __dump_clean_irq_state(); ++ } ++ ++ ++ /* Bring system into the strictest level of quiescing for min drift ++ * dump drivers can soften this as required in dev->ops->silence() ++ */ ++ dump_oncpu = smp_processor_id() + 1; ++ dump_silence_level = DUMP_HARD_SPIN_CPUS; ++ ++ state = dump_generic_execute(panic_str, regs); ++ ++ dump_oncpu = 0; ++ spin_unlock_irqrestore(&dump_lock, flags); ++ ++ if (state < 0) { ++ printk("Dump Incomplete or failed!\n"); ++ } else { ++ printk("Dump Complete; %d dump pages saved.\n", ++ dump_header.dh_num_dump_pages); ++ } ++} ++ ++/* ++ * Name: dump_register_compression() ++ * Func: Register a dump compression mechanism. ++ */ ++void ++dump_register_compression(struct __dump_compress *item) ++{ ++ if (item) ++ list_add(&(item->list), &dump_compress_list); ++} ++ ++/* ++ * Name: dump_unregister_compression() ++ * Func: Remove a dump compression mechanism, and re-assign the dump ++ * compression pointer if necessary. ++ */ ++void ++dump_unregister_compression(int compression_type) ++{ ++ struct list_head *tmp; ++ struct __dump_compress *dc; ++ ++ /* let's make sure our list is valid */ ++ if (compression_type != DUMP_COMPRESS_NONE) { ++ list_for_each(tmp, &dump_compress_list) { ++ dc = list_entry(tmp, struct __dump_compress, list); ++ if (dc->compress_type == compression_type) { ++ list_del(&(dc->list)); ++ break; ++ } ++ } ++ } ++} ++ ++/* ++ * Name: dump_compress_init() ++ * Func: Initialize (or re-initialize) compression scheme. ++ */ ++static int ++dump_compress_init(int compression_type) ++{ ++ struct list_head *tmp; ++ struct __dump_compress *dc; ++ ++ /* try to remove the compression item */ ++ list_for_each(tmp, &dump_compress_list) { ++ dc = list_entry(tmp, struct __dump_compress, list); ++ if (dc->compress_type == compression_type) { ++ dump_config.dumper->compress = dc; ++ dump_compress = compression_type; ++ pr_debug("Dump Compress %s\n", dc->compress_name); ++ return 0; ++ } ++ } ++ ++ /* ++ * nothing on the list -- return ENODATA to indicate an error ++ * ++ * NB: ++ * EAGAIN: reports "Resource temporarily unavailable" which ++ * isn't very enlightening. ++ */ ++ printk("compression_type:%d not found\n", compression_type); ++ ++ return -ENODATA; ++} ++ ++static int ++dumper_setup(unsigned long flags, unsigned long devid) ++{ ++ int ret = 0; ++ ++ /* unconfigure old dumper if it exists */ ++ dump_okay = 0; ++ if (dump_config.dumper) { ++ pr_debug("Unconfiguring current dumper\n"); ++ dump_unconfigure(); ++ } ++ /* set up new dumper */ ++ if (dump_config.flags & DUMP_FLAGS_SOFTBOOT) { ++ printk("Configuring softboot based dump \n"); ++#ifdef CONFIG_CRASH_DUMP_MEMDEV ++ dump_config.dumper = &dumper_stage1; ++#else ++ printk("Requires CONFIG_CRASHDUMP_MEMDEV. 
Can't proceed.\n"); ++ return -1; ++#endif ++ } else { ++ dump_config.dumper = &dumper_singlestage; ++ } ++ dump_config.dumper->dev = dump_dev; ++ ++ ret = dump_configure(devid); ++ if (!ret) { ++ dump_okay = 1; ++ pr_debug("%s dumper set up for dev 0x%lx\n", ++ dump_config.dumper->name, devid); ++ dump_config.dump_device = devid; ++ } else { ++ printk("%s dumper set up failed for dev 0x%lx\n", ++ dump_config.dumper->name, devid); ++ dump_config.dumper = NULL; ++ } ++ return ret; ++} ++ ++static int ++dump_target_init(int target) ++{ ++ char type[20]; ++ struct list_head *tmp; ++ struct dump_dev *dev; ++ ++ switch (target) { ++ case DUMP_FLAGS_DISKDUMP: ++ strcpy(type, "blockdev"); break; ++ case DUMP_FLAGS_NETDUMP: ++ strcpy(type, "networkdev"); break; ++ default: ++ return -1; ++ } ++ ++ /* ++ * This is a bit stupid, generating strings from the flag ++ * and doing strcmp. This is done because 'struct dump_dev' ++ * has a string 'type_name' and not an integer 'type'. ++ */ ++ list_for_each(tmp, &dump_target_list) { ++ dev = list_entry(tmp, struct dump_dev, list); ++ if (strcmp(type, dev->type_name) == 0) { ++ dump_dev = dev; ++ return 0; ++ } ++ } ++ return -1; ++} ++ ++/* ++ * Name: dump_ioctl() ++ * Func: Allow all dump tunables through a standard ioctl() mechanism. ++ * This is far better than before, where we'd go through /proc, ++ * because now this will work for multiple OSes and architectures. ++ */ ++static int ++dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg) ++{ ++ /* check capabilities */ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS) ++ /* dump device must be configured first */ ++ return -ENODEV; ++ ++ /* ++ * This is the main mechanism for controlling get/set data ++ * for various dump device parameters. The real trick here ++ * is setting the dump device (DIOSDUMPDEV). That's what ++ * triggers everything else.
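From user space the whole interface is driven through the misc device registered above (i.e. /dev/dump). A sketch of the typical configuration sequence; the DIOSxxx numbers come from the LKCD ioctl header, which is not part of this hunk, and the device must be opened O_RDWR because every "set" command checks f_flags:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
/* plus the LKCD header that defines DIOSDUMPDEV and friends */

static int configure_dump(unsigned long devid, unsigned long level)
{
	int ret = -1;
	int fd = open("/dev/dump", O_RDWR);	/* set ioctls require O_RDWR */

	if (fd < 0)
		return -1;
	if (ioctl(fd, DIOSDUMPDEV, devid) == 0 &&	/* triggers dumper_setup() */
	    ioctl(fd, DIOSDUMPLEVEL, level) == 0)	/* index, folded to mask bits */
		ret = 0;
	close(fd);
	return ret;
}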
++ */ ++ switch (cmd) { ++ case DIOSDUMPDEV: /* set dump_device */ ++ pr_debug("Configuring dump device\n"); ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ __dump_open(); ++ return dumper_setup(dump_config.flags, arg); ++ ++ ++ case DIOGDUMPDEV: /* get dump_device */ ++ return put_user((long)dump_config.dump_device, (long *)arg); ++ ++ case DIOSDUMPLEVEL: /* set dump_level */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ /* make sure we have a positive value */ ++ if (arg < 0) ++ return -EINVAL; ++ ++ /* Fixme: clean this up */ ++ dump_config.level = 0; ++ switch ((int)arg) { ++ case DUMP_LEVEL_ALL: ++ case DUMP_LEVEL_ALL_RAM: ++ dump_config.level |= DUMP_MASK_UNUSED; ++ case DUMP_LEVEL_USED: ++ dump_config.level |= DUMP_MASK_USED; ++ case DUMP_LEVEL_KERN: ++ dump_config.level |= DUMP_MASK_KERN; ++ case DUMP_LEVEL_HEADER: ++ dump_config.level |= DUMP_MASK_HEADER; ++ case DUMP_LEVEL_NONE: ++ break; ++ default: ++ return (-EINVAL); ++ } ++ pr_debug("Dump Level 0x%lx\n", dump_config.level); ++ break; ++ ++ case DIOGDUMPLEVEL: /* get dump_level */ ++ /* fixme: handle conversion */ ++ return put_user((long)dump_config.level, (long *)arg); ++ ++ ++ case DIOSDUMPFLAGS: /* set dump_flags */ ++ /* check flags */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ /* make sure we have a positive value */ ++ if (arg < 0) ++ return -EINVAL; ++ ++ if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0) ++ return -EINVAL; /* return proper error */ ++ ++ dump_config.flags = arg; ++ ++ pr_debug("Dump Flags 0x%lx\n", dump_config.flags); ++ break; ++ ++ case DIOGDUMPFLAGS: /* get dump_flags */ ++ return put_user((long)dump_config.flags, (long *)arg); ++ ++ case DIOSDUMPCOMPRESS: /* set the dump_compress status */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ return dump_compress_init((int)arg); ++ ++ case DIOGDUMPCOMPRESS: /* get the dump_compress status */ ++ return put_user((long)(dump_config.dumper ? ++ dump_config.dumper->compress->compress_type : 0), ++ (long *)arg); ++ case DIOGDUMPOKAY: /* check if dump is configured */ ++ return put_user((long)dump_okay, (long *)arg); ++ ++ case DIOSDUMPTAKE: /* Trigger a manual dump */ ++ /* Do not proceed if lkcd not yet configured */ ++ if(!dump_okay) { ++ printk("LKCD not yet configured. Cannot take manual dump\n"); ++ return -ENODEV; ++ } ++ ++ /* Take the dump */ ++ return manual_handle_crashdump(); ++ ++ default: ++ /* ++ * these are network dump specific ioctls, let the ++ * module handle them. ++ */ ++ return dump_dev_ioctl(cmd, arg); ++ } ++ return 0; ++} ++ ++/* ++ * Handle special cases for dump_device ++ * changing dump device requires doing an opening the device ++ */ ++static int ++proc_dump_device(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int *valp = ctl->data; ++ int oval = *valp; ++ int ret = -EPERM; ++ ++ /* same permission checks as ioctl */ ++ if (capable(CAP_SYS_ADMIN)) { ++ ret = proc_doulonghex(ctl, write, f, buffer, lenp, ppos); ++ if (ret == 0 && write && *valp != oval) { ++ /* need to restore old value to close properly */ ++ dump_config.dump_device = (dev_t) oval; ++ __dump_open(); ++ ret = dumper_setup(dump_config.flags, (dev_t) *valp); ++ } ++ } ++ ++ return ret; ++} ++ ++/* All for the want of a proc_do_xxx routine which prints values in hex */ ++/* Write is not implemented correctly, so mode is set to 0444 above. 
*/ ++static int ++proc_doulonghex(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++#define TMPBUFLEN 21 ++ unsigned long *i; ++ size_t len, left; ++ char buf[TMPBUFLEN]; ++ ++ if (!ctl->data || !ctl->maxlen || !*lenp || (*ppos && !write)) { ++ *lenp = 0; ++ return 0; ++ } ++ ++ i = (unsigned long *) ctl->data; ++ left = *lenp; ++ ++ sprintf(buf, "0x%lx\n", (*i)); ++ len = strlen(buf); ++ if (len > left) ++ len = left; ++ if(copy_to_user(buffer, buf, len)) ++ return -EFAULT; ++ ++ left -= len; ++ *lenp -= left; ++ *ppos += *lenp; ++ return 0; ++} ++ ++/* ++ * ----------------------------------------------------------------------- ++ * I N I T F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++#ifdef CONFIG_COMPAT ++static int dw_long(unsigned int fd, unsigned int cmd, unsigned long arg, ++ struct file *f) ++{ ++ mm_segment_t old_fs = get_fs(); ++ int err; ++ unsigned long val; ++ ++ set_fs (KERNEL_DS); ++ err = sys_ioctl(fd, cmd, (u64)&val); ++ set_fs (old_fs); ++ if (!err && put_user((unsigned int) val, (u32 *)arg)) ++ return -EFAULT; ++ return err; ++} ++#endif ++ ++/* ++ * These register and unregister routines are exported for modules ++ * to register their dump drivers (like block, net etc) ++ */ ++int ++dump_register_device(struct dump_dev *ddev) ++{ ++ struct list_head *tmp; ++ struct dump_dev *dev; ++ ++ list_for_each(tmp, &dump_target_list) { ++ dev = list_entry(tmp, struct dump_dev, list); ++ if (strcmp(ddev->type_name, dev->type_name) == 0) { ++ printk("Target type %s already registered\n", ++ dev->type_name); ++ return -1; /* return proper error */ ++ } ++ } ++ list_add(&(ddev->list), &dump_target_list); ++ ++ return 0; ++} ++ ++void ++dump_unregister_device(struct dump_dev *ddev) ++{ ++ list_del(&(ddev->list)); ++ if (ddev != dump_dev) ++ return; ++ ++ dump_okay = 0; ++ ++ if (dump_config.dumper) ++ dump_unconfigure(); ++ ++ dump_config.flags &= ~DUMP_FLAGS_TARGETMASK; ++ dump_okay = 0; ++ dump_dev = NULL; ++ dump_config.dumper = NULL; ++} ++ ++static int panic_event(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++#ifdef CONFIG_ARM ++ get_current_general_regs(&all_regs); ++ get_current_cp14_regs(&all_regs); ++ get_current_cp15_regs(&all_regs); ++ dump_execute((const char *)ptr, &all_regs); ++#else ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute((const char *)ptr, ®s); ++#endif ++ return 0; ++} ++ ++extern struct notifier_block *panic_notifier_list; ++static int panic_event(struct notifier_block *, unsigned long, void *); ++static struct notifier_block panic_block = { ++ .notifier_call = panic_event, ++}; ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++/* Sysrq handler */ ++static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs, ++ struct tty_struct *tty) { ++ if(!pt_regs) { ++ struct pt_regs regs; ++ get_current_regs(®s); ++ dump_execute("sysrq", ®s); ++ ++ } else { ++ dump_execute("sysrq", pt_regs); ++ } ++} ++ ++static struct sysrq_key_op sysrq_crashdump_op = { ++ .handler = sysrq_handle_crashdump, ++ .help_msg = "Dump", ++ .action_msg = "Starting crash dump", ++}; ++#endif ++ ++static inline void ++dump_sysrq_register(void) ++{ ++#ifdef CONFIG_MAGIC_SYSRQ ++ register_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); ++#endif ++} ++ ++static inline void ++dump_sysrq_unregister(void) ++{ ++#ifdef CONFIG_MAGIC_SYSRQ ++ unregister_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); ++#endif ++} ++ ++/* ++ * Name: dump_init() ++ * Func: 
Initialize the dump process. This will set up any architecture ++ * dependent code. The big key is that we need the memory offsets before ++ * the page table is initialized, because the base memory offset ++ * is changed after paging_init() is called. ++ */ ++static int __init ++dump_init(void) ++{ ++ struct sysinfo info; ++ int err; ++ ++ /* try to create our dump device */ ++ err = misc_register(&dump_miscdev); ++ if (err) { ++ printk("cannot register dump character device!\n"); ++ return err; ++ } ++ ++ __dump_init((u64)PAGE_OFFSET); ++ ++#ifdef CONFIG_COMPAT ++ err = register_ioctl32_conversion(DIOSDUMPDEV, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPDEV, NULL); ++ err |= register_ioctl32_conversion(DIOSDUMPLEVEL, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPLEVEL, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPFLAGS, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPFLAGS, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPCOMPRESS, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPCOMPRESS, dw_long); ++ err |= register_ioctl32_conversion(DIOSTARGETIP, NULL); ++ err |= register_ioctl32_conversion(DIOGTARGETIP, NULL); ++ err |= register_ioctl32_conversion(DIOSTARGETPORT, NULL); ++ err |= register_ioctl32_conversion(DIOGTARGETPORT, NULL); ++ err |= register_ioctl32_conversion(DIOSSOURCEPORT, NULL); ++ err |= register_ioctl32_conversion(DIOGSOURCEPORT, NULL); ++ err |= register_ioctl32_conversion(DIOSETHADDR, NULL); ++ err |= register_ioctl32_conversion(DIOGETHADDR, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPOKAY, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPTAKE, NULL); ++ if (err) { ++ printk(KERN_ERR "LKCD: registering ioctl32 translations failed\n"); ++ } ++#endif ++ /* set the dump_compression_list structure up */ ++ dump_register_compression(&dump_none_compression); ++ ++ /* grab the total memory size now (not if/when we crash) */ ++ si_meminfo(&info); ++ ++ /* set the memory size */ ++ dump_header.dh_memory_size = (u64)info.totalram; ++ ++ sysctl_header = register_sysctl_table(kernel_root, 0); ++ dump_sysrq_register(); ++ ++ notifier_chain_register(&panic_notifier_list, &panic_block); ++ dump_function_ptr = dump_execute; ++ ++ pr_info("Crash dump driver initialized.\n"); ++ return 0; ++} ++ ++static void __exit ++dump_cleanup(void) ++{ ++ int err; ++ dump_okay = 0; ++ ++ if (dump_config.dumper) ++ dump_unconfigure(); ++ ++ /* arch-specific cleanup routine */ ++ __dump_cleanup(); ++ ++#ifdef CONFIG_COMPAT ++ err = unregister_ioctl32_conversion(DIOSDUMPDEV); ++ err |= unregister_ioctl32_conversion(DIOGDUMPDEV); ++ err |= unregister_ioctl32_conversion(DIOSDUMPLEVEL); ++ err |= unregister_ioctl32_conversion(DIOGDUMPLEVEL); ++ err |= unregister_ioctl32_conversion(DIOSDUMPFLAGS); ++ err |= unregister_ioctl32_conversion(DIOGDUMPFLAGS); ++ err |= unregister_ioctl32_conversion(DIOSDUMPCOMPRESS); ++ err |= unregister_ioctl32_conversion(DIOGDUMPCOMPRESS); ++ err |= unregister_ioctl32_conversion(DIOSTARGETIP); ++ err |= unregister_ioctl32_conversion(DIOGTARGETIP); ++ err |= unregister_ioctl32_conversion(DIOSTARGETPORT); ++ err |= unregister_ioctl32_conversion(DIOGTARGETPORT); ++ err |= unregister_ioctl32_conversion(DIOSSOURCEPORT); ++ err |= unregister_ioctl32_conversion(DIOGSOURCEPORT); ++ err |= unregister_ioctl32_conversion(DIOSETHADDR); ++ err |= unregister_ioctl32_conversion(DIOGETHADDR); ++ err |= unregister_ioctl32_conversion(DIOGDUMPOKAY); ++ err |= unregister_ioctl32_conversion(DIOSDUMPTAKE); ++ if (err) { ++ printk(KERN_ERR
"LKCD: Unregistering ioctl32 translations failed\n"); ++ } ++#endif ++ ++ /* ignore errors while unregistering -- since can't do anything */ ++ unregister_sysctl_table(sysctl_header); ++ misc_deregister(&dump_miscdev); ++ dump_sysrq_unregister(); ++ notifier_chain_unregister(&panic_notifier_list, &panic_block); ++ dump_function_ptr = NULL; ++} ++ ++EXPORT_SYMBOL(dump_register_compression); ++EXPORT_SYMBOL(dump_unregister_compression); ++EXPORT_SYMBOL(dump_register_device); ++EXPORT_SYMBOL(dump_unregister_device); ++EXPORT_SYMBOL(dump_config); ++EXPORT_SYMBOL(dump_silence_level); ++ ++EXPORT_SYMBOL(__dump_irq_enable); ++EXPORT_SYMBOL(__dump_irq_restore); ++ ++MODULE_AUTHOR("Matt D. Robinson "); ++MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_init); ++module_exit(dump_cleanup); +Index: linux-2.6.10/drivers/dump/dump_scheme.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_scheme.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_scheme.c 2005-04-05 16:47:53.944204952 +0800 +@@ -0,0 +1,430 @@ ++/* ++ * Default single stage dump scheme methods ++ * ++ * Previously a part of dump_base.c ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote LKCD dump scheme to generic dump method ++ * interfaces ++ * Derived from original code created by ++ * Matt Robinson ) ++ * ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * Implements the default dump scheme, i.e. single-stage gathering and ++ * saving of dump data directly to the target device, which operates in ++ * a push mode, where the dumping system decides what data it saves ++ * taking into account pre-specified dump config options. ++ * ++ * Aside: The 2-stage dump scheme, where there is a soft-reset between ++ * the gathering and saving phases, also reuses some of these ++ * default routines (see dump_overlay.c) ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++extern int panic_timeout; /* time before reboot */ ++ ++extern void dump_speedo(int); ++ ++/* Default sequencer used during single stage dumping */ ++/* Also invoked during stage 2 of soft-boot based dumping */ ++int dump_generic_sequencer(void) ++{ ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ int pass = 0, err = 0, save = 0; ++ int (*action)(unsigned long, unsigned long); ++ ++ /* ++ * We want to save the more critical data areas first in ++ * case we run out of space, encounter i/o failures, or get ++ * interrupted otherwise and have to give up midway ++ * So, run through the passes in increasing order ++ */ ++ for (;filter->selector; filter++, pass++) ++ { ++ /* Assumes passes are exclusive (even across dumpers) */ ++ /* Requires care when coding the selection functions */ ++ if ((save = filter->level_mask & dump_config.level)) ++ action = dump_save_data; ++ else ++ action = dump_skip_data; ++ ++ if ((err = dump_iterator(pass, action, filter)) < 0) ++ break; ++ ++ printk("\n %d dump pages %s of %d each in pass %d\n", ++ err, save ? "saved" : "skipped", (int)DUMP_PAGE_SIZE, pass); ++ ++ } ++ ++ return (err < 0) ? 
err : 0; ++} ++ ++static inline struct page *dump_get_page(loff_t loc) ++{ ++ ++ unsigned long page_index = loc >> PAGE_SHIFT; ++ ++ /* todo: complete this to account for ia64/discontig mem */ ++ /* todo: and to check for validity, ram page, no i/o mem etc */ ++ /* need to use pfn/physaddr equiv of kern_addr_valid */ ++ ++ /* Important: ++ * On ARM/XScale system, the physical address starts from ++ * PHYS_OFFSET, and it maybe the situation that PHYS_OFFSET != 0. ++ * For example on Intel's PXA250, PHYS_OFFSET = 0xa0000000. And the ++ * page index starts from PHYS_PFN_OFFSET. When configuring ++ * filter, filter->start is assigned to 0 in dump_generic_configure. ++ * Here we want to adjust it by adding PHYS_PFN_OFFSET to it! ++ */ ++#ifdef CONFIG_ARM ++ page_index += PHYS_PFN_OFFSET; ++#endif ++ if (__dump_page_valid(page_index)) ++ return pfn_to_page(page_index); ++ else ++ return NULL; ++ ++} ++ ++/* Default iterator: for singlestage and stage 1 of soft-boot dumping */ ++/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */ ++int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long), ++ struct dump_data_filter *filter) ++{ ++ /* Todo : fix unit, type */ ++ loff_t loc, start, end; ++ int i, count = 0, err = 0; ++ struct page *page; ++ ++ /* Todo: Add membanks code */ ++ /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */ ++ ++ for (i = 0; i < filter->num_mbanks; i++) { ++ start = filter->start[i]; ++ end = filter->end[i]; ++ for (loc = start; loc < end; loc += DUMP_PAGE_SIZE) { ++ dump_config.dumper->curr_loc = loc; ++ page = dump_get_page(loc); ++ if (page && filter->selector(pass, ++ (unsigned long) page, DUMP_PAGE_SIZE)) { ++ if ((err = action((unsigned long)page, ++ DUMP_PAGE_SIZE))) { ++ printk("dump_page_iterator: err %d for " ++ "loc 0x%llx, in pass %d\n", ++ err, loc, pass); ++ return err ? err : count; ++ } else ++ count++; ++ } ++ } ++ } ++ ++ return err ? err : count; ++} ++ ++/* ++ * Base function that saves the selected block of data in the dump ++ * Action taken when iterator decides that data needs to be saved ++ */ ++int dump_generic_save_data(unsigned long loc, unsigned long sz) ++{ ++ void *buf; ++ void *dump_buf = dump_config.dumper->dump_buf; ++ int left, bytes, ret; ++ ++ if ((ret = dump_add_data(loc, sz))) { ++ return ret; ++ } ++ buf = dump_config.dumper->curr_buf; ++ ++ /* If we've filled up the buffer write it out */ ++ if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) { ++ bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE); ++ if (bytes < DUMP_BUFFER_SIZE) { ++ printk("dump_write_buffer failed %d\n", bytes); ++ return bytes ? 
-ENOSPC : bytes; ++ } ++ ++ left -= bytes; ++ ++ /* -- A few chores to do from time to time -- */ ++ dump_config.dumper->count++; ++ ++ if (!(dump_config.dumper->count & 0x3f)) { ++ /* Update the header every one in a while */ ++ memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE); ++ if ((ret = dump_update_header()) < 0) { ++ /* issue warning */ ++ return ret; ++ } ++ printk("."); ++ ++ touch_nmi_watchdog(); ++ } else if (!(dump_config.dumper->count & 0x7)) { ++ /* Show progress so the user knows we aren't hung */ ++ dump_speedo(dump_config.dumper->count >> 3); ++ } ++ /* Todo: Touch/Refresh watchdog */ ++ ++ /* --- Done with periodic chores -- */ ++ ++ /* ++ * extra bit of copying to simplify verification ++ * in the second kernel boot based scheme ++ */ ++ memcpy(dump_buf - DUMP_PAGE_SIZE, dump_buf + ++ DUMP_BUFFER_SIZE - DUMP_PAGE_SIZE, DUMP_PAGE_SIZE); ++ ++ /* now adjust the leftover bits back to the top of the page */ ++ /* this case would not arise during stage 2 (passthru) */ ++ memset(dump_buf, 'z', DUMP_BUFFER_SIZE); ++ if (left) { ++ memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left); ++ } ++ buf -= DUMP_BUFFER_SIZE; ++ dump_config.dumper->curr_buf = buf; ++ } ++ ++ return 0; ++} ++ ++int dump_generic_skip_data(unsigned long loc, unsigned long sz) ++{ ++ /* dummy by default */ ++ return 0; ++} ++ ++/* ++ * Common low level routine to write a buffer to current dump device ++ * Expects checks for space etc to have been taken care of by the caller ++ * Operates serially at the moment for simplicity. ++ * TBD/Todo: Consider batching for improved throughput ++ */ ++int dump_ll_write(void *buf, unsigned long len) ++{ ++ long transferred = 0, last_transfer = 0; ++ int ret = 0; ++ ++ /* make sure device is ready */ ++ while ((ret = dump_dev_ready(NULL)) == -EAGAIN); ++ if (ret < 0) { ++ printk("dump_dev_ready failed !err %d\n", ret); ++ return ret; ++ } ++ ++ while (len) { ++ if ((last_transfer = dump_dev_write(buf, len)) <= 0) { ++ ret = last_transfer; ++ printk("dump_dev_write failed !err %d\n", ++ ret); ++ break; ++ } ++ /* wait till complete */ ++ while ((ret = dump_dev_ready(buf)) == -EAGAIN) ++ cpu_relax(); ++ ++ if (ret < 0) { ++ printk("i/o failed !err %d\n", ret); ++ break; ++ } ++ ++ len -= last_transfer; ++ buf += last_transfer; ++ transferred += last_transfer; ++ } ++ return (ret < 0) ? ret : transferred; ++} ++ ++/* default writeout routine for single dump device */ ++/* writes out the dump data ensuring enough space is left for the end marker */ ++int dump_generic_write_buffer(void *buf, unsigned long len) ++{ ++ long written = 0; ++ int err = 0; ++ ++ /* check for space */ ++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len + ++ 2*DUMP_BUFFER_SIZE)) < 0) { ++ printk("dump_write_buffer: insuff space after offset 0x%llx\n", ++ dump_config.dumper->curr_offset); ++ return err; ++ } ++ /* alignment check would happen as a side effect of this */ ++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0) ++ return err; ++ ++ written = dump_ll_write(buf, len); ++ ++ /* all or none */ ++ ++ if (written < len) ++ written = written ? 
-ENOSPC : written; ++ else ++ dump_config.dumper->curr_offset += len; ++ ++ return written; ++} ++ ++int dump_generic_configure(unsigned long devid) ++{ ++ struct dump_dev *dev = dump_config.dumper->dev; ++ struct dump_data_filter *filter; ++ void *buf; ++ int ret = 0; ++ ++ /* Allocate the dump buffer and initialize dumper state */ ++ /* Assume that we get aligned addresses */ ++ if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE))) ++ return -ENOMEM; ++ ++ if ((unsigned long)buf & (PAGE_SIZE - 1)) { ++ /* sanity check for page aligned address */ ++ dump_free_mem(buf); ++ return -ENOMEM; /* fixme: better error code */ ++ } ++ ++ /* Initialize the rest of the fields */ ++ dump_config.dumper->dump_buf = buf + DUMP_PAGE_SIZE; ++ dumper_reset(); ++ ++ /* Open the dump device */ ++ if (!dev) ++ return -ENODEV; ++ ++ if ((ret = dev->ops->open(dev, devid))) { ++ return ret; ++ } ++ ++ /* Initialise the memory ranges in the dump filter */ ++ for (filter = dump_config.dumper->filter ;filter->selector; filter++) { ++ if (!filter->start[0] && !filter->end[0]) { ++ pg_data_t *pgdat; ++ int i = 0; ++ for_each_pgdat(pgdat) { ++ filter->start[i] = ++ (loff_t)pgdat->node_start_pfn << PAGE_SHIFT; ++ filter->end[i] = ++ (loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT; ++ i++; ++ } ++ filter->num_mbanks = i; ++ } ++ } ++ ++ return 0; ++} ++ ++int dump_generic_unconfigure(void) ++{ ++ struct dump_dev *dev = dump_config.dumper->dev; ++ void *buf = dump_config.dumper->dump_buf; ++ int ret = 0; ++ ++ pr_debug("Generic unconfigure\n"); ++ /* Close the dump device */ ++ if (dev && (ret = dev->ops->release(dev))) ++ return ret; ++ ++ printk("Closed dump device\n"); ++ ++ if (buf) ++ dump_free_mem((buf - DUMP_PAGE_SIZE)); ++ ++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL; ++ pr_debug("Released dump buffer\n"); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_DISCONTIGMEM ++ ++void dump_reconfigure_mbanks(void) ++{ ++ pg_data_t *pgdat; ++ loff_t start, end, loc, loc_end; ++ int i=0; ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ ++ for_each_pgdat(pgdat) { ++ ++ start = (loff_t)(pgdat->node_start_pfn << PAGE_SHIFT); ++ end = ((loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT); ++ for(loc = start; loc < end; loc += (DUMP_PAGE_SIZE)) { ++ ++ if(!(__dump_page_valid(loc >> PAGE_SHIFT))) ++ continue; ++ ++ /* We found a valid page. 
This is the start */ ++ filter->start[i] = loc; ++ ++ /* Now loop here till you find the end */ ++ for(loc_end = loc; loc_end < end; loc_end += (DUMP_PAGE_SIZE)) { ++ ++ if(__dump_page_valid(loc_end >> PAGE_SHIFT)) { ++ /* This page could very well be the last page */ ++ filter->end[i] = loc_end; ++ continue; ++ } ++ break; ++ } ++ i++; ++ loc = loc_end; ++ } ++ } ++ filter->num_mbanks = i; ++ ++ /* Propagate memory bank information to other filters */ ++ for (filter = dump_config.dumper->filter, filter++ ;filter->selector; filter++) { ++ for(i = 0; i < dump_config.dumper->filter->num_mbanks; i++) { ++ filter->start[i] = dump_config.dumper->filter->start[i]; ++ filter->end[i] = dump_config.dumper->filter->end[i]; ++ filter->num_mbanks = dump_config.dumper->filter->num_mbanks; ++ } ++ } ++} ++#endif ++ ++/* Set up the default dump scheme */ ++ ++struct dump_scheme_ops dump_scheme_singlestage_ops = { ++ .configure = dump_generic_configure, ++ .unconfigure = dump_generic_unconfigure, ++ .sequencer = dump_generic_sequencer, ++ .iterator = dump_page_iterator, ++ .save_data = dump_generic_save_data, ++ .skip_data = dump_generic_skip_data, ++ .write_buffer = dump_generic_write_buffer, ++}; ++ ++struct dump_scheme dump_scheme_singlestage = { ++ .name = "single-stage", ++ .ops = &dump_scheme_singlestage_ops ++}; ++ ++/* The single stage dumper comprising all these */ ++struct dumper dumper_singlestage = { ++ .name = "single-stage", ++ .scheme = &dump_scheme_singlestage, ++ .fmt = &dump_fmt_lcrash, ++ .compress = &dump_none_compression, ++ .filter = dump_filter_table, ++ .dev = NULL, ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_gzip.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_gzip.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_gzip.c 2005-04-05 16:47:53.937206016 +0800 +@@ -0,0 +1,174 @@ ++/* ++ * GZIP Compression functions for kernel crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Copyright 2001 Matt D. Robinson. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* header files */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void *deflate_workspace; ++static unsigned long workspace_paddr[2]; ++ ++static u8 *safety_buffer; ++ ++/* ++ * Name: dump_compress_gzip() ++ * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the. ++ * deflate functions similar to what's used in PPP). ++ */ ++static u32 ++dump_compress_gzip(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ /* error code and dump stream */ ++ int err; ++ z_stream dump_stream; ++ struct page *pg = (struct page *)loc; ++ unsigned long paddr = page_to_pfn(pg) << PAGE_SHIFT; ++ static int warning = 0; ++ ++ dump_stream.workspace = deflate_workspace; ++ if ((paddr == workspace_paddr[0]) || (paddr == workspace_paddr[1])) { ++ /* ++ * This page belongs to deflate_workspace used as temporary ++ * buffer for compression. Hence, dump them without compression. 
++ */ ++ return(0); ++ } ++ if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) { ++ /* fall back to RLE compression */ ++ printk("dump_compress_gzip(): zlib_deflateInit() " ++ "failed (%d)!\n", err); ++ return 0; ++ } ++ ++ /* copy the old page to the safety buffer */ ++ if (oldsize <= DUMP_PAGE_SIZE) { ++ memcpy(safety_buffer, old, oldsize); ++ dump_stream.next_in = (u8 *) safety_buffer; ++ } else { ++ if (!warning) { ++ printk("dump_compress_gzip oversize input: %d\n", ++ oldsize); ++ warning++; ++ } ++ dump_stream.next_in = (u8 *) old; ++ } ++ ++ /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */ ++ dump_stream.avail_in = oldsize; ++ ++ /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */ ++ dump_stream.next_out = new; ++ dump_stream.avail_out = newsize; ++ ++ /* deflate the page -- check for error */ ++ err = zlib_deflate(&dump_stream, Z_FINISH); ++ if (err != Z_STREAM_END) { ++ /* zero is return code here */ ++ (void)zlib_deflateEnd(&dump_stream); ++ printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n", ++ err); ++ return 0; ++ } ++ ++ /* let's end the deflated compression stream */ ++ if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) { ++ printk("dump_compress_gzip(): zlib_deflateEnd() " ++ "failed (%d)!\n", err); ++ } ++ ++ /* return the compressed byte total (if it's smaller) */ ++ if (dump_stream.total_out >= oldsize) { ++ return oldsize; ++ } ++ return dump_stream.total_out; ++} ++ ++/* setup the gzip compression functionality */ ++static struct __dump_compress dump_gzip_compression = { ++ .compress_type = DUMP_COMPRESS_GZIP, ++ .compress_func = dump_compress_gzip, ++ .compress_name = "GZIP", ++}; ++ ++/* ++ * Name: dump_compress_gzip_init() ++ * Func: Initialize gzip as a compression mechanism. ++ */ ++static int __init ++dump_compress_gzip_init(void) ++{ ++ struct page *pg; ++ ++ deflate_workspace = vmalloc(zlib_deflate_workspacesize()); ++ if (!deflate_workspace) { ++ printk("dump_compress_gzip_init(): Failed to " ++ "alloc %d bytes for deflate workspace\n", ++ zlib_deflate_workspacesize()); ++ return -ENOMEM; ++ } ++ /* ++ * Need to find the (workspace) pages that are used for compression. ++ * Even though zlib_deflate_workspacesize() is roughly 64 pages ++ * (the exact size depends on the arch), only the first 2 pages are ++ * used here. Hence, record the physical addresses of these 2 pages ++ * and use them to avoid compressing those pages. ++ */ ++ pg = vmalloc_to_page(deflate_workspace); ++ workspace_paddr[0] = page_to_pfn(pg) << PAGE_SHIFT; ++ pg = vmalloc_to_page(deflate_workspace + DUMP_PAGE_SIZE); ++ workspace_paddr[1] = page_to_pfn(pg) << PAGE_SHIFT; ++ ++ /* Eliminate the possibility of real data getting a compression ++ * failure. ++ */ ++ ++ if (!(safety_buffer = (void *)__get_free_pages(GFP_KERNEL, ++ get_order(DUMP_PAGE_SIZE)))) ++ return -ENOMEM; ++ ++ printk("dump gzip safety buffer: %p, %d\n", safety_buffer, ++ (int)DUMP_PAGE_SIZE); ++ ++ dump_register_compression(&dump_gzip_compression); ++ return 0; ++} ++ ++/* ++ * Name: dump_compress_gzip_cleanup() ++ * Func: Remove gzip as a compression mechanism.
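The deflate call sequence above is the standard one-shot zlib pattern; in userland (where lcrash would decompress the pages again) the same shape looks like this, the only real difference being that the kernel variant supplies its own workspace instead of zalloc/zfree. A self-contained compression sketch against ordinary zlib:

#include <string.h>
#include <zlib.h>

/* Returns the compressed size, or srclen when compression failed or
 * did not shrink the data (mirroring dump_compress_gzip() above). */
static unsigned long one_shot_deflate(const unsigned char *src,
				      unsigned long srclen,
				      unsigned char *dst,
				      unsigned long dstlen)
{
	z_stream s;

	memset(&s, 0, sizeof(s));	/* zalloc/zfree/opaque = NULL -> defaults */
	if (deflateInit(&s, Z_BEST_COMPRESSION) != Z_OK)
		return srclen;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	if (deflate(&s, Z_FINISH) != Z_STREAM_END) {	/* all-or-nothing */
		deflateEnd(&s);
		return srclen;
	}
	deflateEnd(&s);
	return (s.total_out < srclen) ? s.total_out : srclen;
}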
++ */ ++static void __exit ++dump_compress_gzip_cleanup(void) ++{ ++ vfree(deflate_workspace); ++ if (safety_buffer) { ++ free_pages((unsigned long)safety_buffer, ++ get_order(DUMP_PAGE_SIZE)); ++ safety_buffer = NULL; ++ } ++ ++ dump_unregister_compression(DUMP_COMPRESS_GZIP); ++} ++ ++/* module initialization */ ++module_init(dump_compress_gzip_init); ++module_exit(dump_compress_gzip_cleanup); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Gzip compression module for crash dump driver"); +Index: linux-2.6.10/drivers/dump/dump_filters.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_filters.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_filters.c 2005-04-05 16:47:53.942205256 +0800 +@@ -0,0 +1,143 @@ ++/* ++ * Default filters to select data to dump for various passes. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote default dump selection logic to generic dump ++ * method interfaces ++ * Derived from a portion of dump_base.c created by ++ * Matt Robinson ) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * Used during single-stage dumping and during stage 1 of the 2-stage scheme ++ * (Stage 2 of the 2-stage scheme uses the fully transparent filters ++ * i.e. passthru filters in dump_overlay.c) ++ * ++ * Future: Custom selective dump may involve a different set of filters. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */ ++static unsigned long bootmap_pages; ++ ++/* Copied from mm/bootmem.c - FIXME */ ++/* return the number of _pages_ that will be allocated for the boot bitmap */ ++void dump_calc_bootmap_pages (void) ++{ ++ unsigned long mapsize; ++ unsigned long pages = num_physpages; ++ ++ mapsize = (pages+7)/8; ++ mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; ++ mapsize >>= PAGE_SHIFT; ++ bootmap_pages = mapsize + DUMP_PFN_SAFETY_MARGIN + 1; ++} ++ ++ ++/* temporary */ ++extern unsigned long min_low_pfn; ++ ++ ++int dump_low_page(struct page *p) ++{ ++ return ((page_to_pfn(p) >= min_low_pfn) && ++ (page_to_pfn(p) < (min_low_pfn + bootmap_pages))); ++} ++ ++static inline int kernel_page(struct page *p) ++{ ++ /* FIXME: Need to exclude hugetlb pages. 
Clue: reserved but inuse */ ++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p)); ++} ++ ++static inline int user_page(struct page *p) ++{ ++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); ++} ++ ++static inline int unreferenced_page(struct page *p) ++{ ++ return !PageInuse(p) && !PageReserved(p); ++} ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ /* if any of the pages is a kernel page, select this set */ ++ while (sz) { ++ if (dump_low_page(page) || kernel_page(page)) ++ return 1; ++ sz -= PAGE_SIZE; ++ page++; ++ } ++ return 0; ++} ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ int ret = 0; ++ /* select if the set has any user page, and no kernel pages */ ++ while (sz) { ++ if (user_page(page) && !dump_low_page(page)) { ++ ret = 1; ++ } else if (kernel_page(page) || dump_low_page(page)) { ++ return 0; ++ } ++ page++; ++ sz -= PAGE_SIZE; ++ } ++ return ret; ++} ++ ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ ++ /* select if the set does not have any used pages */ ++ while (sz) { ++ if (!unreferenced_page(page) || dump_low_page(page)) { ++ return 0; ++ } ++ page++; ++ sz -= PAGE_SIZE; ++ } ++ return 1; ++} ++ ++/* dummy: last (non-existent) pass */ ++int dump_filter_none(int pass, unsigned long loc, unsigned long sz) ++{ ++ return 0; ++} ++ ++/* TBD: resolve level bitmask ? */ ++struct dump_data_filter dump_filter_table[] = { ++ { .name = "kern", .selector = dump_filter_kernpages, ++ .level_mask = DUMP_MASK_KERN}, ++ { .name = "user", .selector = dump_filter_userpages, ++ .level_mask = DUMP_MASK_USED}, ++ { .name = "unused", .selector = dump_filter_unusedpages, ++ .level_mask = DUMP_MASK_UNUSED}, ++ { .name = "none", .selector = dump_filter_none, ++ .level_mask = DUMP_MASK_REST}, ++ { .name = "", .selector = NULL, .level_mask = 0} ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_ppc64.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_ppc64.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_ppc64.c 2005-04-05 16:47:53.931206928 +0800 +@@ -0,0 +1,410 @@ ++/* ++ * Architecture specific (ppc64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) ++ * Copyright 2000 TurboLinux, Inc. All rights reserved. ++ * Copyright 2003, 2004 IBM Corporation ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. 
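Putting the filter table together with the sequencer in dump_scheme.c: a page set is written in a given pass only when that pass's selector claims it and the filter's level_mask is enabled in dump_config.level. Condensed into a sketch (the real sequencer also runs dump_skip_data() for disabled passes so accounting stays consistent):

/* Sketch: the save/skip decision made per pass by
 * dump_generic_sequencer() using dump_filter_table above. */
static int should_save(int pass, unsigned long loc, unsigned long sz)
{
	struct dump_data_filter *f = &dump_filter_table[pass];

	if (!(f->level_mask & dump_config.level))
		return 0;		/* whole pass disabled by dump_level */
	return f->selector(pass, loc, sz);	/* e.g. dump_filter_kernpages() */
}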
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++#include ++#endif ++ ++extern cpumask_t irq_affinity[]; ++ ++static cpumask_t saved_affinity[NR_IRQS]; ++ ++static __s32 saved_irq_count; /* saved preempt_count() flags */ ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ return 0; ++ ++ ptr = (void *)vmalloc(THREAD_SIZE * num_possible_cpus()); ++ if (!ptr) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = ++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) { ++ vfree((void*)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++#ifdef CONFIG_SMP ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++ ++extern void stop_this_cpu(void *); ++static int ++dump_ipi_handler(struct pt_regs *regs) ++{ ++ int cpu = smp_processor_id(); ++ ++ if (!dump_expect_ipi[cpu]) ++ return 0; ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++ level_changed: ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(NULL); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++/* save registers on other processors ++ * If the other cpus don't respond we simply do not get their states. ++ */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus()-1; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ for (i = 0; i < NR_CPUS; i++) ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ ++ printk(KERN_ALERT "sending IPI to other cpus...\n"); ++ dump_send_ipi(dump_ipi_handler); ++ /* ++ * may be we dont need to wait for IPI to be processed. ++ * just write out the header at the end of dumping, if ++ * this IPI is not processed until then, there probably ++ * is a problem and we just fail to capture state of ++ * other cpus. ++ * However, we will wait 10 secs for other CPUs to respond. ++ * If not, proceed the dump process even though we failed ++ * to capture other CPU states. ++ */ ++ i = 10000; /* wait max of 10 seconds */ ++ while ((atomic_read(&waiting_for_dump_ipi) > 0) && (--i > 0)) { ++ barrier(); ++ mdelay(1); ++ } ++ printk(KERN_ALERT "done waiting: %d cpus not responding\n", ++ atomic_read(&waiting_for_dump_ipi)); ++ dump_send_ipi(NULL); /* clear handler */ ++ } ++} ++ ++/* ++ * Restore old irq affinities. 
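The CPU rendezvous in __dump_save_other_cpus() above reduces to a bounded countdown: the initiator arms a counter, sends the IPI, and polls with mdelay() because nothing may sleep on the crash path; a wedged CPU therefore costs at most ten seconds, after which its state is simply missing from the dump. Condensed, with names as in the code above:

/* initiator side */
atomic_set(&waiting_for_dump_ipi, num_online_cpus() - 1);
dump_send_ipi(dump_ipi_handler);	/* responders save state, then atomic_dec() */

for (i = 10000; atomic_read(&waiting_for_dump_ipi) > 0 && --i > 0; )
	mdelay(1);			/* poll; sleeping is not an option here */

dump_send_ipi(NULL);			/* clear handler; stragglers are ignored */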
++ */ ++static void ++__dump_reset_irq_affinity(void) ++{ ++ int i; ++ irq_desc_t *irq_d; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ ++ for_each_irq(i) { ++ irq_d = get_irq_desc(i); ++ if (irq_d->handler == NULL) { ++ continue; ++ } ++ if (irq_d->handler->set_affinity != NULL) { ++ irq_d->handler->set_affinity(i, saved_affinity[i]); ++ } ++ } ++} ++ ++/* ++ * Routine to save the old irq affinities and change affinities of all irqs to ++ * the dumping cpu. ++ * ++ * NB: Need to be expanded to multiple nodes. ++ */ ++static void ++__dump_set_irq_affinity(void) ++{ ++ int i; ++ cpumask_t cpu = CPU_MASK_NONE; ++ irq_desc_t *irq_d; ++ ++ cpu_set(smp_processor_id(), cpu); ++ ++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); ++ ++ for_each_irq(i) { ++ irq_d = get_irq_desc(i); ++ if (irq_d->handler == NULL) { ++ continue; ++ } ++ irq_affinity[i] = cpu; ++ if (irq_d->handler->set_affinity != NULL) { ++ irq_d->handler->set_affinity(i, irq_affinity[i]); ++ } ++ } ++} ++#else /* !CONFIG_SMP */ ++#define __dump_save_other_cpus() do { } while (0) ++#define __dump_set_irq_affinity() do { } while (0) ++#define __dump_reset_irq_affinity() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++void ++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs) ++{ ++ if (regs) { ++ memcpy(dest_regs, regs, sizeof(struct pt_regs)); ++ } ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs); ++ ++ /* take a snapshot of the stack */ ++ /* doing this enables us to tolerate slight drifts on this cpu */ ++ ++ if (dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info); ++} ++ ++/* ++ * Name: __dump_configure_header() ++ * Func: Configure the dump header with all proper values. ++ */ ++int ++__dump_configure_header(const struct pt_regs *regs) ++{ ++ return (0); ++} ++ ++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++int ++kdb_sysdump(int argc, const char **argv, const char **envp, struct pt_regs *regs) ++{ ++ kdb_printf("Dumping to disk...\n"); ++ dump("dump from kdb", regs); ++ kdb_printf("Dump Complete\n"); ++ return 0; ++} ++#endif ++ ++/* ++ * Name: __dump_init() ++ * Func: Initialize the dumping routine process. This is in case ++ * it's necessary in the future. ++ */ ++void ++__dump_init(uint64_t local_memory_start) ++{ ++#if defined(FIXME) && defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++ /* This won't currently work because interrupts are off in kdb ++ * and the dump process doesn't understand how to recover. ++ */ ++ /* ToDo: add a command to query/set dump configuration */ ++ kdb_register_repeat("sysdump", kdb_sysdump, "", "use lkcd to dump the system to disk (if configured)", 0, KDB_REPEAT_NONE); ++#endif ++ ++ /* return */ ++ return; ++} ++ ++/* ++ * Name: __dump_open() ++ * Func: Open the dump device (architecture specific). This is in ++ * case it's necessary in the future. ++ */ ++void ++__dump_open(void) ++{ ++ alloc_dha_stack(); ++} ++ ++ ++/* ++ * Name: __dump_cleanup() ++ * Func: Free any architecture specific data structures. This is called ++ * when the dump module is being removed. 
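One practical consequence of __dump_save_context() above: because dha_stack[cpu] holds a THREAD_SIZE byte copy of each CPU's task stack, an offline consumer can translate any on-stack address from the crashed context into the snapshot and walk frames there. A sketch, assuming stack_base is the original stack start that was copied (STACK_START_POSITION(tsk) at save time):

static void *snap_stack_addr(int cpu, unsigned long addr,
			     unsigned long stack_base)
{
	unsigned long off = addr - stack_base;	/* offset into the stack */

	if (off >= THREAD_SIZE)
		return NULL;			/* not on this task's stack */
	return (void *)((unsigned long)dump_header_asm.dha_stack[cpu] + off);
}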
++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++} ++ ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. ++ * ++ * This is not quite legal of course, as it has the side ++ * effect of making all interrupts & softirqs triggered ++ * while dump is in progress complete before currently ++ * pending softirqs and the currently executing interrupt ++ * code. ++ */ ++static inline void ++irq_bh_save(void) ++{ ++ saved_irq_count = irq_count(); ++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); ++} ++ ++static inline void ++irq_bh_restore(void) ++{ ++ preempt_count() |= saved_irq_count; ++} ++ ++/* ++ * Name: __dump_irq_enable ++ * Func: Reset system so interrupts are enabled. ++ * This is used for dump methods that require interrupts ++ * Eventually, all methods will have interrupts disabled ++ * and this code can be removed. ++ * ++ * Change irq affinities ++ * Re-enable interrupts ++ */ ++int ++__dump_irq_enable(void) ++{ ++ __dump_set_irq_affinity(); ++ irq_bh_save(); ++ local_irq_enable(); ++ return 0; ++} ++ ++/* ++ * Name: __dump_irq_restore ++ * Func: Resume the system state in an architecture-specific way. ++ */ ++void ++__dump_irq_restore(void) ++{ ++ local_irq_disable(); ++ __dump_reset_irq_affinity(); ++ irq_bh_restore(); ++} ++ ++#if 0 ++/* Cheap progress hack. It estimates pages to write and ++ * assumes all pages will go -- so it may get way off. ++ * As the progress is not displayed for other architectures, not used at this ++ * moment. ++ */ ++void ++__dump_progress_add_page(void) ++{ ++ unsigned long total_pages = nr_free_pages() + nr_inactive_pages + nr_active_pages; ++ unsigned int percent = (dump_header.dh_num_dump_pages * 100) / total_pages; ++ char buf[30]; ++ ++ if (percent > last_percent && percent <= 100) { ++ sprintf(buf, "Dump %3d%% ", percent); ++ ppc64_dump_msg(0x2, buf); ++ last_percent = percent; ++ } ++ ++} ++#endif ++ ++extern int dump_page_is_ram(unsigned long); ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return dump_page_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) ++{ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_methods.h +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_methods.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_methods.h 2005-04-05 16:47:53.930207080 +0800 +@@ -0,0 +1,357 @@ ++/* ++ * Generic interfaces for flexible system dump ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */
++
++#ifndef _LINUX_DUMP_METHODS_H
++#define _LINUX_DUMP_METHODS_H
++
++/*
++ * Inspired by Matt Robinson's suggestion of introducing dump
++ * methods as a way to enable different crash dump facilities to
++ * coexist where each employs its own scheme or dumping policy.
++ *
++ * The code here creates a framework for flexible dump by defining
++ * a set of methods and providing associated helpers that differentiate
++ * between the underlying mechanism (how to dump), overall scheme
++ * (sequencing of stages and data dumped and associated quiescing),
++ * output format (what the dump output looks like), target type
++ * (where to save the dump; see dumpdev.h), and selection policy
++ * (state/data to dump).
++ *
++ * These sets of interfaces can be mixed and matched to build a
++ * dumper suitable for a given situation, allowing for
++ * flexibility as well as an appropriate degree of code reuse.
++ * For example, all features and options of lkcd (including
++ * granular selective dumping in the near future) should be
++ * available even when, say, the 2-stage soft-boot based mechanism
++ * is used for taking disruptive dumps.
++ *
++ * Todo: Additionally, modules or drivers may supply their own
++ * custom dumpers which extend dump with module-specific
++ * information or hardware state, and can even tweak the
++ * mechanism when it comes to saving state relevant to
++ * them.
++ */
++
++#include
++#include
++#include
++#include
++#include /* get_order */
++
++#define MAX_PASSES 6
++#define MAX_DEVS 4
++
++
++/* To customise selection of pages to be dumped in a given pass/group */
++struct dump_data_filter {
++	char name[32];
++	int (*selector)(int, unsigned long, unsigned long);
++	ulong level_mask; /* dump level(s) for which this filter applies */
++	loff_t start[MAX_NUMNODES], end[MAX_NUMNODES]; /* location range applicable */
++	ulong num_mbanks; /* Number of memory banks.
++			     Greater than one for discontig memory (NUMA) */
++};
++
++
++/*
++ * Determined by the kind of dump mechanism and appropriate
++ * overall scheme
++ */
++struct dump_scheme_ops {
++	/* sets aside memory, inits data structures etc */
++	int (*configure)(unsigned long devid);
++	/* releases resources */
++	int (*unconfigure)(void);
++
++	/* ordering of passes, invoking iterator */
++	int (*sequencer)(void);
++	/* iterates over system data, selects and acts on data to dump */
++	int (*iterator)(int, int (*)(unsigned long, unsigned long),
++			struct dump_data_filter *);
++	/* action when data is selected for dump */
++	int (*save_data)(unsigned long, unsigned long);
++	/* action when data is to be excluded from dump */
++	int (*skip_data)(unsigned long, unsigned long);
++	/* policies for space, multiple dump devices etc */
++	int (*write_buffer)(void *, unsigned long);
++};
++
++struct dump_scheme {
++	/* the name serves as an anchor to locate the scheme after reboot */
++	char name[32];
++	struct dump_scheme_ops *ops;
++	struct list_head list;
++};
++
++/* Quiescing/Silence levels (controls IPI callback behaviour) */
++extern enum dump_silence_levels {
++	DUMP_SOFT_SPIN_CPUS = 1,
++	DUMP_HARD_SPIN_CPUS = 2,
++	DUMP_HALT_CPUS = 3,
++} dump_silence_level;
++
++/* determined by the dump (file) format */
++struct dump_fmt_ops {
++	/* build header */
++	int (*configure_header)(const char *, const struct pt_regs *);
++	int (*update_header)(void); /* update header and write it out */
++	/* save curr context */
++	void (*save_context)(int, const struct pt_regs *,
++			     struct task_struct *);
++	/* typically called by the save_data action */
++	/* add formatted data to the dump buffer */
++	int (*add_data)(unsigned long, unsigned long);
++	int (*update_end_marker)(void);
++};
++
++struct dump_fmt {
++	unsigned long magic;
++	char name[32];	/* lcrash, crash, elf-core etc */
++	struct dump_fmt_ops *ops;
++	struct list_head list;
++};
++
++/*
++ * Modules will be able to add their own data capture schemes by
++ * registering their own dumpers. Typically they would use the
++ * primary dumper as a template and tune it with their routines.
++ * Still Todo.
++ */ ++ ++/* The combined dumper profile (mechanism, scheme, dev, fmt) */ ++struct dumper { ++ char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */ ++ struct dump_scheme *scheme; ++ struct dump_fmt *fmt; ++ struct __dump_compress *compress; ++ struct dump_data_filter *filter; ++ struct dump_dev *dev; ++ /* state valid only for active dumper(s) - per instance */ ++ /* run time state/context */ ++ int curr_pass; ++ unsigned long count; ++ loff_t curr_offset; /* current logical offset into dump device */ ++ loff_t curr_loc; /* current memory location */ ++ void *curr_buf; /* current position in the dump buffer */ ++ void *dump_buf; /* starting addr of dump buffer */ ++ int header_dirty; /* whether the header needs to be written out */ ++ int header_len; ++ struct list_head dumper_list; /* links to other dumpers */ ++}; ++ ++/* Starting point to get to the current configured state */ ++struct dump_config { ++ ulong level; ++ ulong flags; ++ struct dumper *dumper; ++ unsigned long dump_device; ++ unsigned long dump_addr; /* relevant only for in-memory dumps */ ++ struct list_head dump_dev_list; ++}; ++ ++extern struct dump_config dump_config; ++ ++/* Used to save the dump config across a reboot for 2-stage dumps: ++ * ++ * Note: The scheme, format, compression and device type should be ++ * registered at bootup, for this config to be sharable across soft-boot. ++ * The function addresses could have changed and become invalid, and ++ * need to be set up again. ++ */ ++struct dump_config_block { ++ u64 magic; /* for a quick sanity check after reboot */ ++ struct dump_memdev memdev; /* handle to dump stored in memory */ ++ struct dump_config config; ++ struct dumper dumper; ++ struct dump_scheme scheme; ++ struct dump_fmt fmt; ++ struct __dump_compress compress; ++ struct dump_data_filter filter_table[MAX_PASSES]; ++ struct dump_anydev dev[MAX_DEVS]; /* target dump device */ ++}; ++ ++ ++/* Wrappers that invoke the methods for the current (active) dumper */ ++ ++/* Scheme operations */ ++ ++static inline int dump_sequencer(void) ++{ ++ return dump_config.dumper->scheme->ops->sequencer(); ++} ++ ++static inline int dump_iterator(int pass, int (*action)(unsigned long, ++ unsigned long), struct dump_data_filter *filter) ++{ ++ return dump_config.dumper->scheme->ops->iterator(pass, action, filter); ++} ++ ++#define dump_save_data dump_config.dumper->scheme->ops->save_data ++#define dump_skip_data dump_config.dumper->scheme->ops->skip_data ++ ++static inline int dump_write_buffer(void *buf, unsigned long len) ++{ ++ return dump_config.dumper->scheme->ops->write_buffer(buf, len); ++} ++ ++static inline int dump_configure(unsigned long devid) ++{ ++ return dump_config.dumper->scheme->ops->configure(devid); ++} ++ ++static inline int dump_unconfigure(void) ++{ ++ return dump_config.dumper->scheme->ops->unconfigure(); ++} ++ ++/* Format operations */ ++ ++static inline int dump_configure_header(const char *panic_str, ++ const struct pt_regs *regs) ++{ ++ return dump_config.dumper->fmt->ops->configure_header(panic_str, regs); ++} ++ ++static inline void dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk); ++} ++ ++static inline int dump_save_this_cpu(const struct pt_regs *regs) ++{ ++ int cpu = smp_processor_id(); ++ ++ dump_save_context(cpu, regs, current); ++ return 1; ++} ++ ++static inline int dump_update_header(void) ++{ ++ return dump_config.dumper->fmt->ops->update_header(); ++} ++ 
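/*
 * [Editorial sketch -- not part of the patch.] The inline wrappers above
 * are the entire dispatch mechanism: every operation funnels through
 * dump_config.dumper, so swapping the active dumper's scheme or format
 * changes behaviour without touching any caller. Assuming the generic
 * helpers declared further down in this header, a minimal custom scheme
 * might look like the following; the demo_* names are hypothetical, and
 * setup code would still have to attach the scheme to dump_config.dumper.
 */
static struct dump_scheme_ops demo_scheme_ops = {
	.configure	= dump_generic_configure,	/* set aside memory, init structures */
	.unconfigure	= dump_generic_unconfigure,	/* release those resources */
	.sequencer	= dump_generic_sequencer,	/* order the dump passes */
	.iterator	= dump_page_iterator,		/* walk pages, applying the filter */
	.save_data	= dump_generic_save_data,	/* page selected for the dump */
	.skip_data	= dump_generic_skip_data,	/* page excluded from the dump */
	.write_buffer	= dump_generic_write_buffer,	/* write out to the dump target */
};

static struct dump_scheme demo_scheme = {
	.name	= "demo",	/* anchor used to locate the scheme after reboot */
	.ops	= &demo_scheme_ops,
};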
++static inline int dump_update_end_marker(void) ++{ ++ return dump_config.dumper->fmt->ops->update_end_marker(); ++} ++ ++static inline int dump_add_data(unsigned long loc, unsigned long sz) ++{ ++ return dump_config.dumper->fmt->ops->add_data(loc, sz); ++} ++ ++/* Compression operation */ ++static inline int dump_compress_data(char *src, int slen, char *dst, ++ unsigned long loc) ++{ ++ return dump_config.dumper->compress->compress_func(src, slen, ++ dst, DUMP_DPC_PAGE_SIZE, loc); ++} ++ ++ ++/* Prototypes of some default implementations of dump methods */ ++ ++extern struct __dump_compress dump_none_compression; ++ ++/* Default scheme methods (dump_scheme.c) */ ++ ++extern int dump_generic_sequencer(void); ++extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned ++ long), struct dump_data_filter *filter); ++extern int dump_generic_save_data(unsigned long loc, unsigned long sz); ++extern int dump_generic_skip_data(unsigned long loc, unsigned long sz); ++extern int dump_generic_write_buffer(void *buf, unsigned long len); ++extern int dump_generic_configure(unsigned long); ++extern int dump_generic_unconfigure(void); ++#ifdef CONFIG_DISCONTIGMEM ++extern void dump_reconfigure_mbanks(void); ++#endif ++ ++/* Default scheme template */ ++extern struct dump_scheme dump_scheme_singlestage; ++ ++/* Default dump format methods */ ++ ++extern int dump_lcrash_configure_header(const char *panic_str, ++ const struct pt_regs *regs); ++extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk); ++extern int dump_generic_update_header(void); ++extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz); ++extern int dump_lcrash_update_end_marker(void); ++ ++/* Default format (lcrash) template */ ++extern struct dump_fmt dump_fmt_lcrash; ++ ++/* Default dump selection filter table */ ++ ++/* ++ * Entries listed in order of importance and correspond to passes ++ * The last entry (with a level_mask of zero) typically reflects data that ++ * won't be dumped -- this may for example be used to identify data ++ * that will be skipped for certain so the corresponding memory areas can be ++ * utilized as scratch space. ++ */ ++extern struct dump_data_filter dump_filter_table[]; ++ ++/* Some pre-defined dumpers */ ++extern struct dumper dumper_singlestage; ++extern struct dumper dumper_stage1; ++extern struct dumper dumper_stage2; ++ ++/* These are temporary */ ++#define DUMP_MASK_HEADER DUMP_LEVEL_HEADER ++#define DUMP_MASK_KERN DUMP_LEVEL_KERN ++#define DUMP_MASK_USED DUMP_LEVEL_USED ++#define DUMP_MASK_UNUSED DUMP_LEVEL_ALL_RAM ++#define DUMP_MASK_REST 0 /* dummy for now */ ++ ++/* Helpers - move these to dump.h later ? 
*/ ++ ++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs); ++extern int dump_ll_write(void *buf, unsigned long len); ++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page); ++ ++static inline void dumper_reset(void) ++{ ++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf; ++ dump_config.dumper->curr_loc = 0; ++ dump_config.dumper->curr_offset = 0; ++ dump_config.dumper->count = 0; ++ dump_config.dumper->curr_pass = 0; ++} ++ ++/* ++ * May later be moulded to perform boot-time allocations so we can dump ++ * earlier during bootup ++ */ ++static inline void *dump_alloc_mem(unsigned long size) ++{ ++ return (void *) __get_free_pages(GFP_KERNEL, get_order(size)); ++} ++ ++static inline void dump_free_mem(void *buf) ++{ ++ struct page *page; ++ ++ /* ignore reserved pages (e.g. post soft boot stage) */ ++ if (buf && (page = virt_to_page(buf))) { ++ if (PageReserved(page)) ++ return; ++ } ++ /* ++ * Allocated using __get_free_pages(). ++ */ ++ free_pages((unsigned long)buf, ++ get_order(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE)); ++} ++ ++ ++#endif /* _LINUX_DUMP_METHODS_H */ +Index: linux-2.6.10/drivers/dump/Makefile +=================================================================== +--- linux-2.6.10.orig/drivers/dump/Makefile 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/Makefile 2005-04-05 16:47:53.947204496 +0800 +@@ -0,0 +1,22 @@ ++# ++# Makefile for the dump device drivers. ++# ++ ++dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o ++ifeq ($(CONFIG_X86_64),) ++ifeq ($(CONFIG_X86),y) ++dump-$(CONFIG_X86) += dump_i386.o ++endif ++endif ++dump-$(CONFIG_ARM) += dump_arm.o ++dump-$(CONFIG_PPC64) += dump_ppc64.o ++dump-$(CONFIG_X86_64) += dump_x8664.o ++dump-$(CONFIG_IA64) += dump_ia64.o ++dump-$(CONFIG_CRASH_DUMP_MEMDEV) += dump_memdev.o dump_overlay.o ++dump-objs += $(dump-y) ++ ++obj-$(CONFIG_CRASH_DUMP) += dump.o ++obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o ++obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o ++obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o ++obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o +Index: linux-2.6.10/drivers/Makefile +=================================================================== +--- linux-2.6.10.orig/drivers/Makefile 2004-12-25 05:36:00.000000000 +0800 ++++ linux-2.6.10/drivers/Makefile 2005-04-05 16:47:53.950204040 +0800 +@@ -60,3 +60,4 @@ + obj-$(CONFIG_CPU_FREQ) += cpufreq/ + obj-$(CONFIG_MMC) += mmc/ + obj-y += firmware/ ++obj-$(CONFIG_CRASH_DUMP) += dump/ diff --git a/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch b/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch new file mode 100644 index 0000000..a5abf90 --- /dev/null +++ b/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch @@ -0,0 +1,3746 @@ +Index: linux-2.6.10/include/asm-um/archparam-i386.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/archparam-i386.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/include/asm-um/archparam-i386.h 2005-04-05 12:40:36.075903800 +0800 +@@ -10,7 +10,8 @@ + + #include "user.h" + +-#define ELF_PLATFORM "i586" ++extern char * elf_aux_platform; ++#define ELF_PLATFORM (elf_aux_platform) + + #define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) + +@@ -56,15 +57,13 @@ + pr_reg[16] = PT_REGS_SS(regs); \ + } while(0); + +-#if 0 /* Turn this back on when UML has VSYSCALL working */ +-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) +-#else +-#define VSYSCALL_BASE 0 +-#endif + +-#define 
VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) +-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) +-extern void *__kernel_vsyscall; ++extern unsigned long vsyscall_ehdr; ++extern unsigned long vsyscall_end; ++extern unsigned long __kernel_vsyscall; ++ ++#define VSYSCALL_BASE vsyscall_ehdr ++#define VSYSCALL_END vsyscall_end + + /* + * Architecture-neutral AT_ values in 0-17, leave some room +@@ -75,8 +74,10 @@ + + #define ARCH_DLINFO \ + do { \ +- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ +- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++ if ( vsyscall_ehdr ) { \ ++ NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \ ++ } \ + } while (0) + + /* +@@ -87,22 +88,18 @@ + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. + */ +-#if 0 +-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) +-#endif +- +-#undef ELF_CORE_EXTRA_PHDRS ++#define ELF_CORE_EXTRA_PHDRS \ ++ (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 ) + +-#if 0 + #define ELF_CORE_WRITE_EXTRA_PHDRS \ +-do { \ +- const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++if ( vsyscall_ehdr ) { \ ++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ ++ const struct elf_phdr *const phdrp = \ ++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ + int i; \ + Elf32_Off ofs = 0; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ +- struct elf_phdr phdr = vsyscall_phdrs[i]; \ ++ for (i = 0; i < ehdrp->e_phnum; ++i) { \ ++ struct elf_phdr phdr = phdrp[i]; \ + if (phdr.p_type == PT_LOAD) { \ + ofs = phdr.p_offset = offset; \ + offset += phdr.p_filesz; \ +@@ -112,23 +109,19 @@ + phdr.p_paddr = 0; /* match other core phdrs */ \ + DUMP_WRITE(&phdr, sizeof(phdr)); \ + } \ +-} while (0) ++} + #define ELF_CORE_WRITE_EXTRA_DATA \ +-do { \ +- const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++if ( vsyscall_ehdr ) { \ ++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ ++ const struct elf_phdr *const phdrp = \ ++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ + int i; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ +- if (vsyscall_phdrs[i].p_type == PT_LOAD) \ +- DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ +- vsyscall_phdrs[i].p_filesz); \ ++ for (i = 0; i < ehdrp->e_phnum; ++i) { \ ++ if (phdrp[i].p_type == PT_LOAD) \ ++ DUMP_WRITE((void *) phdrp[i].p_vaddr, \ ++ phdrp[i].p_filesz); \ + } \ +-} while (0) +-#endif +- +-#undef ELF_CORE_WRITE_EXTRA_PHDRS +-#undef ELF_CORE_WRITE_EXTRA_DATA ++} + + #define R_386_NONE 0 + #define R_386_32 1 +Index: linux-2.6.10/include/asm-um/elf.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/elf.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/include/asm-um/elf.h 2005-04-05 12:40:36.074903952 +0800 +@@ -3,7 +3,8 @@ + + #include "asm/archparam.h" + +-#define ELF_HWCAP (0) ++extern long elf_aux_hwcap; ++#define ELF_HWCAP (elf_aux_hwcap) + + #define SET_PERSONALITY(ex, ibcs2) do ; while(0) + +Index: linux-2.6.10/include/asm-um/fixmap.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/fixmap.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/asm-um/fixmap.h 2005-04-05 12:40:36.075903800 
+0800 +@@ -3,6 +3,7 @@ + + #include + #include ++#include + + /* + * Here we define all the compile-time 'special' virtual +@@ -34,7 +35,6 @@ + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + #endif +- FIX_VSYSCALL, + __end_of_fixed_addresses + }; + +@@ -68,8 +68,8 @@ + * This is the range that is readable by user mode, and things + * acting like user mode such as get_user_pages. + */ +-#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) +-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) ++#define FIXADDR_USER_START VSYSCALL_BASE ++#define FIXADDR_USER_END VSYSCALL_END + + extern void __this_fixmap_does_not_exist(void); + +Index: linux-2.6.10/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 12:40:36.076903648 +0800 +@@ -139,6 +139,7 @@ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ + #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ + #define TIF_IRET 5 /* return with iret */ ++#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + +@@ -148,12 +149,14 @@ + #define _TIF_NEED_RESCHED (1< + #include + #include ++#include + + /* +- * Used for LDT copy/destruction. ++ * Used for LDT initialization/destruction. You cannot copy an LDT with ++ * init_new_context, since it thinks you are passing it a new LDT and won't ++ * deallocate its old content. + */ + int init_new_context(struct task_struct *tsk, struct mm_struct *mm); + void destroy_context(struct mm_struct *mm); + ++/* LDT initialization for a clean environment - needed for SKAS.*/ ++static inline void init_new_empty_context(struct mm_struct *mm) ++{ ++ init_MUTEX(&mm->context.sem); ++ mm->context.size = 0; ++} ++ ++/* LDT copy for SKAS - for the above problem.*/ ++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm); + + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + { +@@ -29,6 +41,10 @@ + { + int cpu = smp_processor_id(); + ++#ifdef CONFIG_SMP ++ prev = per_cpu(cpu_tlbstate, cpu).active_mm; ++#endif ++ + if (likely(prev != next)) { + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +@@ -50,7 +66,6 @@ + #ifdef CONFIG_SMP + else { + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; +- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled +Index: linux-2.6.10/include/asm-i386/ptrace.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/ptrace.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/ptrace.h 2005-04-05 12:40:36.077903496 +0800 +@@ -64,4 +64,26 @@ + #endif + #endif + ++/*For SKAS3 support.*/ ++#ifndef _LINUX_PTRACE_STRUCT_DEF ++#define _LINUX_PTRACE_STRUCT_DEF ++ ++#define PTRACE_FAULTINFO 52 ++#define PTRACE_SIGPENDING 53 ++#define PTRACE_LDT 54 ++#define PTRACE_SWITCH_MM 55 ++ ++struct ptrace_faultinfo { ++ int is_write; ++ unsigned long addr; ++}; ++ ++struct ptrace_ldt { ++ int func; ++ void *ptr; ++ unsigned long bytecount; ++}; ++ ++#endif /*ifndef _LINUX_PTRACE_STRUCT_DEF*/ ++ + #endif +Index: 
linux-2.6.10/include/asm-i386/desc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-03-31 16:20:09.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 12:40:36.078903344 +0800 +@@ -126,6 +126,9 @@ + put_cpu(); + } + ++extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr, ++ unsigned long bytecount); ++ + #endif /* !__ASSEMBLY__ */ + + #endif +Index: linux-2.6.10/include/linux/ptrace.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ptrace.h 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/include/linux/ptrace.h 2005-04-05 12:40:36.071904408 +0800 +@@ -20,6 +20,7 @@ + #define PTRACE_DETACH 0x11 + + #define PTRACE_SYSCALL 24 ++#define PTRACE_SYSEMU 31 + + /* 0x4200-0x4300 are reserved for architecture-independent additions. */ + #define PTRACE_SETOPTIONS 0x4200 +Index: linux-2.6.10/include/linux/mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mm.h 2005-03-31 16:10:15.000000000 +0800 ++++ linux-2.6.10/include/linux/mm.h 2005-04-05 12:40:36.072904256 +0800 +@@ -625,6 +625,9 @@ + extern struct shrinker *set_shrinker(int, shrinker_t); + extern void remove_shrinker(struct shrinker *shrinker); + ++extern long do_mprotect(struct mm_struct *mm, unsigned long start, ++ size_t len, unsigned long prot); ++ + /* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all +@@ -684,9 +687,15 @@ + + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ++extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flag, ++ unsigned long pgoff); ++static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +- unsigned long flag, unsigned long pgoff); ++ unsigned long flag, unsigned long pgoff) { ++ return __do_mmap_pgoff(current->mm, file, addr, len, prot, flag, pgoff); ++} + + static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +Index: linux-2.6.10/include/linux/proc_mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/proc_mm.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/proc_mm.h 2005-04-05 12:40:36.073904104 +0800 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PROC_MM_H ++#define __PROC_MM_H ++ ++#include "linux/sched.h" ++ ++#define MM_MMAP 54 ++#define MM_MUNMAP 55 ++#define MM_MPROTECT 56 ++#define MM_COPY_SEGMENTS 57 ++ ++struct mm_mmap { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++struct mm_munmap { ++ unsigned long addr; ++ unsigned long len; ++}; ++ ++struct mm_mprotect { ++ unsigned long addr; ++ unsigned long len; ++ unsigned int prot; ++}; ++ ++struct proc_mm_op { ++ int op; ++ union { ++ struct mm_mmap mmap; ++ struct mm_munmap munmap; ++ struct mm_mprotect mprotect; ++ int copy_segments; ++ } u; ++}; ++ ++extern struct mm_struct *proc_mm_get_mm(int fd); ++ ++#endif 
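The proc_mm.h interface above is driven entirely through write(2): a tracer opens /proc/mm to obtain a fresh address space, then writes one struct proc_mm_op per operation (the implementation, write_proc_mm() in mm/proc_mm.c, appears further below). The following userspace sketch is illustrative only, with hypothetical names and minimal error handling; it assumes the MM_* constants and struct proc_mm_op above are visible to userspace, and it sets MAP_FIXED because write_proc_mm() rejects MM_MMAP requests without it.

/* Hypothetical userspace sketch -- not part of the patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/proc_mm.h>	/* struct proc_mm_op, MM_MMAP (assumed exported) */

/* Create a new address space and map one file into it at a fixed address.
 * Returns the /proc/mm fd that identifies the new mm, or -1 on error.
 */
int demo_new_mm_with_mapping(unsigned long addr, unsigned long len, int map_fd)
{
	struct proc_mm_op op;
	int mm_fd = open("/proc/mm", O_WRONLY);	/* the entry is mode 0222 */

	if (mm_fd < 0)
		return -1;

	op.op = MM_MMAP;
	op.u.mmap.addr = addr;			/* must be page aligned */
	op.u.mmap.len = len;
	op.u.mmap.prot = PROT_READ | PROT_WRITE;
	op.u.mmap.flags = MAP_FIXED | MAP_SHARED;	/* MAP_FIXED is mandatory here */
	op.u.mmap.fd = map_fd;
	op.u.mmap.offset = 0;			/* byte offset; the kernel shifts it */

	if (write(mm_fd, &op, sizeof(op)) != (ssize_t) sizeof(op)) {
		close(mm_fd);
		return -1;
	}
	return mm_fd;
}

The descriptor returned here is the handle a SKAS host later passes to PTRACE_SWITCH_MM (see the ptrace.c hunk below) to switch a traced child onto the new address space.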
+Index: linux-2.6.10/lib/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/lib/Kconfig.debug 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/lib/Kconfig.debug 2005-04-05 12:40:36.010913680 +0800 +@@ -23,7 +23,6 @@ + config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on DEBUG_KERNEL && (H8300 || M68KNOMMU || V850) +- depends (USERMODE && MCONSOLE) + help + Enables console device to interpret special characters as + commands to dump state information. +Index: linux-2.6.10/kernel/fork.c +=================================================================== +--- linux-2.6.10.orig/kernel/fork.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/kernel/fork.c 2005-04-05 12:40:36.070904560 +0800 +@@ -927,6 +927,9 @@ + * of CLONE_PTRACE. + */ + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); ++#ifdef TIF_SYSCALL_EMU ++ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); ++#endif + + /* Our parent execution domain becomes current domain + These must match for thread signalling to apply */ +Index: linux-2.6.10/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/mm/mmap.c 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/mm/mmap.c 2005-04-05 12:40:36.013913224 +0800 +@@ -759,11 +759,11 @@ + * The caller must hold down_write(current->mm->mmap_sem). + */ + +-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flags, unsigned long pgoff) ++unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file * file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long pgoff) + { +- struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; +@@ -1037,7 +1037,7 @@ + return error; + } + +-EXPORT_SYMBOL(do_mmap_pgoff); ++EXPORT_SYMBOL(__do_mmap_pgoff); + + /* Get an address range which is currently unmapped. + * For shmat() with addr=0. 
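The mm.h and mmap.c hunks above, and the mprotect.c hunk below, all apply the same refactoring idiom: the workhorse gains an explicit struct mm_struct * parameter, and the old entry point becomes a thin wrapper that passes current->mm, so existing callers compile and behave unchanged while /proc/mm and the SKAS ptrace extensions can drive another process's address space. Reduced to its shape (hypothetical names, a sketch rather than the patch's code):

#include <linux/sched.h>	/* current, struct mm_struct */

/* New workhorse: the target mm is an explicit parameter. */
extern long __do_operation(struct mm_struct *mm, unsigned long arg);

/* Old entry point: now a wrapper, so in-tree callers are untouched. */
static inline long do_operation(unsigned long arg)
{
	return __do_operation(current->mm, arg);
}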
+Index: linux-2.6.10/mm/proc_mm.c +=================================================================== +--- linux-2.6.10.orig/mm/proc_mm.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/mm/proc_mm.c 2005-04-05 12:40:36.014913072 +0800 +@@ -0,0 +1,181 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/mm.h" ++#include "linux/init.h" ++#include "linux/proc_fs.h" ++#include "linux/proc_mm.h" ++#include "linux/file.h" ++#include "linux/mman.h" ++#include "asm/uaccess.h" ++#include "asm/mmu_context.h" ++ ++static struct file_operations proc_mm_fops; ++ ++struct mm_struct *proc_mm_get_mm(int fd) ++{ ++ struct mm_struct *ret = ERR_PTR(-EBADF); ++ struct file *file; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ ret = ERR_PTR(-EINVAL); ++ if(file->f_op != &proc_mm_fops) ++ goto out_fput; ++ ++ ret = file->private_data; ++ out_fput: ++ fput(file); ++ out: ++ return(ret); ++} ++ ++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, ++ unsigned long len, unsigned long prot, ++ unsigned long flags, unsigned long fd, ++ unsigned long pgoff); ++ ++static ssize_t write_proc_mm(struct file *file, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct mm_struct *mm = file->private_data; ++ struct proc_mm_op req; ++ int n, ret; ++ ++ if(count > sizeof(req)) ++ return(-EINVAL); ++ ++ n = copy_from_user(&req, buffer, count); ++ if(n != 0) ++ return(-EFAULT); ++ ++ ret = count; ++ switch(req.op){ ++ case MM_MMAP: { ++ struct mm_mmap *map = &req.u.mmap; ++ ++ /* Nobody ever noticed it, but do_mmap_pgoff() calls ++ * get_unmapped_area() which checks current->mm, if ++ * MAP_FIXED is not set, so mmap() could replace ++ * an old mapping. ++ */ ++ if (! (map->flags & MAP_FIXED)) ++ return(-EINVAL); ++ ++ ret = do_mmap2(mm, map->addr, map->len, map->prot, ++ map->flags, map->fd, map->offset >> PAGE_SHIFT); ++ if((ret & ~PAGE_MASK) == 0) ++ ret = count; ++ ++ break; ++ } ++ case MM_MUNMAP: { ++ struct mm_munmap *unmap = &req.u.munmap; ++ ++ down_write(&mm->mmap_sem); ++ ret = do_munmap(mm, unmap->addr, unmap->len); ++ up_write(&mm->mmap_sem); ++ ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ case MM_MPROTECT: { ++ struct mm_mprotect *protect = &req.u.mprotect; ++ ++ ret = do_mprotect(mm, protect->addr, protect->len, ++ protect->prot); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ ++ case MM_COPY_SEGMENTS: { ++ struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments); ++ ++ if(IS_ERR(from)){ ++ ret = PTR_ERR(from); ++ break; ++ } ++ ++ ret = copy_context(mm, from); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return(ret); ++} ++ ++static int open_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = mm_alloc(); ++ int ret; ++ ++ ret = -ENOMEM; ++ if(mm == NULL) ++ goto out_mem; ++ ++ init_new_empty_context(mm); ++ arch_pick_mmap_layout(mm); ++ ++ spin_lock(&mmlist_lock); ++ list_add(&mm->mmlist, ¤t->mm->mmlist); ++ spin_unlock(&mmlist_lock); ++ ++ file->private_data = mm; ++ ++ return(0); ++ ++ out_mem: ++ return(ret); ++} ++ ++static int release_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = file->private_data; ++ ++ mmput(mm); ++ return(0); ++} ++ ++static struct file_operations proc_mm_fops = { ++ .open = open_proc_mm, ++ .release = release_proc_mm, ++ .write = write_proc_mm, ++}; ++ ++static int make_proc_mm(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = create_proc_entry("mm", 0222, 
&proc_root); ++ if(ent == NULL){ ++ printk("make_proc_mm : Failed to register /proc/mm\n"); ++ return(0); ++ } ++ ent->proc_fops = &proc_mm_fops; ++ ++ return(0); ++} ++ ++__initcall(make_proc_mm); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/mm/mprotect.c +=================================================================== +--- linux-2.6.10.orig/mm/mprotect.c 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/mm/mprotect.c 2005-04-05 12:40:36.011913528 +0800 +@@ -93,19 +93,20 @@ + { + pgd_t *dir; + unsigned long beg = start; ++ struct mm_struct * mm = vma->vm_mm; + +- dir = pgd_offset(current->mm, start); ++ dir = pgd_offset(mm, start); + flush_cache_range(vma, beg, end); + if (start >= end) + BUG(); +- spin_lock(¤t->mm->page_table_lock); ++ spin_lock(&mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + flush_tlb_range(vma, beg, end); +- spin_unlock(¤t->mm->page_table_lock); ++ spin_unlock(&mm->page_table_lock); + return; + } + +@@ -190,8 +191,9 @@ + return error; + } + +-asmlinkage long +-sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++long ++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, ++ unsigned long prot) + { + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct *vma, *prev; +@@ -220,9 +222,9 @@ + + vm_flags = calc_vm_prot_bits(prot); + +- down_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); + +- vma = find_vma_prev(current->mm, start, &prev); ++ vma = find_vma_prev(mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; +@@ -288,6 +290,11 @@ + } + } + out: +- up_write(¤t->mm->mmap_sem); ++ up_write(&mm->mmap_sem); + return error; + } ++ ++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++{ ++ return(do_mprotect(current->mm, start, len, prot)); ++} +Index: linux-2.6.10/mm/Makefile +=================================================================== +--- linux-2.6.10.orig/mm/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/mm/Makefile 2005-04-05 12:40:36.014913072 +0800 +@@ -18,3 +18,4 @@ + obj-$(CONFIG_SHMEM) += shmem.o + obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o + ++obj-$(CONFIG_PROC_MM) += proc_mm.o +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-03-31 16:20:08.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:40:36.064905472 +0800 +@@ -222,7 +222,7 @@ + SAVE_ALL + GET_THREAD_INFO(%ebp) + +- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +@@ -245,8 +245,8 @@ + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) +- # system call tracing in operation +- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ # system call tracing in operation / emulation ++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + 
cmpl $(nr_syscalls), %eax + jae syscall_badsys +@@ -307,6 +307,9 @@ + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace ++ cmpl $0, %eax ++ jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call +Index: linux-2.6.10/arch/i386/kernel/ptrace.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/ptrace.c 2004-12-25 05:34:29.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/ptrace.c 2005-04-05 12:40:36.061905928 +0800 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -406,15 +407,27 @@ + } + break; + ++ case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: /* restart after signal. */ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; ++ /* If we came here with PTRACE_SYSEMU and now continue with ++ * PTRACE_SYSCALL, entry.S used to intercept the syscall return. ++ * But it shouldn't! ++ * So we don't clear TIF_SYSCALL_EMU, which is always unused in ++ * this special case, to remember, we came from SYSEMU. That ++ * flag will be cleared by do_syscall_trace(). ++ */ ++ if (request == PTRACE_SYSEMU) { ++ set_tsk_thread_flag(child, TIF_SYSCALL_EMU); ++ } else if (request == PTRACE_CONT) { ++ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); ++ } + if (request == PTRACE_SYSCALL) { + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); +- } +- else { ++ } else { + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } + child->exit_code = data; +@@ -443,6 +456,8 @@ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; ++ /*See do_syscall_trace to know why we don't clear ++ * TIF_SYSCALL_EMU.*/ + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_singlestep(child); + child->exit_code = data; +@@ -542,6 +557,58 @@ + (struct user_desc __user *) data); + break; + ++#ifdef CONFIG_PROC_MM ++ case PTRACE_FAULTINFO: { ++ struct ptrace_faultinfo fault; ++ ++ fault = ((struct ptrace_faultinfo) ++ { .is_write = child->thread.error_code, ++ .addr = child->thread.cr2 }); ++ ret = copy_to_user((unsigned long *) data, &fault, ++ sizeof(fault)); ++ if(ret) ++ break; ++ break; ++ } ++ ++ case PTRACE_SIGPENDING: ++ ret = copy_to_user((unsigned long *) data, ++ &child->pending.signal, ++ sizeof(child->pending.signal)); ++ break; ++ ++ case PTRACE_LDT: { ++ struct ptrace_ldt ldt; ++ ++ if(copy_from_user(&ldt, (unsigned long *) data, ++ sizeof(ldt))){ ++ ret = -EIO; ++ break; ++ } ++ ret = __modify_ldt(child->mm, ldt.func, ldt.ptr, ldt.bytecount); ++ break; ++ } ++ ++ case PTRACE_SWITCH_MM: { ++ struct mm_struct *old = child->mm; ++ struct mm_struct *new = proc_mm_get_mm(data); ++ ++ if(IS_ERR(new)){ ++ ret = PTR_ERR(new); ++ break; ++ } ++ ++ atomic_inc(&new->mm_users); ++ task_lock(child); ++ child->mm = new; ++ child->active_mm = new; ++ task_unlock(child); ++ mmput(old); ++ ret = 0; ++ break; ++ } ++#endif ++ + default: + ret = ptrace_request(child, request, addr, data); + break; +@@ -557,8 +624,9 @@ + * - triggered by current->work.syscall_trace + */ + __attribute__((regparm(3))) +-void do_syscall_trace(struct pt_regs *regs, int entryexit) ++int do_syscall_trace(struct pt_regs *regs, int entryexit) + { ++ int is_sysemu, is_systrace, is_singlestep; + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(current, regs->orig_eax, +@@ -567,16 
+635,27 @@ + else + audit_syscall_exit(current, regs->eax); + } +- +- if (!test_thread_flag(TIF_SYSCALL_TRACE) && +- !test_thread_flag(TIF_SINGLESTEP)) +- return; ++ is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); ++ is_systrace = test_thread_flag(TIF_SYSCALL_TRACE); ++ is_singlestep = test_thread_flag(TIF_SINGLESTEP); ++ ++ if (!is_systrace && !is_singlestep && !is_sysemu) ++ return 0; ++ /* We can detect the case of coming from PTRACE_SYSEMU and now running ++ * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being ++ * set additionally. ++ * If so let's reset the flag and return without action. ++ */ ++ if (is_sysemu && (is_systrace || is_singlestep)) { ++ clear_thread_flag(TIF_SYSCALL_EMU); ++ return 0; ++ } + if (!(current->ptrace & PT_PTRACED)) +- return; ++ return 0; + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) && +- !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0)); ++ !is_singlestep ? 0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do +@@ -587,4 +666,6 @@ + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } ++ /* != 0 if nullifying the syscall, 0 if running it normally */ ++ return is_sysemu; + } +Index: linux-2.6.10/arch/i386/kernel/ldt.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/ldt.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/ldt.c 2005-04-05 12:40:36.062905776 +0800 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) +@@ -27,11 +28,12 @@ + } + #endif + +-static int alloc_ldt(mm_context_t *pc, int mincount, int reload) ++static int alloc_ldt(struct mm_struct *mm, int mincount, int reload) + { + void *oldldt; + void *newldt; + int oldsize; ++ mm_context_t * pc = &mm->context; + + if (mincount <= pc->size) + return 0; +@@ -58,13 +60,15 @@ + #ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); +- load_LDT(pc); ++ if (¤t->active_mm->context == pc) ++ load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); +- if (!cpus_equal(current->mm->cpu_vm_mask, mask)) ++ if (!cpus_equal(mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); + #else +- load_LDT(pc); ++ if (¤t->active_mm->context == pc) ++ load_LDT(pc); + #endif + } + if (oldsize) { +@@ -76,12 +80,12 @@ + return 0; + } + +-static inline int copy_ldt(mm_context_t *new, mm_context_t *old) ++static inline int copy_ldt(struct mm_struct *new, struct mm_struct *old) + { +- int err = alloc_ldt(new, old->size, 0); ++ int err = alloc_ldt(new, old->context.size, 0); + if (err < 0) + return err; +- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); ++ memcpy(new->context.ldt, old->context.ldt, old->context.size*LDT_ENTRY_SIZE); + return 0; + } + +@@ -89,22 +93,24 @@ + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
+ */ +-int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm) + { +- struct mm_struct * old_mm; + int retval = 0; + +- init_MUTEX(&mm->context.sem); +- mm->context.size = 0; +- old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); +- retval = copy_ldt(&mm->context, &old_mm->context); ++ retval = copy_ldt(mm, old_mm); + up(&old_mm->context.sem); + } + return retval; + } + ++int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ init_new_empty_context(mm); ++ return copy_context(mm, current->mm); ++} ++ + /* + * No need to lock the MM as we are the last user + */ +@@ -121,11 +127,11 @@ + } + } + +-static int read_ldt(void __user * ptr, unsigned long bytecount) ++static int read_ldt(struct mm_struct * mm, void __user * ptr, ++ unsigned long bytecount) + { + int err; + unsigned long size; +- struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; +@@ -174,9 +180,8 @@ + return err; + } + +-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) ++static int write_ldt(struct mm_struct * mm, void __user * ptr, unsigned long bytecount, int oldmode) + { +- struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; +@@ -200,7 +205,7 @@ + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { +- error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); ++ error = alloc_ldt(mm, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } +@@ -233,23 +238,29 @@ + return error; + } + +-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr, ++ unsigned long bytecount) + { + int ret = -ENOSYS; + + switch (func) { + case 0: +- ret = read_ldt(ptr, bytecount); ++ ret = read_ldt(mm, ptr, bytecount); + break; + case 1: +- ret = write_ldt(ptr, bytecount, 1); ++ ret = write_ldt(mm, ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: +- ret = write_ldt(ptr, bytecount, 0); ++ ret = write_ldt(mm, ptr, bytecount, 0); + break; + } + return ret; + } ++ ++asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++{ ++ return __modify_ldt(current->mm, func, ptr, bytecount); ++} +Index: linux-2.6.10/arch/i386/kernel/sys_i386.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/sys_i386.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/sys_i386.c 2005-04-05 12:40:36.063905624 +0800 +@@ -41,7 +41,7 @@ + } + + /* common code for old and new mmaps */ +-static inline long do_mmap2( ++long do_mmap2(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +@@ -56,9 +56,9 @@ + goto out; + } + +- down_write(¤t->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); +- up_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); ++ error = __do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff); ++ up_write(&mm->mmap_sem); + + if (file) + fput(file); +@@ -70,7 +70,7 @@ + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) + { +- return do_mmap2(addr, len, prot, flags, fd, pgoff); ++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff); + } + + /* +@@ -101,7 +101,7 
@@ + if (a.offset & ~PAGE_MASK) + goto out; + +- err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); ++ err = do_mmap2(current->mm, a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + out: + return err; + } +Index: linux-2.6.10/arch/i386/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:40:36.066905168 +0800 +@@ -738,6 +738,10 @@ + depends on HIGHMEM64G + default y + ++config PROC_MM ++ bool "/proc/mm support" ++ default y ++ + # Common NUMA Features + config NUMA + bool "Numa Memory Allocation and Scheduler Support" +Index: linux-2.6.10/arch/um/include/frame.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,53 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_H_ +-#define __FRAME_H_ +- +-#include "sysdep/frame.h" +- +-struct frame_common { +- void *data; +- int len; +- int sig_index; +- int sr_index; +- int sr_relative; +- int sp_index; +- struct arch_frame_data arch; +-}; +- +-struct sc_frame { +- struct frame_common common; +- int sc_index; +-}; +- +-extern struct sc_frame signal_frame_sc; +- +-extern struct sc_frame signal_frame_sc_sr; +- +-struct si_frame { +- struct frame_common common; +- int sip_index; +- int si_index; +- int ucp_index; +- int uc_index; +-}; +- +-extern struct si_frame signal_frame_si; +- +-extern void capture_signal_stack(void); +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/frame_kern.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame_kern.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame_kern.h 2005-04-05 12:40:36.056906688 +0800 +@@ -6,8 +6,8 @@ + #ifndef __FRAME_KERN_H_ + #define __FRAME_KERN_H_ + +-#include "frame.h" +-#include "sysdep/frame_kern.h" ++#define _S(nr) (1<<((nr)-1)) ++#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) + + extern int setup_signal_stack_sc(unsigned long stack_top, int sig, + struct k_sigaction *ka, +Index: linux-2.6.10/arch/um/include/frame_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame_user.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame_user.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,23 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_USER_H_ +-#define __FRAME_USER_H_ +- +-#include "sysdep/frame_user.h" +-#include "frame.h" +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. 
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/ptrace_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/ptrace_user.h	2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/arch/um/include/ptrace_user.h	2005-04-05 12:40:36.057906536 +0800
+@@ -26,4 +26,35 @@
+ int get_using_sysemu(void);
+ extern int sysemu_supported;
+ 
++
++/* syscall emulation path in ptrace */
++
++#ifndef PTRACE_SYSEMU
++#define PTRACE_SYSEMU 31
++#endif
++
++/* On architectures that started to support PTRACE_O_TRACESYSGOOD
++ * in linux 2.4, there are two different definitions of
++ * PTRACE_SETOPTIONS: linux 2.4 uses 21 while linux 2.6 uses 0x4200.
++ * For binary compatibility, 2.6 also supports the old "21", named
++ * PTRACE_OLDSETOPTIONS. On these architectures, UML must always use
++ * "21", to ensure the kernel runs on both 2.4 and 2.6 hosts without
++ * recompilation. So, we use PTRACE_OLDSETOPTIONS in UML.
++ * We also want to be able to build the kernel on 2.4, which doesn't
++ * have PTRACE_OLDSETOPTIONS. So, if it is missing, we declare
++ * PTRACE_OLDSETOPTIONS to be the same as PTRACE_SETOPTIONS.
++ *
++ * On architectures that start to support PTRACE_O_TRACESYSGOOD only
++ * in linux 2.6, PTRACE_OLDSETOPTIONS is never defined, and also isn't
++ * supported by the host kernel. In that case, our trick lets us use
++ * the new 0x4200 with the name PTRACE_OLDSETOPTIONS.
++ */
++#ifndef PTRACE_OLDSETOPTIONS
++#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
++#endif
++
++void set_using_sysemu(int value);
++int get_using_sysemu(void);
++extern int sysemu_supported;
++
+ #endif
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame.h	2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame.h	2005-04-05 19:01:49.158500672 +0800
+@@ -1,29 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_I386_H
+-#define __FRAME_I386_H
+-
+-struct arch_frame_data_raw {
+-	unsigned long fp_start;
+-	unsigned long sr;
+-};
+-
+-struct arch_frame_data {
+-	int fpstate_size;
+-};
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_kern.h	2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h	2005-04-05 19:01:49.158500672 +0800
+@@ -1,69 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_KERN_I386_H
+-#define __FRAME_KERN_I386_H
+-
+-/* This is called from sys_sigreturn. It takes the sp at the point of the
+- * sigreturn system call and returns the address of the sigcontext struct
+- * on the stack.
+- */ +- +-static inline void *sp_to_sc(unsigned long sp) +-{ +- return((void *) sp); +-} +- +-static inline void *sp_to_uc(unsigned long sp) +-{ +- unsigned long uc; +- +- uc = sp + signal_frame_si.uc_index - +- signal_frame_si.common.sp_index - 4; +- return((void *) uc); +-} +- +-static inline void *sp_to_rt_sc(unsigned long sp) +-{ +- unsigned long sc; +- +- sc = sp - signal_frame_si.common.sp_index + +- signal_frame_si.common.len - 4; +- return((void *) sc); +-} +- +-static inline void *sp_to_mask(unsigned long sp) +-{ +- unsigned long mask; +- +- mask = sp - signal_frame_sc.common.sp_index + +- signal_frame_sc.common.len - 8; +- return((void *) mask); +-} +- +-extern int sc_size(void *data); +- +-static inline void *sp_to_rt_mask(unsigned long sp) +-{ +- unsigned long mask; +- +- mask = sp - signal_frame_si.common.sp_index + +- signal_frame_si.common.len + +- sc_size(&signal_frame_si.common.arch) - 4; +- return((void *) mask); +-} +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_user.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,91 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_USER_I386_H +-#define __FRAME_USER_I386_H +- +-#include +-#include "sysdep/frame.h" +- +-/* This stuff is to calculate the size of the fp state struct at runtime +- * because it has changed between 2.2 and 2.4 and it would be good for a +- * UML compiled on one to work on the other. +- * So, setup_arch_frame_raw fills in the arch struct with the raw data, which +- * just contains the address of the end of the sigcontext. This is invoked +- * from the signal handler. +- * setup_arch_frame uses that data to figure out what +- * arch_frame_data.fpstate_size should be. It really has no idea, since it's +- * not allowed to do sizeof(struct fpstate) but it's safe to consider that it's +- * everything from the end of the sigcontext up to the top of the stack. So, +- * it masks off the page number to get the offset within the page and subtracts +- * that from the page size, and that's how big the fpstate struct will be +- * considered to be. +- */ +- +-static inline void setup_arch_frame_raw(struct arch_frame_data_raw *data, +- void *end, unsigned long srp) +-{ +- unsigned long sr = *((unsigned long *) srp); +- +- data->fp_start = (unsigned long) end; +- if((sr & PAGE_MASK) == ((unsigned long) end & PAGE_MASK)) +- data->sr = sr; +- else data->sr = 0; +-} +- +-static inline void setup_arch_frame(struct arch_frame_data_raw *in, +- struct arch_frame_data *out) +-{ +- unsigned long fpstate_start = in->fp_start; +- +- if(in->sr == 0){ +- fpstate_start &= ~PAGE_MASK; +- out->fpstate_size = PAGE_SIZE - fpstate_start; +- } +- else { +- out->fpstate_size = in->sr - fpstate_start; +- } +-} +- +-/* This figures out where on the stack the SA_RESTORER function address +- * is stored. 
For i386, it's the signal handler return address, so it's +- * located next to the frame pointer. +- * This is inlined, so __builtin_frame_address(0) is correct. Otherwise, +- * it would have to be __builtin_frame_address(1). +- */ +- +-#define frame_restorer() \ +-({ \ +- unsigned long *fp; \ +-\ +- fp = __builtin_frame_address(0); \ +- ((unsigned long) (fp + 1)); \ +-}) +- +-/* Similarly, this returns the value of sp when the handler was first +- * entered. This is used to calculate the proper sp when delivering +- * signals. +- */ +- +-#define frame_sp() \ +-({ \ +- unsigned long *fp; \ +-\ +- fp = __builtin_frame_address(0); \ +- ((unsigned long) (fp + 1)); \ +-}) +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/elf_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/elf_user.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/include/elf_user.h 2005-04-05 12:40:36.054906992 +0800 +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH ++ * Author: Bodo Stroesser ++ * Licensed under the GPL ++ */ ++ ++#ifndef __ELF_USER_H__ ++#define __ELF_USER_H__ ++ ++/* For compilation on a host that doesn't support AT_SYSINFO (Linux 2.4) */ ++ ++#ifndef AT_SYSINFO ++#define AT_SYSINFO 32 ++#endif ++#ifndef AT_SYSINFO_EHDR ++#define AT_SYSINFO_EHDR 33 ++#endif ++ ++#endif +Index: linux-2.6.10/arch/um/include/skas_ptrace.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/skas_ptrace.h 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/um/include/skas_ptrace.h 2005-04-05 12:40:36.056906688 +0800 +@@ -6,6 +6,7 @@ + #ifndef __SKAS_PTRACE_H + #define __SKAS_PTRACE_H + ++#ifndef PTRACE_FAULTINFO + struct ptrace_faultinfo { + int is_write; + unsigned long addr; +@@ -21,6 +22,7 @@ + #define PTRACE_SIGPENDING 53 + #define PTRACE_LDT 54 + #define PTRACE_SWITCH_MM 55 ++#endif + + #endif + +Index: linux-2.6.10/arch/um/include/signal_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/signal_user.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/include/signal_user.h 2005-04-05 12:40:36.055906840 +0800 +@@ -14,6 +14,8 @@ + extern int set_signals(int enable); + extern int get_signals(void); + ++#define SYSCALL_TRAP 0x80 ++ + #endif + + /* +Index: linux-2.6.10/arch/um/sys-i386/ptrace_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/ptrace_user.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/ptrace_user.c 2005-04-05 12:40:36.022911856 +0800 +@@ -17,17 +17,30 @@ + + int ptrace_getregs(long pid, unsigned long *regs_out) + { +- return(ptrace(PTRACE_GETREGS, pid, 0, regs_out)); ++ if(ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) ++ return(-errno); ++ return(0); + } + + int ptrace_setregs(long pid, unsigned long *regs) + { +- return(ptrace(PTRACE_SETREGS, pid, 0, regs)); ++ if(ptrace(PTRACE_SETREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); + } + + int ptrace_getfpregs(long pid, unsigned long *regs) + { 
+- return(ptrace(PTRACE_GETFPREGS, pid, 0, regs)); ++ if(ptrace(PTRACE_GETFPREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); ++} ++ ++int ptrace_setfpregs(long pid, unsigned long *regs) ++{ ++ if(ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); + } + + static void write_debugregs(int pid, unsigned long *regs) +Index: linux-2.6.10/arch/um/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/sigcontext.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/sigcontext.c 2005-04-05 12:40:36.023911704 +0800 +@@ -9,22 +9,14 @@ + #include + #include "sysdep/ptrace.h" + #include "kern_util.h" +-#include "frame_user.h" +- +-int sc_size(void *data) +-{ +- struct arch_frame_data *arch = data; +- +- return(sizeof(struct sigcontext) + arch->fpstate_size); +-} + + void sc_to_sc(void *to_ptr, void *from_ptr) + { + struct sigcontext *to = to_ptr, *from = from_ptr; +- int size = sizeof(*to) + signal_frame_sc.common.arch.fpstate_size; + +- memcpy(to, from, size); +- if(from->fpstate != NULL) to->fpstate = (struct _fpstate *) (to + 1); ++ memcpy(to, from, sizeof(*to) + sizeof(struct _fpstate)); ++ if(from->fpstate != NULL) ++ to->fpstate = (struct _fpstate *) (to + 1); + } + + unsigned long *sc_sigmask(void *sc_ptr) +Index: linux-2.6.10/arch/um/sys-i386/sysrq.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/sysrq.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/sysrq.c 2005-04-05 12:40:36.022911856 +0800 +@@ -33,3 +33,13 @@ + + show_trace((unsigned long *) ®s); + } ++ ++/* Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/sys-i386/signal.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/signal.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/sys-i386/signal.c 2005-04-05 12:40:36.021912008 +0800 +@@ -0,0 +1,374 @@ ++/* ++ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/signal.h" ++#include "linux/ptrace.h" ++#include "asm/current.h" ++#include "asm/ucontext.h" ++#include "asm/uaccess.h" ++#include "asm/unistd.h" ++#include "frame_kern.h" ++#include "signal_user.h" ++#include "ptrace_user.h" ++#include "sigcontext.h" ++#include "mode.h" ++ ++#ifdef CONFIG_MODE_SKAS ++ ++#include "skas.h" ++ ++static int copy_sc_from_user_skas(struct pt_regs *regs, ++ struct sigcontext *from) ++{ ++ struct sigcontext sc; ++ unsigned long fpregs[HOST_FP_SIZE]; ++ int err; ++ ++ err = copy_from_user(&sc, from, sizeof(sc)); ++ err |= copy_from_user(fpregs, sc.fpstate, sizeof(fpregs)); ++ if(err) ++ return(err); ++ ++ REGS_GS(regs->regs.skas.regs) = sc.gs; ++ REGS_FS(regs->regs.skas.regs) = sc.fs; ++ REGS_ES(regs->regs.skas.regs) = sc.es; ++ REGS_DS(regs->regs.skas.regs) = sc.ds; ++ REGS_EDI(regs->regs.skas.regs) = sc.edi; ++ REGS_ESI(regs->regs.skas.regs) = sc.esi; ++ REGS_EBP(regs->regs.skas.regs) = sc.ebp; ++ REGS_SP(regs->regs.skas.regs) = sc.esp; ++ REGS_EBX(regs->regs.skas.regs) = sc.ebx; ++ REGS_EDX(regs->regs.skas.regs) = sc.edx; ++ REGS_ECX(regs->regs.skas.regs) = sc.ecx; ++ REGS_EAX(regs->regs.skas.regs) = sc.eax; ++ REGS_IP(regs->regs.skas.regs) = sc.eip; ++ REGS_CS(regs->regs.skas.regs) = sc.cs; ++ REGS_EFLAGS(regs->regs.skas.regs) = sc.eflags; ++ REGS_SS(regs->regs.skas.regs) = sc.ss; ++ regs->regs.skas.fault_addr = sc.cr2; ++ regs->regs.skas.fault_type = FAULT_WRITE(sc.err); ++ regs->regs.skas.trap_type = sc.trapno; ++ ++ err = ptrace_setfpregs(userspace_pid[0], fpregs); ++ if(err < 0){ ++ printk("copy_sc_from_user_skas - PTRACE_SETFPREGS failed, " ++ "errno = %d\n", err); ++ return(1); ++ } ++ ++ return(0); ++} ++ ++int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, ++ struct pt_regs *regs, unsigned long fault_addr, ++ int fault_type) ++{ ++ struct sigcontext sc; ++ unsigned long fpregs[HOST_FP_SIZE]; ++ int err; ++ ++ sc.gs = REGS_GS(regs->regs.skas.regs); ++ sc.fs = REGS_FS(regs->regs.skas.regs); ++ sc.es = REGS_ES(regs->regs.skas.regs); ++ sc.ds = REGS_DS(regs->regs.skas.regs); ++ sc.edi = REGS_EDI(regs->regs.skas.regs); ++ sc.esi = REGS_ESI(regs->regs.skas.regs); ++ sc.ebp = REGS_EBP(regs->regs.skas.regs); ++ sc.esp = REGS_SP(regs->regs.skas.regs); ++ sc.ebx = REGS_EBX(regs->regs.skas.regs); ++ sc.edx = REGS_EDX(regs->regs.skas.regs); ++ sc.ecx = REGS_ECX(regs->regs.skas.regs); ++ sc.eax = REGS_EAX(regs->regs.skas.regs); ++ sc.eip = REGS_IP(regs->regs.skas.regs); ++ sc.cs = REGS_CS(regs->regs.skas.regs); ++ sc.eflags = REGS_EFLAGS(regs->regs.skas.regs); ++ sc.esp_at_signal = regs->regs.skas.regs[UESP]; ++ sc.ss = regs->regs.skas.regs[SS]; ++ sc.cr2 = fault_addr; ++ sc.err = TO_SC_ERR(fault_type); ++ sc.trapno = regs->regs.skas.trap_type; ++ ++ err = ptrace_getfpregs(userspace_pid[0], fpregs); ++ if(err < 0){ ++ printk("copy_sc_to_user_skas - PTRACE_GETFPREGS failed, " ++ "errno = %d\n", err); ++ return(1); ++ } ++ to_fp = (to_fp ? 
to_fp : (struct _fpstate *) (to + 1)); ++ sc.fpstate = to_fp; ++ ++ if(err) ++ return(err); ++ ++ return(copy_to_user(to, &sc, sizeof(sc)) || ++ copy_to_user(to_fp, fpregs, sizeof(fpregs))); ++} ++#endif ++ ++#ifdef CONFIG_MODE_TT ++int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, ++ int fpsize) ++{ ++ struct _fpstate *to_fp, *from_fp; ++ unsigned long sigs; ++ int err; ++ ++ to_fp = to->fpstate; ++ from_fp = from->fpstate; ++ sigs = to->oldmask; ++ err = copy_from_user(to, from, sizeof(*to)); ++ to->oldmask = sigs; ++ if(to_fp != NULL){ ++ err |= copy_from_user(&to->fpstate, &to_fp, ++ sizeof(to->fpstate)); ++ err |= copy_from_user(to_fp, from_fp, fpsize); ++ } ++ return(err); ++} ++ ++int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, ++ struct sigcontext *from, int fpsize) ++{ ++ struct _fpstate *to_fp, *from_fp; ++ int err; ++ ++ to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); ++ from_fp = from->fpstate; ++ err = copy_to_user(to, from, sizeof(*to)); ++ if(from_fp != NULL){ ++ err |= copy_to_user(&to->fpstate, &to_fp, ++ sizeof(to->fpstate)); ++ err |= copy_to_user(to_fp, from_fp, fpsize); ++ } ++ return(err); ++} ++#endif ++ ++static int copy_sc_from_user(struct pt_regs *to, void *from) ++{ ++ int ret; ++ ++ ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, ++ sizeof(struct _fpstate)), ++ copy_sc_from_user_skas(to, from)); ++ return(ret); ++} ++ ++static int copy_sc_to_user(struct sigcontext *to, struct _fpstate *fp, ++ struct pt_regs *from) ++{ ++ return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), ++ sizeof(*fp)), ++ copy_sc_to_user_skas(to, fp, from, ++ current->thread.cr2, ++ current->thread.err))); ++} ++ ++static int copy_ucontext_to_user(struct ucontext *uc, struct _fpstate *fp, ++ sigset_t *set, unsigned long sp) ++{ ++ int err = 0; ++ ++ err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); ++ err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); ++ err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); ++ err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs); ++ err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); ++ return(err); ++} ++ ++struct sigframe ++{ ++ char *pretcode; ++ int sig; ++ struct sigcontext sc; ++ struct _fpstate fpstate; ++ unsigned long extramask[_NSIG_WORDS-1]; ++ char retcode[8]; ++}; ++ ++struct rt_sigframe ++{ ++ char *pretcode; ++ int sig; ++ struct siginfo *pinfo; ++ void *puc; ++ struct siginfo info; ++ struct ucontext uc; ++ struct _fpstate fpstate; ++ char retcode[8]; ++}; ++ ++int setup_signal_stack_sc(unsigned long stack_top, int sig, ++ struct k_sigaction *ka, struct pt_regs *regs, ++ sigset_t *mask) ++{ ++ struct sigframe __user *frame; ++ void *restorer; ++ int err = 0; ++ ++ stack_top &= -8UL; ++ frame = (struct sigframe *) stack_top - 1; ++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame))) ++ return(1); ++ ++ restorer = (void *) frame->retcode; ++ if(ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ ++ err |= __put_user(restorer, &frame->pretcode); ++ err |= __put_user(sig, &frame->sig); ++ err |= copy_sc_to_user(&frame->sc, NULL, regs); ++ err |= __put_user(mask->sig[0], &frame->sc.oldmask); ++ if (_NSIG_WORDS > 1) ++ err |= __copy_to_user(&frame->extramask, &mask->sig[1], ++ sizeof(frame->extramask)); ++ ++ /* ++ * This is popl %eax ; movl $,%eax ; int $0x80 ++ * ++ * WE DO NOT USE IT ANY MORE! 
It's only left here for historical ++ * reasons and because gdb uses it as a signature to notice ++ * signal handler stack frames. ++ */ ++ err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); ++ err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); ++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); ++ ++ if(err) ++ return(err); ++ ++ PT_REGS_SP(regs) = (unsigned long) frame; ++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; ++ PT_REGS_EAX(regs) = (unsigned long) sig; ++ PT_REGS_EDX(regs) = (unsigned long) 0; ++ PT_REGS_ECX(regs) = (unsigned long) 0; ++ ++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ++ ptrace_notify(SIGTRAP); ++ return(0); ++} ++ ++int setup_signal_stack_si(unsigned long stack_top, int sig, ++ struct k_sigaction *ka, struct pt_regs *regs, ++ siginfo_t *info, sigset_t *mask) ++{ ++ struct rt_sigframe __user *frame; ++ void *restorer; ++ int err = 0; ++ ++ stack_top &= -8UL; ++ frame = (struct rt_sigframe *) stack_top - 1; ++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame))) ++ return(1); ++ ++ restorer = (void *) frame->retcode; ++ if(ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ ++ err |= __put_user(restorer, &frame->pretcode); ++ err |= __put_user(sig, &frame->sig); ++ err |= __put_user(&frame->info, &frame->pinfo); ++ err |= __put_user(&frame->uc, &frame->puc); ++ err |= copy_siginfo_to_user(&frame->info, info); ++ err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, ++ PT_REGS_SP(regs)); ++ ++ /* ++ * This is movl $,%eax ; int $0x80 ++ * ++ * WE DO NOT USE IT ANY MORE! It's only left here for historical ++ * reasons and because gdb uses it as a signature to notice ++ * signal handler stack frames. ++ */ ++ err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); ++ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); ++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); ++ ++ if(err) ++ return(err); ++ ++ PT_REGS_SP(regs) = (unsigned long) frame; ++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; ++ PT_REGS_EAX(regs) = (unsigned long) sig; ++ PT_REGS_EDX(regs) = (unsigned long) &frame->info; ++ PT_REGS_ECX(regs) = (unsigned long) &frame->uc; ++ ++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ++ ptrace_notify(SIGTRAP); ++ return(0); ++} ++ ++long sys_sigreturn(struct pt_regs regs) ++{ ++ unsigned long __user sp = PT_REGS_SP(¤t->thread.regs); ++ struct sigframe __user *frame = (struct sigframe *)(sp - 8); ++ sigset_t set; ++ struct sigcontext __user *sc = &frame->sc; ++ unsigned long __user *oldmask = &sc->oldmask; ++ unsigned long __user *extramask = &frame->extramask; ++ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); ++ ++ if(copy_from_user(&set.sig[0], oldmask, sizeof(&set.sig[0])) || ++ copy_from_user(&set.sig[1], extramask, sig_size)) ++ goto segfault; ++ ++ sigdelsetmask(&set, ~_BLOCKABLE); ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ current->blocked = set; ++ recalc_sigpending(); ++ spin_unlock_irq(¤t->sighand->siglock); ++ ++ if(copy_sc_from_user(¤t->thread.regs, sc)) ++ goto segfault; ++ ++ PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; /* Avoid ERESTART handling */ ++ return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); ++ ++ segfault: ++ force_sig(SIGSEGV, current); ++ return 0; ++} ++ ++long sys_rt_sigreturn(struct pt_regs regs) ++{ ++ unsigned long __user sp = PT_REGS_SP(¤t->thread.regs); ++ struct rt_sigframe __user *frame = (struct rt_sigframe *) (sp - 4); ++ sigset_t set; ++ struct 
ucontext __user *uc = &frame->uc; ++ int sig_size = _NSIG_WORDS * sizeof(unsigned long); ++ ++ if(copy_from_user(&set, &uc->uc_sigmask, sig_size)) ++ goto segfault; ++ ++ sigdelsetmask(&set, ~_BLOCKABLE); ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ current->blocked = set; ++ recalc_sigpending(); ++ spin_unlock_irq(¤t->sighand->siglock); ++ ++ if(copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) ++ goto segfault; ++ ++ PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; /* Avoid ERESTART handling */ ++ return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); ++ ++ segfault: ++ force_sig(SIGSEGV, current); ++ return 0; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/Makefile 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/Makefile 2005-04-05 12:40:36.023911704 +0800 +@@ -1,5 +1,5 @@ + obj-y = bitops.o bugs.o checksum.o fault.o ksyms.o ldt.o ptrace.o \ +- ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o ++ ptrace_user.o semaphore.o signal.o sigcontext.o syscalls.o sysrq.o + + obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_MODULES) += module.o +Index: linux-2.6.10/arch/um/kernel/mem_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/mem_user.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/mem_user.c 2005-04-05 12:40:36.051907448 +0800 +@@ -101,6 +101,8 @@ + } + printf("OK\n"); + munmap(addr, UM_KERN_PAGE_SIZE); ++ ++ os_close_file(fd); + } + + static int have_devanon = 0; +@@ -261,6 +263,39 @@ + } + #endif + ++#if 0 ++/* Debugging facility for dumping stuff out to the host, avoiding the timing ++ * problems that come with printf and breakpoints. ++ * Enable in case of emergency. ++ */ ++ ++int logging = 1; ++int logging_fd = -1; ++ ++int logging_line = 0; ++char logging_buf[512]; ++ ++void log(char *fmt, ...) ++{ ++ va_list ap; ++ struct timeval tv; ++ struct openflags flags; ++ ++ if(logging == 0) return; ++ if(logging_fd < 0){ ++ flags = of_create(of_trunc(of_rdwr(OPENFLAGS()))); ++ logging_fd = os_open_file("log", flags, 0644); ++ } ++ gettimeofday(&tv, NULL); ++ sprintf(logging_buf, "%d\t %u.%u ", logging_line++, tv.tv_sec, ++ tv.tv_usec); ++ va_start(ap, fmt); ++ vsprintf(&logging_buf[strlen(logging_buf)], fmt, ap); ++ va_end(ap); ++ write(logging_fd, logging_buf, strlen(logging_buf)); ++} ++#endif ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +Index: linux-2.6.10/arch/um/kernel/time.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/time.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/time.c 2005-04-05 12:40:36.046908208 +0800 +@@ -60,6 +60,9 @@ + (setitimer(ITIMER_REAL, &disable, NULL) < 0)) + printk("disnable_timer - setitimer failed, errno = %d\n", + errno); ++ /* If there are signals already queued, after unblocking ignore them */ ++ set_handler(SIGALRM, SIG_IGN, 0, -1); ++ set_handler(SIGVTALRM, SIG_IGN, 0, -1); + } + + void switch_timers(int to_real) +Index: linux-2.6.10/arch/um/kernel/ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/ksyms.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/ksyms.c 2005-04-05 12:40:36.049907752 +0800 +@@ -48,6 +48,7 @@ + EXPORT_SYMBOL(mode_tt); + EXPORT_SYMBOL(handle_page_fault); + EXPORT_SYMBOL(find_iomem); ++EXPORT_SYMBOL(end_iomem); + + #ifdef CONFIG_MODE_TT + EXPORT_SYMBOL(strncpy_from_user_tt); +Index: linux-2.6.10/arch/um/kernel/um_arch.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/um_arch.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/um_arch.c 2005-04-05 12:40:36.045908360 +0800 +@@ -44,11 +44,6 @@ + .ipi_pipe = { -1, -1 } + }; + +-/* Placeholder to make UML link until the vsyscall stuff is actually +- * implemented +- */ +-void *__kernel_vsyscall; +- + unsigned long thread_saved_pc(struct task_struct *task) + { + return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas, +@@ -326,6 +321,11 @@ + */ + check_tmpexec(); + ++ /* Need to check this early because mmapping happens before the ++ * kernel is running. 
++ */ ++ check_tmpexec(); ++ + brk_start = (unsigned long) sbrk(0); + CHOOSE_MODE_PROC(before_mem_tt, before_mem_skas, brk_start); + /* Increase physical memory size for exec-shield users +Index: linux-2.6.10/arch/um/kernel/process.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/process.c 2004-12-25 05:35:25.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/process.c 2005-04-05 12:40:36.025911400 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -285,6 +286,9 @@ + printk("Checking that ptrace can change system call numbers..."); + pid = start_ptraced_child(&stack); + ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) ++ panic("check_ptrace: PTRACE_SETOPTIONS failed, errno = %d", errno); ++ + while(1){ + if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) + panic("check_ptrace : ptrace failed, errno = %d", +@@ -292,8 +296,8 @@ + CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); + if(n < 0) + panic("check_ptrace : wait failed, errno = %d", errno); +- if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) +- panic("check_ptrace : expected SIGTRAP, " ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP))) ++ panic("check_ptrace : expected (SIGTRAP|SYSCALL_TRAP), " + "got status = %d", status); + + syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, +Index: linux-2.6.10/arch/um/kernel/process_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/process_kern.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/process_kern.c 2005-04-05 12:40:36.047908056 +0800 +@@ -291,8 +291,6 @@ + + EXPORT_SYMBOL(disable_hlt); + +-extern int signal_frame_size; +- + void *um_kmalloc(int size) + { + return(kmalloc(size, GFP_KERNEL)); +Index: linux-2.6.10/arch/um/kernel/signal_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/signal_user.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/signal_user.c 2005-04-05 12:40:36.050907600 +0800 +@@ -61,6 +61,10 @@ + * disable profiling; it's safe because the profiling code does not interact + * with the kernel code at all.*/ + ++/* Both here and in set/get_signal we don't touch SIGPROF, because we must not ++ * disable profiling; it's safe because the profiling code does not interact ++ * with the kernel code at all.*/ ++ + static void change_signals(int type) + { + sigset_t mask; +Index: linux-2.6.10/arch/um/kernel/initrd_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/initrd_user.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/initrd_user.c 2005-04-05 12:40:36.026911248 +0800 +@@ -29,6 +29,8 @@ + filename, -n); + return(-1); + } ++ ++ os_close_file(fd); + return(0); + } + +Index: linux-2.6.10/arch/um/kernel/dyn.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/dyn.lds.S 2004-12-25 05:34:48.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/dyn.lds.S 2005-04-05 12:40:36.044908512 +0800 +@@ -7,8 +7,11 @@ + + SECTIONS + { ++ PROVIDE (__executable_start = START); + . = START + SIZEOF_HEADERS; + .interp : { *(.interp) } ++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start ++ * is remapped.*/ + __binary_start = .; + . 
= ALIGN(4096); /* Init code and data */ + _stext = .; +Index: linux-2.6.10/arch/um/kernel/ptrace.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/ptrace.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/ptrace.c 2005-04-05 12:40:36.044908512 +0800 +@@ -16,6 +16,7 @@ + #include "asm/uaccess.h" + #include "kern_util.h" + #include "ptrace_user.h" ++#include "signal_user.h" + + /* + * Called by kernel/ptrace.c when detaching.. +@@ -328,8 +329,10 @@ + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + tracesysgood = (current->ptrace & PT_TRACESYSGOOD) && !is_singlestep; +- ptrace_notify(SIGTRAP | (tracesysgood ? 0x80 : 0)); +- ++ ptrace_notify(SIGTRAP | (tracesysgood ? SYSCALL_TRAP : 0)); ++ if ( entryexit ) /* force do_signal() --> is_syscall() */ ++ set_thread_flag(TIF_SIGPENDING); ++ + /* force do_signal() --> is_syscall() */ + set_thread_flag(TIF_SIGPENDING); + +Index: linux-2.6.10/arch/um/kernel/uml.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/uml.lds.S 2005-04-01 12:25:25.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/uml.lds.S 2005-04-05 12:40:36.049907752 +0800 +@@ -7,8 +7,12 @@ + + SECTIONS + { ++ /*This must contain the right address - not quite the default ELF one.*/ ++ PROVIDE (__executable_start = START); + . = START + SIZEOF_HEADERS; + ++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start ++ * is remapped.*/ + __binary_start = .; + #ifdef MODE_TT + .thread_private : { +@@ -20,9 +24,13 @@ + } + . = ALIGN(4096); + .remap : { arch/um/kernel/tt/unmap_fin.o (.text) } +-#endif + ++ /*If you put this after #endif, STATIC build without TT mode ++ gives a segfaulting binary. And after all, a hole just after ++ binary_start is not very polite to glibc.*/ + . = ALIGN(4096); /* Init code and data */ ++#endif ++ + _stext = .; + __init_begin = .; + .init.text : { +Index: linux-2.6.10/arch/um/kernel/main.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/main.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/main.c 2005-04-05 12:40:36.024911552 +0800 +@@ -81,6 +81,8 @@ + + extern int uml_exitcode; + ++extern void scan_elf_aux( char **envp); ++ + int main(int argc, char **argv, char **envp) + { + char **new_argv; +@@ -147,6 +149,8 @@ + set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); + set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); + ++ scan_elf_aux( envp); ++ + do_uml_initcalls(); + ret = linux_main(argc, argv); + +@@ -155,18 +159,20 @@ + int err; + + printf("\n"); +- +- /* Let any pending signals fire, then disable them. This +- * ensures that they won't be delivered after the exec, when +- * they are definitely not expected. +- */ +- unblock_signals(); ++ /* stop timers and set SIG*ALRM to be ignored */ + disable_timer(); ++ /* disable SIGIO for the fds and set SIGIO to be ignored */ + err = deactivate_all_fds(); + if(err) + printf("deactivate_all_fds failed, errno = %d\n", + -err); + ++ /* Let any pending signals fire now. This ensures ++ * that they won't be delivered after the exec, when ++ * they are definitely not expected. 
++ */ ++ unblock_signals(); ++ + execvp(new_argv[0], new_argv); + perror("Failed to exec kernel"); + ret = 1; +Index: linux-2.6.10/arch/um/kernel/irq_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/irq_user.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/irq_user.c 2005-04-05 12:40:36.028910944 +0800 +@@ -374,6 +374,8 @@ + if(err) + return(err); + } ++ /* If there is a signal already queued, after unblocking ignore it */ ++ set_handler(SIGIO, SIG_IGN, 0, -1); + + return(0); + } +Index: linux-2.6.10/arch/um/kernel/signal_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/signal_kern.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/signal_kern.c 2005-04-05 12:40:36.048907904 +0800 +@@ -230,53 +230,6 @@ + return(do_sigaltstack(uss, uoss, PT_REGS_SP(¤t->thread.regs))); + } + +-extern int userspace_pid[]; +- +-static int copy_sc_from_user(struct pt_regs *to, void *from, +- struct arch_frame_data *arch) +-{ +- int ret; +- +- ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, arch), +- copy_sc_from_user_skas(userspace_pid[0], +- &to->regs, from)); +- return(ret); +-} +- +-long sys_sigreturn(struct pt_regs regs) +-{ +- void __user *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); +- void __user *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); +- int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); +- +- spin_lock_irq(¤t->sighand->siglock); +- copy_from_user(¤t->blocked.sig[0], sc_sigmask(sc), +- sizeof(current->blocked.sig[0])); +- copy_from_user(¤t->blocked.sig[1], mask, sig_size); +- sigdelsetmask(¤t->blocked, ~_BLOCKABLE); +- recalc_sigpending(); +- spin_unlock_irq(¤t->sighand->siglock); +- copy_sc_from_user(¤t->thread.regs, sc, +- &signal_frame_sc.common.arch); +- return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); +-} +- +-long sys_rt_sigreturn(struct pt_regs regs) +-{ +- unsigned long sp = PT_REGS_SP(¤t->thread.regs); +- struct ucontext __user *uc = sp_to_uc(sp); +- int sig_size = _NSIG_WORDS * sizeof(unsigned long); +- +- spin_lock_irq(¤t->sighand->siglock); +- copy_from_user(¤t->blocked, &uc->uc_sigmask, sig_size); +- sigdelsetmask(¤t->blocked, ~_BLOCKABLE); +- recalc_sigpending(); +- spin_unlock_irq(¤t->sighand->siglock); +- copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext, +- &signal_frame_si.common.arch); +- return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); +-} +- + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +Index: linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/uaccess-skas.h 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h 2005-04-05 12:40:36.037909576 +0800 +@@ -7,6 +7,51 @@ + #define __SKAS_UACCESS_H + + #include "asm/errno.h" ++#include "asm/fixmap.h" ++ ++#define access_ok_skas(type, addr, size) \ ++ ((segment_eq(get_fs(), KERNEL_DS)) || \ ++ (((unsigned long) (addr) < TASK_SIZE) && \ ++ ((unsigned long) (addr) + (size) <= TASK_SIZE)) || \ ++ ((type == VERIFY_READ ) && \ ++ ((unsigned long) (addr) >= FIXADDR_USER_START) && \ ++ ((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \ ++ ((unsigned long) (addr) + (size) >= (unsigned long)(addr)))) ++ ++static inline int verify_area_skas(int type, const void * addr, ++ unsigned long size) ++{ ++ return(access_ok_skas(type, addr, size) ? 0 : -EFAULT); ++} ++ ++extern int copy_from_user_skas(void *to, const void *from, int n); ++extern int copy_to_user_skas(void *to, const void *from, int n); ++extern int strncpy_from_user_skas(char *dst, const char *src, int count); ++extern int __clear_user_skas(void *mem, int len); ++extern int clear_user_skas(void *mem, int len); ++extern int strnlen_user_skas(const void *str, int len); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_UACCESS_H ++#define __SKAS_UACCESS_H ++ ++#include "asm/errno.h" + + #define access_ok_skas(type, addr, size) \ + ((segment_eq(get_fs(), KERNEL_DS)) || \ +Index: linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/mmu-skas.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h 2005-04-05 12:40:36.035909880 +0800 +@@ -22,3 +22,27 @@ + * c-file-style: "linux" + * End: + */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_MMU_H ++#define __SKAS_MMU_H ++ ++struct mmu_context_skas { ++ int mm_fd; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/mode-skas.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h 2005-04-05 12:40:36.036909728 +0800 +@@ -14,6 +14,40 @@ + extern int have_fpx_regs; + + extern void user_time_init_skas(void); ++extern void sig_handler_common_skas(int sig, void *sc_ptr); ++extern void halt_skas(void); ++extern void reboot_skas(void); ++extern void kill_off_processes_skas(void); ++extern int is_skas_winch(int pid, int fd, void *data); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_SKAS_H__ ++#define __MODE_SKAS_H__ ++ ++#include ++ ++extern unsigned long exec_regs[]; ++extern unsigned long exec_fp_regs[]; ++extern unsigned long exec_fpx_regs[]; ++extern int have_fpx_regs; ++ ++extern void user_time_init_skas(void); + extern int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs, + void *from_ptr); + extern int copy_sc_to_user_skas(int pid, void *to_ptr, void *fp, +Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/sigcontext.c 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,114 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#include +-#include +-#include +-#include +-#include "sysdep/ptrace.h" +-#include "sysdep/ptrace_user.h" +-#include "kern_util.h" +-#include "user.h" +-#include "sigcontext.h" +-#include "mode.h" +- +-int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs, void *from_ptr) +-{ +- struct sigcontext sc, *from = from_ptr; +- unsigned long fpregs[FP_FRAME_SIZE]; +- int err; +- +- err = copy_from_user_proc(&sc, from, sizeof(sc)); +- err |= copy_from_user_proc(fpregs, sc.fpstate, sizeof(fpregs)); +- if(err) +- return(err); +- +- regs->skas.regs[GS] = sc.gs; +- regs->skas.regs[FS] = sc.fs; +- regs->skas.regs[ES] = sc.es; +- regs->skas.regs[DS] = sc.ds; +- regs->skas.regs[EDI] = sc.edi; +- regs->skas.regs[ESI] = sc.esi; +- regs->skas.regs[EBP] = sc.ebp; +- regs->skas.regs[UESP] = sc.esp; +- regs->skas.regs[EBX] = sc.ebx; +- regs->skas.regs[EDX] = sc.edx; +- regs->skas.regs[ECX] = sc.ecx; +- regs->skas.regs[EAX] = sc.eax; +- regs->skas.regs[EIP] = sc.eip; +- regs->skas.regs[CS] = sc.cs; +- regs->skas.regs[EFL] = sc.eflags; +- regs->skas.regs[SS] = sc.ss; +- regs->skas.fault_addr = sc.cr2; +- regs->skas.fault_type = FAULT_WRITE(sc.err); +- regs->skas.trap_type = sc.trapno; +- +- err = ptrace(PTRACE_SETFPREGS, pid, 0, fpregs); +- if(err < 0){ +- printk("copy_sc_to_user - PTRACE_SETFPREGS failed, " +- "errno = %d\n", errno); +- return(1); +- } +- +- return(0); +-} +- +-int copy_sc_to_user_skas(int 
pid, void *to_ptr, void *fp, +- union uml_pt_regs *regs, unsigned long fault_addr, +- int fault_type) +-{ +- struct sigcontext sc, *to = to_ptr; +- struct _fpstate *to_fp; +- unsigned long fpregs[FP_FRAME_SIZE]; +- int err; +- +- sc.gs = regs->skas.regs[GS]; +- sc.fs = regs->skas.regs[FS]; +- sc.es = regs->skas.regs[ES]; +- sc.ds = regs->skas.regs[DS]; +- sc.edi = regs->skas.regs[EDI]; +- sc.esi = regs->skas.regs[ESI]; +- sc.ebp = regs->skas.regs[EBP]; +- sc.esp = regs->skas.regs[UESP]; +- sc.ebx = regs->skas.regs[EBX]; +- sc.edx = regs->skas.regs[EDX]; +- sc.ecx = regs->skas.regs[ECX]; +- sc.eax = regs->skas.regs[EAX]; +- sc.eip = regs->skas.regs[EIP]; +- sc.cs = regs->skas.regs[CS]; +- sc.eflags = regs->skas.regs[EFL]; +- sc.esp_at_signal = regs->skas.regs[UESP]; +- sc.ss = regs->skas.regs[SS]; +- sc.cr2 = fault_addr; +- sc.err = TO_SC_ERR(fault_type); +- sc.trapno = regs->skas.trap_type; +- +- err = ptrace(PTRACE_GETFPREGS, pid, 0, fpregs); +- if(err < 0){ +- printk("copy_sc_to_user - PTRACE_GETFPREGS failed, " +- "errno = %d\n", errno); +- return(1); +- } +- to_fp = (struct _fpstate *) +- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to))); +- sc.fpstate = to_fp; +- +- if(err) +- return(err); +- +- return(copy_to_user_proc(to, &sc, sizeof(sc)) || +- copy_to_user_proc(to_fp, fpregs, sizeof(fpregs))); +-} +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/Makefile 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800 +@@ -1,12 +0,0 @@ +-# +-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +-# Licensed under the GPL +-# +- +-obj-y = sigcontext.o +- +-USER_OBJS = sigcontext.o +-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +- +-$(USER_OBJS) : %.o: %.c +- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/kernel/skas/process.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/process.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/process.c 2005-04-05 12:40:36.030910640 +0800 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -60,15 +61,10 @@ + /*To use the same value of using_sysemu as the caller, ask it that value (in local_using_sysemu)*/ + static void handle_trap(int pid, union uml_pt_regs *regs, int local_using_sysemu) + { +- int err, syscall_nr, status; +- +- syscall_nr = PT_SYSCALL_NR(regs->skas.regs); +- UPT_SYSCALL_NR(regs) = syscall_nr; +- if(syscall_nr < 0){ +- relay_signal(SIGTRAP, regs); +- return; +- } + ++ int err, status; ++ ++ UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->skas.regs); /* Mark this as a syscall */ + if (!local_using_sysemu) + { + err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); +@@ -82,7 +78,8 @@ + "errno = %d\n", errno); + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED)); +- if((err < 0) || !WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) ++ if((err < 0) || 
!WIFSTOPPED(status) ||
++	   (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP)))
+ 		panic("handle_trap - failed to wait at end of syscall, "
+ 		      "errno = %d, status = %d\n", errno, status);
+ 	}
+@@ -131,6 +128,10 @@
+ 		panic("start_userspace : expected SIGSTOP, got status = %d",
+ 		      status);
+ 
++	if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++		panic("start_userspace : PTRACE_SETOPTIONS failed, errno=%d\n",
++		      errno);
++
+ 	if(munmap(stack, PAGE_SIZE) < 0)
+ 		panic("start_userspace : munmap failed, errno = %d\n", errno);
+ 
+@@ -160,15 +161,19 @@
+ 
+ 		regs->skas.is_user = 1;
+ 		save_registers(regs);
++		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+ 
+ 		if(WIFSTOPPED(status)){
+ 			switch(WSTOPSIG(status)){
+ 			case SIGSEGV:
+ 				handle_segv(pid);
+ 				break;
+-			case SIGTRAP:
++			case (SIGTRAP|SYSCALL_TRAP):
+ 				handle_trap(pid, regs, local_using_sysemu);
+ 				break;
++			case SIGTRAP:
++				relay_signal(SIGTRAP, regs);
++				break;
+ 			case SIGIO:
+ 			case SIGVTALRM:
+ 			case SIGILL:
+@@ -222,9 +227,10 @@
+ 	block_signals();
+ 	if(sigsetjmp(fork_buf, 1) == 0)
+ 		new_thread_proc(stack, handler);
+-	set_signals(flags);
+ 
+ 	remove_sigstack();
++
++	set_signals(flags);
+ }
+ 
+ void thread_wait(void *sw, void *fb)
+Index: linux-2.6.10/arch/um/kernel/skas/process_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/process_kern.c	2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/process_kern.c	2005-04-05 12:40:36.032910336 +0800
+@@ -19,7 +19,6 @@
+ #include "os.h"
+ #include "user_util.h"
+ #include "tlb.h"
+-#include "frame.h"
+ #include "kern.h"
+ #include "mode.h"
+ #include "proc_mm.h"
+@@ -183,7 +182,6 @@
+ int start_uml_skas(void)
+ {
+ 	start_userspace(0);
+-	capture_signal_stack();
+ 
+ 	init_new_thread_signals(1);
+ 	uml_idle_timer();
+Index: linux-2.6.10/arch/um/kernel/skas/syscall_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/syscall_kern.c	2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/syscall_kern.c	2005-04-05 12:40:36.034910032 +0800
+@@ -6,6 +6,7 @@
+ #include "linux/sys.h"
+ #include "linux/ptrace.h"
+ #include "asm/errno.h"
++#include "linux/ptrace.h"
+ #include "asm/unistd.h"
+ #include "asm/ptrace.h"
+ #include "asm/current.h"
+Index: linux-2.6.10/arch/um/kernel/skas/trap_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/trap_user.c	2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/trap_user.c	2005-04-05 12:40:36.033910184 +0800
+@@ -21,6 +21,14 @@
+ 	int save_errno = errno;
+ 	int save_user;
+ 
++	/* This is done to allow SIGSEGV to be delivered inside a SEGV
++	 * handler. This can happen in copy_user, and if SEGV is disabled,
++	 * the process will die.
++ * XXX Figure out why this is better than SA_NODEFER ++ */ ++ if(sig == SIGSEGV) ++ change_sig(SIGSEGV, 1); ++ + r = &TASK_REGS(get_current())->skas; + save_user = r->is_user; + r->is_user = 0; +Index: linux-2.6.10/arch/um/kernel/skas/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/Makefile 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/Makefile 2005-04-05 12:40:36.034910032 +0800 +@@ -4,8 +4,7 @@ + # + + obj-y := exec_kern.o mem.o mem_user.o mmu.o process.o process_kern.o \ +- syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o \ +- sys-$(SUBARCH)/ ++ syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o + + subdir-y := util + +Index: linux-2.6.10/arch/um/kernel/helper.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/helper.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/helper.c 2005-04-05 12:40:36.027911096 +0800 +@@ -49,14 +49,14 @@ + return(0); + } + +-/* XXX The alloc_stack here breaks if this is called in the tracing thread */ +- ++/* Returns either the pid of the child process we run or -E* on failure. ++ * XXX The alloc_stack here breaks if this is called in the tracing thread */ + int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv, + unsigned long *stack_out) + { + struct helper_data data; + unsigned long stack, sp; +- int pid, fds[2], err, n; ++ int pid, fds[2], ret, n; + + if((stack_out != NULL) && (*stack_out != 0)) + stack = *stack_out; +@@ -64,16 +64,16 @@ + if(stack == 0) + return(-ENOMEM); + +- err = os_pipe(fds, 1, 0); +- if(err < 0){ +- printk("run_helper : pipe failed, err = %d\n", -err); ++ ret = os_pipe(fds, 1, 0); ++ if(ret < 0){ ++ printk("run_helper : pipe failed, ret = %d\n", -ret); + goto out_free; + } + +- err = os_set_exec_close(fds[1], 1); +- if(err < 0){ +- printk("run_helper : setting FD_CLOEXEC failed, err = %d\n", +- -err); ++ ret = os_set_exec_close(fds[1], 1); ++ if(ret < 0){ ++ printk("run_helper : setting FD_CLOEXEC failed, ret = %d\n", ++ -ret); + goto out_close; + } + +@@ -85,34 +85,36 @@ + pid = clone(helper_child, (void *) sp, CLONE_VM | SIGCHLD, &data); + if(pid < 0){ + printk("run_helper : clone failed, errno = %d\n", errno); +- err = -errno; ++ ret = -errno; + goto out_close; + } + + os_close_file(fds[1]); +- n = os_read_file(fds[0], &err, sizeof(err)); ++ fds[1] = -1; ++ ++ /*Read the errno value from the child.*/ ++ n = os_read_file(fds[0], &ret, sizeof(ret)); + if(n < 0){ +- printk("run_helper : read on pipe failed, err = %d\n", -n); +- err = n; +- goto out_kill; ++ printk("run_helper : read on pipe failed, ret = %d\n", -n); ++ ret = n; ++ os_kill_process(pid, 1); + } + else if(n != 0){ + CATCH_EINTR(n = waitpid(pid, NULL, 0)); +- pid = -errno; ++ ret = -errno; ++ } else { ++ ret = pid; + } + +- if(stack_out == NULL) free_stack(stack, 0); +- else *stack_out = stack; +- return(pid); +- +- out_kill: +- os_kill_process(pid, 1); + out_close: ++ if (fds[1] != -1) ++ os_close_file(fds[1]); + os_close_file(fds[0]); +- os_close_file(fds[1]); + out_free: +- free_stack(stack, 0); +- return(err); ++ if(stack_out == NULL) ++ free_stack(stack, 0); ++ else *stack_out = stack; ++ return(ret); + } + + int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, +Index: linux-2.6.10/arch/um/kernel/time_kern.c +=================================================================== +--- 
linux-2.6.10.orig/arch/um/kernel/time_kern.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/time_kern.c 2005-04-05 12:40:36.027911096 +0800 +@@ -170,7 +170,7 @@ + void timer_handler(int sig, union uml_pt_regs *regs) + { + local_irq_disable(); +- update_process_times(user_context(UPT_SP(regs))); ++ update_process_times(CHOOSE_MODE(user_context(UPT_SP(regs)), (regs)->skas.is_user)); + local_irq_enable(); + if(current_thread->cpu == 0) + timer_irq(regs); +Index: linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/include/mode-tt.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h 2005-04-05 12:40:36.042908816 +0800 +@@ -14,6 +14,41 @@ + + extern int tracer(int (*init_proc)(void *), void *sp); + extern void user_time_init_tt(void); ++extern void sig_handler_common_tt(int sig, void *sc); ++extern void syscall_handler_tt(int sig, union uml_pt_regs *regs); ++extern void reboot_tt(void); ++extern void halt_tt(void); ++extern int is_tracer_winch(int pid, int fd, void *data); ++extern void kill_off_processes_tt(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_TT_H__ ++#define __MODE_TT_H__ ++ ++#include "sysdep/ptrace.h" ++ ++enum { OP_NONE, OP_EXEC, OP_FORK, OP_TRACE_ON, OP_REBOOT, OP_HALT, OP_CB }; ++ ++extern int tracing_pid; ++ ++extern int tracer(int (*init_proc)(void *), void *sp); ++extern void user_time_init_tt(void); + extern int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data); + extern int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, + void *data); +Index: linux-2.6.10/arch/um/kernel/tt/include/tt.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/include/tt.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/include/tt.h 2005-04-05 12:40:36.043908664 +0800 +@@ -26,7 +26,8 @@ + extern int is_tracing(void *task); + extern void syscall_handler(int sig, union uml_pt_regs *regs); + extern void exit_kernel(int pid, void *task); +-extern int do_syscall(void *task, int pid, int local_using_sysemu); ++extern void do_syscall(void *task, int pid, int local_using_sysemu); ++extern void do_sigtrap(void *task); + extern int is_valid_pid(int pid); + extern void remap_data(void *segment_start, void *segment_end, int w); + +Index: linux-2.6.10/arch/um/kernel/tt/exec_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/exec_user.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/exec_user.c 2005-04-05 12:40:36.039909272 +0800 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include "user_util.h" + #include "kern_util.h" +@@ -35,7 +36,10 @@ + tracer_panic("do_exec failed to get registers - errno = %d", + errno); + +- kill(old_pid, SIGKILL); ++ os_kill_ptraced_process(old_pid, 0); ++ ++ if (ptrace(PTRACE_OLDSETOPTIONS, new_pid, 0, (void 
*)PTRACE_O_TRACESYSGOOD) < 0) ++ tracer_panic("do_exec: PTRACE_SETOPTIONS failed, errno = %d", errno); + + if(ptrace_setregs(new_pid, regs) < 0) + tracer_panic("do_exec failed to start new proc - errno = %d", +Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/sigcontext.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,60 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#include +-#include +-#include "kern_util.h" +-#include "sysdep/frame.h" +- +-int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data) +-{ +- struct arch_frame_data *arch = data; +- struct sigcontext *to = to_ptr, *from = from_ptr; +- struct _fpstate *to_fp, *from_fp; +- unsigned long sigs; +- int err; +- +- to_fp = to->fpstate; +- from_fp = from->fpstate; +- sigs = to->oldmask; +- err = copy_from_user_proc(to, from, sizeof(*to)); +- to->oldmask = sigs; +- if(to_fp != NULL){ +- err |= copy_from_user_proc(&to->fpstate, &to_fp, +- sizeof(to->fpstate)); +- err |= copy_from_user_proc(to_fp, from_fp, arch->fpstate_size); +- } +- return(err); +-} +- +-int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, void *data) +-{ +- struct arch_frame_data *arch = data; +- struct sigcontext *to = to_ptr, *from = from_ptr; +- struct _fpstate *to_fp, *from_fp; +- int err; +- +- to_fp = (struct _fpstate *) +- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to))); +- from_fp = from->fpstate; +- err = copy_to_user_proc(to, from, sizeof(*to)); +- if(from_fp != NULL){ +- err |= copy_to_user_proc(&to->fpstate, &to_fp, +- sizeof(to->fpstate)); +- err |= copy_to_user_proc(to_fp, from_fp, arch->fpstate_size); +- } +- return(err); +-} +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. 
+- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/Makefile 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800 +@@ -1,12 +0,0 @@ +-# +-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +-# Licensed under the GPL +-# +- +-obj-y = sigcontext.o +- +-USER_OBJS = sigcontext.o +-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +- +-$(USER_OBJS) : %.o: %.c +- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/kernel/tt/syscall_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/syscall_user.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/syscall_user.c 2005-04-05 12:40:36.037909576 +0800 +@@ -42,37 +42,31 @@ + syscall_trace(regs, 1); + record_syscall_end(index, result); + } +- +-int do_syscall(void *task, int pid, int local_using_sysemu) +-{ +- unsigned long proc_regs[FRAME_SIZE]; +- union uml_pt_regs *regs; +- int syscall; +- +- if(ptrace_getregs(pid, proc_regs) < 0) +- tracer_panic("Couldn't read registers"); +- syscall = PT_SYSCALL_NR(proc_regs); +- +- regs = TASK_REGS(task); +- UPT_SYSCALL_NR(regs) = syscall; +- +- if(syscall < 0) +- return(0); +- +- if((syscall != __NR_sigreturn) && +- ((unsigned long *) PT_IP(proc_regs) >= &_stext) && +- ((unsigned long *) PT_IP(proc_regs) <= &_etext)) +- tracer_panic("I'm tracing myself and I can't get out"); +- +- if(local_using_sysemu) +- return(1); +- +- if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, +- __NR_getpid) < 0) +- tracer_panic("do_syscall : Nullifying syscall failed, " +- "errno = %d", errno); +- return(1); +-} ++ ++ void do_sigtrap(void *task) ++ { ++ UPT_SYSCALL_NR(TASK_REGS(task)) = -1; ++ } ++ ++ void do_syscall(void *task, int pid, int local_using_sysemu) ++ { ++ unsigned long proc_regs[FRAME_SIZE]; ++ ++ if(ptrace_getregs(pid, proc_regs) < 0) ++ tracer_panic("Couldn't read registers"); ++ ++ UPT_SYSCALL_NR(TASK_REGS(task)) = PT_SYSCALL_NR(proc_regs); ++ ++ if(((unsigned long *) PT_IP(proc_regs) >= &_stext) && ++ ((unsigned long *) PT_IP(proc_regs) <= &_etext)) ++ tracer_panic("I'm tracing myself and I can't get out"); ++ ++ /* syscall number -1 in sysemu skips syscall restarting in host */ ++ if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, ++ local_using_sysemu ? -1 : __NR_getpid) < 0) ++ tracer_panic("do_syscall : Nullifying syscall failed, " ++ "errno = %d", errno); ++ } + + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
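
The do_syscall() rework above nullifies an intercepted syscall via PTRACE_POKEUSER (or passes -1 under sysemu to skip host-side restarting), and the surrounding hunks all key on WSTOPSIG(status) == (SIGTRAP|SYSCALL_TRAP). That 0x80 convention is the host kernel's PTRACE_O_TRACESYSGOOD option. The following standalone host-side sketch is illustrative only, not part of the patch; SYSCALL_TRAP mirrors the value the patch #defines in signal_user.h, and everything else is the standard ptrace(2) API:

/* Illustrative only -- not part of the FC3 patch. Shows the host-side
 * PTRACE_O_TRACESYSGOOD convention the hunks above depend on: once the
 * option is set, a syscall stop reports WSTOPSIG == (SIGTRAP | 0x80),
 * so a tracer can tell syscall traps from ordinary SIGTRAPs.
 */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

#define SYSCALL_TRAP 0x80	/* same value the patch adds to signal_user.h */

int main(void)
{
	int status;
	pid_t pid = fork();

	if(pid == 0){
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* wait for the tracer to set options */
		getpid();		/* one traced syscall */
		_exit(0);
	}

	waitpid(pid, &status, 0);	/* the SIGSTOP raised above */
	ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *) PTRACE_O_TRACESYSGOOD);

	while(1){
		if(ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0)
			break;
		if((waitpid(pid, &status, 0) < 0) || WIFEXITED(status))
			break;
		if(WIFSTOPPED(status) &&
		   (WSTOPSIG(status) == (SIGTRAP | SYSCALL_TRAP)))
			printf("syscall stop\n");	/* entry or exit */
		else
			printf("other stop, signal %d\n", WSTOPSIG(status));
	}
	return 0;
}

On old hosts without PTRACE_O_TRACESYSGOOD the PTRACE_SETOPTIONS call simply fails and a tracer sees plain SIGTRAP stops, which is why the patch probes the option at boot and keeps the PTRACE_OLDSETOPTIONS fallback described in ptrace_user.h.
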
+Index: linux-2.6.10/arch/um/kernel/tt/tracer.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/tracer.c 2005-04-01 01:16:47.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/tracer.c 2005-04-05 12:40:36.041908968 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include "user.h" +@@ -25,7 +26,6 @@ + #include "mem_user.h" + #include "process.h" + #include "kern_util.h" +-#include "frame.h" + #include "chan_user.h" + #include "ptrace_user.h" + #include "mode.h" +@@ -72,6 +72,8 @@ + (ptrace(PTRACE_CONT, pid, 0, 0) < 0)) + tracer_panic("OP_FORK failed to attach pid"); + wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL); ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) ++ tracer_panic("OP_FORK: PTRACE_SETOPTIONS failed, errno = %d", errno); + if(ptrace(PTRACE_CONT, pid, 0, 0) < 0) + tracer_panic("OP_FORK failed to continue process"); + } +@@ -141,7 +143,7 @@ + * any more, the trace of those will land here. So, we need to just + * PTRACE_SYSCALL it. + */ +- case SIGTRAP: ++ case (SIGTRAP|SYSCALL_TRAP): + if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) + tracer_panic("sleeping_process_signal : Failed to " + "PTRACE_SYSCALL pid %d, errno = %d\n", +@@ -184,9 +186,8 @@ + unsigned long eip = 0; + int status, pid = 0, sig = 0, cont_type, tracing = 0, op = 0; + int last_index, proc_id = 0, n, err, old_tracing = 0, strace = 0; +- int pt_syscall_parm, local_using_sysemu; ++ int pt_syscall_parm, local_using_sysemu = 0; + +- capture_signal_stack(); + signal(SIGPIPE, SIG_IGN); + setup_tracer_winch(); + tracing_pid = os_getpid(); +@@ -198,6 +199,10 @@ + printf("waitpid on idle thread failed, errno = %d\n", errno); + exit(1); + } ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) { ++ printf("Failed to PTRACE_SETOPTIONS for idle thread, errno = %d\n", errno); ++ exit(1); ++ } + if((ptrace(PTRACE_CONT, pid, 0, 0) < 0)){ + printf("Failed to continue idle thread, errno = %d\n", errno); + exit(1); +@@ -315,7 +320,8 @@ + task = cpu_tasks[proc_id].task; + tracing = is_tracing(task); + old_tracing = tracing; +- ++ if ( tracing ) /* Assume: no syscall, when coming from user */ ++ do_sigtrap(task); + local_using_sysemu = get_using_sysemu(); + pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL; + +@@ -324,6 +330,15 @@ + sig = 0; + op = do_proc_op(task, proc_id); + switch(op){ ++ /* ++ * This is called when entering user mode; after ++ * this, we start intercepting syscalls. ++ * ++ * In fact, a process is started in kernel mode, ++ * so with is_tracing() == 0 (and that is reset ++ * when executing syscalls, since UML kernel has ++ * the right to do syscalls); ++ */ + case OP_TRACE_ON: + arch_leave_kernel(task, pid); + tracing = 1; +@@ -332,7 +347,13 @@ + case OP_HALT: + unmap_physmem(); + kmalloc_ok = 0; +- ptrace(PTRACE_KILL, pid, 0, 0); ++ os_kill_ptraced_process(pid, 0); ++ /* Now let's reap remaining zombies */ ++ errno = 0; ++ do { ++ waitpid(-1, &status, ++ WUNTRACED); ++ } while (errno != ECHILD); + return(op == OP_REBOOT); + case OP_NONE: + printf("Detaching pid %d\n", pid); +@@ -346,14 +367,26 @@ + */ + pid = cpu_tasks[proc_id].pid; + break; ++ case (SIGTRAP|SYSCALL_TRAP): ++ if(!tracing && (debugger_pid != -1)){ ++ child_signal(pid, W_STOPCODE(SIGTRAP)); ++ continue; ++ } ++ tracing = 0; ++ /* local_using_sysemu has been already set ++ * below, since if we are here, is_tracing() on ++ * the traced task was 1, i.e. 
the process had ++ * already run through one iteration of the ++ * loop which executed a OP_TRACE_ON request.*/ ++ do_syscall(task, pid, local_using_sysemu); ++ sig = SIGUSR2; ++ break; + case SIGTRAP: + if(!tracing && (debugger_pid != -1)){ + child_signal(pid, status); + continue; + } + tracing = 0; +- if(do_syscall(task, pid, local_using_sysemu)) +- sig = SIGUSR2; + break; + case SIGPROF: + if(tracing) sig = 0; +@@ -389,6 +422,9 @@ + continue; + } + ++ local_using_sysemu = get_using_sysemu(); ++ pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL; ++ + if(tracing){ + if(singlestepping(task)) + cont_type = PTRACE_SINGLESTEP; +Index: linux-2.6.10/arch/um/kernel/tt/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/Makefile 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/Makefile 2005-04-05 12:40:36.041908968 +0800 +@@ -8,7 +8,7 @@ + + obj-y = exec_kern.o exec_user.o gdb.o ksyms.o mem.o mem_user.o process_kern.o \ + syscall_kern.o syscall_user.o time.o tlb.o tracer.o trap_user.o \ +- uaccess.o uaccess_user.o sys-$(SUBARCH)/ ++ uaccess.o uaccess_user.o + + obj-$(CONFIG_PT_PROXY) += gdb_kern.o ptproxy/ + +Index: linux-2.6.10/arch/um/kernel/trap_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/trap_user.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/trap_user.c 2005-04-05 12:40:36.047908056 +0800 +@@ -18,7 +18,6 @@ + #include "sigcontext.h" + #include "sysdep/sigcontext.h" + #include "irq_user.h" +-#include "frame_user.h" + #include "signal_user.h" + #include "time_user.h" + #include "task.h" +Index: linux-2.6.10/arch/um/kernel/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/Makefile 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/Makefile 2005-04-05 12:40:36.051907448 +0800 +@@ -6,7 +6,7 @@ + extra-y := vmlinux.lds + clean-files := vmlinux.lds.S + +-obj-y = checksum.o config.o exec_kern.o exitcode.o frame_kern.o frame.o \ ++obj-y = checksum.o config.o exec_kern.o exitcode.o \ + helper.o init_task.o irq.o irq_user.o ksyms.o main.o mem.o mem_user.o \ + physmem.o process.o process_kern.o ptrace.o reboot.o resource.o \ + sigio_user.o sigio_kern.o signal_kern.o signal_user.o smp.o \ +Index: linux-2.6.10/arch/um/kernel/mem.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/mem.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/mem.c 2005-04-05 12:40:36.029910792 +0800 +@@ -175,6 +175,30 @@ + } + #endif /* CONFIG_HIGHMEM */ + ++static void __init fixaddr_user_init( void) ++{ ++ long size = FIXADDR_USER_END - FIXADDR_USER_START; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long paddr, vaddr = FIXADDR_USER_START; ++ ++ if ( ! 
size ) ++ return; ++ ++ fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); ++ paddr = (unsigned long)alloc_bootmem_low_pages( size); ++ memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size); ++ paddr = __pa(paddr); ++ for ( ; size > 0; size-=PAGE_SIZE, vaddr+=PAGE_SIZE, paddr+=PAGE_SIZE) { ++ pgd = swapper_pg_dir + pgd_index(vaddr); ++ pmd = pmd_offset(pgd, vaddr); ++ pte = pte_offset_kernel(pmd, vaddr); ++ /*pte_set_val( (*pte), paddr, PAGE_READONLY);*/ ++ pte_val(*pte) = paddr | pgprot_val(PAGE_READONLY); ++ } ++} ++ + void paging_init(void) + { + unsigned long zones_size[MAX_NR_ZONES], vaddr; +@@ -195,6 +219,8 @@ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); + ++ fixaddr_user_init(); ++ + #ifdef CONFIG_HIGHMEM + init_highmem(); + #endif +Index: linux-2.6.10/arch/um/os-Linux/user_syms.c +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/user_syms.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/um/os-Linux/user_syms.c 2005-04-05 12:40:36.019912312 +0800 +@@ -26,6 +26,9 @@ + + EXPORT_SYMBOL(strstr); + ++EXPORT_SYMBOL(vsyscall_ehdr); ++EXPORT_SYMBOL(vsyscall_end); ++ + /* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms. + * However, the modules will use the CRC defined *here*, no matter if it is + * good; so the versions of these symbols will always match +Index: linux-2.6.10/arch/um/os-Linux/elf_aux.c +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/elf_aux.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/os-Linux/elf_aux.c 2005-04-05 12:40:36.018912464 +0800 +@@ -0,0 +1,67 @@ ++/* ++ * arch/um/kernel/elf_aux.c ++ * ++ * Scan the Elf auxiliary vector provided by the host to extract ++ * information about vsyscall-page, etc. ++ * ++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH ++ * Author: Bodo Stroesser (bodo.stroesser@fujitsu-siemens.com) ++ */ ++#include ++#include ++#include "init.h" ++#include "elf_user.h" ++ ++#if ELF_CLASS == ELFCLASS32 ++typedef Elf32_auxv_t elf_auxv_t; ++#else ++typedef Elf64_auxv_t elf_auxv_t; ++#endif ++ ++char * elf_aux_platform; ++long elf_aux_hwcap; ++ ++unsigned long vsyscall_ehdr; ++unsigned long vsyscall_end; ++ ++unsigned long __kernel_vsyscall; ++ ++ ++__init void scan_elf_aux( char **envp) ++{ ++ long page_size = 0; ++ elf_auxv_t * auxv; ++ ++ while ( *envp++ != NULL) ; ++ ++ for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { ++ switch ( auxv->a_type ) { ++ case AT_SYSINFO: ++ __kernel_vsyscall = auxv->a_un.a_val; ++ break; ++ case AT_SYSINFO_EHDR: ++ vsyscall_ehdr = auxv->a_un.a_val; ++ break; ++ case AT_HWCAP: ++ elf_aux_hwcap = auxv->a_un.a_val; ++ break; ++ case AT_PLATFORM: ++ elf_aux_platform = auxv->a_un.a_ptr; ++ break; ++ case AT_PAGESZ: ++ page_size = auxv->a_un.a_val; ++ break; ++ } ++ } ++ if ( ! __kernel_vsyscall || ! vsyscall_ehdr || ++ ! elf_aux_hwcap || ! elf_aux_platform || ++ ! 
page_size || (vsyscall_ehdr % page_size) ) { ++ __kernel_vsyscall = 0; ++ vsyscall_ehdr = 0; ++ elf_aux_hwcap = 0; ++ elf_aux_platform = "i586"; ++ } ++ else { ++ vsyscall_end = vsyscall_ehdr + page_size; ++ } ++} +Index: linux-2.6.10/arch/um/os-Linux/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/os-Linux/Makefile 2005-04-05 12:40:36.019912312 +0800 +@@ -3,9 +3,9 @@ + # Licensed under the GPL + # + +-obj-y = file.o process.o time.o tty.o user_syms.o drivers/ ++obj-y = elf_aux.o file.o process.o time.o tty.o user_syms.o drivers/ + +-USER_OBJS := $(foreach file,file.o process.o time.o tty.o,$(obj)/$(file)) ++USER_OBJS := $(foreach file,elf_aux.o file.o process.o time.o tty.o,$(obj)/$(file)) + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/drivers/net_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/net_kern.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/net_kern.c 2005-04-05 12:40:36.016912768 +0800 +@@ -126,10 +126,6 @@ + lp->tl.data = (unsigned long) &lp->user; + netif_start_queue(dev); + +- spin_lock(&opened_lock); +- list_add(&lp->list, &opened); +- spin_unlock(&opened_lock); +- + /* clear buffer - it can happen that the host side of the interface + * is full when we get here. In this case, new data is never queued, + * SIGIOs never arrive, and the net never works. +@@ -152,9 +148,6 @@ + free_irq(dev->irq, dev); + if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); + lp->fd = -1; +- spin_lock(&opened_lock); +- list_del(&lp->list); +- spin_unlock(&opened_lock); + + spin_unlock(&lp->lock); + return 0; +@@ -397,6 +390,11 @@ + + if (device->have_mac) + set_ether_mac(dev, device->mac); ++ ++ spin_lock(&opened_lock); ++ list_add(&lp->list, &opened); ++ spin_unlock(&opened_lock); ++ + return(0); + } + +@@ -705,7 +703,7 @@ + static void close_devices(void) + { + struct list_head *ele; +- struct uml_net_private *lp; ++ struct uml_net_private *lp; + + list_for_each(ele, &opened){ + lp = list_entry(ele, struct uml_net_private, list); +Index: linux-2.6.10/arch/um/drivers/mconsole_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/mconsole_kern.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/mconsole_kern.c 2005-04-05 12:40:36.015912920 +0800 +@@ -204,6 +204,68 @@ + } + #endif + ++/* This is a more convoluted version of mconsole_proc, which has some stability ++ * problems; however, we need it fixed, because it is expected that UML users ++ * mount HPPFS instead of procfs on /proc. 
And we want mconsole_proc to still ++ * show the real procfs content, not the ones from hppfs.*/ ++#if 0 ++void mconsole_proc(struct mc_request *req) ++{ ++ char path[64]; ++ char *buf; ++ int len; ++ int fd; ++ int first_chunk = 1; ++ char *ptr = req->request.data; ++ ++ ptr += strlen("proc"); ++ while(isspace(*ptr)) ptr++; ++ snprintf(path, sizeof(path), "/proc/%s", ptr); ++ ++ fd = sys_open(path, 0, 0); ++ if (fd < 0) { ++ mconsole_reply(req, "Failed to open file", 1, 0); ++ printk("open %s: %d\n",path,fd); ++ goto out; ++ } ++ ++ buf = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if(buf == NULL){ ++ mconsole_reply(req, "Failed to allocate buffer", 1, 0); ++ goto out_close; ++ } ++ ++ for (;;) { ++ len = sys_read(fd, buf, PAGE_SIZE-1); ++ if (len < 0) { ++ mconsole_reply(req, "Read of file failed", 1, 0); ++ goto out_free; ++ } ++ /*Begin the file content on his own line.*/ ++ if (first_chunk) { ++ mconsole_reply(req, "\n", 0, 1); ++ first_chunk = 0; ++ } ++ if (len == PAGE_SIZE-1) { ++ buf[len] = '\0'; ++ mconsole_reply(req, buf, 0, 1); ++ } else { ++ buf[len] = '\0'; ++ mconsole_reply(req, buf, 0, 0); ++ break; ++ } ++ } ++ /*END*/ ++ ++ out_free: ++ kfree(buf); ++ out_close: ++ sys_close(fd); ++ out: ++ /* nothing */; ++} ++#endif ++ + void mconsole_proc(struct mc_request *req) + { + char path[64]; +Index: linux-2.6.10/arch/um/drivers/net_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/net_user.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/net_user.c 2005-04-05 12:40:36.017912616 +0800 +@@ -173,10 +173,12 @@ + pe_data.stdout = fds[1]; + pid = run_helper(change_pre_exec, &pe_data, argv, NULL); + +- os_close_file(fds[1]); + read_output(fds[0], output, output_len); ++ os_close_file(fds[0]); ++ os_close_file(fds[1]); + +- CATCH_EINTR(err = waitpid(pid, NULL, 0)); ++ if (pid > 0) ++ CATCH_EINTR(err = waitpid(pid, NULL, 0)); + return(pid); + } + +Index: linux-2.6.10/arch/um/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/um/Kconfig 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/um/Kconfig 2005-04-05 12:40:36.053907144 +0800 +@@ -139,6 +139,25 @@ + + It is safe to say 'Y' here. + ++config MAGIC_SYSRQ ++ bool "Magic SysRq key" ++ depends on MCONSOLE ++ ---help--- ++ If you say Y here, you will have some control over the system even ++ if the system crashes for example during kernel debugging (e.g., you ++ will be able to flush the buffer cache to disk, reboot the system ++ immediately or dump some status information). A key for each of the ++ possible requests is provided. ++ ++ This is the feature normally accomplished by pressing a key ++ while holding SysRq (Alt+PrintScreen). ++ ++ On UML, this is accomplished by sending a "sysrq" command with ++ mconsole, followed by the letter for the requested command. ++ ++ The keys are documented in . Don't say Y ++ unless you really know what this hack does. ++ + config HOST_2G_2G + bool "2G/2G host address space split" + default n +@@ -153,28 +172,28 @@ + So, if you do not know what to do here, say 'N'. + + config SMP +- bool "Symmetric multi-processing support (EXPERIMENTAL)" +- default n +- depends on MODE_TT && EXPERIMENTAL +- help +- This option enables UML SMP support. +- It is NOT related to having a real SMP box. Not directly, at least. ++ bool "Symmetric multi-processing support (EXPERIMENTAL)" ++ default n ++ depends on MODE_TT && EXPERIMENTAL ++ help ++ This option enables UML SMP support. 
++ It is NOT related to having a real SMP box. Not directly, at least. ++ ++ UML implements virtual SMP by allowing as many processes to run ++ simultaneously on the host as there are virtual processors configured. ++ ++ Obviously, if the host is a uniprocessor, those processes will ++ timeshare, but, inside UML, will appear to be running simultaneously. ++ If the host is a multiprocessor, then UML processes may run ++ simultaneously, depending on the host scheduler. ++ ++ This, however, is supported only in TT mode. So, if you use the SKAS ++ patch on your host, switching to TT mode and enabling SMP usually gives ++ you worse performances. ++ Also, since the support for SMP has been under-developed, there could ++ be some bugs being exposed by enabling SMP. + +- UML implements virtual SMP by allowing as many processes to run +- simultaneously on the host as there are virtual processors configured. +- +- Obviously, if the host is a uniprocessor, those processes will +- timeshare, but, inside UML, will appear to be running simultaneously. +- If the host is a multiprocessor, then UML processes may run +- simultaneously, depending on the host scheduler. +- +- This, however, is supported only in TT mode. So, if you use the SKAS +- patch on your host, switching to TT mode and enabling SMP usually gives +- you worse performances. +- Also, since the support for SMP has been under-developed, there could +- be some bugs being exposed by enabling SMP. +- +- If you don't know what to do, say N. ++ If you don't know what to do, say N. + + config NR_CPUS + int "Maximum number of CPUs (2-32)" +@@ -282,4 +301,8 @@ + bool + default n + ++config INPUT ++ bool ++ default n ++ + source "arch/um/Kconfig.debug" +Index: linux-2.6.10/arch/um/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/Makefile 2005-04-05 12:40:53.158306880 +0800 +@@ -77,6 +77,8 @@ + echo ' find in the kernel root.' + endef + ++.PHONY: linux ++ + prepare: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) \ + $(ARCH_DIR)/kernel/vmlinux.lds.S + +Index: linux-2.6.10/fs/hostfs/hostfs.h +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs.h 2005-04-05 12:40:36.068904864 +0800 +@@ -16,9 +16,30 @@ + #define HOSTFS_ATTR_CTIME 64 + #define HOSTFS_ATTR_ATIME_SET 128 + #define HOSTFS_ATTR_MTIME_SET 256 ++ ++/* These two are unused by hostfs. */ + #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ + #define HOSTFS_ATTR_ATTR_FLAG 1024 + ++/* If you are very careful, you'll notice that these two are missing: ++ * ++ * #define ATTR_KILL_SUID 2048 ++ * #define ATTR_KILL_SGID 4096 ++ * ++ * and this is because they were added in 2.5 development in this patch: ++ * ++ * http://linux.bkbits.net:8080/linux-2.5/ ++ * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html ++ * |src/.|src/include|src/include/linux|related/include/linux/fs.h ++ * ++ * Actually, they are not needed by most ->setattr() methods - they are set by ++ * callers of notify_change() to notify that the setuid/setgid bits must be ++ * dropped. ++ * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE ++ * is on, and remove the appropriate bits from attr->ia_mode (attr is a ++ * "struct iattr *"). 
-BlaisorBlade ++ */ ++ + struct hostfs_iattr { + unsigned int ia_valid; + mode_t ia_mode; +Index: linux-2.6.10/fs/hostfs/hostfs_kern.c +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs_kern.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs_kern.c 2005-04-05 12:40:36.069904712 +0800 +@@ -393,6 +393,7 @@ + static struct file_operations hostfs_file_fops = { + .llseek = generic_file_llseek, + .read = generic_file_read, ++ .sendfile = generic_file_sendfile, + .write = generic_file_write, + .mmap = generic_file_mmap, + .open = hostfs_file_open, +@@ -818,6 +819,10 @@ + char *name; + int err; + ++ err = inode_change_ok(dentry->d_inode, attr); ++ if (err) ++ return err; ++ + if(append) + attr->ia_valid &= ~ATTR_SIZE; + diff --git a/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..16ae126 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch @@ -0,0 +1,113 @@ +Introduce lock-free versions of d_rehash and d_move. + + fs/dcache.c | 22 ++++++++++++++++++---- + include/linux/dcache.h | 2 ++ + 2 files changed, 20 insertions(+), 4 deletions(-) + +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 19:16:50.807244880 +0800 +@@ -1116,29 +1116,23 @@ + spin_unlock(&dcache_lock); + } + +-static void __d_rehash(struct dentry * entry, struct hlist_head *list) ++void __d_rehash(struct dentry * entry) + { +- +- entry->d_flags &= ~DCACHE_UNHASHED; +- hlist_add_head_rcu(&entry->d_hash, list); ++ struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); ++ ++ spin_lock(&entry->d_lock); ++ entry->d_flags &= ~DCACHE_UNHASHED; ++ hlist_add_head_rcu(&entry->d_hash, list); ++ spin_unlock(&entry->d_lock); + } +- +-/** +- * d_rehash - add an entry back to the hash +- * @entry: dentry to add to the hash +- * +- * Adds a dentry to the hash according to its name. +- */ + ++EXPORT_SYMBOL(__d_rehash); ++ + void d_rehash(struct dentry * entry) + { +- struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); +- +- spin_lock(&dcache_lock); +- spin_lock(&entry->d_lock); +- __d_rehash(entry, list); +- spin_unlock(&entry->d_lock); +- spin_unlock(&dcache_lock); ++ spin_lock(&dcache_lock); ++ __d_rehash(entry); ++ spin_unlock(&dcache_lock); + } + + #define do_switch(x,y) do { \ +@@ -1213,14 +1207,13 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + struct hlist_head *list; + + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? 
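The point of the split above is that __d_rehash() and __d_move() no longer take dcache_lock themselves; the rebuilt d_rehash()/d_move() are now just that lock wrapped around the __-prefixed cores. A minimal sketch of the calling convention this exports (the batching motivation is an assumption about the Lustre caller, not something the patch shows):

	/* Caller is already inside a dcache_lock critical section,
	 * e.g. to combine several dcache updates into one atomic step: */
	spin_lock(&dcache_lock);
	__d_move(dentry, target);	/* takes only the per-dentry locks */
	__d_rehash(other);		/* likewise; no dcache_lock inside */
	spin_unlock(&dcache_lock);
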
+@@ -1241,7 +1234,8 @@ + + already_unhashed: + list = d_hash(target->d_parent, target->d_name.hash); +- __d_rehash(dentry, list); ++ dentry->d_flags &= ~DCACHE_UNHASHED; ++ hlist_add_head_rcu(&dentry->d_hash, list); + + /* Unhash the target: dput() will then get rid of it */ + __d_drop(target); +@@ -1280,6 +1274,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6.10/include/linux/dcache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/dcache.h 2005-03-31 19:15:49.684536944 +0800 +@@ -228,6 +228,7 @@ + * This adds the entry to the hash queues. + */ + extern void d_rehash(struct dentry *); ++extern void __d_rehash(struct dentry *); + + /** + * d_add - add dentry to hash queues +@@ -246,6 +247,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch new file mode 100644 index 0000000..85cb332 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch @@ -0,0 +1,34 @@ +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:03:37.000000000 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800 +@@ -365,7 +365,7 @@ + } + } + +-static int do_umount(struct vfsmount *mnt, int flags) ++int do_umount(struct vfsmount *mnt, int flags) + { + struct super_block * sb = mnt->mnt_sb; + int retval; +@@ -458,6 +458,8 @@ + return retval; + } + ++EXPORT_SYMBOL(do_umount); ++ + /* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. 
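Exporting do_umount() lets module code drop a mount it already holds a pointer to, without going back through sys_umount()'s path lookup. A hypothetical caller on the Lustre GNS side (the name is illustrative and reference counting is elided; the flags argument takes the same MNT_* values sys_umount() accepts, 0 for a plain unmount):

	static int gns_umount_mnt(struct vfsmount *mnt)
	{
		int err = do_umount(mnt, 0);

		if (err)
			printk(KERN_WARNING "GNS: do_umount: %d\n", err);
		return err;
	}
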
+Index: linux-2.6.10/include/linux/mount.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mount.h 2005-03-31 17:15:40.000000000 +0800 ++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:59:41.914943472 +0800 +@@ -70,6 +70,7 @@ + extern struct vfsmount *do_kern_mount(const char *fstype, int flags, + const char *name, void *data); + ++extern int do_umount(struct vfsmount *mnt, int flags); + struct nameidata; + + extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, diff --git a/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..dfcf347 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch @@ -0,0 +1,557 @@ +Index: linux-2.6.10/fs/open.c +=================================================================== +--- linux-2.6.10.orig/fs/open.c 2005-03-31 15:35:27.683586616 +0800 ++++ linux-2.6.10/fs/open.c 2005-03-31 17:13:48.440535208 +0800 +@@ -217,11 +217,12 @@ + struct inode * inode; + int error; + ++ intent_init(&nd.intent.open, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -476,6 +477,7 @@ + kernel_cap_t old_cap; + int res; + ++ intent_init(&nd.intent.open, IT_GETATTR); + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + +@@ -499,7 +501,7 @@ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ +@@ -521,7 +523,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -574,7 +577,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -759,6 +763,7 @@ + { + int namei_flags, error; + struct nameidata nd; ++ intent_init(&nd.intent.open, IT_OPEN); + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -768,14 +773,14 @@ + + error = open_namei(filename, namei_flags, mode, &nd); + if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open); + + return ERR_PTR(error); + } + + EXPORT_SYMBOL(filp_open); + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it) + { + struct file * f; + struct inode *inode; +@@ -787,6 +792,7 @@ + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -805,6 +811,7 @@ + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; 
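/* Sketch, not part of the patch: with f->f_it assigned above, an
 * intent-aware ->open() can see why the file is being opened, e.g.
 *
 *	if (f->f_it && (f->f_it->op & IT_OPEN))
 *		reuse_state_prepared_during_lookup(f->f_it);
 *
 * (the helper is assumed), and the intent_release() added below then
 * runs it->op_release, freeing whatever the filesystem stashed in
 * it->d.fs_data while walking the path. */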
++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -830,11 +837,20 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ struct open_intent it; ++ intent_init(&it, IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ + EXPORT_SYMBOL(dentry_open); + + /* +Index: linux-2.6.10/fs/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/xattr.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/fs/xattr.c 2005-03-31 17:03:37.148465728 +0800 +@@ -164,7 +164,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -179,7 +180,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -245,7 +247,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); +@@ -259,7 +262,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 15:35:26.294797744 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:12:26.403006808 +0800 +@@ -288,8 +288,19 @@ + return 0; + } + ++void intent_release(struct open_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->magic != INTENT_MAGIC) ++ return; ++ if (it->op_release) ++ it->op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent.open); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -448,6 +459,7 @@ + static inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct open_intent it = nd->intent.open; + char *name; + if (IS_ERR(link)) + goto fail; +@@ -458,6 +470,10 @@ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_release(&nd->intent.open); ++ intent_init(&nd->intent.open, it.op); ++ nd->intent.open.flags = it.flags; ++ nd->intent.open.create_mode = it.create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -876,8 +892,14 @@ + return err; + } + ++int fastcall path_walk_it(const char * name, struct nameidata *nd) ++{ ++ current->total_link_count = 0; ++ return link_path_walk(name, nd); ++} + int fastcall path_walk(const char * name, struct nameidata *nd) + { ++ intent_init(&nd->intent.open, IT_LOOKUP); + current->total_link_count = 0; + return link_path_walk(name, nd); + } +@@ -886,7 +908,7 @@ + /* returns 1 if everything is done */ + static int __emul_lookup_dentry(const char *name, struct nameidata *nd) + { +- if (path_walk(name, nd)) ++ if (path_walk_it(name, nd)) + return 0; /* something went wrong... 
*/ + + if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) { +@@ -947,7 +969,18 @@ + } + } + +-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++static inline int it_mode_from_lookup_flags(int flags) ++{ ++ int mode = IT_LOOKUP; ++ ++ if (flags & LOOKUP_OPEN) ++ mode = IT_OPEN; ++ if (flags & LOOKUP_CREATE) ++ mode |= IT_CREAT; ++ return mode; ++} ++ ++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd) + { + int retval; + +@@ -982,6 +1015,12 @@ + return retval; + } + ++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return path_lookup_it(name, flags, nd); ++} ++ + /* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. +@@ -1032,7 +1071,7 @@ + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1052,11 +1091,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1068,18 +1112,24 @@ + * that namei follows links, while lnamei does not. + * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { +- err = path_lookup(tmp, flags, nd); ++ err = path_lookup_it(tmp, flags, nd); + putname(tmp); + } + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1370,7 +1420,7 @@ + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd); + if (error) + return error; + goto ok; +@@ -1379,7 +1429,8 @@ + /* + * Create - we need to know the parent. 
+ */ +- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); ++ nd->intent.open.op |= IT_CREAT; ++ error = path_lookup_it(pathname, LOOKUP_PARENT, nd); + if (error) + return error; + +@@ -2344,6 +2395,7 @@ + } + } + ++ + int page_symlink(struct inode *inode, const char *symname, int len) + { + struct address_space *mapping = inode->i_mapping; +@@ -2405,8 +2457,10 @@ + EXPORT_SYMBOL(page_symlink); + EXPORT_SYMBOL(page_symlink_inode_operations); + EXPORT_SYMBOL(path_lookup); ++EXPORT_SYMBOL(path_lookup_it); + EXPORT_SYMBOL(path_release); + EXPORT_SYMBOL(path_walk); ++EXPORT_SYMBOL(path_walk_it); + EXPORT_SYMBOL(permission); + EXPORT_SYMBOL(unlock_rename); + EXPORT_SYMBOL(vfs_create); +Index: linux-2.6.10/fs/stat.c +=================================================================== +--- linux-2.6.10.orig/fs/stat.c 2004-12-25 05:34:02.000000000 +0800 ++++ linux-2.6.10/fs/stat.c 2005-03-31 17:03:37.144466336 +0800 +@@ -60,15 +60,15 @@ + } + return 0; + } +- + EXPORT_SYMBOL(vfs_getattr); + + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -82,8 +82,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -97,9 +98,12 @@ + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent.open, IT_GETATTR); + + if (f) { + error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ intent_release(&nd.intent.open); + fput(f); + } + return error; +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 15:35:26.295797592 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:03:37.145466184 +0800 +@@ -113,6 +113,7 @@ + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +Index: linux-2.6.10/fs/exec.c +=================================================================== +--- linux-2.6.10.orig/fs/exec.c 2005-03-31 16:20:09.692859232 +0800 ++++ linux-2.6.10/fs/exec.c 2005-03-31 17:03:37.147465880 +0800 +@@ -125,8 +125,9 @@ + struct nameidata nd; + int error; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ error = user_path_walk_it(library, &nd); + if (error) + goto out; + +@@ -138,7 +139,7 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -485,8 +486,9 @@ + int err; + struct file *file; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -499,7 +501,7 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = 
dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.317794248 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:03:37.135467704 +0800 +@@ -600,6 +600,7 @@ + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct open_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -1245,6 +1246,7 @@ + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 15:35:27.689585704 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800 +@@ -2,14 +2,41 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; + ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++#define INTENT_MAGIC 0x19620323 ++ ++ + struct open_intent { ++ int magic; ++ int op; ++ void (*op_release)(struct open_intent *); + int flags; + int create_mode; ++ union { ++ void *fs_data; /* FS-specific intent data */ ++ } d; + }; + ++static inline void intent_init(struct open_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->magic = INTENT_MAGIC; ++ it->op = op; ++} ++ + enum { MAX_NESTED_LINKS = 8 }; + + struct nameidata { +@@ -55,14 +82,22 @@ + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *)); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ + __user_walk(name, 0, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); ++extern int FASTCALL(path_walk_it(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); + extern void path_release(struct nameidata *); ++extern void intent_release(struct open_intent *); + extern void path_release_on_umount(struct nameidata *); + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); +Index: linux-2.6.10/include/linux/mount.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mount.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:15:40.613482328 +0800 +@@ -36,6 +36,8 @@ + struct list_head mnt_list; + 
struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git a/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..1bb1634 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch @@ -0,0 +1,78 @@ +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:12:26.403006808 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:20:37.388365688 +0800 +@@ -783,8 +783,11 @@ + goto out_dput; + + if (inode->i_op->follow_link) { ++ int saved_flags = nd->flags; + mntget(next.mnt); ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(next.dentry, nd); ++ nd->flags = saved_flags; + dput(next.dentry); + mntput(next.mnt); + if (err) +@@ -830,7 +833,9 @@ + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next, atomic); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + follow_mount(&next.mnt, &next.dentry); +@@ -876,10 +881,14 @@ + */ + if (nd->dentry && nd->dentry->d_sb && + (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { +- err = -ESTALE; ++ nd->flags |= LOOKUP_LAST; ++ err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + /* Note: we do not d_invalidate() */ +- if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) ++ if (err) { ++ err = -ESTALE; + break; ++ } + } + return_base: + return 0; +@@ -1446,7 +1455,9 @@ + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + + do_last: + error = PTR_ERR(dentry); +@@ -1559,7 +1570,9 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + putname(nd->last.name); + goto do_last; + } +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:21:41.178668088 +0800 +@@ -73,7 +73,9 @@ + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_ATOMIC 64 +- ++#define LOOKUP_LAST 128 ++#define LOOKUP_LINK_NOTLAST 256 +++ + /* + * Intent data + */ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch new file mode 100644 index 0000000..57098d2 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch @@ -0,0 +1,274 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +Index: linux-2.6.10/fs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/inode.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/inode.c 2005-03-31 18:03:53.551688872 +0800 +@@ -166,6 +166,7 @@ + } + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; ++ 
dynlock_init(&inode->i_dcache_lock); + } + return inode; + } +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:57:10.767921312 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 18:05:52.839554360 +0800 +@@ -104,6 +104,38 @@ + * any extra contention... + */ + ++void *lock_dir(struct inode *dir, struct qstr *name) ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 0; ++ } ++ ++ /* OK. fs understands parallel directory operations. ++ * so, we try to acquire lock for hash of requested ++ * filename in order to prevent any operations with ++ * same name in same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(lock_dir); ++ ++void unlock_dir(struct inode *dir, void *lock) ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++EXPORT_SYMBOL(unlock_dir); ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -390,8 +422,9 @@ + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ void *lock; + +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -417,7 +450,7 @@ + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -425,7 +458,7 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +@@ -1461,7 +1494,7 @@ + + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); + nd->flags &= ~LOOKUP_LAST; +@@ -1469,7 +1502,7 @@ + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + goto exit; + } + +@@ -1478,7 +1511,7 @@ + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current->fs->umask; + error = vfs_create(dir->d_inode, dentry, mode, nd); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + dput(nd->dentry); + nd->dentry = dentry; + if (error) +@@ -1492,7 +1525,7 @@ + /* + * It already exists. 
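/* Sketch, not from the patch: the discipline this function follows
 * under pdirops. lock_dir() hashes the last name component and takes
 * a dynlock on that hash, so operations on different names in one
 * directory run in parallel while two operations on the same name
 * still serialize:
 *
 *	nd->lock = lock_dir(dir->d_inode, &nd->last);
 *	... lookup/create under the per-name lock ...
 *	unlock_dir(dir->d_inode, nd->lock);
 *
 * Filesystems without S_PDIROPS fall back to i_sem, as before. */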
+ */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1576,7 +1609,7 @@ + goto exit; + } + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); + nd->flags &= ~LOOKUP_LAST; +@@ -1596,7 +1629,7 @@ + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1688,7 +1721,7 @@ + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1747,7 +1780,7 @@ + error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1852,14 +1885,14 @@ + error = -EBUSY; + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1925,7 +1958,7 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1939,7 +1972,7 @@ + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + if (inode) + iput(inode); /* truncate the inode here */ + exit1: +@@ -2005,7 +2038,7 @@ + error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -2094,7 +2127,7 @@ + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:57:13.330531736 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 18:08:59.645155592 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct iovec; + struct nameidata; +@@ -151,7 +152,7 @@ + #define S_DIRSYNC 64 /* Directory modifications are synchronous */ + #define S_NOCMTIME 128 /* Do not update file c/mtime */ + #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ +- ++#define S_PDIROPS 512 /* Parallel directory operations */ + /* + * Note that nosuid etc flags are inode-specific: setting some file-system + * flags just means all the inodes inherit those flags by default. 
It might be +@@ -181,6 +182,7 @@ + #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) + #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) + #define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) +@@ -482,6 +484,7 @@ + + atomic_t i_writecount; + void *i_security; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + void *generic_ip; + } u; +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 18:10:30.237383480 +0800 +@@ -63,7 +63,8 @@ + int last_type; + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; +- ++ ++ void *lock; + /* Intent data */ + union { + struct open_intent open; +@@ -91,7 +92,7 @@ + #define LOOKUP_ATOMIC 64 + #define LOOKUP_LAST 128 + #define LOOKUP_LINK_NOTLAST 256 +-+ ++ + /* + * Intent data + */ diff --git a/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..ad2d3ab --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch @@ -0,0 +1,235 @@ +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:43:42.417809208 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:47:14.292599344 +0800 +@@ -474,6 +474,7 @@ + intent_init(&nd->intent.open, it.op); + nd->intent.open.flags = it.flags; + nd->intent.open.create_mode = it.create_mode; ++ nd->intent.open.create = it.create; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -866,14 +867,20 @@ + lookup_parent: + nd->last = this; + nd->last_type = LAST_NORM; +- if (this.name[0] != '.') +- goto return_base; +- if (this.len == 1) +- nd->last_type = LAST_DOT; +- else if (this.len == 2 && this.name[1] == '.') +- nd->last_type = LAST_DOTDOT; +- else +- goto return_base; ++ if (this.name[0] == '.') { ++ if (this.len == 1) ++ nd->last_type = LAST_DOT; ++ else if (this.len == 2 && this.name[1] == '.') ++ nd->last_type = LAST_DOTDOT; ++ } ++ ++ if ((nd->last_type == LAST_NORM) && inode->i_op && ++ inode->i_op->endparentlookup) { ++ err = inode->i_op->endparentlookup(nd); ++ if (err) ++ break; ++ } ++ goto return_base; + return_reval: + /* + * We bypassed the ordinary revalidation routines. 
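The hunk above is the heart of the raw-ops scheme: after resolving the parent, link_path_walk() gives the filesystem a chance to execute the whole operation itself through ->endparentlookup(), and IT_STATUS_RAW (defined in the namei.h hunk below) tells callers such as sys_mknod() that lookup's return value is already the status of the completed operation. A hypothetical filesystem-side hook, assuming a helper that ships the intent to a server (both names are illustrative, not from these patches):

	static int examplefs_endparentlookup(struct nameidata *nd)
	{
		int rc;

		/* only intercept what we can execute remotely */
		if (!(nd->intent.open.op & (IT_MKDIR | IT_UNLINK)))
			return 0;	/* fall back to the generic path */

		rc = examplefs_execute_intent(nd);	/* assumed helper */
		nd->intent.open.flags |= IT_STATUS_RAW;	/* work is done */
		return rc;
	}
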
+@@ -1646,9 +1653,16 @@ + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKNOD); ++ nd.intent.open.create_mode = mode; ++ nd.intent.open.create.dev = dev; ++ ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1675,6 +1689,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1717,9 +1732,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKDIR); ++ nd.intent.open.create_mode = mode; ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1729,6 +1748,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1814,9 +1834,12 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_RMDIR); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; + + switch(nd.last_type) { + case LAST_DOTDOT: +@@ -1892,9 +1915,13 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_UNLINK); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; ++ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; +@@ -1965,9 +1992,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_SYMLINK); ++ nd.intent.open.create.link = from; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1975,6 +2006,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(to); +@@ -2046,9 +2078,13 @@ + error = __user_walk(oldname, 0, &old_nd); + if (error) + goto exit; +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_LINK); ++ nd.intent.open.create.source_nd = &old_nd; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out_release; + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +@@ -2229,9 +2265,18 @@ + if (error) + goto exit; + +- error = path_lookup(newname, LOOKUP_PARENT, &newnd); ++ error = -EBUSY; ++ if (oldnd.last_type != LAST_NORM) ++ goto exit1; ++ ++ intent_init(&newnd.intent.open, IT_RENAME); ++ newnd.intent.open.create.source_nd = &oldnd; ++ error = path_lookup_it(newname, LOOKUP_PARENT, &newnd); + if (error) + goto exit1; ++ if (newnd.intent.open.flags & IT_STATUS_RAW) { ++ goto exit2; ++ } + + error = -EXDEV; + if (oldnd.mnt != newnd.mnt) +@@ -2239,8 +2284,6 @@ + + old_dir = oldnd.dentry; + error = -EBUSY; +- if (oldnd.last_type != LAST_NORM) +- goto exit2; + + new_dir = newnd.dentry; + if (newnd.last_type != LAST_NORM) +Index: linux-2.6.10/include/linux/fs.h 
+=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:03:37.000000000 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:46:35.715463960 +0800 +@@ -956,6 +956,7 @@ + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); ++ int (*endparentlookup) (struct nameidata *); + }; + + struct seq_file; +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:43:42.472800848 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800 +@@ -15,8 +15,19 @@ + #define IT_UNLINK (1<<5) + #define IT_TRUNC (1<<6) + #define IT_GETXATTR (1<<7) +- ++#define IT_RMDIR (1<<8) ++#define IT_LINK (1<<9) ++#define IT_RENAME (1<<10) ++#define IT_MKDIR (1<<11) ++#define IT_MKNOD (1<<12) ++#define IT_SYMLINK (1<<13) ++#define IT_CHDIR (1<<14) ++ + #define INTENT_MAGIC 0x19620323 ++#define IT_STATUS_RAW (1<<10) /* Setting this in it_flags on exit from lookup ++ means everything was done already and return ++ value from lookup is in fact status of ++ already performed operation */ + + + struct open_intent { +@@ -26,6 +37,11 @@ + int flags; + int create_mode; + union { ++ unsigned dev; /* For mknod */ ++ char *link; /* For symlink */ ++ struct nameidata *source_nd; /* For link/rename */ ++ } create; ++ union { + void *fs_data; /* FS-specific intent data */ + } d; + }; diff --git a/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch new file mode 100644 index 0000000..07d1008 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch @@ -0,0 +1,22 @@ +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 18:19:21.976546840 +0800 +@@ -62,6 +62,7 @@ + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); ++ INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -177,6 +178,9 @@ + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } -- 1.8.3.1
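
Taken together, the mount.h fields from the intent_api patch (mnt_lustre_list, mnt_last_used), the do_umount() export, and the vfs_gns bookkeeping above give a module all the pieces for GNS auto-umount. A hypothetical expiry pass, assuming a Lustre-side list threaded through mnt_lustre_list and an illustrative GNS_TIMEOUT; locking and mount reference counting are deliberately elided (do_umount() sleeps, so the real caller cannot hold dcache_lock across it):

	static void gns_expire_mounts(struct list_head *gns_list)
	{
		struct vfsmount *mnt, *next;

		list_for_each_entry_safe(mnt, next, gns_list,
					 mnt_lustre_list) {
			if (time_after(jiffies,
				       mnt->mnt_last_used + GNS_TIMEOUT))
				do_umount(mnt, 0);	/* exported above */
		}
	}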