--- /dev/null
+ drivers/block/ll_rw_blk.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/blkdev.h | 1
+ 2 files changed, 50 insertions(+)
+
+Index: linux-2.6.10/drivers/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c 2004-12-25 05:33:59.000000000 +0800
++++ linux-2.6.10/drivers/block/ll_rw_blk.c 2005-04-05 15:42:58.075467024 +0800
+@@ -2679,6 +2679,13 @@
+ if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
+ goto end_io;
+
++ /* this is cfs's dev_rdonly check */
++ if (bio->bi_rw == WRITE &&
++ dev_check_rdonly(bio->bi_bdev->bd_dev)) {
++ bio_endio(bio, bio->bi_size, 0);
++ break;
++ }
++
+ block_wait_queue_running(q);
+
+ /*
+@@ -3287,6 +3294,58 @@
+ return queue_var_show(max_hw_sectors_kb, (page));
+ }
+
++#define MAX_RDONLY_DEVS 16
++
++static dev_t rdonly_devs[MAX_RDONLY_DEVS] = {0, };
++
++/*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently). This is for filesystem crash/recovery testing.
++ */
++void dev_set_rdonly(struct block_device *bdev, int no_write)
++{
++ if (no_write >= MAX_RDONLY_DEVS) {
++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n",
++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS);
++ return;
++ }
++
++ if (bdev) {
++ printk(KERN_WARNING "Turning device %s read-only at %d\n",
++ bdev->bd_disk ? bdev->bd_disk->disk_name : "?",
++ no_write);
++ rdonly_devs[no_write] = bdev->bd_dev;
++ }
++}
++
++void dev_clear_rdonly(int no_write)
++{
++ if (no_write >= MAX_RDONLY_DEVS) {
++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n",
++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS);
++ return;
++ }
++
++ if (rdonly_devs[no_write] == 0)
++ return;
++
++ printk(KERN_WARNING "Clearing read-only at %d\n", no_write);
++ rdonly_devs[no_write] = 0;
++}
++
++int dev_check_rdonly(dev_t dev)
++{
++ int i;
++
++ for (i = 0; i < MAX_RDONLY_DEVS; i++)
++ if (rdonly_devs[i] == dev)
++ return 1;
++ return 0;
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
+
+ static struct queue_sysfs_entry queue_requests_entry = {
+ .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
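
For illustration, a minimal kernel-side sketch of how a crash/recovery
test harness might drive the hooks above. This is a sketch only, not
part of the patch; the slot index and the caller context are assumed.

    /* hypothetical test helper; assumes slot 0 of rdonly_devs[] is free */
    static void crash_test_freeze(struct block_device *bdev)
    {
            dev_set_rdonly(bdev, 0);        /* writes are silently dropped now */
            BUG_ON(!dev_check_rdonly(bdev->bd_dev));
            /* ... run filesystem load, simulate the crash, recover ... */
            dev_clear_rdonly(0);            /* restore normal write behaviour */
    }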
--- /dev/null
+ include/linux/dynlocks.h | 33 ++++++++++
+ lib/Makefile | 4 -
+ lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 187 insertions(+), 2 deletions(-)
+
+Index: linux-2.6.10/lib/dynlocks.c
+===================================================================
+--- linux-2.6.10.orig/lib/dynlocks.c 2005-03-31 16:59:29.399768040 +0800
++++ linux-2.6.10/lib/dynlocks.c 2005-03-31 18:02:41.470646856 +0800
+@@ -0,0 +1,187 @@
++/*
++ * Dynamic Locks
++ *
++ * struct dynlock is a lockspace:
++ * one may request a lock (exclusive or shared) for some value
++ * in that lockspace
++ *
++ */
++
++#include <linux/dynlocks.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++
++static kmem_cache_t * dynlock_cachep = NULL;
++
++void __init dynlock_cache_init(void)
++{
++ printk(KERN_INFO "init dynlocks cache\n");
++ dynlock_cachep = kmem_cache_create("dynlock_cache",
++ sizeof(struct dynlock_member),
++ 0,
++ SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++ if (dynlock_cachep == NULL)
++ panic("Can't create dynlock cache");
++}
++
++/*
++ * dynlock_init
++ *
++ * initialize lockspace
++ *
++ */
++void dynlock_init(struct dynlock *dl)
++{
++ spin_lock_init(&dl->dl_list_lock);
++ INIT_LIST_HEAD(&dl->dl_list);
++ dl->dl_magic = DYNLOCK_LIST_MAGIC;
++}
++
++/*
++ * dynlock_lock
++ *
++ * acquires a lock (exclusive or shared) in the specified lockspace.
++ * each lock in a lockspace is allocated separately, so the user has
++ * to specify GFP flags.
++ * the routine returns a pointer to the lock. this pointer is intended
++ * to be passed to dynlock_unlock
++ *
++ */
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp)
++{
++ struct dynlock_member *nhl = NULL;
++ struct dynlock_member *hl;
++ struct list_head *cur;
++ int num = 0;
++
++ BUG_ON(dl == NULL);
++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
++repeat:
++ /* find requested lock in lockspace */
++ spin_lock(&dl->dl_list_lock);
++ BUG_ON(dl->dl_list.next == NULL);
++ BUG_ON(dl->dl_list.prev == NULL);
++ list_for_each(cur, &dl->dl_list) {
++ BUG_ON(cur->next == NULL);
++ BUG_ON(cur->prev == NULL);
++ hl = list_entry(cur, struct dynlock_member, dl_list);
++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC);
++ if (hl->dl_value == value) {
++ /* lock is found */
++ if (nhl) {
++ /* someone else allocated the lock we
++ * didn't find and had just created,
++ * so drop our copy
++ */
++ kmem_cache_free(dynlock_cachep, nhl);
++ nhl = NULL;
++ }
++ hl->dl_refcount++;
++ goto found;
++ }
++ num++;
++ }
++ /* lock not found */
++ if (nhl) {
++ /* we already have allocated lock. use it */
++ hl = nhl;
++ nhl = NULL;
++ list_add(&hl->dl_list, &dl->dl_list);
++ goto found;
++ }
++ spin_unlock(&dl->dl_list_lock);
++
++ /* lock not found and we haven't allocated lock yet. allocate it */
++ nhl = kmem_cache_alloc(dynlock_cachep, gfp);
++ if (nhl == NULL)
++ return NULL;
++ nhl->dl_refcount = 1;
++ nhl->dl_value = value;
++ nhl->dl_readers = 0;
++ nhl->dl_writers = 0;
++ nhl->dl_magic = DYNLOCK_MAGIC;
++ init_waitqueue_head(&nhl->dl_wait);
++
++ /* while the lock was being allocated, someone else may have
++ * allocated it and put it onto the list. check for this situation
++ */
++ goto repeat;
++
++found:
++ if (rw) {
++ /* exclusive lock: the user doesn't want to share the lock at all
++ * NOTE: one process may take the same lock several times;
++ * this functionality is useful for rename operations */
++ while ((hl->dl_writers && hl->dl_pid != current->pid) ||
++ hl->dl_readers) {
++ spin_unlock(&dl->dl_list_lock);
++ wait_event(hl->dl_wait,
++ hl->dl_writers == 0 && hl->dl_readers == 0);
++ spin_lock(&dl->dl_list_lock);
++ }
++ hl->dl_writers++;
++ } else {
++ /* shared lock: the user does not want to share the lock with a writer */
++ while (hl->dl_writers) {
++ spin_unlock(&dl->dl_list_lock);
++ wait_event(hl->dl_wait, hl->dl_writers == 0);
++ spin_lock(&dl->dl_list_lock);
++ }
++ hl->dl_readers++;
++ }
++ hl->dl_pid = current->pid;
++ spin_unlock(&dl->dl_list_lock);
++
++ return hl;
++}
++
++
++/*
++ * dynlock_unlock
++ *
++ * the user has to specify the lockspace (dl) and the pointer to the
++ * lock structure returned by dynlock_lock()
++ *
++ */
++void dynlock_unlock(struct dynlock *dl, void *lock)
++{
++ struct dynlock_member *hl = lock;
++ int wakeup = 0;
++
++ BUG_ON(dl == NULL);
++ BUG_ON(hl == NULL);
++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC);
++ BUG_ON(current->pid != hl->dl_pid);
++
++ spin_lock(&dl->dl_list_lock);
++ if (hl->dl_writers) {
++ BUG_ON(hl->dl_readers != 0);
++ hl->dl_writers--;
++ if (hl->dl_writers == 0)
++ wakeup = 1;
++ } else if (hl->dl_readers) {
++ hl->dl_readers--;
++ if (hl->dl_readers == 0)
++ wakeup = 1;
++ } else {
++ BUG_ON(1);
++ }
++ if (wakeup) {
++ hl->dl_pid = 0;
++ wake_up(&hl->dl_wait);
++ }
++ if (--(hl->dl_refcount) == 0) {
++ hl->dl_magic = DYNLOCK_MAGIC2;
++ list_del(&hl->dl_list);
++ kmem_cache_free(dynlock_cachep, hl);
++ }
++ spin_unlock(&dl->dl_list_lock);
++}
++
++EXPORT_SYMBOL(dynlock_init);
++EXPORT_SYMBOL(dynlock_lock);
++EXPORT_SYMBOL(dynlock_unlock);
++
+Index: linux-2.6.10/lib/Makefile
+===================================================================
+--- linux-2.6.10.orig/lib/Makefile 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/lib/Makefile 2005-03-31 18:03:16.727287032 +0800
+@@ -5,7 +5,7 @@
+ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
+ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+ kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \
+- bitmap.o extable.o kobject_uevent.o
++ bitmap.o extable.o kobject_uevent.o dynlocks.o
+
+ ifeq ($(CONFIG_DEBUG_KOBJECT),y)
+ CFLAGS_kobject.o += -DDEBUG
+Index: linux-2.6.10/fs/dcache.c
+===================================================================
+--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 17:02:41.000000000 +0800
++++ linux-2.6.10/fs/dcache.c 2005-03-31 18:02:41.474646248 +0800
+@@ -1655,6 +1655,7 @@
+
+ extern void bdev_cache_init(void);
+ extern void chrdev_init(void);
++extern void dynlock_cache_init(void);
+
+ void __init vfs_caches_init_early(void)
+ {
+@@ -1684,6 +1685,7 @@
+ mnt_init(mempages);
+ bdev_cache_init();
+ chrdev_init();
++ dynlock_cache_init();
+ }
+
+ EXPORT_SYMBOL(d_alloc);
+Index: linux-2.6.10/include/linux/dynlocks.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dynlocks.h 2005-03-31 16:59:29.399768040 +0800
++++ linux-2.6.10/include/linux/dynlocks.h 2005-03-31 18:02:41.469647008 +0800
+@@ -0,0 +1,43 @@
++#ifndef _LINUX_DYNLOCKS_H
++#define _LINUX_DYNLOCKS_H
++
++#include <linux/list.h>
++#include <linux/wait.h>
++
++#define DYNLOCK_MAGIC 0xd19a10c
++#define DYNLOCK_MAGIC2 0xd1956ee
++
++struct dynlock;
++
++struct dynlock_member {
++ unsigned dl_magic;
++ struct list_head dl_list;
++ unsigned long dl_value; /* lock value */
++ int dl_refcount; /* number of users */
++ int dl_readers;
++ int dl_writers;
++ int dl_pid; /* holder of the lock */
++ wait_queue_head_t dl_wait;
++};
++
++/*
++ * lock's namespace:
++ * - list of locks
++ * - lock to protect this list
++ */
++
++#define DYNLOCK_LIST_MAGIC 0x11ee91e6
++
++struct dynlock {
++ unsigned dl_magic;
++ struct list_head dl_list;
++ spinlock_t dl_list_lock;
++};
++
++void dynlock_init(struct dynlock *dl);
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp);
++void dynlock_unlock(struct dynlock *dl, void *lock);
++
++
++#endif
++
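
For illustration, a minimal sketch of the intended calling pattern for
the dynlock API above. The embedding of the lockspace and the hash
value are assumptions of this example, not taken from the patch.

    #include <linux/dynlocks.h>
    #include <linux/slab.h>

    static struct dynlock my_lockspace;     /* e.g. embedded in an inode */

    static void example(unsigned long hash_value)
    {
            void *lock;

            dynlock_init(&my_lockspace);    /* once, before first use */

            /* rw=1 requests an exclusive lock; GFP_NOFS because we may
             * be called from filesystem context */
            lock = dynlock_lock(&my_lockspace, hash_value, 1, GFP_NOFS);
            if (lock == NULL)
                    return;                 /* allocation failed */
            /* ... operate on whatever 'hash_value' identifies ... */
            dynlock_unlock(&my_lockspace, lock);
    }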
--- /dev/null
+Index: linux-2.6.10/fs/ext3/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 18:44:38.935933960 +0800
++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 18:46:03.008153040 +0800
+@@ -123,6 +123,8 @@
+ journal_abort_handle(handle);
+ }
+
++EXPORT_SYMBOL(ext3_journal_abort_handle);
++
+ /* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+@@ -2016,6 +2018,8 @@
+ return ret;
+ }
+
++EXPORT_SYMBOL(ext3_force_commit);
++
+ /*
+ * Ext3 always journals updates to the superblock itself, so we don't
+ * have to propagate any other updates to the superblock on disk at this
+@@ -2447,6 +2451,10 @@
+ unsigned long *blocks, int *created, int create);
+ EXPORT_SYMBOL(ext3_map_inode_page);
+
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_set_handle);
++EXPORT_SYMBOL(ext3_bread);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
--- /dev/null
+Index: linux-2.6.10/net/core/sock.c
+===================================================================
+--- linux-2.6.10.orig/net/core/sock.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/net/core/sock.c 2005-03-31 20:42:01.084364672 +0800
+@@ -1359,6 +1359,7 @@
+ EXPORT_SYMBOL(sk_alloc);
+ EXPORT_SYMBOL(sk_free);
+ EXPORT_SYMBOL(sk_send_sigurg);
++EXPORT_SYMBOL(sock_getsockopt);
+ EXPORT_SYMBOL(sock_alloc_send_pskb);
+ EXPORT_SYMBOL(sock_alloc_send_skb);
+ EXPORT_SYMBOL(sock_init_data);
+Index: linux-2.6.10/fs/dcache.c
+===================================================================
+--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 19:44:53.000000000 +0800
++++ linux-2.6.10/fs/dcache.c 2005-03-31 22:02:08.130582568 +0800
+@@ -1691,6 +1691,7 @@
+
+ EXPORT_SYMBOL(d_alloc);
+ EXPORT_SYMBOL(d_alloc_anon);
++EXPORT_SYMBOL(is_subdir);
+ EXPORT_SYMBOL(d_alloc_root);
+ EXPORT_SYMBOL(d_delete);
+ EXPORT_SYMBOL(d_find_alias);
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 19:44:54.000000000 +0800
++++ linux-2.6.10/fs/namespace.c 2005-03-31 22:03:44.906870336 +0800
+@@ -1239,6 +1239,7 @@
+ mntput(old_pwdmnt);
+ }
+ }
++EXPORT_SYMBOL(set_fs_pwd);
+
+ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
+ {
+Index: linux-2.6.10/fs/file_table.c
+===================================================================
+--- linux-2.6.10.orig/fs/file_table.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/fs/file_table.c 2005-03-31 20:44:40.924065344 +0800
+@@ -196,6 +196,7 @@
+ file_free(file);
+ }
+ }
++EXPORT_SYMBOL(put_filp);
+
+ void file_move(struct file *file, struct list_head *list)
+ {
+Index: linux-2.6.10/kernel/sched.c
+===================================================================
+--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800
++++ linux-2.6.10/kernel/sched.c 2005-03-31 22:00:30.616406976 +0800
+@@ -2942,6 +2942,19 @@
+
+ EXPORT_SYMBOL(sleep_on_timeout);
+
++void fastcall __sched sleep_on(wait_queue_head_t *q)
++{
++ SLEEP_ON_VAR
++
++ current->state = TASK_UNINTERRUPTIBLE;
++
++ SLEEP_ON_HEAD
++ schedule();
++ SLEEP_ON_TAIL
++}
++
++EXPORT_SYMBOL(sleep_on);
++
+ void set_user_nice(task_t *p, long nice)
+ {
+ unsigned long flags;
+Index: linux-2.6.10/kernel/exit.c
+===================================================================
+--- linux-2.6.10.orig/kernel/exit.c 2005-03-31 19:44:52.509587264 +0800
++++ linux-2.6.10/kernel/exit.c 2005-03-31 20:47:18.034180976 +0800
+@@ -515,6 +515,7 @@
+ {
+ __exit_mm(tsk);
+ }
++EXPORT_SYMBOL(exit_mm);
+
+ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+ {
--- /dev/null
+Index: linux-2.6.10/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800
+@@ -19,9 +19,12 @@
+ #ifdef __KERNEL__
+ #include <linux/timer.h>
+ #include <linux/wait.h>
++#ifndef EXT_INCLUDE
++#define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
+ #endif
++#endif
+ #include <linux/rbtree.h>
+
+ /*
--- /dev/null
+%patch
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:26:19.494124024 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:26:25.474214912 +0800
+@@ -186,6 +186,7 @@
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+
+ #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+@@ -238,7 +239,9 @@
+ #endif
+ #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+ #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+-
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long)
+ /*
+ * Structure of an inode on the disk
+ */
+@@ -361,6 +364,8 @@
+ #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
+ #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x100000 /* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x200000 /* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -549,11 +554,13 @@
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+
+ #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+- EXT3_FEATURE_INCOMPAT_META_BG)
++ EXT3_FEATURE_INCOMPAT_META_BG| \
++ EXT3_FEATURE_INCOMPAT_EXTENTS)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+@@ -759,6 +766,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -839,6 +847,14 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.10/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2005-04-05 12:26:19.377141808 +0800
++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-04-05 12:26:25.473215064 +0800
+@@ -134,6 +134,8 @@
+ struct dynlock i_htree_lock;
+ struct semaphore i_append_sem;
+ struct semaphore i_rename_sem;
++
++ __u32 i_cached_extent[3];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
+Index: linux-2.6.10/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_extents.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/ext3_extents.h 2005-04-05 12:26:25.476214608 +0800
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growing and
++ * other hard changes happen much more often.
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, then the results of the binary search
++ * will be checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use the 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined then stats numbers are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has an i_block array (60 bytes total)
++ * the first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - number of alive extents in the inode
++ */
++
++/*
++ * this is the extent on-disk structure;
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
++
++/*
++ * this is the index on-disk structure;
++ * it's used at all levels but the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level. a leaf or the next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * each block (leaf or index), even the inode-stored one, has a header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of store in entries */
++ __u16 eh_depth; /* has tree real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC 0xf30a
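++
++/*
++ * geometry note (illustrative, assuming a 4096-byte block):
++ * extents, index entries and the header are 12 bytes each, so one
++ * block holds (4096 - 12) / 12 = 340 entries, while the 60-byte
++ * i_block area in the inode holds (60 - 12) / 12 = 4 root entries
++ */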
++
++/*
++ * an array of ext3_ext_path contains the path to some extent;
++ * creation/lookup routines use it for traversal/splitting/etc.
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* inode which tree belongs to */
++ void *root; /* ptr to the data the top of the tree resides at */
++ void *buffer; /* will be passed as arg to ^^ routines */
++ int buffer_len;
++ void *private;
++ struct ext3_extent *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_extent *, int);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++#define EXT_CACHE_MARK 0xffff
++
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
++
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ee_len = 0;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
++
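
For illustration, a sketch of the callback contract declared above:
ext3_ext_walk_space() invokes an ext_prepare_callback for each region
and the callback steers the walk by returning EXT_CONTINUE, EXT_BREAK
or EXT_REPEAT. The meaning of the final int argument (taken here to say
whether the extent is actually allocated) and the use of tree->private
are assumptions of this example.

    static int count_extents_cb(struct ext3_extents_tree *tree,
                                struct ext3_ext_path *path,
                                struct ext3_extent *ex, int exist)
    {
            int *count = tree->private;     /* caller points private at an int */

            if (exist)                      /* count only allocated extents */
                    (*count)++;
            return EXT_CONTINUE;            /* keep walking the range */
    }

    /* walk a whole file, roughly:
     * err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, count_extents_cb);
     */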
+Index: linux-2.6.10/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:26:19.367143328 +0800
++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 12:26:25.462216736 +0800
+@@ -796,6 +796,17 @@
+ goto reread;
+ }
+
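++/* dispatch to the extents code or the classic indirect-block code,
++ * depending on whether the inode uses extents */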
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -806,8 +817,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -851,8 +862,8 @@
+
+ get_block:
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 0);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 0);
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+ }
+@@ -871,7 +882,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1591,7 +1602,7 @@
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2089,6 +2100,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2817,6 +2831,9 @@
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+Index: linux-2.6.10/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-04-05 12:25:13.631136720 +0800
++++ linux-2.6.10/fs/ext3/ioctl.c 2005-04-05 12:26:25.471215368 +0800
+@@ -245,6 +245,10 @@
+ return err;
+ }
+
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+
+ default:
+ return -ENOTTY;
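
For illustration, a hypothetical userspace caller of the new tree-stats
ioctl. It assumes the kernel side copies a struct ext3_extent_tree_stats
back through the argument pointer; the struct layout is repeated from
ext3_extents.h and error handling is minimal.

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    struct ext3_extent_tree_stats {
            int depth;
            int extents_num;
            int leaf_num;
    };
    #define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long)

    int main(int argc, char **argv)
    {
            struct ext3_extent_tree_stats st;
            int fd = open(argv[1], O_RDONLY);

            if (fd < 0 || ioctl(fd, EXT3_IOC_GET_TREE_STATS, &st) < 0) {
                    perror("EXT3_IOC_GET_TREE_STATS");
                    return 1;
            }
            printf("depth %d, extents %d, leaves %d\n",
                   st.depth, st.extents_num, st.leaf_num);
            close(fd);
            return 0;
    }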
+Index: linux-2.6.10/fs/ext3/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:26:19.438132536 +0800
++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:26:25.471215368 +0800
+@@ -394,6 +394,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -463,6 +464,9 @@
+ dynlock_init(&ei->i_htree_lock);
+ sema_init(&ei->i_rename_sem, 1);
+ sema_init(&ei->i_append_sem, 1);
++ ei->i_cached_extent[0] = 0;
++ ei->i_cached_extent[1] = 0;
++ ei->i_cached_extent[2] = 0;
+ return &ei->vfs_inode;
+ }
+
+@@ -595,6 +599,7 @@
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
++ Opt_extents, Opt_extdebug,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+ };
+
+@@ -647,6 +652,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+ };
+@@ -950,6 +957,12 @@
+ match_int(&args[0], &option);
+ *n_blocks_count = option;
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1635,6 +1648,8 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
++
+ return 0;
+
+ cantfind_ext3:
+Index: linux-2.6.10/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/extents.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/ext3/extents.c 2005-04-05 12:26:25.468215824 +0800
+@@ -0,0 +1,2306 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
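++/*
++ * make sure the handle has at least 'needed' credits: first try to
++ * extend the running handle, and restart the transaction if the
++ * extend fails
++ */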
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++
++ return handle;
++}
++
++static inline int
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h,tree->buffer);
++ else
++ return 0;
++}
++
++static inline int
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h,tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
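++/*
++ * allocate a new block for the tree: use the caller's new_block hook
++ * if provided, otherwise pick a goal near the parent index block (or
++ * near the inode's block group) and call ext3_new_block()
++ */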
++static inline int
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
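++/* release the buffer heads referenced by a path array */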
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++)
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block,ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is still empty:
++ * we get such a leaf in the split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
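++/*
++ * find the path from the root down to the extent covering 'block'
++ * (or the nearest one); an existing path array may be passed in for
++ * reuse, otherwise one is allocated
++ */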
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++ }
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extents and index entries (right of the split point)
++ * into the newly allocated blocks
++ * - initializes subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: for now the decision is simplest: at the current extent */
++
++ /* if the current leaf will be split, then we should use the
++ * border from the split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be splitted."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if an error occurs, we stop processing
++ * and turn the filesystem read-only, so the index won't
++ * be inserted and the tree will stay in a consistent
++ * state. the next mount will repair the buffers too
++ */
++
++ /*
++ * get an array to track all allocated blocks;
++ * we need this to handle errors and to free
++ * the allocated blocks on failure
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move the remainder of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it by single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++,
++ sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initialize new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* the old root could have indexes or leaves,
++ * so calculate eh_max the right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
++
++/*
++ * routine finds an empty index and adds a new leaf. if no free index is
++ * found, then it requests in-depth growing of the tree
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up to the tree and look for free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use an already allocated block for the index block,
++ * so subsequent data blocks should be contiguous */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only the first grow (depth 0 -> 1) produces free space;
++ * in all other cases we have to split the grown tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need split */
++ goto repeat;
++ }
++ }
++
++ if (err)
++ return err;
++
++ return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from an index entry as an
++ * allocated block. thus, index entries have to be consistent
++ * with the leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* zero-tree has no leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if the leaf gets modified and the modified extent is first in the leaf,
++ * then we have to correct all the indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct tree if first leaf got modified only */
++ return 0;
++ }
++
++ /*
++ * TODO: we need correction if the border is smaller than the current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
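++/*
++ * check whether ex2 logically follows ex1 and whether the tree owner
++ * (via the mergable hook) allows merging the two
++ */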
++static inline int
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
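++
++/*
++ * Worked example (illustrative): ex1 = {ee_block 100, ee_len 8} and
++ * ex2 = {ee_block 108, ...} pass the logical-adjacency test above
++ * (100 + 8 == 108); whether they really merge is then up to the
++ * tree's ->mergable() callback, which for the block map also
++ * requires the physical blocks to be contiguous.
++ */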
++
++/*
++ * this routine tries to merge the requested extent into an existing
++ * extent, or inserts the requested extent as a new one into the tree,
++ * creating a new leaf if there is no space
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header * eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* perhaps the next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isnt full(%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in the found leaf;
++ * we're going to add a new leaf to the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create the first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1)
++ * sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left (not implemented yet) */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
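++
++/*
++ * Usage sketch (hypothetical caller, assuming the usual setup):
++ *
++ *	struct ext3_extent nx = { .ee_block = blk, .ee_start = pblk,
++ *				  .ee_len = 1 };
++ *	path = ext3_ext_find_extent(tree, blk, NULL);
++ *	if (!IS_ERR(path))
++ *		err = ext3_ext_insert_extent(handle, tree, path, &nx);
++ *
++ * The caller must serialize against truncate (truncate_sem) and must
++ * drop and free the path afterwards; on success the extent may have
++ * been merged into a neighbour rather than stored verbatim.
++ */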
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent *ex, cbex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ee_block = start;
++ cbex.ee_len = end - start;
++ cbex.ee_start = 0;
++ } else
++ cbex = *ex;
++
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex, exists);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* the depth was changed; we have to reallocate the path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ee_block + cbex.ee_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
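++
++/*
++ * Callback contract (summary, assuming the EXT_* codes from
++ * ext3_extents.h): the callback sees every range in [block,
++ * block + num) in order, with "exists" telling holes from real
++ * extents. EXT_CONTINUE advances past the range, EXT_REPEAT
++ * revisits the same range (useful when the callback changed the
++ * tree), EXT_BREAK stops the walk with err == 0, and any negative
++ * return aborts the walk with that error.
++ */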
++
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex)
++{
++ if (tree->cex) {
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_len);
++ tree->cex->ee_block = ex->ee_block;
++ tree->cex->ee_start = ex->ee_start;
++ tree->cex->ee_len = ex->ee_len;
++ }
++}
++
++/*
++ * this routine calculates the boundaries of the gap the requested
++ * block fits into, and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex, gex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ gex.ee_block = 0;
++ gex.ee_len = EXT_CACHE_MARK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ gex.ee_block = block;
++ gex.ee_len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ gex.ee_block = ex->ee_block + ex->ee_len;
++ gex.ee_len = ext3_ext_next_allocated_block(path);
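++ /* note: ee_len temporarily holds the next allocated
++ * block here; the subtraction below converts it into
++ * the gap length */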
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(gex.ee_len > gex.ee_block);
++ gex.ee_len = gex.ee_len - gex.ee_block;
++ } else {
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block,
++ (unsigned long) gex.ee_len);
++ gex.ee_start = EXT_CACHE_MARK;
++ ext3_ext_put_in_cache(tree, &gex);
++}
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_extent *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return 0;
++
++ /* does the cache hold valid data? */
++ if (cex->ee_len == 0)
++ return 0;
++
++ if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) {
++ ex->ee_block = cex->ee_block;
++ ex->ee_start = cex->ee_start;
++ ex->ee_len = cex->ee_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return 1;
++ }
++
++ /* not in cache */
++ return 0;
++}
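++
++/*
++ * Example (illustrative): with cex = {ee_block 50, ee_len 10}, a
++ * lookup for block 57 hits (50 <= 57 < 60) and the caller computes
++ * the physical block as 57 - 50 + ee_start; block 60 misses, since
++ * the upper bound is exclusive.
++ */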
++
++/*
++ * this routine removes an index from the index block.
++ * it's used for truncate only, so all requests are for the
++ * last index in the block
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* perhaps there is space in the leaf? */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we're expecting is creation of a new root
++ * (growing in depth) with an index split; for the split we
++ * have to consider depth + 1, because the previous growth
++ * could have increased it
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need to:
++ * allocate intermediate indexes and a new leaf,
++ * change two blocks at each level but the root,
++ * and modify the root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++ return needed;
++}
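++
++/*
++ * Worked example (illustrative, taking EXT3_ALLOC_NEEDED as the
++ * credit cost of one block allocation): for a tree of depth 1 the
++ * function uses depth + 1 = 2 and returns
++ * (EXT3_ALLOC_NEEDED + 2) + 2 * EXT3_ALLOC_NEEDED + 2 * 2 + 1,
++ * i.e. 3 * EXT3_ALLOC_NEEDED + 7 journal credits.
++ */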
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++}
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of an internal part of the extent was requested;
++ * the tail and head must be placed in different extents,
++ * so we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) &&
++ ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove the whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely; mark the slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is free, then we should
++ * remove it from the index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
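++
++/*
++ * Summary of the cases above (illustrative): a and b are the borders
++ * of the range to drop, clipped to the current extent; a tail
++ * removal shrinks ee_len, removing the whole extent sets num = 0 so
++ * the slot is marked unused and later compacted, and a strictly
++ * internal removal is handled up front by ext3_ext_split_for_rm().
++ */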
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even partially)
++ */
++static int inline
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncation at a deeper level happened, it wasn't partial,
++ * so we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* the first extent we're going to free will probably be the last in its block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from the right side, freeing all the blocks
++ * after i_size and walking down the tree
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, "ext3_ext_remove_space",
++ "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* the root level has p_bh == NULL; brelse() handles this */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we've already been here; look at the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* record the actual number of indexes so we can tell
++ * whether it changed at the next iteration */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we've finished processing this index; go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* the index is empty; remove it.
++ * the handle must already be prepared by
++ * truncatei_leaf() */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* the root level has p_bh == NULL; brelse() handles this */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncating to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit in the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to the leaves fit in the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
++
++ return num;
++}
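++
++/*
++ * Worked example (illustrative capacities): with lcap = 340 extents
++ * per leaf and a root holding rcap = 4 index entries, blocks = 1000
++ * (worst case: one extent per block) needs (1000 + 339) / 340 = 3
++ * leaf blocks; 3 <= 4, so 3 is returned and no intermediate index
++ * blocks are charged.
++ */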
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, an extent can't cross a block group */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find a goal starting from the index block itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
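++
++/*
++ * Note: the fallback mirrors the classic ext3 goal heuristic; with
++ * 32768 blocks per group, a task with pid 4 gets colour
++ * (4 % 16) * (32768 / 16) = 8192, spreading concurrent allocators
++ * across the group to reduce contention on the same bitmap region.
++ */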
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse block from the extent to order data/metadata */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, err);
++ if (ex->ee_start == 0) {
++ /* an error occurred: restore the old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
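++
++/*
++ * Design note: the callback hands out the extent's own first block
++ * (ee_start++) so that tree metadata lands next to the data it
++ * describes, and only allocates a fresh block, via the usual goal
++ * logic, when the shrinking extent would otherwise become empty.
++ */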
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++ if (newex.ee_start == EXT_CACHE_MARK) {
++ /* this is a cached gap */
++ if (!create) {
++ /* the block isn't allocated yet and
++ * the caller doesn't want to allocate it */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (newex.ee_start) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * a consistent leaf must not be empty;
++ * this situation is possible, though, _during_ tree modification;
++ * this is why the assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if the found extent covers the block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex);
++ goto out;
++ }
++ }
++
++ /*
++ * the requested block isn't allocated yet;
++ * we must not create it if the create flag is zero
++ */
++ if (!create) {
++ /* put the just-found gap into the cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* the previous routine could have used the block we just allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, &newex);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
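++
++/*
++ * Note: this path takes truncate_sem internally, so callers must not
++ * already hold it; bh_result is mapped on success and marked new for
++ * freshly allocated blocks, as the generic get_block contract
++ * expects.
++ */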
++
++void ext3_ext_truncate(struct inode * inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * the first extent we're going to free will probably be the last in its block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: an optimization is possible here;
++ * we may not need scanning at all,
++ * because page truncation is enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* we have to know where to truncate from in the crash case */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1)
++ >> EXT3_BLOCK_SIZE_BITS(sb);
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate num blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (!exist)
++ return EXT_CONTINUE;
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int exist)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (!exist)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err)
++ err = copy_to_user((void *) arg, &buf, sizeof(buf));
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
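++
++/*
++ * Userspace sketch (hypothetical program, not part of this patch),
++ * assuming struct ext3_extent_buf as declared in ext3_extents.h:
++ *
++ *	struct ext3_extent ext[128];
++ *	struct ext3_extent_buf buf = { .start = 0, .buffer = ext,
++ *				       .buflen = sizeof(ext) };
++ *	int n = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ *
++ * A non-negative return is the number of extents copied out (the
++ * callback counts them in buf.err, which ext3_ext_ioctl() returns).
++ */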
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-04-05 12:26:19.368143176 +0800
++++ linux-2.6.10/fs/ext3/ialloc.c 2005-04-05 12:26:25.464216432 +0800
+@@ -644,6 +644,17 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) {
++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++ if (err) goto fail;
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS);
++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ }
++ }
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
+Index: linux-2.6.10/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800
++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800
+@@ -5,8 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o
+-
++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \
++ extents.o
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+ ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
+
+%diffstat
+ fs/ext3/Makefile | 4
+ fs/ext3/extents.c | 2306 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/ialloc.c | 11
+ fs/ext3/inode.c | 29
+ fs/ext3/ioctl.c | 4
+ fs/ext3/super.c | 15
+ include/linux/ext3_extents.h | 238 ++++
+ include/linux/ext3_fs.h | 20
+ include/linux/ext3_fs_i.h | 2
+ 9 files changed, 2619 insertions(+), 10 deletions(-)
+
--- /dev/null
+Index: linux-2.6.10/fs/ext3/xattr.h
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.h 2005-04-05 12:26:19.376141960 +0800
++++ linux-2.6.10/fs/ext3/xattr.h 2005-04-05 12:27:55.527524728 +0800
+@@ -70,6 +70,7 @@
+ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int);
+ extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int);
+
++extern int ext3_xattr_get_ea_loc(struct inode *, int, const char *, struct buffer_head **, int *, int *);
+ extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
+ extern void ext3_xattr_put_super(struct super_block *);
+
+Index: linux-2.6.10/fs/ext3/extents-in-ea.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-04-05 12:27:55.524525184 +0800
+@@ -0,0 +1,224 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/ext3_extents.h>
++#include <linux/ext3_xattr.h>
++#include <linux/slab.h>
++#include <asm/uaccess.h>
++
++static int ext3_get_ea_write_access(handle_t *handle, void *buffer)
++{
++ struct buffer_head *bh = (struct buffer_head *) buffer;
++ return ext3_journal_get_write_access(handle, bh);
++}
++
++static int ext3_mark_ea_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct buffer_head *bh = (struct buffer_head *) buffer;
++ ext3_journal_dirty_metadata(handle, bh);
++ return 0;
++}
++
++static struct ext3_extents_helpers ext3_ea_helpers = {
++ .get_write_access = ext3_get_ea_write_access,
++ .mark_buffer_dirty = ext3_mark_ea_buffer_dirty,
++ .mergable = NULL,
++ .new_block = NULL,
++ .remove_extent = NULL,
++ .remove_extent_credits = NULL,
++};
++
++int ext3_init_tree_in_ea_desc(struct ext3_extents_tree *tree,
++ struct inode *inode, int name_index,
++ const char *eaname)
++{
++ struct buffer_head *bh;
++ int offset, err, size;
++
++ err = ext3_xattr_get_ea_loc(inode, name_index, eaname,
++ &bh, &offset, &size);
++ if (err)
++ return err;
++
++ EXT_ASSERT(bh);
++ EXT_ASSERT(size >= sizeof(struct ext3_extent_header)
++ + sizeof(struct ext3_extent));
++ tree->inode = inode;
++ tree->root = (void *) bh->b_data + offset;
++ tree->buffer_len = size;
++ tree->buffer = (void *) bh;
++ tree->ops = &ext3_ea_helpers;
++ tree->cex = NULL; /* FIXME: add cache store later */
++ return 0;
++}
++
++void ext3_release_tree_in_ea_desc(struct ext3_extents_tree *tree)
++{
++ struct buffer_head *bh;
++
++ bh = (struct buffer_head *) tree->buffer;
++ EXT_ASSERT(bh);
++ brelse(bh);
++}
++
++int ext3_init_tree_in_ea(struct inode *inode, int name_index,
++ const char *eaname, int size)
++{
++ struct ext3_extents_tree tree;
++ handle_t *handle;
++ char *root;
++ int err;
++
++ root = kmalloc(size, GFP_USER);
++ if (!root)
++ return -ENOMEM;
++ memset(root, 0, size);
++
++ /* first, create ea to store root of the tree */
++ handle = ext3_journal_start(inode, EXT3_ALLOC_NEEDED + 3);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if ((err = ext3_xattr_set(inode, name_index,
++ eaname, root, size, 0)))
++ goto out;
++ if ((err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname)))
++ goto out;
++ err = ext3_extent_tree_init(handle, &tree);
++ ext3_release_tree_in_ea_desc(&tree);
++out:
++ ext3_journal_stop(handle, inode);
++ kfree(root);
++ return err;
++}
++
++static int
++ext3_ext_in_ea_new_extent(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct inode *inode = tree->inode;
++ handle_t *handle;
++ int needed, err;
++ unsigned long tgen;
++
++ if (exist)
++ return EXT_CONTINUE;
++
++ tgen = EXT_GENERATION(tree);
++ needed = ext3_ext_calc_credits_for_insert(tree, path);
++ up(&EXT3_I(inode)->truncate_sem);
++ handle = ext3_journal_start(tree->inode, needed + 10);
++ if (IS_ERR(handle)) {
++ down_write(&EXT3_I(inode)->truncate_sem);
++ return PTR_ERR(handle);
++ }
++
++ if (tgen != EXT_GENERATION(tree)) {
++ /* the tree has changed; the path can be invalid at the moment */
++ ext3_journal_stop(handle, inode);
++ down_write(&EXT3_I(inode)->truncate_sem);
++ return EXT_REPEAT;
++ }
++
++ down_write(&EXT3_I(inode)->truncate_sem);
++
++ /* insert new extent */
++ newex->ee_start = 0;
++ err = ext3_ext_insert_extent(handle, tree, path, newex);
++ if (!err)
++ ext3_journal_stop(handle, tree->inode);
++
++ return err;
++}
++
++int ext3_ext_in_ea_alloc_space(struct inode *inode, int name_index,
++ const char *eaname, unsigned long from,
++ unsigned long num)
++{
++ struct ext3_extents_tree tree;
++ int err;
++
++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname);
++ if (err == 0) {
++ down_write(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, from, num,
++ ext3_ext_in_ea_new_extent);
++ ext3_release_tree_in_ea_desc(&tree);
++ up_write(&EXT3_I(inode)->truncate_sem);
++ }
++ return err;
++}
++
++int ext3_ext_in_ea_remove_space(struct inode *inode, int name_index,
++ const char *eaname, unsigned long from,
++ unsigned long num)
++{
++ struct ext3_extents_tree tree;
++ int err;
++
++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname);
++ if (err == 0) {
++ err = ext3_ext_remove_space(&tree, from, num);
++ ext3_release_tree_in_ea_desc(&tree);
++ }
++ return err;
++}
++
++int ext3_ext_in_ea_presence(struct inode *inode, int name_index,
++ const char *eaname, unsigned long block)
++{
++ struct ext3_extents_tree tree;
++ struct ext3_ext_path *path;
++ struct ext3_extent *ex;
++ int err, depth;
++
++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname);
++ if (err)
++ return err;
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, block, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ goto out;
++ }
++
++ depth = EXT_DEPTH(&tree);
++ ex = path[depth].p_ext;
++ if (!ex) {
++ /* there is no extent yet */
++ goto out;
++ }
++
++ if (block >= ex->ee_block && block < ex->ee_block + ex->ee_len)
++ err = 1;
++out:
++ ext3_release_tree_in_ea_desc(&tree);
++ return err;
++}
++
+Index: linux-2.6.10/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-04-05 12:26:19.370142872 +0800
++++ linux-2.6.10/fs/ext3/xattr.c 2005-04-05 12:27:55.527524728 +0800
+@@ -590,7 +590,8 @@
+ */
+ int
+ ext3_xattr_ibody_find(struct inode *inode, int name_index,
+- const char *name, struct ext3_xattr_entry *rentry, int *free)
++ const char *name, struct ext3_xattr_entry *rentry, int *free,
++ struct buffer_head **bh, int *offset)
+ {
+ struct ext3_xattr_entry *last;
+ struct ext3_inode *raw_inode;
+@@ -637,6 +638,15 @@
+ name_len == last->e_name_len &&
+ !memcmp(name, last->e_name, name_len)) {
+ memcpy(rentry, last, sizeof(struct ext3_xattr_entry));
++ if (offset) {
++ void *voff;
++ voff = start + le16_to_cpu(last->e_value_offs);
++ *offset = voff - (void *) iloc.bh->b_data;
++ }
++ if (bh) {
++ get_bh(iloc.bh);
++ *bh = iloc.bh;
++ }
+ ret = 0;
+ } else {
+ *free -= EXT3_XATTR_LEN(last->e_name_len);
+@@ -657,7 +667,8 @@
+ */
+ int
+ ext3_xattr_block_find(struct inode *inode, int name_index, const char *name,
+- struct ext3_xattr_entry *rentry, int *free)
++ struct ext3_xattr_entry *rentry, int *free,
++ struct buffer_head **tbh, int *offset)
+ {
+ struct buffer_head *bh = NULL;
+ struct ext3_xattr_entry *entry;
+@@ -700,6 +711,12 @@
+ memcmp(name, entry->e_name, name_len) == 0) {
+ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry));
+ error = 0;
++ if (offset)
++ *offset = le16_to_cpu(entry->e_value_offs);
++ if (tbh) {
++ get_bh(bh);
++ *tbh = bh;
++ }
+ } else {
+ *free -= EXT3_XATTR_LEN(entry->e_name_len);
+ *free -= le32_to_cpu(entry->e_value_size);
+@@ -894,7 +911,8 @@
+ down_write(&EXT3_I(inode)->xattr_sem);
+
+ /* try to find attribute in inode body */
+- err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1);
++ err = ext3_xattr_ibody_find(inode, name_index, name,
++ &entry, &free1, NULL, NULL);
+ if (err == 0) {
+ /* found EA in inode */
+ found = 1;
+@@ -903,7 +921,7 @@
+ /* there is no such attribute in inode body */
+ /* try to find attribute in dedicated block */
+ err = ext3_xattr_block_find(inode, name_index, name,
+- &entry, &free2);
++ &entry, &free2, NULL, NULL);
+ if (err != 0 && err != -ENOENT) {
+ /* not found EA in block */
+ goto finish;
+@@ -960,6 +978,35 @@
+ return err;
+ }
+
++int ext3_xattr_get_ea_loc(struct inode *inode, int name_index,
++ const char *name, struct buffer_head **bh,
++ int *offset, int *size)
++{
++ int free1 = -1, free2 = -1, err, name_len;
++ struct ext3_xattr_entry entry;
++
++ ea_idebug(inode, "name=%d.%s", name_index, name);
++
++ if (name == NULL)
++ return -EINVAL;
++ name_len = strlen(name);
++ if (name_len > 255)
++ return -ERANGE;
++
++ /* try to find attribute in inode body */
++ err = ext3_xattr_ibody_find(inode, name_index, name,
++ &entry, &free1, bh, offset);
++ if (err == -ENOENT) {
++ /* there is no such attribute in inode body */
++ /* try to find attribute in dedicated block */
++ err = ext3_xattr_block_find(inode, name_index, name,
++ &entry, &free2, bh, offset);
++ }
++ if (err == 0 && size)
++ *size = le32_to_cpu(entry.e_value_size);
++ return err;
++}
++
+ /*
+ * ext3_xattr_block_set()
+ *
+Index: linux-2.6.10/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800
++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:28:26.989741744 +0800
+@@ -7,6 +7,6 @@
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \
+ extents.o
+-ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
++ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+ ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
--- /dev/null
+Index: linux-2.6.10/fs/ext3/extents-in-ea.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.471494208 +0800
++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.580477640 +0800
+@@ -27,7 +27,7 @@
+ #include <linux/quotaops.h>
+ #include <linux/string.h>
+ #include <linux/ext3_extents.h>
+-#include <linux/ext3_xattr.h>
++#include "xattr.h"
+ #include <linux/slab.h>
+ #include <asm/uaccess.h>
+
+@@ -111,7 +111,7 @@
+ err = ext3_extent_tree_init(handle, &tree);
+ ext3_release_tree_in_ea_desc(&tree);
+ out:
+- ext3_journal_stop(handle, inode);
++ ext3_journal_stop(handle);
+ kfree(root);
+ return err;
+ }
+@@ -134,24 +134,24 @@
+ up(&EXT3_I(inode)->truncate_sem);
+ handle = ext3_journal_start(tree->inode, needed + 10);
+ if (IS_ERR(handle)) {
+- down_write(&EXT3_I(inode)->truncate_sem);
++ down(&EXT3_I(inode)->truncate_sem);
+ return PTR_ERR(handle);
+ }
+
+ if (tgen != EXT_GENERATION(tree)) {
+ /* the tree has changed; the path can be invalid at the moment */
+- ext3_journal_stop(handle, inode);
+- down_write(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++ down(&EXT3_I(inode)->truncate_sem);
+ return EXT_REPEAT;
+ }
+
+- down_write(&EXT3_I(inode)->truncate_sem);
++ down(&EXT3_I(inode)->truncate_sem);
+
+ /* insert new extent */
+ newex->ee_start = 0;
+ err = ext3_ext_insert_extent(handle, tree, path, newex);
+ if (!err)
+- ext3_journal_stop(handle, tree->inode);
++ ext3_journal_stop(handle);
+
+ return err;
+ }
+@@ -165,11 +165,11 @@
+
+ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname);
+ if (err == 0) {
+- down_write(&EXT3_I(inode)->truncate_sem);
++ down(&EXT3_I(inode)->truncate_sem);
+ err = ext3_ext_walk_space(&tree, from, num,
+ ext3_ext_in_ea_new_extent);
+ ext3_release_tree_in_ea_desc(&tree);
+- up_write(&EXT3_I(inode)->truncate_sem);
++ up(&EXT3_I(inode)->truncate_sem);
+ }
+ return err;
+ }
+@@ -222,3 +222,112 @@
+ return err;
+ }
+
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (!exist)
++ return EXT_CONTINUE;
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int exist)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (!exist)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++struct ea_tree_desc {
++ int name_index;
++ char eaname[256];
++};
++
++int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
++{
++ int err = 0;
++
++ if (cmd == EXT3_IOC_EA_TREE_INIT) {
++ struct ea_tree_desc desc;
++
++ if (copy_from_user(&desc, (void *) arg, sizeof(desc)))
++ return -EFAULT;
++ err = ext3_init_tree_in_ea(inode, desc.name_index,
++ desc.eaname, 64);
++ } else if (cmd == EXT3_IOC_GET_EA_EXTENTS) {
++ struct ext3_extents_tree tree;
++ struct ext3_extent_buf buf;
++ struct ea_tree_desc desc;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++ if (copy_from_user(&desc, buf.cur, sizeof(desc)))
++ return -EFAULT;
++ err = ext3_init_tree_in_ea_desc(&tree, inode,
++ desc.name_index, desc.eaname);
++ if (err)
++ goto out;
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ if (err == 0)
++ err = buf.err;
++ ext3_release_tree_in_ea_desc(&tree);
++ } else if (cmd == EXT3_IOC_EA_TREE_ALLOCATE) {
++ struct ext3_extent_buf buf;
++ struct ea_tree_desc desc;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++ if (copy_from_user(&desc, buf.cur, sizeof(desc)))
++ return -EFAULT;
++ err = ext3_ext_in_ea_alloc_space(inode, desc.name_index,
++ desc.eaname, buf.start,
++ buf.err);
++ } else if (cmd == EXT3_IOC_EA_TREE_REMOVE) {
++ struct ext3_extent_buf buf;
++ struct ea_tree_desc desc;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++ if (copy_from_user(&desc, buf.cur, sizeof(desc)))
++ return -EFAULT;
++ err = ext3_ext_in_ea_remove_space(inode, desc.name_index,
++ desc.eaname, buf.start,
++ buf.err);
++ }
++
++out:
++ return err;
++}
++
+Index: linux-2.6.10/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-03-31 19:41:09.365510320 +0800
++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 19:41:09.580477640 +0800
+@@ -249,7 +249,13 @@
+ case EXT3_IOC_GET_TREE_STATS:
+ case EXT3_IOC_GET_TREE_DEPTH:
+ return ext3_ext_ioctl(inode, filp, cmd, arg);
+-
++ case EXT3_IOC_GET_EA_EXTENTS:
++ case EXT3_IOC_GET_EA_TREE_DEPTH:
++ case EXT3_IOC_GET_EA_TREE_STATS:
++ case EXT3_IOC_EA_TREE_INIT:
++ case EXT3_IOC_EA_TREE_ALLOCATE:
++ case EXT3_IOC_EA_TREE_REMOVE:
++ return ext3_ext_in_ea_ioctl(inode, filp, cmd, arg);
+ default:
+ return -ENOTTY;
+ }
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 19:41:09.366510168 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:43:30.539048680 +0800
+@@ -242,6 +242,15 @@
+ #define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long)
+ #define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long)
+ #define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long)
++
++#define EXT3_IOC_GET_EA_EXTENTS _IOR('f', 13, long)
++#define EXT3_IOC_GET_EA_TREE_DEPTH _IOR('f', 14, long)
++#define EXT3_IOC_GET_EA_TREE_STATS _IOR('f', 15, long)
++#define EXT3_IOC_EA_TREE_INIT _IOW('f', 16, long)
++#define EXT3_IOC_EA_TREE_ALLOCATE _IOW('f', 17, long)
++#define EXT3_IOC_EA_TREE_REMOVE _IOW('f', 18, long)
++
++
+ /*
+ * Structure of an inode on the disk
+ */
+@@ -788,7 +797,10 @@
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+ unsigned long);
+-
++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg);
++extern int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg);
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
--- /dev/null
+Index: linux-2.6.10/fs/ext3/dir.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/dir.c 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/fs/ext3/dir.c 2005-03-31 18:56:02.961946200 +0800
+@@ -53,6 +53,9 @@
+
+ static unsigned char get_dtype(struct super_block *sb, int filetype)
+ {
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM))
++ return DT_UNKNOWN;
++
+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT3_FT_MAX))
+ return DT_UNKNOWN;
+@@ -79,7 +82,8 @@
+ error_msg = "directory entry across blocks";
+ else if (le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+- error_msg = "inode out of bounds";
++ if (de->file_type != 128)
++ error_msg = "inode out of bounds";
+
+ if (error_msg != NULL)
+ ext3_error (dir->i_sb, function,
+Index: linux-2.6.10/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:41:15.880803032 +0800
++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:56:02.960946352 +0800
+@@ -24,6 +24,7 @@
+ * Theodore Ts'o, 2002
+ */
+
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd.h>
+@@ -1148,6 +1149,23 @@
+ inode = NULL;
+ if (bh) {
+ unsigned long ino = le32_to_cpu(de->inode);
++ unsigned type = de->file_type;
++ __u32 *mds;
++ mds = (__u32 *)((char *) de + EXT3_DIR_REC_LEN(de->name_len));
++ if ((type & 128) && EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
++ EXT3_FEATURE_INCOMPAT_MDSNUM) &&
++ mds[0] != EXT3_SB(dir->i_sb)->s_mdsnum) {
++ struct ext3_super_block *es;
++ es = EXT3_SB(dir->i_sb)->s_es;
++ brelse (bh);
++ dentry->d_flags |= DCACHE_CROSS_REF;
++ dentry->d_generation = mds[1];
++ dentry->d_mdsnum = mds[0];
++ dentry->d_inum = ino;
++ ext3_unlock_htree(dir, lock);
++ d_add(dentry, NULL);
++ return NULL;
++ }
+ ext3_unlock_htree(dir, lock);
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+@@ -1221,7 +1239,7 @@
+ while (count--) {
+ struct ext3_dir_entry_2 *de =
+ (struct ext3_dir_entry_2 *) (from + map->offs);
+- rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ rec_len = EXT3_DIR_REC_LEN_DE(de);
+ memcpy (to, de, rec_len);
+ ((struct ext3_dir_entry_2 *) to)->rec_len =
+ cpu_to_le16(rec_len);
+@@ -1243,7 +1261,7 @@
+ next = (struct ext3_dir_entry_2 *) ((char *) de +
+ le16_to_cpu(de->rec_len));
+ if (de->inode && de->name_len) {
+- rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ rec_len = EXT3_DIR_REC_LEN_DE(de);
+ if (de > to)
+ memmove(to, de, rec_len);
+ to->rec_len = cpu_to_le16(rec_len);
+@@ -1359,6 +1377,7 @@
+ struct buffer_head * bh)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
++ struct super_block *sb = dir->i_sb;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned long offset = 0;
+@@ -1367,6 +1386,10 @@
+ char *top;
+
+ reclen = EXT3_DIR_REC_LEN(namelen);
++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM)
++ && (dentry->d_flags & DCACHE_CROSS_REF)
++ && (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum))
++ reclen += 8; /* we need space to store mds num */
+ if (!de) {
+ de = (struct ext3_dir_entry_2 *)bh->b_data;
+ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+@@ -1380,7 +1403,7 @@
+ brelse (bh);
+ return -EEXIST;
+ }
+- nlen = EXT3_DIR_REC_LEN(de->name_len);
++ nlen = EXT3_DIR_REC_LEN_DE(de);
+ rlen = le16_to_cpu(de->rec_len);
+ if ((de->inode? rlen - nlen: rlen) >= reclen)
+ break;
+@@ -1399,7 +1422,7 @@
+ }
+
+ /* By now the buffer is marked for journaling */
+- nlen = EXT3_DIR_REC_LEN(de->name_len);
++ nlen = EXT3_DIR_REC_LEN_DE(de);
+ rlen = le16_to_cpu(de->rec_len);
+ if (de->inode) {
+ struct ext3_dir_entry_2 *de1 =
+@@ -1411,8 +1434,20 @@
+ de->file_type = EXT3_FT_UNKNOWN;
+ if (inode) {
+ de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
++ ext3_set_de_type(sb, de, inode->i_mode);
++ } else if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM)
++ && (dentry->d_flags & DCACHE_CROSS_REF)) {
++ if (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum) {
++ __u32 *mds;
++ mds = (__u32 *)((char *)de + EXT3_DIR_REC_LEN(namelen));
++ mds[0] = cpu_to_le32(dentry->d_mdsnum);
++ mds[1] = cpu_to_le32(dentry->d_generation);
++ de->inode = cpu_to_le32(dentry->d_inum);
++ de->file_type = 128;
++ } else {
++ de->inode = cpu_to_le32(dentry->d_inum);
++ }
++ } else
+ de->inode = 0;
+ de->name_len = namelen;
+ memcpy (de->name, name, namelen);
+@@ -2737,6 +2772,81 @@
+ }
+
+ /*
++ * caller has to make sure directory is protected
++ */
++int ext3_add_dir_entry(struct dentry *dentry)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ handle_t *handle;
++ int err;
++
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
++ return PTR_ERR(handle);
++ }
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ err = ext3_add_entry(handle, dentry, NULL);
++ ext3_journal_stop(handle);
++ return err;
++}
++EXPORT_SYMBOL(ext3_add_dir_entry);
++/*
++ * caller has to make sure directory is protected
++ */
++int ext3_del_dir_entry(struct dentry *dentry)
++{
++ struct inode * inode;
++ struct inode * dir = dentry->d_parent->d_inode;
++ struct buffer_head * bh;
++ struct ext3_dir_entry_2 * de;
++ handle_t *handle;
++ int retval;
++ void *lock = NULL;
++
++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
++ return PTR_ERR(handle);
++ }
++
++ if (IS_SYNC(dir))
++ handle->h_sync = 1;
++
++ retval = -ENOENT;
++ bh = ext3_find_entry (dentry, &de, 1, &lock);
++ ext3_unlock_htree(dir, lock);
++ if (!bh)
++ goto end_unlink;
++
++ inode = dentry->d_inode;
++ if (inode)
++ DQUOT_INIT(inode);
++
++ retval = ext3_delete_entry(handle, dir, de, bh);
++ if (retval)
++ goto end_unlink;
++ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ ext3_update_dx_flag(dir);
++ if (inode) {
++ inode->i_ctime = dir->i_ctime;
++ ext3_mark_inode_dirty(handle, inode);
++ if (S_ISDIR(inode->i_mode))
++ dir->i_nlink--;
++ }
++ ext3_mark_inode_dirty(handle, dir);
++ retval = 0;
++
++end_unlink:
++ ext3_journal_stop(handle);
++ brelse (bh);
++ return retval;
++}
++
++EXPORT_SYMBOL(ext3_del_dir_entry);
++/*
+ * directories can handle most operations...
+ */
+ struct inode_operations ext3_dir_inode_operations = {
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:54:32.497698856 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:56:41.955018352 +0800
+@@ -483,7 +483,8 @@
+ __u16 s_reserved_word_pad;
+ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+- __u32 s_reserved[190]; /* Padding to the end of the block */
++ __u32 s_mdsnum;
++ __u32 s_reserved[189]; /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+@@ -563,12 +564,14 @@
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
++#define EXT3_FEATURE_INCOMPAT_MDSNUM 0x0020 /* direntry has mdsnum */
+ #define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+
+ #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+ EXT3_FEATURE_INCOMPAT_META_BG| \
++ EXT3_FEATURE_INCOMPAT_MDSNUM| \
+ EXT3_FEATURE_INCOMPAT_EXTENTS)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -643,6 +646,9 @@
+ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
++#define EXT3_DIR_REC_LEN_DE(de) (EXT3_DIR_REC_LEN((de)->name_len) + \
++ (((de)->file_type & 128) ? 8 : 0))
++
+ /*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+@@ -868,6 +874,9 @@
+ extern void ext3_ext_release(struct super_block *);
+ extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+
++extern int ext3_add_dir_entry(struct dentry *dentry);
++
++extern int ext3_del_dir_entry(struct dentry *dentry);
+ #endif /* __KERNEL__ */
+
+ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
+Index: linux-2.6.10/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800
++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:56:02.964945744 +0800
+@@ -81,6 +81,7 @@
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++ u32 s_mdsnum;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
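
To make the cross-ref layout concrete: when a dirent's file_type has bit 128 set, two little-endian __u32 values (MDS number, then generation) follow the name, which is exactly what EXT3_DIR_REC_LEN_DE accounts for. A standalone sketch of the same arithmetic, with hypothetical names:

    /* Locate the (mdsnum, generation) pair stored after a cross-ref
     * dirent's name, mirroring the EXT3_DIR_REC_LEN_DE macro above. */
    #include <stdint.h>

    #define DIR_PAD   4
    #define DIR_ROUND (DIR_PAD - 1)
    /* 8-byte fixed header (inode, rec_len, name_len, file_type) + name */
    #define DIR_REC_LEN(name_len) (((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

    static inline uint32_t *cross_ref_mds(char *de, int name_len)
    {
        /* mds[0] = mdsnum, mds[1] = generation, little-endian on disk */
        return (uint32_t *)(de + DIR_REC_LEN(name_len));
    }
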
--- /dev/null
+ fs/ext3/ialloc.c | 3
+ fs/ext3/inode.c | 3
+ fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++---------
+ fs/ext3/super.c | 14 +
+ include/linux/ext3_fs.h | 1
+ include/linux/ext3_fs_i.h | 6
+ 6 files changed, 500 insertions(+), 109 deletions(-)
+
+Index: linux-2.6.10/fs/ext3/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 19:44:54.251322480 +0800
+@@ -458,6 +458,9 @@
+ #endif
+ ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+ ei->vfs_inode.i_version = 1;
++ dynlock_init(&ei->i_htree_lock);
++ sema_init(&ei->i_rename_sem, 1);
++ sema_init(&ei->i_append_sem, 1);
+ return &ei->vfs_inode;
+ }
+
+@@ -588,7 +591,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+ };
+
+@@ -637,6 +640,7 @@
+ {Opt_ignore, "quota"},
+ {Opt_ignore, "usrquota"},
+ {Opt_barrier, "barrier=%u"},
++ {Opt_pdirops, "pdirops"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+ };
+Index: linux-2.6.10/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 19:48:53.958881392 +0800
+@@ -53,6 +53,9 @@
+ {
+ struct buffer_head *bh;
+
++ /* with parallel dir operations all appends
++ * have to be serialized -bzzz */
++ down(&EXT3_I(inode)->i_append_sem);
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+@@ -60,6 +63,8 @@
+ EXT3_I(inode)->i_disksize = inode->i_size;
+ ext3_journal_get_write_access(handle,bh);
+ }
++ up(&EXT3_I(inode)->i_append_sem);
++
+ return bh;
+ }
+
+@@ -133,6 +138,8 @@
+ struct buffer_head *bh;
+ struct dx_entry *entries;
+ struct dx_entry *at;
++ unsigned long leaf;
++ unsigned int curidx;
+ };
+
+ struct dx_map_entry
+@@ -141,6 +148,30 @@
+ u32 offs;
+ };
+
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock 25
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++#ifdef CONFIG_SMP
++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++ while (test_bit(BH_DXLock, &bh->b_state))
++ cpu_relax();
++ }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++ smp_mb__before_clear_bit();
++ clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -152,7 +183,7 @@
+ static void dx_set_limit (struct dx_entry *entries, unsigned value);
+ static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+ static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
++static struct dx_frame *dx_probe(struct qstr *name,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct dx_frame *frame,
+@@ -164,15 +195,18 @@
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+- struct ext3_dir_entry_2 **res_dir, int *err);
++ struct ext3_dir_entry_2 **res_dir, int *err,
++ int rwlock, void **lock);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
++static void *ext3_lock_htree(struct inode *, unsigned long, int);
++static void ext3_unlock_htree(struct inode *, void *);
+
+ /*
+ * Future: use high four bits of block for coalesce-on-delete flags
+@@ -316,6 +350,94 @@
+ #endif /* DX_DEBUG */
+
+ /*
++ * dx_find_position
++ *
++ * search position of specified hash in index
++ *
++ */
++
++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
++{
++ struct dx_entry *p, *q, *m;
++ int count;
++
++ count = dx_get_count(entries);
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++ return p - 1;
++}
++
++/*
++ * returns 1 if path is unchanged
++ */
++int dx_check_path(struct dx_frame *frame, u32 hash)
++{
++ struct dx_entry *p;
++ int ret = 1;
++
++ dx_lock_bh(frame->bh);
++ p = dx_find_position(frame->entries, hash);
++ if (frame->leaf != dx_get_block(p))
++ ret = 0;
++ dx_unlock_bh(frame->bh);
++
++ return ret;
++}
++
++/*
++ * 0 - changed
++ * 1 - hasn't changed
++ */
++static int
++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
++{
++ struct dx_entry *p;
++ struct dx_frame *frame = frames;
++ u32 leaf;
++
++ /* check first level */
++ dx_lock_bh(frame->bh);
++ p = dx_find_position(frame->entries, hinfo->hash);
++ leaf = dx_get_block(p);
++ dx_unlock_bh(frame->bh);
++
++ if (leaf != frame->leaf)
++ return 0;
++
++ /* is there 2nd level? */
++ frame++;
++ if (frame->bh == NULL)
++ return 1;
++
++ /* check second level */
++ dx_lock_bh(frame->bh);
++
++ /* probably 1st level got changed, check it */
++ if (!dx_check_path(frames, hinfo->hash)) {
++ /* path changed */
++ dx_unlock_bh(frame->bh);
++ return 0;
++ }
++
++ p = dx_find_position(frame->entries, hinfo->hash);
++ leaf = dx_get_block(p);
++ dx_unlock_bh(frame->bh);
++
++ if (leaf != frame->leaf)
++ return 0;
++
++ return 1;
++}
++
++/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+@@ -325,19 +447,20 @@
+ * back to userspace.
+ */
+ static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
++dx_probe(struct qstr *name, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+ {
+- unsigned count, indirect;
+- struct dx_entry *at, *entries, *p, *q, *m;
++ unsigned indirect;
++ struct dx_entry *at, *entries;
+ struct dx_root *root;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+ u32 hash;
++ unsigned int curidx;
+
+ frame->bh = NULL;
+- if (dentry)
+- dir = dentry->d_parent->d_inode;
++ frame[1].bh = NULL;
++
+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+ root = (struct dx_root *) bh->b_data;
+@@ -353,8 +476,8 @@
+ }
+ hinfo->hash_version = root->info.hash_version;
+ hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
+- if (dentry)
+- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ if (name)
++ ext3fs_dirhash(name->name, name->len, hinfo);
+ hash = hinfo->hash;
+
+ if (root->info.unused_flags & 1) {
+@@ -366,7 +489,19 @@
+ goto fail;
+ }
+
++repeat:
++ curidx = 0;
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ dx_lock_bh(bh);
++ /* indirect must be initialized under the bh lock, because the
++ * 2nd level creation procedure may change it and dx_probe()
++ * would then think the htree is still single-level -bzzz */
+ if ((indirect = root->info.indirect_levels) > 1) {
++ dx_unlock_bh(bh);
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+@@ -374,56 +509,46 @@
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+-
+- entries = (struct dx_entry *) (((char *)&root->info) +
+- root->info.info_length);
+- assert(dx_get_limit(entries) == dx_root_limit(dir,
+- root->info.info_length));
+- dxtrace (printk("Look up %x", hash));
++
+ while (1)
+ {
+- count = dx_get_count(entries);
+- assert (count && count <= dx_get_limit(entries));
+- p = entries + 1;
+- q = entries + count - 1;
+- while (p <= q)
+- {
+- m = p + (q - p)/2;
+- dxtrace(printk("."));
+- if (dx_get_hash(m) > hash)
+- q = m - 1;
+- else
+- p = m + 1;
+- }
+-
+- if (0) // linear search cross check
+- {
+- unsigned n = count - 1;
+- at = entries;
+- while (n--)
+- {
+- dxtrace(printk(","));
+- if (dx_get_hash(++at) > hash)
+- {
+- at--;
+- break;
+- }
+- }
+- assert (at == p - 1);
+- }
+-
+- at = p - 1;
+- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ at = dx_find_position(entries, hinfo->hash);
++ dxtrace(printk(" %x->%u\n",
++ at == entries? 0: dx_get_hash(at),
++ dx_get_block(at)));
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+- if (!indirect--) return frame;
+- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ frame->curidx = curidx;
++ frame->leaf = dx_get_block(at);
++ if (!indirect--) {
++ dx_unlock_bh(bh);
++ return frame;
++ }
++
++ /* step into next htree level */
++ curidx = dx_get_block(at);
++ dx_unlock_bh(bh);
++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
+ goto fail2;
++
++ dx_lock_bh(bh);
++ /* splitting may change the root index block and move the
++ * hash we are looking for into another index block, so we
++ * have to check for this situation and repeat from the
++ * beginning if the path has changed -bzzz */
++ if (!dx_check_path(frame, hash)) {
++ dx_unlock_bh(bh);
++ bh = frame->bh;
++ indirect++;
++ goto repeat;
++ }
++
+ at = entries = ((struct dx_node *) bh->b_data)->entries;
+ assert (dx_get_limit(entries) == dx_node_limit (dir));
+ frame++;
+ }
++ dx_unlock_bh(bh);
+ fail2:
+ while (frame >= frame_in) {
+ brelse(frame->bh);
+@@ -437,8 +562,7 @@
+ {
+ if (frames[0].bh == NULL)
+ return;
+-
+- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ if (frames[1].bh != NULL)
+ brelse(frames[1].bh);
+ brelse(frames[0].bh);
+ }
+@@ -479,8 +603,10 @@
+ * nodes need to be read.
+ */
+ while (1) {
+- if (++(p->at) < p->entries + dx_get_count(p->entries))
++ if (++(p->at) < p->entries + dx_get_count(p->entries)) {
++ p->leaf = dx_get_block(p->at);
+ break;
++ }
+ if (p == frames)
+ return 0;
+ num_frames++;
+@@ -506,13 +632,17 @@
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+- 0, &err)))
++ u32 idx;
++
++ idx = p->leaf = dx_get_block(p->at);
++ if (!(bh = ext3_bread(NULL, dir, idx, 0, &err)))
+ return err; /* Failure */
+ p++;
+ brelse (p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ p->curidx = idx;
++ p->leaf = dx_get_block(p->at);
+ }
+ return 1;
+ }
+@@ -673,7 +803,8 @@
+ count++;
+ }
+ /* XXX: do we need to check rec_len == 0 case? -Chris */
+- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ de = (struct ext3_dir_entry_2 *)((char*)de +
++ le16_to_cpu(de->rec_len));
+ }
+ return count;
+ }
+@@ -706,7 +837,8 @@
+ } while(more);
+ }
+
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
++ u32 hash, u32 block, u32 idx)
+ {
+ struct dx_entry *entries = frame->entries;
+ struct dx_entry *old = frame->at, *new = old + 1;
+@@ -718,6 +850,7 @@
+ dx_set_hash(new, hash);
+ dx_set_block(new, block);
+ dx_set_count(entries, count + 1);
++
+ }
+ #endif
+
+@@ -798,7 +931,8 @@
+ * to brelse() it when appropriate.
+ */
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+- struct ext3_dir_entry_2 ** res_dir)
++ struct ext3_dir_entry_2 ** res_dir,
++ int rwlock, void **lock)
+ {
+ struct super_block * sb;
+ struct buffer_head * bh_use[NAMEI_RA_SIZE];
+@@ -814,6 +948,7 @@
+ int namelen;
+ const u8 *name;
+ unsigned blocksize;
++ int do_not_use_dx = 0;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+@@ -822,9 +957,10 @@
+ name = dentry->d_name.name;
+ if (namelen > EXT3_NAME_LEN)
+ return NULL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+ if (is_dx(dir)) {
+- bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
+ /*
+ * On success, or if the error was file not found,
+ * return. Otherwise, fall back to doing a search the
+@@ -833,8 +969,14 @@
+ if (bh || (err != ERR_BAD_DX_DIR))
+ return bh;
+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ do_not_use_dx = 1;
+ }
+ #endif
++ *lock = ext3_lock_htree(dir, 0, rwlock);
++ if (is_dx(dir) && !do_not_use_dx) {
++ ext3_unlock_htree(dir, *lock);
++ goto repeat;
++ }
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+@@ -907,12 +1049,17 @@
+ /* Clean up the read-ahead blocks */
+ for (; ra_ptr < ra_max; ra_ptr++)
+ brelse (bh_use[ra_ptr]);
++ if (!ret) {
++ ext3_unlock_htree(dir, *lock);
++ *lock = NULL;
++ }
+ return ret;
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+- struct ext3_dir_entry_2 **res_dir, int *err)
++ struct ext3_dir_entry_2 **res_dir, int *err,
++ int rwlock, void **lock)
+ {
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+@@ -927,11 +1074,21 @@
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ sb = dir->i_sb;
+- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
++repeat:
++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
+ return NULL;
++
++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
++ /* the leaf we just found may get split while we take the
++ * lock, in which case we need another leaf; check for this */
++ if (!dx_check_full_path(frames, &hinfo)) {
++ ext3_unlock_htree(dir, *lock);
++ dx_release(frames);
++ goto repeat;
++ }
+ hash = hinfo.hash;
+ do {
+- block = dx_get_block(frame->at);
++ block = frame->leaf;
+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ goto errout;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -966,6 +1123,8 @@
+ *err = -ENOENT;
+ errout:
+ dxtrace(printk("%s not found\n", name));
++ ext3_unlock_htree(dir, *lock);
++ *lock = NULL;
+ dx_release (frames);
+ return NULL;
+ }
+@@ -976,14 +1135,16 @@
+ struct inode * inode;
+ struct ext3_dir_entry_2 * de;
+ struct buffer_head * bh;
++ void *lock = NULL;
+
+ if (dentry->d_name.len > EXT3_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+- bh = ext3_find_entry(dentry, &de);
++ bh = ext3_find_entry(dentry, &de, 0, &lock);
+ inode = NULL;
+ if (bh) {
+ unsigned long ino = le32_to_cpu(de->inode);
++ ext3_unlock_htree(dir, lock);
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+@@ -1005,17 +1166,19 @@
+ struct dentry dotdot;
+ struct ext3_dir_entry_2 * de;
+ struct buffer_head *bh;
++ void *lock = NULL;
+
+ dotdot.d_name.name = "..";
+ dotdot.d_name.len = 2;
+ dotdot.d_parent = child; /* confusing, isn't it! */
+
+- bh = ext3_find_entry(&dotdot, &de);
++ bh = ext3_find_entry(&dotdot, &de, 0, &lock);
+ inode = NULL;
+ if (!bh)
+ return ERR_PTR(-ENOENT);
+ ino = le32_to_cpu(de->inode);
+ brelse(bh);
++ ext3_unlock_htree(child->d_inode, lock);
+ inode = iget(child->d_inode->i_sb, ino);
+
+ if (!inode)
+@@ -1054,7 +1217,8 @@
+ unsigned rec_len = 0;
+
+ while (count--) {
+- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ struct ext3_dir_entry_2 *de =
++ (struct ext3_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
+ ((struct ext3_dir_entry_2 *) to)->rec_len =
+@@ -1068,7 +1232,8 @@
+
+ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+ {
+- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ struct ext3_dir_entry_2 *next, *to, *prev;
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
+ unsigned rec_len = 0;
+
+ prev = to = de;
+@@ -1090,7 +1255,8 @@
+
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+ struct buffer_head **bh,struct dx_frame *frame,
+- struct dx_hash_info *hinfo, int *error)
++ struct dx_hash_info *hinfo, void **target,
++ int *error)
+ {
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+@@ -1137,23 +1303,30 @@
+ hash2 = map[split].hash;
+ continued = hash2 == map[split - 1].hash;
+ dxtrace(printk("Split block %i at %x, %i/%i\n",
+- dx_get_block(frame->at), hash2, split, count-split));
+-
++ frame->leaf, hash2, split, count-split));
++
+ /* Fancy dance to stay within two buffers */
+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
+ de = dx_pack_dirents(data1,blocksize);
+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
+
+ /* Which block gets the new entry? */
++ *target = NULL;
+ if (hinfo->hash >= hash2)
+ {
+ swap(*bh, bh2);
+ de = de2;
+- }
+- dx_insert_block (frame, hash2 + continued, newblock);
++
++ /* the entry will be stored in the new block, so we
++ * have to lock it before add_dirent_to_buf */
++ *target = ext3_lock_htree(dir, newblock, 1);
++ }
++ dx_lock_bh(frame->bh);
++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
++ dx_unlock_bh(frame->bh);
+ err = ext3_journal_dirty_metadata (handle, bh2);
+ if (err)
+ goto journal_error;
+@@ -1227,7 +1400,8 @@
+ nlen = EXT3_DIR_REC_LEN(de->name_len);
+ rlen = le16_to_cpu(de->rec_len);
+ if (de->inode) {
+- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ struct ext3_dir_entry_2 *de1 =
++ (struct ext3_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = cpu_to_le16(rlen - nlen);
+ de->rec_len = cpu_to_le16(nlen);
+ de = de1;
+@@ -1286,6 +1460,7 @@
+ struct dx_hash_info hinfo;
+ u32 block;
+ struct fake_dirent *fde;
++ void *lock, *new_lock = NULL;
+
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+@@ -1305,6 +1480,8 @@
+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
+ data1 = bh2->b_data;
+
++ lock = ext3_lock_htree(dir, block, 1);
++
+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+@@ -1334,13 +1511,25 @@
+ frame->entries = entries;
+ frame->at = entries;
+ frame->bh = bh;
++ frame->curidx = 0;
++ frame->leaf = 0;
++ frame[1].bh = NULL;
+ bh = bh2;
+- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
+ dx_release (frames);
+ if (!(de))
+- return retval;
++ goto cleanup;
++
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++cleanup:
++ if (new_lock)
++ ext3_unlock_htree(dir, new_lock);
++ /* we mark the directory indexed in order to avoid
++ * races while the htree is being created -bzzz */
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ ext3_unlock_htree(dir, lock);
+
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ return retval;
+ }
+ #endif
+
+@@ -1369,11 +1558,13 @@
+ unsigned blocksize;
+ unsigned nlen, rlen;
+ u32 block, blocks;
++ void *lock;
+
+ sb = dir->i_sb;
+ blocksize = sb->s_blocksize;
+ if (!dentry->d_name.len)
+ return -EINVAL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+ if (is_dx(dir)) {
+ retval = ext3_dx_add_entry(handle, dentry, inode);
+@@ -1384,30 +1575,50 @@
+ ext3_mark_inode_dirty(handle, dir);
+ }
+ #endif
++ lock = ext3_lock_htree(dir, 0, 1);
++ if (is_dx(dir)) {
++ /* we got the lock for block 0; most likely the
++ * previous holder of the lock created the htree -bzzz */
++ ext3_unlock_htree(dir, lock);
++ goto repeat;
++ }
++
+ blocks = dir->i_size >> sb->s_blocksize_bits;
+ for (block = 0, offset = 0; block < blocks; block++) {
+ bh = ext3_bread(handle, dir, block, 0, &retval);
+- if(!bh)
+- return retval;
++ if(!bh) {
++ ext3_unlock_htree(dir, lock);
++ return retval;
++ }
+ retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
++ if (retval != -ENOSPC) {
++ ext3_unlock_htree(dir, lock);
++ return retval;
++ }
+- if (retval != -ENOSPC)
+- return retval;
+
+ #ifdef CONFIG_EXT3_INDEX
+ if (blocks == 1 && !dx_fallback &&
+- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+- return make_indexed_dir(handle, dentry, inode, bh);
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
++ retval = make_indexed_dir(handle, dentry, inode, bh);
++ ext3_unlock_htree(dir, lock);
++ return retval;
++ }
+ #endif
+ brelse(bh);
+ }
+ bh = ext3_append(handle, dir, &block, &retval);
+- if (!bh)
+- return retval;
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- de->inode = 0;
+- de->rec_len = cpu_to_le16(rlen = blocksize);
+- nlen = 0;
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ if (!bh) {
++ ext3_unlock_htree(dir, lock);
++ return retval;
++ }
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ de->inode = 0;
++ de->rec_len = cpu_to_le16(rlen = blocksize);
++ nlen = 0;
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ ext3_unlock_htree(dir, lock);
++ return retval;
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+@@ -1425,15 +1638,27 @@
+ struct super_block * sb = dir->i_sb;
+ struct ext3_dir_entry_2 *de;
+ int err;
+-
+- frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
++ int curidx;
++ void *idx_lock, *leaf_lock, *newleaf_lock;
++
++repeat:
++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+- entries = frame->entries;
+- at = frame->at;
+-
+- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+- goto cleanup;
++ /* we're going to change the leaf, so lock it first */
++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
++
++ /* the leaf we just found may get split while we take the
++ * lock, so we need to check for this */
++ if (!dx_check_full_path(frames, &hinfo)) {
++ ext3_unlock_htree(dir, leaf_lock);
++ dx_release(frames);
++ goto repeat;
++ }
++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
++ goto cleanup;
++ }
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, bh);
+@@ -1446,6 +1671,35 @@
+ goto cleanup;
+ }
+
++ /* our leaf does not have enough space, hence we have to
++ * split it; lock the index for this leaf first */
++ curidx = frame->curidx;
++ idx_lock = ext3_lock_htree(dir, curidx, 1);
++
++ /* now check whether the path has changed */
++ dx_release(frames);
++
++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
++ &hinfo, frames, &err);
++ if (!frame) {
++ /* FIXME: error handling here */
++ brelse(bh);
++ ext3_unlock_htree(dir, idx_lock);
++ return err;
++ }
++
++ if (frame->curidx != curidx) {
++ /* path has been changed. we have to drop old lock
++ * and repeat */
++ brelse(bh);
++ ext3_unlock_htree(dir, idx_lock);
++ ext3_unlock_htree(dir, leaf_lock);
++ dx_release(frames);
++ goto repeat;
++ }
++ entries = frame->entries;
++ at = frame->at;
++
+ /* Block full, should compress but for now just split */
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+@@ -1457,7 +1711,8 @@
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+-
++ void *nb_lock;
++
+ if (levels && (dx_get_count(frames->entries) ==
+ dx_get_limit(frames->entries))) {
+ ext3_warning(sb, __FUNCTION__,
+@@ -1468,6 +1723,7 @@
+ bh2 = ext3_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
++ nb_lock = ext3_lock_htree(dir, newblock, 1);
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+@@ -1479,27 +1735,73 @@
+ if (levels) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+- dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++ void *ri_lock;
+
+- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++ /* we have to protect the root htree index against
++ * another dx_add_entry() that might want to
++ * split it too -bzzz */
++ ri_lock = ext3_lock_htree(dir, 0, 1);
++
++ /* with the root index block locked we must repeat the
++ * search for the current position of our 2nd level index -bzzz */
++ dx_lock_bh(frame->bh);
++ frames->at = dx_find_position(frames->entries, hinfo.hash);
++ dx_unlock_bh(frame->bh);
++
++ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++
++ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle,
+ frames[0].bh);
+ if (err)
+ goto journal_error;
+
++ /* copy index into new one */
+ memcpy ((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry));
+- dx_set_count (entries, icount1);
+ dx_set_count (entries2, icount2);
+ dx_set_limit (entries2, dx_node_limit(dir));
+
+ /* Which index block gets the new entry? */
+ if (at - entries >= icount1) {
++ /* unlock index we won't use */
++ ext3_unlock_htree(dir, idx_lock);
++ idx_lock = nb_lock;
+ frame->at = at = at - entries - icount1 + entries2;
+- frame->entries = entries = entries2;
++ frame->entries = entries2;
++ frame->curidx = curidx = newblock;
+ swap(frame->bh, bh2);
++ } else {
++ /* we'll use the old index, so the new one may be freed */
++ ext3_unlock_htree(dir, nb_lock);
+ }
+- dx_insert_block (frames + 0, hash2, newblock);
++
++ /* NOTE: very subtle piece of code. a competing dx_probe()
++ * may find the 2nd level index in the root index, then we
++ * insert a new entry here and set the new count in that
++ * 2nd level index. so, dx_probe() may see the 2nd level
++ * index without the hash it looks for. the solution is to
++ * re-check the root index after we lock the just-found
++ * 2nd level index -bzzz */
++ dx_lock_bh(frames[0].bh);
++ dx_insert_block (dir, frames + 0, hash2, newblock, 0);
++ dx_unlock_bh(frames[0].bh);
++
++ /* now both the old and the new 2nd level index blocks
++ * contain all the pointers, so dx_probe() may find the
++ * hash in either of them. that's OK -bzzz */
++
++ dx_lock_bh(frame->bh);
++ dx_set_count(entries, icount1);
++ dx_unlock_bh(frame->bh);
++
++ /* now the old 2nd level index block points to the first
++ * half of the leaves. it is important that dx_probe()
++ * checks the root index block for changes under
++ * dx_lock_bh(frame->bh) -bzzz */
++
++ ext3_unlock_htree(dir, ri_lock);
++
+ dxtrace(dx_show_index ("node", frames[1].entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+@@ -1508,38 +1810,60 @@
+ goto journal_error;
+ brelse (bh2);
+ } else {
++ unsigned long leaf = frame->leaf;
+ dxtrace(printk("Creating second level index...\n"));
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+ /* Set up root */
++ dx_lock_bh(frames[0].bh);
+ dx_set_count(entries, 1);
+ dx_set_block(entries + 0, newblock);
+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++ dx_unlock_bh(frames[0].bh);
+
+ /* Add new access path frame */
+ frame = frames + 1;
+ frame->at = at = at - entries + entries2;
+ frame->entries = entries = entries2;
+ frame->bh = bh2;
++ frame->curidx = newblock;
++ frame->leaf = leaf;
+ err = ext3_journal_get_write_access(handle,
+ frame->bh);
+ if (err)
+ goto journal_error;
++
++ /* the first level index was the root, which is already
++ * initialized, so we may unlock it now */
++ ext3_unlock_htree(dir, idx_lock);
++
++ /* the current index is the just-created 2nd level index */
++ curidx = newblock;
++ idx_lock = nb_lock;
+ }
+ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
+ if (!de)
+ goto cleanup;
++
++ /* index has been split */
++ ext3_unlock_htree(dir, idx_lock);
++
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++
++ if (newleaf_lock)
++ ext3_unlock_htree(dir, newleaf_lock);
++
+ bh = NULL;
+ goto cleanup;
+
+ journal_error:
+ ext3_std_error(dir->i_sb, err);
+ cleanup:
++ ext3_unlock_htree(dir, leaf_lock);
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
+@@ -1989,6 +2313,7 @@
+ struct buffer_head * bh;
+ struct ext3_dir_entry_2 * de;
+ handle_t *handle;
++ void *lock;
+
+ /* Initialize quotas before so that eventual writes go in
+ * separate transaction */
+@@ -1998,7 +2323,7 @@
+ return PTR_ERR(handle);
+
+ retval = -ENOENT;
+- bh = ext3_find_entry (dentry, &de);
++ bh = ext3_find_entry (dentry, &de, 1, &lock);
+ if (!bh)
+ goto end_rmdir;
+
+@@ -2008,14 +2333,19 @@
+ inode = dentry->d_inode;
+
+ retval = -EIO;
+- if (le32_to_cpu(de->inode) != inode->i_ino)
++ if (le32_to_cpu(de->inode) != inode->i_ino) {
++ ext3_unlock_htree(dir, lock);
+ goto end_rmdir;
++ }
+
+ retval = -ENOTEMPTY;
+- if (!empty_dir (inode))
++ if (!empty_dir (inode)) {
++ ext3_unlock_htree(dir, lock);
+ goto end_rmdir;
++ }
+
+ retval = ext3_delete_entry(handle, dir, de, bh);
++ ext3_unlock_htree(dir, lock);
+ if (retval)
+ goto end_rmdir;
+ if (inode->i_nlink != 2)
+@@ -2048,6 +2378,7 @@
+ struct buffer_head * bh;
+ struct ext3_dir_entry_2 * de;
+ handle_t *handle;
++ void *lock;
+
+ /* Initialize quotas before so that eventual writes go
+ * in separate transaction */
+@@ -2060,15 +2391,17 @@
+ handle->h_sync = 1;
+
+ retval = -ENOENT;
+- bh = ext3_find_entry (dentry, &de);
++ bh = ext3_find_entry (dentry, &de, 1, &lock);
+ if (!bh)
+ goto end_unlink;
+
+ inode = dentry->d_inode;
+
+ retval = -EIO;
+- if (le32_to_cpu(de->inode) != inode->i_ino)
++ if (le32_to_cpu(de->inode) != inode->i_ino) {
++ ext3_unlock_htree(dir, lock);
+ goto end_unlink;
++ }
+
+ if (!inode->i_nlink) {
+ ext3_warning (inode->i_sb, "ext3_unlink",
+@@ -2077,6 +2410,7 @@
+ inode->i_nlink = 1;
+ }
+ retval = ext3_delete_entry(handle, dir, de, bh);
++ ext3_unlock_htree(dir, lock);
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2196,6 +2530,7 @@
+ struct buffer_head * old_bh, * new_bh, * dir_bh;
+ struct ext3_dir_entry_2 * old_de, * new_de;
+ int retval;
++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
+
+ old_bh = new_bh = dir_bh = NULL;
+
+@@ -2211,7 +2546,10 @@
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ handle->h_sync = 1;
+
+- old_bh = ext3_find_entry (old_dentry, &old_de);
++ if (old_dentry->d_parent == new_dentry->d_parent)
++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
++
++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+@@ -2224,7 +2562,7 @@
+ goto end_rename;
+
+ new_inode = new_dentry->d_inode;
+- new_bh = ext3_find_entry (new_dentry, &new_de);
++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
+ if (new_bh) {
+ if (!new_inode) {
+ brelse (new_bh);
+@@ -2288,7 +2626,7 @@
+ struct buffer_head *old_bh2;
+ struct ext3_dir_entry_2 *old_de2;
+
+- old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
+ if (old_bh2) {
+ retval = ext3_delete_entry(handle, old_dir,
+ old_de2, old_bh2);
+@@ -2331,6 +2669,14 @@
+ retval = 0;
+
+ end_rename:
++ if (lock1)
++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
++ if (lock2)
++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
++ if (lock3)
++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
++ if (old_dentry->d_parent == new_dentry->d_parent)
++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
+ brelse (dir_bh);
+ brelse (old_bh);
+ brelse (new_bh);
+@@ -2339,6 +2685,29 @@
+ }
+
+ /*
++ * these locking primitives are used to protect parts of a
++ * dir's htree. the unit of protection is a block: leaf or index
++ */
++static void *ext3_lock_htree(struct inode *dir,
++ unsigned long value, int rwlock)
++{
++ void *lock;
++
++ if (!test_opt(dir->i_sb, PDIROPS))
++ return NULL;
++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
++ return lock;
++}
++
++static void ext3_unlock_htree(struct inode *dir,
++ void *lock)
++{
++ if (!test_opt(dir->i_sb, PDIROPS) || !lock)
++ return;
++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
++}
++
++/*
+ * directories can handle most operations...
+ */
+ struct inode_operations ext3_dir_inode_operations = {
+Index: linux-2.6.10/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-03-31 19:44:54.254322024 +0800
+@@ -19,6 +19,7 @@
+ #include <linux/rwsem.h>
+ #include <linux/rbtree.h>
+ #include <linux/seqlock.h>
++#include <linux/dynlocks.h>
+
+ struct ext3_reserve_window {
+ __u32 _rsv_start; /* First byte reserved */
+@@ -125,6 +126,11 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
++ /* following fields for parallel directory operations -bzzz */
++ struct dynlock i_htree_lock;
++ struct semaphore i_append_sem;
++ struct semaphore i_rename_sem;
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:44:54.254322024 +0800
+@@ -355,6 +355,7 @@
+ #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
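
The dynlock-based helpers above boil down to a common idiom: acquire a lock keyed by an arbitrary value (here a block number), creating the lock on first use. A hedged userspace analogue with entirely hypothetical names; a real implementation, like dynlock, would also free entries once their refcount drops to zero:

    /* Per-value locking: one mutex per locked value, found or created
     * under a global table lock. Illustrative analogue only. */
    #include <pthread.h>
    #include <stdlib.h>

    struct vlock {
        unsigned long value;      /* which block/bucket is locked */
        int refcount;             /* holder plus waiters */
        pthread_mutex_t mutex;    /* the per-value lock itself */
        struct vlock *next;
    };

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct vlock *locks;

    void *value_lock(unsigned long value)
    {
        struct vlock *l;

        pthread_mutex_lock(&table_lock);
        for (l = locks; l != NULL; l = l->next)
            if (l->value == value)
                break;
        if (l == NULL) {
            l = calloc(1, sizeof(*l));
            l->value = value;
            pthread_mutex_init(&l->mutex, NULL);
            l->next = locks;
            locks = l;
        }
        l->refcount++;
        pthread_mutex_unlock(&table_lock);

        pthread_mutex_lock(&l->mutex);  /* may sleep if already held */
        return l;
    }

    void value_unlock(void *cookie)
    {
        struct vlock *l = cookie;

        pthread_mutex_unlock(&l->mutex);
        pthread_mutex_lock(&table_lock);
        l->refcount--;                  /* freeing on 0 omitted here */
        pthread_mutex_unlock(&table_lock);
    }
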
--- /dev/null
+ fs/ext3/ialloc.c | 35 ++++++++++++++++++++++++++++++++++-
+ fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++
+ fs/ext3/namei.c | 21 +++++++++++++++++----
+ include/linux/dcache.h | 5 +++++
+ include/linux/ext3_fs.h | 5 ++++-
+ 5 files changed, 85 insertions(+), 6 deletions(-)
+
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-03-31 18:19:50.911148112 +0800
++++ linux-2.6.10/fs/ext3/ialloc.c 2005-03-31 18:39:48.578075064 +0800
+@@ -419,7 +419,8 @@
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
++struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode,
++ unsigned long goal)
+ {
+ struct super_block *sb;
+ struct buffer_head *bitmap_bh = NULL;
+@@ -447,6 +448,38 @@
+
+ sbi = EXT3_SB(sb);
+ es = sbi->s_es;
++ if (goal) {
++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb);
++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb);
++ gdp = ext3_get_group_desc(sb, group, &bh2);
++
++ err = -EIO;
++ bitmap_bh = read_inode_bitmap (sb, group);
++ if (!bitmap_bh)
++ goto fail;
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) goto fail;
++
++ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
++ ino, bitmap_bh->b_data)) {
++ printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++ /* Oh well, we tried. */
++ goto continue_allocation;
++ }
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err) goto fail;
++
++ /* We've short-circuited the allocation system successfully,
++ * now finish filling in the inode.
++ */
++ goto got;
++ }
++
++continue_allocation:
+ if (S_ISDIR(mode)) {
+ if (test_opt (sb, OLDALLOC))
+ group = find_group_dir(sb, dir);
+Index: linux-2.6.10/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ioctl.c 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 18:39:48.579074912 +0800
+@@ -9,6 +9,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
++#include <linux/namei.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/time.h>
+@@ -25,6 +26,31 @@
+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
++ case EXT3_IOC_CREATE_INUM: {
++ char name[32];
++ struct dentry *dchild, *dparent;
++ int rc = 0;
++
++ dparent = list_entry(inode->i_dentry.next, struct dentry,
++ d_alias);
++ snprintf(name, sizeof name, "%lu", arg);
++ dchild = lookup_one_len(name, dparent, strlen(name));
++ if (dchild->d_inode) {
++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
++ dparent->d_name.len, dparent->d_name.name, arg,
++ dchild->d_inode->i_ino);
++ rc = -EEXIST;
++ } else {
++ dchild->d_fsdata = (void *)arg;
++ rc = vfs_create(inode, dchild, 0644, NULL);
++ if (rc)
++ printk(KERN_ERR "vfs_create: %d\n", rc);
++ else if (dchild->d_inode->i_ino != arg)
++ rc = -EEXIST;
++ }
++ dput(dchild);
++ return rc;
++ }
+ case EXT3_IOC_GETFLAGS:
+ flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *) arg);
+Index: linux-2.6.10/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:36:12.177972880 +0800
++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:39:48.582074456 +0800
+@@ -1940,6 +1940,19 @@
+ return err;
+ }
+
++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
++ int mode, struct dentry *dentry)
++{
++ unsigned long inum = 0;
++
++ if (dentry->d_fsdata != NULL) {
++ struct dentry_params *param =
++ (struct dentry_params *) dentry->d_fsdata;
++ inum = param->p_inum;
++ }
++ return ext3_new_inode(handle, dir, mode, inum);
++}
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+@@ -1965,7 +1978,7 @@
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+@@ -1999,7 +2012,7 @@
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, inode->i_mode, rdev);
+@@ -2035,7 +2048,7 @@
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -2450,7 +2463,7 @@
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:38:11.720799608 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:40:36.630769944 +0800
+@@ -230,6 +230,7 @@
+ #define EXT3_IOC_SETVERSION _IOW('f', 4, long)
+ #define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+ #define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+ #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
+ #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+@@ -742,7 +743,8 @@
+ dx_hash_info *hinfo);
+
+ /* ialloc.c */
+-extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
++extern struct inode * ext3_new_inode (handle_t *, struct inode *, int,
++ unsigned long);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+@@ -834,4 +836,5 @@
+
+ #endif /* __KERNEL__ */
+
++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
+ #endif /* _LINUX_EXT3_FS_H */
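
For usage context: EXT3_IOC_CREATE_INUM is issued on the parent directory, and the handler above creates a file named after the decimal inode number inside it. A hedged sketch; the helper name and error handling are illustrative:

    #define _GNU_SOURCE            /* for O_DIRECTORY */
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)

    int create_with_inum(const char *parent_dir, unsigned long inum)
    {
        int fd = open(parent_dir, O_RDONLY | O_DIRECTORY);
        int rc;

        if (fd < 0)
            return -1;
        /* the kernel side names the new file "<inum>" inside parent_dir */
        rc = ioctl(fd, EXT3_IOC_CREATE_INUM, inum);
        close(fd);
        return rc;
    }
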
--- /dev/null
+Index: linux-2.6.10/fs/hostfs/hostfs_user.c
+===================================================================
+--- linux-2.6.10.orig/fs/hostfs/hostfs_user.c 2004-12-25 05:35:15.000000000 +0800
++++ linux-2.6.10/fs/hostfs/hostfs_user.c 2005-03-31 19:26:03.810175656 +0800
+@@ -121,13 +121,26 @@
+ {
+ DIR *dir = stream;
+ struct dirent *ent;
++ off_t off = 0;
++ off_t after_seek = 0;
++ off_t after_readdir = 0;
++ off_t after_readdir2 = 0;
+
+ seekdir(dir, *pos);
++ after_seek = telldir(dir);
+ ent = readdir(dir);
++ after_readdir = telldir(dir);
++ if (after_seek != after_readdir) {
++ off = after_readdir;
++ } else {
++ readdir(dir);
++ after_readdir2 = telldir(dir);
++ off = after_readdir2;
++ }
+ if(ent == NULL) return(NULL);
+ *len_out = strlen(ent->d_name);
+ *ino_out = ent->d_ino;
+- *pos = telldir(dir);
++ *pos = off;
+ return(ent->d_name);
+ }
+
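
The hunk above works around host filesystems where telldir() does not advance past the entry just returned by readdir(). The same probing logic as a hedged standalone sketch, outside hostfs:

    /* If the offset did not move across readdir(), read one more entry
     * and use that offset instead. Illustration only. */
    #include <dirent.h>
    #include <stddef.h>

    const char *read_one(DIR *dir, long *pos)
    {
        struct dirent *ent;
        long after_seek, after_readdir;

        seekdir(dir, *pos);
        after_seek = telldir(dir);
        ent = readdir(dir);
        if (ent == NULL)
            return NULL;
        after_readdir = telldir(dir);
        if (after_seek != after_readdir)
            *pos = after_readdir;   /* offset advanced normally */
        else {
            readdir(dir);           /* offset stuck: skip past entry */
            *pos = telldir(dir);
        }
        return ent->d_name;
    }
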
--- /dev/null
+ fs/ext3/inode.c | 3
+ fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++
+ fs/ext3/iopen.h | 15 ++
+ fs/ext3/namei.c | 13 ++
+ fs/ext3/super.c | 17 ++
+ include/linux/ext3_fs.h | 2
+ 7 files changed, 304 insertions(+), 1 deletion(-)
+
+Index: linux-2.6.10/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:25:13.635136112 +0800
++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:25:13.801110880 +0800
+@@ -357,6 +357,8 @@
+ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+ #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+Index: linux-2.6.10/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:25:13.726122280 +0800
++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 12:25:13.794111944 +0800
+@@ -37,6 +37,7 @@
+ #include <linux/mpage.h>
+ #include <linux/uio.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+
+ /*
+@@ -2411,6 +2412,9 @@
+ #endif
+ ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+
++ if (ext3_iopen_get_inode(inode))
++ return;
++
+ if (ext3_get_inode_loc(inode, &iloc, 0))
+ goto bad_inode;
+ bh = iloc.bh;
+Index: linux-2.6.10/fs/ext3/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:25:13.728121976 +0800
++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:25:13.797111488 +0800
+@@ -592,6 +592,7 @@
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+ };
+
+@@ -641,6 +642,9 @@
+ {Opt_ignore, "usrquota"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_pdirops, "pdirops"},
++ {Opt_iopen, "iopen"},
++ {Opt_noiopen, "noiopen"},
++ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+ };
+@@ -921,6 +925,18 @@
+ else
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
++ case Opt_iopen:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_noiopen:
++ clear_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_iopen_nopriv:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
+ case Opt_ignore:
+ break;
+ case Opt_resize:
+Index: linux-2.6.10/fs/ext3/iopen.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/iopen.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/ext3/iopen.c 2005-04-05 12:25:13.791112400 +0800
+@@ -0,0 +1,274 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ * for an inode at one time.
++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ * aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN 32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct inode *inode;
++ unsigned long ino;
++ struct list_head *lp;
++ struct dentry *alternate;
++ char buf[IOPEN_NAME_LEN];
++
++ if (dentry->d_name.len >= IOPEN_NAME_LEN)
++ return ERR_PTR(-ENAMETOOLONG);
++
++ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++ buf[dentry->d_name.len] = 0;
++
++ if (strcmp(buf, ".") == 0)
++ ino = dir->i_ino;
++ else if (strcmp(buf, "..") == 0)
++ ino = EXT3_ROOT_INO;
++ else
++ ino = simple_strtoul(buf, 0, 0);
++
++ if ((ino != EXT3_ROOT_INO &&
++ //ino != EXT3_ACL_IDX_INO &&
++ //ino != EXT3_ACL_DATA_INO &&
++ ino < EXT3_FIRST_INO(dir->i_sb)) ||
++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
++ return ERR_PTR(-ENOENT);
++
++ inode = iget(dir->i_sb, ino);
++ if (!inode)
++ return ERR_PTR(-EACCES);
++ if (is_bad_inode(inode)) {
++ iput(inode);
++ return ERR_PTR(-ENOENT);
++ }
++
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ assert(d_unhashed(dentry)); /* d_rehash */
++
++ /* preferably return a connected dentry */
++ spin_lock(&dcache_lock);
++ list_for_each(lp, &inode->i_dentry) {
++ alternate = list_entry(lp, struct dentry, d_alias);
++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++ }
++
++ if (!list_empty(&inode->i_dentry)) {
++ alternate = list_entry(inode->i_dentry.next,
++ struct dentry, d_alias);
++ dget_locked(alternate);
++ spin_lock(&alternate->d_lock);
++ alternate->d_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
++ iput(inode);
++ spin_unlock(&dcache_lock);
++ return alternate;
++ }
++ dentry->d_flags |= DCACHE_DISCONNECTED;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++
++ __d_rehash(dentry); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++#define do_switch(x,y) do { \
++ __typeof__ (x) __tmp = x; \
++ x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++ const unsigned char *old_name, *new_name;
++
++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN);
++ old_name = target->d_name.name;
++ new_name = dentry->d_name.name;
++ if (old_name == target->d_iname)
++ old_name = dentry->d_iname;
++ if (new_name == dentry->d_iname)
++ new_name = target->d_iname;
++ target->d_name.name = new_name;
++ dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++ int rehash)
++{
++ struct dentry *tmp, *goal = NULL;
++ struct list_head *lp;
++
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ if (rehash)
++ assert(d_unhashed(dentry)); /* d_rehash */
++ assert(list_empty(&dentry->d_subdirs));
++
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_rehash;
++
++ /* preferably return a connected dentry */
++ list_for_each(lp, &inode->i_dentry) {
++ tmp = list_entry(lp, struct dentry, d_alias);
++ if (tmp->d_flags & DCACHE_DISCONNECTED) {
++ assert(tmp->d_alias.next == &inode->i_dentry);
++ assert(tmp->d_alias.prev == &inode->i_dentry);
++ goal = tmp;
++ dget_locked(goal);
++ break;
++ }
++ }
++
++ if (!goal)
++ goto do_instantiate;
++
++ /* Move the goal dentry to the hash queue */
++ goal->d_flags &= ~ DCACHE_DISCONNECTED;
++ security_d_instantiate(goal, inode);
++ __d_rehash(dentry);
++ __d_move(goal, dentry);
++ spin_unlock(&dcache_lock);
++ iput(inode);
++
++ return goal;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++do_rehash:
++ if (rehash)
++ __d_rehash(dentry); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++ lookup: iopen_lookup, /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++ read: generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++ int len;
++
++ len = strlen(name);
++ if (dentry->d_name.len != len)
++ return 0;
++ if (strncmp(dentry->d_name.name, name, len))
++ return 0;
++ return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and the dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++ struct inode *inode;
++
++ if (dir->i_ino != EXT3_ROOT_INO ||
++ !test_opt(dir->i_sb, IOPEN) ||
++ !match_dentry(dentry, "__iopen__"))
++ return 0;
++
++ inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++ if (!inode)
++ return 0;
++ d_add(dentry, inode);
++ return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately. Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++ if (inode->i_ino != EXT3_BAD_INO)
++ return 0;
++
++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++ inode->i_mode |= 0777;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 1;
++ inode->i_size = 4096;
++ inode->i_atime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = CURRENT_TIME;
++ EXT3_I(inode)->i_dtime = 0;
++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
++ * (for stat), not the fs block
++ * size */
++ inode->i_blocks = 0;
++ inode->i_version = 1;
++ inode->i_generation = 0;
++
++ inode->i_op = &iopen_inode_operations;
++ inode->i_fop = &iopen_file_operations;
++ inode->i_mapping->a_ops = 0;
++
++ return 1;
++}
+Index: linux-2.6.10/fs/ext3/iopen.h
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/iopen.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/ext3/iopen.h 2005-04-05 12:25:13.792112248 +0800
+@@ -0,0 +1,15 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++ struct inode *inode, int rehash);
+Index: linux-2.6.10/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/Makefile 2004-12-25 05:33:52.000000000 +0800
++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o hash.o resize.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.10/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/namei.c 2005-04-05 12:25:13.633136416 +0800
++++ linux-2.6.10/fs/ext3/namei.c 2005-04-05 12:25:13.799111184 +0800
+@@ -37,6 +37,7 @@
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+
+ /*
+@@ -1140,6 +1141,9 @@
+ if (dentry->d_name.len > EXT3_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
++ if (ext3_check_for_iopen(dir, dentry))
++ return NULL;
++
+ bh = ext3_find_entry(dentry, &de, 0, &lock);
+ inode = NULL;
+ if (bh) {
+@@ -1151,10 +1155,8 @@
+ if (!inode)
+ return ERR_PTR(-EACCES);
+ }
+- if (inode)
+- return d_splice_alias(inode, dentry);
+- d_add(dentry, inode);
+- return NULL;
++
++ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
+
+@@ -2367,10 +2369,6 @@
+ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+- /* There's no need to set i_disksize: the fact that i_nlink is
+- * zero will ensure that the right thing happens during any
+- * recovery. */
+- inode->i_size = 0;
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+@@ -2497,6 +2495,23 @@
+ return err;
+ }
+
++/* Like ext3_add_nondir() except for the call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ int err = ext3_add_entry(handle, dentry, inode);
++ if (!err) {
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ dput(iopen_connect_dentry(dentry, inode, 0));
++ return 0;
++ }
++ }
++ ext3_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
++
+ static int ext3_link (struct dentry * old_dentry,
+ struct inode * dir, struct dentry *dentry)
+ {
+@@ -2520,7 +2535,8 @@
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+- err = ext3_add_nondir(handle, dentry, inode);
++ err = ext3_add_link(handle, dentry, inode);
++	ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
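+
+As a usage note: with the IOPEN mount option enabled (and IOPEN_NOPRIV
+to relax the permission bits, per ext3_iopen_get_inode() above), a file
+can then be opened by its decimal inode number through the pseudo
+directory. A hypothetical userspace sketch, assuming inode 1234 exists
+on an ext3 filesystem mounted at /mnt:
+
+    /* names under __iopen__ are parsed as decimal inode numbers */
+    int fd = open("/mnt/__iopen__/1234", O_RDONLY);
+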
--- /dev/null
+--- 1.46/include/linux/jbd.h 2004-10-19 03:40:17 -06:00
++++ 1.47/include/linux/jbd.h 2004-11-07 19:13:24 -07:00
+@@ -352,6 +352,27 @@
+ bit_spin_unlock(BH_JournalHead, &bh->b_state);
+ }
+
++#define HAVE_JOURNAL_CALLBACK_STATUS
++/**
++ * struct journal_callback - Base structure for callback information.
++ * @jcb_list: list information for other callbacks attached to the same handle.
++ * @jcb_func: Function to call with this callback structure.
++ *
++ * This struct is a 'seed' structure for a using with your own callback
++ * structs. If you are using callbacks you must allocate one of these
++ * This struct is a 'seed' structure for use with your own callback
++ * structs. If you are using callbacks you must allocate one of these
++ * or another struct of your own definition which has this struct
++ * as its first element and pass it to journal_callback_set().
++ *
++ * See journal_callback_set for more information.
++ **/
++struct journal_callback {
++ struct list_head jcb_list; /* t_jcb_lock */
++ void (*jcb_func)(struct journal_callback *jcb, int error);
++ /* user data goes here */
++};
++
+ struct jbd_revoke_table_s;
+
+ /**
+@@ -360,6 +381,7 @@
+ * @h_transaction: Which compound transaction is this update a part of?
+ * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+ * @h_ref: Reference count on this handle
++ * @h_jcb: List of application registered callbacks for this handle.
+ * @h_err: Field for caller's use to track errors through large fs operations
+ * @h_sync: flag for sync-on-close
+ * @h_jdata: flag to force data journaling
+@@ -385,6 +407,13 @@
+ /* operations */
+ int h_err;
+
++ /*
++ * List of application registered callbacks for this handle. The
++ * function(s) will be called after the transaction that this handle is
++ * part of has been committed to disk. [t_jcb_lock]
++ */
++ struct list_head h_jcb;
++
+ /* Flags [no locking] */
+ unsigned int h_sync: 1; /* sync-on-close */
+ unsigned int h_jdata: 1; /* force data journaling */
+@@ -426,6 +455,8 @@
+ * j_state_lock
+ * ->j_list_lock (journal_unmap_buffer)
+ *
++ * t_handle_lock
++ * ->t_jcb_lock
+ */
+
+ struct transaction_s
+@@ -549,6 +580,15 @@
+ */
+ int t_handle_count;
+
++ /*
++ * Protects the callback list
++ */
++ spinlock_t t_jcb_lock;
++ /*
++ * List of registered callback functions for this transaction.
++ * Called when the transaction is committed. [t_jcb_lock]
++ */
++ struct list_head t_jcb;
+ };
+
+ /**
+@@ -881,6 +921,10 @@
+ extern int journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int journal_stop(handle_t *);
+ extern int journal_flush (journal_t *);
++extern void journal_callback_set(handle_t *handle,
++ void (*fn)(struct journal_callback *,int),
++ struct journal_callback *jcb);
++
+ extern void journal_lock_updates (journal_t *);
+ extern void journal_unlock_updates (journal_t *);
+
+--- 1.23/fs/jbd/checkpoint.c 2003-07-10 23:23:54 -06:00
++++ 1.24/fs/jbd/checkpoint.c 2004-11-07 19:13:24 -07:00
+@@ -616,6 +616,7 @@
+ J_ASSERT(transaction->t_log_list == NULL);
+ J_ASSERT(transaction->t_checkpoint_list == NULL);
+ J_ASSERT(transaction->t_updates == 0);
++ J_ASSERT(list_empty(&transaction->t_jcb));
+ J_ASSERT(journal->j_committing_transaction != transaction);
+ J_ASSERT(journal->j_running_transaction != transaction);
+
+
+--- 1.53/fs/jbd/commit.c 2004-10-19 03:40:17 -06:00
++++ 1.54/fs/jbd/commit.c 2004-11-07 19:13:24 -07:00
+@@ -686,6 +686,30 @@
+ if (err)
+ __journal_abort_hard(journal);
+
++ /*
++ * Call any callbacks that had been registered for handles in this
++ * transaction. It is up to the callback to free any allocated
++ * memory.
++ *
++ * The spinlocking (t_jcb_lock) here is surely unnecessary...
++ */
++ spin_lock(&commit_transaction->t_jcb_lock);
++ if (!list_empty(&commit_transaction->t_jcb)) {
++ struct list_head *p, *n;
++ int error = is_journal_aborted(journal);
++
++ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
++ struct journal_callback *jcb;
++
++ jcb = list_entry(p, struct journal_callback, jcb_list);
++ list_del(p);
++ spin_unlock(&commit_transaction->t_jcb_lock);
++ jcb->jcb_func(jcb, error);
++ spin_lock(&commit_transaction->t_jcb_lock);
++ }
++ }
++ spin_unlock(&commit_transaction->t_jcb_lock);
++
+ jbd_debug(3, "JBD: commit phase 7\n");
+
+ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+
+--- 1.77/fs/jbd/journal.c 2004-09-21 20:58:08 -06:00
++++ 1.78/fs/jbd/journal.c 2004-11-07 19:13:24 -07:00
+@@ -55,6 +55,7 @@
+ #endif
+ EXPORT_SYMBOL(journal_flush);
+ EXPORT_SYMBOL(journal_revoke);
++EXPORT_SYMBOL(journal_callback_set);
+
+ EXPORT_SYMBOL(journal_init_dev);
+ EXPORT_SYMBOL(journal_init_inode);
+@@ -78,6 +79,7 @@
+ EXPORT_SYMBOL(journal_blocks_per_page);
+ EXPORT_SYMBOL(journal_invalidatepage);
+ EXPORT_SYMBOL(journal_try_to_free_buffers);
++EXPORT_SYMBOL(journal_bmap);
+ EXPORT_SYMBOL(journal_force_commit);
+
+ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+
+--- 1.89/fs/jbd/transaction.c 2004-10-19 03:40:17 -06:00
++++ 1.90/fs/jbd/transaction.c 2004-11-07 19:13:24 -07:00
+@@ -50,7 +50,9 @@
+ transaction->t_state = T_RUNNING;
+ transaction->t_tid = journal->j_transaction_sequence++;
+ transaction->t_expires = jiffies + journal->j_commit_interval;
++ INIT_LIST_HEAD(&transaction->t_jcb);
+ spin_lock_init(&transaction->t_handle_lock);
++ spin_lock_init(&transaction->t_jcb_lock);
+
+ /* Set up the commit timer for the new transaction. */
+ journal->j_commit_timer->expires = transaction->t_expires;
+@@ -241,6 +243,7 @@
+ memset(handle, 0, sizeof(*handle));
+ handle->h_buffer_credits = nblocks;
+ handle->h_ref = 1;
++ INIT_LIST_HEAD(&handle->h_jcb);
+
+ return handle;
+ }
+@@ -1274,6 +1277,36 @@
+ }
+
+ /**
++ * void journal_callback_set() - Register a callback function for this handle.
++ * @handle: handle to attach the callback to.
++ * @func: function to callback.
++ * @jcb: structure with additional information required by func(), and
++ * some space for jbd internal information.
++ *
++ * The function will be
++ * called when the transaction that this handle is part of has been
++ * committed to disk with the original callback data struct and the
++ * error status of the journal as parameters. There is no guarantee of
++ * ordering between handles within a single transaction, nor between
++ * callbacks registered on the same handle.
++ *
++ * The caller is responsible for allocating the journal_callback struct.
++ * This is to allow the caller to add as much extra data to the callback
++ * as needed, while reducing the overhead of multiple allocations. The
++ * caller-allocated struct must start with a struct journal_callback at
++ * offset 0, with the caller-specific data afterwards.
++ */
++void journal_callback_set(handle_t *handle,
++ void (*func)(struct journal_callback *jcb, int error),
++ struct journal_callback *jcb)
++{
++ spin_lock(&handle->h_transaction->t_jcb_lock);
++ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
++ spin_unlock(&handle->h_transaction->t_jcb_lock);
++ jcb->jcb_func = func;
++}
++
++/**
+ * int journal_stop() - complete a transaction
+ * @handle: tranaction to complete.
+ *
+@@ -1338,6 +1371,11 @@
+ if (journal->j_barrier_count)
+ wake_up(&journal->j_wait_transaction_locked);
+ }
++
++ /* Move callbacks from the handle to the transaction. */
++ spin_lock(&transaction->t_jcb_lock);
++ list_splice(&handle->h_jcb, &transaction->t_jcb);
++ spin_unlock(&transaction->t_jcb_lock);
+
+ /*
+ * If the handle is marked SYNC, we need to set another commit
+
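+For reference, a minimal sketch of how a caller might use this API;
+everything here except journal_callback_set() and struct
+journal_callback is a made-up name for illustration:
+
+    /* caller-defined record; struct journal_callback must be first */
+    struct my_commit_cb {
+            struct journal_callback jcb;
+            struct inode *inode;            /* caller-specific data */
+    };
+
+    static void my_commit_done(struct journal_callback *jcb, int error)
+    {
+            struct my_commit_cb *cb = (struct my_commit_cb *)jcb;
+
+            /* the transaction is now on disk; error is the journal
+             * abort status; the callback owns the allocation */
+            kfree(cb);
+    }
+
+    /* with a running handle: */
+    struct my_commit_cb *cb = kmalloc(sizeof(*cb), GFP_NOFS);
+
+    if (cb) {
+            cb->inode = inode;
+            journal_callback_set(handle, my_commit_done, &cb->jcb);
+    }
+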
--- /dev/null
+Fix a credits leak in journal_release_buffer().
+
+The idea is to charge a buffer at the time of modification (journal_dirty_metadata()),
+not at the time of access (journal_get_*_access()). Each buffer has a flag (b_modified)
+that the first call to journal_dirty_metadata() sets on the buffer.
+
+Signed-off-by: Alex Tomas <alex@clusterfs.com>
+
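+In other words, callers no longer thread a credits count through the
+access functions; a get/release pair is now free, and the handle is
+charged once, when the buffer is first dirtied in a transaction. A
+minimal sketch of the resulting calling convention (error handling
+elided; handle and bh are assumed to be set up by the caller):
+
+    err = journal_get_write_access(handle, bh); /* nothing charged yet */
+    if (decided_not_to_modify_it)
+            journal_release_buffer(handle, bh); /* nothing to give back */
+    else
+            err = journal_dirty_metadata(handle, bh); /* one credit,
+                                                       * once per
+                                                       * transaction */
+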
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/fs/ext3/ialloc.c 2005-03-31 18:11:10.672236448 +0800
+@@ -474,11 +474,9 @@
+ ino = ext3_find_next_zero_bit((unsigned long *)
+ bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
+ if (ino < EXT3_INODES_PER_GROUP(sb)) {
+- int credits = 0;
+
+ BUFFER_TRACE(bitmap_bh, "get_write_access");
+- err = ext3_journal_get_write_access_credits(handle,
+- bitmap_bh, &credits);
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto fail;
+
+@@ -494,7 +492,7 @@
+ goto got;
+ }
+ /* we lost it */
+- journal_release_buffer(handle, bitmap_bh, credits);
++ journal_release_buffer(handle, bitmap_bh);
+
+ if (++ino < EXT3_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
+Index: linux-2.6.10/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/ext3/xattr.c 2005-03-31 18:11:10.675235992 +0800
+@@ -507,8 +507,7 @@
+ goto skip_get_write_access;
+ /* ext3_journal_get_write_access() requires an unlocked bh,
+ which complicates things here. */
+- error = ext3_journal_get_write_access_credits(handle, bh,
+- &credits);
++ error = ext3_journal_get_write_access(handle, bh);
+ if (error)
+ goto cleanup;
+ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
+@@ -525,7 +524,7 @@
+ if (ce)
+ mb_cache_entry_release(ce);
+ unlock_buffer(bh);
+- journal_release_buffer(handle, bh, credits);
++ journal_release_buffer(handle, bh);
+ skip_get_write_access:
+ ea_bdebug(bh, "cloning");
+ header = kmalloc(bh->b_size, GFP_KERNEL);
+@@ -669,8 +668,7 @@
+ error = -EDQUOT;
+ if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+ unlock_buffer(new_bh);
+- journal_release_buffer(handle, new_bh,
+- credits);
++ journal_release_buffer(handle, new_bh);
+ goto cleanup;
+ }
+ HDR(new_bh)->h_refcount = cpu_to_le32(1 +
+@@ -986,8 +984,7 @@
+ ext3_error(inode->i_sb, "ext3_xattr_cache_find",
+ "inode %ld: block %ld read error",
+ inode->i_ino, (unsigned long) ce->e_block);
+- } else if (ext3_journal_get_write_access_credits(
+- handle, bh, credits) == 0) {
++ } else if (ext3_journal_get_write_access(handle, bh) == 0) {
+ /* ext3_journal_get_write_access() requires an unlocked
+ * bh, which complicates things here. */
+ lock_buffer(bh);
+@@ -1003,7 +1000,7 @@
+ return bh;
+ }
+ unlock_buffer(bh);
+- journal_release_buffer(handle, bh, *credits);
++ journal_release_buffer(handle, bh);
+ *credits = 0;
+ brelse(bh);
+ }
+Index: linux-2.6.10/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/balloc.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/fs/ext3/balloc.c 2005-03-31 18:14:05.705627328 +0800
+@@ -342,7 +342,7 @@
+ */
+ /* @@@ check errors */
+ BUFFER_TRACE(bitmap_bh, "getting undo access");
+- err = ext3_journal_get_undo_access(handle, bitmap_bh, NULL);
++ err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+@@ -986,7 +986,6 @@
+ unsigned long group_first_block;
+ int ret = 0;
+ int fatal;
+- int credits = 0;
+
+ *errp = 0;
+
+@@ -996,7 +995,7 @@
+ * if the buffer is in BJ_Forget state in the committing transaction.
+ */
+ BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+- fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
++ fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ return -1;
+@@ -1087,7 +1086,7 @@
+ }
+
+ BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+- ext3_journal_release_buffer(handle, bitmap_bh, credits);
++ ext3_journal_release_buffer(handle, bitmap_bh);
+ return ret;
+ }
+
+Index: linux-2.6.10/fs/jbd/commit.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/commit.c 2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/fs/jbd/commit.c 2005-03-31 18:11:10.668237056 +0800
+@@ -204,6 +204,19 @@
+ }
+
+ /*
++ * First, drop the modified flag: all accesses to the buffers
++ * will be tracked for a new transaction only -bzzz
++ */
++ if (commit_transaction->t_buffers) {
++ new_jh = jh = commit_transaction->t_buffers->b_tnext;
++ do {
++ J_ASSERT_JH(new_jh, new_jh->b_modified == 1);
++ new_jh->b_modified = 0;
++ new_jh = new_jh->b_tnext;
++ } while (new_jh != jh);
++ }
++
++ /*
+ * Now try to drop any written-back buffers from the journal's
+ * checkpoint lists. We do this *before* commit because it potentially
+ * frees some memory
+Index: linux-2.6.10/fs/jbd/transaction.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/transaction.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/jbd/transaction.c 2005-03-31 18:11:10.666237360 +0800
+@@ -522,7 +522,7 @@
+ */
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh,
+- int force_copy, int *credits)
++ int force_copy)
+ {
+ struct buffer_head *bh;
+ transaction_t *transaction;
+@@ -604,11 +604,6 @@
+ JBUFFER_TRACE(jh, "has frozen data");
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ jh->b_next_transaction = transaction;
+-
+- J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+- handle->h_buffer_credits--;
+- if (credits)
+- (*credits)++;
+ goto done;
+ }
+
+@@ -688,10 +683,6 @@
+ jh->b_next_transaction = transaction;
+ }
+
+- J_ASSERT(handle->h_buffer_credits > 0);
+- handle->h_buffer_credits--;
+- if (credits)
+- (*credits)++;
+
+ /*
+ * Finally, if the buffer is not journaled right now, we need to make
+@@ -749,8 +740,7 @@
+ * because we're write()ing a buffer which is also part of a shared mapping.
+ */
+
+-int journal_get_write_access(handle_t *handle,
+- struct buffer_head *bh, int *credits)
++int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+ {
+ struct journal_head *jh = journal_add_journal_head(bh);
+ int rc;
+@@ -758,7 +748,7 @@
+ /* We do not want to get caught playing with fields which the
+ * log thread also manipulates. Make sure that the buffer
+ * completes any outstanding IO before proceeding. */
+- rc = do_get_write_access(handle, jh, 0, credits);
++ rc = do_get_write_access(handle, jh, 0);
+ journal_put_journal_head(jh);
+ return rc;
+ }
+@@ -814,9 +804,6 @@
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+- J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+- handle->h_buffer_credits--;
+-
+ if (jh->b_transaction == NULL) {
+ jh->b_transaction = transaction;
+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
+@@ -869,8 +856,7 @@
+ *
+ * Returns error number or 0 on success.
+ */
+-int journal_get_undo_access(handle_t *handle, struct buffer_head *bh,
+- int *credits)
++int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
+ {
+ int err;
+ struct journal_head *jh = journal_add_journal_head(bh);
+@@ -883,7 +869,7 @@
+ * make sure that obtaining the committed_data is done
+ * atomically wrt. completion of any outstanding commits.
+ */
+- err = do_get_write_access(handle, jh, 1, credits);
++ err = do_get_write_access(handle, jh, 1);
+ if (err)
+ goto out;
+
+@@ -1111,6 +1097,17 @@
+
+ jbd_lock_bh_state(bh);
+
++ if (jh->b_modified == 0) {
++ /*
++ * This buffer has been modified and is becoming part
++ * of the transaction. This needs to be done
++ * once per transaction -bzzz
++ */
++ jh->b_modified = 1;
++ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
++ handle->h_buffer_credits--;
++ }
++
+ /*
+ * fastpath, to avoid expensive locking. If this buffer is already
+ * on the running transaction's metadata list there is nothing to do.
+@@ -1161,24 +1158,11 @@
+ * journal_release_buffer: undo a get_write_access without any buffer
+ * updates, if the update decided in the end that it didn't need access.
+ *
+- * The caller passes in the number of credits which should be put back for
+- * this buffer (zero or one).
+- *
+- * We leave the buffer attached to t_reserved_list because even though this
+- * handle doesn't want it, some other concurrent handle may want to journal
+- * this buffer. If that handle is curently in between get_write_access() and
+- * journal_dirty_metadata() then it expects the buffer to be reserved. If
+- * we were to rip it off t_reserved_list here, the other handle will explode
+- * when journal_dirty_metadata is presented with a non-reserved buffer.
+- *
+- * If nobody really wants to journal this buffer then it will be thrown
+- * away at the start of commit.
+ */
+ void
+-journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits)
++journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+ {
+ BUFFER_TRACE(bh, "entry");
+- handle->h_buffer_credits += credits;
+ }
+
+ /**
+@@ -1222,6 +1206,12 @@
+ goto not_jbd;
+ }
+
++ /*
++ * The buffer is going away from the transaction; we must
++ * drop all references -bzzz
++ */
++ jh->b_modified = 0;
++
+ if (jh->b_transaction == handle->h_transaction) {
+ J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+@@ -2015,7 +2005,10 @@
+ __journal_unfile_buffer(jh);
+ jh->b_transaction = jh->b_next_transaction;
+ jh->b_next_transaction = NULL;
+- __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
++ if (jh->b_modified == 1)
++ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
++ else
++ __journal_file_buffer(jh, jh->b_transaction, BJ_Reserved);
+ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+ if (was_dirty)
+Index: linux-2.6.10/include/linux/journal-head.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/journal-head.h 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/include/linux/journal-head.h 2005-03-31 18:11:10.658238576 +0800
+@@ -32,6 +32,13 @@
+ unsigned b_jlist;
+
+ /*
++ * This flag signals the buffer has been modified by
++ * the currently running transaction
++ * [jbd_lock_bh_state()]
++ */
++ unsigned b_modified;
++
++ /*
+ * Copy of the buffer data frozen for writing to the log.
+ * [jbd_lock_bh_state()]
+ */
+Index: linux-2.6.10/include/linux/jbd.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/jbd.h 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/include/linux/jbd.h 2005-03-31 18:12:52.504755552 +0800
+@@ -867,15 +867,12 @@
+ extern handle_t *journal_start(journal_t *, int nblocks);
+ extern int journal_restart (handle_t *, int nblocks);
+ extern int journal_extend (handle_t *, int nblocks);
+-extern int journal_get_write_access(handle_t *, struct buffer_head *,
+- int *credits);
++extern int journal_get_write_access(handle_t *, struct buffer_head *);
+ extern int journal_get_create_access (handle_t *, struct buffer_head *);
+-extern int journal_get_undo_access(handle_t *, struct buffer_head *,
+- int *credits);
++extern int journal_get_undo_access(handle_t *, struct buffer_head *);
+ extern int journal_dirty_data (handle_t *, struct buffer_head *);
+ extern int journal_dirty_metadata (handle_t *, struct buffer_head *);
+-extern void journal_release_buffer (handle_t *, struct buffer_head *,
+- int credits);
++extern void journal_release_buffer (handle_t *, struct buffer_head *);
+ extern int journal_forget (handle_t *, struct buffer_head *);
+ extern void journal_sync_buffer (struct buffer_head *);
+ extern int journal_invalidatepage(journal_t *,
+Index: linux-2.6.10/include/linux/ext3_jbd.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ext3_jbd.h 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/include/linux/ext3_jbd.h 2005-03-31 18:11:10.660238272 +0800
+@@ -113,9 +113,9 @@
+
+ static inline int
+ __ext3_journal_get_undo_access(const char *where, handle_t *handle,
+- struct buffer_head *bh, int *credits)
++ struct buffer_head *bh)
+ {
+- int err = journal_get_undo_access(handle, bh, credits);
++ int err = journal_get_undo_access(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ return err;
+@@ -123,19 +123,18 @@
+
+ static inline int
+ __ext3_journal_get_write_access(const char *where, handle_t *handle,
+- struct buffer_head *bh, int *credits)
++ struct buffer_head *bh)
+ {
+- int err = journal_get_write_access(handle, bh, credits);
++ int err = journal_get_write_access(handle, bh);
+ if (err)
+ ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ return err;
+ }
+
+ static inline void
+-ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh,
+- int credits)
++ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+ {
+- journal_release_buffer(handle, bh, credits);
++ journal_release_buffer(handle, bh);
+ }
+
+ static inline int
+@@ -178,12 +177,10 @@
+ }
+
+
+-#define ext3_journal_get_undo_access(handle, bh, credits) \
+- __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh), (credits))
++#define ext3_journal_get_undo_access(handle, bh) \
++ __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+- __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), NULL)
+-#define ext3_journal_get_write_access_credits(handle, bh, credits) \
+- __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), (credits))
++ __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+ __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
--- /dev/null
+
+
+This kgdb will get called and will trap almost any kernel
+fault WITHOUT BEING ARMED.
+
+It is entered at boot time via "kgdb" in the boot string,
+not "gdb". This entry occurs when the first setup on the
+boot string is called, not sometime later. You will not
+find a "waiting for gdb" on your console, as the console has
+not yet been enabled at this time. (Note, this early stuff
+is a bit fragile as the full trap table has yet to be
+loaded, something I might address, sometime... So don't try
+to look at memory that cannot be reached, for example.
+Once the full trap table is loaded this restriction goes
+away.)
+
+If you hard code it, you can put a breakpoint() as the FIRST
+LINE OF C CODE.
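+
+A sketch of what that looks like; the function below is purely
+illustrative, and breakpoint() is the entry point declared in the
+patch's <asm/kgdb.h>:
+
+    void __init my_early_init(void)
+    {
+            breakpoint();   /* drop into kgdb before going any further */
+            /* ... rest of early setup ... */
+    }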
+
+It does NOT use the serial driver, but if the serial driver
+is loaded, it tells it to release the port to avoid
+conflict.
+
+The threads stuff is not configurable, does not require
+redirection of schedule() calls, and backtracks to the
+first non-schedule() caller for the "info threads" command.
+If you switch to the thread, however, it will show it in
+the switch code (as it should).
+
+It is MUCH more aggressive and paranoid about grabbing the
+other cpus on entry. It issues a "send_nmi_all_but_self()"
+rather than depending on them to interrupt or hit an NMI
+sometime in the distant future. If a cpu does not come to
+the party, it will continue without it so all is not lost.
+
+It does not have anything to do with IOCTL calls, but does
+do the control-C thing.
+
+There is a LOT of info in the patch which ends up in
+.../Documentation/i386/kgdb/*
+
+There is a nifty little thing called kgdb_ts() (kgdb time
+stamp): a function you can code calls to, which puts some
+useful stuff in a circular buffer that can be examined with
+the supplied gdb macros.
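+
+A sketch of instrumenting a suspect path with it; the two values (a
+pointer and a retry count here) are arbitrary examples, since the macro
+casts both arguments to int:
+
+    /* records __FILE__/__LINE__ plus both values in the kgdb
+     * circular buffer, for later inspection with the gdb macros */
+    kgdb_ts(skb, retries);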
+
+It also allows you to do "p foobar(...)", i.e. to call a
+function from gdb, just like gdb allows in program
+debugging.
+
+In an SMP system, you can choose to "hold" any given set of
+cpus. It also defaults to holding other cpus on single step
+(this can be overridden).
+
+This said, you can imagine my consternation when I found it
+"lost it" on continues on 2.5. I found and fixed this early
+this pm; it was a hold-cpu-on-exit goof on my part.
+
+Oh, and a final point: the configure options are more
+extensive (the serial port is set up here, for example; we
+cannot wait for a command line to do this). There is one to
+do system call exit tests. This is VERY new and causes the
+kernel to hit a hard "int 3" if a system call attempts to
+exit with preempt count other than zero. This is a fault,
+of course, but the current 2.5 is full of them so I don't
+recommend turning this on.
+
+
+DESC
+kgdbL warning fix
+EDESC
+From: Ingo Molnar <mingo@elte.hu>
+
+this patch fixes a deprecated use of asm input operands. (and shuts up a
+gcc 3.3 warning.)
+
+DESC
+kgdb buffer overflow fix
+EDESC
+From: George Anzinger <george@mvista.com>
+
+
+DESC
+kgdbL warning fix
+EDESC
+From: Ingo Molnar <mingo@elte.hu>
+
+this patch fixes a deprecated use of asm input operands. (and shuts up a
+gcc 3.3 warning.)
+
+DESC
+kgdb: CONFIG_DEBUG_INFO fix
+EDESC
+From: Thomas Schlichter <schlicht@uni-mannheim.de>
+
+That patch sets DEBUG_INFO to y by default, even if neither DEBUG_KERNEL nor
+KGDB is enabled. The attached patch changes this to enable DEBUG_INFO by
+default only if KGDB is enabled.
+
+DESC
+x86_64 fixes
+EDESC
+From: Andi Kleen
+
+Fix x86_64 for kgdb. We forget why.
+DESC
+correct kgdb.txt Documentation link (against 2.6.1-rc1-mm2)
+EDESC
+From: Jesper Juhl <juhl-lkml@dif.dk>
+
+The help text for "config KGDB" in arch/i386/Kconfig refers to
+Documentation/i386/kgdb.txt - the actual location is
+Documentation/i386/kgdb/kgdb.txt - patch below to fix that.
+
+DESC
+kgdb: fix for recent gcc
+EDESC
+
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+
+DESC
+kgdb warning fixes
+EDESC
+
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'time' might be used uninitialized in this function
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'dum' might be used uninitialized in this function
+DESC
+THREAD_SIZE fixes for kgdb
+EDESC
+From: Matt Mackall <mpm@selenic.com>
+
+Noticed the THREAD_SIZE clean-ups are in -mm now. Here are the missing
+bits for kgdb, tested in -tiny with 4k stacks.
+DESC
+Fix stack overflow test for non-8k stacks
+EDESC
+From: Matt Mackall <mpm@selenic.com>
+
+This is needed to work properly with 4k and 16k stacks.
+DESC
+kgdb-ga.patch fix for i386 single-step into sysenter
+EDESC
+From: Roland McGrath <roland@redhat.com>
+
+Using kgdb-ga.patch from -mm, if userland single-steps (PTRACE_SINGLESTEP)
+into the `sysenter' instruction, kgdb reports a bogus trap:
+
+ Program received signal SIGTRAP, Trace/breakpoint trap.
+ sysenter_past_esp () at arch/i386/kernel/entry.S:249
+ 1: x/i $pc 0xc0106023 <sysenter_past_esp>: sti
+ (gdb)
+
+The hackery in the "FIX_STACK" macro in entry.S changes the saved PC for
+the spurious kernel-mode debug trap when TF was set on user-mode execution
+of `sysenter', so sysenter_past_esp is where it actually lies in this case.
+ The following patch removes the kgdb hiccup when userland
+PTRACE_SINGLESTEP's into sysenter.
+DESC
+fix TRAP_BAD_SYSCALL_EXITS on i386
+EDESC
+From: Andy Whitcroft <apw@shadowen.org>
+
+We are not using the right offset name, nor the right address when checking
+for a non-zero preempt count. Move to TI_preempt_count(%ebp).
+
+Signed-off-by: Andy Whitcroft <apw@shadowen.org>
+DESC
+add TRAP_BAD_SYSCALL_EXITS config for i386
+EDESC
+From: Andy Whitcroft <apw@shadowen.org>
+
+There seems to be code recently added to -bk and thereby -mm which supports
+extra debugging for preempt on system call exit. Oddly, there doesn't seem
+to be a configuration option to enable it. Below is a possible patch to
+allow enabling this on i386. Sadly the most obvious menu to add this to is
+the Kernel Hacking menu, but that is defined in architecture specific
+configuration. If this makes sense I could patch the other arches?
+
+Add a configuration option to allow enabling TRAP_BAD_SYSCALL_EXITS to the
+Kernel Hacking menu.
+
+Signed-off-by: Andy Whitcroft <apw@shadowen.org>
+DESC
+kgdb-is-incompatible-with-kprobes
+EDESC
+DESC
+kgdb-ga-build-fix
+EDESC
+DESC
+kgdb-ga-fixes
+EDESC
+Signed-off-by: Andrew Morton <akpm@osdl.org>
+Index: linux-2.6.10/include/asm-i386/kgdb_local.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kgdb_local.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/kgdb_local.h 2005-04-05 12:48:05.371600472 +0800
+@@ -0,0 +1,102 @@
++#ifndef __KGDB_LOCAL
++#define __KGDB_LOCAL
++#include <linux/config.h>
++#include <linux/types.h>
++#include <linux/serial.h>
++#include <linux/serialP.h>
++#include <linux/spinlock.h>
++#include <asm/processor.h>
++#include <asm/msr.h>
++#include <asm/kgdb.h>
++
++#define PORT 0x3f8
++#ifdef CONFIG_KGDB_PORT
++#undef PORT
++#define PORT CONFIG_KGDB_PORT
++#endif
++#define IRQ 4
++#ifdef CONFIG_KGDB_IRQ
++#undef IRQ
++#define IRQ CONFIG_KGDB_IRQ
++#endif
++#define SB_CLOCK 1843200
++#define SB_BASE (SB_CLOCK/16)
++#define SB_BAUD9600 SB_BASE/9600
++#define SB_BAUD192 SB_BASE/19200
++#define SB_BAUD384 SB_BASE/38400
++#define SB_BAUD576 SB_BASE/57600
++#define SB_BAUD1152 SB_BASE/115200
++#ifdef CONFIG_KGDB_9600BAUD
++#define SB_BAUD SB_BAUD9600
++#endif
++#ifdef CONFIG_KGDB_19200BAUD
++#define SB_BAUD SB_BAUD192
++#endif
++#ifdef CONFIG_KGDB_38400BAUD
++#define SB_BAUD SB_BAUD384
++#endif
++#ifdef CONFIG_KGDB_57600BAUD
++#define SB_BAUD SB_BAUD576
++#endif
++#ifdef CONFIG_KGDB_115200BAUD
++#define SB_BAUD SB_BAUD1152
++#endif
++#ifndef SB_BAUD
++#define SB_BAUD SB_BAUD1152 /* Start with this if not given */
++#endif
++
++#ifndef CONFIG_X86_TSC
++#undef rdtsc
++#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;}
++#undef rdtscll
++#define rdtscll(s) s++
++#endif
++
++#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */
++#undef spin_lock
++#undef spin_trylock
++#undef spin_unlock
++#define spin_lock _raw_spin_lock
++#define spin_trylock _raw_spin_trylock
++#define spin_unlock _raw_spin_unlock
++#else
++#endif
++#undef spin_unlock_wait
++#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \
++ while(spin_is_locked(x))
++
++#define SB_IER 1
++#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS
++
++#define FLAGS 0
++#define SB_STATE { \
++ magic: SSTATE_MAGIC, \
++ baud_base: SB_BASE, \
++ port: PORT, \
++ irq: IRQ, \
++ flags: FLAGS, \
++ custom_divisor:SB_BAUD}
++#define SB_INFO { \
++ magic: SERIAL_MAGIC, \
++ port: PORT,0,FLAGS, \
++ state: &state, \
++ tty: (struct tty_struct *)&state, \
++ IER: SB_IER, \
++ MCR: SB_MCR}
++extern void putDebugChar(int);
++/* RTAI support needs us to really stop/start interrupts */
++
++#define kgdb_sti() __asm__ __volatile__("sti": : :"memory")
++#define kgdb_cli() __asm__ __volatile__("cli": : :"memory")
++#define kgdb_local_save_flags(x) __asm__ __volatile__(\
++ "pushfl ; popl %0":"=g" (x): /* no input */)
++#define kgdb_local_irq_restore(x) __asm__ __volatile__(\
++ "pushl %0 ; popfl": \
++ /* no output */ :"g" (x):"memory", "cc")
++#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli()
++
++#ifdef CONFIG_SERIAL
++extern void shutdown_for_kgdb(struct async_struct *info);
++#endif
++#define INIT_KDEBUG putDebugChar('+');
++#endif /* __KGDB_LOCAL */
+Index: linux-2.6.10/include/asm-i386/kgdb.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kgdb.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/kgdb.h 2005-04-05 12:48:05.399596216 +0800
+@@ -0,0 +1,59 @@
++#ifndef __KGDB
++#define __KGDB
++
++/*
++ * This file should not include ANY others. This makes it usable
++ * most anywhere without the fear of include order or inclusion.
++ * Make it so!
++ *
++ * This file may be included all the time. It is only active if
++ * CONFIG_KGDB is defined, otherwise it stubs out all the macros
++ * and entry points.
++ */
++#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__)
++
++extern void breakpoint(void);
++#define INIT_KGDB_INTS kgdb_enable_ints()
++
++#ifndef BREAKPOINT
++#define BREAKPOINT asm(" int $3")
++#endif
++/*
++ * GDB debug stub (or any debug stub) can point the 'linux_debug_hook'
++ * pointer to its routine and it will be entered as the first thing
++ * when a trap occurs.
++ *
++ * Return values are, at present, undefined.
++ *
++ * The debug hook routine does not necessarily return to its caller.
++ * It has the register image and thus may choose to resume execution
++ * anywhere it pleases.
++ */
++struct pt_regs;
++
++extern int kgdb_handle_exception(int trapno,
++ int signo, int err_code, struct pt_regs *regs);
++extern int in_kgdb(struct pt_regs *regs);
++
++#ifdef CONFIG_KGDB_TS
++void kgdb_tstamp(int line, char *source, int data0, int data1);
++/*
++ * This is the time stamp function. The macro adds the source info and
++ * does a cast on the data to allow most any 32-bit value.
++ */
++
++#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1)
++#else
++#define kgdb_ts(data0,data1)
++#endif
++#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */
++#ifndef BREAKPOINT
++#define BREAKPOINT
++#endif
++#define kgdb_ts(data0,data1)
++#define in_kgdb
++#define kgdb_handle_exception
++#define breakpoint
++#define INIT_KGDB_INTS
++#endif
++#endif /* __KGDB */
+Index: linux-2.6.10/include/asm-i386/bugs.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/bugs.h 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/include/asm-i386/bugs.h 2005-04-05 12:48:05.398596368 +0800
+@@ -1,11 +1,11 @@
+ /*
+ * include/asm-i386/bugs.h
+ *
+- * Copyright (C) 1994 Linus Torvalds
++ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Cyrix stuff, June 1998 by:
+ * - Rafael R. Reilova (moved everything from head.S),
+- * <rreilova@ececs.uc.edu>
++ * <rreilova@ececs.uc.edu>
+ * - Channing Corn (tests & fixes),
+ * - Andrew D. Balsa (code cleanup).
+ *
+@@ -25,7 +25,20 @@
+ #include <asm/processor.h>
+ #include <asm/i387.h>
+ #include <asm/msr.h>
+-
++#ifdef CONFIG_KGDB
++/*
++ * Provide the command line "gdb" initial break
++ */
++int __init kgdb_initial_break(char * str)
++{
++ if (*str == '\0'){
++ breakpoint();
++ return 1;
++ }
++ return 0;
++}
++__setup("gdb",kgdb_initial_break);
++#endif
+ static int __init no_halt(char *s)
+ {
+ boot_cpu_data.hlt_works_ok = 0;
+@@ -140,7 +153,7 @@
+ : "ecx", "edi" );
+ /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
+ if (res != 12345678) printk( "Buggy.\n" );
+- else printk( "OK.\n" );
++ else printk( "OK.\n" );
+ #endif
+ }
+
+Index: linux-2.6.10/include/linux/serial_core.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/serial_core.h 2004-12-25 05:34:00.000000000 +0800
++++ linux-2.6.10/include/linux/serial_core.h 2005-04-05 12:48:05.367601080 +0800
+@@ -184,7 +184,6 @@
+ unsigned char x_char; /* xon/xoff char */
+ unsigned char regshift; /* reg offset shift */
+ unsigned char iotype; /* io access style */
+-
+ #define UPIO_PORT (0)
+ #define UPIO_HUB6 (1)
+ #define UPIO_MEM (2)
+Index: linux-2.6.10/include/linux/dwarf2.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dwarf2.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dwarf2.h 2005-04-05 12:48:05.369600776 +0800
+@@ -0,0 +1,738 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++ debugging information format.
++ Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002
++ Free Software Foundation, Inc.
++
++ Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
++ Office (AJPO), Florida State University and Silicon Graphics Inc.
++ provided support for this effort -- June 21, 1995.
++
++ Derived from the DWARF 1 implementation written by Ron Guilmette
++ (rfg@netcom.com), November 1990.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it under
++ the terms of the GNU General Public License as published by the Free
++ Software Foundation; either version 2, or (at your option) any later
++ version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING. If not, write to the Free
++ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++ 02111-1307, USA. */
++
++/* This file is derived from the DWARF specification (a public document)
++ Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++ Programming Languages Special Interest Group (UI/PLSIG) and distributed
++ by UNIX International. Copies of this specification are available from
++ UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++ This file also now contains definitions from the DWARF 3 specification. */
++
++/* This file is shared between GCC and GDB, and should not contain
++ prototypes. */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section. */
++#ifndef __ASSEMBLY__
++typedef struct
++{
++ unsigned char li_length [4];
++ unsigned char li_version [2];
++ unsigned char li_prologue_length [4];
++ unsigned char li_min_insn_length [1];
++ unsigned char li_default_is_stmt [1];
++ unsigned char li_line_base [1];
++ unsigned char li_line_range [1];
++ unsigned char li_opcode_base [1];
++}
++DWARF2_External_LineInfo;
++
++typedef struct
++{
++ unsigned long li_length;
++ unsigned short li_version;
++ unsigned int li_prologue_length;
++ unsigned char li_min_insn_length;
++ unsigned char li_default_is_stmt;
++ int li_line_base;
++ unsigned char li_line_range;
++ unsigned char li_opcode_base;
++}
++DWARF2_Internal_LineInfo;
++
++/* Structure found in .debug_pubnames section. */
++typedef struct
++{
++ unsigned char pn_length [4];
++ unsigned char pn_version [2];
++ unsigned char pn_offset [4];
++ unsigned char pn_size [4];
++}
++DWARF2_External_PubNames;
++
++typedef struct
++{
++ unsigned long pn_length;
++ unsigned short pn_version;
++ unsigned long pn_offset;
++ unsigned long pn_size;
++}
++DWARF2_Internal_PubNames;
++
++/* Structure found in .debug_info section. */
++typedef struct
++{
++ unsigned char cu_length [4];
++ unsigned char cu_version [2];
++ unsigned char cu_abbrev_offset [4];
++ unsigned char cu_pointer_size [1];
++}
++DWARF2_External_CompUnit;
++
++typedef struct
++{
++ unsigned long cu_length;
++ unsigned short cu_version;
++ unsigned long cu_abbrev_offset;
++ unsigned char cu_pointer_size;
++}
++DWARF2_Internal_CompUnit;
++
++typedef struct
++{
++ unsigned char ar_length [4];
++ unsigned char ar_version [2];
++ unsigned char ar_info_offset [4];
++ unsigned char ar_pointer_size [1];
++ unsigned char ar_segment_size [1];
++}
++DWARF2_External_ARange;
++
++typedef struct
++{
++ unsigned long ar_length;
++ unsigned short ar_version;
++ unsigned long ar_info_offset;
++ unsigned char ar_pointer_size;
++ unsigned char ar_segment_size;
++}
++DWARF2_Internal_ARange;
++
++#define ENUM(name) enum name {
++#define IF_NOT_ASM(a) a
++#define COMMA ,
++#else
++#define ENUM(name)
++#define IF_NOT_ASM(a)
++#define COMMA
++
++#endif
++
++/* Tag names and codes. */
++ENUM(dwarf_tag)
++
++ DW_TAG_padding = 0x00 COMMA
++ DW_TAG_array_type = 0x01 COMMA
++ DW_TAG_class_type = 0x02 COMMA
++ DW_TAG_entry_point = 0x03 COMMA
++ DW_TAG_enumeration_type = 0x04 COMMA
++ DW_TAG_formal_parameter = 0x05 COMMA
++ DW_TAG_imported_declaration = 0x08 COMMA
++ DW_TAG_label = 0x0a COMMA
++ DW_TAG_lexical_block = 0x0b COMMA
++ DW_TAG_member = 0x0d COMMA
++ DW_TAG_pointer_type = 0x0f COMMA
++ DW_TAG_reference_type = 0x10 COMMA
++ DW_TAG_compile_unit = 0x11 COMMA
++ DW_TAG_string_type = 0x12 COMMA
++ DW_TAG_structure_type = 0x13 COMMA
++ DW_TAG_subroutine_type = 0x15 COMMA
++ DW_TAG_typedef = 0x16 COMMA
++ DW_TAG_union_type = 0x17 COMMA
++ DW_TAG_unspecified_parameters = 0x18 COMMA
++ DW_TAG_variant = 0x19 COMMA
++ DW_TAG_common_block = 0x1a COMMA
++ DW_TAG_common_inclusion = 0x1b COMMA
++ DW_TAG_inheritance = 0x1c COMMA
++ DW_TAG_inlined_subroutine = 0x1d COMMA
++ DW_TAG_module = 0x1e COMMA
++ DW_TAG_ptr_to_member_type = 0x1f COMMA
++ DW_TAG_set_type = 0x20 COMMA
++ DW_TAG_subrange_type = 0x21 COMMA
++ DW_TAG_with_stmt = 0x22 COMMA
++ DW_TAG_access_declaration = 0x23 COMMA
++ DW_TAG_base_type = 0x24 COMMA
++ DW_TAG_catch_block = 0x25 COMMA
++ DW_TAG_const_type = 0x26 COMMA
++ DW_TAG_constant = 0x27 COMMA
++ DW_TAG_enumerator = 0x28 COMMA
++ DW_TAG_file_type = 0x29 COMMA
++ DW_TAG_friend = 0x2a COMMA
++ DW_TAG_namelist = 0x2b COMMA
++ DW_TAG_namelist_item = 0x2c COMMA
++ DW_TAG_packed_type = 0x2d COMMA
++ DW_TAG_subprogram = 0x2e COMMA
++ DW_TAG_template_type_param = 0x2f COMMA
++ DW_TAG_template_value_param = 0x30 COMMA
++ DW_TAG_thrown_type = 0x31 COMMA
++ DW_TAG_try_block = 0x32 COMMA
++ DW_TAG_variant_part = 0x33 COMMA
++ DW_TAG_variable = 0x34 COMMA
++ DW_TAG_volatile_type = 0x35 COMMA
++ /* DWARF 3. */
++ DW_TAG_dwarf_procedure = 0x36 COMMA
++ DW_TAG_restrict_type = 0x37 COMMA
++ DW_TAG_interface_type = 0x38 COMMA
++ DW_TAG_namespace = 0x39 COMMA
++ DW_TAG_imported_module = 0x3a COMMA
++ DW_TAG_unspecified_type = 0x3b COMMA
++ DW_TAG_partial_unit = 0x3c COMMA
++ DW_TAG_imported_unit = 0x3d COMMA
++ /* SGI/MIPS Extensions. */
++ DW_TAG_MIPS_loop = 0x4081 COMMA
++ /* GNU extensions. */
++ DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */
++ DW_TAG_function_template = 0x4102 COMMA /* For C++. */
++ DW_TAG_class_template = 0x4103 COMMA /* For C++. */
++ DW_TAG_GNU_BINCL = 0x4104 COMMA
++ DW_TAG_GNU_EINCL = 0x4105 COMMA
++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */
++ DW_TAG_upc_shared_type = 0x8765 COMMA
++ DW_TAG_upc_strict_type = 0x8766 COMMA
++ DW_TAG_upc_relaxed_type = 0x8767
++IF_NOT_ASM(};)
++
++#define DW_TAG_lo_user 0x4080
++#define DW_TAG_hi_user 0xffff
++
++/* Flag that tells whether entry has a child or not. */
++#define DW_children_no 0
++#define DW_children_yes 1
++
++/* Form names and codes. */
++ENUM(dwarf_form)
++
++ DW_FORM_addr = 0x01 COMMA
++ DW_FORM_block2 = 0x03 COMMA
++ DW_FORM_block4 = 0x04 COMMA
++ DW_FORM_data2 = 0x05 COMMA
++ DW_FORM_data4 = 0x06 COMMA
++ DW_FORM_data8 = 0x07 COMMA
++ DW_FORM_string = 0x08 COMMA
++ DW_FORM_block = 0x09 COMMA
++ DW_FORM_block1 = 0x0a COMMA
++ DW_FORM_data1 = 0x0b COMMA
++ DW_FORM_flag = 0x0c COMMA
++ DW_FORM_sdata = 0x0d COMMA
++ DW_FORM_strp = 0x0e COMMA
++ DW_FORM_udata = 0x0f COMMA
++ DW_FORM_ref_addr = 0x10 COMMA
++ DW_FORM_ref1 = 0x11 COMMA
++ DW_FORM_ref2 = 0x12 COMMA
++ DW_FORM_ref4 = 0x13 COMMA
++ DW_FORM_ref8 = 0x14 COMMA
++ DW_FORM_ref_udata = 0x15 COMMA
++ DW_FORM_indirect = 0x16
++IF_NOT_ASM(};)
++
++/* Attribute names and codes. */
++
++ENUM(dwarf_attribute)
++
++ DW_AT_sibling = 0x01 COMMA
++ DW_AT_location = 0x02 COMMA
++ DW_AT_name = 0x03 COMMA
++ DW_AT_ordering = 0x09 COMMA
++ DW_AT_subscr_data = 0x0a COMMA
++ DW_AT_byte_size = 0x0b COMMA
++ DW_AT_bit_offset = 0x0c COMMA
++ DW_AT_bit_size = 0x0d COMMA
++ DW_AT_element_list = 0x0f COMMA
++ DW_AT_stmt_list = 0x10 COMMA
++ DW_AT_low_pc = 0x11 COMMA
++ DW_AT_high_pc = 0x12 COMMA
++ DW_AT_language = 0x13 COMMA
++ DW_AT_member = 0x14 COMMA
++ DW_AT_discr = 0x15 COMMA
++ DW_AT_discr_value = 0x16 COMMA
++ DW_AT_visibility = 0x17 COMMA
++ DW_AT_import = 0x18 COMMA
++ DW_AT_string_length = 0x19 COMMA
++ DW_AT_common_reference = 0x1a COMMA
++ DW_AT_comp_dir = 0x1b COMMA
++ DW_AT_const_value = 0x1c COMMA
++ DW_AT_containing_type = 0x1d COMMA
++ DW_AT_default_value = 0x1e COMMA
++ DW_AT_inline = 0x20 COMMA
++ DW_AT_is_optional = 0x21 COMMA
++ DW_AT_lower_bound = 0x22 COMMA
++ DW_AT_producer = 0x25 COMMA
++ DW_AT_prototyped = 0x27 COMMA
++ DW_AT_return_addr = 0x2a COMMA
++ DW_AT_start_scope = 0x2c COMMA
++ DW_AT_stride_size = 0x2e COMMA
++ DW_AT_upper_bound = 0x2f COMMA
++ DW_AT_abstract_origin = 0x31 COMMA
++ DW_AT_accessibility = 0x32 COMMA
++ DW_AT_address_class = 0x33 COMMA
++ DW_AT_artificial = 0x34 COMMA
++ DW_AT_base_types = 0x35 COMMA
++ DW_AT_calling_convention = 0x36 COMMA
++ DW_AT_count = 0x37 COMMA
++ DW_AT_data_member_location = 0x38 COMMA
++ DW_AT_decl_column = 0x39 COMMA
++ DW_AT_decl_file = 0x3a COMMA
++ DW_AT_decl_line = 0x3b COMMA
++ DW_AT_declaration = 0x3c COMMA
++ DW_AT_discr_list = 0x3d COMMA
++ DW_AT_encoding = 0x3e COMMA
++ DW_AT_external = 0x3f COMMA
++ DW_AT_frame_base = 0x40 COMMA
++ DW_AT_friend = 0x41 COMMA
++ DW_AT_identifier_case = 0x42 COMMA
++ DW_AT_macro_info = 0x43 COMMA
++ DW_AT_namelist_items = 0x44 COMMA
++ DW_AT_priority = 0x45 COMMA
++ DW_AT_segment = 0x46 COMMA
++ DW_AT_specification = 0x47 COMMA
++ DW_AT_static_link = 0x48 COMMA
++ DW_AT_type = 0x49 COMMA
++ DW_AT_use_location = 0x4a COMMA
++ DW_AT_variable_parameter = 0x4b COMMA
++ DW_AT_virtuality = 0x4c COMMA
++ DW_AT_vtable_elem_location = 0x4d COMMA
++ /* DWARF 3 values. */
++ DW_AT_allocated = 0x4e COMMA
++ DW_AT_associated = 0x4f COMMA
++ DW_AT_data_location = 0x50 COMMA
++ DW_AT_stride = 0x51 COMMA
++ DW_AT_entry_pc = 0x52 COMMA
++ DW_AT_use_UTF8 = 0x53 COMMA
++ DW_AT_extension = 0x54 COMMA
++ DW_AT_ranges = 0x55 COMMA
++ DW_AT_trampoline = 0x56 COMMA
++ DW_AT_call_column = 0x57 COMMA
++ DW_AT_call_file = 0x58 COMMA
++ DW_AT_call_line = 0x59 COMMA
++ /* SGI/MIPS extensions. */
++ DW_AT_MIPS_fde = 0x2001 COMMA
++ DW_AT_MIPS_loop_begin = 0x2002 COMMA
++ DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA
++ DW_AT_MIPS_epilog_begin = 0x2004 COMMA
++ DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA
++ DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA
++ DW_AT_MIPS_linkage_name = 0x2007 COMMA
++ DW_AT_MIPS_stride = 0x2008 COMMA
++ DW_AT_MIPS_abstract_name = 0x2009 COMMA
++ DW_AT_MIPS_clone_origin = 0x200a COMMA
++ DW_AT_MIPS_has_inlines = 0x200b COMMA
++ /* GNU extensions. */
++ DW_AT_sf_names = 0x2101 COMMA
++ DW_AT_src_info = 0x2102 COMMA
++ DW_AT_mac_info = 0x2103 COMMA
++ DW_AT_src_coords = 0x2104 COMMA
++ DW_AT_body_begin = 0x2105 COMMA
++ DW_AT_body_end = 0x2106 COMMA
++ DW_AT_GNU_vector = 0x2107 COMMA
++ /* VMS extensions. */
++ DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA
++ /* UPC extension. */
++ DW_AT_upc_threads_scaled = 0x3210
++IF_NOT_ASM(};)
++
++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */
++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */
++
++/* Location atom names and codes. */
++ENUM(dwarf_location_atom)
++
++ DW_OP_addr = 0x03 COMMA
++ DW_OP_deref = 0x06 COMMA
++ DW_OP_const1u = 0x08 COMMA
++ DW_OP_const1s = 0x09 COMMA
++ DW_OP_const2u = 0x0a COMMA
++ DW_OP_const2s = 0x0b COMMA
++ DW_OP_const4u = 0x0c COMMA
++ DW_OP_const4s = 0x0d COMMA
++ DW_OP_const8u = 0x0e COMMA
++ DW_OP_const8s = 0x0f COMMA
++ DW_OP_constu = 0x10 COMMA
++ DW_OP_consts = 0x11 COMMA
++ DW_OP_dup = 0x12 COMMA
++ DW_OP_drop = 0x13 COMMA
++ DW_OP_over = 0x14 COMMA
++ DW_OP_pick = 0x15 COMMA
++ DW_OP_swap = 0x16 COMMA
++ DW_OP_rot = 0x17 COMMA
++ DW_OP_xderef = 0x18 COMMA
++ DW_OP_abs = 0x19 COMMA
++ DW_OP_and = 0x1a COMMA
++ DW_OP_div = 0x1b COMMA
++ DW_OP_minus = 0x1c COMMA
++ DW_OP_mod = 0x1d COMMA
++ DW_OP_mul = 0x1e COMMA
++ DW_OP_neg = 0x1f COMMA
++ DW_OP_not = 0x20 COMMA
++ DW_OP_or = 0x21 COMMA
++ DW_OP_plus = 0x22 COMMA
++ DW_OP_plus_uconst = 0x23 COMMA
++ DW_OP_shl = 0x24 COMMA
++ DW_OP_shr = 0x25 COMMA
++ DW_OP_shra = 0x26 COMMA
++ DW_OP_xor = 0x27 COMMA
++ DW_OP_bra = 0x28 COMMA
++ DW_OP_eq = 0x29 COMMA
++ DW_OP_ge = 0x2a COMMA
++ DW_OP_gt = 0x2b COMMA
++ DW_OP_le = 0x2c COMMA
++ DW_OP_lt = 0x2d COMMA
++ DW_OP_ne = 0x2e COMMA
++ DW_OP_skip = 0x2f COMMA
++ DW_OP_lit0 = 0x30 COMMA
++ DW_OP_lit1 = 0x31 COMMA
++ DW_OP_lit2 = 0x32 COMMA
++ DW_OP_lit3 = 0x33 COMMA
++ DW_OP_lit4 = 0x34 COMMA
++ DW_OP_lit5 = 0x35 COMMA
++ DW_OP_lit6 = 0x36 COMMA
++ DW_OP_lit7 = 0x37 COMMA
++ DW_OP_lit8 = 0x38 COMMA
++ DW_OP_lit9 = 0x39 COMMA
++ DW_OP_lit10 = 0x3a COMMA
++ DW_OP_lit11 = 0x3b COMMA
++ DW_OP_lit12 = 0x3c COMMA
++ DW_OP_lit13 = 0x3d COMMA
++ DW_OP_lit14 = 0x3e COMMA
++ DW_OP_lit15 = 0x3f COMMA
++ DW_OP_lit16 = 0x40 COMMA
++ DW_OP_lit17 = 0x41 COMMA
++ DW_OP_lit18 = 0x42 COMMA
++ DW_OP_lit19 = 0x43 COMMA
++ DW_OP_lit20 = 0x44 COMMA
++ DW_OP_lit21 = 0x45 COMMA
++ DW_OP_lit22 = 0x46 COMMA
++ DW_OP_lit23 = 0x47 COMMA
++ DW_OP_lit24 = 0x48 COMMA
++ DW_OP_lit25 = 0x49 COMMA
++ DW_OP_lit26 = 0x4a COMMA
++ DW_OP_lit27 = 0x4b COMMA
++ DW_OP_lit28 = 0x4c COMMA
++ DW_OP_lit29 = 0x4d COMMA
++ DW_OP_lit30 = 0x4e COMMA
++ DW_OP_lit31 = 0x4f COMMA
++ DW_OP_reg0 = 0x50 COMMA
++ DW_OP_reg1 = 0x51 COMMA
++ DW_OP_reg2 = 0x52 COMMA
++ DW_OP_reg3 = 0x53 COMMA
++ DW_OP_reg4 = 0x54 COMMA
++ DW_OP_reg5 = 0x55 COMMA
++ DW_OP_reg6 = 0x56 COMMA
++ DW_OP_reg7 = 0x57 COMMA
++ DW_OP_reg8 = 0x58 COMMA
++ DW_OP_reg9 = 0x59 COMMA
++ DW_OP_reg10 = 0x5a COMMA
++ DW_OP_reg11 = 0x5b COMMA
++ DW_OP_reg12 = 0x5c COMMA
++ DW_OP_reg13 = 0x5d COMMA
++ DW_OP_reg14 = 0x5e COMMA
++ DW_OP_reg15 = 0x5f COMMA
++ DW_OP_reg16 = 0x60 COMMA
++ DW_OP_reg17 = 0x61 COMMA
++ DW_OP_reg18 = 0x62 COMMA
++ DW_OP_reg19 = 0x63 COMMA
++ DW_OP_reg20 = 0x64 COMMA
++ DW_OP_reg21 = 0x65 COMMA
++ DW_OP_reg22 = 0x66 COMMA
++ DW_OP_reg23 = 0x67 COMMA
++ DW_OP_reg24 = 0x68 COMMA
++ DW_OP_reg25 = 0x69 COMMA
++ DW_OP_reg26 = 0x6a COMMA
++ DW_OP_reg27 = 0x6b COMMA
++ DW_OP_reg28 = 0x6c COMMA
++ DW_OP_reg29 = 0x6d COMMA
++ DW_OP_reg30 = 0x6e COMMA
++ DW_OP_reg31 = 0x6f COMMA
++ DW_OP_breg0 = 0x70 COMMA
++ DW_OP_breg1 = 0x71 COMMA
++ DW_OP_breg2 = 0x72 COMMA
++ DW_OP_breg3 = 0x73 COMMA
++ DW_OP_breg4 = 0x74 COMMA
++ DW_OP_breg5 = 0x75 COMMA
++ DW_OP_breg6 = 0x76 COMMA
++ DW_OP_breg7 = 0x77 COMMA
++ DW_OP_breg8 = 0x78 COMMA
++ DW_OP_breg9 = 0x79 COMMA
++ DW_OP_breg10 = 0x7a COMMA
++ DW_OP_breg11 = 0x7b COMMA
++ DW_OP_breg12 = 0x7c COMMA
++ DW_OP_breg13 = 0x7d COMMA
++ DW_OP_breg14 = 0x7e COMMA
++ DW_OP_breg15 = 0x7f COMMA
++ DW_OP_breg16 = 0x80 COMMA
++ DW_OP_breg17 = 0x81 COMMA
++ DW_OP_breg18 = 0x82 COMMA
++ DW_OP_breg19 = 0x83 COMMA
++ DW_OP_breg20 = 0x84 COMMA
++ DW_OP_breg21 = 0x85 COMMA
++ DW_OP_breg22 = 0x86 COMMA
++ DW_OP_breg23 = 0x87 COMMA
++ DW_OP_breg24 = 0x88 COMMA
++ DW_OP_breg25 = 0x89 COMMA
++ DW_OP_breg26 = 0x8a COMMA
++ DW_OP_breg27 = 0x8b COMMA
++ DW_OP_breg28 = 0x8c COMMA
++ DW_OP_breg29 = 0x8d COMMA
++ DW_OP_breg30 = 0x8e COMMA
++ DW_OP_breg31 = 0x8f COMMA
++ DW_OP_regx = 0x90 COMMA
++ DW_OP_fbreg = 0x91 COMMA
++ DW_OP_bregx = 0x92 COMMA
++ DW_OP_piece = 0x93 COMMA
++ DW_OP_deref_size = 0x94 COMMA
++ DW_OP_xderef_size = 0x95 COMMA
++ DW_OP_nop = 0x96 COMMA
++ /* DWARF 3 extensions. */
++ DW_OP_push_object_address = 0x97 COMMA
++ DW_OP_call2 = 0x98 COMMA
++ DW_OP_call4 = 0x99 COMMA
++ DW_OP_call_ref = 0x9a COMMA
++ /* GNU extensions. */
++ DW_OP_GNU_push_tls_address = 0xe0
++IF_NOT_ASM(};)
++
++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */
++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */
++
++/* Type encodings. */
++ENUM(dwarf_type)
++
++ DW_ATE_void = 0x0 COMMA
++ DW_ATE_address = 0x1 COMMA
++ DW_ATE_boolean = 0x2 COMMA
++ DW_ATE_complex_float = 0x3 COMMA
++ DW_ATE_float = 0x4 COMMA
++ DW_ATE_signed = 0x5 COMMA
++ DW_ATE_signed_char = 0x6 COMMA
++ DW_ATE_unsigned = 0x7 COMMA
++ DW_ATE_unsigned_char = 0x8 COMMA
++ /* DWARF 3. */
++ DW_ATE_imaginary_float = 0x9
++IF_NOT_ASM(};)
++
++#define DW_ATE_lo_user 0x80
++#define DW_ATE_hi_user 0xff
++
++/* Array ordering names and codes. */
++ENUM(dwarf_array_dim_ordering)
++
++ DW_ORD_row_major = 0 COMMA
++ DW_ORD_col_major = 1
++IF_NOT_ASM(};)
++
++/* Access attribute. */
++ENUM(dwarf_access_attribute)
++
++ DW_ACCESS_public = 1 COMMA
++ DW_ACCESS_protected = 2 COMMA
++ DW_ACCESS_private = 3
++IF_NOT_ASM(};)
++
++/* Visibility. */
++ENUM(dwarf_visibility_attribute)
++
++ DW_VIS_local = 1 COMMA
++ DW_VIS_exported = 2 COMMA
++ DW_VIS_qualified = 3
++IF_NOT_ASM(};)
++
++/* Virtuality. */
++ENUM(dwarf_virtuality_attribute)
++
++ DW_VIRTUALITY_none = 0 COMMA
++ DW_VIRTUALITY_virtual = 1 COMMA
++ DW_VIRTUALITY_pure_virtual = 2
++IF_NOT_ASM(};)
++
++/* Case sensitivity. */
++ENUM(dwarf_id_case)
++
++ DW_ID_case_sensitive = 0 COMMA
++ DW_ID_up_case = 1 COMMA
++ DW_ID_down_case = 2 COMMA
++ DW_ID_case_insensitive = 3
++IF_NOT_ASM(};)
++
++/* Calling convention. */
++ENUM(dwarf_calling_convention)
++
++ DW_CC_normal = 0x1 COMMA
++ DW_CC_program = 0x2 COMMA
++ DW_CC_nocall = 0x3
++IF_NOT_ASM(};)
++
++#define DW_CC_lo_user 0x40
++#define DW_CC_hi_user 0xff
++
++/* Inline attribute. */
++ENUM(dwarf_inline_attribute)
++
++ DW_INL_not_inlined = 0 COMMA
++ DW_INL_inlined = 1 COMMA
++ DW_INL_declared_not_inlined = 2 COMMA
++ DW_INL_declared_inlined = 3
++IF_NOT_ASM(};)
++
++/* Discriminant lists. */
++ENUM(dwarf_discrim_list)
++
++ DW_DSC_label = 0 COMMA
++ DW_DSC_range = 1
++IF_NOT_ASM(};)
++
++/* Line number opcodes. */
++ENUM(dwarf_line_number_ops)
++
++ DW_LNS_extended_op = 0 COMMA
++ DW_LNS_copy = 1 COMMA
++ DW_LNS_advance_pc = 2 COMMA
++ DW_LNS_advance_line = 3 COMMA
++ DW_LNS_set_file = 4 COMMA
++ DW_LNS_set_column = 5 COMMA
++ DW_LNS_negate_stmt = 6 COMMA
++ DW_LNS_set_basic_block = 7 COMMA
++ DW_LNS_const_add_pc = 8 COMMA
++ DW_LNS_fixed_advance_pc = 9 COMMA
++ /* DWARF 3. */
++ DW_LNS_set_prologue_end = 10 COMMA
++ DW_LNS_set_epilogue_begin = 11 COMMA
++ DW_LNS_set_isa = 12
++IF_NOT_ASM(};)
++
++/* Line number extended opcodes. */
++ENUM(dwarf_line_number_x_ops)
++
++ DW_LNE_end_sequence = 1 COMMA
++ DW_LNE_set_address = 2 COMMA
++ DW_LNE_define_file = 3
++IF_NOT_ASM(};)
++
++/* Call frame information. */
++ENUM(dwarf_call_frame_info)
++
++ DW_CFA_advance_loc = 0x40 COMMA
++ DW_CFA_offset = 0x80 COMMA
++ DW_CFA_restore = 0xc0 COMMA
++ DW_CFA_nop = 0x00 COMMA
++ DW_CFA_set_loc = 0x01 COMMA
++ DW_CFA_advance_loc1 = 0x02 COMMA
++ DW_CFA_advance_loc2 = 0x03 COMMA
++ DW_CFA_advance_loc4 = 0x04 COMMA
++ DW_CFA_offset_extended = 0x05 COMMA
++ DW_CFA_restore_extended = 0x06 COMMA
++ DW_CFA_undefined = 0x07 COMMA
++ DW_CFA_same_value = 0x08 COMMA
++ DW_CFA_register = 0x09 COMMA
++ DW_CFA_remember_state = 0x0a COMMA
++ DW_CFA_restore_state = 0x0b COMMA
++ DW_CFA_def_cfa = 0x0c COMMA
++ DW_CFA_def_cfa_register = 0x0d COMMA
++ DW_CFA_def_cfa_offset = 0x0e COMMA
++
++ /* DWARF 3. */
++ DW_CFA_def_cfa_expression = 0x0f COMMA
++ DW_CFA_expression = 0x10 COMMA
++ DW_CFA_offset_extended_sf = 0x11 COMMA
++ DW_CFA_def_cfa_sf = 0x12 COMMA
++ DW_CFA_def_cfa_offset_sf = 0x13 COMMA
++
++ /* SGI/MIPS specific. */
++ DW_CFA_MIPS_advance_loc8 = 0x1d COMMA
++
++ /* GNU extensions. */
++ DW_CFA_GNU_window_save = 0x2d COMMA
++ DW_CFA_GNU_args_size = 0x2e COMMA
++ DW_CFA_GNU_negative_offset_extended = 0x2f
++IF_NOT_ASM(};)
++
++#define DW_CIE_ID 0xffffffff
++#define DW_CIE_VERSION 1
++
++#define DW_CFA_extended 0
++#define DW_CFA_lo_user 0x1c
++#define DW_CFA_hi_user 0x3f
++
++#define DW_CHILDREN_no 0x00
++#define DW_CHILDREN_yes 0x01
++
++#define DW_ADDR_none 0
++
++/* Source language names and codes. */
++ENUM(dwarf_source_language)
++
++ DW_LANG_C89 = 0x0001 COMMA
++ DW_LANG_C = 0x0002 COMMA
++ DW_LANG_Ada83 = 0x0003 COMMA
++ DW_LANG_C_plus_plus = 0x0004 COMMA
++ DW_LANG_Cobol74 = 0x0005 COMMA
++ DW_LANG_Cobol85 = 0x0006 COMMA
++ DW_LANG_Fortran77 = 0x0007 COMMA
++ DW_LANG_Fortran90 = 0x0008 COMMA
++ DW_LANG_Pascal83 = 0x0009 COMMA
++ DW_LANG_Modula2 = 0x000a COMMA
++ DW_LANG_Java = 0x000b COMMA
++ /* DWARF 3. */
++ DW_LANG_C99 = 0x000c COMMA
++ DW_LANG_Ada95 = 0x000d COMMA
++ DW_LANG_Fortran95 = 0x000e COMMA
++ /* MIPS. */
++ DW_LANG_Mips_Assembler = 0x8001 COMMA
++ /* UPC. */
++ DW_LANG_Upc = 0x8765
++IF_NOT_ASM(};)
++
++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */
++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */
++
++/* Names and codes for macro information. */
++ENUM(dwarf_macinfo_record_type)
++
++ DW_MACINFO_define = 1 COMMA
++ DW_MACINFO_undef = 2 COMMA
++ DW_MACINFO_start_file = 3 COMMA
++ DW_MACINFO_end_file = 4 COMMA
++ DW_MACINFO_vendor_ext = 255
++IF_NOT_ASM(};)
++\f
++/* @@@ For use with GNU frame unwind information. */
++
++#define DW_EH_PE_absptr 0x00
++#define DW_EH_PE_omit 0xff
++
++#define DW_EH_PE_uleb128 0x01
++#define DW_EH_PE_udata2 0x02
++#define DW_EH_PE_udata4 0x03
++#define DW_EH_PE_udata8 0x04
++#define DW_EH_PE_sleb128 0x09
++#define DW_EH_PE_sdata2 0x0A
++#define DW_EH_PE_sdata4 0x0B
++#define DW_EH_PE_sdata8 0x0C
++#define DW_EH_PE_signed 0x08
++
++#define DW_EH_PE_pcrel 0x10
++#define DW_EH_PE_textrel 0x20
++#define DW_EH_PE_datarel 0x30
++#define DW_EH_PE_funcrel 0x40
++#define DW_EH_PE_aligned 0x50
++
++#define DW_EH_PE_indirect 0x80
++
++#endif /* _ELF_DWARF2_H */
+Index: linux-2.6.10/include/linux/spinlock.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/spinlock.h 2005-03-31 15:35:27.000000000 +0800
++++ linux-2.6.10/include/linux/spinlock.h 2005-04-05 12:48:05.365601384 +0800
+@@ -15,6 +15,12 @@
+
+ #include <asm/processor.h> /* for cpu relax */
+ #include <asm/system.h>
++#ifdef CONFIG_KGDB
++#include <asm/current.h>
++#define SET_WHO(x, him) (x)->who = him;
++#else
++#define SET_WHO(x, him)
++#endif
+
+ /*
+ * Must define these before including other files, inline functions need them
+@@ -94,6 +100,9 @@
+ const char *module;
+ char *owner;
+ int oline;
++#ifdef CONFIG_KGDB
++ struct task_struct *who;
++#endif
+ } spinlock_t;
+ #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0}
+
+@@ -105,6 +114,7 @@
+ (x)->module = __FILE__; \
+ (x)->owner = NULL; \
+ (x)->oline = 0; \
++ SET_WHO(x, NULL) \
+ } while (0)
+
+ #define CHECK_LOCK(x) \
+@@ -129,6 +139,7 @@
+ (x)->lock = 1; \
+ (x)->owner = __FILE__; \
+ (x)->oline = __LINE__; \
++ SET_WHO(x, current) \
+ } while (0)
+
+ /* without debugging, spin_is_locked on UP always says
+@@ -159,6 +170,7 @@
+ (x)->lock = 1; \
+ (x)->owner = __FILE__; \
+ (x)->oline = __LINE__; \
++ SET_WHO(x, current) \
+ 1; \
+ })
+
+Index: linux-2.6.10/include/linux/config.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/config.h 2005-03-31 15:35:27.000000000 +0800
++++ linux-2.6.10/include/linux/config.h 2005-04-05 12:48:42.303985896 +0800
+@@ -2,6 +2,10 @@
+ #define _LINUX_CONFIG_H
+
+ #include <linux/autoconf.h>
++#if defined(__i386__) && !defined(IN_BOOTLOADER) && defined(CONFIG_KGDB)
++#include <asm/kgdb.h>
++#endif
++
+ #if !defined (__KERNEL__) && !defined(__KERNGLUE__)
+ #error including kernel header in userspace; use the glibc headers instead!
+ #endif
+Index: linux-2.6.10/include/linux/dwarf2-lang.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dwarf2-lang.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dwarf2-lang.h 2005-04-05 12:48:05.370600624 +0800
+@@ -0,0 +1,132 @@
++#ifndef DWARF2_LANG
++#define DWARF2_LANG
++#include <linux/dwarf2.h>
++
++/*
++ * This is free software; you can redistribute it and/or modify it under
++ * the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2, or (at your option) any later
++ * version.
++ */
++/*
++ * This file defines macros that allow generation of DWARF debug records
++ * for asm files. This file is platform independent. Register numbers
++ * (which are about the only thing that is platform dependent) are to be
++ * supplied by a platform defined file.
++ */
++#define DWARF_preamble() .section .debug_frame,"",@progbits
++/*
++ * This macro starts a debug frame section. The debug_frame describes
++ * where to find the registers that the enclosing function saved on
++ * entry.
++ *
++ * ORD is used by the label generator and should be the same as what is
++ * passed to CFI_postamble.
++ *
++ * pc, the pc register's gdb ordinal.
++ *
++ * code_align this is the factor used to define locations or regions
++ * where the given definitions apply. If you use labels to define these
++ * this should be 1.
++ *
++ * data_align this is the factor used to define register offsets. If
++ * you use struct offset, this should be the size of the register in
++ * bytes or the negative of that. This is how it is used: you will
++ * define a register as the reference register, say the stack pointer,
++ * then you will say where a register is located relative to this
++ * reference register's value, say 40 for register 3 (the gdb register
++ * number). The <40> will be multiplied by <data_align> to define the
++ * byte offset of the given register (3, in this example). So if your
++ * <40> is the byte offset and the reference register points at the
++ * beginning, you would want 1 for the data_align. If <40> was the 40th
++ * 4-byte element in that structure you would want 4. And if your
++ * reference register points at the end of the structure you would want
++ * a negative data_align value (and you would have to do other math as
++ * well).
++ */
++
++#define CFI_preamble(ORD, pc, code_align, data_align) \
++.section .debug_frame,"",@progbits ; \
++frame/**/_/**/ORD: \
++ .long end/**/_/**/ORD-start/**/_/**/ORD; \
++start/**/_/**/ORD: \
++ .long DW_CIE_ID; \
++ .byte DW_CIE_VERSION; \
++ .byte 0 ; \
++ .uleb128 code_align; \
++ .sleb128 data_align; \
++ .byte pc;
++
++/*
++ * After the above macro and prior to the CFI_postamble, you need to
++ * define the initial state. This starts with defining the reference
++ * register and, usually, the pc. Here are some helper macros:
++ */
++
++#define CFA_define_reference(reg, offset) \
++ .byte DW_CFA_def_cfa; \
++ .uleb128 reg; \
++ .uleb128 (offset);
++
++#define CFA_define_offset(reg, offset) \
++ .byte (DW_CFA_offset + reg); \
++ .uleb128 (offset);
++
++#define CFI_postamble(ORD) \
++ .align 4; \
++end/**/_/**/ORD:
++/*
++ * So now your code pushes stuff on the stack and you need a new location
++ * and the rules for what to do. This starts a running description of
++ * the call frame. You need to describe what changes with respect to
++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (frame description entry). Like the
++ * above, it has a preamble and a postamble. It is also tied to the CFI
++ * above.
++ * The first entry after the preamble must be the location in the code
++ * that the call frame is being described for.
++ */
++#define FDE_preamble(ORD, fde_no, initial_address, length) \
++ .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \
++FDE_start/**/_/**/fde_no: \
++ .long frame/**/_/**/ORD; \
++ .long initial_address; \
++ .long length;
++
++#define FDE_postamble(fde_no) \
++ .align 4; \
++FDE_end/**/_/**/fde_no:
++/*
++ * That done, you can now add registers, subtract registers, move the
++ * reference and even change the reference. You can also define a new
++ * area of code the info applies to. For discontinuous bits you should
++ * start a new FDE. You may have as many as you like.
++ */
++
++/*
++ * To advance the address by <bytes>
++ */
++
++#define FDE_advance(bytes) \
++	.byte DW_CFA_advance_loc4; \
++ .long bytes
++
++
++
++/*
++ * With the above you can define all the register locations. But
++ * suppose the reference register moves... This takes the new offset, NOT
++ * an increment. This is how esp is tracked if it is not saved.
++ */
++
++#define CFA_define_cfa_offset(offset) \
++	.byte DW_CFA_def_cfa_offset; \
++ .uleb128 (offset);
++/*
++ * Or suppose you want to use a different reference register...
++ */
++#define CFA_define_cfa_register(reg) \
++ .byte DW_CFA_def_cfa_register; \
++ .uleb128 reg;
++
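++/*
++ * A minimal usage sketch, not part of the interface above.  The gdb
++ * register ordinals used here (4 = esp, 8 = eip on i386) and the
++ * my_func/my_func_end labels are only illustrative:
++ *
++ *	CFI_preamble(1, 8, 1, -4)
++ *	CFA_define_reference(4, 4)	(CFA = esp + 4 at function entry)
++ *	CFA_define_offset(8, 1)		(return pc saved at CFA + 1 * -4)
++ *	CFI_postamble(1)
++ *
++ *	FDE_preamble(1, 2, my_func, my_func_end - my_func)
++ *	FDE_advance(1)			(step past the 1-byte push %ebp)
++ *	CFA_define_cfa_offset(8)	(the push moved the CFA)
++ *	FDE_postamble(2)
++ */
++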
++#endif
+Index: linux-2.6.10/kernel/pid.c
+===================================================================
+--- linux-2.6.10.orig/kernel/pid.c 2005-03-31 15:35:27.000000000 +0800
++++ linux-2.6.10/kernel/pid.c 2005-04-05 12:48:05.363601688 +0800
+@@ -252,6 +252,9 @@
+ * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
+ * more.
+ */
++#ifdef CONFIG_KGDB
++int kgdb_pid_init_done; /* so we don't call prior to... */
++#endif
+ void __init pidhash_init(void)
+ {
+ int i, j, pidhash_size;
+@@ -273,6 +276,9 @@
+ for (j = 0; j < pidhash_size; j++)
+ INIT_HLIST_HEAD(&pid_hash[i][j]);
+ }
++#ifdef CONFIG_KGDB
++ kgdb_pid_init_done++;
++#endif
+ }
+
+ void __init pidmap_init(void)
+Index: linux-2.6.10/kernel/sched.c
+===================================================================
+--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800
++++ linux-2.6.10/kernel/sched.c 2005-04-05 12:48:05.362601840 +0800
+@@ -2991,6 +2991,13 @@
+
+ EXPORT_SYMBOL(set_user_nice);
+
++#ifdef CONFIG_KGDB
++struct task_struct *kgdb_get_idle(int this_cpu)
++{
++ return cpu_rq(this_cpu)->idle;
++}
++#endif
++
+ #ifdef __ARCH_WANT_SYS_NICE
+
+ /*
+Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit 2005-04-05 12:48:05.263616888 +0800
+@@ -0,0 +1,14 @@
++shell echo -e "\003" >/dev/ttyS0
++set remotebaud 38400
++target remote /dev/ttyS0
++define si
++stepi
++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx
++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp
++x/i $eip
++end
++define ni
++nexti
++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx
++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp
++x/i $eip
+Index: linux-2.6.10/Documentation/i386/kgdb/kgdb.txt
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/kgdb.txt 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/kgdb.txt 2005-04-05 12:48:05.271615672 +0800
+@@ -0,0 +1,775 @@
++Last edit: <20030806.1637.12>
++This file has information specific to the i386 kgdb option. Other
++platforms with the kgdb option may behave in a similar fashion.
++
++New features:
++============
++20030806.1557.37
++This version was made against the 2.6.0-test2 kernel. We have made the
++following changes:
++
++- The getthread() code in the stub calls find_task_by_pid(). It fails
++ if we are early in the bring up such that the pid arrays have yet to
++ be allocated. We have added a line to kernel/pid.c to make
++ "kgdb_pid_init_done" true once the arrays are allocated. This way the
++  getthread() code knows not to call it. This is only used by the thread
++ debugging stuff and threads will not yet exist at this point in the
++ boot.
++
++- For some reason, gdb was not asking for a new thread list when the
++ "info thread" command was given. We changed to the newer version of
++ the thread info command and gdb now seems to ask when needed. Result,
++ we now get all threads in the thread list.
++
++- We now respond to the ThreadExtraInfo request from gdb with the thread
++ name from task_struct .comm. This then appears in the thread list.
++ Thoughts on additional options for this are welcome. Things such as
++ "has BKL" and "Preempted" come to mind. I think we could have a flag
++ word that could enable different bits of info here.
++
++- We now honor, sort of, the C and S commands. These are continue and
++ single set after delivering a signal. We ignore the signal and do the
++ requested action. This only happens when we told gdb that a signal
++ was the reason for entry, which is only done on memory faults. The
++ result is that you can now continue into the Oops.
++
++- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb,
++ but it is more exact on what language to use.
++
++- We added two dwarf2 include files and a bit of code at the end of
++ entry.S. This does not yet work, so it is disabled. Still we want to
++ keep track of the code and "maybe" someone out there can fix it.
++
++- Randy Dunlap sent some fix ups for this file which are now merged.
++
++- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a
++ compiler warning if CONFIG_KGDB is off (now who would do that :).
++
++- Andrew Morton sent a fix for the serial driver which is now merged.
++
++- Andrew also sent a change to the stub around the cpu management code
++ which is also merged.
++
++- Andrew also sent a patch to make "f" as well as "g" work as SysRq
++ commands to enter kgdb, merged.
++
++- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a
++ "who" field to the spinlock data struct. This is filled with
++  "current" whenever the spinlock succeeds. Useful if you want to know
++ who has the lock.
++
++- And last, but not least, we fixed the "get_cu" macro to properly get
++ the current value of "current".
++
++New features:
++============
++20030505.1827.27
++We are starting to align with the sourceforge version, at least in
++commands. To this end, the boot command string to start kgdb at
++boot time has been changed from "kgdb" to "gdb".
++
++Andrew Morton sent a couple of patches which are now included as follows:
++1.) We now return a flag to the interrupt handler.
++2.) We no longer use smp_num_cpus (a conflict with the lock meter).
++3.) And from William Lee Irwin III <wli@holomorphy.com> code to make
++ sure high-mem is set up before we attempt to register our interrupt
++ handler.
++We now include asm/kgdb.h from config.h so you will most likely never
++have to include it. It also 'NULLS' the kgdb macros you might have in
++your code when CONFIG_KGDB is not defined. This allows you to just
++turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such.
++This include is conditioned on the machine being an x86 so as to not
++mess with other archs.
++
++20020801.1129.03
++This is currently the version for the 2.4.18 (and beyond?) kernel.
++
++We have several new "features" beginning with this version:
++
++1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more
++ waiting and it will pull that guy out of an IRQ off spin lock :)
++
++2.) We doctored up the code that tells where a task is waiting and
++ included it so that the "info thread" command will show a bit more
++ than "schedule()". Try it...
++
++3.) Added the ability to call a function from gdb. All the standard gdb
++ issues apply, i.e. if you hit a breakpoint in the function, you are
++ not allowed to call another (gdb limitation, not kgdb). To help
++ this capability we added a memory allocation function. Gdb does not
++ return this memory (it is used for strings that you pass to that function
++ you are calling from gdb) so we fixed up a way to allow you to
++ manually return the memory (see below).
++
++4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the
++ interrupt flag to now also include the preemption count and the
++ "in_interrupt" info. The flag is now called "with_pif" to indicate
++   the order: preempt_count, in_interrupt, flag. The preempt_count is
++ shifted left by 4 bits so you can read the count in hex by dropping
++ the low order digit. In_interrupt is in bit 1, and the flag is in
++ bit 0.
++
++5.) The command: "p kgdb_info" is now expanded and prints something
++ like:
++(gdb) p kgdb_info
++$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259,
++ errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1,
++ cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0,
++ regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}}
++
++ Things to note here: a.) used_malloc is the amount of memory that
++ has been malloc'ed to do calls from gdb. You can reclaim this
++ memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.)
++ cpus_waiting is now "sized" by the number of CPUs you enter at
++ configure time in the kgdb configure section. This is NOT used
++ anywhere else in the system, but it is "nice" here. c.) The task's
++ "pid" is now in the structure. This is the pid you will need to use
++ to decode to the thread id to get gdb to look at that thread.
++ Remember that the "info thread" command prints a list of threads
++ wherein it numbers each thread with its reference number followed
++ by the thread's pid. Note that the per-CPU idle threads actually
++ have pids of 0 (yes, there is more than one pid 0 in an SMP system).
++ To avoid confusion, kgdb numbers these threads with numbers beyond
++ the MAX_PID. That is why you see 32768 and above.
++
++6.) A subtle change, we now provide the complete register set for tasks
++ that are active on the other CPUs. This allows better trace back on
++ those tasks.
++
++ And, let's mention what we could not fix. Back-trace from all but the
++ thread that we trapped will, most likely, have a bogus entry in it.
++ The problem is that gdb does not recognize the entry code for
++ functions that use "current" near (at all?) the entry. The compiler
++ is putting the "current" decode as the first two instructions of the
++ function where gdb expects to find %ebp changing code. Back trace
++ also has trouble with interrupt frames. I am talking with Daniel
++ Jacobowitz about some way to fix this, but don't hold your breath.
++
++20011220.0050.35
++Major enhancement with this version is the ability to hold one or more
++CPUs in an SMP system while allowing the others to continue. Also, by
++default only the current CPU is enabled on single-step commands (please
++note that gdb issues single-step commands at times other than when you
++use the si command).
++
++Another change is to collect some useful information in
++a global structure called "kgdb_info". You should be able to just:
++
++p kgdb_info
++
++although I have seen cases where the first time this is done gdb just
++prints the first member but prints the whole structure if you then enter
++CR (carriage return or enter). This also works:
++
++p *&kgdb_info
++
++Here is a sample:
++(gdb) p kgdb_info
++$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0,
++ vector = 3, print_debug_info = 0}
++
++"Called_from" is the return address from the current entry into kgdb.
++Sometimes it is useful to know why you are in kgdb, for example, was
++it an NMI or a real breakpoint? The simple way to interrogate this
++return address is:
++
++l *0xc010732c
++
++which will print the surrounding few lines of source code.
++
++"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the
++kgdb_ts entries).
++
++"errcode" and "vector" are other entry parameters which may be helpful on
++some traps.
++
++"print_debug_info" is the internal debugging kgdb print enable flag. Yes,
++you can modify it.
++
++In SMP systems kgdb_info also includes the "cpus_waiting" structure and
++"hold_on_step":
++
++(gdb) p kgdb_info
++$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0,
++ vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{
++ task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0,
++ regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0,
++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0,
++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0,
++ hold = 0, regs = 0x0}}}
++
++"Cpus_waiting" has an entry for each CPU other than the current one that
++has been stopped. Each entry contains the task_struct address for that
++CPU, the address of the regs for that task and a hold flag. All these
++have the proper typing so that, for example:
++
++p *kgdb_info.cpus_waiting[1].regs
++
++will print the registers for CPU 1.
++
++"Hold_on_sstep" is a new feature with this version and comes up set or
++true. What this means is that whenever kgdb is asked to single-step all
++other CPUs are held (i.e. not allowed to execute). The flag applies to
++all but the current CPU and, again, can be changed:
++
++p kgdb_info.hold_on_sstep=0
++
++restores the old behavior of letting all CPUs run during single-stepping.
++
++Likewise, each CPU has a "hold" flag which, if set, locks that CPU out
++of execution. Note that this has some risk in cases where the CPUs need
++to communicate with each other. If kgdb finds no CPU available on exit,
++it will push a message thru gdb and stay in kgdb. Note that it is legal
++to hold the current CPU as long as at least one CPU can execute.
++
++20010621.1117.09
++This version implements an event queue. Events are signaled by calling
++a function in the kgdb stub and may be examined from gdb. See EVENTS
++below for details. This version also tightens up the interrupt and SMP
++handling to not allow interrupts on the way to kgdb from a breakpoint
++trap. It is fine to allow these interrupts for user code, but not
++system debugging.
++
++Version
++=======
++
++This version of the kgdb package was developed and tested on
++kernel version 2.4.16. It will not install on any earlier kernels.
++It is possible that it will continue to work on later versions
++of 2.4 and then versions of 2.5 (I hope).
++
++
++Debugging Setup
++===============
++
++Designate one machine as the "development" machine. This is the
++machine on which you run your compiles and which has your source
++code for the kernel. Designate a second machine as the "target"
++machine. This is the machine that will run your experimental
++kernel.
++
++The two machines will be connected together via a serial line out of
++one or the other of the COM ports of the PC. You will need the
++appropriate modem eliminator (null modem) cable(s) for this.
++
++Decide on which tty port you want the machines to communicate, then
++connect them up back-to-back using the null modem cable. COM1 is
++/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection
++with the two machines prior to trying to debug a kernel. Once you
++have it working, on the TARGET machine, enter:
++
++setserial /dev/ttyS0 (or whatever tty you are using)
++
++and record the port address and the IRQ number.
++
++On the DEVELOPMENT machine you need to apply the patch for the kgdb
++hooks. You have probably already done that if you are reading this
++file.
++
++On your DEVELOPMENT machine, go to your kernel source directory and do
++"make Xconfig" where X is one of "x", "menu", or "". If you are
++configuring in the standard serial driver, it must not be a module.
++Either yes or no is ok, but making the serial driver a module means it
++will initialize after kgdb has set up the UART interrupt code and may
++cause a failure of the control-C option discussed below. The configure
++question for the serial driver is under the "Character devices" heading
++and is:
++
++"Standard/generic (8250/16550 and compatible UARTs) serial support"
++
++Go down to the kernel debugging menu item and open it up. Enable the
++kernel kgdb stub code by selecting that item. You can also choose to
++turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler
++to put more debug info (like local symbols) in the object file. On the
++i386 -g and -ggdb are the same so this option just reduces to "-O1". The
++-O1 reduces the optimization level. This may be helpful in some cases,
++be aware, however, that this may also mask the problem you are looking
++for.
++
++The baud rate. Default is 115200. Whatever you choose, be sure that
++the host machine is set to the same speed. I recommend the default.
++
++The port. This is the I/O address of the serial UART that you should
++have gotten using setserial as described above. The standard COM1 port
++(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ
++3.
++
++The port IRQ (see above).
++
++Stack overflow test. This option makes a minor change in the trap,
++system call and interrupt code to detect stack overflow and transfer
++control to kgdb if it happens. (Some platforms have this in the
++baseline code, but the i386 does not.)
++
++You can also configure the system to recognize the boot option
++"console=kgdb" which if given will cause all console output during
++booting to be put thru gdb as well as other consoles. This option
++requires that gdb and kgdb be connected prior to sending console output
++so, if they are not, a breakpoint is executed to force the connection.
++This will happen before any kernel output (it is going thru gdb, right),
++and will stall the boot until the connection is made.
++
++You can also configure in a patch to SysRq to enable the kGdb SysRq.
++This request generates a breakpoint. Since the serial port IRQ line is
++set up after any serial drivers, it is possible that this command will
++work when the control-C will not.
++
++Save and exit the Xconfig program. Then do "make clean", "make dep"
++and "make bzImage" (or whatever target you want to make). This gets the
++kernel compiled with the "-g" option set -- necessary for debugging.
++
++You have just built the kernel on your DEVELOPMENT machine that you
++intend to run on your TARGET machine.
++
++To install this new kernel, use the following installation procedure.
++Remember, you are on the DEVELOPMENT machine patching the kernel source
++for the kernel that you intend to run on the TARGET machine.
++
++Copy this kernel to your target machine using your usual procedures. I
++usually arrange to copy development:
++/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine
++via a LAN based NFS access. That is, I run the cp command on the target
++and copy from the development machine via the LAN. Run Lilo (see "man
++lilo" for details on how to set this up) on the new kernel on the target
++machine so that it will boot! Then boot the kernel on the target
++machine.
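++
++For example (the NFS mount point here is only illustrative):
++
++	target# cp /mnt/devel/usr/src/linux/arch/i386/boot/bzImage /vmlinuz
++	target# lilo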
++
++On the DEVELOPMENT machine, create a file called .gdbinit in the
++directory /usr/src/linux. An example .gdbinit file looks like this:
++
++shell echo -e "\003" >/dev/ttyS0
++set remotebaud 38400 (or whatever speed you have chosen)
++target remote /dev/ttyS0
++
++
++Change the "echo" and "target" definition so that it specifies the tty
++port that you intend to use. Change the "remotebaud" definition to
++match the data rate that you are going to use for the com line.
++
++You are now ready to try it out.
++
++Boot your target machine with "kgdb" in the boot command i.e. something
++like:
++
++lilo> test kgdb
++
++or if you also want console output thru gdb:
++
++lilo> test kgdb console=kgdb
++
++You should see the lilo message saying it has loaded the kernel and then
++all output stops. The kgdb stub is trying to connect with gdb. Start
++gdb something like this:
++
++
++On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux".
++When gdb gets the symbols loaded it will read your .gdbinit file and, if
++everything is working correctly, you should see gdb print out a few
++lines indicating that a breakpoint has been taken. It will actually
++show a line of code in the target kernel inside the kgdb activation
++code.
++
++The gdb interaction should look something like this:
++
++ linux-dev:/usr/src/linux# gdb vmlinux
++ GDB is free software and you are welcome to distribute copies of it
++ under certain conditions; type "show copying" to see the conditions.
++ There is absolutely no warranty for GDB; type "show warranty" for details.
++ GDB 4.15.1 (i486-slackware-linux),
++ Copyright 1995 Free Software Foundation, Inc...
++ breakpoint () at i386-stub.c:750
++ 750 }
++ (gdb)
++
++You can now use whatever gdb commands you like to set breakpoints.
++Enter "continue" to start your target machine executing again. At this
++point the target system will run at full speed until it encounters
++your breakpoint or gets a segment violation in the kernel, or whatever.
++
++If you have the kgdb console enabled when you continue, gdb will print
++out all the console messages.
++
++The above example caused a breakpoint relatively early in the boot
++process. For the i386 kgdb it is possible to code a break instruction
++as the first C-language point in init/main.c, i.e. as the first instruction
++in start_kernel(). This could be done as follows:
++
++#include <asm/kgdb.h>
++ breakpoint();
++
++This breakpoint() is really a function that sets up the breakpoint and
++single-step hardware trap cells and then executes a breakpoint. Any
++early hard coded breakpoint will need to use this function. Once the
++trap cells are set up they need not be set again, but doing it again
++does not hurt anything, so you don't need to be concerned about which
++breakpoint is hit first. Once the trap cells are set up (and the kernel
++sets them up in due course even if breakpoint() is never called) the
++macro:
++
++BREAKPOINT;
++
++will generate an inline breakpoint. This may be more useful as it stops
++the processor at the instruction instead of in a function a step removed
++from the location of interest. In either case <asm/kgdb.h> must be
++included to define both breakpoint() and BREAKPOINT.
++
++Triggering kgdbstub at other times
++==================================
++
++Often you don't need to enter the debugger until much later in the boot
++or even after the machine has been running for some time. Once the
++kernel is booted and interrupts are on, you can force the system to
++enter the debugger by sending a control-C to the debug port. This is
++what the first line of the recommended .gdbinit file does. This allows
++you to start gdb any time after the system is up as well as when the
++system is already at a breakpoint. (In the case where the system is
++already at a breakpoint the control-C is not needed, however, it will
++be ignored by the target so no harm is done. Also note that the echo
++command assumes that the port speed is already set. This will be true
++once gdb has connected, but it is best to set the port speed before you
++run gdb.)
++
++Another simple way to do this is to put the following file in your ~/bin
++directory:
++
++#!/bin/bash
++echo -e "\003" > /dev/ttyS0
++
++Here, the ttyS0 should be replaced with whatever port you are using.
++The "\003" is control-C. Once you are connected with gdb, you can enter
++control-C at the command prompt.
++
++An alternative way to get control to the debugger is to enable the kGdb
++SysRq command. Then you would enter Alt-SysRq-g (all three keys at the
++same time, but push them down in the order given). To refresh your
++memory of the available SysRq commands try Alt-SysRq-=. Actually any
++undefined command could replace the "=", but I like to KNOW that what I
++am pushing will never be defined.
++
++Debugging hints
++===============
++
++You can break into the target machine at any time from the development
++machine by typing ^C (see above paragraph). If the target machine has
++interrupts enabled this will stop it in the kernel and enter the
++debugger.
++
++There is unfortunately no way of breaking into the kernel if it is
++in a loop with interrupts disabled, so if this happens to you then
++you need to place exploratory breakpoints or printk's into the kernel
++to find out where it is looping. The exploratory breakpoints can be
++entered either thru gdb or hard coded into the source. This is very
++handy if you do something like:
++
++if (<it hurts>) BREAKPOINT;
++
++
++There is a copy of an e-mail in the Documentation/i386/kgdb/ directory
++(debug-nmi.txt) which describes how to create an NMI on an ISA bus
++machine using a paper clip. I have a sophisticated version of this made
++by wiring a push button switch into a PC104/ISA bus adapter card. The
++adapter card nicely furnishes wire wrap pins for all the ISA bus
++signals.
++
++When you are done debugging the kernel on the target machine it is a
++good idea to leave it in a running state. This makes reboots faster,
++bypassing the fsck. So do a gdb "continue" as the last gdb command if
++this is possible. To terminate gdb itself on the development machine
++and leave the target machine running, first clear all breakpoints and
++continue, then type ^Z to suspend gdb and then kill it with "kill %1" or
++something similar.
++
++If gdbstub Does Not Work
++========================
++
++If it doesn't work, you will have to troubleshoot it. Do the easy
++things first like double checking your cabling and data rates. You
++might try some non-kernel based programs to see if the back-to-back
++connection works properly. Just something simple like cat /etc/hosts
++>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you
++if you can send data from one machine to the other. Make sure it works
++in both directions. There is no point in tearing out your hair in the
++kernel if the line doesn't work.
++
++All of the real action takes place in the file
++/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target
++machine that interacts with gdb on the development machine. In gdb you can
++turn on a debug switch with the following command:
++
++ set remotedebug
++
++This will print out the protocol messages that gdb is exchanging with
++the target machine.
++
++Another place to look is /usr/src/linux/arch/i386/lib/kgdb_serial.c. This is
++the code that talks to the serial port on the target side. There might
++be a problem there. In particular there is a section of this code that
++tests the UART which will tell you what UART you have if you define
++"PRNT" (just remove "_off" from the #define PRNT_off). To view this
++report you will need to boot the system without any breakpoints. This
++allows the kernel to run to the point where it calls kgdb to set up
++interrupts. At this time kgdb will test the UART and print out the type
++it finds. (You need to wait so that the printks are actually being
++printed. Early in the boot they are cached, waiting for the console to
++be enabled. Also, if kgdb is entered thru a breakpoint it is possible
++to cause a deadlock by calling printk when the console is locked. The
++stub thus avoids doing printks from breakpoints, especially in the
++serial code.) At this time, if the UART fails to do the expected thing,
++kgdb will print out (using printk) information on what failed. (These
++messages will be buried in all the other boot up messages. Look for
++lines that start with "gdb_hook_interrupt:". You may want to use dmesg
++once the system is up to view the log.) If this fails or if you still
++don't connect, review your answers for the port address. Use:
++
++setserial /dev/ttyS0
++
++to get the current port and IRQ information. This command will also
++tell you what the system found for the UART type. The stub recognizes
++the following UART types:
++
++16450, 16550, and 16550A
++
++If you are really desperate you can use printk debugging in the
++kgdbstub code in the target kernel until you get it working. In particular,
++there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c
++named "remote_debug". Compile your kernel with this set to 1, rather
++than 0 and the debug stub will print out lots of stuff as it does
++what it does. Likewise there are debug printks in the kgdb_serial.c
++code that can be turned on with simple changes in the macro defines.
++
++
++Debugging Loadable Modules
++==========================
++
++This technique comes courtesy of Edouard Parmelan
++<Edouard.Parmelan@quadratec.fr>
++
++When you run gdb, enter the command
++
++source gdbinit-modules
++
++This will read in a file of gdb macros that was installed in your
++kernel source directory when kgdb was installed. This file implements
++the following commands:
++
++mod-list
++ Lists the loaded modules in the form <module-address> <module-name>
++
++mod-print-symbols <module-address>
++ Prints all the symbols in the indicated module.
++
++mod-add-symbols <module-address> <object-file-path-name>
++ Loads the symbols from the object file and associates them
++ with the indicated module.
++
++After you have loaded the module that you want to debug, use the command
++mod-list to find the <module-address> of your module. Then use that
++address in the mod-add-symbols command to load your module's symbols.
++From that point onward you can debug your module as if it were a part
++of the kernel.
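++
++A typical session might look like this (the module name, address and
++object file path are made up for illustration):
++
++	mod-list
++	0xd0a04000	mymod
++	mod-add-symbols 0xd0a04000 /usr/src/modules/mymod.o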
++
++The file gdbinit-modules also contains a command named mod-add-lis as
++an example of how to construct a command of your own to load your
++favorite module. The idea is to "can" the pathname of the module
++in the command so you don't have to type so much.
++
++Threads
++=======
++
++Each process in a target machine is seen as a gdb thread. gdb
++thread-related commands (info threads, thread n) can be used.
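++
++For example (output omitted; the thread number is illustrative):
++
++	(gdb) info threads
++	(gdb) thread 2
++	(gdb) bt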
++
++ia-32 hardware breakpoints
++==========================
++
++The kgdb stub contains support for hardware breakpoints using the debugging
++features of ia-32 (x86) processors. These breakpoints do not need code
++modification. They use debugging registers. Four hardware breakpoints
++are available in ia-32 processors.
++
++Each hardware breakpoint can be of one of the following three types.
++
++1. Execution breakpoint - An Execution breakpoint is triggered when code
++ at the breakpoint address is executed.
++
++   As a limited number of hardware breakpoints are available, it is
++ advisable to use software breakpoints ( break command ) instead
++ of execution hardware breakpoints, unless modification of code
++ is to be avoided.
++
++2. Write breakpoint - A write breakpoint is triggered when memory
++ location at the breakpoint address is written.
++
++   A write breakpoint can be placed for data of variable length. The
++   length of a write breakpoint indicates the length of the datatype to
++   be watched. Length is 1 for 1 byte data, 2 for 2 byte data, 3 for
++   4 byte data.
++
++3. Access breakpoint - An access breakpoint is triggered when memory
++ location at the breakpoint address is either read or written.
++
++ Access breakpoints also have lengths similar to write breakpoints.
++
++IO breakpoints in ia-32 are not supported.
++
++Since the gdb stub at present does not use the protocol used by gdb for
++hardware breakpoints, hardware breakpoints are accessed through gdb macros.
++The macros are described below.
++
++hwebrk - Places an execution breakpoint
++ hwebrk breakpointno address
++hwwbrk - Places a write breakpoint
++ hwwbrk breakpointno length address
++hwabrk - Places an access breakpoint
++ hwabrk breakpointno length address
++hwrmbrk - Removes a breakpoint
++ hwrmbrk breakpointno
++exinfo - Tells whether a software or hardware breakpoint has occurred.
++ Prints number of the hardware breakpoint if a hardware breakpoint has
++ occurred.
++
++Arguments required by these commands are as follows:
++breakpointno - 0 to 3
++length - 1 to 3
++address - Memory location in hex digits ( without 0x ) e.g. c015e9bc
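++
++For example, to watch writes to a 4-byte variable at c015e9bc (the sample
++address above) using breakpoint 0, and to remove it afterwards:
++
++	hwwbrk 0 3 c015e9bc
++	...
++	exinfo
++	hwrmbrk 0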
++
++SMP support
++===========
++
++When a breakpoint occurs or the user issues a break ( Ctrl + C ) to the
++gdb client, all the processors are forced to enter the debugger. The
++current thread corresponds to the thread running on the processor where
++the breakpoint occurred. Threads running on other processor(s) appear
++similar to other non-running threads in the 'info threads' output.
++Within the kgdb stub there is a structure "waiting_cpus" in which kgdb
++records the values of "current" and "regs" for each CPU other than the
++one that hit the breakpoint. "current" is a pointer to the task
++structure for the task that CPU is running, while "regs" points to the
++saved registers for the task. This structure can be examined with the
++gdb "p" command.
++
++ia-32 hardware debugging registers on all processors are set to the same
++values. Hence any hardware breakpoint may occur on any processor.
++
++gdb troubleshooting
++===================
++
++1. gdb hangs
++Kill it. Restart gdb. Connect to the target machine.
++
++2. gdb cannot connect to target machine (after killing a gdb and
++restarting another) If the target machine was not inside debugger when
++you killed gdb, gdb cannot connect because the target machine won't
++respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line.
++e.g. echo -e "\003" > /dev/ttyS1
++This forces that target machine into the debugger, after which you
++can connect.
++
++3. gdb cannot connect even after echoing Ctrl+C into the serial line
++Try changing the serial line settings: min to 1 and time to 0,
++e.g. stty min 1 time 0 < /dev/ttyS1
++Then try echoing again.
++
++Check the serial line speed and set it to the correct value if required,
++e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1
++
++EVENTS
++======
++
++Ever want to know the order of things happening? Which CPU did what and
++when? How did the spinlock get the way it is? Then events are for
++you. Events are defined by calls to an event collection interface and
++saved for later examination. In this case, kgdb events are saved by a
++very fast bit of code in kgdb which is fully SMP and interrupt protected
++and they are examined by using gdb to display them. Kgdb keeps only
++the last N events, where N must be a power of two and is defined at
++configure time.
++
++
++Events are signaled to kgdb by calling:
++
++kgdb_ts(data0,data1)
++
++kgdb records each call in an array along with other info.
++Here is the array definition:
++
++struct kgdb_and_then_struct {
++#ifdef CONFIG_SMP
++ int on_cpu;
++#endif
++ long long at_time;
++ int from_ln;
++ char * in_src;
++ void *from;
++ int with_if;
++ int data0;
++ int data1;
++};
++
++For SMP machines the CPU is recorded, for all machines the TSC is
++recorded (gets a time stamp) as well as the line number and source file
++the call was made from. The caller's address (from), the "if" (interrupt
++flag) and the two data items are also recorded. The macro kgdb_ts casts
++the types to int, so you can put any 32-bit values here. There is a
++configure option to select the number of events you want to keep. A
++nice number might be 128, but you can keep up to 1024 if you want. The
++number must be a power of two. An "andthen" macro library is provided
++for gdb to help you look at these events. It is also possible to define
++a different structure for the event storage and cast the data to this
++structure. For example the following structure is defined in kgdb:
++
++struct kgdb_and_then_struct2 {
++#ifdef CONFIG_SMP
++ int on_cpu;
++#endif
++ long long at_time;
++ int from_ln;
++ char * in_src;
++ void *from;
++ int with_if;
++ struct task_struct *t1;
++ struct task_struct *t2;
++};
++
++If you use this for display, the data elements will be displayed as
++pointers to task_struct entries. You may want to define your own
++structure to use in casting. You should only change the last two items
++and you must keep the structure size the same. Kgdb will handle these
++as 32-bit ints, but within that constraint you can define a structure to
++cast to any 32-bit quantity. This need only be available to gdb and is
++only used for casting in the display code.
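++
++As a sketch of how an event is signaled (this call site and its
++arguments are invented for illustration; they are not from the stub):
++
++	kgdb_ts((int)prev, (int)next);
++
++placed in, say, the context switch path would record each switch along
++with the CPU, TSC, source file and line of the call.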
++
++Final Items
++===========
++
++I picked up this code from Amit S. Kale and enhanced it.
++
++If you make some really cool modification to this stuff, or if you
++fix a bug, please let me know.
++
++George Anzinger
++<george@mvista.com>
++
++Amit S. Kale
++<akale@veritas.com>
++
++(First kgdb by David Grothe <dave@gcom.com>)
++
++(modified by Tigran Aivazian <tigran@sco.com>)
++ Putting gdbstub into the kernel config menu.
++
++(modified by Scott Foehner <sfoehner@engr.sgi.com>)
++ Hooks for entering gdbstub at boot time.
++
++(modified by Amit S. Kale <akale@veritas.com>)
++ Threads, ia-32 hw debugging, mp support, console support,
++ nmi watchdog handling.
++
++(modified by George Anzinger <george@mvista.com>)
++ Extended threads to include the idle threads.
++ Enhancements to allow breakpoint() at first C code.
++ Use of module_init() and __setup() to automate the configure.
++ Enhanced the cpu "collection" code to work in early bring-up.
++ Added ability to call functions from gdb
++ Print info thread stuff without going back to schedule()
++	Now collect the "other" cpus with an IPI/NMI.
+Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 12:48:05.273615368 +0800
+@@ -0,0 +1,117 @@
++
++#Using ia-32 hardware breakpoints.
++#
++#4 hardware breakpoints are available in ia-32 processors. These breakpoints
++#do not need code modification. They are set using debug registers.
++#
++#Each hardware breakpoint can be of one of the
++#three types: execution, write, access.
++#1. An Execution breakpoint is triggered when code at the breakpoint address is
++#executed.
++#2. A write breakpoint ( aka watchpoint ) is triggered when memory location
++#at the breakpoint address is written.
++#3. An access breakpoint is triggered when memory location at the breakpoint
++#address is either read or written.
++#
++#As hardware breakpoints are available in limited number, use software
++#breakpoints ( break command in gdb ) instead of execution hardware breakpoints.
++#
++#The length of an access or a write breakpoint defines the length of the
++#datatype to be watched. Length is 1 for char, 2 for short, 3 for int.
++#
++#For placing execution, write and access breakpoints, use commands
++#hwebrk, hwwbrk, hwabrk
++#To remove a breakpoint use hwrmbrk command.
++#
++#These commands take following types of arguments. For arguments associated
++#with each command, use help command.
++#1. breakpointno: 0 to 3
++#2. length: 1 to 3
++#3. address: Memory location in hex ( without 0x ) e.g. c015e9bc
++#
++#Use the command exinfo to find which hardware breakpoint occurred.
++
++#hwebrk breakpointno address
++define hwebrk
++ maintenance packet Y$arg0,0,0,$arg1
++end
++document hwebrk
++ hwebrk <breakpointno> <address>
++ Places a hardware execution breakpoint
++ <breakpointno> = 0 - 3
++ <address> = Hex digits without leading "0x".
++end
++
++#hwwbrk breakpointno length address
++define hwwbrk
++ maintenance packet Y$arg0,1,$arg1,$arg2
++end
++document hwwbrk
++ hwwbrk <breakpointno> <length> <address>
++ Places a hardware write breakpoint
++ <breakpointno> = 0 - 3
++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte)
++ <address> = Hex digits without leading "0x".
++end
++
++#hwabrk breakpointno length address
++define hwabrk
++ maintenance packet Y$arg0,1,$arg1,$arg2
++end
++document hwabrk
++ hwabrk <breakpointno> <length> <address>
++ Places a hardware access breakpoint
++ <breakpointno> = 0 - 3
++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte)
++ <address> = Hex digits without leading "0x".
++end
++
++#hwrmbrk breakpointno
++define hwrmbrk
++ maintenance packet y$arg0
++end
++document hwrmbrk
++ hwrmbrk <breakpointno>
++ <breakpointno> = 0 - 3
++ Removes a hardware breakpoint
++end
++
++define reboot
++ maintenance packet r
++end
++#exinfo
++define exinfo
++ maintenance packet qE
++end
++document exinfo
++ exinfo
++ Gives information about a breakpoint.
++end
++define get_th
++ p $th=(struct thread_info *)((int)$esp & ~8191)
++end
++document get_th
++	get_th
++	Gets and prints the current thread_info pointer. Defines $th to be it.
++end
++define get_cu
++ p $cu=((struct thread_info *)((int)$esp & ~8191))->task
++end
++document get_cu
++ get_cu
++	Gets and prints the "current" value. Defines $cu to be it.
++end
++define int_off
++ set var $flags=$eflags
++ set $eflags=$eflags&~0x200
++ end
++define int_on
++ set var $eflags|=$flags&0x200
++ end
++document int_off
++	Saves the current interrupt state and clears the processor interrupt
++ flag. Use int_on to restore the saved flag.
++end
++document int_on
++ Restores the interrupt flag saved by int_off.
++end
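++#
++#Example usage (illustrative): step one instruction with interrupts masked:
++# int_off
++# si
++# int_on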
+Index: linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 12:48:05.260617344 +0800
+@@ -0,0 +1,71 @@
++Sender: akale@veritas.com
++Date: Fri, 23 Jun 2000 19:26:35 +0530
++From: "Amit S. Kale" <akale@veritas.com>
++Organization: Veritas Software (India)
++To: Dave Grothe <dave@gcom.com>, linux-kernel@vger.rutgers.edu
++CC: David Milburn <dmilburn@wirespeed.com>,
++ "Edouard G. Parmelan" <Edouard.Parmelan@quadratec.fr>,
++ ezannoni@cygnus.com, Keith Owens <kaos@ocs.com.au>
++Subject: Re: Module debugging using kgdb
++
++Dave Grothe wrote:
++>
++> Amit:
++>
++> There is a 2.4.0 version of kgdb on our ftp site:
++> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb
++> and loadmodule.sh there.
++>
++> Have a look at the README file and see if I go it right. If not, send
++> me some corrections and I will update it.
++>
++> Does your version of gdb solve the global variable problem?
++
++Yes.
++Thanks to Elena Zanoni, gdb (development version) can now correctly
++calculate addresses of dynamically loaded object files. I have not been
++following gdb development for some time and am not sure when the symbol
++address calculation fix is going to appear in a gdb stable version.
++
++Elena, any idea when the fix will make it to a prebuilt gdb from a
++redhat release?
++
++For the time being I have built a gdb development version. It can be
++used for module debugging with loadmodule.sh script.
++
++The problem with calculating module addresses with previous versions
++of gdb was as follows:
++gdb did not use base address of a section while calculating address of
++a symbol in the section in an object file loaded via 'add-symbol-file'.
++It used address of .text segment instead. Due to this addresses of
++symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly.
++
++The above-mentioned fix allows gdb to use the base address of a segment
++while calculating the address of a symbol in it. It adds a parameter '-s'
++to the 'add-symbol-file' command for specifying the base address of a
++segment.
++
++The loadmodule.sh script works as follows.
++
++1. Copy a module file to the target machine.
++2. Load the module on the target machine using insmod with the -m parameter.
++insmod produces a module load map which contains base addresses of all
++sections in the module and addresses of symbols in the module file.
++3. Find all sections and their base addresses in the module from
++the module map.
++4. Generate a script that loads the module file. The script uses
++'add-symbol-file' and specifies the address of the text segment followed
++by the addresses of all segments in the module.
++
++Here is an example gdb script produced by loadmodule.sh script.
++
++add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5
++-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38
++-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838
++
++With this command gdb can calculate addresses of symbols in ANY segment
++in a module file.
++
++Regards.
++--
++Amit Kale
++Veritas Software ( http://www.veritas.com )
+Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 12:48:05.262617040 +0800
+@@ -0,0 +1,146 @@
++#
++# Useful GDB user-commands to debug Linux kernel modules with gdbstub.
++#
++# This doesn't work for Linux-2.0 or older.
++#
++# Author Edouard G. Parmelan <Edouard.Parmelan@quadratec.fr>
++#
++#
++# Fri Apr 30 20:33:29 CEST 1999
++# First public release.
++#
++# Major cleanup after experiment Linux-2.0 kernel without success.
++# Symbols of a module are not in the correct order, I can't explain
++# why :(
++#
++# Fri Mar 19 15:41:40 CET 1999
++# Initial version.
++#
++# Thu Jan 6 16:29:03 CST 2000
++# A little fixing by Dave Grothe <dave@gcom.com>
++#
++# Mon Jun 19 09:33:13 CDT 2000
++# Alignment changes from Edouard Parmelan
++#
++# The basic idea is to find where insmod loaded the module and to inform
++# GDB to load the symbol table of the module with the GDB command
++# ``add-symbol-file <object> <address>''.
++#
++# The Linux kernel holds the list of all loaded modules in module_list;
++# this list ends with &kernel_module (precisely, with module->next ==
++# NULL, but the last entry is not a real module).
++#
++# Insmod allocates the struct module before the object file. Since
++# Linux-2.1, this structure contains its own size. The real address of
++# the object file is then (char*)module + module->size_of_struct.
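++#
++# For example (addresses made up for illustration): if mod-list shows a
++# module at 0xd0832000 and its size_of_struct is 0x60, the object code,
++# and hence the address to pass to ``add-symbol-file'', starts at
++# 0xd0832000 + 0x60 = 0xd0832060.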
++#
++# You can use three user functions ``mod-list'', ``mod-print-symbols''
++# and ``mod-add-symbols''.
++#
++# mod-list list all loaded modules with the format:
++# <module-address> <module-name>
++#
++# As soon as you have found the address of your module, you can
++# print its exported symbols (mod-print-symbols) or inform GDB to add
++# symbols from your module file (mod-add-symbols).
++#
++# The argument that you give to mod-print-symbols or mod-add-symbols
++# is the <module-address> from the mod-list command.
++#
++# When using the mod-add-symbols command you must also give the full
++# pathname of the module's object code file.
++#
++# The command mod-add-lis is an example of how to make this easier.
++# You can edit this macro to contain the path name of your own
++# favorite module and then use it as a shorthand to load it. You
++# still need the module-address, however.
++#
++# The internal function ``mod-validate'' sets the GDB variable $mod
++# to a ``struct module*'' if the kernel knows the module; otherwise
++# $mod is set to NULL. This ensures we do not add symbols for a wrong
++# address.
++#
++# Have a nice hacking day !
++#
++#
++define mod-list
++ set $mod = (struct module*)module_list
++ # the last module is the kernel, ignore it
++ while $mod != &kernel_module
++ printf "%p\t%s\n", (long)$mod, ($mod)->name
++ set $mod = $mod->next
++ end
++end
++document mod-list
++List all modules in the form: <module-address> <module-name>
++Use the <module-address> as the argument for the other
++mod-commands: mod-print-symbols, mod-add-symbols.
++end
++
++define mod-validate
++ set $mod = (struct module*)module_list
++ while ($mod != $arg0) && ($mod != &kernel_module)
++ set $mod = $mod->next
++ end
++ if $mod == &kernel_module
++ set $mod = 0
++ printf "%p is not a module\n", $arg0
++ end
++end
++document mod-validate
++mod-validate <module-address>
++Internal user-command used to validate the module parameter.
++If <module> is a real loaded module, set $mod to it otherwise set $mod to 0.
++end
++
++
++define mod-print-symbols
++ mod-validate $arg0
++ if $mod != 0
++ set $i = 0
++ while $i < $mod->nsyms
++ set $sym = $mod->syms[$i]
++ printf "%p\t%s\n", $sym->value, $sym->name
++ set $i = $i + 1
++ end
++ end
++end
++document mod-print-symbols
++mod-print-symbols <module-address>
++Print all exported symbols of the module. see mod-list
++end
++
++
++define mod-add-symbols-align
++ mod-validate $arg0
++ if $mod != 0
++ set $mod_base = ($mod->size_of_struct + (long)$mod)
++ if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0)
++ set $mod_base = ($mod_base | ($arg2 - 1)) + 1
++ end
++ add-symbol-file $arg1 $mod_base
++ end
++end
++document mod-add-symbols-align
++mod-add-symbols-align <module-address> <object file path name> <align>
++Load the symbol table of the module from the object file where the
++first section alignment is <align>.
++To retrieve the alignment, use `objdump -h <object file path name>'.
++end
++
++define mod-add-symbols
++ mod-add-symbols-align $arg0 $arg1 sizeof(long)
++end
++document mod-add-symbols
++mod-add-symbols <module-address> <object file path name>
++Load the symbol table of the module from the object file.
++Default alignment is 4. See mod-add-symbols-align.
++end
++
++define mod-add-lis
++ mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16
++end
++document mod-add-lis
++mod-add-lis <module-address>
++Does mod-add-symbols <module-address> /usr/src/LiS/streams.o
++end
+Index: linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/debug-nmi.txt 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt 2005-04-05 12:48:05.261617192 +0800
+@@ -0,0 +1,37 @@
++Subject: Debugging with NMI
++Date: Mon, 12 Jul 1999 11:28:31 -0500
++From: David Grothe <dave@gcom.com>
++Organization: Gcom, Inc
++To: David Grothe <dave@gcom.com>
++
++Kernel hackers:
++
++Maybe this is old hat, but it is new to me --
++
++On an ISA bus machine, if you short out the A1 and B1 pins of an ISA
++slot you will generate an NMI to the CPU. This interrupts even a
++machine that is hung in a loop with interrupts disabled. Used in
++conjunction with kgdb <
++ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can
++gain debugger control of a machine that is hung in the kernel! Even
++without kgdb the kernel will print a stack trace so you can find out
++where it was hung.
++
++The A1/B1 pins are directly opposite one another and the farthest pins
++towards the bracket end of the ISA bus socket. You can stick a paper
++clip or multi-meter probe between them to short them out.
++
++I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the
++board consists of two rows of wire wrap pins. So I wired a push button
++between the A1/B1 pins and now have an ISA board that I can stick into
++any ISA bus slot for debugger entry.
++
++Microsoft has a circuit diagram of a PCI card at
++http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to
++build one you will have to mail them and ask for the PAL equations.
++Nobody makes one commercially.
++
++[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if
++your machine catches fire, it is your problem, not mine.]
++
++-- Dave (the kgdb guy)
+Index: linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/loadmodule.sh 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh 2005-04-05 12:48:05.274615216 +0800
+@@ -0,0 +1,78 @@
++#!/bin/sh
++# This script loads a module on a target machine and generates a gdb script.
++# Source the generated gdb script to load the module file at the
++# appropriate addresses in gdb.
++#
++# Usage:
++# Loading the module on the target machine and generating the gdb script
++# [foo]$ loadmodule.sh <modulename>
++#
++# Loading the module file into gdb
++# (gdb) source <gdbscriptpath>
++#
++# Modify the following variables according to your setup.
++# TESTMACHINE - Name of the target machine
++# GDBSCRIPTS - The directory where a gdb script will be generated
++#
++# Author: Amit S. Kale (akale@veritas.com).
++#
++# If you run into problems, please check the files pointed to by the
++# following variables.
++# ERRFILE - /tmp/<modulename>.errs contains stderr output of insmod
++# MAPFILE - /tmp/<modulename>.map contains stdout output of insmod
++# GDBSCRIPT - $GDBSCRIPTS/load<modulename> gdb script.
++
++TESTMACHINE=foo
++GDBSCRIPTS=/home/bar
++
++if [ $# -lt 1 ] ; then {
++ echo Usage: $0 modulefile
++ exit
++} ; fi
++
++MODULEFILE=$1
++MODULEFILEBASENAME=`basename $1`
++
++if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then {
++ MODULEFILE=`pwd`/$MODULEFILE
++} fi
++
++ERRFILE=/tmp/$MODULEFILEBASENAME.errs
++MAPFILE=/tmp/$MODULEFILEBASENAME.map
++GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME
++
++function findaddr() {
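++ # keep only the address column of the matching map line: the two sed
++ # expressions strip the leading two fields and the trailing field,
++ # leaving the load address (the same field the awk script below
++ # prints as 0x$3)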
++ local ADDR=0x$(echo "$SEGMENTS" | \
++ grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \
++ sed 's/[ ]*[^ ]*$//')
++ echo $ADDR
++}
++
++function checkerrs() {
++ if [ "`cat $ERRFILE`" != "" ] ; then {
++ cat $ERRFILE
++ exit
++ } fi
++}
++
++#load the module
++echo Copying $MODULEFILE to $TESTMACHINE
++rcp $MODULEFILE root@${TESTMACHINE}:
++
++echo Loading module $MODULEFILE
++rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \
++ > $MAPFILE 2> $ERRFILE
++checkerrs
++
++SEGMENTS=`head -n 11 $MAPFILE | tail -n 10`
++TEXTADDR=$(findaddr "\\.text[^.]")
++LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR"
++SEGADDRS=`echo "$SEGMENTS" | awk '//{
++ if ($1 != ".text" && $1 != ".this" &&
++ $1 != ".kstrtab" && $1 != ".kmodtab") {
++ print " -s " $1 " 0x" $3 " "
++ }
++}'`
++LOADSTRING="$LOADSTRING $SEGADDRS"
++echo Generating script $GDBSCRIPT
++echo $LOADSTRING > $GDBSCRIPT
+Index: linux-2.6.10/Documentation/i386/kgdb/andthen
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/andthen 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/andthen 2005-04-05 12:48:05.272615520 +0800
+@@ -0,0 +1,100 @@
++
++define set_andthen
++ set var $thp=0
++ set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0]
++ set var $at_size = (sizeof kgdb_data)/(sizeof *$thp)
++ set var $at_oc=kgdb_and_then_count
++ set var $at_cc=$at_oc
++end
++
++define andthen_next
++ set var $at_cc=$arg0
++end
++
++define andthen
++ andthen_set_edge
++ if ($at_cc >= $at_oc)
++ printf "Outside window. Window size is %d\n",($at_oc-$at_low)
++ else
++ printf "%d: ",$at_cc
++ output *($thp+($at_cc++ % $at_size ))
++ printf "\n"
++ end
++end
++define andthen_set_edge
++ set var $at_oc=kgdb_and_then_count
++ set var $at_low = $at_oc - $at_size
++ if ($at_low < 0 )
++ set var $at_low = 0
++ end
++ if (( $at_cc > $at_oc) || ($at_cc < $at_low))
++ printf "Count outside of window, setting count to "
++ if ($at_cc >= $at_oc)
++ set var $at_cc = $at_oc
++ else
++ set var $at_cc = $at_low
++ end
++ printf "%d\n",$at_cc
++ end
++end
++
++define beforethat
++ andthen_set_edge
++ if ($at_cc <= $at_low)
++ printf "Outside window. Window size is %d\n",($at_oc-$at_low)
++ else
++ printf "%d: ",$at_cc-1
++ output *($thp+(--$at_cc % $at_size ))
++ printf "\n"
++ end
++end
++
++document andthen_next
++ andthen_next <count>
++ . sets the number of the event to display next. If this event
++ . is not in the event pool, either andthen or beforethat will
++ . correct it to the nearest event pool edge. The event pool
++ . ends at the last event recorded and begins <number of events>
++ . prior to that. If beforethat is used next, it will display
++ . event <count> -1.
++.
++ andthen commands are: set_andthen, andthen_next, andthen and beforethat
++end
++
++
++document andthen
++ andthen
++. displays the next event in the list. <set_andthen> sets up to display
++. the oldest saved event first.
++. <count> (optional) count of the event to display.
++. note the number of events saved is specified at configure time.
++. if events are saved between calls to andthen the index will change
++. but the displayed event will be the next one (unless the event buffer
++. is overrun).
++.
++. andthen commands are: set_andthen, andthen_next, andthen and beforethat
++end
++
++document set_andthen
++ set_andthen
++. sets up to use the <andthen> and <beforethat> commands.
++. if you have defined your own struct, use the above and
++. then enter the following:
++. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0]
++. where <kgdb_and_then_structX> is the name of your structure.
++.
++. andthen commands are: set_andthen, andthen_next, andthen and beforethat
++end
++
++document beforethat
++ beforethat
++. displays the next prior event in the list. <set_andthen> sets up to
++. display the last occurring event first.
++.
++. note the number of events saved is specified at configure time.
++. if events are saved between calls to beforethat the index will change
++. but the displayed event will be the next one (unless the event buffer
++. is overrun).
++.
++. andthen commands are: set_andthen, andthen_next, andthen and beforethat
++end
+Index: linux-2.6.10/arch/i386/lib/kgdb_serial.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/lib/kgdb_serial.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/i386/lib/kgdb_serial.c 2005-04-05 12:48:05.193627528 +0800
+@@ -0,0 +1,485 @@
++/*
++ * Serial interface GDB stub
++ *
++ * Written (hacked together) by David Grothe (dave@gcom.com)
++ * Modified to allow invocation early in boot; see also
++ * kgdb.h for instructions. By George Anzinger (george@mvista.com)
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/timer.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/tty_flip.h>
++#include <linux/serial.h>
++#include <linux/serial_reg.h>
++#include <linux/config.h>
++#include <linux/major.h>
++#include <linux/string.h>
++#include <linux/fcntl.h>
++#include <linux/ptrace.h>
++#include <linux/ioport.h>
++#include <linux/mm.h>
++#include <linux/init.h>
++#include <linux/highmem.h>
++#include <asm/system.h>
++#include <asm/io.h>
++#include <asm/segment.h>
++#include <asm/bitops.h>
++#include <asm/system.h>
++#include <asm/kgdb_local.h>
++#ifdef CONFIG_KGDB_USER_CONSOLE
++extern void kgdb_console_finit(void);
++#endif
++#define PRNT_off
++#define TEST_EXISTANCE
++#ifdef PRNT
++#define dbprintk(s) printk s
++#else
++#define dbprintk(s)
++#endif
++#define TEST_INTERRUPT_off
++#ifdef TEST_INTERRUPT
++#define intprintk(s) printk s
++#else
++#define intprintk(s)
++#endif
++
++#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT)
++
++#define GDB_BUF_SIZE 512 /* power of 2, please */
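++/*
++ * GDB_BUF_SIZE must stay a power of 2: the ring-buffer indices below
++ * are wrapped with a cheap mask, idx &= (GDB_BUF_SIZE - 1), rather
++ * than a modulo operation.
++ */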
++
++static char gdb_buf[GDB_BUF_SIZE];
++static int gdb_buf_in_inx;
++static atomic_t gdb_buf_in_cnt;
++static int gdb_buf_out_inx;
++
++struct async_struct *gdb_async_info;
++static int gdb_async_irq;
++
++#define outb_px(a,b) outb_p(b,a)
++
++static void program_uart(struct async_struct *info);
++static void write_char(struct async_struct *info, int chr);
++/*
++ * Get a byte from the hardware data buffer and return it
++ */
++static int
++read_data_bfr(struct async_struct *info)
++{
++ char it = inb_p(info->port + UART_LSR);
++
++ if (it & UART_LSR_DR)
++ return (inb_p(info->port + UART_RX));
++ /*
++ * If we have a framing error assume somebody messed with
++ * our uart. Reprogram it and send '-' both ways...
++ */
++ if (it & 0xc) {
++ program_uart(info);
++ write_char(info, '-');
++ return ('-');
++ }
++ return (-1);
++
++} /* read_data_bfr */
++
++/*
++ * Get a char if available, return -1 if nothing available.
++ * Empty the receive buffer first, then look at the interface hardware.
++
++ * Locking here is a bit of a problem. We MUST not lock out communication
++ * if we are trying to talk to gdb about a kgdb entry. On the other hand
++ * we can lose chars in the console pass-thru if we don't lock. It is also
++ * possible that we could hold the lock or be waiting for it when kgdb
++ * NEEDS to talk. Since kgdb locks down the world, it does not need locks.
++ * We do, of course, have possible issues with interrupting a uart operation,
++ * but we will just depend on the uart status to help keep that straight.
++
++ */
++static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED;
++#ifdef CONFIG_SMP
++extern spinlock_t kgdb_spinlock;
++#endif
++
++static int
++read_char(struct async_struct *info)
++{
++ int chr;
++ unsigned long flags;
++ local_irq_save(flags);
++#ifdef CONFIG_SMP
++ if (!spin_is_locked(&kgdb_spinlock)) {
++ spin_lock(&uart_interrupt_lock);
++ }
++#endif
++ if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */
++ chr = gdb_buf[gdb_buf_out_inx++];
++ gdb_buf_out_inx &= (GDB_BUF_SIZE - 1);
++ atomic_dec(&gdb_buf_in_cnt);
++ } else {
++ chr = read_data_bfr(info);
++ }
++#ifdef CONFIG_SMP
++ if (!spin_is_locked(&kgdb_spinlock)) {
++ spin_unlock(&uart_interrupt_lock);
++ }
++#endif
++ local_irq_restore(flags);
++ return (chr);
++}
++
++/*
++ * Wait until the interface can accept a char, then write it.
++ */
++static void
++write_char(struct async_struct *info, int chr)
++{
++ while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ;
++
++ outb_p(chr, info->port + UART_TX);
++
++} /* write_char */
++
++/*
++ * Mostly we don't need a spinlock, but since the console goes
++ * thru here with interrupts on, well, we need to catch those
++ * chars.
++ */
++/*
++ * This is the receiver interrupt routine for the GDB stub.
++ * It will receive a limited number of characters of input
++ * from the gdb host machine and save them up in a buffer.
++ *
++ * When the gdb stub routine getDebugChar() is called it
++ * draws characters out of the buffer until it is empty and
++ * then reads directly from the serial port.
++ *
++ * We do not attempt to write chars from the interrupt routine
++ * since the stubs do all of that via putDebugChar() which
++ * writes one byte after waiting for the interface to become
++ * ready.
++ *
++ * The debug stubs like to run with interrupts disabled since,
++ * after all, they run as a consequence of a breakpoint in
++ * the kernel.
++ *
++ * Perhaps someone who knows more about the tty driver than I
++ * care to learn can make this work for any low level serial
++ * driver.
++ */
++static irqreturn_t
++gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++ struct async_struct *info;
++ unsigned long flags;
++
++ info = gdb_async_info;
++ if (!info || !info->tty || irq != gdb_async_irq)
++ return IRQ_NONE;
++
++ local_irq_save(flags);
++ spin_lock(&uart_interrupt_lock);
++ do {
++ int chr = read_data_bfr(info);
++ intprintk(("Debug char on int: %x hex\n", chr));
++ if (chr < 0)
++ continue;
++
++ if (chr == 3) { /* Ctrl-C means remote interrupt */
++ BREAKPOINT;
++ continue;
++ }
++
++ if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) {
++ /* buffer overflow tosses early char */
++ read_char(info);
++ }
++ gdb_buf[gdb_buf_in_inx++] = chr;
++ gdb_buf_in_inx &= (GDB_BUF_SIZE - 1);
++ } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI);
++ spin_unlock(&uart_interrupt_lock);
++ local_irq_restore(flags);
++ return IRQ_HANDLED;
++} /* gdb_interrupt */
++
++/*
++ * Just a NULL routine for testing.
++ */
++void
++gdb_null(void)
++{
++} /* gdb_null */
++
++/* These structures are filled in with values defined in asm/kgdb_local.h
++ */
++static struct serial_state state = SB_STATE;
++static struct async_struct local_info = SB_INFO;
++static int ok_to_enable_ints = 0;
++static void kgdb_enable_ints_now(void);
++
++extern char *kgdb_version;
++/*
++ * Hook an IRQ for KGDB.
++ *
++ * This routine is called from putDebugChar, below.
++ */
++static int ints_disabled = 1;
++int
++gdb_hook_interrupt(struct async_struct *info, int verb)
++{
++ struct serial_state *state = info->state;
++ unsigned long flags;
++ int port;
++#ifdef TEST_EXISTANCE
++ int scratch, scratch2;
++#endif
++
++ /* The above fails if memory management is not set up yet.
++ * Rather than fail the setup, just keep track of the fact
++ * and pick up the interrupt thing later.
++ */
++ gdb_async_info = info;
++ port = gdb_async_info->port;
++ gdb_async_irq = state->irq;
++ if (verb) {
++ printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n",
++ kgdb_version,
++ port,
++ gdb_async_irq, gdb_async_info->state->custom_divisor);
++ }
++ local_irq_save(flags);
++#ifdef TEST_EXISTANCE
++ /* Existence test */
++ /* Should not need all this, but just in case.... */
++
++ scratch = inb_p(port + UART_IER);
++ outb_px(port + UART_IER, 0);
++ outb_px(0xff, 0x080);
++ scratch2 = inb_p(port + UART_IER);
++ outb_px(port + UART_IER, scratch);
++ if (scratch2) {
++ printk
++ ("gdb_hook_interrupt: Could not clear IER, not a UART!\n");
++ local_irq_restore(flags);
++ return 1; /* We failed; there's nothing here */
++ }
++ scratch2 = inb_p(port + UART_LCR);
++ outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */
++ outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */
++ outb_px(port + UART_LCR, 0);
++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO);
++ scratch = inb_p(port + UART_IIR) >> 6;
++ if (scratch == 1) {
++ printk("gdb_hook_interrupt: Undefined UART type!"
++ " Not a UART! \n");
++ local_irq_restore(flags);
++ return 1;
++ } else {
++ dbprintk(("gdb_hook_interrupt: UART type "
++ "is %d where 0=16450, 2=16550 3=16550A\n", scratch));
++ }
++ scratch = inb_p(port + UART_MCR);
++ outb_px(port + UART_MCR, UART_MCR_LOOP | scratch);
++ outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A);
++ scratch2 = inb_p(port + UART_MSR) & 0xF0;
++ outb_px(port + UART_MCR, scratch);
++ if (scratch2 != 0x90) {
++ printk("gdb_hook_interrupt: "
++ "Loop back test failed! Not a UART!\n");
++ local_irq_restore(flags);
++ return scratch2 + 1000; /* ensure a non-zero (failing) return even if scratch2 == 0 */
++ }
++#endif /* test existence */
++ program_uart(info);
++ local_irq_restore(flags);
++
++ return (0);
++
++} /* gdb_hook_interrupt */
++
++static void
++program_uart(struct async_struct *info)
++{
++ int port = info->port;
++
++ (void) inb_p(port + UART_RX);
++ outb_px(port + UART_IER, 0);
++
++ (void) inb_p(port + UART_RX); /* serial driver comments say */
++ (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */
++ (void) inb_p(port + UART_MSR);
++ outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB);
++ outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */
++ outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */
++ outb_px(port + UART_MCR, info->MCR);
++
++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */
++ outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */
++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */
++ if (!ints_disabled) {
++ intprintk(("KGDB: Sending %d to port %x offset %d\n",
++ gdb_async_info->IER,
++ (int) gdb_async_info->port, UART_IER));
++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER);
++ }
++ return;
++}
++
++/*
++ * getDebugChar
++ *
++ * This is a GDB stub routine. It waits for a character from the
++ * serial interface and then returns it. If there is no serial
++ * interface connection then it returns a bogus value which will
++ * almost certainly cause the system to hang. In the
++ */
++int kgdb_in_isr = 0;
++int kgdb_in_lsr = 0;
++extern spinlock_t kgdb_spinlock;
++
++/* Caller takes needed protections */
++
++int
++getDebugChar(void)
++{
++ volatile int chr, dum, time, end_time;
++
++ dbprintk(("getDebugChar(port %x): ", gdb_async_info->port));
++
++ if (gdb_async_info == NULL) {
++ gdb_hook_interrupt(&local_info, 0);
++ }
++ /*
++ * This trick says if we wait a very long time and get
++ * no char, return -1 and let the upper level deal
++ * with it.
++ */
++ rdtsc(dum, time);
++ end_time = time + 2;
++ while (((chr = read_char(gdb_async_info)) == -1) &&
++ (end_time - time) > 0) {
++ rdtsc(dum, time);
++ };
++ /*
++ * This covers our butts if some other code messes with
++ * our uart, hey, it happens :o)
++ */
++ if (chr == -1)
++ program_uart(gdb_async_info);
++
++ dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' '));
++ return (chr);
++
++} /* getDebugChar */
++
++static int count = 3;
++static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED;
++
++static int __init
++kgdb_enable_ints(void)
++{
++ if (gdb_async_info == NULL) {
++ gdb_hook_interrupt(&local_info, 1);
++ }
++ ok_to_enable_ints = 1;
++ kgdb_enable_ints_now();
++#ifdef CONFIG_KGDB_USER_CONSOLE
++ kgdb_console_finit();
++#endif
++ return 0;
++}
++
++#ifdef CONFIG_SERIAL_8250
++void shutdown_for_kgdb(struct async_struct *gdb_async_info);
++#endif
++
++#ifdef CONFIG_DISCONTIGMEM
++static inline int kgdb_mem_init_done(void)
++{
++ return highmem_start_page != NULL;
++}
++#else
++static inline int kgdb_mem_init_done(void)
++{
++ return max_mapnr != 0;
++}
++#endif
++
++static void
++kgdb_enable_ints_now(void)
++{
++ if (!spin_trylock(&one_at_atime))
++ return;
++ if (!ints_disabled)
++ goto exit;
++ if (kgdb_mem_init_done() &&
++ ints_disabled) { /* don't try till mem init */
++#ifdef CONFIG_SERIAL_8250
++ /*
++ * The ifdef here allows the system to be configured
++ * without the serial driver.
++ * Don't make it a module, however, it will steal the port
++ */
++ shutdown_for_kgdb(gdb_async_info);
++#endif
++ ints_disabled = request_irq(gdb_async_info->state->irq,
++ gdb_interrupt,
++ IRQ_T(gdb_async_info),
++ "KGDB-stub", NULL);
++ intprintk(("KGDB: request_irq returned %d\n", ints_disabled));
++ }
++ if (!ints_disabled) {
++ intprintk(("KGDB: Sending %d to port %x offset %d\n",
++ gdb_async_info->IER,
++ (int) gdb_async_info->port, UART_IER));
++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER);
++ }
++ exit:
++ spin_unlock(&one_at_atime);
++}
++
++/*
++ * putDebugChar
++ *
++ * This is a GDB stub routine. It waits until the interface is ready
++ * to transmit a char and then sends it. If there is no serial
++ * interface connection then it simply returns to its caller, having
++ * pretended to send the char. Caller takes needed protections.
++ */
++void
++putDebugChar(int chr)
++{
++ dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n",
++ gdb_async_info->port,
++ chr,
++ chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1));
++
++ if (gdb_async_info == NULL) {
++ gdb_hook_interrupt(&local_info, 0);
++ }
++
++ write_char(gdb_async_info, chr); /* this routine will wait */
++ count = (chr == '#') ? 0 : count + 1;
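++ /* '#' introduces the two checksum characters, so count == 2 means a
++ * packet has just gone out: a quiet point to try enabling interrupts.
++ */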
++ if (count == 2) { /* try to enable after */
++ if (ints_disabled && ok_to_enable_ints)
++ kgdb_enable_ints_now();
++
++ /* We do this a lot because, well, we really want to get these
++ * interrupts. The serial driver will clear these bits when it
++ * initializes the chip. Everything else it does is OK,
++ * but not this.
++ */
++ if (!ints_disabled) {
++ outb_px(gdb_async_info->port + UART_IER,
++ gdb_async_info->IER);
++ }
++ }
++
++} /* putDebugChar */
++
++module_init(kgdb_enable_ints);
+Index: linux-2.6.10/arch/i386/lib/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/i386/lib/Makefile 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/arch/i386/lib/Makefile 2005-04-05 12:48:05.194627376 +0800
+@@ -8,3 +8,4 @@
+
+ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
+ lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
++lib-$(CONFIG_KGDB) += kgdb_serial.o
+Index: linux-2.6.10/arch/i386/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 12:48:05.204625856 +0800
+@@ -65,4 +65,6 @@
+ depends on X86_LOCAL_APIC && !X86_VISWS
+ default y
+
++source "arch/i386/Kconfig.kgdb"
++
+ endmenu
+Index: linux-2.6.10/arch/i386/kernel/entry.S
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 12:48:03.413898088 +0800
++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:48:05.244619776 +0800
+@@ -48,6 +48,18 @@
+ #include <asm/smp.h>
+ #include <asm/page.h>
+ #include "irq_vectors.h"
++ /* We do not recover from a stack overflow, but at least
++ * we know it happened and should be able to track it down.
++ */
++#ifdef CONFIG_STACK_OVERFLOW_TEST
++#define STACK_OVERFLOW_TEST \
++ testl $(THREAD_SIZE - 512),%esp; \
++ jnz 10f; \
++ call stack_overflow; \
++10:
++#else
++#define STACK_OVERFLOW_TEST
++#endif
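++/* Note: kernel stacks are THREAD_SIZE-aligned, so the masked test above
++ * is zero only when %esp has dropped into the lowest 512 bytes of the
++ * thread stack, i.e. the stack is about to overflow; only then is
++ * stack_overflow called.
++ */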
+
+ #define nr_syscalls ((syscall_table_size)/4)
+
+@@ -94,7 +106,8 @@
+ pushl %ebx; \
+ movl $(__USER_DS), %edx; \
+ movl %edx, %ds; \
+- movl %edx, %es;
++ movl %edx, %es; \
++ STACK_OVERFLOW_TEST
+
+ #define RESTORE_INT_REGS \
+ popl %ebx; \
+@@ -198,6 +211,7 @@
+ # sysenter call handler stub
+ ENTRY(sysenter_entry)
+ movl TSS_sysenter_esp0(%esp),%esp
++ .globl sysenter_past_esp
+ sysenter_past_esp:
+ sti
+ pushl $(__USER_DS)
+@@ -261,6 +275,19 @@
+ testw $_TIF_ALLWORK_MASK, %cx # current->work
+ jne syscall_exit_work
+ restore_all:
++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
++ movl EFLAGS(%esp), %eax # mix EFLAGS and CS
++ movb CS(%esp), %al
++ testl $(VM_MASK | 3), %eax
++ jz resume_kernelX # returning to kernel or vm86-space
++
++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
++ jz resume_kernelX
++
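++ # getting here means we are returning to user space with a non-zero
++ # preempt_count, which is a bug; int 3 below traps into the debugger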
++ int $3
++
++resume_kernelX:
++#endif
+ RESTORE_ALL
+
+ # perform work that needs to be done immediately before resumption
+Index: linux-2.6.10/arch/i386/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-03-31 16:20:09.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 12:48:05.221623272 +0800
+@@ -105,6 +105,39 @@
+ return err;
+ }
+
++#ifdef CONFIG_KGDB
++extern void sysenter_past_esp(void);
++#include <asm/kgdb.h>
++#include <linux/init.h>
++void set_intr_gate(unsigned int n, void *addr);
++static void set_intr_usr_gate(unsigned int n, void *addr);
++/*
++ * Should be able to call this breakpoint() very early in
++ * bring up. Just hard code the call where needed.
++ * The breakpoint() code is here because the set_?_gate() functions
++ * are local (static) to traps.c. They need to be done only once,
++ * but it does not hurt to do them over.
++ */
++void breakpoint(void)
++{
++ set_intr_usr_gate(3,&int3); /* disable ints on trap */
++ set_intr_gate(1,&debug);
++ set_intr_gate(14,&page_fault);
++
++ BREAKPOINT;
++}
++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \
++ { \
++ if (!user_mode(regs) ) \
++ { \
++ kgdb_handle_exception(trapnr, signr, error_code, regs); \
++ after; \
++ } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \
++ }
++#else
++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after)
++#endif
++
+ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+ {
+ return p > (void *)tinfo &&
+@@ -332,6 +365,15 @@
+ #endif
+ if (nl)
+ printk("\n");
++#ifdef CONFIG_KGDB
++ /* This is about the only place we want to go to kgdb even if in
++ * user mode. But we must go in via a trap so within kgdb we will
++ * always be in kernel mode.
++ */
++ if (user_mode(regs))
++ BREAKPOINT;
++#endif
++ CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,)
+ notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
+ show_registers(regs);
+ } else
+@@ -397,6 +439,7 @@
+ #define DO_ERROR(trapnr, signr, str, name) \
+ fastcall void do_##name(struct pt_regs * regs, long error_code) \
+ { \
++ CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,) \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+@@ -420,6 +463,7 @@
+ #define DO_VM86_ERROR(trapnr, signr, str, name) \
+ fastcall void do_##name(struct pt_regs * regs, long error_code) \
+ { \
++ CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return) \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+@@ -503,6 +547,7 @@
+
+ gp_in_kernel:
+ if (!fixup_exception(regs)) {
++ CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,)
+ if (notify_die(DIE_GPF, "general protection fault", regs,
+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ return;
+@@ -716,12 +761,35 @@
+ * allowing programs to debug themselves without the ptrace()
+ * interface.
+ */
+- if ((regs->xcs & 3) == 0)
+- goto clear_TF_reenable;
++#ifdef CONFIG_KGDB
++ /*
++ * I think this is the only "real" case of a TF in the kernel
++ * that really belongs to user space. Others are
++ * "Ours all ours!"
++ */
++ if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_past_esp))
++ goto clear_TF_reenable;
++#else
++ if ((regs->xcs & 3) == 0)
++ goto clear_TF_reenable;
++#endif
+ if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
+ goto clear_TF;
+ }
+
++#ifdef CONFIG_KGDB
++ /*
++ * If this is a kernel mode trap, we need to reset db7 to allow us
++ * to continue sanely; ALSO skip the signal delivery
++ */
++ if ((regs->xcs & 3) == 0)
++ goto clear_dr7;
++
++ /* if not kernel, allow ints but only if they were on */
++ if (regs->eflags & 0x200)
++ local_irq_enable();
++#endif
++
+ /* Ok, finally something we can handle */
+ tsk->thread.trap_no = 1;
+ tsk->thread.error_code = error_code;
+@@ -743,6 +811,7 @@
+ __asm__("movl %0,%%db7"
+ : /* no output */
+ : "r" (0));
++ CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,)
+ return;
+
+ debug_vm86:
+@@ -999,6 +1068,12 @@
+ {
+ _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ }
++#ifdef CONFIG_KGDB
++void set_intr_usr_gate(unsigned int n, void *addr)
++{
++ _set_gate(idt_table+n,14,3,addr,__KERNEL_CS);
++}
++#endif
+
+
+ void __init trap_init(void)
+@@ -1016,7 +1091,11 @@
+ set_trap_gate(0,&divide_error);
+ set_intr_gate(1,&debug);
+ set_intr_gate(2,&nmi);
++#ifndef CONFIG_KGDB
+ set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
++#else
++ set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */
++#endif
+ set_system_gate(4,&overflow);
+ set_system_gate(5,&bounds);
+ set_trap_gate(6,&invalid_op);
+Index: linux-2.6.10/arch/i386/kernel/nmi.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/nmi.c 2005-03-31 15:57:19.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/nmi.c 2005-04-05 12:48:05.222623120 +0800
+@@ -34,7 +34,17 @@
+
+ #include "mach_traps.h"
+
++#ifdef CONFIG_KGDB
++#include <asm/kgdb.h>
++#ifdef CONFIG_SMP
++unsigned int nmi_watchdog = NMI_IO_APIC;
++#else
++unsigned int nmi_watchdog = NMI_LOCAL_APIC;
++#endif
++#else
+ unsigned int nmi_watchdog = NMI_NONE;
++#endif
++
+ extern int unknown_nmi_panic;
+ static unsigned int nmi_hz = HZ;
+ static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+@@ -466,6 +476,9 @@
+ for (i = 0; i < NR_CPUS; i++)
+ alert_counter[i] = 0;
+ }
++#ifdef CONFIG_KGDB
++int tune_watchdog = 5*HZ;
++#endif
+
+ extern void die_nmi(struct pt_regs *, const char *msg);
+
+@@ -480,14 +493,25 @@
+ int sum, cpu = smp_processor_id();
+
+ sum = irq_stat[cpu].apic_timer_irqs;
+-
+- if (last_irq_sums[cpu] == sum) {
++#ifdef CONFIG_KGDB
++ if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) {
++
++#else
++ if (last_irq_sums[cpu] == sum) {
++#endif
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+- if (alert_counter[cpu] == 30*nmi_hz)
++#ifdef CONFIG_KGDB
++ if (alert_counter[cpu] == tune_watchdog) {
++ kgdb_handle_exception(2, SIGPWR, 0, regs);
++ last_irq_sums[cpu] = sum;
++ alert_counter[cpu] = 0;
++ }
++#endif
++ if (alert_counter[cpu] == 5*nmi_hz)
+ die_nmi(regs, "NMI Watchdog detected LOCKUP");
+ } else {
+ last_irq_sums[cpu] = sum;
+Index: linux-2.6.10/arch/i386/kernel/kgdb_stub.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/kgdb_stub.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/i386/kernel/kgdb_stub.c 2005-04-05 12:48:05.242620080 +0800
+@@ -0,0 +1,2330 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (c) 2000 VERITAS Software Corporation.
++ *
++ */
++/****************************************************************************
++ * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $
++ *
++ * Module name: remcom.c $
++ * Revision: 1.34 $
++ * Date: 91/03/09 12:29:49 $
++ * Contributor: Lake Stevens Instrument Division$
++ *
++ * Description: low level support for gdb debugger. $
++ *
++ * Considerations: only works on target hardware $
++ *
++ * Written by: Glenn Engel $
++ * Updated by: David Grothe <dave@gcom.com>
++ * ModuleState: Experimental $
++ *
++ * NOTES: See Below $
++ *
++ * Modified for 386 by Jim Kingdon, Cygnus Support.
++ * Compatibility with 2.1.xx kernel by David Grothe <dave@gcom.com>
++ *
++ * Changes to allow auto initialization. All that is needed is that it
++ * be linked with the kernel and a break point (int 3) be executed.
++ * The header file <asm/kgdb.h> defines BREAKPOINT to allow one to do
++ * this. It should also be possible, once the interrupt system is up, to
++ * call putDebugChar("+"). Once this is done, the remote debugger should
++ * get our attention by sending a ^C in a packet. George Anzinger
++ * <george@mvista.com>
++ * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
++ * Added thread support, support for multiple processors,
++ * support for ia-32(x86) hardware debugging.
++ * Amit S. Kale ( akale@veritas.com )
++ *
++ *
++ * To enable debugger support, two things need to happen. One, a
++ * call to set_debug_traps() is necessary in order to allow any breakpoints
++ * or error conditions to be properly intercepted and reported to gdb.
++ * Two, a breakpoint needs to be generated to begin communication. This
++ * is most easily accomplished by a call to breakpoint(). Breakpoint()
++ * simulates a breakpoint by executing an int 3.
++ *
++ *************
++ *
++ * The following gdb commands are supported:
++ *
++ * command function Return value
++ *
++ * g return the value of the CPU registers hex data or ENN
++ * G set the value of the CPU registers OK or ENN
++ *
++ * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN
++ * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN
++ *
++ * c Resume at current address SNN ( signal NN)
++ * cAA..AA Continue at address AA..AA SNN
++ *
++ * s Step one instruction SNN
++ * sAA..AA Step one instruction from AA..AA SNN
++ *
++ * k kill
++ *
++ * ? What was the last sigval ? SNN (signal NN)
++ *
++ * All commands and responses are sent with a packet which includes a
++ * checksum. A packet consists of
++ *
++ * $<packet info>#<checksum>.
++ *
++ * where
++ * <packet info> :: <characters representing the command or response>
++ * <checksum> :: < two hex digits computed as modulo 256 sum of <packetinfo>>
++ *
++ * When a packet is received, it is first acknowledged with either '+' or '-'.
++ * '+' indicates a successful transfer. '-' indicates a failed transfer.
++ *
++ * Example:
++ *
++ * Host: Reply:
++ * $m0,10#2a +$00010203040506070809101112131415#42
++ *
++ ****************************************************************************/
++#define KGDB_VERSION "<20030915.1651.33>"
++#include <linux/config.h>
++#include <linux/types.h>
++#include <asm/string.h> /* for strcpy */
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <asm/vm86.h>
++#include <asm/system.h>
++#include <asm/ptrace.h> /* for linux pt_regs struct */
++#include <asm/kgdb_local.h>
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <asm/processor.h>
++#include <linux/irq.h>
++#include <asm/desc.h>
++
++/************************************************************************
++ *
++ * external low-level support routines
++ */
++typedef void (*Function) (void); /* pointer to a function */
++
++/* Thread reference */
++typedef unsigned char threadref[8];
++
++extern void putDebugChar(int); /* write a single character */
++extern int getDebugChar(void); /* read and return a single char */
++
++/************************************************************************/
++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
++/* at least NUMREGBYTES*2 are needed for register packets */
++/* Longer buffer is needed to list all threads */
++#define BUFMAX 400
++
++char *kgdb_version = KGDB_VERSION;
++
++/* debug > 0 prints ill-formed commands in valid packets & checksum errors */
++int debug_regs = 0; /* set to non-zero to print registers */
++
++/* filled in by an external module */
++char *gdb_module_offsets;
++
++static const char hexchars[] = "0123456789abcdef";
++
++/* Number of bytes of registers. */
++#define NUMREGBYTES 64
++/*
++ * Note that this register image is in a different order than
++ * the register image that Linux produces at interrupt time.
++ *
++ * Linux's register image is defined by struct pt_regs in ptrace.h.
++ * Just why GDB uses a different order is a historical mystery.
++ */
++enum regnames { _EAX, /* 0 */
++ _ECX, /* 1 */
++ _EDX, /* 2 */
++ _EBX, /* 3 */
++ _ESP, /* 4 */
++ _EBP, /* 5 */
++ _ESI, /* 6 */
++ _EDI, /* 7 */
++ _PC /* 8 also known as eip */ ,
++ _PS /* 9 also known as eflags */ ,
++ _CS, /* 10 */
++ _SS, /* 11 */
++ _DS, /* 12 */
++ _ES, /* 13 */
++ _FS, /* 14 */
++ _GS /* 15 */
++};
++
++/*************************** ASSEMBLY CODE MACROS *************************/
++/*
++ * Put the error code here just in case the user cares.
++ * Likewise, the vector number here (since GDB only gets the signal
++ * number through the usual means, and that's not very specific).
++ * The called_from is the return address so the user can tell how we
++ * entered kgdb. This will allow him to separate out the various
++ * possible entries.
++ */
++#define REMOTE_DEBUG 0 /* set != 0 to turn on printing (also available in info) */
++
++#define PID_MAX PID_MAX_DEFAULT
++
++#ifdef CONFIG_SMP
++void smp_send_nmi_allbutself(void);
++#define IF_SMP(x) x
++#undef MAX_NO_CPUS
++#ifndef CONFIG_NO_KGDB_CPUS
++#define CONFIG_NO_KGDB_CPUS 2
++#endif
++#if CONFIG_NO_KGDB_CPUS > NR_CPUS
++#define MAX_NO_CPUS NR_CPUS
++#else
++#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS
++#endif
++#define hold_init hold_on_sstep: 1,
++#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL)
++#define NUM_CPUS num_online_cpus()
++#else
++#define IF_SMP(x)
++#define hold_init
++#undef MAX_NO_CPUS
++#define MAX_NO_CPUS 1
++#define NUM_CPUS 1
++#endif
++#define NOCPU (struct task_struct *)0xbad1fbad
++/* *INDENT-OFF* */
++struct kgdb_info {
++ int used_malloc;
++ void *called_from;
++ long long entry_tsc;
++ int errcode;
++ int vector;
++ int print_debug_info;
++#ifdef CONFIG_SMP
++ int hold_on_sstep;
++ struct {
++ volatile struct task_struct *task;
++ int pid;
++ int hold;
++ struct pt_regs *regs;
++ } cpus_waiting[MAX_NO_CPUS];
++#endif
++} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1};
++
++/* *INDENT-ON* */
++
++#define used_m kgdb_info.used_malloc
++/*
++ * This is a little area we set aside to contain the stack we
++ * need to build to allow gdb to call functions. We use one
++ * per cpu to avoid locking issues. We will do all this work
++ * with interrupts off so that should take care of the protection
++ * issues.
++ */
++#define LOOKASIDE_SIZE 200 /* should be more than enough */
++#define MALLOC_MAX 200 /* Max malloc size */
++struct {
++ unsigned int esp;
++ int array[LOOKASIDE_SIZE];
++} fn_call_lookaside[MAX_NO_CPUS];
++
++static int trap_cpu;
++static unsigned int OLD_esp;
++
++#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE]
++#define IF_BIT 0x200
++#define TF_BIT 0x100
++
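++/* mask for rounding allocations up to a multiple of 8 bytes:
++ * used_m advances by (size + 7) & ~7 in malloc() below */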
++#define MALLOC_ROUND 8-1
++
++static char malloc_array[MALLOC_MAX];
++IF_SMP(static void to_gdb(const char *mess));
++void *
++malloc(int size)
++{
++
++ if (size <= (MALLOC_MAX - used_m)) {
++ int old_used = used_m;
++ used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND));
++ return &malloc_array[old_used];
++ } else {
++ return NULL;
++ }
++}
++
++/*
++ * Gdb calls functions by pushing arguments, including a return address,
++ * on the stack and then adjusting EIP to point to the function. The
++ * whole assumption in GDB is that we are on a different stack than the
++ * one the "user", i.e. the code that hit the break point, is on. This, of
++ * course, is not true in the kernel. Thus various dodges are needed to
++ * do the call without directly messing with EIP (which we can not change,
++ * as it is just a location on the stack and not a register). To adjust it
++ * would require that we move everything below EIP up or down as needed.
++ * This will not work, as we may well have stack-relative pointers on the
++ * stack (such as the pointer to regs, for example).
++
++ * So here is what we do:
++ * We detect gdb attempting to store into the stack area and instead store
++ * into the fn_call_lookaside.array at the same relative location as if it
++ * were the area ESP pointed at. We also trap ESP modifications
++ * and use these to adjust fn_call_lookaside.esp. On entry
++ * fn_call_lookaside.esp will be set to point at the last entry in
++ * fn_call_lookaside.array. This allows us to check if it has changed, and
++ * if so, on exit, we add the registers we will use to do the move and a
++ * trap/interrupt return exit sequence. We then adjust the eflags in the
++ * regs array (remember we now have a copy in the fn_call_lookaside.array) to
++ * kill the interrupt bit, AND we change EIP to point at our set-up stub.
++ * As part of the register set-up we preset the registers to point at the
++ * beginning and end of the fn_call_lookaside.array, so all the stub needs to
++ * do is move words from the array to the stack until ESP = the desired value,
++ * then do the iret. This will then transfer to the desired function with
++ * all the correct registers. Nifty huh?
++ */
++extern asmlinkage void fn_call_stub(void);
++extern asmlinkage void fn_rtn_stub(void);
++/* *INDENT-OFF* */
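++/*
++ * In the stub below (registers preset as described above): %ebx walks
++ * down the staged copy of the stack, each word is pushed onto the real
++ * stack until %esp reaches the target value held in %ecx; the scratch
++ * registers are then popped back and iret uses the freshly built
++ * interrupt-return frame to transfer to the called function.
++ */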
++__asm__("fn_rtn_stub:\n\t"
++ "movl %eax,%esp\n\t"
++ "fn_call_stub:\n\t"
++ "1:\n\t"
++ "addl $-4,%ebx\n\t"
++ "movl (%ebx), %eax\n\t"
++ "pushl %eax\n\t"
++ "cmpl %esp,%ecx\n\t"
++ "jne 1b\n\t"
++ "popl %eax\n\t"
++ "popl %ebx\n\t"
++ "popl %ecx\n\t"
++ "iret \n\t");
++/* *INDENT-ON* */
++#define gdb_i386vector kgdb_info.vector
++#define gdb_i386errcode kgdb_info.errcode
++#define waiting_cpus kgdb_info.cpus_waiting
++#define remote_debug kgdb_info.print_debug_info
++#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold
++/* gdb locks */
++
++#ifdef CONFIG_SMP
++static int in_kgdb_called;
++static spinlock_t waitlocks[MAX_NO_CPUS] =
++ {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED };
++/*
++ * The following array has the thread pointer of each of the "other"
++ * cpus. We make it global so it can be seen by gdb.
++ */
++volatile int in_kgdb_entry_log[MAX_NO_CPUS];
++volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS];
++/*
++static spinlock_t continuelocks[MAX_NO_CPUS];
++*/
++spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED;
++/* waiters on our spinlock plus us */
++static atomic_t spinlock_waiters = ATOMIC_INIT(1);
++static int spinlock_count = 0;
++static int spinlock_cpu = 0;
++/*
++ * Note we use nested spin locks to account for the case where a
++ * breakpoint is encountered when calling a function by user direction
++ * from kgdb. There is also the memory exception recursion to account for.
++ * Well, yes, but this lets other cpus thru too. Let's add a
++ * cpu id to the lock.
++ */
++#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \
++ spinlock_cpu != smp_processor_id()){\
++ atomic_inc(&spinlock_waiters); \
++ while (! spin_trylock(x)) {\
++ in_kgdb(®s);\
++ }\
++ atomic_dec(&spinlock_waiters); \
++ spinlock_count = 1; \
++ spinlock_cpu = smp_processor_id(); \
++ }else{ \
++ spinlock_count++; \
++ }
++#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x)
++#else
++unsigned kgdb_spinlock = 0;
++#define KGDB_SPIN_LOCK(x) --*x
++#define KGDB_SPIN_UNLOCK(x) ++*x
++#endif
++
++int
++hex(char ch)
++{
++ if ((ch >= 'a') && (ch <= 'f'))
++ return (ch - 'a' + 10);
++ if ((ch >= '0') && (ch <= '9'))
++ return (ch - '0');
++ if ((ch >= 'A') && (ch <= 'F'))
++ return (ch - 'A' + 10);
++ return (-1);
++}
++
++/* scan for the sequence $<data>#<checksum> */
++void
++getpacket(char *buffer)
++{
++ unsigned char checksum;
++ unsigned char xmitcsum;
++ int i;
++ int count;
++ char ch;
++
++ do {
++ /* wait around for the start character, ignore all other characters */
++ while ((ch = (getDebugChar() & 0x7f)) != '$') ;
++ checksum = 0;
++ xmitcsum = -1;
++
++ count = 0;
++
++ /* now, read until a # or end of buffer is found */
++ while (count < BUFMAX) {
++ ch = getDebugChar() & 0x7f;
++ if (ch == '#')
++ break;
++ checksum = checksum + ch;
++ buffer[count] = ch;
++ count = count + 1;
++ }
++ buffer[count] = 0;
++
++ if (ch == '#') {
++ xmitcsum = hex(getDebugChar() & 0x7f) << 4;
++ xmitcsum += hex(getDebugChar() & 0x7f);
++ if ((remote_debug) && (checksum != xmitcsum)) {
++ printk
++ ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n",
++ checksum, xmitcsum, buffer);
++ }
++
++ if (checksum != xmitcsum)
++ putDebugChar('-'); /* failed checksum */
++ else {
++ putDebugChar('+'); /* successful transfer */
++ /* if a sequence char is present, reply the sequence ID */
++ if (buffer[2] == ':') {
++ putDebugChar(buffer[0]);
++ putDebugChar(buffer[1]);
++ /* remove sequence chars from buffer */
++ count = strlen(buffer);
++ for (i = 3; i <= count; i++)
++ buffer[i - 3] = buffer[i];
++ }
++ }
++ }
++ } while (checksum != xmitcsum);
++
++ if (remote_debug)
++ printk("R:%s\n", buffer);
++}
++
++/* send the packet in buffer. */
++
++void
++putpacket(char *buffer)
++{
++ unsigned char checksum;
++ int count;
++ char ch;
++
++ /* $<packet info>#<checksum>. */
++ do {
++ if (remote_debug)
++ printk("T:%s\n", buffer);
++ putDebugChar('$');
++ checksum = 0;
++ count = 0;
++
++ while ((ch = buffer[count])) {
++ putDebugChar(ch);
++ checksum += ch;
++ count += 1;
++ }
++
++ putDebugChar('#');
++ putDebugChar(hexchars[checksum >> 4]);
++ putDebugChar(hexchars[checksum % 16]);
++
++ } while ((getDebugChar() & 0x7f) != '+');
++
++}
++
++static char remcomInBuffer[BUFMAX];
++static char remcomOutBuffer[BUFMAX];
++static short error;
++
++void
++debug_error(char *format, char *parm)
++{
++ if (remote_debug)
++ printk(format, parm);
++}
++
++static void
++print_regs(struct pt_regs *regs)
++{
++ printk("EAX=%08lx ", regs->eax);
++ printk("EBX=%08lx ", regs->ebx);
++ printk("ECX=%08lx ", regs->ecx);
++ printk("EDX=%08lx ", regs->edx);
++ printk("\n");
++ printk("ESI=%08lx ", regs->esi);
++ printk("EDI=%08lx ", regs->edi);
++ printk("EBP=%08lx ", regs->ebp);
++ printk("ESP=%08lx ", (long) ®s->esp);
++ printk("\n");
++ printk(" DS=%08x ", regs->xds);
++ printk(" ES=%08x ", regs->xes);
++ printk(" SS=%08x ", __KERNEL_DS);
++ printk(" FL=%08lx ", regs->eflags);
++ printk("\n");
++ printk(" CS=%08x ", regs->xcs);
++ printk(" IP=%08lx ", regs->eip);
++#if 0
++ printk(" FS=%08x ", regs->fs);
++ printk(" GS=%08x ", regs->gs);
++#endif
++ printk("\n");
++
++} /* print_regs */
++
++#define NEW_esp fn_call_lookaside[trap_cpu].esp
++
++static void
++regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs)
++{
++ gdb_regs[_EAX] = regs->eax;
++ gdb_regs[_EBX] = regs->ebx;
++ gdb_regs[_ECX] = regs->ecx;
++ gdb_regs[_EDX] = regs->edx;
++ gdb_regs[_ESI] = regs->esi;
++ gdb_regs[_EDI] = regs->edi;
++ gdb_regs[_EBP] = regs->ebp;
++ gdb_regs[_DS] = regs->xds;
++ gdb_regs[_ES] = regs->xes;
++ gdb_regs[_PS] = regs->eflags;
++ gdb_regs[_CS] = regs->xcs;
++ gdb_regs[_PC] = regs->eip;
++ /* Note, as we are debugging the kernel, we will always
++ * trap in kernel code; this means no privilege change,
++ * and so the pt_regs structure is not completely valid. In a
++ * non-privilege-change trap, only EFLAGS, CS and EIP are put on the
++ * stack; SS and ESP are not stacked. This means that the last 2 elements
++ * of pt_regs are not valid (they would normally refer to the user stack).
++ * Also, using regs+1 is no good because you end up with a value that is
++ * 2 longs (8 bytes) too high. This used to cause stepping over functions
++ * to fail, so my fix is to use the address of regs->esp, which
++ * should point at the end of the stack frame. Note I have completely
++ * ignored exceptions that cause an error code to be stacked, such
++ * as double fault. Stuart Hughes, Zentropix.
++ * original code: gdb_regs[_ESP] = (int) (regs + 1) ;
++
++ * this is now done on entry and moved to OLD_esp (as well as NEW_esp).
++ */
++ gdb_regs[_ESP] = NEW_esp;
++ gdb_regs[_SS] = __KERNEL_DS;
++ gdb_regs[_FS] = 0xFFFF;
++ gdb_regs[_GS] = 0xFFFF;
++} /* regs_to_gdb_regs */
++
++static void
++gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs)
++{
++ regs->eax = gdb_regs[_EAX];
++ regs->ebx = gdb_regs[_EBX];
++ regs->ecx = gdb_regs[_ECX];
++ regs->edx = gdb_regs[_EDX];
++ regs->esi = gdb_regs[_ESI];
++ regs->edi = gdb_regs[_EDI];
++ regs->ebp = gdb_regs[_EBP];
++ regs->xds = gdb_regs[_DS];
++ regs->xes = gdb_regs[_ES];
++ regs->eflags = gdb_regs[_PS];
++ regs->xcs = gdb_regs[_CS];
++ regs->eip = gdb_regs[_PC];
++ NEW_esp = gdb_regs[_ESP]; /* keep the value */
++#if 0 /* can't change these */
++ regs->esp = gdb_regs[_ESP];
++ regs->xss = gdb_regs[_SS];
++ regs->fs = gdb_regs[_FS];
++ regs->gs = gdb_regs[_GS];
++#endif
++
++} /* gdb_regs_to_regs */
++
++int thread_list = 0;
++
++void
++get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs)
++{
++ unsigned long stack_page;
++ int count = 0;
++ IF_SMP(int i);
++ if (!p || p == current) {
++ regs_to_gdb_regs(gdb_regs, regs);
++ return;
++ }
++#ifdef CONFIG_SMP
++ for (i = 0; i < MAX_NO_CPUS; i++) {
++ if (p == kgdb_info.cpus_waiting[i].task) {
++ regs_to_gdb_regs(gdb_regs,
++ kgdb_info.cpus_waiting[i].regs);
++ gdb_regs[_ESP] =
++ (int) &kgdb_info.cpus_waiting[i].regs->esp;
++
++ return;
++ }
++ }
++#endif
++ memset(gdb_regs, 0, NUMREGBYTES);
++ gdb_regs[_ESP] = p->thread.esp;
++ gdb_regs[_PC] = p->thread.eip;
++ gdb_regs[_EBP] = *(int *) gdb_regs[_ESP];
++ gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4);
++ gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8);
++
++/*
++ * This code is to give a more informative notion of where a process
++ * is waiting. It is used only when the user asks for a thread info
++ * list. If s/he then switches to the thread, s/he will find the task
++ * is in schedule, but a back trace should show the same info we come
++ * up with. This code was shamelessly purloined from process.c. It was
++ * then enhanced to provide more registers than simply the program
++ * counter.
++ */
++
++ if (!thread_list) {
++ return;
++ }
++
++ if (p->state == TASK_RUNNING)
++ return;
++ stack_page = (unsigned long) p->thread_info;
++ if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] >
++ THREAD_SIZE - sizeof(long) + stack_page)
++ return;
++ /* include/asm-i386/system.h:switch_to() pushes ebp last. */
++ do {
++ if (gdb_regs[_EBP] < stack_page ||
++ gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page)
++ return;
++ gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4);
++ gdb_regs[_ESP] = gdb_regs[_EBP] + 8;
++ gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP];
++ if (!in_sched_functions(gdb_regs[_PC]))
++ return;
++ } while (count++ < 16);
++ return;
++}
++
++/* Indicate to caller of mem2hex or hex2mem that there has been an
++ error. */
++static volatile int mem_err = 0;
++static volatile int mem_err_expected = 0;
++static volatile int mem_err_cnt = 0;
++static int garbage_loc = -1;
++
++int
++get_char(char *addr)
++{
++ return *addr;
++}
++
++void
++set_char(char *addr, int val, int may_fault)
++{
++ /*
++ * This code traps references to the area mapped to the kernel
++ * stack as given by the regs and, instead, stores to the
++ * fn_call_lookaside[cpu].array
++ */
++ if (may_fault &&
++ (unsigned int) addr < OLD_esp &&
++ ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) {
++ addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr);
++ }
++ *addr = val;
++}
++
++/* convert the memory pointed to by mem into hex, placing result in buf */
++/* return a pointer to the last char put in buf (null) */
++/* If MAY_FAULT is non-zero, then we should set mem_err in response to
++ a fault; if zero treat a fault like any other fault in the stub. */
++char *
++mem2hex(char *mem, char *buf, int count, int may_fault)
++{
++ int i;
++ unsigned char ch;
++
++ if (may_fault) {
++ mem_err_expected = 1;
++ mem_err = 0;
++ }
++ for (i = 0; i < count; i++) {
++ /* printk("%lx = ", mem) ; */
++
++ ch = get_char(mem++);
++
++ /* printk("%02x\n", ch & 0xFF) ; */
++ if (may_fault && mem_err) {
++ if (remote_debug)
++ printk("Mem fault fetching from addr %lx\n",
++ (long) (mem - 1));
++ *buf = 0; /* truncate buffer */
++ return (buf);
++ }
++ *buf++ = hexchars[ch >> 4];
++ *buf++ = hexchars[ch % 16];
++ }
++ *buf = 0;
++ if (may_fault)
++ mem_err_expected = 0;
++ return (buf);
++}
++
++/* convert the hex array pointed to by buf into binary to be placed in mem */
++/* return a pointer to the character AFTER the last byte written */
++/* NOTE: We use the may fault flag to also indicate if the write is to
++ * the registers (0) or "other" memory (!=0)
++ */
++char *
++hex2mem(char *buf, char *mem, int count, int may_fault)
++{
++ int i;
++ unsigned char ch;
++
++ if (may_fault) {
++ mem_err_expected = 1;
++ mem_err = 0;
++ }
++ for (i = 0; i < count; i++) {
++ ch = hex(*buf++) << 4;
++ ch = ch + hex(*buf++);
++ set_char(mem++, ch, may_fault);
++
++ if (may_fault && mem_err) {
++ if (remote_debug)
++ printk("Mem fault storing to addr %lx\n",
++ (long) (mem - 1));
++ return (mem);
++ }
++ }
++ if (may_fault)
++ mem_err_expected = 0;
++ return (mem);
++}
++
++/**********************************************/
++/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */
++/* RETURN NUMBER OF CHARS PROCESSED */
++/**********************************************/
++int
++hexToInt(char **ptr, int *intValue)
++{
++ int numChars = 0;
++ int hexValue;
++
++ *intValue = 0;
++
++ while (**ptr) {
++ hexValue = hex(**ptr);
++ if (hexValue >= 0) {
++ *intValue = (*intValue << 4) | hexValue;
++ numChars++;
++ } else
++ break;
++
++ (*ptr)++;
++ }
++
++ return (numChars);
++}
++
++#define stubhex(h) hex(h)
++#ifdef old_thread_list
++
++static int
++stub_unpack_int(char *buff, int fieldlength)
++{
++ int nibble;
++ int retval = 0;
++
++ while (fieldlength) {
++ nibble = stubhex(*buff++);
++ retval |= nibble;
++ fieldlength--;
++ if (fieldlength)
++ retval = retval << 4;
++ }
++ return retval;
++}
++#endif
++static char *
++pack_hex_byte(char *pkt, int byte)
++{
++ *pkt++ = hexchars[(byte >> 4) & 0xf];
++ *pkt++ = hexchars[(byte & 0xf)];
++ return pkt;
++}
++
++#define BUF_THREAD_ID_SIZE 16
++
++static char *
++pack_threadid(char *pkt, threadref * id)
++{
++ char *limit;
++ unsigned char *altid;
++
++ altid = (unsigned char *) id;
++ limit = pkt + BUF_THREAD_ID_SIZE;
++ while (pkt < limit)
++ pkt = pack_hex_byte(pkt, *altid++);
++ return pkt;
++}
++
++#ifdef old_thread_list
++static char *
++unpack_byte(char *buf, int *value)
++{
++ *value = stub_unpack_int(buf, 2);
++ return buf + 2;
++}
++
++static char *
++unpack_threadid(char *inbuf, threadref * id)
++{
++ char *altref;
++ char *limit = inbuf + BUF_THREAD_ID_SIZE;
++ int x, y;
++
++ altref = (char *) id;
++
++ while (inbuf < limit) {
++ x = stubhex(*inbuf++);
++ y = stubhex(*inbuf++);
++ *altref++ = (x << 4) | y;
++ }
++ return inbuf;
++}
++#endif
++void
++int_to_threadref(threadref * id, int value)
++{
++ unsigned char *scan;
++
++ scan = (unsigned char *) id;
++ {
++ int i = 4;
++ while (i--)
++ *scan++ = 0;
++ }
++ *scan++ = (value >> 24) & 0xff;
++ *scan++ = (value >> 16) & 0xff;
++ *scan++ = (value >> 8) & 0xff;
++ *scan++ = (value & 0xff);
++}
++int
++int_to_hex_v(unsigned char * id, int value)
++{
++ unsigned char *start = id;
++ int shift;
++ int ch;
++
++ for (shift = 28; shift >= 0; shift -= 4) {
++ if ((ch = (value >> shift) & 0xf) || (id != start)) {
++ *id = hexchars[ch];
++ id++;
++ }
++ }
++ if (id == start)
++ *id++ = '0';
++ return id - start;
++}
++#ifdef old_thread_list
++
++static int
++threadref_to_int(threadref * ref)
++{
++ int i, value = 0;
++ unsigned char *scan;
++
++ scan = (unsigned char *) ref;
++ scan += 4;
++ i = 4;
++ while (i-- > 0)
++ value = (value << 8) | ((*scan++) & 0xff);
++ return value;
++}
++#endif
++static int
++cmp_str(char *s1, char *s2, int count)
++{
++ while (count--) {
++ if (*s1++ != *s2++)
++ return 0;
++ }
++ return 1;
++}
++
++#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */
++extern struct task_struct *kgdb_get_idle(int cpu);
++#define idle_task(cpu) kgdb_get_idle(cpu)
++#else
++#define idle_task(cpu) init_tasks[cpu]
++#endif
++
++extern int kgdb_pid_init_done;
++
++struct task_struct *
++getthread(int pid)
++{
++ struct task_struct *thread;
++ if (pid >= PID_MAX && pid < (PID_MAX + MAX_NO_CPUS)) {
++
++ return idle_task(pid - PID_MAX);
++ } else {
++ /*
++ * find_task_by_pid is relatively safe all the time
++ * Other pid functions require lock downs which imply
++ * that we may be interrupting them (as we get here
++ * in the middle of most any lock down).
++ * Still we don't want to call until the table exists!
++ */
++ if (kgdb_pid_init_done){
++ thread = find_task_by_pid(pid);
++ if (thread) {
++ return thread;
++ }
++ }
++ }
++ return NULL;
++}
++/* *INDENT-OFF* */
++struct hw_breakpoint {
++ unsigned enabled;
++ unsigned type;
++ unsigned len;
++ unsigned addr;
++} breakinfo[4] = { {enabled:0},
++ {enabled:0},
++ {enabled:0},
++ {enabled:0}};
++/* *INDENT-ON* */
++unsigned hw_breakpoint_status;
++void
++correct_hw_break(void)
++{
++ int breakno;
++ int correctit;
++ int breakbit;
++ unsigned dr7;
++
++ asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++ :);
++ /* *INDENT-OFF* */
++ do {
++ unsigned addr0, addr1, addr2, addr3;
++ asm volatile ("movl %%db0, %0\n"
++ "movl %%db1, %1\n"
++ "movl %%db2, %2\n"
++ "movl %%db3, %3\n"
++ :"=r" (addr0), "=r"(addr1),
++ "=r"(addr2), "=r"(addr3)
++ :);
++ } while (0);
++ /* *INDENT-ON* */
++ correctit = 0;
++ for (breakno = 0; breakno < 4; breakno++) {
++ breakbit = 2 << (breakno << 1);
++ if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++ correctit = 1;
++ dr7 |= breakbit;
++ dr7 &= ~(0xf0000 << (breakno << 2));
++ dr7 |= (((breakinfo[breakno].len << 2) |
++ breakinfo[breakno].type) << 16) <<
++ (breakno << 2);
++ switch (breakno) {
++ case 0:
++ asm volatile ("movl %0, %%dr0\n"::"r"
++ (breakinfo[breakno].addr));
++ break;
++
++ case 1:
++ asm volatile ("movl %0, %%dr1\n"::"r"
++ (breakinfo[breakno].addr));
++ break;
++
++ case 2:
++ asm volatile ("movl %0, %%dr2\n"::"r"
++ (breakinfo[breakno].addr));
++ break;
++
++ case 3:
++ asm volatile ("movl %0, %%dr3\n"::"r"
++ (breakinfo[breakno].addr));
++ break;
++ }
++ } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++ correctit = 1;
++ dr7 &= ~breakbit;
++ dr7 &= ~(0xf0000 << (breakno << 2));
++ }
++ }
++ if (correctit) {
++ asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++ }
++}
++
++int
++remove_hw_break(unsigned breakno)
++{
++ if (!breakinfo[breakno].enabled) {
++ return -1;
++ }
++ breakinfo[breakno].enabled = 0;
++ return 0;
++}
++
++int
++set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr)
++{
++ if (breakinfo[breakno].enabled) {
++ return -1;
++ }
++ breakinfo[breakno].enabled = 1;
++ breakinfo[breakno].type = type;
++ breakinfo[breakno].len = len;
++ breakinfo[breakno].addr = addr;
++ return 0;
++}
++
++#ifdef CONFIG_SMP
++static int in_kgdb_console = 0;
++
++int
++in_kgdb(struct pt_regs *regs)
++{
++ unsigned flags;
++ int cpu = smp_processor_id();
++ in_kgdb_called = 1;
++ if (!spin_is_locked(&kgdb_spinlock)) {
++ if (in_kgdb_here_log[cpu] || /* we are holding this cpu */
++ in_kgdb_console) { /* or we are doing slow i/o */
++ return 1;
++ }
++ return 0;
++ }
++
++ /* As I see it the only reason not to let all cpus spin on
++ * the same spin_lock is to allow selected ones to proceed.
++ * This would be a good thing, so we leave it this way.
++ * Maybe someday.... Done !
++
++ * in_kgdb() is called from an NMI so we don't pretend
++ * to have any resources, like printk() for example.
++ */
++
++ kgdb_local_irq_save(flags); /* only local here, to avoid hanging */
++ /*
++ * log arrival of this cpu
++ * The NMI keeps on ticking. Protect against recurring more
++ * than once, and ignore the cpu that has the kgdb lock
++ */
++ in_kgdb_entry_log[cpu]++;
++ in_kgdb_here_log[cpu] = regs;
++ if (cpu == spinlock_cpu || waiting_cpus[cpu].task) {
++ goto exit_in_kgdb;
++ }
++ /*
++ * For protection of the initialization of the spin locks by kgdb
++ * it locks the kgdb spinlock before it gets the wait locks set
++ * up. We wait here for the wait lock to be taken. If the
++ * kgdb lock goes away first?? Well, it could be a slow exit
++ * sequence where the wait lock is removed prior to the kgdb lock
++ * so if kgdb gets unlocked, we just exit.
++ */
++ while (spin_is_locked(&kgdb_spinlock) &&
++ !spin_is_locked(waitlocks + cpu)) ;
++ if (!spin_is_locked(&kgdb_spinlock)) {
++ goto exit_in_kgdb;
++ }
++ waiting_cpus[cpu].task = current;
++ waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu);
++ waiting_cpus[cpu].regs = regs;
++
++ spin_unlock_wait(waitlocks + cpu);
++ /*
++ * log departure of this cpu
++ */
++ waiting_cpus[cpu].task = 0;
++ waiting_cpus[cpu].pid = 0;
++ waiting_cpus[cpu].regs = 0;
++ correct_hw_break();
++ exit_in_kgdb:
++ in_kgdb_here_log[cpu] = 0;
++ kgdb_local_irq_restore(flags);
++ return 1;
++ /*
++ spin_unlock(continuelocks + smp_processor_id());
++ */
++}
++
++void
++smp__in_kgdb(struct pt_regs regs)
++{
++ ack_APIC_irq();
++ in_kgdb(&regs);
++}
++#else
++int
++in_kgdb(struct pt_regs *regs)
++{
++ return (kgdb_spinlock);
++}
++#endif
++
++void
++printexceptioninfo(int exceptionNo, int errorcode, char *buffer)
++{
++ unsigned dr6;
++ int i;
++ switch (exceptionNo) {
++ case 1: /* debug exception */
++ break;
++ case 3: /* breakpoint */
++ sprintf(buffer, "Software breakpoint");
++ return;
++ default:
++ sprintf(buffer, "Details not available");
++ return;
++ }
++ asm volatile ("movl %%db6, %0\n":"=r" (dr6)
++ :);
++ if (dr6 & 0x4000) {
++ sprintf(buffer, "Single step");
++ return;
++ }
++ for (i = 0; i < 4; ++i) {
++ if (dr6 & (1 << i)) {
++ sprintf(buffer, "Hardware breakpoint %d", i);
++ return;
++ }
++ }
++ sprintf(buffer, "Unknown trap");
++ return;
++}
++
++/*
++ * This function does all command processing for interfacing to gdb.
++ *
++ * NOTE: The INT nn instruction leaves the state of the interrupt
++ * enable flag UNCHANGED. That means that when this routine
++ * is entered via a breakpoint (INT 3) instruction from code
++ * that has interrupts enabled, then interrupts will STILL BE
++ * enabled when this routine is entered. The first thing that
++ * we do here is disable interrupts so as to prevent recursive
++ * entries and bothersome serial interrupts while we are
++ * trying to run the serial port in polled mode.
++ *
++ * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so
++ * it is always necessary to do a restore_flags before returning
++ * so as to let go of that lock.
++ */
++int
++kgdb_handle_exception(int exceptionVector,
++ int signo, int err_code, struct pt_regs *linux_regs)
++{
++ struct task_struct *usethread = NULL;
++ struct task_struct *thread_list_start = 0, *thread = NULL;
++ int addr, length;
++ int breakno, breaktype;
++ char *ptr;
++ int newPC;
++ threadref thref;
++ int threadid;
++ int thread_min = PID_MAX + MAX_NO_CPUS;
++#ifdef old_thread_list
++ int maxthreads;
++#endif
++ int nothreads;
++ unsigned long flags;
++ int gdb_regs[NUMREGBYTES / 4];
++ int dr6;
++ IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */
++#define NO_NMI 1
++#define NO_SYNC 2
++#define regs (*linux_regs)
++#define NUMREGS NUMREGBYTES/4
++ /*
++ * If the entry is not from the kernel then return to the Linux
++ * trap handler and let it process the interrupt normally.
++ */
++ if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) {
++ printk("ignoring non-kernel exception\n");
++ print_regs(&regs);
++ return (0);
++ }
++
++ kgdb_local_irq_save(flags);
++
++ /* Get kgdb spinlock */
++
++ KGDB_SPIN_LOCK(&kgdb_spinlock);
++ rdtscll(kgdb_info.entry_tsc);
++ /*
++ * We depend on this spinlock and the NMI watch dog to control the
++ * other cpus. They will arrive at "in_kgdb()" as a result of the
++ * NMI and will wait there for the following spin locks to be
++ * released.
++ */
++#ifdef CONFIG_SMP
++
++#if 0
++ if (cpu_callout_map & ~MAX_CPU_MASK) {
++ printk("kgdb : too many cpus, possibly not mapped"
++ " in contiguous space, change MAX_NO_CPUS"
++ " in kgdb_stub and make new kernel.\n"
++ " cpu_callout_map is %lx\n", cpu_callout_map);
++ goto exit_just_unlock;
++ }
++#endif
++ if (spinlock_count == 1) {
++ int time = 0, end_time, dum = 0;
++ int i;
++ int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0)
++ };
++ if (remote_debug) {
++ printk("kgdb : cpu %d entry, syncing others\n",
++ smp_processor_id());
++ }
++ for (i = 0; i < MAX_NO_CPUS; i++) {
++ /*
++ * Use trylock as we may already hold the lock if
++ * we are holding the cpu. Net result is all
++ * locked.
++ */
++ spin_trylock(&waitlocks[i]);
++ }
++ for (i = 0; i < MAX_NO_CPUS; i++)
++ cpu_logged_in[i] = 0;
++ /*
++ * Wait for their arrival. We know the watch dog is active if
++ * in_kgdb() has ever been called, as it is always called on a
++ * watchdog tick.
++ */
++ rdtsc(dum, time);
++ end_time = time + 2; /* Note: we use the High order bits! */
++ i = 1;
++ if (num_online_cpus() > 1) {
++ int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()];
++ smp_send_nmi_allbutself();
++ while (i < num_online_cpus() && time != end_time) {
++ int j;
++ for (j = 0; j < MAX_NO_CPUS; j++) {
++ if (waiting_cpus[j].task &&
++ !cpu_logged_in[j]) {
++ i++;
++ cpu_logged_in[j] = 1;
++ if (remote_debug) {
++ printk
++ ("kgdb : cpu %d arrived at kgdb\n",
++ j);
++ }
++ break;
++ } else if (!waiting_cpus[j].task &&
++ !cpu_online(j)) {
++ waiting_cpus[j].task = NOCPU;
++ cpu_logged_in[j] = 1;
++ waiting_cpus[j].hold = 1;
++ break;
++ }
++ if (!waiting_cpus[j].task &&
++ in_kgdb_here_log[j]) {
++
++ int wait = 100000;
++ while (wait--) ;
++ if (!waiting_cpus[j].task &&
++ in_kgdb_here_log[j]) {
++ printk
++ ("kgdb : cpu %d stall"
++ " in in_kgdb\n",
++ j);
++ i++;
++ cpu_logged_in[j] = 1;
++ waiting_cpus[j].task =
++ (struct task_struct
++ *) 1;
++ }
++ }
++ }
++
++ if (in_kgdb_entry_log[smp_processor_id()] >
++ (me_in_kgdb + 10)) {
++ break;
++ }
++
++ rdtsc(dum, time);
++ }
++ if (i < num_online_cpus()) {
++ printk
++ ("kgdb : time out, proceeding without sync\n");
++#if 0
++ printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n",
++ waiting_cpus[0].task != 0,
++ waiting_cpus[1].task != 0);
++ printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n",
++ cpu_logged_in[0], cpu_logged_in[1]);
++ printk
++ ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n",
++ in_kgdb_here_log[0] != 0,
++ in_kgdb_here_log[1] != 0);
++#endif
++ entry_state = NO_SYNC;
++ } else {
++#if 0
++ int ent =
++ in_kgdb_entry_log[smp_processor_id()] -
++ me_in_kgdb;
++ printk("kgdb : sync after %d entries\n", ent);
++#endif
++ }
++ } else {
++ if (remote_debug) {
++ printk
++ ("kgdb : %d cpus, but watchdog not active\n"
++ "proceeding without locking down other cpus\n",
++ num_online_cpus());
++ entry_state = NO_NMI;
++ }
++ }
++ }
++#endif
++
++ if (remote_debug) {
++ unsigned long *lp = (unsigned long *) &linux_regs;
++
++ printk("handle_exception(exceptionVector=%d, "
++ "signo=%d, err_code=%d, linux_regs=%p)\n",
++ exceptionVector, signo, err_code, linux_regs);
++ if (debug_regs) {
++ print_regs(&regs);
++ printk("Stk: %8lx %8lx %8lx %8lx"
++ " %8lx %8lx %8lx %8lx\n",
++ lp[0], lp[1], lp[2], lp[3],
++ lp[4], lp[5], lp[6], lp[7]);
++ printk(" %8lx %8lx %8lx %8lx"
++ " %8lx %8lx %8lx %8lx\n",
++ lp[8], lp[9], lp[10], lp[11],
++ lp[12], lp[13], lp[14], lp[15]);
++ printk(" %8lx %8lx %8lx %8lx "
++ "%8lx %8lx %8lx %8lx\n",
++ lp[16], lp[17], lp[18], lp[19],
++ lp[20], lp[21], lp[22], lp[23]);
++ printk(" %8lx %8lx %8lx %8lx "
++ "%8lx %8lx %8lx %8lx\n",
++ lp[24], lp[25], lp[26], lp[27],
++ lp[28], lp[29], lp[30], lp[31]);
++ }
++ }
++
++ /* Disable hardware debugging while we are in kgdb */
++ /* Get the debug register status register */
++/* *INDENT-OFF* */
++ __asm__("movl %0,%%db7"
++ : /* no output */
++ :"r"(0));
++
++ asm volatile ("movl %%db6, %0\n"
++ :"=r" (hw_breakpoint_status)
++ :);
++
++/* *INDENT-ON* */
++ switch (exceptionVector) {
++ case 0: /* divide error */
++ case 1: /* debug exception */
++ case 2: /* NMI */
++ case 3: /* breakpoint */
++ case 4: /* overflow */
++ case 5: /* bounds check */
++ case 6: /* invalid opcode */
++ case 7: /* device not available */
++ case 8: /* double fault (errcode) */
++ case 10: /* invalid TSS (errcode) */
++ case 12: /* stack fault (errcode) */
++ case 16: /* floating point error */
++ case 17: /* alignment check (errcode) */
++ default: /* any undocumented */
++ break;
++ case 11: /* segment not present (errcode) */
++ case 13: /* general protection (errcode) */
++ case 14: /* page fault (special errcode) */
++ case 19: /* cache flush denied */
++ if (mem_err_expected) {
++ /*
++ * This fault occurred because of the
++ * get_char or set_char routines. These
++ * two routines use either eax or edx to
++ * indirectly reference the location in
++ * memory that they are working with.
++ * For a page fault, when we return the
++ * instruction will be retried, so we
++ * have to make sure that these
++ * registers point to valid memory.
++ */
++ mem_err = 1; /* set mem error flag */
++ mem_err_expected = 0;
++ mem_err_cnt++; /* helps in debugging */
++ /* make valid address */
++ regs.eax = (long) &garbage_loc;
++ /* make valid address */
++ regs.edx = (long) &garbage_loc;
++ if (remote_debug)
++ printk("Return after memory error: "
++ "mem_err_cnt=%d\n", mem_err_cnt);
++ if (debug_regs)
++ print_regs(&regs);
++ goto exit_kgdb;
++ }
++ break;
++ }
++ if (remote_debug)
++ printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id());
++
++ gdb_i386vector = exceptionVector;
++ gdb_i386errcode = err_code;
++ kgdb_info.called_from = __builtin_return_address(0);
++#ifdef CONFIG_SMP
++ /*
++ * OK, we can now communicate, lets tell gdb about the sync.
++ * but only if we had a problem.
++ */
++ switch (entry_state) {
++ case NO_NMI:
++ to_gdb("NMI not active, other cpus not stopped\n");
++ break;
++ case NO_SYNC:
++ to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n");
++ default:;
++ }
++
++#endif
++/*
++ * Set up the gdb function call area.
++ */
++ trap_cpu = smp_processor_id();
++ OLD_esp = NEW_esp = (int) (&linux_regs->esp);
++
++ IF_SMP(once_again:)
++ /* reply to host that an exception has occurred */
++ remcomOutBuffer[0] = 'S';
++ remcomOutBuffer[1] = hexchars[signo >> 4];
++ remcomOutBuffer[2] = hexchars[signo % 16];
++ remcomOutBuffer[3] = 0;
++
++ putpacket(remcomOutBuffer);
++
++ while (1 == 1) {
++ error = 0;
++ remcomOutBuffer[0] = 0;
++ getpacket(remcomInBuffer);
++ switch (remcomInBuffer[0]) {
++ case '?':
++ remcomOutBuffer[0] = 'S';
++ remcomOutBuffer[1] = hexchars[signo >> 4];
++ remcomOutBuffer[2] = hexchars[signo % 16];
++ remcomOutBuffer[3] = 0;
++ break;
++ case 'd':
++ remote_debug = !(remote_debug); /* toggle debug flag */
++ printk("Remote debug %s\n",
++ remote_debug ? "on" : "off");
++ break;
++ case 'g': /* return the value of the CPU registers */
++ get_gdb_regs(usethread, &regs, gdb_regs);
++ mem2hex((char *) gdb_regs,
++ remcomOutBuffer, NUMREGBYTES, 0);
++ break;
++ case 'G': /* set the value of the CPU registers - return OK */
++ hex2mem(&remcomInBuffer[1],
++ (char *) gdb_regs, NUMREGBYTES, 0);
++ if (!usethread || usethread == current) {
++ gdb_regs_to_regs(gdb_regs, &regs);
++ strcpy(remcomOutBuffer, "OK");
++ } else {
++ strcpy(remcomOutBuffer, "E00");
++ }
++ break;
++
++ case 'P':{ /* set the value of a single CPU register -
++ return OK */
++ /*
++ * For some reason, gdb wants to talk about pseudo
++ * registers (greater than 15). These may have
++ * meaning for ptrace, but for us it is safe to
++ * ignore them. We do this by dumping them into
++ * _GS which we also ignore, but do have memory for.
++ */
++ int regno;
++
++ ptr = &remcomInBuffer[1];
++ regs_to_gdb_regs(gdb_regs, &regs);
++ if ((!usethread || usethread == current) &&
++ hexToInt(&ptr, &regno) &&
++ *ptr++ == '=' && (regno >= 0)) {
++ regno =
++ (regno >= NUMREGS ? _GS : regno);
++ hex2mem(ptr, (char *) &gdb_regs[regno],
++ 4, 0);
++ gdb_regs_to_regs(gdb_regs, &regs);
++ strcpy(remcomOutBuffer, "OK");
++ break;
++ }
++ strcpy(remcomOutBuffer, "E01");
++ break;
++ }
++
++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
++ case 'm':
++ /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */
++ ptr = &remcomInBuffer[1];
++ if (hexToInt(&ptr, &addr) &&
++ (*(ptr++) == ',') && (hexToInt(&ptr, &length))) {
++ ptr = 0;
++ /*
++ * hex doubles the byte count
++ */
++ if (length > (BUFMAX / 2))
++ length = BUFMAX / 2;
++ mem2hex((char *) addr,
++ remcomOutBuffer, length, 1);
++ if (mem_err) {
++ strcpy(remcomOutBuffer, "E03");
++ debug_error("memory fault\n", NULL);
++ }
++ }
++
++ if (ptr) {
++ strcpy(remcomOutBuffer, "E01");
++ debug_error
++ ("malformed read memory command: %s\n",
++ remcomInBuffer);
++ }
++ break;
++
++ /* MAA..AA,LLLL:
++ Write LLLL bytes at address AA.AA return OK */
++ case 'M':
++ /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */
++ ptr = &remcomInBuffer[1];
++ if (hexToInt(&ptr, &addr) &&
++ (*(ptr++) == ',') &&
++ (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) {
++ hex2mem(ptr, (char *) addr, length, 1);
++
++ if (mem_err) {
++ strcpy(remcomOutBuffer, "E03");
++ debug_error("memory fault\n", NULL);
++ } else {
++ strcpy(remcomOutBuffer, "OK");
++ }
++
++ ptr = 0;
++ }
++ if (ptr) {
++ strcpy(remcomOutBuffer, "E02");
++ debug_error
++ ("malformed write memory command: %s\n",
++ remcomInBuffer);
++ }
++ break;
++ case 'S':
++ remcomInBuffer[0] = 's';
++ case 'C':
++ /* Csig;AA..AA where ;AA..AA is optional
++ * continue with signal
++ * Since signals are meaningless to us, delete that
++ * part and then fall into the 'c' code.
++ */
++ ptr = &remcomInBuffer[1];
++ length = 2;
++ while (*ptr && *ptr != ';') {
++ length++;
++ ptr++;
++ }
++ if (*ptr) {
++ do {
++ ptr++;
++ *(ptr - length++) = *ptr;
++ } while (*ptr);
++ } else {
++ remcomInBuffer[1] = 0;
++ }
++
++ /* cAA..AA Continue at address AA..AA(optional) */
++ /* sAA..AA Step one instruction from AA..AA(optional) */
++ /* D detach, reply OK and then continue */
++ case 'c':
++ case 's':
++ case 'D':
++
++ /* try to read optional parameter,
++ pc unchanged if no parm */
++ ptr = &remcomInBuffer[1];
++ if (hexToInt(&ptr, &addr)) {
++ if (remote_debug)
++ printk("Changing EIP to 0x%x\n", addr);
++
++ regs.eip = addr;
++ }
++
++ newPC = regs.eip;
++
++ /* clear the trace bit */
++ regs.eflags &= 0xfffffeff;
++
++ /* set the trace bit if we're stepping */
++ if (remcomInBuffer[0] == 's')
++ regs.eflags |= 0x100;
++
++ /* detach is a friendly version of continue. Note that
++ debugging is still enabled (e.g. hit control-C)
++ */
++ if (remcomInBuffer[0] == 'D') {
++ strcpy(remcomOutBuffer, "OK");
++ putpacket(remcomOutBuffer);
++ }
++
++ if (remote_debug) {
++ printk("Resuming execution\n");
++ print_regs(&regs);
++ }
++ asm volatile ("movl %%db6, %0\n":"=r" (dr6)
++ :);
++ if (!(dr6 & 0x4000)) {
++ for (breakno = 0; breakno < 4; ++breakno) {
++ if (dr6 & (1 << breakno) &&
++ (breakinfo[breakno].type == 0)) {
++ /* Set restore flag */
++ regs.eflags |= 0x10000;
++ break;
++ }
++ }
++ }
++ correct_hw_break();
++ asm volatile ("movl %0, %%db6\n"::"r" (0));
++ goto exit_kgdb;
++
++ /* kill the program */
++ case 'k': /* do nothing */
++ break;
++
++ /* query */
++ case 'q':
++ nothreads = 0;
++ switch (remcomInBuffer[1]) {
++ case 'f':
++ threadid = 1;
++ thread_list = 2;
++ thread_list_start = (usethread ? : current);
++ case 's':
++ if (!cmp_str(&remcomInBuffer[2],
++ "ThreadInfo", 10))
++ break;
++
++ remcomOutBuffer[nothreads++] = 'm';
++ for (; threadid < PID_MAX + MAX_NO_CPUS;
++ threadid++) {
++ thread = getthread(threadid);
++ if (thread) {
++ nothreads += int_to_hex_v(
++ &remcomOutBuffer[
++ nothreads],
++ threadid);
++ if (thread_min > threadid)
++ thread_min = threadid;
++ remcomOutBuffer[
++ nothreads] = ',';
++ nothreads++;
++ if (nothreads > BUFMAX - 10)
++ break;
++ }
++ }
++ if (remcomOutBuffer[nothreads - 1] == 'm') {
++ remcomOutBuffer[nothreads - 1] = 'l';
++ } else {
++ nothreads--;
++ }
++ remcomOutBuffer[nothreads] = 0;
++ break;
++
++#ifdef old_thread_list /* Old thread info request */
++ case 'L':
++ /* List threads */
++ thread_list = 2;
++ thread_list_start = (usethread ? : current);
++ unpack_byte(remcomInBuffer + 3, &maxthreads);
++ unpack_threadid(remcomInBuffer + 5, &thref);
++ do {
++ int buf_thread_limit =
++ (BUFMAX - 22) / BUF_THREAD_ID_SIZE;
++ if (maxthreads > buf_thread_limit) {
++ maxthreads = buf_thread_limit;
++ }
++ } while (0);
++ remcomOutBuffer[0] = 'q';
++ remcomOutBuffer[1] = 'M';
++ remcomOutBuffer[4] = '0';
++ pack_threadid(remcomOutBuffer + 5, &thref);
++
++ threadid = threadref_to_int(&thref);
++ for (nothreads = 0;
++ nothreads < maxthreads &&
++ threadid < PID_MAX + MAX_NO_CPUS;
++ threadid++) {
++ thread = getthread(threadid);
++ if (thread) {
++ int_to_threadref(&thref,
++ threadid);
++ pack_threadid(remcomOutBuffer +
++ 21 +
++ nothreads * 16,
++ &thref);
++ nothreads++;
++ if (thread_min > threadid)
++ thread_min = threadid;
++ }
++ }
++
++ if (threadid == PID_MAX + MAX_NO_CPUS) {
++ remcomOutBuffer[4] = '1';
++ }
++ pack_hex_byte(remcomOutBuffer + 2, nothreads);
++ remcomOutBuffer[21 + nothreads * 16] = '\0';
++ break;
++#endif
++ case 'C':
++ /* Current thread id */
++ remcomOutBuffer[0] = 'Q';
++ remcomOutBuffer[1] = 'C';
++ threadid = current->pid;
++ if (!threadid) {
++ /*
++ * idle thread
++ */
++ for (threadid = PID_MAX;
++ threadid < PID_MAX + MAX_NO_CPUS;
++ threadid++) {
++ if (current ==
++ idle_task(threadid -
++ PID_MAX))
++ break;
++ }
++ }
++ int_to_threadref(&thref, threadid);
++ pack_threadid(remcomOutBuffer + 2, &thref);
++ remcomOutBuffer[18] = '\0';
++ break;
++
++ case 'E':
++ /* Print exception info */
++ printexceptioninfo(exceptionVector,
++ err_code, remcomOutBuffer);
++ break;
++ case 'T':{
++ char * nptr;
++ /* Thread extra info */
++ if (!cmp_str(&remcomInBuffer[2],
++ "hreadExtraInfo,", 15)) {
++ break;
++ }
++ ptr = &remcomInBuffer[17];
++ hexToInt(&ptr, &threadid);
++ thread = getthread(threadid);
++ nptr = &thread->comm[0];
++ length = 0;
++ ptr = &remcomOutBuffer[0];
++ do {
++ length++;
++ ptr = pack_hex_byte(ptr, *nptr++);
++ } while (*nptr && length < 16);
++ /*
++ * would like that 16 to be the size of
++ * task_struct.comm but don't know the
++ * syntax..
++ */
++ *ptr = 0;
++ }
++ }
++ break;
++
++ /* task related */
++ case 'H':
++ switch (remcomInBuffer[1]) {
++ case 'g':
++ ptr = &remcomInBuffer[2];
++ hexToInt(&ptr, &threadid);
++ thread = getthread(threadid);
++ if (!thread) {
++ remcomOutBuffer[0] = 'E';
++ remcomOutBuffer[1] = '\0';
++ break;
++ }
++ /*
++ * Just in case I forget what this is all about,
++ * the "thread info" command to gdb causes it
++ * to ask for a thread list. It then switches
++ * to each thread and asks for the registers.
++ * For this (and only this) usage, we want to
++ * fudge the registers of tasks not on the run
++ * list (i.e. waiting) to show the routine that
++ * called schedule. Also, gdb is a minimalist
++ * in that if the current thread is the last
++ * it will not re-read the info when done.
++ * This means that in this case we must show
++ * the real registers. So here is how we do it:
++ * On each entry we keep track of the min
++ * thread in the list (the last that gdb will
++ * get info for). We also keep track of the
++ * starting thread.
++ * "thread_list" is cleared when switching back
++ * to the min thread if it was current, or,
++ * if it was not current, thread_list is set
++ * to 1. When the switch to current comes,
++ * if thread_list is 1, clear it, else do
++ * nothing.
++ */
++ usethread = thread;
++ if ((thread_list == 1) &&
++ (thread == thread_list_start)) {
++ thread_list = 0;
++ }
++ if (thread_list && (threadid == thread_min)) {
++ if (thread == thread_list_start) {
++ thread_list = 0;
++ } else {
++ thread_list = 1;
++ }
++ }
++ /* fall through */
++ case 'c':
++ remcomOutBuffer[0] = 'O';
++ remcomOutBuffer[1] = 'K';
++ remcomOutBuffer[2] = '\0';
++ break;
++ }
++ break;
++
++ /* Query thread status */
++ case 'T':
++ ptr = &remcomInBuffer[1];
++ hexToInt(&ptr, &threadid);
++ thread = getthread(threadid);
++ if (thread) {
++ remcomOutBuffer[0] = 'O';
++ remcomOutBuffer[1] = 'K';
++ remcomOutBuffer[2] = '\0';
++ if (thread_min > threadid)
++ thread_min = threadid;
++ } else {
++ remcomOutBuffer[0] = 'E';
++ remcomOutBuffer[1] = '\0';
++ }
++ break;
++
++ case 'Y': /* set up a hardware breakpoint */
++ ptr = &remcomInBuffer[1];
++ hexToInt(&ptr, &breakno);
++ ptr++;
++ hexToInt(&ptr, &breaktype);
++ ptr++;
++ hexToInt(&ptr, &length);
++ ptr++;
++ hexToInt(&ptr, &addr);
++ if (set_hw_break(breakno & 0x3,
++ breaktype & 0x3,
++ length & 0x3, addr) == 0) {
++ strcpy(remcomOutBuffer, "OK");
++ } else {
++ strcpy(remcomOutBuffer, "ERROR");
++ }
++ break;
++
++ /* Remove hardware breakpoint */
++ case 'y':
++ ptr = &remcomInBuffer[1];
++ hexToInt(&ptr, &breakno);
++ if (remove_hw_break(breakno & 0x3) == 0) {
++ strcpy(remcomOutBuffer, "OK");
++ } else {
++ strcpy(remcomOutBuffer, "ERROR");
++ }
++ break;
++
++ case 'r': /* reboot */
++ strcpy(remcomOutBuffer, "OK");
++ putpacket(remcomOutBuffer);
++ /*to_gdb("Rebooting\n"); */
++ /* triplefault no return from here */
++ {
++ static long no_idt[2];
++ __asm__ __volatile__("lidt %0"::"m"(no_idt[0]));
++ BREAKPOINT;
++ }
++
++ } /* switch */
++
++ /* reply to the request */
++ putpacket(remcomOutBuffer);
++ } /* while(1==1) */
++ /*
++ * reached by goto only.
++ */
++ exit_kgdb:
++ /*
++ * Here is where we set up to trap a gdb function call. NEW_esp
++ * will be changed if we are trying to do this. We handle both
++ * adding and subtracting, thus allowing gdb to put grunge on
++ * the stack which it removes later.
++ */
++ if (NEW_esp != OLD_esp) {
++ int *ptr = END_OF_LOOKASIDE;
++ if (NEW_esp < OLD_esp)
++ ptr -= (OLD_esp - NEW_esp) / sizeof (int);
++ *--ptr = linux_regs->eflags;
++ *--ptr = linux_regs->xcs;
++ *--ptr = linux_regs->eip;
++ *--ptr = linux_regs->ecx;
++ *--ptr = linux_regs->ebx;
++ *--ptr = linux_regs->eax;
++ linux_regs->ecx = NEW_esp - (sizeof (int) * 6);
++ linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE;
++ if (NEW_esp < OLD_esp) {
++ linux_regs->eip = (unsigned int) fn_call_stub;
++ } else {
++ linux_regs->eip = (unsigned int) fn_rtn_stub;
++ linux_regs->eax = NEW_esp;
++ }
++ linux_regs->eflags &= ~(IF_BIT | TF_BIT);
++ }
++#ifdef CONFIG_SMP
++ /*
++ * Release gdb wait locks
++ * Sanity check time. Must have at least one cpu to run. Also single
++ * step must not be done if the current cpu is on hold.
++ */
++ if (spinlock_count == 1) {
++ int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep;
++ int cpu_avail = 0;
++ int i;
++
++ for (i = 0; i < MAX_NO_CPUS; i++) {
++ if (!cpu_online(i))
++ break;
++ if (!hold_cpu(i)) {
++ cpu_avail = 1;
++ }
++ }
++ /*
++ * Early in the bring up there will be NO cpus on line...
++ */
++ if (!cpu_avail && !cpus_empty(cpu_online_map)) {
++ to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n");
++ goto once_again;
++ }
++ if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) {
++ to_gdb
++ ("Current cpu must be unblocked to single step\n");
++ goto once_again;
++ }
++ if (!(ss_hold)) {
++ int i;
++ for (i = 0; i < MAX_NO_CPUS; i++) {
++ if (!hold_cpu(i)) {
++ spin_unlock(&waitlocks[i]);
++ }
++ }
++ } else {
++ spin_unlock(&waitlocks[smp_processor_id()]);
++ }
++ /* Release kgdb spinlock */
++ KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++ /*
++ * If this cpu is on hold, this is where we
++ * do it. Note, the NMI will pull us out of here,
++ * but will return as the above lock is not held.
++ * We will stay here till another cpu releases the lock for us.
++ */
++ spin_unlock_wait(waitlocks + smp_processor_id());
++ kgdb_local_irq_restore(flags);
++ return (0);
++ }
++#if 0
++exit_just_unlock:
++#endif
++#endif
++ /* Release kgdb spinlock */
++ KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++ kgdb_local_irq_restore(flags);
++ return (0);
++}
++
++/* this function is used to set up exception handlers for tracing and
++ * breakpoints.
++ * This function is not needed as the above line does all that is needed.
++ * We leave it for backward compatibility...
++ */
++void
++set_debug_traps(void)
++{
++ /*
++ * linux_debug_hook is defined in traps.c. We store a pointer
++ * to our own exception handler into it.
++
++ * But really folks, ever hear of labeled common, an old Fortran
++ * concept? Lots of folks can reference it and it is defined if
++ * anyone does. Only one can initialize it at link time. We do
++ * this with the hook. See the statement above. No need for any
++ * executable code and it is ready as soon as the kernel is
++ * loaded. Very desirable in kernel debugging.
++
++ linux_debug_hook = handle_exception ;
++ */
++
++ /* In case GDB is started before us, ack any packets (presumably
++ "$?#xx") sitting there.
++ putDebugChar ('+');
++
++ initialized = 1;
++ */
++}
++
++/* This function will generate a breakpoint exception. It is used at the
++ beginning of a program to sync up with a debugger and can be used
++ otherwise as a quick means to stop program execution and "break" into
++ the debugger. */
++/* But really, just use the BREAKPOINT macro. We will handle the int stuff
++ */
++
++#ifdef later
++/*
++ * possibly we should not go thru the traps.c code at all? Someday.
++ */
++void
++do_kgdb_int3(struct pt_regs *regs, long error_code)
++{
++ kgdb_handle_exception(3, 5, error_code, regs);
++ return;
++}
++#endif
++#undef regs
++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
++asmlinkage void
++bad_sys_call_exit(int stuff)
++{
++ struct pt_regs *regs = (struct pt_regs *) &stuff;
++ printk("Sys call %d return with %x preempt_count\n",
++ (int) regs->orig_eax, preempt_count());
++}
++#endif
++#ifdef CONFIG_STACK_OVERFLOW_TEST
++#include <asm/kgdb.h>
++asmlinkage void
++stack_overflow(void)
++{
++#ifdef BREAKPOINT
++ BREAKPOINT;
++#else
++ printk("Kernel stack overflow, looping forever\n");
++#endif
++ while (1) {
++ }
++}
++#endif
++
++#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE)
++char gdbconbuf[BUFMAX];
++
++static void
++kgdb_gdb_message(const char *s, unsigned count)
++{
++ int i;
++ int wcount;
++ char *bufptr;
++ /*
++ * This takes care of NMI while spinning out chars to gdb
++ */
++ IF_SMP(in_kgdb_console = 1);
++ gdbconbuf[0] = 'O';
++ bufptr = gdbconbuf + 1;
++ while (count > 0) {
++ if ((count << 1) > (BUFMAX - 2)) {
++ wcount = (BUFMAX - 2) >> 1;
++ } else {
++ wcount = count;
++ }
++ count -= wcount;
++ for (i = 0; i < wcount; i++) {
++ bufptr = pack_hex_byte(bufptr, s[i]);
++ }
++ *bufptr = '\0';
++ s += wcount;
++
++ putpacket(gdbconbuf);
++
++ }
++ IF_SMP(in_kgdb_console = 0);
++}
++#endif
++#ifdef CONFIG_SMP
++static void
++to_gdb(const char *s)
++{
++ int count = 0;
++ while (s[count] && (count++ < BUFMAX)) ;
++ kgdb_gdb_message(s, count);
++}
++#endif
++#ifdef CONFIG_KGDB_CONSOLE
++#include <linux/console.h>
++#include <linux/init.h>
++#include <linux/fs.h>
++#include <asm/uaccess.h>
++#include <asm/semaphore.h>
++
++void
++kgdb_console_write(struct console *co, const char *s, unsigned count)
++{
++
++ if (gdb_i386vector == -1) {
++ /*
++ * We have not yet talked to gdb. What to do...
++ * lets break, on continue we can do the write.
++ * But first tell him whats up. Uh, well no can do,
++ * as this IS the console. Oh well...
++ * We do need to wait or the messages will be lost.
++ * Other option would be to tell the above code to
++ * ignore this breakpoint and do an auto return,
++ * but that might confuse gdb. Also this happens
++ * early enough in boot up that we don't have the traps
++ * set up yet, so...
++ */
++ breakpoint();
++ }
++ kgdb_gdb_message(s, count);
++}
++
++/*
++ * ------------------------------------------------------------
++ * Serial KGDB driver
++ * ------------------------------------------------------------
++ */
++
++static struct console kgdbcons = {
++ name:"kgdb",
++ write:kgdb_console_write,
++#ifdef CONFIG_KGDB_USER_CONSOLE
++ device:kgdb_console_device,
++#endif
++ flags:CON_PRINTBUFFER | CON_ENABLED,
++ index:-1,
++};
++
++/*
++ * The trick here is that this file gets linked before printk.o
++ * That means we get to peer at the console info in the command
++ * line before it does. If we are up, we register, otherwise,
++ * do nothing. By returning 0, we allow printk to look also.
++ */
++static int kgdb_console_enabled;
++
++int __init
++kgdb_console_init(char *str)
++{
++ if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) {
++ register_console(&kgdbcons);
++ kgdb_console_enabled = 1;
++ }
++ return 0; /* let others look at the string */
++}
++
++__setup("console=", kgdb_console_init);
++
++#ifdef CONFIG_KGDB_USER_CONSOLE
++static kdev_t kgdb_console_device(struct console *c);
++/* This stuff sort of works, but it knocks out telnet devices;
++ * we are leaving it here in case we (or you) find time to figure it out
++ * better...
++ */
++
++/*
++ * We need a real char device as well for when the console is opened for user
++ * space activities.
++ */
++
++static int
++kgdb_consdev_open(struct inode *inode, struct file *file)
++{
++ return 0;
++}
++
++static ssize_t
++kgdb_consdev_write(struct file *file, const char *buf,
++ size_t count, loff_t * ppos)
++{
++ int size, ret = 0;
++ static char kbuf[128];
++ static DECLARE_MUTEX(sem);
++
++ /* We are not reentrant... */
++ if (down_interruptible(&sem))
++ return -ERESTARTSYS;
++
++ while (count > 0) {
++ /* need to copy the data from user space */
++ size = count;
++ if (size > sizeof (kbuf))
++ size = sizeof (kbuf);
++ if (copy_from_user(kbuf, buf, size)) {
++ ret = -EFAULT;
++ break;
++ }
++ kgdb_console_write(&kgdbcons, kbuf, size);
++ count -= size;
++ ret += size;
++ buf += size;
++ }
++
++ up(&sem);
++
++ return ret;
++}
++
++struct file_operations kgdb_consdev_fops = {
++ open:kgdb_consdev_open,
++ write:kgdb_consdev_write
++};
++static kdev_t
++kgdb_console_device(struct console *c)
++{
++ return MKDEV(TTYAUX_MAJOR, 1);
++}
++
++/*
++ * This routine gets called from the serial stub in the i386/lib.
++ * This is so it is done late in bring-up (just before the console open).
++ */
++void
++kgdb_console_finit(void)
++{
++ if (kgdb_console_enabled) {
++ char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1));
++ char *cp = cptr;
++ while (*cptr && *cptr != '(')
++ cptr++;
++ *cptr = 0;
++ unregister_chrdev(TTYAUX_MAJOR, cp);
++ register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops);
++ }
++}
++#endif
++#endif
++#ifdef CONFIG_KGDB_TS
++#include <asm/msr.h> /* time stamp code */
++#include <asm/hardirq.h> /* in_interrupt */
++#ifdef CONFIG_KGDB_TS_64
++#define DATA_POINTS 64
++#endif
++#ifdef CONFIG_KGDB_TS_128
++#define DATA_POINTS 128
++#endif
++#ifdef CONFIG_KGDB_TS_256
++#define DATA_POINTS 256
++#endif
++#ifdef CONFIG_KGDB_TS_512
++#define DATA_POINTS 512
++#endif
++#ifdef CONFIG_KGDB_TS_1024
++#define DATA_POINTS 1024
++#endif
++#ifndef DATA_POINTS
++#define DATA_POINTS 128 /* must be a power of two */
++#endif
++#define INDEX_MASK (DATA_POINTS - 1)
++#if (INDEX_MASK & DATA_POINTS)
++#error "CONFIG_KGDB_TS_COUNT must be a power of 2"
++#endif
++struct kgdb_and_then_struct {
++#ifdef CONFIG_SMP
++ int on_cpu;
++#endif
++ struct task_struct *task;
++ long long at_time;
++ int from_ln;
++ char *in_src;
++ void *from;
++ int *with_shpf;
++ int data0;
++ int data1;
++};
++struct kgdb_and_then_struct2 {
++#ifdef CONFIG_SMP
++ int on_cpu;
++#endif
++ struct task_struct *task;
++ long long at_time;
++ int from_ln;
++ char *in_src;
++ void *from;
++ int *with_shpf;
++ struct task_struct *t1;
++ struct task_struct *t2;
++};
++struct kgdb_and_then_struct kgdb_data[DATA_POINTS];
++
++struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0];
++int kgdb_and_then_count;
++
++void
++kgdb_tstamp(int line, char *source, int data0, int data1)
++{
++ static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED;
++ int flags;
++ kgdb_local_irq_save(flags);
++ spin_lock(&ts_spin);
++ rdtscll(kgdb_and_then->at_time);
++#ifdef CONFIG_SMP
++ kgdb_and_then->on_cpu = smp_processor_id();
++#endif
++ kgdb_and_then->task = current;
++ kgdb_and_then->from_ln = line;
++ kgdb_and_then->in_src = source;
++ kgdb_and_then->from = __builtin_return_address(0);
++ kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) |
++ (preempt_count() << 8));
++ kgdb_and_then->data0 = data0;
++ kgdb_and_then->data1 = data1;
++ kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK];
++ spin_unlock(&ts_spin);
++ kgdb_local_irq_restore(flags);
++#ifdef CONFIG_PREEMPT
++
++#endif
++ return;
++}
++#endif
++typedef int gdb_debug_hook(int exceptionVector,
++ int signo, int err_code, struct pt_regs *linux_regs);
++gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* hysterical reasons... */
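+
+A usage note on the CONFIG_KGDB_TS facility above: the kgdb_data[] ring
+buffer is filled by calling kgdb_tstamp() from the code under study. A
+minimal instrumentation sketch, assuming only the kgdb_tstamp() signature
+defined in this stub (the helper name and its data arguments are
+illustrative, not part of the patch):
+
+	extern void kgdb_tstamp(int line, char *source, int data0, int data1);
+
+	/* Record an event in the kgdb_data[] ring buffer; the entries can
+	 * be inspected from gdb at a breakpoint, e.g. "print kgdb_data[0]".
+	 */
+	static inline void my_trace_event(int data0, int data1)
+	{
+	#ifdef CONFIG_KGDB_TS
+		kgdb_tstamp(__LINE__, (char *) __FILE__, data0, data1);
+	#endif
+	}
+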
+Index: linux-2.6.10/arch/i386/kernel/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/Makefile 2005-03-31 15:35:23.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/Makefile 2005-04-05 12:48:05.254618256 +0800
+@@ -14,6 +14,7 @@
+ obj-$(CONFIG_ACPI_BOOT) += acpi/
+ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+ obj-$(CONFIG_MCA) += mca.o
++obj-$(CONFIG_KGDB) += kgdb_stub.o
+ obj-$(CONFIG_X86_MSR) += msr.o
+ obj-$(CONFIG_X86_CPUID) += cpuid.o
+ obj-$(CONFIG_MICROCODE) += microcode.o
+Index: linux-2.6.10/arch/i386/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-03-31 16:20:11.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 12:48:05.218623728 +0800
+@@ -466,7 +466,17 @@
+ {
+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+ }
+-
++#ifdef CONFIG_KGDB
++/*
++ * By using the NMI code instead of a vector we just sneak thru the
++ * word generator coming out with just what we want. AND it does
++ * not matter if clustered_apic_mode is set or not.
++ */
++void smp_send_nmi_allbutself(void)
++{
++ send_IPI_allbutself(APIC_DM_NMI);
++}
++#endif
+ /*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+Index: linux-2.6.10/arch/i386/Kconfig.kgdb
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Kconfig.kgdb 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/i386/Kconfig.kgdb 2005-04-05 12:48:05.205625704 +0800
+@@ -0,0 +1,175 @@
++config KGDB
++ bool "Include kgdb kernel debugger"
++ depends on DEBUG_KERNEL && !KPROBES
++ help
++ If you say Y here, the system will be compiled with the debug
++ option (-g) and a debugging stub will be included in the
++ kernel. This stub communicates with gdb on another (host)
++ computer via a serial port. The host computer should have
++ access to the kernel binary file (vmlinux) and a serial port
++ that is connected to the target machine. Gdb can be made to
++ configure the serial port or you can use stty and setserial to
++ do this. See the 'target' command in gdb. This option also
++ configures in the ability to request a breakpoint early in the
++ boot process. To request the breakpoint just include 'kgdb'
++ as a boot option when booting the target machine. The system
++ will then break as soon as it looks at the boot options. This
++ option also installs a breakpoint in panic and sends any
++ kernel faults to the debugger. For more information see the
++ Documentation/i386/kgdb/kgdb.txt file.
++
++choice
++ depends on KGDB
++ prompt "Debug serial port BAUD"
++ default KGDB_115200BAUD
++ help
++ Gdb and the kernel stub need to agree on the baud rate to be
++ used. Some systems (x86 family at this writing) allow this to
++ be configured.
++
++config KGDB_9600BAUD
++ bool "9600"
++
++config KGDB_19200BAUD
++ bool "19200"
++
++config KGDB_38400BAUD
++ bool "38400"
++
++config KGDB_57600BAUD
++ bool "57600"
++
++config KGDB_115200BAUD
++ bool "115200"
++endchoice
++
++config KGDB_PORT
++ hex "hex I/O port address of the debug serial port"
++ depends on KGDB
++ default 3f8
++ help
++ Some systems (x86 family at this writing) allow the port
++ address to be configured. The number entered is assumed to be
++ hex, don't put 0x in front of it. The standard addresses are:
++ COM1: 3f8, irq 4 and COM2: 2f8, irq 3. 'setserial /dev/ttySx'
++ will tell you what you have. It is good to test the serial
++ connection with a live system before trying to debug.
++
++config KGDB_IRQ
++ int "IRQ of the debug serial port"
++ depends on KGDB
++ default 4
++ help
++ This is the irq for the debug port. If everything is working
++ correctly and the kernel has interrupts on, a control-C sent to the
++ port should cause a break into the kernel debug stub.
++
++config DEBUG_INFO
++ bool
++ depends on KGDB
++ default y
++
++config KGDB_MORE
++ bool "Add any additional compile options"
++ depends on KGDB
++ default n
++ help
++ Saying yes here turns on the ability to enter additional
++ compile options.
++
++
++config KGDB_OPTIONS
++ depends on KGDB_MORE
++ string "Additional compile arguments"
++ default "-O1"
++ help
++ This option allows you to enter additional compile options for
++ the whole kernel compile. Each platform will have a default
++ that seems right for it. For example on PPC "-ggdb -O1", and
++ for i386 "-O1". Note that by configuring KGDB "-g" is already
++ turned on. In addition, on i386 platforms
++ "-fomit-frame-pointer" is deleted from the standard compile
++ options.
++
++config NO_KGDB_CPUS
++ int "Number of CPUs"
++ depends on KGDB && SMP
++ default NR_CPUS
++ help
++
++ This option sets the number of cpus for kgdb ONLY. It is used
++ to prune some internal structures so they look "nice" when
++ displayed with gdb. This is to overcome possibly larger
++ numbers that may have been entered above. Enter the real
++ number to get nice clean kgdb_info displays.
++
++config KGDB_TS
++ bool "Enable kgdb time stamp macros?"
++ depends on KGDB
++ default n
++ help
++ Kgdb event macros allow you to instrument your code with calls
++ to the kgdb event recording function. The event log may be
++ examined with gdb at a break point. Turning on this
++ capability also allows you to choose how many events to
++ keep. Kgdb always keeps the latest events.
++
++choice
++ depends on KGDB_TS
++ prompt "Max number of time stamps to save?"
++ default KGDB_TS_128
++
++config KGDB_TS_64
++ bool "64"
++
++config KGDB_TS_128
++ bool "128"
++
++config KGDB_TS_256
++ bool "256"
++
++config KGDB_TS_512
++ bool "512"
++
++config KGDB_TS_1024
++ bool "1024"
++
++endchoice
++
++config STACK_OVERFLOW_TEST
++ bool "Turn on kernel stack overflow testing?"
++ depends on KGDB
++ default n
++ help
++ This option enables code in the front line interrupt handlers
++ to check for kernel stack overflow on interrupts and system
++ calls. This is part of the kgdb code on x86 systems.
++
++config KGDB_CONSOLE
++ bool "Enable serial console thru kgdb port"
++ depends on KGDB
++ default n
++ help
++ This option enables the command line "console=kgdb" option.
++ When the system is booted with this option in the command line
++ all kernel printk output is sent to gdb (as well as to other
++ consoles). For this to work gdb must be connected. For this
++ reason, this command line option will generate a breakpoint if
++ gdb has not yet connected. After the gdb continue command is
++ given all pent up console output will be printed by gdb on the
++ host machine. Neither this option nor KGDB requires the
++ serial driver to be configured.
++
++config KGDB_SYSRQ
++ bool "Turn on SysRq 'G' command to do a break?"
++ depends on KGDB
++ default y
++ help
++ This option includes an option in the SysRq code that allows
++ you to enter SysRq G which generates a breakpoint to the KGDB
++ stub. This will work if the keyboard is alive and can
++ interrupt the system. Because of constraints on when the
++ serial port interrupt can be enabled, this code may allow you
++ to interrupt the system before the serial port control C is
++ available. Just say yes here.
++
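+
+Taken together, the options above give the usual kgdb workflow: boot the
+target with 'kgdb' on the kernel command line (optionally with
+'console=kgdb') so it breaks early, then attach gdb on the host over the
+null-modem cable. A host-side sketch, assuming the defaults chosen above
+(115200 baud; the serial device name is illustrative):
+
+	$ gdb vmlinux
+	(gdb) set remotebaud 115200
+	(gdb) target remote /dev/ttyS0
+	(gdb) continue
+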
+Index: linux-2.6.10/arch/i386/mm/fault.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/mm/fault.c 2004-12-25 05:33:48.000000000 +0800
++++ linux-2.6.10/arch/i386/mm/fault.c 2005-04-05 12:48:05.196627072 +0800
+@@ -430,6 +430,12 @@
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
++#ifdef CONFIG_KGDB
++ if (!user_mode(regs)){
++ kgdb_handle_exception(14,SIGBUS, error_code, regs);
++ return;
++ }
++#endif
+
+ bust_spinlocks(1);
+
+Index: linux-2.6.10/arch/i386/Kconfig
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Kconfig 2005-04-05 12:48:03.417897480 +0800
++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:48:05.257617800 +0800
+@@ -1196,6 +1196,14 @@
+
+ source "fs/Kconfig.binfmt"
+
++config TRAP_BAD_SYSCALL_EXITS
++ bool "Debug bad system call exits"
++ depends on KGDB
++ default n
++ help
++ If you say Y here the kernel will check for system calls which
++ return without clearing preempt.
++
+ endmenu
+
+ source "drivers/Kconfig"
+Index: linux-2.6.10/arch/i386/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Makefile 2005-03-31 15:35:27.000000000 +0800
++++ linux-2.6.10/arch/i386/Makefile 2005-04-05 12:48:05.255618104 +0800
+@@ -99,6 +99,9 @@
+ # default subarch .h files
+ mflags-y += -Iinclude/asm-i386/mach-default
+
++mflags-$(CONFIG_KGDB) += -gdwarf-2
++mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g')
++
+ head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
+
+ libs-y += arch/i386/lib/
+Index: linux-2.6.10/arch/x86_64/boot/compressed/head.S
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/boot/compressed/head.S 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/arch/x86_64/boot/compressed/head.S 2005-04-05 12:48:05.258617648 +0800
+@@ -26,6 +26,7 @@
+ .code32
+ .text
+
++#define IN_BOOTLOADER
+ #include <linux/linkage.h>
+ #include <asm/segment.h>
+
+Index: linux-2.6.10/arch/x86_64/boot/compressed/misc.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/boot/compressed/misc.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/x86_64/boot/compressed/misc.c 2005-04-05 12:48:05.259617496 +0800
+@@ -9,6 +9,7 @@
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+
++#define IN_BOOTLOADER
+ #include "miscsetup.h"
+ #include <asm/io.h>
+
+Index: linux-2.6.10/MAINTAINERS
+===================================================================
+--- linux-2.6.10.orig/MAINTAINERS 2005-03-31 15:35:24.000000000 +0800
++++ linux-2.6.10/MAINTAINERS 2005-04-05 12:48:05.181629352 +0800
+@@ -1245,6 +1245,12 @@
+ W: http://developer.osdl.org/rddunlap/kj-patches/
+ S: Maintained
+
++KGDB FOR I386 PLATFORM
++P: George Anzinger
++M: george@mvista.com
++L: linux-net@vger.kernel.org
++S: Supported
++
+ KERNEL NFSD
+ P: Neil Brown
+ M: neilb@cse.unsw.edu.au
+Index: linux-2.6.10/drivers/char/sysrq.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/sysrq.c 2005-03-31 15:57:20.000000000 +0800
++++ linux-2.6.10/drivers/char/sysrq.c 2005-04-05 12:48:05.191627832 +0800
+@@ -35,6 +35,25 @@
+ #include <linux/spinlock.h>
+
+ #include <asm/ptrace.h>
++#ifdef CONFIG_KGDB_SYSRQ
++
++#define GDB_OP &kgdb_op
++static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty)
++{
++ printk("kgdb sysrq\n");
++ breakpoint();
++}
++
++static struct sysrq_key_op kgdb_op = {
++ .handler = kgdb_sysrq,
++ .help_msg = "kGdb|Fgdb",
++ .action_msg = "Debug breakpoint\n",
++};
++
++#else
++#define GDB_OP NULL
++#endif
++
+
+ extern void reset_vc(unsigned int);
+
+@@ -249,7 +268,7 @@
+ /* d */ NULL,
+ /* e */ &sysrq_term_op,
+ /* f */ NULL,
+-/* g */ NULL,
++/* g */ GDB_OP,
+ /* h */ NULL,
+ /* i */ &sysrq_kill_op,
+ /* j */ NULL,
+Index: linux-2.6.10/drivers/char/keyboard.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/keyboard.c 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/drivers/char/keyboard.c 2005-04-05 12:48:05.190627984 +0800
+@@ -1078,6 +1078,9 @@
+ }
+ if (sysrq_down && down && !rep) {
+ handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty);
++#ifdef CONFIG_KGDB_SYSRQ
++ sysrq_down = 0; /* in case we miss the "up" event */
++#endif
+ return;
+ }
+ #endif
+Index: linux-2.6.10/drivers/serial/serial_core.c
+===================================================================
+--- linux-2.6.10.orig/drivers/serial/serial_core.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/drivers/serial/serial_core.c 2005-04-05 12:48:05.188628288 +0800
+@@ -1924,6 +1924,15 @@
+ {
+ unsigned int flags;
+
++#ifdef CONFIG_KGDB
++ {
++ extern int kgdb_irq;
++
++ if (port->irq == kgdb_irq)
++ return;
++ }
++#endif
++
+ /*
+ * If there isn't a port here, don't do anything further.
+ */
+Index: linux-2.6.10/drivers/serial/8250.c
+===================================================================
+--- linux-2.6.10.orig/drivers/serial/8250.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/drivers/serial/8250.c 2005-04-05 12:48:05.185628744 +0800
+@@ -1350,12 +1350,21 @@
+ spin_unlock_irqrestore(&up->port.lock, flags);
+ }
+
++#ifdef CONFIG_KGDB
++int kgdb_irq = -1;
++#endif
++
+ static int serial8250_startup(struct uart_port *port)
+ {
+ struct uart_8250_port *up = (struct uart_8250_port *)port;
+ unsigned long flags;
+ int retval;
+
++#ifdef CONFIG_KGDB
++ if (up->port.irq == kgdb_irq)
++ return -EBUSY;
++#endif
++
+ up->capabilities = uart_config[up->port.type].flags;
+ up->mcr = 0;
+
+@@ -2438,6 +2447,33 @@
+ }
+ EXPORT_SYMBOL(serial8250_unregister_port);
+
++#ifdef CONFIG_KGDB
++#include <linux/serialP.h>
++
++/*
++ * Find all the ports using the given irq and shut them down.
++ * Result should be that the irq will be released.
++ */
++void shutdown_for_kgdb(struct async_struct * info)
++{
++ int irq = info->state->irq;
++ struct uart_8250_port *up;
++ int ttyS;
++
++ kgdb_irq = irq; /* save for later init */
++ for (ttyS = 0; ttyS < UART_NR; ttyS++){
++ up = &serial8250_ports[ttyS];
++ if (up->port.irq == irq && (irq_lists + irq)->head) {
++#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */
++ if(up->port.lock.magic != SPINLOCK_MAGIC)
++ spin_lock_init(&up->port.lock);
++#endif
++ serial8250_shutdown(&up->port);
++ }
++ }
++}
++#endif /* CONFIG_KGDB */
++
+ static int __init serial8250_init(void)
+ {
+ int ret, i;
--- /dev/null
+
+
+The complete set of CITI NFSv4 patches combined into one patch.
+
+Changes since 2.6.10-rc3-CITI_NFS4_ALL-3:
+ * minor adjustments to xdr buffer length calculations in fs/nfs4xdr.c
+ * client acl revisions: pass acls in page array of xdr bufs, removing
+ arbitrary length restrictions. Temporarily disable acl caching.
+
+Index: linux-2.6.10/include/linux/nfsd/state.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfsd/state.h 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/include/linux/nfsd/state.h 2005-04-05 14:49:13.465682224 +0800
+@@ -67,6 +67,45 @@
+ #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
+ #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+
++/* Delegation recall states */
++#define NFS4_NO_RECALL 0x000
++#define NFS4_RECALL_IN_PROGRESS 0x001
++#define NFS4_RECALL_COMPLETE 0x002
++
++
++/* Delegation flags */
++#define NFS4_DELAY_CLOSE 0x001
++
++struct nfs4_cb_recall {
++ u32 cbr_ident;
++ int cbr_trunc;
++ stateid_t cbr_stateid;
++ u32 cbr_fhlen;
++ u32 cbr_fhval[NFS4_FHSIZE];
++ struct nfs4_delegation *cbr_dp;
++};
++
++struct nfs4_delegation {
++ struct list_head dl_del_perfile; /* nfs4_file->fi_del_perfile */
++ struct list_head dl_del_perclnt; /* nfs4_client->cl_del_perclnt*/
++ struct list_head dl_recall_lru; /* delegation recalled */
++ atomic_t dl_recall_cnt; /* resend cb_recall only once */
++ atomic_t dl_count; /* ref count */
++ atomic_t dl_state; /* recall state */
++ struct nfs4_client *dl_client;
++ struct nfs4_file *dl_file;
++ struct file_lock *dl_flock;
++ struct nfs4_stateid *dl_stp;
++ u32 dl_flags;
++ u32 dl_type;
++ time_t dl_time;
++ struct nfs4_cb_recall dl_recall;
++};
++
++#define dl_stateid dl_recall.cbr_stateid
++#define dl_fhlen dl_recall.cbr_fhlen
++#define dl_fhval dl_recall.cbr_fhval
++
+ /* client delegation callback info */
+ struct nfs4_callback {
+ /* SETCLIENTID info */
+@@ -75,9 +114,8 @@
+ unsigned short cb_port;
+ u32 cb_prog;
+ u32 cb_ident;
+- struct xdr_netobj cb_netid;
+ /* RPC client info */
+- u32 cb_set; /* successful CB_NULL call */
++ atomic_t cb_set; /* successful CB_NULL call */
+ struct rpc_program cb_program;
+ struct rpc_stat cb_stat;
+ struct rpc_clnt * cb_client;
+@@ -97,6 +135,7 @@
+ struct list_head cl_idhash; /* hash by cl_clientid.id */
+ struct list_head cl_strhash; /* hash by cl_name */
+ struct list_head cl_perclient; /* list: stateowners */
++ struct list_head cl_del_perclnt; /* list: delegations */
+ struct list_head cl_lru; /* tail queue */
+ struct xdr_netobj cl_name; /* id generated by client */
+ nfs4_verifier cl_verifier; /* generated by client */
+@@ -106,7 +145,8 @@
+ clientid_t cl_clientid; /* generated by server */
+ nfs4_verifier cl_confirm; /* generated by server */
+ struct nfs4_callback cl_callback; /* callback info */
+- time_t cl_first_state; /* first state aquisition*/
++ atomic_t cl_count; /* ref count */
++ u32 cl_firststate; /* recovery file creation */
+ };
+
+ /* struct nfs4_client_reset
+@@ -117,8 +157,6 @@
+ struct nfs4_client_reclaim {
+ struct list_head cr_strhash; /* hash by cr_name */
+ struct xdr_netobj cr_name; /* id generated by client */
+- time_t cr_first_state; /* first state aquisition */
+- u32 cr_expired; /* boolean: lease expired? */
+ };
+
+ static inline void
+@@ -194,6 +232,7 @@
+ struct nfs4_file {
+ struct list_head fi_hash; /* hash by "struct inode *" */
+ struct list_head fi_perfile; /* list: nfs4_stateid */
++ struct list_head fi_del_perfile; /* list: nfs4_delegation */
+ struct inode *fi_inode;
+ u32 fi_id; /* used with stateowner->so_id
+ * for stateid_hashtbl hash */
+@@ -231,8 +270,10 @@
+ #define CONFIRM 0x00000002
+ #define OPEN_STATE 0x00000004
+ #define LOCK_STATE 0x00000008
+-#define RDWR_STATE 0x00000010
+-#define CLOSE_STATE 0x00000020
++#define RD_STATE 0x00000010
++#define WR_STATE 0x00000020
++#define CLOSE_STATE 0x00000040
++#define DELEG_RET 0x00000080
+
+ #define seqid_mutating_err(err) \
+ (((err) != nfserr_stale_clientid) && \
+@@ -243,14 +284,24 @@
+ extern time_t nfs4_laundromat(void);
+ extern int nfsd4_renew(clientid_t *clid);
+ extern int nfs4_preprocess_stateid_op(struct svc_fh *current_fh,
+- stateid_t *stateid, int flags, struct nfs4_stateid **stpp);
++ stateid_t *stateid, int flags, struct file **filp);
+ extern int nfs4_share_conflict(struct svc_fh *current_fh,
+ unsigned int deny_type);
+ extern void nfs4_lock_state(void);
+ extern void nfs4_unlock_state(void);
+ extern int nfs4_in_grace(void);
+ extern int nfs4_check_open_reclaim(clientid_t *clid);
++extern void put_nfs4_client(struct nfs4_client *clp);
+ extern void nfs4_free_stateowner(struct kref *kref);
++extern void nfsd4_probe_callback(struct nfs4_client *clp);
++extern int nfsd4_cb_recall(struct nfs4_delegation *dp);
++extern int nfsd4_create_clid_file(struct nfs4_client *clp);
++extern void nfsd4_remove_clid_file(struct nfs4_client *clp);
++extern int nfsd4_list_rec_dir(int clear);
++extern void nfsd4_init_rec_dir(char *rec_dirname);
++extern void nfsd4_shutdown_rec_dir(void);
++extern int nfs4_client_to_reclaim(char *name, int namlen);
++
+
+ static inline void
+ nfs4_put_stateowner(struct nfs4_stateowner *so)
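+
+The recall-state constants above gate the callback path: a delegation
+moves from NFS4_NO_RECALL to NFS4_RECALL_IN_PROGRESS exactly once, and
+dl_count pins it across the CB_RECALL RPC. A minimal sketch of that
+transition (illustrative helper, not part of the patch; assumes the
+callers are serialized by nfs4_lock_state()):
+
+	static int deleg_begin_recall(struct nfs4_delegation *dp)
+	{
+		if (atomic_read(&dp->dl_state) != NFS4_NO_RECALL)
+			return 0;	/* recall already sent or done */
+		atomic_set(&dp->dl_state, NFS4_RECALL_IN_PROGRESS);
+		atomic_inc(&dp->dl_count);	/* pin across CB_RECALL */
+		return 1;
+	}
+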
+Index: linux-2.6.10/include/linux/nfsd/nfsd.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfsd/nfsd.h 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/include/linux/nfsd/nfsd.h 2005-04-05 14:49:13.464682376 +0800
+@@ -98,8 +98,12 @@
+ void nfsd_close(struct file *);
+ int nfsd_read(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct kvec *,int, unsigned long *);
++int nfsd_vfs_read(struct svc_rqst *, struct svc_fh *, struct file *,
++ loff_t, struct kvec *, int, unsigned long *);
+ int nfsd_write(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct kvec *,int, unsigned long, int *);
++int nfsd_vfs_write(struct svc_rqst *, struct svc_fh *,struct file *,
++ loff_t, struct kvec *,int, unsigned long, int *);
+ int nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+ int nfsd_symlink(struct svc_rqst *, struct svc_fh *,
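+
+The new nfsd_vfs_read/nfsd_vfs_write entry points take an already-open
+struct file, so the NFSv4 code can do I/O on the file it pinned while
+checking the stateid instead of re-opening by filehandle. Illustrative
+call sequence (rd_stateid, rd_offset and the count variable are assumed
+from the surrounding nfsd4_read code, which is not shown here):
+
+	struct file *filp;
+	unsigned long count = read->rd_length;
+	int err;
+
+	err = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid,
+					 RD_STATE, &filp);
+	if (!err)
+		err = nfsd_vfs_read(rqstp, current_fh, filp,
+				    read->rd_offset, read->rd_iov,
+				    read->rd_vlen, &count);
+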
+Index: linux-2.6.10/include/linux/nfsd/xdr4.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfsd/xdr4.h 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/include/linux/nfsd/xdr4.h 2005-04-05 14:49:13.466682072 +0800
+@@ -44,16 +44,6 @@
+ #define NFSD4_MAX_TAGLEN 128
+ #define XDR_LEN(n) (((n) + 3) & ~3)
+
+-typedef u32 delegation_zero_t;
+-typedef u32 delegation_boot_t;
+-typedef u64 delegation_id_t;
+-
+-typedef struct {
+- delegation_zero_t ds_zero;
+- delegation_boot_t ds_boot;
+- delegation_id_t ds_id;
+-} delegation_stateid_t;
+-
+ struct nfsd4_change_info {
+ u32 atomic;
+ u32 before_ctime_sec;
+@@ -104,6 +94,10 @@
+ #define cr_specdata1 u.dev.specdata1
+ #define cr_specdata2 u.dev.specdata2
+
++struct nfsd4_delegreturn {
++ stateid_t dr_stateid;
++};
++
+ struct nfsd4_getattr {
+ u32 ga_bmval[2]; /* request */
+ struct svc_fh *ga_fhp; /* response */
+@@ -202,13 +196,13 @@
+ u32 op_claim_type; /* request */
+ struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
+ u32 op_delegate_type; /* request - CLAIM_PREV only */
+- delegation_stateid_t op_delegate_stateid; /* request - CLAIM_DELEGATE_CUR only */
++ stateid_t op_delegate_stateid; /* request - response */
+ u32 op_create; /* request */
+ u32 op_createmode; /* request */
+ u32 op_bmval[2]; /* request */
+ union { /* request */
+- struct iattr iattr; /* UNCHECKED4,GUARDED4 */
+- nfs4_verifier verf; /* EXCLUSIVE4 */
++ struct iattr iattr; /* UNCHECKED4,GUARDED4 */
++ nfs4_verifier verf; /* EXCLUSIVE4 */
+ } u;
+ clientid_t op_clientid; /* request */
+ struct xdr_netobj op_owner; /* request */
+@@ -247,6 +241,7 @@
+ u32 rd_length; /* request */
+ struct kvec rd_iov[RPCSVC_MAXPAGES];
+ int rd_vlen;
++ struct file *rd_filp;
+
+ struct svc_rqst *rd_rqstp; /* response */
+ struct svc_fh * rd_fhp; /* response */
+@@ -345,6 +340,7 @@
+ struct nfsd4_close close;
+ struct nfsd4_commit commit;
+ struct nfsd4_create create;
++ struct nfsd4_delegreturn delegreturn;
+ struct nfsd4_getattr getattr;
+ struct svc_fh * getfh;
+ struct nfsd4_link link;
+@@ -456,6 +452,8 @@
+ nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ struct nfsd4_release_lockowner *rlockowner);
+ extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
++extern int nfsd4_delegreturn(struct svc_rqst *rqstp,
++ struct svc_fh *current_fh, struct nfsd4_delegreturn *dr);
+ #endif
+
+ /*
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/include/linux/fs.h 2005-04-05 14:49:13.461682832 +0800
+@@ -1185,11 +1185,6 @@
+
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+
+-/* Return value for VFS lock functions - tells locks.c to lock conventionally
+- * REALLY kosha for root NFS and nfs_lock
+- */
+-#define LOCK_USE_CLNT 1
+-
+ #define FLOCK_VERIFY_READ 1
+ #define FLOCK_VERIFY_WRITE 2
+
+Index: linux-2.6.10/include/linux/dcache.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/include/linux/dcache.h 2005-04-05 14:49:13.460682984 +0800
+@@ -200,6 +200,7 @@
+ * These are the low-level FS interfaces to the dcache..
+ */
+ extern void d_instantiate(struct dentry *, struct inode *);
++extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
+ extern void d_delete(struct dentry *);
+
+ /* allocate/de-allocate */
+@@ -244,6 +245,23 @@
+ d_rehash(entry);
+ }
+
++/**
++ * d_add_unique - add dentry to hash queues without aliasing
++ * @entry: dentry to add
++ * @inode: The inode to attach to this dentry
++ *
++ * This adds the entry to the hash queues and initializes @inode.
++ * The entry was actually filled in earlier during d_alloc().
++ */
++static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *inode)
++{
++ struct dentry *res;
++
++ res = d_instantiate_unique(entry, inode);
++ d_rehash(res != NULL ? res : entry);
++ return res;
++}
++
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
+
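+
+A caller of d_add_unique() must handle the aliased case, where an
+existing dentry for the inode is rehashed instead of @entry. Expected
+usage (a sketch of the intended NFS open path, not taken from this
+patch):
+
+	struct dentry *res;
+
+	res = d_add_unique(dentry, inode);
+	if (res != NULL) {	/* an alias already existed */
+		dput(dentry);	/* drop the one we allocated... */
+		dentry = res;	/* ...and use the alias instead */
+	}
+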
+Index: linux-2.6.10/include/linux/nfs_fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfs_fs.h 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/include/linux/nfs_fs.h 2005-04-05 14:49:13.463682528 +0800
+@@ -30,6 +30,7 @@
+ #include <linux/nfs_xdr.h>
+ #include <linux/rwsem.h>
+ #include <linux/workqueue.h>
++#include <linux/mempool.h>
+
+ /*
+ * Enable debugging support for nfs client.
+@@ -201,6 +202,7 @@
+ #define NFS_INO_INVALID_ATTR 0x0008 /* cached attrs are invalid */
+ #define NFS_INO_INVALID_DATA 0x0010 /* cached data is invalid */
+ #define NFS_INO_INVALID_ATIME 0x0020 /* cached atime is invalid */
++#define NFS_INO_INVALID_ACCESS 0x0040 /* cached access cred invalid */
+
+ static inline struct nfs_inode *NFS_I(struct inode *inode)
+ {
+@@ -239,7 +241,7 @@
+ static inline void NFS_CACHEINV(struct inode *inode)
+ {
+ if (!nfs_caches_unstable(inode))
+- NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR;
++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+ }
+
+ static inline int nfs_server_capable(struct inode *inode, int cap)
+@@ -424,6 +426,44 @@
+ return nfs_wb_page_priority(inode, page, 0);
+ }
+
++/*
++ * Allocate and free nfs_write_data structures
++ */
++extern mempool_t *nfs_wdata_mempool;
++extern mempool_t *nfs_commit_mempool;
++
++static inline struct nfs_write_data *nfs_writedata_alloc(void)
++{
++ struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
++ if (p) {
++ memset(p, 0, sizeof(*p));
++ INIT_LIST_HEAD(&p->pages);
++ }
++ return p;
++}
++
++static inline void nfs_writedata_free(struct nfs_write_data *p)
++{
++ mempool_free(p, nfs_wdata_mempool);
++}
++
++extern void nfs_writedata_release(struct rpc_task *task);
++
++static inline struct nfs_write_data *nfs_commit_alloc(void)
++{
++ struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
++ if (p) {
++ memset(p, 0, sizeof(*p));
++ INIT_LIST_HEAD(&p->pages);
++ }
++ return p;
++}
++
++static inline void nfs_commit_free(struct nfs_write_data *p)
++{
++ mempool_free(p, nfs_commit_mempool);
++}
++
+ /* Hack for future NFS swap support */
+ #ifndef IS_SWAPFILE
+ # define IS_SWAPFILE(inode) (0)
+@@ -439,6 +479,26 @@
+ extern void nfs_readpage_result(struct rpc_task *);
+
+ /*
++ * Allocate and free nfs_read_data structures
++ */
++extern mempool_t *nfs_rdata_mempool;
++
++static inline struct nfs_read_data *nfs_readdata_alloc(void)
++{
++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
++ if (p)
++ memset(p, 0, sizeof(*p));
++ return p;
++}
++
++static inline void nfs_readdata_free(struct nfs_read_data *p)
++{
++ mempool_free(p, nfs_rdata_mempool);
++}
++
++extern void nfs_readdata_release(struct rpc_task *task);
++
++/*
+ * linux/fs/mount_clnt.c
+ * (Used only by nfsroot module)
+ */
+@@ -644,6 +704,12 @@
+
+ extern struct dentry_operations nfs4_dentry_operations;
+ extern struct inode_operations nfs4_dir_inode_operations;
++extern struct inode_operations nfs4_file_inode_operations;
++
++/* inode.c */
++extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t);
++extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
+
+ /* nfs4proc.c */
+ extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short);
+@@ -651,13 +717,14 @@
+ extern int nfs4_open_reclaim(struct nfs4_state_owner *, struct nfs4_state *);
+ extern int nfs4_proc_async_renew(struct nfs4_client *);
+ extern int nfs4_proc_renew(struct nfs4_client *);
+-extern int nfs4_do_close(struct inode *, struct nfs4_state *);
+-extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
++extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode);
+ extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *);
+ extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
+ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
+ extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request);
++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen);
++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen);
+
+ /* nfs4renewd.c */
+ extern void nfs4_schedule_state_renewal(struct nfs4_client *);
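+
+The write/commit/read structures above now come from mempools rather
+than bare kmalloc, so writeback can make forward progress while the
+page allocator is under pressure. A typical caller (illustrative; the
+NULL check is defensive, matching the checks in the allocators above):
+
+	struct nfs_write_data *data = nfs_writedata_alloc();
+
+	if (data == NULL)
+		return -ENOMEM;
+	/* ... fill in the request and issue the WRITE ... */
+	nfs_writedata_free(data);
+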
+Index: linux-2.6.10/include/linux/nfs4.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfs4.h 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/include/linux/nfs4.h 2005-04-05 14:49:13.474680856 +0800
+@@ -28,7 +28,7 @@
+ #define NFS4_ACCESS_DELETE 0x0010
+ #define NFS4_ACCESS_EXECUTE 0x0020
+
+-#define NFS4_FH_PERISTENT 0x0000
++#define NFS4_FH_PERSISTENT 0x0000
+ #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001
+ #define NFS4_FH_VOLATILE_ANY 0x0002
+ #define NFS4_FH_VOL_MIGRATION 0x0004
+@@ -382,6 +382,8 @@
+ NFSPROC4_CLNT_READDIR,
+ NFSPROC4_CLNT_SERVER_CAPS,
+ NFSPROC4_CLNT_DELEGRETURN,
++ NFSPROC4_CLNT_GETACL,
++ NFSPROC4_CLNT_SETACL,
+ };
+
+ #endif
+Index: linux-2.6.10/include/linux/sunrpc/auth.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/auth.h 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/auth.h 2005-04-05 14:49:13.468681768 +0800
+@@ -51,7 +51,6 @@
+ };
+ #define RPCAUTH_CRED_LOCKED 0x0001
+ #define RPCAUTH_CRED_UPTODATE 0x0002
+-#define RPCAUTH_CRED_DEAD 0x0004
+
+ #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0
+
+@@ -133,7 +132,6 @@
+ int rpcauth_refreshcred(struct rpc_task *);
+ void rpcauth_invalcred(struct rpc_task *);
+ int rpcauth_uptodatecred(struct rpc_task *);
+-int rpcauth_deadcred(struct rpc_task *);
+ void rpcauth_init_credcache(struct rpc_auth *);
+ void rpcauth_free_credcache(struct rpc_auth *);
+
+Index: linux-2.6.10/include/linux/sunrpc/svc.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/svc.h 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/svc.h 2005-04-05 14:49:13.467681920 +0800
+@@ -251,8 +251,7 @@
+ char * pg_name; /* service name */
+ char * pg_class; /* class name: services sharing authentication */
+ struct svc_stat * pg_stats; /* rpc statistics */
+- /* Override authentication. NULL means use default */
+- int (*pg_authenticate)(struct svc_rqst *, u32 *);
++ int (*pg_authenticate)(struct svc_rqst *);
+ };
+
+ /*
+Index: linux-2.6.10/include/linux/sunrpc/cache.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/cache.h 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/cache.h 2005-04-05 14:49:13.470681464 +0800
+@@ -128,20 +128,17 @@
+ * just like a template in C++, this macro does cache lookup
+ * for us.
+ * The function is passed some sort of HANDLE from which a cache_detail
+- * structure can be determined (via SETUP, DETAIL), a template
++ * structure can be determined (via DETAIL), a template
+ * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the
+ * TEST, the function will try to find a matching cache entry in the cache.
+ * If "set" == 0 :
+ * If an entry is found, it is returned
+ * If no entry is found, a new non-VALID entry is created.
+- * If "set" == 1 and INPLACE == 0 :
++ * If "set" == 1:
+ * If no entry is found a new one is inserted with data from "template"
+ * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE
+ * If a CACHE_VALID entry is found, a new entry is swapped in with data
+ * from "template"
+- * If set == 1, and INPLACE == 1 :
+- * As above, except that if a CACHE_VALID entry is found, we UPDATE in place
+- * instead of swapping in a new entry.
+ *
+ * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not
+ * run but instead CACHE_NEGATIVE is set in any new item.
+@@ -153,25 +150,22 @@
+ * MEMBER is the member of the cache which is cache_head, which must be first
+ * FNAME is the name for the function
+ * ARGS are arguments to function and must contain RTN *item, int set. May
+- * also contain something to be usedby SETUP or DETAIL to find cache_detail.
+- * SETUP locates the cache detail and makes it available as...
+- * DETAIL identifies the cache detail, possibly set up by SETUP
++ * also contain something to be used by DETAIL to find cache_detail.
++ * DETAIL identifies the cache detail
+ * HASHFN returns a hash value of the cache entry "item"
+ * TEST tests if "tmp" matches "item"
+ * INIT copies key information from "item" to "new"
+ * UPDATE copies content information from "item" to "tmp"
+- * INPLACE is true if updates can happen inplace rather than allocating a new structure
+ *
+ * WARNING: any substantial changes to this must be reflected in
+ * net/sunrpc/svcauth.c(auth_domain_lookup)
+ * which is a similar routine that is open-coded.
+ */
+-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \
++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE) \
+ RTN *FNAME ARGS \
+ { \
+ RTN *tmp, *new=NULL; \
+ struct cache_head **hp, **head; \
+- SETUP; \
+ head = &(DETAIL)->hash_table[HASHFN]; \
+ retry: \
+ if (set||new) write_lock(&(DETAIL)->hash_lock); \
+@@ -180,14 +174,14 @@
+ tmp = container_of(*hp, RTN, MEMBER); \
+ if (TEST) { /* found a match */ \
+ \
+- if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
++ if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
+ break; \
+ \
+ if (new) \
+ {INIT;} \
+ cache_get(&tmp->MEMBER); \
+ if (set) { \
+- if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
++ if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
+ { /* need to swap in new */ \
+ RTN *t2; \
+ \
+@@ -209,7 +203,7 @@
+ else read_unlock(&(DETAIL)->hash_lock); \
+ if (set) \
+ cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
+- if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \
++ if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \
+ if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \
+ return tmp; \
+ } \
+@@ -242,10 +236,10 @@
+ return NULL; \
+ }
+
+-#define DefineSimpleCacheLookup(STRUCT,INPLACE) \
+- DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \
++#define DefineSimpleCacheLookup(STRUCT) \
++ DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), \
+ & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\
+- STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE)
++ STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+
+ #define cache_for_each(pos, detail, index, member) \
+ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \
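+
+For each cache the macro generates a STRUCT##_lookup() function; the
+svcauth_unix.c change later in this patch declares
+DefineSimpleCacheLookup(ip_map) and then calls ip_map_lookup(&key, 0).
+In outline (a paraphrase of the expansion, not the literal code):
+
+	struct ip_map *ip_map_lookup(struct ip_map *item, int set);
+	/* set == 0: return a matching entry, or insert a non-VALID
+	 *           placeholder carrying item's key
+	 * set == 1: insert/update from item, swapping a new entry in
+	 *           over any CACHE_VALID match */
+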
+Index: linux-2.6.10/include/linux/sunrpc/sched.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/sched.h 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/sched.h 2005-04-05 14:49:13.472681160 +0800
+@@ -11,7 +11,9 @@
+
+ #include <linux/timer.h>
+ #include <linux/sunrpc/types.h>
++#include <linux/spinlock.h>
+ #include <linux/wait.h>
++#include <linux/workqueue.h>
+ #include <linux/sunrpc/xdr.h>
+
+ /*
+@@ -25,11 +27,18 @@
+ struct rpc_cred * rpc_cred; /* Credentials */
+ };
+
++struct rpc_wait_queue;
++struct rpc_wait {
++ struct list_head list; /* wait queue links */
++ struct list_head links; /* Links to related tasks */
++ wait_queue_head_t waitq; /* sync: sleep on this q */
++ struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */
++};
++
+ /*
+ * This is the RPC task struct
+ */
+ struct rpc_task {
+- struct list_head tk_list; /* wait queue links */
+ #ifdef RPC_DEBUG
+ unsigned long tk_magic; /* 0xf00baa */
+ #endif
+@@ -37,7 +46,6 @@
+ struct rpc_clnt * tk_client; /* RPC client */
+ struct rpc_rqst * tk_rqstp; /* RPC request */
+ int tk_status; /* result of last operation */
+- struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */
+
+ /*
+ * RPC call state
+@@ -70,13 +78,18 @@
+ * you have a pathological interest in kernel oopses.
+ */
+ struct timer_list tk_timer; /* kernel timer */
+- wait_queue_head_t tk_wait; /* sync: sleep on this q */
+ unsigned long tk_timeout; /* timeout for rpc_sleep() */
+ unsigned short tk_flags; /* misc flags */
+ unsigned char tk_active : 1;/* Task has been activated */
+ unsigned char tk_priority : 2;/* Task priority */
+ unsigned long tk_runstate; /* Task run status */
+- struct list_head tk_links; /* links to related tasks */
++ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could
++ * be any workqueue
++ */
++ union {
++ struct work_struct tk_work; /* Async task work queue */
++ struct rpc_wait tk_wait; /* RPC wait */
++ } u;
+ #ifdef RPC_DEBUG
+ unsigned short tk_pid; /* debugging aid */
+ #endif
+@@ -87,11 +100,11 @@
+ /* support walking a list of tasks on a wait queue */
+ #define task_for_each(task, pos, head) \
+ list_for_each(pos, head) \
+- if ((task=list_entry(pos, struct rpc_task, tk_list)),1)
++ if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1)
+
+ #define task_for_first(task, head) \
+ if (!list_empty(head) && \
+- ((task=list_entry((head)->next, struct rpc_task, tk_list)),1))
++ ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1))
+
+ /* .. and walking list of all tasks */
+ #define alltask_for_each(task, pos, head) \
+@@ -126,22 +139,39 @@
+ #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT)
+ #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR)
+
+-#define RPC_TASK_SLEEPING 0
+-#define RPC_TASK_RUNNING 1
+-#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
+-#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define RPC_TASK_RUNNING 0
++#define RPC_TASK_QUEUED 1
++#define RPC_TASK_WAKEUP 2
++#define RPC_TASK_HAS_TIMER 3
+
++#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+ #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+-#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define rpc_test_and_set_running(t) \
++ (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define rpc_clear_running(t) \
++ do { \
++ smp_mb__before_clear_bit(); \
++ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \
++ smp_mb__after_clear_bit(); \
++ } while (0)
+
+-#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
++#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_clear_queued(t) \
++ do { \
++ smp_mb__before_clear_bit(); \
++ clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \
++ smp_mb__after_clear_bit(); \
++ } while (0)
+
+-#define rpc_clear_sleeping(t) \
++#define rpc_start_wakeup(t) \
++ (test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0)
++#define rpc_finish_wakeup(t) \
+ do { \
+ smp_mb__before_clear_bit(); \
+- clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \
++ clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \
+ smp_mb__after_clear_bit(); \
+- } while(0)
++ } while (0)
+
+ /*
+ * Task priorities.
+@@ -157,6 +187,7 @@
+ * RPC synchronization objects
+ */
+ struct rpc_wait_queue {
++ spinlock_t lock;
+ struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */
+ unsigned long cookie; /* cookie of last task serviced */
+ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */
+@@ -177,6 +208,7 @@
+
+ #ifndef RPC_DEBUG
+ # define RPC_WAITQ_INIT(var,qname) { \
++ .lock = SPIN_LOCK_UNLOCKED, \
+ .tasks = { \
+ [0] = LIST_HEAD_INIT(var.tasks[0]), \
+ [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -185,6 +217,7 @@
+ }
+ #else
+ # define RPC_WAITQ_INIT(var,qname) { \
++ .lock = SPIN_LOCK_UNLOCKED, \
+ .tasks = { \
+ [0] = LIST_HEAD_INIT(var.tasks[0]), \
+ [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -209,13 +242,10 @@
+ int rpc_execute(struct rpc_task *);
+ void rpc_run_child(struct rpc_task *parent, struct rpc_task *child,
+ rpc_action action);
+-int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *);
+-void rpc_remove_wait_queue(struct rpc_task *);
+ void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
+ void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
+ void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
+ rpc_action action, rpc_action timer);
+-void rpc_add_timer(struct rpc_task *, rpc_action);
+ void rpc_wake_up_task(struct rpc_task *);
+ void rpc_wake_up(struct rpc_wait_queue *);
+ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
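+
+The RPC_TASK_WAKEUP bit turns wakeups into a one-shot handoff:
+rpc_start_wakeup() is a test_and_set, so only the caller that wins it
+may touch the task, and rpc_finish_wakeup() publishes the result with
+barriers around the clear_bit. Sketch of the intended pairing (the
+body is a placeholder for the queue-specific wakeup work):
+
+	if (rpc_start_wakeup(task)) {
+		/* we own this wakeup; nobody else may requeue the task */
+		/* ... dequeue from the rpc_wait_queue, mark running ... */
+		rpc_finish_wakeup(task);
+	}
+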
+Index: linux-2.6.10/include/linux/sunrpc/gss_krb5.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/gss_krb5.h 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/gss_krb5.h 2005-04-05 14:49:13.473681008 +0800
+@@ -53,6 +53,8 @@
+ struct xdr_netobj mech_used;
+ };
+
++extern spinlock_t krb5_seq_lock;
++
+ #define KG_TOK_MIC_MSG 0x0101
+ #define KG_TOK_WRAP_MSG 0x0201
+
+@@ -116,18 +118,25 @@
+
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+- struct xdr_netobj *cksum);
++ int body_offset, struct xdr_netobj *cksum);
+
+ u32
+ krb5_make_token(struct krb5_ctx *context_handle, int qop_req,
+ struct xdr_buf *input_message_buffer,
+- struct xdr_netobj *output_message_buffer, int toktype);
++ struct xdr_netobj *output_message_buffer);
+
+ u32
+ krb5_read_token(struct krb5_ctx *context_handle,
+ struct xdr_netobj *input_token_buffer,
+- struct xdr_buf *message_buffer,
+- int *qop_state, int toktype);
++ struct xdr_buf *message_buffer, int *qop_state);
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset,
++ struct xdr_buf *outbuf, struct page **pages);
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset,
++ struct xdr_buf *buf, int *out_offset);
+
+ u32
+ krb5_encrypt(struct crypto_tfm * key,
+@@ -137,6 +146,13 @@
+ krb5_decrypt(struct crypto_tfm * key,
+ void *iv, void *in, void *out, int length);
+
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset,
++ struct page **pages);
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset);
++
+ s32
+ krb5_make_seq_num(struct crypto_tfm * key,
+ int direction,
+Index: linux-2.6.10/include/linux/sunrpc/xdr.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/xdr.h 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/xdr.h 2005-04-05 14:49:13.467681920 +0800
+@@ -192,6 +192,7 @@
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p);
+ extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.10/include/linux/sunrpc/gss_api.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/gss_api.h 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/gss_api.h 2005-04-05 14:49:13.471681312 +0800
+@@ -47,6 +47,18 @@
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token,
+ u32 *qstate);
++u32 gss_wrap(
++ struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *outbuf,
++ struct page **inpages);
++u32 gss_unwrap(
++ struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *inbuf,
++ int *out_offset);
+ u32 gss_delete_sec_context(
+ struct gss_ctx **ctx_id);
+
+@@ -93,6 +105,18 @@
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token,
+ u32 *qstate);
++ u32 (*gss_wrap)(
++ struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *outbuf,
++ struct page **inpages);
++ u32 (*gss_unwrap)(
++ struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *buf,
++ int *out_offset);
+ void (*gss_delete_sec_context)(
+ void *internal_ctx_id);
+ };
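+
+gss_wrap()/gss_unwrap() give privacy the same shape on both ends: the
+sender wraps everything in the xdr_buf past "offset" in place (using
+**inpages as scratch), and the receiver unwraps and reports through
+*out_offset where the cleartext now begins. Illustrative flow (the
+buffer names are assumed from struct rpc_rqst):
+
+	u32 maj;
+	int data_offset;
+
+	maj = gss_wrap(ctx, GSS_C_QOP_DEFAULT, offset,
+		       &req->rq_snd_buf, pages);		/* sender */
+	/* ... transmit, receive ... */
+	maj = gss_unwrap(ctx, NULL, offset,
+			 &req->rq_rcv_buf, &data_offset);	/* receiver */
+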
+Index: linux-2.6.10/include/linux/sunrpc/svcauth.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/svcauth.h 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/svcauth.h 2005-04-05 14:49:13.469681616 +0800
+@@ -26,21 +26,23 @@
+ struct svc_rqst; /* forward decl */
+
+ /* Authentication is done in the context of a domain.
+- * For a server, a domain represents a group of clients using
++ *
++ * Currently, the nfs server uses the auth_domain to stand
++ * for the "client" listed in /etc/exports.
++ *
++ * More generally, a domain might represent a group of clients using
+ * a common mechanism for authentication and having a common mapping
+ * between local identity (uid) and network identity. All clients
+ * in a domain have similar general access rights. Each domain can
+ * contain multiple principals which will have different specific right
+ * based on normal Discretionary Access Control.
+ *
+- * For a client, a domain represents a number of servers which all
+- * use a common authentication mechanism and network identity name space.
+- *
+ * A domain is created by an authentication flavour module based on name
+ * only. Userspace then fills in detail on demand.
+ *
+- * The creation of a domain typically implies creation of one or
+- * more caches for storing domain specific information.
++ * In the case of auth_unix and auth_null, the auth_domain is also
++ * associated with entries in another cache representing the mapping
++ * of ip addresses to the given client.
+ */
+ struct auth_domain {
+ struct cache_head h;
+@@ -92,6 +94,7 @@
+ int (*accept)(struct svc_rqst *rq, u32 *authp);
+ int (*release)(struct svc_rqst *rq);
+ void (*domain_release)(struct auth_domain *);
++ int (*set_client)(struct svc_rqst *rq);
+ };
+
+ #define SVC_GARBAGE 1
+@@ -107,6 +110,7 @@
+
+ extern int svc_authenticate(struct svc_rqst *rqstp, u32 *authp);
+ extern int svc_authorise(struct svc_rqst *rqstp);
++extern int svc_set_client(struct svc_rqst *rqstp);
+ extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops);
+ extern void svc_auth_unregister(rpc_authflavor_t flavor);
+
+Index: linux-2.6.10/include/linux/sunrpc/xprt.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sunrpc/xprt.h 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/include/linux/sunrpc/xprt.h 2005-04-05 14:49:13.471681312 +0800
+@@ -95,7 +95,10 @@
+ int rq_cong; /* has incremented xprt->cong */
+ int rq_received; /* receive completed */
+ u32 rq_seqno; /* gss seq no. used on req. */
+-
++ int rq_enc_pages_num;
++ struct page **rq_enc_pages; /* scratch pages for use by
++ gss privacy code */
++ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
+ struct list_head rq_list;
+
+ struct xdr_buf rq_private_buf; /* The receive buffer
+Index: linux-2.6.10/include/linux/nfs_xdr.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/nfs_xdr.h 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/include/linux/nfs_xdr.h 2005-04-05 14:49:13.459683136 +0800
+@@ -326,6 +326,20 @@
+ const u32 * bitmask;
+ };
+
++struct nfs_setaclargs {
++ struct nfs_fh * fh;
++ ssize_t acl_len;
++ unsigned int acl_pgbase;
++ struct page ** acl_pages;
++};
++
++struct nfs_getaclargs {
++ struct nfs_fh * fh;
++ ssize_t acl_len;
++ unsigned int acl_pgbase;
++ struct page ** acl_pages;
++};
++
+ struct nfs_setattrres {
+ struct nfs_fattr * fattr;
+ const struct nfs_server * server;
+@@ -666,6 +680,7 @@
+ int version; /* Protocol version */
+ struct dentry_operations *dentry_ops;
+ struct inode_operations *dir_inode_ops;
++ struct inode_operations *file_inode_ops;
+
+ int (*getroot) (struct nfs_server *, struct nfs_fh *,
+ struct nfs_fsinfo *);
+@@ -681,7 +696,7 @@
+ int (*read) (struct nfs_read_data *);
+ int (*write) (struct nfs_write_data *);
+ int (*commit) (struct nfs_write_data *);
+- struct inode * (*create) (struct inode *, struct qstr *,
++ struct inode * (*create) (struct inode *, struct dentry *,
+ struct iattr *, int);
+ int (*remove) (struct inode *, struct qstr *);
+ int (*unlink_setup) (struct rpc_message *,
+Index: linux-2.6.10/net/sunrpc/xprt.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/xprt.c 2004-12-25 05:35:14.000000000 +0800
++++ linux-2.6.10/net/sunrpc/xprt.c 2005-04-05 14:49:13.393693168 +0800
+@@ -891,7 +891,8 @@
+ xprt->tcp_flags &= ~XPRT_COPY_XID;
+ xprt->tcp_flags |= XPRT_COPY_DATA;
+ xprt->tcp_copied = 4;
+- dprintk("RPC: reading reply for XID %08x\n", xprt->tcp_xid);
++ dprintk("RPC: reading reply for XID %08x\n",
++ ntohl(xprt->tcp_xid));
+ tcp_check_recm(xprt);
+ }
+
+@@ -911,7 +912,7 @@
+ if (!req) {
+ xprt->tcp_flags &= ~XPRT_COPY_DATA;
+ dprintk("RPC: XID %08x request not found!\n",
+- xprt->tcp_xid);
++ ntohl(xprt->tcp_xid));
+ spin_unlock(&xprt->sock_lock);
+ return;
+ }
+@@ -1101,7 +1102,7 @@
+ goto out;
+
+ spin_lock_bh(&xprt->sock_lock);
+- if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending)
++ if (xprt->snd_task)
+ rpc_wake_up_task(xprt->snd_task);
+ spin_unlock_bh(&xprt->sock_lock);
+ out:
+@@ -1359,8 +1360,9 @@
+ req->rq_task = task;
+ req->rq_xprt = xprt;
+ req->rq_xid = xprt_alloc_xid(xprt);
++ req->rq_release_snd_buf = NULL;
+ dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid,
+- req, req->rq_xid);
++ req, ntohl(req->rq_xid));
+ }
+
+ /*
+@@ -1384,6 +1386,8 @@
+ mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT);
+ spin_unlock_bh(&xprt->sock_lock);
+ task->tk_rqstp = NULL;
++ if (req->rq_release_snd_buf)
++ req->rq_release_snd_buf(req);
+ memset(req, 0, sizeof(*req)); /* mark unused */
+
+ dprintk("RPC: %4d release request %p\n", task->tk_pid, req);
+Index: linux-2.6.10/net/sunrpc/auth.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth.c 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth.c 2005-04-05 14:49:13.394693016 +0800
+@@ -214,8 +214,6 @@
+ list_for_each_safe(pos, next, &auth->au_credcache[nr]) {
+ struct rpc_cred *entry;
+ entry = list_entry(pos, struct rpc_cred, cr_hash);
+- if (entry->cr_flags & RPCAUTH_CRED_DEAD)
+- continue;
+ if (rpcauth_prune_expired(entry, &free))
+ continue;
+ if (entry->cr_ops->crmatch(acred, entry, taskflags)) {
+@@ -307,9 +305,6 @@
+ if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
+ return;
+
+- if ((cred->cr_flags & RPCAUTH_CRED_DEAD) && !list_empty(&cred->cr_hash))
+- list_del_init(&cred->cr_hash);
+-
+ if (list_empty(&cred->cr_hash)) {
+ spin_unlock(&rpc_credcache_lock);
+ rpcauth_crdestroy(cred);
+@@ -413,10 +408,3 @@
+ return !(task->tk_msg.rpc_cred) ||
+ (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE);
+ }
+-
+-int
+-rpcauth_deadcred(struct rpc_task *task)
+-{
+- return !(task->tk_msg.rpc_cred) ||
+- (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_DEAD);
+-}
+Index: linux-2.6.10/net/sunrpc/svcauth_unix.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/svcauth_unix.c 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/net/sunrpc/svcauth_unix.c 2005-04-05 14:49:13.395692864 +0800
+@@ -97,7 +97,7 @@
+ };
+ static struct cache_head *ip_table[IP_HASHMAX];
+
+-void ip_map_put(struct cache_head *item, struct cache_detail *cd)
++static void ip_map_put(struct cache_head *item, struct cache_detail *cd)
+ {
+ struct ip_map *im = container_of(item, struct ip_map,h);
+ if (cache_put(item, cd)) {
+@@ -258,7 +258,7 @@
+ .cache_show = ip_map_show,
+ };
+
+-static DefineSimpleCacheLookup(ip_map, 0)
++static DefineSimpleCacheLookup(ip_map)
+
+
+ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
+@@ -329,14 +329,49 @@
+ cache_purge(&auth_domain_cache);
+ }
+
++int
++svcauth_unix_set_client(struct svc_rqst *rqstp)
++{
++ struct ip_map key, *ipm;
++
++ rqstp->rq_client = NULL;
++ if (rqstp->rq_proc == 0)
++ return SVC_OK;
++
++ strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class);
++ key.m_addr = rqstp->rq_addr.sin_addr;
++
++ ipm = ip_map_lookup(&key, 0);
++
++ if (ipm == NULL)
++ return SVC_DENIED;
++
++ switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
++ case -EAGAIN:
++ return SVC_DROP;
++ case -ENOENT:
++ return SVC_DENIED;
++ case 0:
++ rqstp->rq_client = &ipm->m_client->h;
++ cache_get(&rqstp->rq_client->h);
++ ip_map_put(&ipm->h, &ip_map_cache);
++ return SVC_OK;
++ default:
++ BUG();
++ }
++ /* shut up gcc: */
++ return -1;
++}
+
+ static int
+ svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp)
+ {
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+- int rv=0;
+- struct ip_map key, *ipm;
++ struct svc_cred *cred = &rqstp->rq_cred;
++
++ cred->cr_group_info = NULL;
++ rqstp->rq_client = NULL;
+
+ if (argv->iov_len < 3*4)
+ return SVC_GARBAGE;
+@@ -353,45 +388,17 @@
+ }
+
+ /* Signal that mapping to nobody uid/gid is required */
+- rqstp->rq_cred.cr_uid = (uid_t) -1;
+- rqstp->rq_cred.cr_gid = (gid_t) -1;
+- rqstp->rq_cred.cr_group_info = groups_alloc(0);
+- if (rqstp->rq_cred.cr_group_info == NULL)
++ cred->cr_uid = (uid_t) -1;
++ cred->cr_gid = (gid_t) -1;
++ cred->cr_group_info = groups_alloc(0);
++ if (cred->cr_group_info == NULL)
+ return SVC_DROP; /* kmalloc failure - client must retry */
+
+ /* Put NULL verifier */
+ svc_putu32(resv, RPC_AUTH_NULL);
+ svc_putu32(resv, 0);
+
+- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class);
+- key.m_addr = rqstp->rq_addr.sin_addr;
+-
+- ipm = ip_map_lookup(&key, 0);
+-
+- rqstp->rq_client = NULL;
+-
+- if (ipm)
+- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+- case -EAGAIN:
+- rv = SVC_DROP;
+- break;
+- case -ENOENT:
+- rv = SVC_OK; /* rq_client is NULL */
+- break;
+- case 0:
+- rqstp->rq_client = &ipm->m_client->h;
+- cache_get(&rqstp->rq_client->h);
+- ip_map_put(&ipm->h, &ip_map_cache);
+- rv = SVC_OK;
+- break;
+- default: BUG();
+- }
+- else rv = SVC_DROP;
+-
+- if (rqstp->rq_client == NULL && rqstp->rq_proc != 0)
+- *authp = rpc_autherr_badcred;
+-
+- return rv;
++ return SVC_OK;
+ }
+
+ static int
+@@ -414,6 +421,7 @@
+ .flavour = RPC_AUTH_NULL,
+ .accept = svcauth_null_accept,
+ .release = svcauth_null_release,
++ .set_client = svcauth_unix_set_client,
+ };
+
+
+@@ -425,8 +433,6 @@
+ struct svc_cred *cred = &rqstp->rq_cred;
+ u32 slen, i;
+ int len = argv->iov_len;
+- int rv=0;
+- struct ip_map key, *ipm;
+
+ cred->cr_group_info = NULL;
+ rqstp->rq_client = NULL;
+@@ -458,39 +464,11 @@
+ return SVC_DENIED;
+ }
+
+-
+- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class);
+- key.m_addr = rqstp->rq_addr.sin_addr;
+-
+-
+- ipm = ip_map_lookup(&key, 0);
+-
+- if (ipm)
+- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+- case -EAGAIN:
+- rv = SVC_DROP;
+- break;
+- case -ENOENT:
+- rv = SVC_OK; /* rq_client is NULL */
+- break;
+- case 0:
+- rqstp->rq_client = &ipm->m_client->h;
+- cache_get(&rqstp->rq_client->h);
+- ip_map_put(&ipm->h, &ip_map_cache);
+- rv = SVC_OK;
+- break;
+- default: BUG();
+- }
+- else rv = SVC_DROP;
+-
+- if (rv == SVC_OK && rqstp->rq_client == NULL && rqstp->rq_proc != 0)
+- goto badcred;
+-
+ /* Put NULL verifier */
+ svc_putu32(resv, RPC_AUTH_NULL);
+ svc_putu32(resv, 0);
+
+- return rv;
++ return SVC_OK;
+
+ badcred:
+ *authp = rpc_autherr_badcred;
+@@ -520,5 +498,6 @@
+ .accept = svcauth_unix_accept,
+ .release = svcauth_unix_release,
+ .domain_release = svcauth_unix_domain_release,
++ .set_client = svcauth_unix_set_client,
+ };
+
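+
+With the ip_map lookup hoisted out of the two accept() routines,
+svc_set_client() is meant to run once per request after authentication
+succeeds. Expected dispatch in svc_process() (a sketch; the exact
+error labels are assumed):
+
+	switch (svc_set_client(rqstp)) {
+	case SVC_OK:
+		break;
+	case SVC_DENIED:
+		auth_stat = rpc_autherr_badcred;
+		goto err_bad_auth;
+	case SVC_DROP:
+		goto dropit;
+	}
+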
+Index: linux-2.6.10/net/sunrpc/clnt.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/clnt.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/net/sunrpc/clnt.c 2005-04-05 14:49:13.410690584 +0800
+@@ -636,8 +636,14 @@
+ rpc_exit(task, -EIO);
+ return;
+ }
+- if (encode && (status = rpcauth_wrap_req(task, encode, req, p,
+- task->tk_msg.rpc_argp)) < 0) {
++ if (encode == NULL)
++ return;
++
++ status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp);
++ if (status == -EAGAIN) {
++ printk("XXXJBF: out of memory? Should retry here!!!\n");
++ }
++ if (status < 0) {
+ printk(KERN_WARNING "%s: can't encode arguments: %d\n",
+ clnt->cl_protname, -status);
+ rpc_exit(task, status);
+@@ -935,7 +941,7 @@
+ task->tk_action = call_reserve;
+ if (status >= 0 && rpcauth_uptodatecred(task))
+ return;
+- if (rpcauth_deadcred(task)) {
++ if (status == -EACCES) {
+ rpc_exit(task, -EACCES);
+ return;
+ }
+@@ -993,7 +999,7 @@
+ goto garbage;
+ if ((n = ntohl(*p++)) != RPC_AUTH_ERROR) {
+ printk(KERN_WARNING "call_verify: RPC call rejected: %x\n", n);
+- } else if (--len < 0)
++ } else if (--len == 0)
+ switch ((n = ntohl(*p++))) {
+ case RPC_AUTH_REJECTEDCRED:
+ case RPC_AUTH_REJECTEDVERF:
+Index: linux-2.6.10/net/sunrpc/svcauth.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/svcauth.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/net/sunrpc/svcauth.c 2005-04-05 14:49:13.392693320 +0800
+@@ -59,6 +59,11 @@
+ return aops->accept(rqstp, authp);
+ }
+
++int svc_set_client(struct svc_rqst *rqstp)
++{
++ return rqstp->rq_authop->set_client(rqstp);
++}
++
+ /* A request, which was authenticated, has now executed.
+ * Time to finalise the credentials and verifier
+ * and release any resources
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_unseal.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c 2005-04-05 14:49:13.401691952 +0800
+@@ -68,20 +68,13 @@
+ #endif
+
+
+-/* message_buffer is an input if toktype is MIC and an output if it is WRAP:
+- * If toktype is MIC: read_token is a mic token, and message_buffer is the
+- * data that the mic was supposedly taken over.
+- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used
+- * to return the decrypted data.
+- */
++/* read_token is a mic token, and message_buffer is the data that the mic was
++ * supposedly taken over. */
+
+-/* XXX will need to change prototype and/or just split into a separate function
+- * when we add privacy (because read_token will be in pages too). */
+ u32
+ krb5_read_token(struct krb5_ctx *ctx,
+ struct xdr_netobj *read_token,
+- struct xdr_buf *message_buffer,
+- int *qop_state, int toktype)
++ struct xdr_buf *message_buffer, int *qop_state)
+ {
+ int signalg;
+ int sealalg;
+@@ -100,16 +93,12 @@
+ read_token->len))
+ goto out;
+
+- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff)))
++ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
++ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) )
+ goto out;
+
+ /* XXX sanity-check bodysize?? */
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- /* XXX gone */
+- goto out;
+- }
+-
+ /* get the sign and seal algorithms */
+
+ signalg = ptr[0] + (ptr[1] << 8);
+@@ -120,14 +109,7 @@
+ if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+ goto out;
+
+- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) ||
+- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff)))
+- goto out;
+-
+- /* in the current spec, there is only one valid seal algorithm per
+- key type, so a simple comparison is ok */
+-
+- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg))
++ if (sealalg != 0xffff)
+ goto out;
+
+ /* there are several mappings of seal algorithms to sign algorithms,
+@@ -154,7 +136,7 @@
+ switch (signalg) {
+ case SGN_ALG_DES_MAC_MD5:
+ ret = make_checksum(checksum_type, ptr - 2, 8,
+- message_buffer, &md5cksum);
++ message_buffer, 0, &md5cksum);
+ if (ret)
+ goto out;
+
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_mech_switch.c 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c 2005-04-05 14:49:13.408690888 +0800
+@@ -279,6 +279,29 @@
+ qstate);
+ }
+
++u32
++gss_wrap(struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *buf,
++ struct page **inpages)
++{
++ return ctx_id->mech_type->gm_ops
++ ->gss_wrap(ctx_id, qop, offset, buf, inpages);
++}
++
++u32
++gss_unwrap(struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *buf,
++ int *out_offset)
++{
++ return ctx_id->mech_type->gm_ops
++ ->gss_unwrap(ctx_id, qop, offset, buf, out_offset);
++}
++
++
+ /* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 14:49:13.397692560 +0800
+@@ -0,0 +1,337 @@
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_krb5.h>
++#include <linux/random.h>
++#include <linux/pagemap.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++static inline int
++gss_krb5_padding(int blocksize, int length)
++{
++ /* Most of the code is block-size independent but currently we
++ * use only 8: */
++ BUG_ON(blocksize != 8);
++ return 8 - (length & 7);
++}
++
++static inline void
++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
++{
++ int padding = gss_krb5_padding(blocksize, buf->len - offset);
++ char *p;
++ struct kvec *iov;
++
++ if (buf->page_len || buf->tail[0].iov_len)
++ iov = &buf->tail[0];
++ else
++ iov = &buf->head[0];
++ p = iov->iov_base + iov->iov_len;
++ iov->iov_len += padding;
++ buf->len += padding;
++ memset(p, padding, padding);
++}
++
++static inline int
++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
++{
++ u8 *ptr;
++ u8 pad;
++ int len = buf->len;
++
++ if (len <= buf->head[0].iov_len) {
++ pad = *(u8 *)(buf->head[0].iov_base + len - 1);
++ goto out;
++ } else
++ len -= buf->head[0].iov_len;
++ if (len <= buf->page_len) {
++ int last = (buf->page_base + len - 1)
++ >>PAGE_CACHE_SHIFT;
++ int offset = (buf->page_base + len - 1)
++ & (PAGE_CACHE_SIZE - 1);
++ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA);
++ pad = *(ptr + offset);
++ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA);
++ goto out;
++ } else
++ len -= buf->page_len;
++ BUG_ON(len > buf->tail[0].iov_len);
++ pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
++out:
++ if (pad > blocksize)
++ return -EINVAL;
++ buf->len -= pad;
++ return 0;
++}
++
++static inline void
++make_confounder(char *p, int blocksize)
++{
++ /* XXX? Is this OK to do on every packet? */
++ get_random_bytes(p, blocksize);
++}
++
++/* Assumptions: the head and tail of inbuf are ours to play with.
++ * The pages, however, may be real pages in the page cache and we replace
++ * them with scratch pages from **pages before writing to them. */
++/* XXX: obviously the above should be documentation of wrap interface,
++ * and shouldn't be in this kerberos-specific file. */
++
++/* XXX factor out common code with seal/unseal. */
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset,
++ struct xdr_buf *buf, struct page **pages)
++{
++ struct krb5_ctx *kctx = ctx->internal_ctx_id;
++ s32 checksum_type;
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ int blocksize = 0, plainlen;
++ unsigned char *ptr, *krb5_hdr, *msg_start;
++ s32 now;
++ int headlen;
++ struct page **tmp_pages;
++ u32 seq_send;
++
++ dprintk("RPC: gss_wrap_kerberos\n");
++
++ now = get_seconds();
++
++ if (qop != 0)
++ goto out_err;
++
++ switch (kctx->signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ checksum_type = CKSUMTYPE_RSA_MD5;
++ break;
++ default:
++ dprintk("RPC: gss_krb5_seal: kctx->signalg %d not"
++ " supported\n", kctx->signalg);
++ goto out_err;
++ }
++ if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) {
++ dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n",
++ kctx->sealalg);
++ goto out_err;
++ }
++
++ blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++ gss_krb5_add_padding(buf, offset, blocksize);
++ BUG_ON((buf->len - offset) % blocksize);
++ plainlen = blocksize + buf->len - offset;
++
++ headlen = g_token_size(&kctx->mech_used, 22 + plainlen) -
++ (buf->len - offset);
++
++ ptr = buf->head[0].iov_base + offset;
++ /* shift data to make room for header. */
++ /* XXX Would be cleverer to encrypt while copying. */
++ /* XXX bounds checking, slack, etc. */
++ memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset);
++ buf->head[0].iov_len += headlen;
++ buf->len += headlen;
++ BUG_ON((buf->len - offset - headlen) % blocksize);
++
++ g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr);
++
++
++ *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
++ *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
++
++ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
++ krb5_hdr = ptr - 2;
++ msg_start = krb5_hdr + 24;
++ /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize);
++
++ *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg);
++ memset(krb5_hdr + 4, 0xff, 4);
++ *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg);
++
++ make_confounder(msg_start, blocksize);
++
++ /* XXXJBF: UGH!: */
++ tmp_pages = buf->pages;
++ buf->pages = pages;
++ if (make_checksum(checksum_type, krb5_hdr, 8, buf,
++ offset + headlen - blocksize, &md5cksum))
++ goto out_err;
++ buf->pages = tmp_pages;
++
++ switch (kctx->signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++ md5cksum.data, md5cksum.len))
++ goto out_err;
++ memcpy(krb5_hdr + 16,
++ md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
++ KRB5_CKSUM_LENGTH);
++
++ dprintk("RPC: make_seal_token: cksum data: \n");
++ print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0);
++ break;
++ default:
++ BUG();
++ }
++
++ kfree(md5cksum.data);
++
++ spin_lock(&krb5_seq_lock);
++ seq_send = kctx->seq_send++;
++ spin_unlock(&krb5_seq_lock);
++
++ /* XXX would probably be more efficient to compute checksum
++ * and encrypt at the same time: */
++ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
++ seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++ goto out_err;
++
++ if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
++ pages))
++ goto out_err;
++
++ return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
++out_err:
++ if (md5cksum.data) kfree(md5cksum.data);
++ return GSS_S_FAILURE;
++}
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset,
++ struct xdr_buf *buf, int *out_offset)
++{
++ struct krb5_ctx *kctx = ctx->internal_ctx_id;
++ int signalg;
++ int sealalg;
++ s32 checksum_type;
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ s32 now;
++ int direction;
++ s32 seqnum;
++ unsigned char *ptr;
++ int bodysize;
++ u32 ret = GSS_S_DEFECTIVE_TOKEN;
++ u8 *data_start;
++ int blocksize;
++
++ dprintk("RPC: gss_unwrap_kerberos\n");
++
++ ptr = (u8 *)buf->head[0].iov_base + offset;
++ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
++ buf->len - offset))
++ goto out;
++
++ if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
++ (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) )
++ goto out;
++
++ /* XXX sanity-check bodysize?? */
++
++ /* get the sign and seal algorithms */
++
++ signalg = ptr[0] + (ptr[1] << 8);
++ sealalg = ptr[2] + (ptr[3] << 8);
++
++ /* Sanity checks */
++
++ if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
++ goto out;
++
++ if (sealalg == 0xffff)
++ goto out;
++
++ /* in the current spec, there is only one valid seal algorithm per
++ key type, so a simple comparison is ok */
++
++ if (sealalg != kctx->sealalg)
++ goto out;
++
++ /* there are several mappings of seal algorithms to sign algorithms,
++ but few enough that we can try them all. */
++
++ if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
++ (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
++ (kctx->sealalg == SEAL_ALG_DES3KD &&
++ signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
++ goto out;
++
++ if (gss_decrypt_xdr_buf(kctx->enc, buf,
++ ptr + 22 - (unsigned char *)buf->head[0].iov_base))
++ goto out;
++
++ /* compute the checksum of the message */
++
++ /* initialize the the cksum */
++ switch (signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ checksum_type = CKSUMTYPE_RSA_MD5;
++ break;
++ default:
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ goto out;
++ }
++
++ switch (signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ ret = make_checksum(checksum_type, ptr - 2, 8, buf,
++ ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum);
++ if (ret)
++ goto out;
++
++ ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++ md5cksum.data, md5cksum.len);
++ if (ret)
++ goto out;
++
++ if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
++ ret = GSS_S_BAD_SIG;
++ goto out;
++ }
++ break;
++ default:
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ goto out;
++ }
++
++ /* it got through unscathed. Make sure the context is unexpired */
++
++ if (qop)
++ *qop = GSS_C_QOP_DEFAULT;
++
++ now = get_seconds();
++
++ ret = GSS_S_CONTEXT_EXPIRED;
++ if (now > kctx->endtime)
++ goto out;
++
++ /* do sequencing checks */
++
++ ret = GSS_S_BAD_SIG;
++ if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
++ &seqnum)))
++ goto out;
++
++ if ((kctx->initiate && direction != 0xff) ||
++ (!kctx->initiate && direction != 0))
++ goto out;
++
++ /* Copy the data back to the right position. XXX: Would probably be
++ * better to copy and encrypt at the same time. */
++
++ blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++ data_start = ptr + 22 + blocksize;
++ *out_offset = data_start - (u8 *)buf->head[0].iov_base;
++
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ if (gss_krb5_remove_padding(buf, blocksize))
++ goto out;
++
++ ret = GSS_S_COMPLETE;
++out:
++ if (md5cksum.data) kfree(md5cksum.data);
++ return ret;
++}
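+
+For reference, the byte offsets gss_wrap_kerberos() writes relative to
+krb5_hdr match the RFC 1964 wrap token layout (reconstructed from the
+code above):
+
+	/* krb5_hdr + 0   TOK_ID     0x02 0x01 (KG_TOK_WRAP_MSG)
+	 * krb5_hdr + 2   SGN_ALG    signature algorithm
+	 * krb5_hdr + 4   SEAL_ALG   seal algorithm (0xffff = none)
+	 * krb5_hdr + 6   filler     0xff 0xff
+	 * krb5_hdr + 8   SND_SEQ    encrypted sequence number (8 bytes)
+	 * krb5_hdr + 16  SGN_CKSUM  checksum (8 bytes)
+	 * krb5_hdr + 24  confounder, then plaintext (msg_start)
+	 */
+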
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_crypto.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c 2005-04-05 14:49:13.398692408 +0800
+@@ -139,17 +139,91 @@
+ sg->length = len;
+ }
+
++static int
++process_xdr_buf(struct xdr_buf *buf, int offset, int len,
++ int (*actor)(struct scatterlist *, void *), void *data)
++{
++ int i, page_len, thislen, page_offset, ret = 0;
++ struct scatterlist sg[1];
++
++ if (offset >= buf->head[0].iov_len) {
++ offset -= buf->head[0].iov_len;
++ } else {
++ thislen = buf->head[0].iov_len - offset;
++ if (thislen > len)
++ thislen = len;
++ buf_to_sg(sg, buf->head[0].iov_base + offset, thislen);
++ ret = actor(sg, data);
++ if (ret)
++ goto out;
++ offset = 0;
++ len -= thislen;
++ }
++ if (len == 0)
++ goto out;
++
++ if (offset >= buf->page_len) {
++ offset -= buf->page_len;
++ } else {
++ page_len = buf->page_len - offset;
++ if (page_len > len)
++ page_len = len;
++ len -= page_len;
++ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
++ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
++ thislen = PAGE_CACHE_SIZE - page_offset;
++ do {
++ if (thislen > page_len)
++ thislen = page_len;
++ sg->page = buf->pages[i];
++ sg->offset = page_offset;
++ sg->length = thislen;
++ ret = actor(sg, data);
++ if (ret)
++ goto out;
++ page_len -= thislen;
++ i++;
++ page_offset = 0;
++ thislen = PAGE_CACHE_SIZE;
++ } while (page_len != 0);
++ offset = 0;
++ }
++ if (len == 0)
++ goto out;
++
++ if (offset < buf->tail[0].iov_len) {
++ thislen = buf->tail[0].iov_len - offset;
++ if (thislen > len)
++ thislen = len;
++ buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen);
++ ret = actor(sg, data);
++ len -= thislen;
++ }
++ if (len != 0)
++ ret = -EINVAL;
++out:
++ return ret;
++}
++
++static int
++checksummer(struct scatterlist *sg, void *data)
++{
++ struct crypto_tfm *tfm = (struct crypto_tfm *)data;
++
++ crypto_digest_update(tfm, sg, 1);
++
++ return 0;
++}
++
+ /* checksum the plaintext data and hdrlen bytes of the token header */
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+- struct xdr_netobj *cksum)
++ int body_offset, struct xdr_netobj *cksum)
+ {
+ char *cksumname;
+ struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */
+ struct scatterlist sg[1];
+ u32 code = GSS_S_FAILURE;
+- int len, thislen, offset;
+- int i;
+
+ switch (cksumtype) {
+ case CKSUMTYPE_RSA_MD5:
+@@ -169,35 +243,8 @@
+ crypto_digest_init(tfm);
+ buf_to_sg(sg, header, hdrlen);
+ crypto_digest_update(tfm, sg, 1);
+- if (body->head[0].iov_len) {
+- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len);
+- crypto_digest_update(tfm, sg, 1);
+- }
+-
+- len = body->page_len;
+- if (len != 0) {
+- offset = body->page_base & (PAGE_CACHE_SIZE - 1);
+- i = body->page_base >> PAGE_CACHE_SHIFT;
+- thislen = PAGE_CACHE_SIZE - offset;
+- do {
+- if (thislen > len)
+- thislen = len;
+- sg->page = body->pages[i];
+- sg->offset = offset;
+- sg->length = thislen;
+- kmap(sg->page); /* XXX kmap_atomic? */
+- crypto_digest_update(tfm, sg, 1);
+- kunmap(sg->page);
+- len -= thislen;
+- i++;
+- offset = 0;
+- thislen = PAGE_CACHE_SIZE;
+- } while(len != 0);
+- }
+- if (body->tail[0].iov_len) {
+- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len);
+- crypto_digest_update(tfm, sg, 1);
+- }
++ process_xdr_buf(body, body_offset, body->len - body_offset,
++ checksummer, tfm);
+ crypto_digest_final(tfm, cksum->data);
+ code = 0;
+ out:
+@@ -207,3 +254,154 @@
+ }
+
+ EXPORT_SYMBOL(make_checksum);
++
++struct encryptor_desc {
++ u8 iv[8]; /* XXX hard-coded blocksize */
++ struct crypto_tfm *tfm;
++ int pos;
++ struct xdr_buf *outbuf;
++ struct page **pages;
++ struct scatterlist infrags[4];
++ struct scatterlist outfrags[4];
++ int fragno;
++ int fraglen;
++};
++
++static int
++encryptor(struct scatterlist *sg, void *data)
++{
++ struct encryptor_desc *desc = data;
++ struct xdr_buf *outbuf = desc->outbuf;
++ struct page *in_page;
++ int thislen = desc->fraglen + sg->length;
++ int fraglen, ret;
++ int page_pos;
++
++ /* Worst case is 4 fragments: head, end of page 1, start
++ * of page 2, tail. Anything more is a bug. */
++ BUG_ON(desc->fragno > 3);
++ desc->infrags[desc->fragno] = *sg;
++ desc->outfrags[desc->fragno] = *sg;
++
++ page_pos = desc->pos - outbuf->head[0].iov_len;
++ if (page_pos >= 0 && page_pos < outbuf->page_len) {
++ /* pages are not in place: */
++ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
++ in_page = desc->pages[i];
++ } else {
++ in_page = sg->page;
++ }
++ desc->infrags[desc->fragno].page = in_page;
++ desc->fragno++;
++ desc->fraglen += sg->length;
++ desc->pos += sg->length;
++
++ fraglen = thislen & 7; /* XXX hardcoded blocksize */
++ thislen -= fraglen;
++
++ if (thislen == 0)
++ return 0;
++
++ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags,
++ thislen, desc->iv);
++ if (ret)
++ return ret;
++ if (fraglen) {
++ desc->outfrags[0].page = sg->page;
++ desc->outfrags[0].offset = sg->offset + sg->length - fraglen;
++ desc->outfrags[0].length = fraglen;
++ desc->infrags[0] = desc->outfrags[0];
++ desc->infrags[0].page = in_page;
++ desc->fragno = 1;
++ desc->fraglen = fraglen;
++ } else {
++ desc->fragno = 0;
++ desc->fraglen = 0;
++ }
++ return 0;
++}
++
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset,
++ struct page **pages)
++{
++ int ret;
++ struct encryptor_desc desc;
++
++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++ memset(desc.iv, 0, sizeof(desc.iv));
++ desc.tfm = tfm;
++ desc.pos = offset;
++ desc.outbuf = buf;
++ desc.pages = pages;
++ desc.fragno = 0;
++ desc.fraglen = 0;
++
++ ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc);
++ return ret;
++}
++
++EXPORT_SYMBOL(gss_encrypt_xdr_buf);
++
++struct decryptor_desc {
++ u8 iv[8]; /* XXX hard-coded blocksize */
++ struct crypto_tfm *tfm;
++ struct scatterlist frags[4];
++ int fragno;
++ int fraglen;
++};
++
++static int
++decryptor(struct scatterlist *sg, void *data)
++{
++ struct decryptor_desc *desc = data;
++ int thislen = desc->fraglen + sg->length;
++ int fraglen, ret;
++
++ /* Worst case is 4 fragments: head, end of page 1, start
++ * of page 2, tail. Anything more is a bug. */
++ BUG_ON(desc->fragno > 3);
++ desc->frags[desc->fragno] = *sg;
++ desc->fragno++;
++ desc->fraglen += sg->length;
++
++ fraglen = thislen & 7; /* XXX hardcoded blocksize */
++ thislen -= fraglen;
++
++ if (thislen == 0)
++ return 0;
++
++ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags,
++ thislen, desc->iv);
++ if (ret)
++ return ret;
++ if (fraglen) {
++ desc->frags[0].page = sg->page;
++ desc->frags[0].offset = sg->offset + sg->length - fraglen;
++ desc->frags[0].length = fraglen;
++ desc->fragno = 1;
++ desc->fraglen = fraglen;
++ } else {
++ desc->fragno = 0;
++ desc->fraglen = 0;
++ }
++ return 0;
++}
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset)
++{
++ struct decryptor_desc desc;
++
++ /* XXXJBF: */
++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++ memset(desc.iv, 0, sizeof(desc.iv));
++ desc.tfm = tfm;
++ desc.fragno = 0;
++ desc.fraglen = 0;
++ return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc);
++}
++
++EXPORT_SYMBOL(gss_decrypt_xdr_buf);
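+
+The new process_xdr_buf() factors the head/pages/tail walk out of
+make_checksum() so one iterator can drive checksumming, encryption and
+decryption.  The actor is called once per contiguous fragment;
+encryptor()/decryptor() batch fragments until they hold a whole number
+of 8-byte DES blocks (the "thislen & 7" remainder is carried into the
+next call), so CBC chaining stays continuous across fragment
+boundaries.  A trivial actor, purely to illustrate the callback
+contract (process_xdr_buf() is static to this file, so this is a
+sketch, not code callable from elsewhere):
+
+	/* count the bytes the iterator presents; non-zero return aborts */
+	static int count_bytes(struct scatterlist *sg, void *data)
+	{
+		size_t *total = data;
+
+		*total += sg->length;
+		return 0;	/* keep walking */
+	}
+
+	/* usage:  size_t n = 0;
+	 *         process_xdr_buf(buf, 0, buf->len, count_bytes, &n);  */
+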
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_seal.c 2004-12-25 05:33:47.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c 2005-04-05 14:49:13.402691800 +0800
+@@ -70,24 +70,17 @@
+ # define RPCDBG_FACILITY RPCDBG_AUTH
+ #endif
+
+-static inline int
+-gss_krb5_padding(int blocksize, int length) {
+- /* Most of the code is block-size independent but in practice we
+- * use only 8: */
+- BUG_ON(blocksize != 8);
+- return 8 - (length & 7);
+-}
++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+
+ u32
+ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
+- struct xdr_buf *text, struct xdr_netobj *token,
+- int toktype)
++ struct xdr_buf *text, struct xdr_netobj *token)
+ {
+ s32 checksum_type;
+ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
+- int blocksize = 0, tmsglen;
+ unsigned char *ptr, *krb5_hdr, *msg_start;
+ s32 now;
++ u32 seq_send;
+
+ dprintk("RPC: gss_krb5_seal\n");
+
+@@ -111,21 +104,13 @@
+ goto out_err;
+ }
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- blocksize = crypto_tfm_alg_blocksize(ctx->enc);
+- tmsglen = blocksize + text->len
+- + gss_krb5_padding(blocksize, blocksize + text->len);
+- } else {
+- tmsglen = 0;
+- }
+-
+- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen);
++ token->len = g_token_size(&ctx->mech_used, 22);
+
+ ptr = token->data;
+- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr);
++ g_make_token_header(&ctx->mech_used, 22, &ptr);
+
+- *ptr++ = (unsigned char) ((toktype>>8)&0xff);
+- *ptr++ = (unsigned char) (toktype&0xff);
++ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
++ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+
+ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+ krb5_hdr = ptr - 2;
+@@ -133,17 +118,9 @@
+
+ *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg);
+ memset(krb5_hdr + 4, 0xff, 4);
+- if (toktype == KG_TOK_WRAP_MSG)
+- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg);
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- /* XXX removing support for now */
+- goto out_err;
+- } else { /* Sign only. */
+- if (make_checksum(checksum_type, krb5_hdr, 8, text,
+- &md5cksum))
++ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum))
+ goto out_err;
+- }
+
+ switch (ctx->signalg) {
+ case SGN_ALG_DES_MAC_MD5:
+@@ -163,12 +140,14 @@
+
+ kfree(md5cksum.data);
+
++ spin_lock(&krb5_seq_lock);
++ seq_send = ctx->seq_send++;
++ spin_unlock(&krb5_seq_lock);
++
+ if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
+- ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++ seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+ goto out_err;
+
+- ctx->seq_send++;
+-
+ return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+ out_err:
+ if (md5cksum.data) kfree(md5cksum.data);
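+
+Two things happen in the seal path: the never-finished KG_TOK_WRAP_MSG
+branch is dropped (wrapping now lives in gss_krb5_wrap.c, added to the
+Makefile below), and a race on the sequence counter is closed.  The
+unlocked ctx->seq_send++ let two concurrent requests on one context
+emit the same sequence number; snapshotting under krb5_seq_lock makes
+the read-and-increment atomic:
+
+	spin_lock(&krb5_seq_lock);
+	seq_send = ctx->seq_send++;	/* atomic snapshot-and-advance */
+	spin_unlock(&krb5_seq_lock);
+
+A single global lock is coarse, but the critical section is two
+instructions; a per-context lock would also work (an alternative, not
+what this patch chose).
+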
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_pseudoflavors.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c 2005-04-05 19:01:49.158500672 +0800
+@@ -1,237 +0,0 @@
+-/*
+- * linux/net/sunrpc/gss_union.c
+- *
+- * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic code
+- *
+- * Copyright (c) 2001 The Regents of the University of Michigan.
+- * All rights reserved.
+- *
+- * Andy Adamson <andros@umich.edu>
+- *
+- */
+-
+-/*
+- * Copyright 1993 by OpenVision Technologies, Inc.
+- *
+- * Permission to use, copy, modify, distribute, and sell this software
+- * and its documentation for any purpose is hereby granted without fee,
+- * provided that the above copyright notice appears in all copies and
+- * that both that copyright notice and this permission notice appear in
+- * supporting documentation, and that the name of OpenVision not be used
+- * in advertising or publicity pertaining to distribution of the software
+- * without specific, written prior permission. OpenVision makes no
+- * representations about the suitability of this software for any
+- * purpose. It is provided "as is" without express or implied warranty.
+- *
+- * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+- * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+- * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+- * PERFORMANCE OF THIS SOFTWARE.
+- */
+-
+-#include <linux/types.h>
+-#include <linux/slab.h>
+-#include <linux/socket.h>
+-#include <linux/sunrpc/gss_asn1.h>
+-#include <linux/sunrpc/auth_gss.h>
+-
+-#ifdef RPC_DEBUG
+-# define RPCDBG_FACILITY RPCDBG_AUTH
+-#endif
+-
+-static LIST_HEAD(registered_triples);
+-static spinlock_t registered_triples_lock = SPIN_LOCK_UNLOCKED;
+-
+-/* The following must be called with spinlock held: */
+-static struct sup_sec_triple *
+-do_lookup_triple_by_pseudoflavor(u32 pseudoflavor)
+-{
+- struct sup_sec_triple *pos, *triple = NULL;
+-
+- list_for_each_entry(pos, &registered_triples, triples) {
+- if (pos->pseudoflavor == pseudoflavor) {
+- triple = pos;
+- break;
+- }
+- }
+- return triple;
+-}
+-
+-/* XXX Need to think about reference counting of triples and of mechs.
+- * Currently we do no reference counting of triples, and I think that's
+- * probably OK given the reference counting on mechs, but there's probably
+- * a better way to do all this. */
+-
+-int
+-gss_register_triple(u32 pseudoflavor, struct gss_api_mech *mech,
+- u32 qop, u32 service)
+-{
+- struct sup_sec_triple *triple;
+-
+- if (!(triple = kmalloc(sizeof(*triple), GFP_KERNEL))) {
+- printk("Alloc failed in gss_register_triple");
+- goto err;
+- }
+- triple->pseudoflavor = pseudoflavor;
+- triple->mech = gss_mech_get_by_OID(&mech->gm_oid);
+- triple->qop = qop;
+- triple->service = service;
+-
+- spin_lock(&registered_triples_lock);
+- if (do_lookup_triple_by_pseudoflavor(pseudoflavor)) {
+- printk(KERN_WARNING "RPC: Registered pseudoflavor %d again\n",
+- pseudoflavor);
+- goto err_unlock;
+- }
+- list_add(&triple->triples, &registered_triples);
+- spin_unlock(&registered_triples_lock);
+- dprintk("RPC: registered pseudoflavor %d\n", pseudoflavor);
+-
+- return 0;
+-
+-err_unlock:
+- kfree(triple);
+- spin_unlock(&registered_triples_lock);
+-err:
+- return -1;
+-}
+-
+-int
+-gss_unregister_triple(u32 pseudoflavor)
+-{
+- struct sup_sec_triple *triple;
+-
+- spin_lock(&registered_triples_lock);
+- if (!(triple = do_lookup_triple_by_pseudoflavor(pseudoflavor))) {
+- spin_unlock(&registered_triples_lock);
+- printk("Can't unregister unregistered pseudoflavor %d\n",
+- pseudoflavor);
+- return -1;
+- }
+- list_del(&triple->triples);
+- spin_unlock(&registered_triples_lock);
+- gss_mech_put(triple->mech);
+- kfree(triple);
+- return 0;
+-
+-}
+-
+-void
+-print_sec_triple(struct xdr_netobj *oid,u32 qop,u32 service)
+-{
+- dprintk("RPC: print_sec_triple:\n");
+- dprintk(" oid_len %d\n oid :\n",oid->len);
+- print_hexl((u32 *)oid->data,oid->len,0);
+- dprintk(" qop %d\n",qop);
+- dprintk(" service %d\n",service);
+-}
+-
+-/* Function: gss_get_cmp_triples
+- *
+- * Description: search sec_triples for a matching security triple
+- * return pseudoflavor if match, else 0
+- * (Note that 0 is a valid pseudoflavor, but not for any gss pseudoflavor
+- * (0 means auth_null), so this shouldn't cause confusion.)
+- */
+-u32
+-gss_cmp_triples(u32 oid_len, char *oid_data, u32 qop, u32 service)
+-{
+- struct sup_sec_triple *triple;
+- u32 pseudoflavor = 0;
+- struct xdr_netobj oid;
+-
+- oid.len = oid_len;
+- oid.data = oid_data;
+-
+- dprintk("RPC: gss_cmp_triples\n");
+- print_sec_triple(&oid,qop,service);
+-
+- spin_lock(&registered_triples_lock);
+- list_for_each_entry(triple, &registered_triples, triples) {
+- if((g_OID_equal(&oid, &triple->mech->gm_oid))
+- && (qop == triple->qop)
+- && (service == triple->service)) {
+- pseudoflavor = triple->pseudoflavor;
+- break;
+- }
+- }
+- spin_unlock(&registered_triples_lock);
+- dprintk("RPC: gss_cmp_triples return %d\n", pseudoflavor);
+- return pseudoflavor;
+-}
+-
+-u32
+-gss_get_pseudoflavor(struct gss_ctx *ctx, u32 qop, u32 service)
+-{
+- return gss_cmp_triples(ctx->mech_type->gm_oid.len,
+- ctx->mech_type->gm_oid.data,
+- qop, service);
+-}
+-
+-/* Returns nonzero iff the given pseudoflavor is in the supported list.
+- * (Note that without incrementing a reference count or anything, this
+- * doesn't give any guarantees.) */
+-int
+-gss_pseudoflavor_supported(u32 pseudoflavor)
+-{
+- struct sup_sec_triple *triple;
+-
+- spin_lock(&registered_triples_lock);
+- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor);
+- spin_unlock(&registered_triples_lock);
+- return (triple ? 1 : 0);
+-}
+-
+-u32
+-gss_pseudoflavor_to_service(u32 pseudoflavor)
+-{
+- struct sup_sec_triple *triple;
+-
+- spin_lock(&registered_triples_lock);
+- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor);
+- spin_unlock(&registered_triples_lock);
+- if (!triple) {
+- dprintk("RPC: gss_pseudoflavor_to_service called with unsupported pseudoflavor %d\n",
+- pseudoflavor);
+- return 0;
+- }
+- return triple->service;
+-}
+-
+-struct gss_api_mech *
+-gss_pseudoflavor_to_mech(u32 pseudoflavor) {
+- struct sup_sec_triple *triple;
+- struct gss_api_mech *mech = NULL;
+-
+- spin_lock(&registered_triples_lock);
+- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor);
+- spin_unlock(&registered_triples_lock);
+- if (triple)
+- mech = gss_mech_get(triple->mech);
+- else
+- dprintk("RPC: gss_pseudoflavor_to_mech called with unsupported pseudoflavor %d\n",
+- pseudoflavor);
+- return mech;
+-}
+-
+-int
+-gss_pseudoflavor_to_mechOID(u32 pseudoflavor, struct xdr_netobj * oid)
+-{
+- struct gss_api_mech *mech;
+-
+- mech = gss_pseudoflavor_to_mech(pseudoflavor);
+- if (!mech) {
+- dprintk("RPC: gss_pseudoflavor_to_mechOID called with unsupported pseudoflavor %d\n",
+- pseudoflavor);
+- return -1;
+- }
+- oid->len = mech->gm_oid.len;
+- if (!(oid->data = kmalloc(oid->len, GFP_KERNEL)))
+- return -1;
+- memcpy(oid->data, mech->gm_oid.data, oid->len);
+- gss_mech_put(mech);
+- return 0;
+-}
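+
+gss_pseudoflavors.c is removed outright: the global {pseudoflavor, qop,
+service} triple registry and its lock are superseded by the small
+per-mechanism pseudoflavor tables (see the krb5p entry added to
+gss_krb5_mech.c below).  The replacement lookup is just a linear scan;
+a sketch of its shape, with the field names (gm_pfs[], gm_pf_num)
+assumed rather than shown in this hunk:
+
+	static u32 service_of(struct gss_api_mech *gm, u32 pseudoflavor)
+	{
+		int i;
+
+		/* each mech carries only a handful of entries */
+		for (i = 0; i < gm->gm_pf_num; i++)
+			if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+				return gm->gm_pfs[i].service;
+		return 0;	/* 0 is never a gss service */
+	}
+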
+Index: linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/svcauth_gss.c 2004-12-25 05:34:44.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c 2005-04-05 14:49:13.407691040 +0800
+@@ -37,6 +37,7 @@
+ *
+ */
+
++#include <asm/bitops.h>
+ #include <linux/types.h>
+ #include <linux/module.h>
+ #include <linux/pagemap.h>
+@@ -78,7 +79,6 @@
+
+ static struct cache_head *rsi_table[RSI_HASHMAX];
+ static struct cache_detail rsi_cache;
+-static struct rsi *rsi_lookup(struct rsi *item, int set);
+
+ static void rsi_free(struct rsi *rsii)
+ {
+@@ -125,38 +125,6 @@
+ return dup_to_netobj(dst, src->data, src->len);
+ }
+
+-static inline void rsi_init(struct rsi *new, struct rsi *item)
+-{
+- new->out_handle.data = NULL;
+- new->out_handle.len = 0;
+- new->out_token.data = NULL;
+- new->out_token.len = 0;
+- new->in_handle.len = item->in_handle.len;
+- item->in_handle.len = 0;
+- new->in_token.len = item->in_token.len;
+- item->in_token.len = 0;
+- new->in_handle.data = item->in_handle.data;
+- item->in_handle.data = NULL;
+- new->in_token.data = item->in_token.data;
+- item->in_token.data = NULL;
+-}
+-
+-static inline void rsi_update(struct rsi *new, struct rsi *item)
+-{
+- BUG_ON(new->out_handle.data || new->out_token.data);
+- new->out_handle.len = item->out_handle.len;
+- item->out_handle.len = 0;
+- new->out_token.len = item->out_token.len;
+- item->out_token.len = 0;
+- new->out_handle.data = item->out_handle.data;
+- item->out_handle.data = NULL;
+- new->out_token.data = item->out_token.data;
+- item->out_token.data = NULL;
+-
+- new->major_status = item->major_status;
+- new->minor_status = item->minor_status;
+-}
+-
+ static void rsi_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+@@ -168,6 +136,75 @@
+ (*bpp)[-1] = '\n';
+ }
+
++static inline int
++gssd_reply(struct rsi *item)
++{
++ struct rsi *tmp;
++ struct cache_head **hp, **head;
++
++ head = &rsi_cache.hash_table[rsi_hash(item)];
++ write_lock(&rsi_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsi, h);
++ if (rsi_match(tmp, item)) {
++ cache_get(&tmp->h);
++ clear_bit(CACHE_HASHED, &tmp->h.flags);
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ rsi_cache.entries--;
++ if (test_bit(CACHE_VALID, &tmp->h.flags)) {
++ write_unlock(&rsi_cache.hash_lock);
++ rsi_put(&tmp->h, &rsi_cache);
++ return -EINVAL;
++ }
++ set_bit(CACHE_HASHED, &item->h.flags);
++ item->h.next = *hp;
++ *hp = &item->h;
++ rsi_cache.entries++;
++ set_bit(CACHE_VALID, &item->h.flags);
++ item->h.last_refresh = get_seconds();
++ write_unlock(&rsi_cache.hash_lock);
++ cache_fresh(&rsi_cache, &tmp->h, 0);
++ rsi_put(&tmp->h, &rsi_cache);
++ return 0;
++ }
++ }
++ write_unlock(&rsi_cache.hash_lock);
++ return -EINVAL;
++}
++
++static inline struct rsi *
++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp)
++{
++ struct rsi *tmp;
++ struct cache_head **hp, **head;
++
++ head = &rsi_cache.hash_table[rsi_hash(item)];
++ read_lock(&rsi_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsi, h);
++ if (rsi_match(tmp, item)) {
++ if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
++ read_unlock(&rsi_cache.hash_lock);
++ return NULL;
++ }
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ rsi_cache.entries--;
++ read_unlock(&rsi_cache.hash_lock);
++ return tmp;
++ }
++ }
++ cache_get(&item->h);
++ item->h.next = *head;
++ *head = &item->h;
++ rsi_cache.entries++;
++ read_unlock(&rsi_cache.hash_lock);
++ cache_get(&item->h);
++ if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle))
++ return NULL;
++ return item;
++}
+
+ static int rsi_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+@@ -176,17 +213,22 @@
+ char *buf = mesg;
+ char *ep;
+ int len;
+- struct rsi rsii, *rsip = NULL;
++ struct rsi *rsii;
+ time_t expiry;
+ int status = -EINVAL;
+
+- memset(&rsii, 0, sizeof(rsii));
++ rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
++ if (!rsii)
++ return -ENOMEM;
++ memset(rsii, 0, sizeof(*rsii));
++ cache_init(&rsii->h);
++
+ /* handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.in_handle, buf, len))
++ if (dup_to_netobj(&rsii->in_handle, buf, len))
+ goto out;
+
+ /* token */
+@@ -195,10 +237,9 @@
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.in_token, buf, len))
++ if (dup_to_netobj(&rsii->in_token, buf, len))
+ goto out;
+
+- rsii.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+@@ -212,13 +253,13 @@
+ if (len == 0) {
+ goto out;
+ } else {
+- rsii.major_status = simple_strtoul(buf, &ep, 10);
++ rsii->major_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+- rsii.minor_status = simple_strtoul(buf, &ep, 10);
++ rsii->minor_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+
+@@ -227,7 +268,7 @@
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.out_handle, buf, len))
++ if (dup_to_netobj(&rsii->out_handle, buf, len))
+ goto out;
+
+ /* out_token */
+@@ -236,16 +277,14 @@
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.out_token, buf, len))
++ if (dup_to_netobj(&rsii->out_token, buf, len))
+ goto out;
+ }
+- rsii.h.expiry_time = expiry;
+- rsip = rsi_lookup(&rsii, 1);
+- status = 0;
++ rsii->h.expiry_time = expiry;
++ status = gssd_reply(rsii);
+ out:
+- rsi_free(&rsii);
+- if (rsip)
+- rsi_put(&rsip->h, &rsi_cache);
++ if (rsii)
++ rsi_put(&rsii->h, &rsi_cache);
+ return status;
+ }
+
+@@ -258,8 +297,6 @@
+ .cache_parse = rsi_parse,
+ };
+
+-static DefineSimpleCacheLookup(rsi, 0)
+-
+ /*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+@@ -292,7 +329,6 @@
+
+ static struct cache_head *rsc_table[RSC_HASHMAX];
+ static struct cache_detail rsc_cache;
+-static struct rsc *rsc_lookup(struct rsc *item, int set);
+
+ static void rsc_free(struct rsc *rsci)
+ {
+@@ -325,26 +361,46 @@
+ return netobj_equal(&new->handle, &tmp->handle);
+ }
+
+-static inline void
+-rsc_init(struct rsc *new, struct rsc *tmp)
++static struct rsc *rsc_lookup(struct rsc *item, int set)
+ {
+- new->handle.len = tmp->handle.len;
+- tmp->handle.len = 0;
+- new->handle.data = tmp->handle.data;
+- tmp->handle.data = NULL;
+- new->mechctx = NULL;
+- new->cred.cr_group_info = NULL;
+-}
+-
+-static inline void
+-rsc_update(struct rsc *new, struct rsc *tmp)
+-{
+- new->mechctx = tmp->mechctx;
+- tmp->mechctx = NULL;
+- memset(&new->seqdata, 0, sizeof(new->seqdata));
+- spin_lock_init(&new->seqdata.sd_lock);
+- new->cred = tmp->cred;
+- tmp->cred.cr_group_info = NULL;
++ struct rsc *tmp = NULL;
++ struct cache_head **hp, **head;
++ head = &rsc_cache.hash_table[rsc_hash(item)];
++
++ if (set)
++ write_lock(&rsc_cache.hash_lock);
++ else
++ read_lock(&rsc_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsc, h);
++ if (!rsc_match(tmp, item))
++ continue;
++ cache_get(&tmp->h);
++ if (!set)
++ goto out_noset;
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ clear_bit(CACHE_HASHED, &tmp->h.flags);
++ rsc_put(&tmp->h, &rsc_cache);
++ goto out_set;
++ }
++ /* Didn't find anything */
++ if (!set)
++ goto out_nada;
++ rsc_cache.entries++;
++out_set:
++ set_bit(CACHE_HASHED, &item->h.flags);
++ item->h.next = *head;
++ *head = &item->h;
++ write_unlock(&rsc_cache.hash_lock);
++ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time);
++ cache_get(&item->h);
++ return item;
++out_nada:
++ tmp = NULL;
++out_noset:
++ read_unlock(&rsc_cache.hash_lock);
++ return tmp;
+ }
+
+ static int rsc_parse(struct cache_detail *cd,
+@@ -353,19 +409,22 @@
+ /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+ char *buf = mesg;
+ int len, rv;
+- struct rsc rsci, *rscp = NULL;
++ struct rsc *rsci, *res = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+- memset(&rsci, 0, sizeof(rsci));
++ rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
++ if (!rsci)
++ return -ENOMEM;
++ memset(rsci, 0, sizeof(*rsci));
++ cache_init(&rsci->h);
+ /* context handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsci.handle, buf, len))
++ if (dup_to_netobj(&rsci->handle, buf, len))
+ goto out;
+
+- rsci.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+@@ -373,26 +432,26 @@
+ goto out;
+
+ /* uid, or NEGATIVE */
+- rv = get_int(&mesg, &rsci.cred.cr_uid);
++ rv = get_int(&mesg, &rsci->cred.cr_uid);
+ if (rv == -EINVAL)
+ goto out;
+ if (rv == -ENOENT)
+- set_bit(CACHE_NEGATIVE, &rsci.h.flags);
++ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ else {
+ int N, i;
+ struct gss_api_mech *gm;
+ struct xdr_netobj tmp_buf;
+
+ /* gid */
+- if (get_int(&mesg, &rsci.cred.cr_gid))
++ if (get_int(&mesg, &rsci->cred.cr_gid))
+ goto out;
+
+ /* number of additional gid's */
+ if (get_int(&mesg, &N))
+ goto out;
+ status = -ENOMEM;
+- rsci.cred.cr_group_info = groups_alloc(N);
+- if (rsci.cred.cr_group_info == NULL)
++ rsci->cred.cr_group_info = groups_alloc(N);
++ if (rsci->cred.cr_group_info == NULL)
+ goto out;
+
+ /* gid's */
+@@ -401,7 +460,7 @@
+ gid_t gid;
+ if (get_int(&mesg, &gid))
+ goto out;
+- GROUP_AT(rsci.cred.cr_group_info, i) = gid;
++ GROUP_AT(rsci->cred.cr_group_info, i) = gid;
+ }
+
+ /* mech name */
+@@ -422,19 +481,21 @@
+ }
+ tmp_buf.len = len;
+ tmp_buf.data = buf;
+- if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) {
++ if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) {
+ gss_mech_put(gm);
+ goto out;
+ }
+ gss_mech_put(gm);
+ }
+- rsci.h.expiry_time = expiry;
+- rscp = rsc_lookup(&rsci, 1);
++ rsci->h.expiry_time = expiry;
++ spin_lock_init(&rsci->seqdata.sd_lock);
++ res = rsc_lookup(rsci, 1);
++ rsc_put(&res->h, &rsc_cache);
++ rsci = NULL;
+ status = 0;
+ out:
+- rsc_free(&rsci);
+- if (rscp)
+- rsc_put(&rscp->h, &rsc_cache);
++ if (rsci)
++ rsc_put(&rsci->h, &rsc_cache);
+ return status;
+ }
+
+@@ -446,19 +507,14 @@
+ .cache_parse = rsc_parse,
+ };
+
+-static DefineSimpleCacheLookup(rsc, 0);
+-
+ struct rsc *
+ gss_svc_searchbyctx(struct xdr_netobj *handle)
+ {
+ struct rsc rsci;
+ struct rsc *found;
+
+- memset(&rsci, 0, sizeof(rsci));
+- if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+- return NULL;
++ rsci.handle = *handle;
+ found = rsc_lookup(&rsci, 0);
+- rsc_free(&rsci);
+ if (!found)
+ return NULL;
+ if (cache_check(&rsc_cache, &found->h, NULL))
+@@ -721,6 +777,45 @@
+ return stat;
+ }
+
++static int
++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
++{
++ int stat = -EINVAL;
++ int out_offset;
++ u32 * lenp;
++ u32 priv_len, maj_stat;
++ int saved_len;
++
++ lenp = buf->head[0].iov_base;
++ priv_len = ntohl(svc_getu32(&buf->head[0]));
++ if (priv_len > buf->len) /* XXXJBF: wrong check */
++ goto out;
++ /* XXXJBF: bizarre hack: to handle revisits (and not decrypt
++ * twice), the first time through we write an offset
++ * telling us where to skip to find the already-decrypted data */
++ if (rqstp->rq_deferred) {
++ buf->head[0].iov_base += priv_len;
++ buf->head[0].iov_len -= priv_len;
++ return 0;
++ }
++ saved_len = buf->len; /* XXX HACK */
++ buf->len = priv_len;
++ maj_stat = gss_unwrap(ctx, NULL, 0, buf, &out_offset);
++ buf->len = saved_len;
++ buf->head[0].iov_base += out_offset;
++ buf->head[0].iov_len -= out_offset;
++ BUG_ON(buf->head[0].iov_len <= 0);
++ if (maj_stat != GSS_S_COMPLETE)
++ goto out;
++ if (ntohl(svc_getu32(&buf->head[0])) != seq)
++ goto out;
++ /* XXXJBF: see "bizarre hack", above. */
++ *lenp = htonl(out_offset + 4);
++ stat = 0;
++out:
++ return stat;
++}
++
+ struct gss_svc_data {
+ /* decoded gss client cred: */
+ struct rpc_gss_wire_cred clcred;
+@@ -730,6 +825,19 @@
+ struct rsc *rsci;
+ };
+
++static int
++svcauth_gss_set_client(struct svc_rqst *rqstp)
++{
++ struct gss_svc_data *svcdata = rqstp->rq_auth_data;
++ struct rsc *rsci = svcdata->rsci;
++ struct rpc_gss_wire_cred *gc = &svcdata->clcred;
++
++ rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc);
++ if (rqstp->rq_client == NULL)
++ return SVC_DENIED;
++ return SVC_OK;
++}
++
+ /*
+ * Accept an rpcsec packet.
+ * If context establishment, punt to user space
+@@ -748,7 +856,7 @@
+ struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc;
+ struct rsc *rsci = NULL;
+- struct rsi *rsip, rsikey;
++ struct rsi *rsip, *rsikey = NULL;
+ u32 *rpcstart;
+ u32 *reject_stat = resv->iov_base + resv->iov_len;
+ int ret;
+@@ -841,30 +949,23 @@
+ *authp = rpc_autherr_badcred;
+ if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0)
+ goto auth_err;
+- memset(&rsikey, 0, sizeof(rsikey));
+- if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
++ rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL);
++ if (!rsikey)
++ goto drop;
++ memset(rsikey, 0, sizeof(*rsikey));
++ cache_init(&rsikey->h);
++ if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx))
+ goto drop;
+ *authp = rpc_autherr_badverf;
+- if (svc_safe_getnetobj(argv, &tmpobj)) {
+- kfree(rsikey.in_handle.data);
++ if (svc_safe_getnetobj(argv, &tmpobj))
+ goto auth_err;
+- }
+- if (dup_netobj(&rsikey.in_token, &tmpobj)) {
+- kfree(rsikey.in_handle.data);
++ if (dup_netobj(&rsikey->in_token, &tmpobj))
+ goto drop;
+- }
+
+- rsip = rsi_lookup(&rsikey, 0);
+- rsi_free(&rsikey);
+- if (!rsip) {
+- goto drop;
+- }
+- switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) {
+- case -EAGAIN:
++ rsip = gssd_upcall(rsikey, rqstp);
++ if (!rsip)
+ goto drop;
+- case -ENOENT:
+- goto drop;
+- case 0:
++ else {
+ rsci = gss_svc_searchbyctx(&rsip->out_handle);
+ if (!rsci) {
+ goto drop;
+@@ -893,11 +994,6 @@
+ svc_putu32(resv, rpc_success);
+ goto complete;
+ case RPC_GSS_PROC_DATA:
+- *authp = rpc_autherr_badcred;
+- rqstp->rq_client =
+- find_gss_auth_domain(rsci->mechctx, gc->gc_svc);
+- if (rqstp->rq_client == NULL)
+- goto auth_err;
+ *authp = rpcsec_gsserr_ctxproblem;
+ if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+ goto auth_err;
+@@ -911,6 +1007,15 @@
+ if (unwrap_integ_data(&rqstp->rq_arg,
+ gc->gc_seq, rsci->mechctx))
+ goto auth_err;
++ /* placeholders for length and seq. number: */
++ svcdata->body_start = resv->iov_base + resv->iov_len;
++ svc_putu32(resv, 0);
++ svc_putu32(resv, 0);
++ break;
++ case RPC_GSS_SVC_PRIVACY:
++ if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
++ gc->gc_seq, rsci->mechctx))
++ goto auth_err;
+ svcdata->rsci = rsci;
+ cache_get(&rsci->h);
+ /* placeholders for length and seq. number: */
+@@ -918,11 +1023,11 @@
+ svc_putu32(resv, 0);
+ svc_putu32(resv, 0);
+ break;
+- case RPC_GSS_SVC_PRIVACY:
+- /* currently unsupported */
+ default:
+ goto auth_err;
+ }
++ svcdata->rsci = rsci;
++ cache_get(&rsci->h);
+ ret = SVC_OK;
+ goto out;
+ }
+@@ -937,13 +1042,15 @@
+ drop:
+ ret = SVC_DROP;
+ out:
++ if (rsikey)
++ rsi_put(&rsikey->h, &rsi_cache);
+ if (rsci)
+ rsc_put(&rsci->h, &rsc_cache);
+ return ret;
+ }
+
+-static int
+-svcauth_gss_release(struct svc_rqst *rqstp)
++static inline int
++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
+ {
+ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+@@ -955,10 +1062,160 @@
+ int integ_offset, integ_len;
+ int stat = -EINVAL;
+
++ p = gsd->body_start;
++ gsd->body_start = NULL;
++ /* move accept_stat to right place: */
++ memcpy(p, p + 2, 4);
++ /* Don't wrap in failure case: */
++ /* Counting on not getting here if call was not even accepted! */
++ if (*p != rpc_success) {
++ resbuf->head[0].iov_len -= 2 * 4;
++ goto out;
++ }
++ p++;
++ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
++ integ_len = resbuf->len - integ_offset;
++ BUG_ON(integ_len % 4);
++ *p++ = htonl(integ_len);
++ *p++ = htonl(gc->gc_seq);
++ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
++ integ_len))
++ BUG();
++ if (resbuf->page_len == 0
++ && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
++ < PAGE_SIZE) {
++ BUG_ON(resbuf->tail[0].iov_len);
++ /* Use head for everything */
++ resv = &resbuf->head[0];
++ } else if (resbuf->tail[0].iov_base == NULL) {
++ /* copied from nfsd4_encode_read */
++ svc_take_page(rqstp);
++ resbuf->tail[0].iov_base = page_address(rqstp
++ ->rq_respages[rqstp->rq_resused-1]);
++ rqstp->rq_restailpage = rqstp->rq_resused-1;
++ resbuf->tail[0].iov_len = 0;
++ resv = &resbuf->tail[0];
++ } else {
++ resv = &resbuf->tail[0];
++ }
++ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
++ if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++ goto out_err;
++ svc_putu32(resv, htonl(mic.len));
++ memset(mic.data + mic.len, 0,
++ round_up_to_quad(mic.len) - mic.len);
++ resv->iov_len += XDR_QUADLEN(mic.len) << 2;
++ /* not strictly required: */
++ resbuf->len += XDR_QUADLEN(mic.len) << 2;
++ BUG_ON(resv->iov_len > PAGE_SIZE);
++out:
++ stat = 0;
++out_err:
++ return stat;
++}
++
++/* XXXJBF: Look for chances to share code with client */
++/* XXXJBF: Do we need to preallocate these pages somehow? E.g. see
++ * buffer size calculations in svcsock.c */
++/* XXXJBF: how does reference counting on pages work? */
++static struct page **
++svc_alloc_enc_pages(struct xdr_buf *buf)
++{
++ struct page **ret;
++ int last, i;
++
++ if (buf->page_len == 0)
++ return NULL;
++ BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT);
++ last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++ ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL);
++ if (!ret)
++ goto out;
++ for (i = 0; i <= last; i++) {
++ ret[i] = alloc_page(GFP_KERNEL);
++ if (ret[i] == NULL)
++ goto out_free;
++ }
++out:
++ return ret;
++out_free:
++ for (i--; i >= 0; i--) {
++ __free_page(ret[i]);
++ }
++ return NULL;
++}
++
++static inline int
++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
++{
++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++ struct rpc_gss_wire_cred *gc = &gsd->clcred;
++ struct xdr_buf *resbuf = &rqstp->rq_res;
++ struct page **inpages;
++ u32 *p;
++ int offset, *len;
++ int pad;
++ int stat = -EINVAL;
++
++ p = gsd->body_start;
++ gsd->body_start = NULL;
++ /* move accept_stat to right place: */
++ memcpy(p, p + 2, 4);
++ /* Don't wrap in failure case: */
++ /* Counting on not getting here if call was not even accepted! */
++ if (*p != rpc_success) {
++ resbuf->head[0].iov_len -= 2 * 4;
++ goto out;
++ }
++ p++;
++ len = p++;
++ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base;
++ *p++ = htonl(gc->gc_seq);
++ stat = -ENOMEM;
++ inpages = resbuf->pages;
++ /* XXXJBF: huge memory leaks here: allocated pages probably aren't
++ * freed, and neither is memory used to hold page array. */
++ resbuf->pages = svc_alloc_enc_pages(resbuf);
++ if (resbuf->page_len && !resbuf->pages)
++ goto out_err; /* XXX sleep and retry? Reserve ahead of time
++ and BUG_ON? */
++ if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) {
++ /* copied from nfsd4_encode_read */
++ {int i = svc_take_page(rqstp); BUG_ON(i); }
++ resbuf->tail[0].iov_base = page_address(rqstp
++ ->rq_respages[rqstp->rq_resused-1]);
++ rqstp->rq_restailpage = rqstp->rq_resused-1;
++ resbuf->tail[0].iov_len = 0;
++ }
++ /* XXX: Will svc code attempt to free stuff in xdr_buf->pages?
++ * Or can we leave it in any old state on error?? */
++ stat = -EINVAL;
++ if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset,
++ resbuf, inpages))
++ goto out_err;
++ *len = htonl(resbuf->len - offset);
++ pad = 3 - ((resbuf->len - offset - 1)&3);
++ p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len);
++ memset(p, 0, pad);
++ resbuf->tail[0].iov_len += pad;
++out:
++ return 0;
++out_err:
++ return stat;
++}
++
++static int
++svcauth_gss_release(struct svc_rqst *rqstp)
++{
++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++ struct rpc_gss_wire_cred *gc = &gsd->clcred;
++ struct xdr_buf *resbuf = &rqstp->rq_res;
++ int stat = -EINVAL;
++
+ if (gc->gc_proc != RPC_GSS_PROC_DATA)
+ goto out;
+ /* Release can be called twice, but we only wrap once. */
+- if (gsd->body_start == 0)
++ if (gsd->body_start == NULL)
+ goto out;
+ /* normally not set till svc_send, but we need it here: */
+ resbuf->len = resbuf->head[0].iov_len
+@@ -967,55 +1224,15 @@
+ case RPC_GSS_SVC_NONE:
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+- p = gsd->body_start;
+- gsd->body_start = NULL;
+- /* move accept_stat to right place: */
+- memcpy(p, p + 2, 4);
+- /* don't wrap in failure case: */
+- /* Note: counting on not getting here if call was not even
+- * accepted! */
+- if (*p != rpc_success) {
+- resbuf->head[0].iov_len -= 2 * 4;
+- goto out;
+- }
+- p++;
+- integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+- integ_len = resbuf->len - integ_offset;
+- BUG_ON(integ_len % 4);
+- *p++ = htonl(integ_len);
+- *p++ = htonl(gc->gc_seq);
+- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+- integ_len))
+- BUG();
+- if (resbuf->page_len == 0
+- && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
+- < PAGE_SIZE) {
+- BUG_ON(resbuf->tail[0].iov_len);
+- /* Use head for everything */
+- resv = &resbuf->head[0];
+- } else if (resbuf->tail[0].iov_base == NULL) {
+- /* copied from nfsd4_encode_read */
+- svc_take_page(rqstp);
+- resbuf->tail[0].iov_base = page_address(rqstp
+- ->rq_respages[rqstp->rq_resused-1]);
+- rqstp->rq_restailpage = rqstp->rq_resused-1;
+- resbuf->tail[0].iov_len = 0;
+- resv = &resbuf->tail[0];
+- } else {
+- resv = &resbuf->tail[0];
+- }
+- mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++ stat = svcauth_gss_wrap_resp_integ(rqstp);
++ if (stat)
+ goto out_err;
+- svc_putu32(resv, htonl(mic.len));
+- memset(mic.data + mic.len, 0,
+- round_up_to_quad(mic.len) - mic.len);
+- resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+- /* not strictly required: */
+- resbuf->len += XDR_QUADLEN(mic.len) << 2;
+- BUG_ON(resv->iov_len > PAGE_SIZE);
+ break;
+ case RPC_GSS_SVC_PRIVACY:
++ stat = svcauth_gss_wrap_resp_priv(rqstp);
++ if (stat)
++ goto out_err;
++ break;
+ default:
+ goto out_err;
+ }
+@@ -1052,6 +1269,7 @@
+ .accept = svcauth_gss_accept,
+ .release = svcauth_gss_release,
+ .domain_release = svcauth_gss_domain_release,
++ .set_client = svcauth_gss_set_client,
+ };
+
+ int
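+
+The server-side privacy reply is assembled as: length placeholder,
+sequence number, gss_wrap() over everything from the sequence number
+on, then zero padding of the opaque body out to the 4-byte XDR
+boundary.  The pad expression used above, pad = 3 - ((n - 1) & 3) with
+n = resbuf->len - offset, yields 0..3 bytes; e.g. n = 10 gives
+3 - (9 & 3) = 2, so 12 bytes go on the wire.  An equivalent form,
+shown only to make the rounding explicit:
+
+	static inline int xdr_pad_bytes(int n)
+	{
+		return (4 - (n & 3)) & 3;	/* same as 3 - ((n - 1) & 3) */
+	}
+
+The XXX comments are honest: the pages from svc_alloc_enc_pages() are
+apparently never freed, and unwrap_priv_data() smuggles the decrypt
+offset back through the length word so a deferred request is not
+decrypted twice on revisit.
+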
+Index: linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/sunrpcgss_syms.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c 2005-04-05 19:01:49.158500672 +0800
+@@ -1,37 +0,0 @@
+-#include <linux/config.h>
+-#include <linux/module.h>
+-
+-#include <linux/types.h>
+-#include <linux/socket.h>
+-#include <linux/sched.h>
+-#include <linux/uio.h>
+-#include <linux/unistd.h>
+-
+-#include <linux/sunrpc/auth_gss.h>
+-#include <linux/sunrpc/svcauth_gss.h>
+-#include <linux/sunrpc/gss_asn1.h>
+-#include <linux/sunrpc/gss_krb5.h>
+-
+-/* svcauth_gss.c: */
+-EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor);
+-
+-/* registering gss mechanisms to the mech switching code: */
+-EXPORT_SYMBOL(gss_mech_register);
+-EXPORT_SYMBOL(gss_mech_unregister);
+-EXPORT_SYMBOL(gss_mech_get);
+-EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor);
+-EXPORT_SYMBOL(gss_mech_get_by_name);
+-EXPORT_SYMBOL(gss_mech_put);
+-EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+-EXPORT_SYMBOL(gss_service_to_auth_domain_name);
+-
+-/* generic functionality in gss code: */
+-EXPORT_SYMBOL(g_make_token_header);
+-EXPORT_SYMBOL(g_verify_token_header);
+-EXPORT_SYMBOL(g_token_size);
+-EXPORT_SYMBOL(make_checksum);
+-EXPORT_SYMBOL(krb5_encrypt);
+-EXPORT_SYMBOL(krb5_decrypt);
+-
+-/* debug */
+-EXPORT_SYMBOL(print_hexl);
+Index: linux-2.6.10/net/sunrpc/auth_gss/Makefile
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/Makefile 2004-12-25 05:34:33.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/Makefile 2005-04-05 14:49:13.408690888 +0800
+@@ -10,7 +10,7 @@
+ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
+
+ rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
+- gss_krb5_seqnum.o
++ gss_krb5_seqnum.o gss_krb5_wrap.o
+
+ obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
+
+Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_mech.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c 2005-04-05 14:49:13.400692104 +0800
+@@ -182,6 +182,7 @@
+ kfree(kctx);
+ }
+
++/* XXX the following wrappers have become pointless; kill them. */
+ static u32
+ gss_verify_mic_kerberos(struct gss_ctx *ctx,
+ struct xdr_buf *message,
+@@ -191,8 +192,7 @@
+ int qop_state;
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+
+- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state,
+- KG_TOK_MIC_MSG);
++ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state);
+ if (!maj_stat && qop_state)
+ *qstate = qop_state;
+
+@@ -208,7 +208,7 @@
+ u32 err = 0;
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+
+- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG);
++ err = krb5_make_token(kctx, qop, message, mic_token);
+
+ dprintk("RPC: gss_get_mic_kerberos returning %d\n",err);
+
+@@ -219,6 +219,8 @@
+ .gss_import_sec_context = gss_import_sec_context_kerberos,
+ .gss_get_mic = gss_get_mic_kerberos,
+ .gss_verify_mic = gss_verify_mic_kerberos,
++ .gss_wrap = gss_wrap_kerberos,
++ .gss_unwrap = gss_unwrap_kerberos,
+ .gss_delete_sec_context = gss_delete_sec_context_kerberos,
+ };
+
+@@ -233,6 +235,11 @@
+ .service = RPC_GSS_SVC_INTEGRITY,
+ .name = "krb5i",
+ },
++ [2] = {
++ .pseudoflavor = RPC_AUTH_GSS_KRB5P,
++ .service = RPC_GSS_SVC_PRIVACY,
++ .name = "krb5p",
++ },
+ };
+
+ static struct gss_api_mech gss_kerberos_mech = {
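+
+With gss_wrap/gss_unwrap wired into the ops vector, the mech now
+advertises all three Kerberos pseudoflavors.  For reference, their
+wire values as defined in the stock sunrpc headers:
+
+	enum {
+		RPC_AUTH_GSS_KRB5  = 390003,	/* krb5:  authentication only */
+		RPC_AUTH_GSS_KRB5I = 390004,	/* krb5i: + integrity (MIC)   */
+		RPC_AUTH_GSS_KRB5P = 390005,	/* krb5p: + privacy (wrap)    */
+	};
+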
+Index: linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/auth_gss/auth_gss.c 2004-12-25 05:34:44.000000000 +0800
++++ linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c 2005-04-05 14:49:13.404691496 +0800
+@@ -45,6 +45,7 @@
+ #include <linux/socket.h>
+ #include <linux/in.h>
+ #include <linux/sched.h>
++#include <linux/pagemap.h>
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/sunrpc/auth.h>
+ #include <linux/sunrpc/auth_gss.h>
+@@ -480,12 +481,14 @@
+ if (!cred)
+ goto err;
+ if (gss_err)
+- cred->cr_flags |= RPCAUTH_CRED_DEAD;
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+ else
+ gss_cred_set_ctx(cred, ctx);
+ spin_lock(&gss_auth->lock);
+ gss_msg = __gss_find_upcall(gss_auth, acred.uid);
+ if (gss_msg) {
++ if (gss_err)
++ gss_msg->msg.errno = -EACCES;
+ __gss_unhash_msg(gss_msg);
+ spin_unlock(&gss_auth->lock);
+ gss_release_msg(gss_msg);
+@@ -740,7 +743,9 @@
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+ GSS_C_QOP_DEFAULT,
+ &verf_buf, &mic);
+- if(maj_stat != 0){
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
++ } else if (maj_stat != 0) {
+ printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
+ goto out_put_ctx;
+ }
+@@ -779,6 +784,7 @@
+ struct xdr_netobj mic;
+ u32 flav,len;
+ u32 service;
++ u32 maj_stat;
+
+ dprintk("RPC: %4u gss_validate\n", task->tk_pid);
+
+@@ -794,8 +800,11 @@
+ mic.data = (u8 *)p;
+ mic.len = len;
+
+- if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state))
+- goto out_bad;
++ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
++ if (maj_stat)
++ goto out_bad;
+ service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type,
+ gss_cred->gc_flavor);
+ switch (service) {
+@@ -807,6 +816,11 @@
+ /* verifier data, flavor, length, length, sequence number: */
+ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4;
+ break;
++ case RPC_GSS_SVC_PRIVACY:
++ /* XXXJBF: Ugh. Going for a wild overestimate.
++ * Need some info from krb5 layer? */
++ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32;
++ break;
+ default:
+ goto out_bad;
+ }
+@@ -821,11 +835,10 @@
+ }
+
+ static inline int
+-gss_wrap_req_integ(struct gss_cl_ctx *ctx,
+- kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
++gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj)
+ {
+- struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+- struct xdr_buf *snd_buf = &req->rq_snd_buf;
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ struct xdr_buf integ_buf;
+ u32 *integ_len = NULL;
+ struct xdr_netobj mic;
+@@ -836,7 +849,7 @@
+
+ integ_len = p++;
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+- *p++ = htonl(req->rq_seqno);
++ *p++ = htonl(rqstp->rq_seqno);
+
+ status = encode(rqstp, p, obj);
+ if (status)
+@@ -848,7 +861,7 @@
+ *integ_len = htonl(integ_buf.len);
+
+ /* guess whether we're in the head or the tail: */
+- if (snd_buf->page_len || snd_buf->tail[0].iov_len)
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+ iov = snd_buf->tail;
+ else
+ iov = snd_buf->head;
+@@ -858,7 +871,9 @@
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+ GSS_C_QOP_DEFAULT, &integ_buf, &mic);
+ status = -EIO; /* XXX? */
+- if (maj_stat)
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
++ else if (maj_stat)
+ return status;
+ q = xdr_encode_opaque(p, NULL, mic.len);
+
+@@ -868,6 +883,112 @@
+ return 0;
+ }
+
++static void
++priv_release_snd_buf(struct rpc_rqst *rqstp)
++{
++ int i;
++
++ for (i=0; i < rqstp->rq_enc_pages_num; i++)
++ __free_page(rqstp->rq_enc_pages[i]);
++ kfree(rqstp->rq_enc_pages);
++}
++
++static int
++alloc_enc_pages(struct rpc_rqst *rqstp)
++{
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
++ int first, last, i;
++
++ if (snd_buf->page_len == 0) {
++ rqstp->rq_enc_pages_num = 0;
++ return 0;
++ }
++
++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++ rqstp->rq_enc_pages_num = last - first + 1 + 1;
++ rqstp->rq_enc_pages
++ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
++ GFP_NOFS);
++ if (!rqstp->rq_enc_pages)
++ goto out;
++ for (i=0; i < rqstp->rq_enc_pages_num; i++) {
++ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
++ if (rqstp->rq_enc_pages[i] == NULL)
++ goto out_free;
++ }
++ rqstp->rq_release_snd_buf = priv_release_snd_buf;
++ return 0;
++out_free:
++ for (i--; i >= 0; i--) {
++ __free_page(rqstp->rq_enc_pages[i]);
++ }
++out:
++ return -EAGAIN;
++}
++
++static inline int
++gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj)
++{
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
++ u32 offset;
++ u32 maj_stat;
++ int status;
++ u32 *opaque_len;
++ struct page **inpages;
++ int first;
++ int pad;
++ struct kvec *iov;
++ char *tmp;
++
++ opaque_len = p++;
++ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
++ *p++ = htonl(rqstp->rq_seqno);
++
++ status = encode(rqstp, p, obj);
++ if (status)
++ return status;
++
++ status = alloc_enc_pages(rqstp);
++ if (status)
++ return status;
++ /* XXXJBF: Oops! Do we need rq_enc_pages really any more?? */
++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++ inpages = snd_buf->pages + first;
++ snd_buf->pages = rqstp->rq_enc_pages;
++ snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
++ /* XXX?: tail needs to be separate if we want to be able to expand
++ * the head (since it's often put right after the head). But is
++ * expanding the head safe in any case? */
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
++ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
++ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
++ snd_buf->tail[0].iov_base = tmp;
++ }
++ maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset,
++ snd_buf, inpages);
++ status = -EIO; /* XXX? */
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
++ else if (maj_stat)
++ return status;
++
++ *opaque_len = htonl(snd_buf->len - offset);
++ /* guess whether we're in the head or the tail: */
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
++ iov = snd_buf->tail;
++ else
++ iov = snd_buf->head;
++ p = iov->iov_base + iov->iov_len;
++ pad = 3 - ((snd_buf->len - offset - 1) & 3);
++ memset(p, 0, pad);
++ iov->iov_len += pad;
++ snd_buf->len += pad;
++
++ return 0;
++}
++
+ static int
+ gss_wrap_req(struct rpc_task *task,
+ kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
+@@ -894,9 +1015,13 @@
+ status = encode(rqstp, p, obj);
+ goto out;
+ case RPC_GSS_SVC_INTEGRITY:
+- status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj);
++ status = gss_wrap_req_integ(cred, ctx, encode,
++ rqstp, p, obj);
+ goto out;
+ case RPC_GSS_SVC_PRIVACY:
++ status = gss_wrap_req_priv(cred, ctx, encode,
++ rqstp, p, obj);
++ goto out;
+ default:
+ goto out;
+ }
+@@ -907,11 +1032,10 @@
+ }
+
+ static inline int
+-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx,
+- kxdrproc_t decode, void *rqstp, u32 **p, void *obj)
++gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
++ struct rpc_rqst *rqstp, u32 **p)
+ {
+- struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+- struct xdr_buf *rcv_buf = &req->rq_rcv_buf;
++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+ struct xdr_buf integ_buf;
+ struct xdr_netobj mic;
+ u32 data_offset, mic_offset;
+@@ -926,7 +1050,7 @@
+ mic_offset = integ_len + data_offset;
+ if (mic_offset > rcv_buf->len)
+ return status;
+- if (ntohl(*(*p)++) != req->rq_seqno)
++ if (ntohl(*(*p)++) != rqstp->rq_seqno)
+ return status;
+
+ if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
+@@ -938,11 +1062,44 @@
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf,
+ &mic, NULL);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
++ if (maj_stat != GSS_S_COMPLETE)
++ return status;
++ return 0;
++}
++
++static inline int
++gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
++ struct rpc_rqst *rqstp, u32 **p)
++{
++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
++ u32 offset, out_offset;
++ u32 opaque_len;
++ u32 maj_stat;
++ int status = -EIO;
++
++ opaque_len = ntohl(*(*p)++);
++ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
++ if (offset + opaque_len > rcv_buf->len)
++ return status;
++ /* remove padding: */
++ rcv_buf->len = offset + opaque_len;
++
++ maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL,
++ offset, rcv_buf, &out_offset);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+ if (maj_stat != GSS_S_COMPLETE)
+ return status;
++ *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset);
++ if (ntohl(*(*p)++) != rqstp->rq_seqno)
++ return status;
++
+ return 0;
+ }
+
++
+ static int
+ gss_unwrap_resp(struct rpc_task *task,
+ kxdrproc_t decode, void *rqstp, u32 *p, void *obj)
+@@ -962,12 +1119,16 @@
+ case RPC_GSS_SVC_NONE:
+ goto out_decode;
+ case RPC_GSS_SVC_INTEGRITY:
+- status = gss_unwrap_resp_integ(ctx, decode,
+- rqstp, &p, obj);
++ status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p);
+ if (status)
+ goto out;
+ break;
+ case RPC_GSS_SVC_PRIVACY:
++ status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
++ if (status)
++ goto out;
++ break;
++
+ default:
+ goto out;
+ }
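+
+On the client, gss_wrap_req_priv() encrypts in place while the message
+grows (token header, confounder, padding), so the tail has to move out
+of the way: alloc_enc_pages() reserves one page beyond what the page
+data spans and the tail is copied into it.  The page-count arithmetic,
+pulled out as a sketch:
+
+	static int enc_pages_needed(unsigned int base, unsigned int len)
+	{
+		unsigned int first = base >> PAGE_CACHE_SHIFT;
+		unsigned int last  = (base + len - 1) >> PAGE_CACHE_SHIFT;
+
+		return last - first + 1 + 1;	/* +1: page for the copied tail */
+	}
+
+E.g. base = 100, len = 6000 with 4k pages: first = 0, last = 1, so
+three pages are allocated, two covering the data and one spare.  Note
+also the pattern repeated throughout auth_gss.c: GSS_S_CONTEXT_EXPIRED
+now clears RPCAUTH_CRED_UPTODATE so the next call triggers a fresh
+upcall instead of failing hard.
+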
+Index: linux-2.6.10/net/sunrpc/svc.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/svc.c 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/net/sunrpc/svc.c 2005-04-05 14:49:13.409690736 +0800
+@@ -264,6 +264,7 @@
+ u32 dir, prog, vers, proc,
+ auth_stat, rpc_stat;
+ int auth_res;
++ u32 *accept_statp;
+
+ rpc_stat = rpc_success;
+
+@@ -299,6 +300,9 @@
+ if (vers != 2) /* RPC version number */
+ goto err_bad_rpc;
+
++ /* Save position in case we later decide to reject: */
++ accept_statp = resv->iov_base + resv->iov_len;
++
+ svc_putu32(resv, xdr_zero); /* ACCEPT */
+
+ rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */
+@@ -311,10 +315,12 @@
+ * We do this before anything else in order to get a decent
+ * auth verifier.
+ */
+- if (progp->pg_authenticate != NULL)
+- auth_res = progp->pg_authenticate(rqstp, &auth_stat);
+- else
+- auth_res = svc_authenticate(rqstp, &auth_stat);
++ auth_res = svc_authenticate(rqstp, &auth_stat);
++ /* Also give the program a chance to reject this call: */
++ if (auth_res == SVC_OK) {
++ auth_stat = rpc_autherr_badcred;
++ auth_res = progp->pg_authenticate(rqstp);
++ }
+ switch (auth_res) {
+ case SVC_OK:
+ break;
+@@ -437,7 +443,8 @@
+ err_bad_auth:
+ dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
+ serv->sv_stats->rpcbadauth++;
+- resv->iov_len -= 4;
++ /* Restore write pointer to location of accept status: */
++ xdr_ressize_check(rqstp, accept_statp);
+ svc_putu32(resv, xdr_one); /* REJECT */
+ svc_putu32(resv, xdr_one); /* AUTH_ERROR */
+ svc_putu32(resv, auth_stat); /* status */
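+
+In svc.c, the reject path previously did "resv->iov_len -= 4", which
+assumed exactly one word had been written since the ACCEPT word.  The
+saved accept_statp pointer rewinds correctly however much the auth
+code appended.  What the rewind amounts to, as a sketch
+(xdr_ressize_check() also revalidates the length against the reserved
+reply space):
+
+	static void resv_rewind(struct kvec *resv, u32 *saved)
+	{
+		/* next svc_putu32() overwrites from the saved spot on */
+		resv->iov_len = (char *)saved - (char *)resv->iov_base;
+	}
+
+The accept path also no longer sets rq_client itself; that moved into
+the new svcauth_gss_set_client() op, and svc_authenticate() now always
+runs, with the program given a separate chance to reject the call.
+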
+Index: linux-2.6.10/net/sunrpc/sched.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/sched.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/net/sunrpc/sched.c 2005-04-05 14:49:13.391693472 +0800
+@@ -41,13 +41,7 @@
+
+ static void __rpc_default_timer(struct rpc_task *task);
+ static void rpciod_killall(void);
+-
+-/*
+- * When an asynchronous RPC task is activated within a bottom half
+- * handler, or while executing another RPC task, it is put on
+- * schedq, and rpciod is woken up.
+- */
+-static RPC_WAITQ(schedq, "schedq");
++static void rpc_async_schedule(void *);
+
+ /*
+ * RPC tasks that create another task (e.g. for contacting the portmapper)
+@@ -68,26 +62,18 @@
+ /*
+ * rpciod-related stuff
+ */
+-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle);
+-static DECLARE_COMPLETION(rpciod_killer);
+ static DECLARE_MUTEX(rpciod_sema);
+ static unsigned int rpciod_users;
+-static pid_t rpciod_pid;
+-static int rpc_inhibit;
++static struct workqueue_struct *rpciod_workqueue;
+
+ /*
+- * Spinlock for wait queues. Access to the latter also has to be
+- * interrupt-safe in order to allow timers to wake up sleeping tasks.
+- */
+-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED;
+-/*
+ * Spinlock for other critical sections of code.
+ */
+ static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
+ * Disable the timer for a given RPC task. Should be called with
+- * rpc_queue_lock and bh_disabled in order to avoid races within
++ * queue->lock and bh_disabled in order to avoid races within
+ * rpc_run_timer().
+ */
+ static inline void
+@@ -105,19 +91,19 @@
+ * without calling del_timer_sync(). The latter could cause a
+ * deadlock if called while we're holding spinlocks...
+ */
+-static void
+-rpc_run_timer(struct rpc_task *task)
++static void rpc_run_timer(struct rpc_task *task)
+ {
+ void (*callback)(struct rpc_task *);
+
+- spin_lock_bh(&rpc_queue_lock);
+ callback = task->tk_timeout_fn;
+ task->tk_timeout_fn = NULL;
+- spin_unlock_bh(&rpc_queue_lock);
+- if (callback) {
++ if (callback && RPC_IS_QUEUED(task)) {
+ dprintk("RPC: %4d running timer\n", task->tk_pid);
+ callback(task);
+ }
++ smp_mb__before_clear_bit();
++ clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
++ smp_mb__after_clear_bit();
+ }
+
+ /*
+@@ -136,29 +122,21 @@
+ task->tk_timeout_fn = timer;
+ else
+ task->tk_timeout_fn = __rpc_default_timer;
++ set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate);
+ mod_timer(&task->tk_timer, jiffies + task->tk_timeout);
+ }
+
+ /*
+- * Set up a timer for an already sleeping task.
+- */
+-void rpc_add_timer(struct rpc_task *task, rpc_action timer)
+-{
+- spin_lock_bh(&rpc_queue_lock);
+- if (!RPC_IS_RUNNING(task))
+- __rpc_add_timer(task, timer);
+- spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+-/*
+ * Delete any timer for the current task. Because we use del_timer_sync(),
+- * this function should never be called while holding rpc_queue_lock.
++ * this function should never be called while holding queue->lock.
+ */
+ static inline void
+ rpc_delete_timer(struct rpc_task *task)
+ {
+- if (del_timer_sync(&task->tk_timer))
++ if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) {
++ del_singleshot_timer_sync(&task->tk_timer);
+ dprintk("RPC: %4d deleting timer\n", task->tk_pid);
++ }
+ }
+
+ /*
+@@ -169,16 +147,17 @@
+ struct list_head *q;
+ struct rpc_task *t;
+
++ INIT_LIST_HEAD(&task->u.tk_wait.links);
+ q = &queue->tasks[task->tk_priority];
+ if (unlikely(task->tk_priority > queue->maxpriority))
+ q = &queue->tasks[queue->maxpriority];
+- list_for_each_entry(t, q, tk_list) {
++ list_for_each_entry(t, q, u.tk_wait.list) {
+ if (t->tk_cookie == task->tk_cookie) {
+- list_add_tail(&task->tk_list, &t->tk_links);
++ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+ return;
+ }
+ }
+- list_add_tail(&task->tk_list, q);
++ list_add_tail(&task->u.tk_wait.list, q);
+ }
+
+ /*
+@@ -189,37 +168,21 @@
+ * improve overall performance.
+ * Everyone else gets appended to the queue to ensure proper FIFO behavior.
+ */
+-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+ {
+- if (task->tk_rpcwait == queue)
+- return 0;
++ BUG_ON (RPC_IS_QUEUED(task));
+
+- if (task->tk_rpcwait) {
+- printk(KERN_WARNING "RPC: doubly enqueued task!\n");
+- return -EWOULDBLOCK;
+- }
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_add_wait_queue_priority(queue, task);
+ else if (RPC_IS_SWAPPER(task))
+- list_add(&task->tk_list, &queue->tasks[0]);
++ list_add(&task->u.tk_wait.list, &queue->tasks[0]);
+ else
+- list_add_tail(&task->tk_list, &queue->tasks[0]);
+- task->tk_rpcwait = queue;
++ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
++ task->u.tk_wait.rpc_waitq = queue;
++ rpc_set_queued(task);
+
+ dprintk("RPC: %4d added to queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+-
+- return 0;
+-}
+-
+-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task)
+-{
+- int result;
+-
+- spin_lock_bh(&rpc_queue_lock);
+- result = __rpc_add_wait_queue(q, task);
+- spin_unlock_bh(&rpc_queue_lock);
+- return result;
+ }
+
+ /*
+@@ -229,12 +192,12 @@
+ {
+ struct rpc_task *t;
+
+- if (!list_empty(&task->tk_links)) {
+- t = list_entry(task->tk_links.next, struct rpc_task, tk_list);
+- list_move(&t->tk_list, &task->tk_list);
+- list_splice_init(&task->tk_links, &t->tk_links);
++ if (!list_empty(&task->u.tk_wait.links)) {
++ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
++ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
++ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
+ }
+- list_del(&task->tk_list);
++ list_del(&task->u.tk_wait.list);
+ }
+
+ /*
+@@ -243,31 +206,17 @@
+ */
+ static void __rpc_remove_wait_queue(struct rpc_task *task)
+ {
+- struct rpc_wait_queue *queue = task->tk_rpcwait;
+-
+- if (!queue)
+- return;
++ struct rpc_wait_queue *queue;
++ queue = task->u.tk_wait.rpc_waitq;
+
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_remove_wait_queue_priority(task);
+ else
+- list_del(&task->tk_list);
+- task->tk_rpcwait = NULL;
+-
++ list_del(&task->u.tk_wait.list);
+ dprintk("RPC: %4d removed from queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+ }
+
+-void
+-rpc_remove_wait_queue(struct rpc_task *task)
+-{
+- if (!task->tk_rpcwait)
+- return;
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_remove_wait_queue(task);
+- spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+ static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
+ {
+ queue->priority = priority;
+@@ -290,6 +239,7 @@
+ {
+ int i;
+
++ spin_lock_init(&queue->lock);
+ for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)
+ INIT_LIST_HEAD(&queue->tasks[i]);
+ queue->maxpriority = maxprio;
+@@ -316,34 +266,31 @@
+ * Note: If the task is ASYNC, this must be called with
+ * the spinlock held to protect the wait queue operation.
+ */
+-static inline void
+-rpc_make_runnable(struct rpc_task *task)
++static void rpc_make_runnable(struct rpc_task *task)
+ {
+- if (task->tk_timeout_fn) {
+- printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n");
++ int do_ret;
++
++ BUG_ON(task->tk_timeout_fn);
++ do_ret = rpc_test_and_set_running(task);
++ rpc_clear_queued(task);
++ if (do_ret)
+ return;
+- }
+- rpc_set_running(task);
+ if (RPC_IS_ASYNC(task)) {
+- if (RPC_IS_SLEEPING(task)) {
+- int status;
+- status = __rpc_add_wait_queue(&schedq, task);
+- if (status < 0) {
+- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+- task->tk_status = status;
+- return;
+- }
+- rpc_clear_sleeping(task);
+- wake_up(&rpciod_idle);
++ int status;
++
++ INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
++ status = queue_work(task->tk_workqueue, &task->u.tk_work);
++ if (status < 0) {
++ printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
++ task->tk_status = status;
++ return;
+ }
+- } else {
+- rpc_clear_sleeping(task);
+- wake_up(&task->tk_wait);
+- }
++ } else
++ wake_up(&task->u.tk_wait.waitq);
+ }
+
+ /*
+- * Place a newly initialized task on the schedq.
++ * Place a newly initialized task on the workqueue.
+ */
+ static inline void
+ rpc_schedule_run(struct rpc_task *task)
+@@ -352,33 +299,18 @@
+ if (RPC_IS_ACTIVATED(task))
+ return;
+ task->tk_active = 1;
+- rpc_set_sleeping(task);
+ rpc_make_runnable(task);
+ }
+
+ /*
+- * For other people who may need to wake the I/O daemon
+- * but should (for now) know nothing about its innards
+- */
+-void rpciod_wake_up(void)
+-{
+- if(rpciod_pid==0)
+- printk(KERN_ERR "rpciod: wot no daemon?\n");
+- wake_up(&rpciod_idle);
+-}
+-
+-/*
+ * Prepare for sleeping on a wait queue.
+ * By always appending tasks to the list we ensure FIFO behavior.
+ * NB: An RPC task will only receive interrupt-driven events as long
+ * as it's on a wait queue.
+ */
+-static void
+-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, rpc_action timer)
+ {
+- int status;
+-
+ dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid,
+ rpc_qname(q), jiffies);
+
+@@ -388,49 +320,36 @@
+ }
+
+ /* Mark the task as being activated if so needed */
+- if (!RPC_IS_ACTIVATED(task)) {
++ if (!RPC_IS_ACTIVATED(task))
+ task->tk_active = 1;
+- rpc_set_sleeping(task);
+- }
+
+- status = __rpc_add_wait_queue(q, task);
+- if (status) {
+- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+- task->tk_status = status;
+- } else {
+- rpc_clear_running(task);
+- if (task->tk_callback) {
+- dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid);
+- BUG();
+- }
+- task->tk_callback = action;
+- __rpc_add_timer(task, timer);
+- }
++ __rpc_add_wait_queue(q, task);
++
++ BUG_ON(task->tk_callback != NULL);
++ task->tk_callback = action;
++ __rpc_add_timer(task, timer);
+ }
+
+-void
+-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, rpc_action timer)
+ {
+ /*
+ * Protect the queue operations.
+ */
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&q->lock);
+ __rpc_sleep_on(q, task, action, timer);
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&q->lock);
+ }
+
+ /**
+- * __rpc_wake_up_task - wake up a single rpc_task
++ * __rpc_do_wake_up_task - wake up a single rpc_task
+ * @task: task to be woken up
+ *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold queue->lock, and have cleared the task queued flag.
+ */
+-static void
+-__rpc_wake_up_task(struct rpc_task *task)
++static void __rpc_do_wake_up_task(struct rpc_task *task)
+ {
+- dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n",
+- task->tk_pid, jiffies, rpc_inhibit);
++ dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies);
+
+ #ifdef RPC_DEBUG
+ if (task->tk_magic != 0xf00baa) {
+@@ -445,12 +364,9 @@
+ printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
+ return;
+ }
+- if (RPC_IS_RUNNING(task))
+- return;
+
+ __rpc_disable_timer(task);
+- if (task->tk_rpcwait != &schedq)
+- __rpc_remove_wait_queue(task);
++ __rpc_remove_wait_queue(task);
+
+ rpc_make_runnable(task);
+
+@@ -458,6 +374,18 @@
+ }
+
+ /*
++ * Wake up the specified task
++ */
++static void __rpc_wake_up_task(struct rpc_task *task)
++{
++ if (rpc_start_wakeup(task)) {
++ if (RPC_IS_QUEUED(task))
++ __rpc_do_wake_up_task(task);
++ rpc_finish_wakeup(task);
++ }
++}
++
++/*
+ * Default timeout handler if none specified by user
+ */
+ static void
+@@ -471,14 +399,18 @@
+ /*
+ * Wake up the specified task
+ */
+-void
+-rpc_wake_up_task(struct rpc_task *task)
++void rpc_wake_up_task(struct rpc_task *task)
+ {
+- if (RPC_IS_RUNNING(task))
+- return;
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_wake_up_task(task);
+- spin_unlock_bh(&rpc_queue_lock);
++ if (rpc_start_wakeup(task)) {
++ if (RPC_IS_QUEUED(task)) {
++ struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
++
++ spin_lock_bh(&queue->lock);
++ __rpc_do_wake_up_task(task);
++ spin_unlock_bh(&queue->lock);
++ }
++ rpc_finish_wakeup(task);
++ }
+ }
+
+ /*
+@@ -494,11 +426,11 @@
+ */
+ q = &queue->tasks[queue->priority];
+ if (!list_empty(q)) {
+- task = list_entry(q->next, struct rpc_task, tk_list);
++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ if (queue->cookie == task->tk_cookie) {
+ if (--queue->nr)
+ goto out;
+- list_move_tail(&task->tk_list, q);
++ list_move_tail(&task->u.tk_wait.list, q);
+ }
+ /*
+ * Check if we need to switch queues.
+@@ -516,7 +448,7 @@
+ else
+ q = q - 1;
+ if (!list_empty(q)) {
+- task = list_entry(q->next, struct rpc_task, tk_list);
++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ goto new_queue;
+ }
+ } while (q != &queue->tasks[queue->priority]);
+@@ -541,14 +473,14 @@
+ struct rpc_task *task = NULL;
+
+ dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue));
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ if (RPC_IS_PRIORITY(queue))
+ task = __rpc_wake_up_next_priority(queue);
+ else {
+ task_for_first(task, &queue->tasks[0])
+ __rpc_wake_up_task(task);
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+
+ return task;
+ }
+@@ -557,25 +489,25 @@
+ * rpc_wake_up - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+ */
+ void rpc_wake_up(struct rpc_wait_queue *queue)
+ {
+ struct rpc_task *task;
+
+ struct list_head *head;
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+- task = list_entry(head->next, struct rpc_task, tk_list);
++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+ __rpc_wake_up_task(task);
+ }
+ if (head == &queue->tasks[0])
+ break;
+ head--;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+ }
+
+ /**
+@@ -583,18 +515,18 @@
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+ */
+ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
+ {
+ struct list_head *head;
+ struct rpc_task *task;
+
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+- task = list_entry(head->next, struct rpc_task, tk_list);
++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+ task->tk_status = status;
+ __rpc_wake_up_task(task);
+ }
+@@ -602,7 +534,7 @@
+ break;
+ head--;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+ }
+
+ /*
+@@ -626,22 +558,23 @@
+ /*
+ * This is the RPC `scheduler' (or rather, the finite state machine).
+ */
+-static int
+-__rpc_execute(struct rpc_task *task)
++static int __rpc_execute(struct rpc_task *task)
+ {
+ int status = 0;
+
+ dprintk("RPC: %4d rpc_execute flgs %x\n",
+ task->tk_pid, task->tk_flags);
+
+- if (!RPC_IS_RUNNING(task)) {
+- printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n");
+- return 0;
+- }
++ BUG_ON(RPC_IS_QUEUED(task));
+
+ restarted:
+ while (1) {
+ /*
++ * Garbage collection of pending timers...
++ */
++ rpc_delete_timer(task);
++
++ /*
+ * Execute any pending callback.
+ */
+ if (RPC_DO_CALLBACK(task)) {
+@@ -657,7 +590,9 @@
+ */
+ save_callback=task->tk_callback;
+ task->tk_callback=NULL;
++ lock_kernel();
+ save_callback(task);
++ unlock_kernel();
+ }
+
+ /*
+@@ -665,43 +600,35 @@
+ * tk_action may be NULL when the task has been killed
+ * by someone else.
+ */
+- if (RPC_IS_RUNNING(task)) {
+- /*
+- * Garbage collection of pending timers...
+- */
+- rpc_delete_timer(task);
++ if (!RPC_IS_QUEUED(task)) {
+ if (!task->tk_action)
+ break;
++ lock_kernel();
+ task->tk_action(task);
+- /* micro-optimization to avoid spinlock */
+- if (RPC_IS_RUNNING(task))
+- continue;
++ unlock_kernel();
+ }
+
+ /*
+- * Check whether task is sleeping.
++ * Lockless check for whether task is sleeping or not.
+ */
+- spin_lock_bh(&rpc_queue_lock);
+- if (!RPC_IS_RUNNING(task)) {
+- rpc_set_sleeping(task);
+- if (RPC_IS_ASYNC(task)) {
+- spin_unlock_bh(&rpc_queue_lock);
++ if (!RPC_IS_QUEUED(task))
++ continue;
++ rpc_clear_running(task);
++ if (RPC_IS_ASYNC(task)) {
++ /* Careful! we may have raced... */
++ if (RPC_IS_QUEUED(task))
+ return 0;
+- }
++ if (rpc_test_and_set_running(task))
++ return 0;
++ continue;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
+
+- if (!RPC_IS_SLEEPING(task))
+- continue;
+ /* sync task: sleep here */
+ dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
+- if (current->pid == rpciod_pid)
+- printk(KERN_ERR "RPC: rpciod waiting on sync task!\n");
+-
+ if (RPC_TASK_UNINTERRUPTIBLE(task)) {
+- __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task));
++ __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task));
+ } else {
+- __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status);
++ __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status);
+ /*
+ * When a sync task receives a signal, it exits with
+ * -ERESTARTSYS. In order to catch any callbacks that
+@@ -715,11 +642,14 @@
+ rpc_wake_up_task(task);
+ }
+ }
++ rpc_set_running(task);
+ dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
+ }
+
+ if (task->tk_exit) {
++ lock_kernel();
+ task->tk_exit(task);
++ unlock_kernel();
+ /* If tk_action is non-null, the user wants us to restart */
+ if (task->tk_action) {
+ if (!RPC_ASSASSINATED(task)) {
+@@ -738,7 +668,6 @@
+
+ /* Release all resources associated with the task */
+ rpc_release_task(task);
+-
+ return status;
+ }
+
+@@ -754,57 +683,16 @@
+ int
+ rpc_execute(struct rpc_task *task)
+ {
+- int status = -EIO;
+- if (rpc_inhibit) {
+- printk(KERN_INFO "RPC: execution inhibited!\n");
+- goto out_release;
+- }
+-
+- status = -EWOULDBLOCK;
+- if (task->tk_active) {
+- printk(KERN_ERR "RPC: active task was run twice!\n");
+- goto out_err;
+- }
++ BUG_ON(task->tk_active);
+
+ task->tk_active = 1;
+ rpc_set_running(task);
+ return __rpc_execute(task);
+- out_release:
+- rpc_release_task(task);
+- out_err:
+- return status;
+ }
+
+-/*
+- * This is our own little scheduler for async RPC tasks.
+- */
+-static void
+-__rpc_schedule(void)
++static void rpc_async_schedule(void *arg)
+ {
+- struct rpc_task *task;
+- int count = 0;
+-
+- dprintk("RPC: rpc_schedule enter\n");
+- while (1) {
+-
+- task_for_first(task, &schedq.tasks[0]) {
+- __rpc_remove_wait_queue(task);
+- spin_unlock_bh(&rpc_queue_lock);
+-
+- __rpc_execute(task);
+- spin_lock_bh(&rpc_queue_lock);
+- } else {
+- break;
+- }
+-
+- if (++count >= 200 || need_resched()) {
+- count = 0;
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- }
+- dprintk("RPC: rpc_schedule leave\n");
++ __rpc_execute((struct rpc_task *)arg);
+ }
+
+ /*
+@@ -862,7 +750,6 @@
+ task->tk_client = clnt;
+ task->tk_flags = flags;
+ task->tk_exit = callback;
+- init_waitqueue_head(&task->tk_wait);
+ if (current->uid != current->fsuid || current->gid != current->fsgid)
+ task->tk_flags |= RPC_TASK_SETUID;
+
+@@ -873,7 +760,11 @@
+
+ task->tk_priority = RPC_PRIORITY_NORMAL;
+ task->tk_cookie = (unsigned long)current;
+- INIT_LIST_HEAD(&task->tk_links);
++
++ /* Initialize workqueue for async tasks */
++ task->tk_workqueue = rpciod_workqueue;
++ if (!RPC_IS_ASYNC(task))
++ init_waitqueue_head(&task->u.tk_wait.waitq);
+
+ /* Add to global list of all tasks */
+ spin_lock(&rpc_sched_lock);
+@@ -944,8 +835,7 @@
+ goto out;
+ }
+
+-void
+-rpc_release_task(struct rpc_task *task)
++void rpc_release_task(struct rpc_task *task)
+ {
+ dprintk("RPC: %4d release task\n", task->tk_pid);
+
+@@ -963,19 +853,9 @@
+ list_del(&task->tk_task);
+ spin_unlock(&rpc_sched_lock);
+
+- /* Protect the execution below. */
+- spin_lock_bh(&rpc_queue_lock);
+-
+- /* Disable timer to prevent zombie wakeup */
+- __rpc_disable_timer(task);
+-
+- /* Remove from any wait queue we're still on */
+- __rpc_remove_wait_queue(task);
+-
++ BUG_ON(RPC_IS_QUEUED(task));
+ task->tk_active = 0;
+
+- spin_unlock_bh(&rpc_queue_lock);
+-
+ /* Synchronously delete any running timer */
+ rpc_delete_timer(task);
+
+@@ -1005,10 +885,9 @@
+ * queue 'childq'. If so returns a pointer to the parent.
+ * Upon failure returns NULL.
+ *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold childq.lock
+ */
+-static inline struct rpc_task *
+-rpc_find_parent(struct rpc_task *child)
++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child)
+ {
+ struct rpc_task *task, *parent;
+ struct list_head *le;
+@@ -1021,17 +900,16 @@
+ return NULL;
+ }
+
+-static void
+-rpc_child_exit(struct rpc_task *child)
++static void rpc_child_exit(struct rpc_task *child)
+ {
+ struct rpc_task *parent;
+
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&childq.lock);
+ if ((parent = rpc_find_parent(child)) != NULL) {
+ parent->tk_status = child->tk_status;
+ __rpc_wake_up_task(parent);
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&childq.lock);
+ }
+
+ /*
+@@ -1054,22 +932,20 @@
+ return NULL;
+ }
+
+-void
+-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
+ {
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&childq.lock);
+ /* N.B. Is it possible for the child to have already finished? */
+ __rpc_sleep_on(&childq, task, func, NULL);
+ rpc_schedule_run(child);
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&childq.lock);
+ }
+
+ /*
+ * Kill all tasks for the given client.
+ * XXX: kill their descendants as well?
+ */
+-void
+-rpc_killall_tasks(struct rpc_clnt *clnt)
++void rpc_killall_tasks(struct rpc_clnt *clnt)
+ {
+ struct rpc_task *rovr;
+ struct list_head *le;
+@@ -1091,93 +967,14 @@
+
+ static DECLARE_MUTEX_LOCKED(rpciod_running);
+
+-static inline int
+-rpciod_task_pending(void)
+-{
+- return !list_empty(&schedq.tasks[0]);
+-}
+-
+-
+-/*
+- * This is the rpciod kernel thread
+- */
+-static int
+-rpciod(void *ptr)
+-{
+- int rounds = 0;
+-
+- lock_kernel();
+- /*
+- * Let our maker know we're running ...
+- */
+- rpciod_pid = current->pid;
+- up(&rpciod_running);
+-
+- daemonize("rpciod");
+- allow_signal(SIGKILL);
+-
+- dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid);
+- spin_lock_bh(&rpc_queue_lock);
+- while (rpciod_users) {
+- DEFINE_WAIT(wait);
+- if (signalled()) {
+- spin_unlock_bh(&rpc_queue_lock);
+- rpciod_killall();
+- flush_signals(current);
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- __rpc_schedule();
+- if (current->flags & PF_FREEZE) {
+- spin_unlock_bh(&rpc_queue_lock);
+- refrigerator(PF_FREEZE);
+- spin_lock_bh(&rpc_queue_lock);
+- }
+-
+- if (++rounds >= 64) { /* safeguard */
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- rounds = 0;
+- spin_lock_bh(&rpc_queue_lock);
+- }
+-
+- dprintk("RPC: rpciod back to sleep\n");
+- prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE);
+- if (!rpciod_task_pending() && !signalled()) {
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- rounds = 0;
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- finish_wait(&rpciod_idle, &wait);
+- dprintk("RPC: switch to rpciod\n");
+- }
+- spin_unlock_bh(&rpc_queue_lock);
+-
+- dprintk("RPC: rpciod shutdown commences\n");
+- if (!list_empty(&all_tasks)) {
+- printk(KERN_ERR "rpciod: active tasks at shutdown?!\n");
+- rpciod_killall();
+- }
+-
+- dprintk("RPC: rpciod exiting\n");
+- unlock_kernel();
+-
+- rpciod_pid = 0;
+- complete_and_exit(&rpciod_killer, 0);
+- return 0;
+-}
+-
+-static void
+-rpciod_killall(void)
++static void rpciod_killall(void)
+ {
+ unsigned long flags;
+
+ while (!list_empty(&all_tasks)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ rpc_killall_tasks(NULL);
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_schedule();
+- spin_unlock_bh(&rpc_queue_lock);
++ flush_workqueue(rpciod_workqueue);
+ if (!list_empty(&all_tasks)) {
+ dprintk("rpciod_killall: waiting for tasks to exit\n");
+ yield();
+@@ -1195,28 +992,32 @@
+ int
+ rpciod_up(void)
+ {
++ struct workqueue_struct *wq;
+ int error = 0;
+
+ down(&rpciod_sema);
+- dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users);
++ dprintk("rpciod_up: users %d\n", rpciod_users);
+ rpciod_users++;
+- if (rpciod_pid)
++ if (rpciod_workqueue)
+ goto out;
+ /*
+- * If there's no pid, we should be the first user.
++ * If there's no workqueue, we should be the first user.
+ */
+ if (rpciod_users > 1)
+- printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users);
++ printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users);
+ /*
+- * Create the rpciod thread and wait for it to start.
++ * Create the rpciod workqueue.
+ */
+- error = kernel_thread(rpciod, NULL, 0);
+- if (error < 0) {
+- printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error);
++ error = -ENOMEM;
++ wq = create_workqueue("rpciod");
++ if (wq == NULL) {
++ printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error);
+ rpciod_users--;
+ goto out;
+ }
+- down(&rpciod_running);
++ rpciod_workqueue = wq;
+ error = 0;
+ out:
+ up(&rpciod_sema);
+@@ -1227,20 +1026,21 @@
+ rpciod_down(void)
+ {
+ down(&rpciod_sema);
+- dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users);
++ dprintk("rpciod_down sema %d\n", rpciod_users);
+ if (rpciod_users) {
+ if (--rpciod_users)
+ goto out;
+ } else
+- printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid);
++ printk(KERN_WARNING "rpciod_down: no users??\n");
+
+- if (!rpciod_pid) {
++ if (!rpciod_workqueue) {
+ dprintk("rpciod_down: Nothing to do!\n");
+ goto out;
+ }
++ rpciod_killall();
+
+- kill_proc(rpciod_pid, SIGKILL, 1);
+- wait_for_completion(&rpciod_killer);
++ destroy_workqueue(rpciod_workqueue);
++ rpciod_workqueue = NULL;
+ out:
+ up(&rpciod_sema);
+ }
+@@ -1258,7 +1058,12 @@
+ }
+ printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
+ "-rpcwait -action- --exit--\n");
+- alltask_for_each(t, le, &all_tasks)
++ alltask_for_each(t, le, &all_tasks) {
++ const char *rpc_waitq = "none";
++
++ if (RPC_IS_QUEUED(t))
++ rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
++
+ printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n",
+ t->tk_pid,
+ (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+@@ -1266,8 +1071,9 @@
+ t->tk_client,
+ (t->tk_client ? t->tk_client->cl_prog : 0),
+ t->tk_rqstp, t->tk_timeout,
+- rpc_qname(t->tk_rpcwait),
++ rpc_waitq,
+ t->tk_action, t->tk_exit);
++ }
+ spin_unlock(&rpc_sched_lock);
+ }
+ #endif
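+
+The sched.c rewrite above drops the global rpc_queue_lock in favour of
+a per-wait-queue spinlock plus lockless state bits in tk_runstate.  The
+bit helpers themselves live in the include/linux/sunrpc/sched.h part of
+this series; a minimal sketch of what they are assumed to look like
+(names taken from their use above; the authoritative definitions may
+differ):
+
+	#define RPC_TASK_RUNNING	0
+	#define RPC_TASK_QUEUED		1
+	#define RPC_TASK_WAKEUP		2
+	#define RPC_TASK_HAS_TIMER	3
+
+	#define RPC_IS_QUEUED(t) \
+		(test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
+	#define rpc_set_queued(t) \
+		set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
+	#define rpc_clear_queued(t) \
+		do { \
+			smp_mb__before_clear_bit(); \
+			clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \
+			smp_mb__after_clear_bit(); \
+		} while (0)
+	#define rpc_set_running(t) \
+		set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
+	#define rpc_test_and_set_running(t) \
+		test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
+	/* only one caller at a time may try to wake a task up */
+	#define rpc_start_wakeup(t) \
+		(test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0)
+	#define rpc_finish_wakeup(t) \
+		do { \
+			smp_mb__before_clear_bit(); \
+			clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \
+			smp_mb__after_clear_bit(); \
+		} while (0)
+
+The key ordering: rpc_make_runnable() sets RUNNING before clearing
+QUEUED, and only the caller that wins rpc_start_wakeup() may dequeue a
+task, so a task is never executed twice nor lost between a wait queue
+and the workqueue.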
+Index: linux-2.6.10/net/sunrpc/sunrpc_syms.c
+===================================================================
+--- linux-2.6.10.orig/net/sunrpc/sunrpc_syms.c 2004-12-25 05:35:25.000000000 +0800
++++ linux-2.6.10/net/sunrpc/sunrpc_syms.c 2005-04-05 14:49:13.411690432 +0800
+@@ -58,6 +58,9 @@
+ EXPORT_SYMBOL(rpc_wake_up);
+ EXPORT_SYMBOL(rpc_queue_upcall);
+ EXPORT_SYMBOL(rpc_mkpipe);
++EXPORT_SYMBOL(rpc_mkdir);
++EXPORT_SYMBOL(rpc_rmdir);
++
+
+ /* Client transport */
+ EXPORT_SYMBOL(xprt_create_proto);
+@@ -90,6 +93,7 @@
+ EXPORT_SYMBOL(svc_auth_register);
+ EXPORT_SYMBOL(auth_domain_lookup);
+ EXPORT_SYMBOL(svc_authenticate);
++EXPORT_SYMBOL(svc_set_client);
+
+ /* RPC statistics */
+ #ifdef CONFIG_PROC_FS
+Index: linux-2.6.10/kernel/exit.c
+===================================================================
+--- linux-2.6.10.orig/kernel/exit.c 2005-04-05 14:48:52.534864192 +0800
++++ linux-2.6.10/kernel/exit.c 2005-04-05 14:50:57.737830448 +0800
+@@ -848,6 +848,8 @@
+ for (;;) ;
+ }
+
++EXPORT_SYMBOL(do_exit);
++
+ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+ {
+ if (comp)
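+
+do_exit() is exported because module code in this series terminates
+kernel threads explicitly rather than returning from the thread
+function; see do_recall() in the nfs4state.c patch below, which runs as
+a kthread and ends with do_exit(0).  A minimal sketch of that pattern
+(hypothetical thread function, not part of the patch):
+
+	static int example_thread(void *arg)
+	{
+		/* ... perform the work ... */
+		do_exit(0);	/* terminates the thread; never returns */
+		return 0;	/* unreachable, keeps the compiler happy */
+	}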
+Index: linux-2.6.10/fs/locks.c
+===================================================================
+--- linux-2.6.10.orig/fs/locks.c 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/fs/locks.c 2005-04-05 14:49:13.434686936 +0800
+@@ -1096,15 +1096,13 @@
+ */
+ void remove_lease(struct file_lock *fl)
+ {
+- if (!IS_LEASE(fl))
+- return;
+-
+ lock_kernel();
+-
++ if (!fl || !IS_LEASE(fl))
++ goto out;
+ fl->fl_type = F_UNLCK | F_INPROGRESS;
+ fl->fl_break_time = jiffies - 10;
+ time_out_leases(fl->fl_file->f_dentry->d_inode);
+-
++out:
+ unlock_kernel();
+ }
+
+@@ -1563,9 +1561,6 @@
+ error = filp->f_op->lock(filp, F_GETLK, &file_lock);
+ if (error < 0)
+ goto out;
+- else if (error == LOCK_USE_CLNT)
+- /* Bypass for NFS with no locking - 2.0.36 compat */
+- fl = posix_test_lock(filp, &file_lock);
+ else
+ fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
+ } else {
+@@ -1708,9 +1703,6 @@
+ error = filp->f_op->lock(filp, F_GETLK, &file_lock);
+ if (error < 0)
+ goto out;
+- else if (error == LOCK_USE_CLNT)
+- /* Bypass for NFS with no locking - 2.0.36 compat */
+- fl = posix_test_lock(filp, &file_lock);
+ else
+ fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
+ } else {
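+
+remove_lease() is now tolerant of a NULL file_lock and performs the
+check under the BKL.  The nfsd delegation code later in this series
+relies on that: release_delegation() passes dp->dl_flock, which the
+lease callbacks set and clear under lock_kernel(), so it may already be
+NULL by the time the lease is torn down:
+
+	/* caller pattern this change protects (cf. release_delegation) */
+	remove_lease(dp->dl_flock);	/* dl_flock may already be NULL */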
+Index: linux-2.6.10/fs/dcache.c
+===================================================================
+--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/dcache.c 2005-04-05 14:49:13.413690128 +0800
+@@ -789,6 +789,54 @@
+ }
+
+ /**
++ * d_instantiate_unique - instantiate a non-aliased dentry
++ * @entry: dentry to instantiate
++ * @inode: inode to attach to this dentry
++ *
++ * Fill in inode information in the entry. On success, it returns NULL.
++ * If an unhashed alias of "entry" already exists, then we return the
++ * aliased dentry instead.
++ *
++ * Note that in order to avoid conflicts with rename() etc, the caller
++ * had better be holding the parent directory semaphore.
++ */
++struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
++{
++ struct dentry *alias;
++ int len = entry->d_name.len;
++ const char *name = entry->d_name.name;
++ unsigned int hash = entry->d_name.hash;
++
++ BUG_ON(!list_empty(&entry->d_alias));
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_negative;
++ list_for_each_entry(alias, &inode->i_dentry, d_alias) {
++ struct qstr *qstr = &alias->d_name;
++
++ if (qstr->hash != hash)
++ continue;
++ if (alias->d_parent != entry->d_parent)
++ continue;
++ if (qstr->len != len)
++ continue;
++ if (memcmp(qstr->name, name, len))
++ continue;
++ dget_locked(alias);
++ spin_unlock(&dcache_lock);
++ BUG_ON(!d_unhashed(alias));
++ return alias;
++ }
++ list_add(&entry->d_alias, &inode->i_dentry);
++do_negative:
++ entry->d_inode = inode;
++ spin_unlock(&dcache_lock);
++ security_d_instantiate(entry, inode);
++ return NULL;
++}
++EXPORT_SYMBOL(d_instantiate_unique);
++
++/**
+ * d_alloc_root - allocate root dentry
+ * @root_inode: inode to allocate the root for
+ *
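+
+d_instantiate_unique() lets a filesystem reuse an existing unhashed
+alias instead of creating a second dentry for the same name.  A sketch
+of a calling pattern (hypothetical filesystem code, not part of this
+patch; the parent directory's i_sem is assumed held, as the comment
+above requires):
+
+	static struct dentry *example_instantiate(struct dentry *dentry,
+						  struct inode *inode)
+	{
+		struct dentry *alias;
+
+		alias = d_instantiate_unique(dentry, inode);
+		if (alias == NULL)
+			return dentry;	/* inode now attached to dentry */
+		/*
+		 * An unhashed alias already existed: it is returned with
+		 * a reference held, and our inode reference was not
+		 * consumed, so drop the inode and use the alias; the
+		 * unused dentry remains the caller's to release.
+		 */
+		iput(inode);
+		return alias;
+	}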
+Index: linux-2.6.10/fs/lockd/svc.c
+===================================================================
+--- linux-2.6.10.orig/fs/lockd/svc.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/lockd/svc.c 2005-04-05 14:49:13.458683288 +0800
+@@ -418,6 +418,38 @@
+ return 0; \
+ }
+
++static inline int is_callback(u32 proc)
++{
++ return proc == NLMPROC_GRANTED
++ || proc == NLMPROC_GRANTED_MSG
++ || proc == NLMPROC_TEST_RES
++ || proc == NLMPROC_LOCK_RES
++ || proc == NLMPROC_CANCEL_RES
++ || proc == NLMPROC_UNLOCK_RES
++ || proc == NLMPROC_NSM_NOTIFY;
++}
++
++
++static int lockd_authenticate(struct svc_rqst *rqstp)
++{
++ rqstp->rq_client = NULL;
++ switch (rqstp->rq_authop->flavour) {
++ case RPC_AUTH_NULL:
++ case RPC_AUTH_UNIX:
++ if (rqstp->rq_proc == 0)
++ return SVC_OK;
++ if (is_callback(rqstp->rq_proc)) {
++ /* Leave it to individual procedures to
++ * call nlmsvc_lookup_host(rqstp)
++ */
++ return SVC_OK;
++ }
++ return svc_set_client(rqstp);
++ }
++ return SVC_DENIED;
++}
++
++
+ param_set_min_max(port, int, simple_strtol, 0, 65535)
+ param_set_min_max(grace_period, unsigned long, simple_strtoul,
+ nlm_grace_period_min, nlm_grace_period_max)
+@@ -498,4 +530,5 @@
+ .pg_name = "lockd", /* service name */
+ .pg_class = "nfsd", /* share authentication with nfsd */
+ .pg_stats = &nlmsvc_stats, /* stats table */
++ .pg_authenticate = &lockd_authenticate /* export authentication */
+ };
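+
+The pg_authenticate hook above lets lockd accept NLM callback
+procedures (GRANTED, the *_RES replies and NSM_NOTIFY) without the
+per-export check that svc_set_client() performs, while ordinary lock
+requests still go through it.  The svc dispatcher is assumed to invoke
+the hook along these lines (a sketch; the real code is in
+net/sunrpc/svc.c and may differ):
+
+	/* inside svc_process(), once the RPC credential is parsed */
+	if (progp->pg_authenticate) {
+		switch (progp->pg_authenticate(rqstp)) {
+		case SVC_OK:
+			break;		/* go on to dispatch the procedure */
+		case SVC_DENIED:
+			goto err_bad_auth;
+		default:
+			goto dropit;	/* e.g. SVC_DROP */
+		}
+	}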
+Index: linux-2.6.10/fs/nfsd/nfs4xdr.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4xdr.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfs4xdr.c 2005-04-05 14:49:13.425688304 +0800
+@@ -60,121 +60,6 @@
+
+ #define NFSDDBG_FACILITY NFSDDBG_XDR
+
+-static const char utf8_byte_len[256] = {
+- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+- 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
+-};
+-
+-static inline int
+-is_legal_utf8_sequence(unsigned char *source, int length)
+-{
+- unsigned char *ptr;
+- unsigned char c;
+-
+- if (length==1) return 1;
+-
+- /* Check for overlong sequence, and check second byte */
+- c = *(source + 1);
+- switch (*source) {
+- case 0xE0: /* 3 bytes */
+- if ( c < 0xA0 ) return 0;
+- break;
+- case 0xF0: /* 4 bytes */
+- if ( c < 0x90 ) return 0;
+- break;
+- case 0xF8: /* 5 bytes */
+- if ( c < 0xC8 ) return 0;
+- break;
+- case 0xFC: /* 6 bytes */
+- if ( c < 0x84 ) return 0;
+- break;
+- default:
+- if ( (c & 0xC0) != 0x80) return 0;
+- }
+-
+- /* Check that trailing bytes look like 10xxxxxx */
+- for (ptr = source++ + length - 1; ptr>source; ptr--)
+- if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
+- return 1;
+-}
+-
+-/* This does some screening on disallowed unicode characters. It is NOT
+- * comprehensive.
+- */
+-static int
+-is_allowed_utf8_char(unsigned char *source, int length)
+-{
+- /* We assume length and source point to a valid utf8 sequence */
+- unsigned char c;
+-
+- /* Disallow F0000 and up (in utf8, F3B08080) */
+- if (*source > 0xF3 ) return 0;
+- c = *(source + 1);
+- switch (*source) {
+- case 0xF3:
+- if (c >= 0xB0) return 0;
+- break;
+- /* Disallow D800-F8FF (in utf8, EDA080-EFA3BF */
+- case 0xED:
+- if (c >= 0xA0) return 0;
+- break;
+- case 0xEE:
+- return 0;
+- break;
+- case 0xEF:
+- if (c <= 0xA3) return 0;
+- /* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
+- if (c==0xBF)
+- /* Don't need to check <=0xBF, since valid utf8 */
+- if ( *(source+2) >= 0xB9) return 0;
+- break;
+- }
+- return 1;
+-}
+-
+-/* This routine should really check to see that the proper stringprep
+- * mappings have been applied. Instead, we do a simple screen of some
+- * of the more obvious illegal values by calling is_allowed_utf8_char.
+- * This will allow many illegal strings through, but if a client behaves,
+- * it will get full functionality. The other option (apart from full
+- * stringprep checking) is to limit everything to an easily handled subset,
+- * such as 7-bit ascii.
+- *
+- * Note - currently calling routines ignore return value except as boolean.
+- */
+-static int
+-check_utf8(char *str, int len)
+-{
+- unsigned char *chunk, *sourceend;
+- int chunklen;
+-
+- chunk = str;
+- sourceend = str + len;
+-
+- while (chunk < sourceend) {
+- chunklen = utf8_byte_len[*chunk];
+- if (!chunklen)
+- return nfserr_inval;
+- if (chunk + chunklen > sourceend)
+- return nfserr_inval;
+- if (!is_legal_utf8_sequence(chunk, chunklen))
+- return nfserr_inval;
+- if (!is_allowed_utf8_char(chunk, chunklen))
+- return nfserr_inval;
+- if ( (chunklen==1) && (!*chunk) )
+- return nfserr_inval; /* Disallow embedded nulls */
+- chunk += chunklen;
+- }
+-
+- return 0;
+-}
+-
+ static int
+ check_filename(char *str, int len, int err)
+ {
+@@ -187,7 +72,7 @@
+ for (i = 0; i < len; i++)
+ if (str[i] == '/')
+ return err;
+- return check_utf8(str, len);
++ return 0;
+ }
+
+ /*
+@@ -403,8 +288,6 @@
+ READ_BUF(dummy32);
+ len += XDR_QUADLEN(dummy32) << 2;
+ READMEM(buf, dummy32);
+- if (check_utf8(buf, dummy32))
+- return nfserr_inval;
+ ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
+ status = 0;
+ if (ace.whotype != NFS4_ACL_WHO_NAMED)
+@@ -439,8 +322,6 @@
+ READ_BUF(dummy32);
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+- if (check_utf8(buf, dummy32))
+- return nfserr_inval;
+ if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+ goto out_nfserr;
+ iattr->ia_valid |= ATTR_UID;
+@@ -452,8 +333,6 @@
+ READ_BUF(dummy32);
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+- if (check_utf8(buf, dummy32))
+- return nfserr_inval;
+ if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+ goto out_nfserr;
+ iattr->ia_valid |= ATTR_GID;
+@@ -525,7 +404,7 @@
+ }
+ }
+ if (len != expected_len)
+- goto xdr_error;
++ printk("nfsd: funky nfs4 client sent extra bytes in setattr\n");
+
+ DECODE_TAIL;
+
+@@ -585,8 +464,6 @@
+ READ32(create->cr_linklen);
+ READ_BUF(create->cr_linklen);
+ SAVEMEM(create->cr_linkname, create->cr_linklen);
+- if (check_utf8(create->cr_linkname, create->cr_linklen))
+- return nfserr_inval;
+ break;
+ case NF4BLK:
+ case NF4CHR:
+@@ -615,6 +492,18 @@
+ }
+
+ static inline int
++nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
++{
++ DECODE_HEAD;
++
++ READ_BUF(sizeof(stateid_t));
++ READ32(dr->dr_stateid.si_generation);
++ COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
++
++ DECODE_TAIL;
++}
++
++static inline int
+ nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
+ {
+ return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
+@@ -790,8 +679,8 @@
+ READ32(open->op_delegate_type);
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+- READ_BUF(sizeof(delegation_stateid_t) + 4);
+- COPYMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t));
++ READ_BUF(sizeof(stateid_t) + 4);
++ COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ READ32(open->op_fname.len);
+ READ_BUF(open->op_fname.len);
+ SAVEMEM(open->op_fname.data, open->op_fname.len);
+@@ -825,7 +714,7 @@
+ DECODE_HEAD;
+
+ open_down->od_stateowner = NULL;
+- READ_BUF(4 + sizeof(stateid_t));
++ READ_BUF(12 + sizeof(stateid_t));
+ READ32(open_down->od_stateid.si_generation);
+ COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+ READ32(open_down->od_seqid);
+@@ -1170,6 +1059,9 @@
+ case OP_CREATE:
+ op->status = nfsd4_decode_create(argp, &op->u.create);
+ break;
++ case OP_DELEGRETURN:
++ op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn);
++ break;
+ case OP_GETATTR:
+ op->status = nfsd4_decode_getattr(argp, &op->u.getattr);
+ break;
+@@ -1425,7 +1317,7 @@
+ if (status)
+ goto out_nfserr;
+ }
+- if ((bmval0 & FATTR4_WORD0_FILEHANDLE) && !fhp) {
++ if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+ fh_init(&tempfh, NFS4_FHSIZE);
+ status = fh_compose(&tempfh, exp, dentry, NULL);
+ if (status)
+@@ -1471,7 +1363,10 @@
+ if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
+ if ((buflen -= 4) < 0)
+ goto out_resource;
+- WRITE32( NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME );
++ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
++ WRITE32(NFS4_FH_PERSISTENT);
++ else
++ WRITE32(NFS4_FH_VOL_RENAME);
+ }
+ if (bmval0 & FATTR4_WORD0_CHANGE) {
+ /*
+@@ -1508,10 +1403,15 @@
+ if (bmval0 & FATTR4_WORD0_FSID) {
+ if ((buflen -= 16) < 0)
+ goto out_resource;
+- WRITE32(0);
+- WRITE32(MAJOR(stat.dev));
+- WRITE32(0);
+- WRITE32(MINOR(stat.dev));
++ if (is_fsid(fhp, rqstp->rq_reffh)) {
++ WRITE64((u64)exp->ex_fsid);
++ WRITE64((u64)0);
++ } else {
++ WRITE32(0);
++ WRITE32(MAJOR(stat.dev));
++ WRITE32(0);
++ WRITE32(MINOR(stat.dev));
++ }
+ }
+ if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
+ if ((buflen -= 4) < 0)
+@@ -1765,17 +1665,65 @@
+ }
+
+ static int
++nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
++ const char *name, int namlen, u32 *p, int *buflen)
++{
++ struct svc_export *exp = cd->rd_fhp->fh_export;
++ struct dentry *dentry;
++ int nfserr;
++
++ dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
++ if (IS_ERR(dentry))
++ return nfserrno(PTR_ERR(dentry));
++
++ exp_get(exp);
++ if (d_mountpoint(dentry)) {
++ if (nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp)) {
++ /*
++ * -EAGAIN is the only error returned from
++ * nfsd_cross_mnt() and it indicates that an
++ * up-call has been initiated to fill in the export
++ * options on exp. When the answer comes back,
++ * this call will be retried.
++ */
++ nfserr = nfserr_dropit;
++ goto out_put;
++ }
++
++ }
++ nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
++ cd->rd_rqstp);
++out_put:
++ dput(dentry);
++ exp_put(exp);
++ return nfserr;
++}
++
++static u32 *
++nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr)
++{
++ u32 *attrlenp;
++
++ if (buflen < 6)
++ return NULL;
++ *p++ = htonl(2);
++ *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
++ *p++ = htonl(0); /* bmval1 */
++
++ attrlenp = p++;
++ *p++ = nfserr; /* no htonl */
++ *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
++ return p;
++}
++
++static int
+ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
+ loff_t offset, ino_t ino, unsigned int d_type)
+ {
+ struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
+ int buflen;
+ u32 *p = cd->buffer;
+- u32 *attrlenp;
+- struct dentry *dentry;
+- struct svc_export *exp = cd->rd_fhp->fh_export;
+- u32 bmval0, bmval1;
+- int nfserr = 0;
++ int nfserr = nfserr_toosmall;
+
+ /* In nfsv4, "." and ".." never make it onto the wire.. */
+ if (name && isdotent(name, namlen)) {
+@@ -1788,106 +1736,44 @@
+
+ buflen = cd->buflen - 4 - XDR_QUADLEN(namlen);
+ if (buflen < 0)
+- goto nospc;
++ goto fail;
+
+ *p++ = xdr_one; /* mark entry present */
+ cd->offset = p; /* remember pointer */
+ p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
+ p = xdr_encode_array(p, name, namlen); /* name length & name */
+
+- /*
+- * Now we come to the ugly part: writing the fattr for this entry.
+- */
+- bmval0 = cd->rd_bmval[0];
+- bmval1 = cd->rd_bmval[1];
+- if ((bmval0 & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_FILEID)) || bmval1) {
+- /*
+- * "Heavyweight" case: we have no choice except to
+- * call nfsd4_encode_fattr().
+- */
+- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+- if (IS_ERR(dentry)) {
+- nfserr = nfserrno(PTR_ERR(dentry));
+- goto error;
+- }
+-
+- exp_get(exp);
+- if (d_mountpoint(dentry)) {
+- if ((nfserr = nfsd_cross_mnt(cd->rd_rqstp, &dentry,
+- &exp))) {
+- /*
+- * -EAGAIN is the only error returned from
+- * nfsd_cross_mnt() and it indicates that an
+- * up-call has been initiated to fill in the export
+- * options on exp. When the answer comes back,
+- * this call will be retried.
+- */
+- dput(dentry);
+- exp_put(exp);
+- nfserr = nfserr_dropit;
+- goto error;
+- }
+-
+- }
+-
+- nfserr = nfsd4_encode_fattr(NULL, exp,
+- dentry, p, &buflen, cd->rd_bmval,
+- cd->rd_rqstp);
+- dput(dentry);
+- exp_put(exp);
+- if (!nfserr) {
+- p += buflen;
+- goto out;
+- }
+- if (nfserr == nfserr_resource)
+- goto nospc;
+-
+-error:
++ nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen);
++ switch (nfserr) {
++ case nfs_ok:
++ p += buflen;
++ break;
++ case nfserr_resource:
++ nfserr = nfserr_toosmall;
++ goto fail;
++ case nfserr_dropit:
++ goto fail;
++ default:
+ /*
+- * If we get here, we experienced a miscellaneous
+- * failure while writing the attributes. If the
+- * client requested the RDATTR_ERROR attribute,
++ * If the client requested the RDATTR_ERROR attribute,
+ * we stuff the error code into this attribute
+ * and continue. If this attribute was not requested,
+ * then in accordance with the spec, we fail the
+ * entire READDIR operation(!)
+ */
+- if (!(bmval0 & FATTR4_WORD0_RDATTR_ERROR)) {
+- cd->common.err = nfserr;
+- return -EINVAL;
+- }
+-
+- bmval0 = FATTR4_WORD0_RDATTR_ERROR;
+- bmval1 = 0;
+- /* falling through here will do the right thing... */
++ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
++ goto fail;
++ nfserr = nfserr_toosmall;
++ p = nfsd4_encode_rdattr_error(p, buflen, nfserr);
++ if (p == NULL)
++ goto fail;
+ }
+-
+- /*
+- * In the common "lightweight" case, we avoid
+- * the overhead of nfsd4_encode_fattr() by assembling
+- * a small fattr by hand.
+- */
+- if (buflen < 6)
+- goto nospc;
+- *p++ = htonl(2);
+- *p++ = htonl(bmval0);
+- *p++ = htonl(bmval1);
+-
+- attrlenp = p++;
+- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR)
+- *p++ = nfserr; /* no htonl */
+- if (bmval0 & FATTR4_WORD0_FILEID)
+- p = xdr_encode_hyper(p, (u64)ino);
+- *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
+-
+-out:
+ cd->buflen -= (p - cd->buffer);
+ cd->buffer = p;
+ cd->common.err = nfs_ok;
+ return 0;
+-
+-nospc:
+- cd->common.err = nfserr_toosmall;
++fail:
++ cd->common.err = nfserr;
+ return -EINVAL;
+ }
+
+@@ -2081,8 +1967,8 @@
+ case NFS4_OPEN_DELEGATE_NONE:
+ break;
+ case NFS4_OPEN_DELEGATE_READ:
+- RESERVE_SPACE(20 + sizeof(delegation_stateid_t));
+- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t));
++ RESERVE_SPACE(20 + sizeof(stateid_t));
++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ WRITE32(0);
+
+ /*
+@@ -2095,8 +1981,8 @@
+ ADJUST_ARGS();
+ break;
+ case NFS4_OPEN_DELEGATE_WRITE:
+- RESERVE_SPACE(32 + sizeof(delegation_stateid_t));
+- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t));
++ RESERVE_SPACE(32 + sizeof(stateid_t));
++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ WRITE32(0);
+
+ /*
+@@ -2185,10 +2071,17 @@
+ }
+ read->rd_vlen = v;
+
+- nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp,
+- read->rd_offset,
+- read->rd_iov, read->rd_vlen,
+- &maxcount);
++ if (read->rd_filp)
++ nfserr = nfsd_vfs_read(read->rd_rqstp, read->rd_fhp,
++ read->rd_filp, read->rd_offset,
++ read->rd_iov, read->rd_vlen,
++ &maxcount);
++ else
++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp,
++ read->rd_offset,
++ read->rd_iov, read->rd_vlen,
++ &maxcount);
++
+ if (nfserr == nfserr_symlink)
+ nfserr = nfserr_inval;
+ if (nfserr)
+@@ -2460,6 +2353,8 @@
+ case OP_CREATE:
+ nfsd4_encode_create(resp, op->status, &op->u.create);
+ break;
++ case OP_DELEGRETURN:
++ break;
+ case OP_GETATTR:
+ op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr);
+ break;
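+
+For reference, nfsd4_encode_rdattr_error() above emits a five-word XDR
+attribute block (the buflen < 6 check is one word conservative), laid
+out as:
+
+	word 0:  2                           bitmap length
+	word 1:  FATTR4_WORD0_RDATTR_ERROR   bmval0
+	word 2:  0                           bmval1
+	word 3:  4                           attribute data length, patched in last
+	word 4:  nfserr                      already in network order, hence no htonl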
+Index: linux-2.6.10/fs/nfsd/nfs4state.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4state.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfs4state.c 2005-04-05 14:49:13.421688912 +0800
+@@ -44,6 +44,7 @@
+ #include <linux/mount.h>
+ #include <linux/workqueue.h>
+ #include <linux/smp_lock.h>
++#include <linux/kthread.h>
+ #include <linux/nfs4.h>
+ #include <linux/nfsd/state.h>
+ #include <linux/nfsd/xdr4.h>
+@@ -56,9 +57,11 @@
+ static u32 nfs4_reclaim_init = 0;
+ time_t boot_time;
+ static time_t grace_end = 0;
++static u32 first_run = 1; /* laundromat thread's first run */
+ static u32 current_clientid = 1;
+-static u32 current_ownerid;
+-static u32 current_fileid;
++static u32 current_ownerid = 1;
++static u32 current_fileid = 1;
++static u32 current_delegid = 1;
+ static u32 nfs4_init;
+ stateid_t zerostateid; /* bits all 0 */
+ stateid_t onestateid; /* bits all 1 */
+@@ -70,14 +73,17 @@
+ u32 del_perclient = 0;
+ u32 alloc_file = 0;
+ u32 free_file = 0;
+-u32 alloc_sowner = 0;
+-u32 free_sowner = 0;
+ u32 vfsopen = 0;
+ u32 vfsclose = 0;
+-u32 alloc_lsowner= 0;
++u32 alloc_delegation = 0;
++u32 free_delegation = 0;
+
+ /* forward declarations */
+ struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
++static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
++static void release_delegation(struct nfs4_delegation *dp);
++static void release_stateid_lockowner(struct nfs4_stateid *open_stp);
++extern char recovery_dirname[];
+
+ /* Locking:
+ *
+@@ -117,6 +123,112 @@
+ static void release_stateid(struct nfs4_stateid *stp, int flags);
+ static void release_file(struct nfs4_file *fp);
+
++/*
++ * Delegation state
++ */
++
++/* recall_lock protects the del_recall_lru */
++spinlock_t recall_lock;
++static struct list_head del_recall_lru;
++
++static struct nfs4_delegation *
++alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
++{
++ struct nfs4_delegation *dp;
++ struct nfs4_file *fp = stp->st_file;
++
++ dprintk("NFSD alloc_init_deleg\n");
++ if ((dp = kmalloc(sizeof(struct nfs4_delegation),
++ GFP_KERNEL)) == NULL)
++ return dp;
++ INIT_LIST_HEAD(&dp->dl_del_perfile);
++ INIT_LIST_HEAD(&dp->dl_del_perclnt);
++ INIT_LIST_HEAD(&dp->dl_recall_lru);
++ dp->dl_client = clp;
++ dp->dl_file = fp;
++ dp->dl_flock = NULL;
++ dp->dl_stp = stp;
++ dp->dl_flags = 0;
++ dp->dl_type = type;
++ dp->dl_recall.cbr_dp = NULL;
++ dp->dl_recall.cbr_ident = 0;
++ dp->dl_recall.cbr_trunc = 0;
++ dp->dl_stateid.si_boot = boot_time;
++ dp->dl_stateid.si_stateownerid = current_delegid++;
++ dp->dl_stateid.si_fileid = 0;
++ dp->dl_stateid.si_generation = 0;
++ dp->dl_fhlen = current_fh->fh_handle.fh_size;
++ memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
++ current_fh->fh_handle.fh_size);
++ dp->dl_time = 0;
++ atomic_set(&dp->dl_state, NFS4_NO_RECALL);
++ atomic_set(&dp->dl_count, 1);
++ atomic_set(&dp->dl_recall_cnt, 0);
++ list_add(&dp->dl_del_perfile, &fp->fi_del_perfile);
++ list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt);
++ alloc_delegation++;
++ return dp;
++}
++
++/*
++ * Free the delegation structure.
++ */
++static void
++nfs4_free_delegation(struct nfs4_delegation *dp)
++{
++ dprintk("NFSD: nfs4_free_delegation freeing dp %p\n",dp);
++ list_del(&dp->dl_recall_lru);
++ kfree(dp);
++ free_delegation++;
++}
++
++/* release_delegation:
++ *
++ * lease_modify() is called to remove the FL_LEASE file_lock from
++ * the i_flock list, eventually calling nfsd's lock_manager
++ * fl_release_private callback.
++ *
++ * call either:
++ * nfsd_close : if last close, locks_remove_flock calls lease_modify.
++ * otherwise, recalled state set to NFS4_RECALL_COMPLETE
++ * so that it will be reaped by the laundromat service.
++ * or
++ * remove_lease (calls time_out_leases which calls lease_modify).
++ * and nfs4_free_delegation.
++ *
++ * lock_kernel() protects dp->dl_flock which is set under the kernel lock
++ * by nfsd_copy_lock_deleg_callback and nfsd_release_deleg_callback.
++ *
++ */
++
++static void
++release_delegation(struct nfs4_delegation *dp)
++{
++ /* delayed nfsd_close */
++ if (dp->dl_flags & NFS4_DELAY_CLOSE) {
++ struct file *filp = dp->dl_stp->st_vfs_file;
++
++ dprintk("NFSD: release_delegation CLOSE\n");
++ release_stateid_lockowner(dp->dl_stp);
++ kfree(dp->dl_stp);
++ dp->dl_flags &= ~NFS4_DELAY_CLOSE;
++ dp->dl_stp = NULL;
++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE);
++ nfsd_close(filp);
++ vfsclose++;
++ } else {
++ dprintk("NFSD: release_delegation remove lease dl_flock %p\n",
++ dp->dl_flock);
++ remove_lease(dp->dl_flock);
++ list_del_init(&dp->dl_del_perfile);
++ list_del_init(&dp->dl_del_perclnt);
++ /* dl_count > 0 => outstanding recall rpc */
++ dprintk("NFSD: release_delegation free deleg dl_count %d\n",
++ atomic_read(&dp->dl_count));
++ if (atomic_dec_and_test(&dp->dl_count))
++ nfs4_free_delegation(dp);
++ }
++}
+
+ /*
+ * SETCLIENTID state
+@@ -148,7 +260,7 @@
+ * for last close replay.
+ */
+ static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
+-static int reclaim_str_hashtbl_size;
++static int reclaim_str_hashtbl_size = 0;
+ static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
+@@ -213,12 +325,38 @@
+ kfree(clp);
+ }
+
++void
++put_nfs4_client(struct nfs4_client *clp)
++{
++ if (atomic_dec_and_test(&clp->cl_count))
++ free_client(clp);
++}
++
+ static void
+ expire_client(struct nfs4_client *clp)
+ {
+ struct nfs4_stateowner *sop;
++ struct nfs4_delegation *dp;
++ struct nfs4_callback *cb = &clp->cl_callback;
++ struct rpc_clnt *clnt = clp->cl_callback.cb_client;
++
++ dprintk("NFSD: expire_client cl_count %d\n",
++ atomic_read(&clp->cl_count));
+
+- dprintk("NFSD: expire_client\n");
++ /* shutdown rpc client, ending any outstanding recall rpcs */
++ if (atomic_read(&cb->cb_set) == 1 && clnt) {
++ rpc_shutdown_client(clnt);
++ clnt = clp->cl_callback.cb_client = NULL;
++ }
++ while (!list_empty(&clp->cl_del_perclnt)) {
++ dp = list_entry(clp->cl_del_perclnt.next, struct nfs4_delegation, dl_del_perclnt);
++ dprintk("NFSD: expire client. dp %p, dl_state %d, fp %p\n",
++ dp, atomic_read(&dp->dl_state), dp->dl_flock);
++
++ /* force release of delegation. */
++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE);
++ release_delegation(dp);
++ }
+ list_del(&clp->cl_idhash);
+ list_del(&clp->cl_strhash);
+ list_del(&clp->cl_lru);
+@@ -226,7 +364,7 @@
+ sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient);
+ release_stateowner(sop);
+ }
+- free_client(clp);
++ put_nfs4_client(clp);
+ }
+
+ static struct nfs4_client *
+@@ -235,9 +373,13 @@
+
+ if (!(clp = alloc_client(name)))
+ goto out;
++ atomic_set(&clp->cl_count, 1);
++ atomic_set(&clp->cl_callback.cb_set, 0);
++ clp->cl_callback.cb_parsed = 0;
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_perclient);
++ INIT_LIST_HEAD(&clp->cl_del_perclnt);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ out:
+ return clp;
+@@ -420,17 +562,24 @@
+ {
+ struct nfs4_callback *cb = &clp->cl_callback;
+
++ /* Currently, we only support tcp for the callback channel */
++ if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
++ goto out_err;
++
+ if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
+- &cb->cb_addr, &cb->cb_port))) {
+- printk(KERN_INFO "NFSD: BAD callback address. client will not receive delegations\n");
+- cb->cb_parsed = 0;
+- return;
+- }
+- cb->cb_netid.len = se->se_callback_netid_len;
+- cb->cb_netid.data = se->se_callback_netid_val;
++ &cb->cb_addr, &cb->cb_port)))
++ goto out_err;
+ cb->cb_prog = se->se_callback_prog;
+ cb->cb_ident = se->se_callback_ident;
+ cb->cb_parsed = 1;
++ return;
++out_err:
++ printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
++ "will not receive delegations\n",
++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
++
++ cb->cb_parsed = 0;
++ return;
+ }
+
+ /*
+@@ -707,6 +856,7 @@
+ status = nfserr_clid_inuse;
+ else {
+ expire_client(conf);
++ clp = unconf;
+ move_to_confirmed(unconf, idhashval);
+ status = nfs_ok;
+ }
+@@ -724,6 +874,7 @@
+ if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) {
+ status = nfserr_clid_inuse;
+ } else {
++ clp = conf;
+ status = nfs_ok;
+ }
+ goto out;
+@@ -738,6 +889,7 @@
+ status = nfserr_clid_inuse;
+ } else {
+ status = nfs_ok;
++ clp = unconf;
+ move_to_confirmed(unconf, idhashval);
+ }
+ goto out;
+@@ -757,7 +909,8 @@
+ status = nfserr_inval;
+ goto out;
+ out:
+- /* XXX if status == nfs_ok, probe callback path */
++ if (!status)
++ nfsd4_probe_callback(clp);
+ nfs4_unlock_state();
+ return status;
+ }
+@@ -803,6 +956,7 @@
+ if ((fp = kmalloc(sizeof(struct nfs4_file),GFP_KERNEL))) {
+ INIT_LIST_HEAD(&fp->fi_hash);
+ INIT_LIST_HEAD(&fp->fi_perfile);
++ INIT_LIST_HEAD(&fp->fi_del_perfile);
+ list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ fp->fi_inode = igrab(ino);
+ fp->fi_id = current_fileid++;
+@@ -822,7 +976,7 @@
+ while (!list_empty(&file_hashtbl[i])) {
+ fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash);
+ /* this should never be more than once... */
+- if (!list_empty(&fp->fi_perfile)) {
++ if (!list_empty(&fp->fi_perfile) || !list_empty(&fp->fi_del_perfile)) {
+ printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp);
+ }
+ release_file(fp);
+@@ -830,15 +984,36 @@
+ }
+ }
+
+-/* should use a slab cache */
++kmem_cache_t *stateowner_slab = NULL;
++
++int
++nfsd4_init_slabs(void)
++{
++ stateowner_slab = kmem_cache_create("nfsd4_stateowners",
++ sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL);
++ if (stateowner_slab == NULL)
++ return -ENOMEM;
++ return 0;
++}
++
++int
++nfsd4_free_slabs(void)
++{
++ int status = 0;
++
++ if (stateowner_slab)
++ status = kmem_cache_destroy(stateowner_slab);
++ stateowner_slab = NULL;
++ return status;
++}
++
+ void
+ nfs4_free_stateowner(struct kref *kref)
+ {
+ struct nfs4_stateowner *sop =
+ container_of(kref, struct nfs4_stateowner, so_ref);
+ kfree(sop->so_owner.data);
+- kfree(sop);
+- free_sowner++;
++ kmem_cache_free(stateowner_slab, sop);
+ }
+
+ static inline struct nfs4_stateowner *
+@@ -846,14 +1021,14 @@
+ {
+ struct nfs4_stateowner *sop;
+
+- if ((sop = kmalloc(sizeof(struct nfs4_stateowner),GFP_KERNEL))) {
++ if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) {
+ if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) {
+ memcpy(sop->so_owner.data, owner->data, owner->len);
+ sop->so_owner.len = owner->len;
+ kref_init(&sop->so_ref);
+ return sop;
+ }
+- kfree(sop);
++ kmem_cache_free(stateowner_slab, sop);
+ }
+ return NULL;
+ }
+@@ -887,7 +1062,6 @@
+ rp->rp_status = NFSERR_SERVERFAULT;
+ rp->rp_buflen = 0;
+ rp->rp_buf = rp->rp_ibuf;
+- alloc_sowner++;
+ return sop;
+ }
+
+@@ -957,14 +1131,29 @@
+ __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+ }
+
++/*
++ * Because nfsd_close() can call locks_remove_flock() which removes leases,
++ * delay nfsd_close() for delegations from the nfsd_open() clientid
++ * until the delegation is reaped.
++ */
+ static void
+-release_stateid(struct nfs4_stateid *stp, int flags) {
++release_stateid(struct nfs4_stateid *stp, int flags)
++{
++ struct nfs4_delegation *dp;
++ struct nfs4_file *fp = stp->st_file;
+
+ list_del(&stp->st_hash);
+ list_del_perfile++;
+ list_del(&stp->st_perfile);
+ list_del(&stp->st_perfilestate);
+ if ((stp->st_vfs_set) && (flags & OPEN_STATE)) {
++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) {
++ if (cmp_clid(&dp->dl_client->cl_clientid,
++ &stp->st_stateowner->so_client->cl_clientid)) {
++ dp->dl_flags |= NFS4_DELAY_CLOSE;
++ return;
++ }
++ }
+ release_stateid_lockowner(stp);
+ nfsd_close(stp->st_vfs_file);
+ vfsclose++;
+@@ -1013,7 +1202,7 @@
+ if (sop->so_confirmed && list_empty(&sop->so_perfilestate))
+ move_to_close_lru(sop);
+ /* unused nfs4_file's are releseed. XXX slab cache? */
+- if (list_empty(&fp->fi_perfile)) {
++ if (list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) {
+ release_file(fp);
+ }
+ }
+@@ -1141,6 +1330,100 @@
+ }
+ }
+
++/*
++ * Recall a delegation
++ */
++static int
++do_recall(void *__dp)
++{
++ struct nfs4_delegation *dp = __dp;
++
++ atomic_inc(&dp->dl_count);
++ nfsd4_cb_recall(dp);
++ do_exit(0);
++ return 0;
++}
++
++/*
++ * Spawn a thread to perform a recall on the delegation represented
++ * by the lease (file_lock).
++ *
++ * Called from break_lease() with lock_kernel() held.
++ *
++ */
++static
++void nfsd_break_deleg_cb(struct file_lock *fl)
++{
++ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
++ struct task_struct *t;
++
++ dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
++ if (!dp)
++ return;
++
++ /* schedule delegation for recall */
++ spin_lock(&recall_lock);
++ atomic_set(&dp->dl_state, NFS4_RECALL_IN_PROGRESS);
++ list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
++ spin_unlock(&recall_lock);
++
++ /* only place dl_time is set; protected by lock_kernel() */
++ dp->dl_time = get_seconds();
++
++ /* XXX need to merge NFSD_LEASE_TIME with fs/locks.c:lease_break_time */
++ fl->fl_break_time = jiffies + NFSD_LEASE_TIME * HZ;
++
++ t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall");
++ if (IS_ERR(t)) {
++ struct nfs4_client *clp = dp->dl_client;
++
++ printk(KERN_INFO "NFSD: Callback thread failed for "
++ "for client (clientid %08x/%08x)\n",
++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
++ }
++}
++
++/*
++ * The file_lock is being reaped.
++ *
++ * Called by locks_free_lock() with lock_kernel() held.
++ */
++static
++void nfsd_release_deleg_cb(struct file_lock *fl)
++{
++ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
++
++ dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d, dl_state %d\n", fl,dp, atomic_read(&dp->dl_count), atomic_read(&dp->dl_state));
++
++ if (!(fl->fl_flags & FL_LEASE) || !dp)
++ return;
++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE);
++ dp->dl_flock = NULL;
++}
++
++/*
++ * Set the delegation file_lock back pointer.
++ *
++ * Called from __setlease() with lock_kernel() held.
++ */
++static
++void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
++{
++ struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
++
++ dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
++ if (!dp)
++ return;
++ dp->dl_flock = new;
++}
++
++struct lock_manager_operations nfsd_lease_mng_ops = {
++ .fl_break = nfsd_break_deleg_cb,
++ .fl_release_private = nfsd_release_deleg_cb,
++ .fl_copy_lock = nfsd_copy_lock_deleg_cb,
++};
++
++
+
+ /*
+ * nfsd4_process_open1()
+@@ -1238,6 +1521,43 @@
+ }
+
+ static int
++nfs4_deleg_conflict(u32 share, u32 dtype)
++{
++ return (((share & NFS4_SHARE_ACCESS_WRITE) &&
++ dtype == NFS4_OPEN_DELEGATE_READ) ||
++ ((share & NFS4_SHARE_ACCESS_READ) &&
++ dtype == NFS4_OPEN_DELEGATE_WRITE));
++}
++
++#define DONT_DELEGATE 8
++
++/*
++ * nfs4_check_deleg_recall()
++ *
++ * Test any delegation that is currently in an incomplete recall
++ * state, and return NFSERR_DELAY for a conflicting open share.
++ * flag is set to DONT_DELEGATE for shares that match the deleg type.
++ */
++static int
++nfs4_check_deleg_recall(struct nfs4_file *fp, struct nfsd4_open *op, int *flag)
++{
++ struct nfs4_delegation *dp;
++ int status = 0;
++
++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) {
++ dprintk("NFSD: found delegation %p with dl_state %d\n",
++ dp, atomic_read(&dp->dl_state));
++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_IN_PROGRESS) {
++ if (nfs4_deleg_conflict(op->op_share_access, dp->dl_type))
++ status = nfserr_jukebox;
++ else
++ *flag = DONT_DELEGATE;
++ }
++ }
++ return status;
++}
++
++static int
+ nfs4_check_open(struct nfs4_file *fp, struct nfs4_stateowner *sop, struct nfsd4_open *open, struct nfs4_stateid **stpp)
+ {
+ struct nfs4_stateid *local;
+@@ -1339,6 +1659,65 @@
+ }
+
+ /*
++ * Attempt to hand out a delegation.
++ */
++static void
++nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp, int *flag)
++{
++ struct nfs4_delegation *dp;
++ struct nfs4_stateowner *sop = stp->st_stateowner;
++ struct nfs4_callback *cb = &sop->so_client->cl_callback;
++ struct file_lock fl, *flp = &fl;
++ int status;
++
++ if (*flag == DONT_DELEGATE) {
++ *flag = NFS4_OPEN_DELEGATE_NONE;
++ return;
++ }
++
++ /* default: no delegation */
++ *flag = NFS4_OPEN_DELEGATE_NONE;
++ if (open->op_claim_type != NFS4_OPEN_CLAIM_NULL
++ || !atomic_read(&cb->cb_set) || !sop->so_confirmed)
++ return;
++
++ if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
++ *flag = NFS4_OPEN_DELEGATE_READ;
++
++ else if (!(open->op_share_access & NFS4_SHARE_ACCESS_READ))
++ *flag = NFS4_OPEN_DELEGATE_WRITE;
++
++ if (!(dp = alloc_init_deleg(sop->so_client, stp, fh, *flag)))
++ return;
++ locks_init_lock(&fl);
++ fl.fl_lmops = &nfsd_lease_mng_ops;
++ fl.fl_flags = FL_LEASE;
++ fl.fl_end = OFFSET_MAX;
++ fl.fl_owner = (fl_owner_t)dp;
++ fl.fl_file = stp->st_vfs_file;
++ fl.fl_pid = current->tgid;
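++
++ /*
++ * Note: setlease() copies this on-stack file_lock; the copy, not fl,
++ * is what nfsd_copy_lock_deleg_cb() records in dp->dl_flock, so fl
++ * is not referenced again once setlease() returns.
++ */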
++
++ if ((status = setlease(stp->st_vfs_file,
++ *flag == NFS4_OPEN_DELEGATE_READ ? F_RDLCK : F_WRLCK, &flp))) {
++ dprintk("NFSD: setlease failed [%d], no delegation\n", status);
++ list_del(&dp->dl_del_perfile);
++ list_del(&dp->dl_del_perclnt);
++ kfree(dp);
++ free_delegation++;
++ *flag = NFS4_OPEN_DELEGATE_NONE;
++ return;
++ }
++
++ memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
++
++ dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n",
++ dp->dl_stateid.si_boot,
++ dp->dl_stateid.si_stateownerid,
++ dp->dl_stateid.si_fileid,
++ dp->dl_stateid.si_generation);
++}
++
++/*
+ * called with nfs4_lock_state() held.
+ */
+ int
+@@ -1346,28 +1725,24 @@
+ {
+ struct nfs4_stateowner *sop = open->op_stateowner;
+ struct nfs4_file *fp = NULL;
+- struct inode *ino;
++ struct inode *ino = current_fh->fh_dentry->d_inode;
+ unsigned int fi_hashval;
+ struct nfs4_stateid *stp = NULL;
+- int status;
+-
+- status = nfserr_resource;
+- if (!sop)
+- return status;
+-
+- ino = current_fh->fh_dentry->d_inode;
++ int status, delegflag = 0;
+
+ status = nfserr_inval;
+ if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny))
+ goto out;
+ /*
+- * Lookup file; if found, lookup stateid and check open request;
+- * not found, create
++ * Lookup file; if found, lookup stateid and check open request,
++ * and check for delegations in the process of being recalled.
++ * If not found, create the nfs4_file struct
+ */
+ fi_hashval = file_hashval(ino);
+ if (find_file(fi_hashval, ino, &fp)) {
+- status = nfs4_check_open(fp, sop, open, &stp);
+- if (status)
++ if ((status = nfs4_check_open(fp, sop, open, &stp)))
++ goto out;
++ if ((status = nfs4_check_deleg_recall(fp, open, &delegflag)))
+ goto out;
+ } else {
+ status = nfserr_resource;
+@@ -1407,14 +1782,20 @@
+ }
+ }
+ }
+- dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n",
+- stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid,
+- stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
+-
+ memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
+
+- open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
++ /*
++ * Attempt to hand out a delegation. No error return, because the
++ * OPEN succeeds even if we fail.
++ */
++ nfs4_open_delegation(current_fh, open, stp, &delegflag);
++ open->op_delegate_type = delegflag;
++
+ status = nfs_ok;
++
++ dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n",
++ stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid,
++ stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
+ out:
+ /* take the opportunity to clean up unused state */
+ if (fp && list_empty(&fp->fi_perfile))
+@@ -1480,14 +1861,26 @@
+ {
+ struct nfs4_client *clp;
+ struct nfs4_stateowner *sop;
++ struct nfs4_delegation *dp;
+ struct list_head *pos, *next;
+ time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
+ time_t t, clientid_val = NFSD_LEASE_TIME;
+- time_t u, close_val = NFSD_LEASE_TIME;
++ time_t u, test_val = NFSD_LEASE_TIME;
+
+ nfs4_lock_state();
+
+- dprintk("NFSD: laundromat service - starting, examining clients\n");
++ dprintk("NFSD: laundromat service - starting\n");
++ /* Remove clientid's from recovery directory */
++ if (first_run) {
++ int status;
++
++ dprintk("NFSD: laundromat service - FIRST_RUN\n");
++ status = nfsd4_list_rec_dir(1);
++ if (status < 0)
++ printk("NFSD: error clearing recovery directory %s\n",
++ recovery_dirname);
++ first_run = 0;
++ }
+ list_for_each_safe(pos, next, &client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
+@@ -1498,14 +1891,34 @@
+ }
+ dprintk("NFSD: purging unused client (clientid %08x)\n",
+ clp->cl_clientid.cl_id);
++ if (clp->cl_firststate)
++ nfsd4_remove_clid_file(clp);
+ expire_client(clp);
+ }
++ spin_lock(&recall_lock);
++ list_for_each_safe(pos, next, &del_recall_lru) {
++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_COMPLETE)
++ goto reap;
++ if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
++ u = dp->dl_time - cutoff;
++ if (test_val > u)
++ test_val = u;
++ break;
++ }
++reap:
++ dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
++ dp, dp->dl_flock);
++ release_delegation(dp);
++ }
++ spin_unlock(&recall_lock);
++ test_val = NFSD_LEASE_TIME;
+ list_for_each_safe(pos, next, &close_lru) {
+ sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
+ if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
+ u = sop->so_time - cutoff;
+- if (close_val > u)
+- close_val = u;
++ if (test_val > u)
++ test_val = u;
+ break;
+ }
+ dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
+@@ -1564,21 +1977,81 @@
+ return 1;
+ }
+
++static inline int
++access_permit_read(unsigned long access_bmap)
++{
++ return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) ||
++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
++}
++
++static inline int
++access_permit_write(unsigned long access_bmap)
++{
++ return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) ||
++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
++}
++
++static
++int nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
++{
++ int status = nfserr_openmode;
++
++ if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
++ goto out;
++ if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
++ goto out;
++ status = nfs_ok;
++out:
++ return status;
++}
++
++static int
++nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
++{
++ int status = nfserr_openmode;
++
++ if ((flags & WR_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
++ goto out;
++ if ((flags & RD_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_WRITE))
++ goto out;
++ status = nfs_ok;
++out:
++ return status;
++}
++
++static int
++nfs4_rw_grace(int flags)
++{
++ return (nfs4_in_grace() && ((flags & RD_STATE) || (flags & WR_STATE)));
++}
++
++/*
++ * Allow READ/WRITE during grace period on recovered state only for files
++ * that are not able to provide mandatory locking.
++ */
++static int
++nfs4_check_rw_grace(umode_t mode, int flags)
++{
++ return (nfs4_rw_grace(flags) && ((mode & S_IXGRP) && (mode & S_ISGID)));
++}
+
+ /*
+ * Checks for stateid operations
+ */
+ int
+-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct nfs4_stateid **stpp)
++nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+ {
+- struct nfs4_stateid *stp;
++ struct nfs4_stateid *stp = NULL;
++ struct nfs4_delegation *dp = NULL;
++ stateid_t *stidp;
++ struct inode *ino = current_fh->fh_dentry->d_inode;
+ int status;
+
+ dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
+ stateid->si_boot, stateid->si_stateownerid,
+ stateid->si_fileid, stateid->si_generation);
+-
+- *stpp = NULL;
++ if (filpp)
++ *filpp = NULL;
+
+ /* STALE STATEID */
+ status = nfserr_stale_stateid;
+@@ -1587,33 +2060,58 @@
+
+ /* BAD STATEID */
+ status = nfserr_bad_stateid;
+- if (!(stp = find_stateid(stateid, flags))) {
+- dprintk("NFSD: preprocess_stateid_op: no open stateid!\n");
+- goto out;
+- }
+- if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) {
+- dprintk("NFSD: preprocess_stateid_op: fh-stateid mismatch!\n");
+- stp->st_vfs_set = 0;
+- goto out;
+- }
+- if (!stp->st_stateowner->so_confirmed) {
+- dprintk("preprocess_stateid_op: lockowner not confirmed yet!\n");
+- goto out;
++ if (!stateid->si_fileid) { /* delegation stateid */
++
++ if(!(dp = find_delegation_stateid(ino, stateid))) {
++ dprintk("NFSD: delegation stateid not found\n");
++ if (nfs4_rw_grace(flags))
++ status = nfserr_grace;
++ goto out;
++ }
++ stidp = &dp->dl_stateid;
++ } else { /* open or lock stateid */
++ if (!(stp = find_stateid(stateid, flags))) {
++ dprintk("NFSD: open or lock stateid not found\n");
++ if (nfs4_rw_grace(flags))
++ status = nfserr_grace;
++ goto out;
++ }
++ if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
++ goto out;
++ if (!stp->st_stateowner->so_confirmed)
++ goto out;
++ stidp = &stp->st_stateid;
+ }
+- if (stateid->si_generation > stp->st_stateid.si_generation) {
+- dprintk("preprocess_stateid_op: future stateid?!\n");
++ if (stateid->si_generation > stidp->si_generation)
+ goto out;
+- }
+
+ /* OLD STATEID */
+ status = nfserr_old_stateid;
+- if (stateid->si_generation < stp->st_stateid.si_generation) {
+- dprintk("preprocess_stateid_op: old stateid!\n");
++ if (stateid->si_generation < stidp->si_generation)
+ goto out;
++
++ status = nfserr_grace;
++ if (nfs4_check_rw_grace(ino->i_mode, flags))
++ goto out;
++
++ if (stp) {
++ renew_client(stp->st_stateowner->so_client);
++ if ((status = nfs4_check_openmode(stp,flags)))
++ goto out;
++ if (filpp)
++ *filpp = stp->st_vfs_file;
++ } else if (dp) {
++ renew_client(dp->dl_client);
++ if ((status = nfs4_check_delegmode(dp, flags)))
++ goto out;
++ if (flags & DELEG_RET) {
++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE);
++ release_delegation(dp);
++ }
++ if (filpp && dp && dp->dl_stp)
++ *filpp = dp->dl_stp->st_vfs_file;
+ }
+- *stpp = stp;
+ status = nfs_ok;
+- renew_client(stp->st_stateowner->so_client);
+ out:
+ return status;
+ }
+@@ -1750,17 +2248,6 @@
+ goto out;
+ }
+
+-/*
+- * eventually, this will perform an upcall to the 'state daemon' as well as
+- * set the cl_first_state field.
+- */
+-void
+-first_state(struct nfs4_client *clp)
+-{
+- if (!clp->cl_first_state)
+- clp->cl_first_state = get_seconds();
+-}
+-
+ int
+ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc)
+ {
+@@ -1793,8 +2280,16 @@
+ stp->st_stateid.si_stateownerid,
+ stp->st_stateid.si_fileid,
+ stp->st_stateid.si_generation);
+- status = nfs_ok;
+- first_state(sop->so_client);
++
++ if (!sop->so_client->cl_firststate) {
++ int err = nfsd4_create_clid_file(sop->so_client);
++ if (!err) {
++ sop->so_client->cl_firststate = 1;
++ dprintk("NFSD: OPEN_CONFIRM firststate set [%.*s]\n",
++ sop->so_client->cl_name.len,
++ sop->so_client->cl_name.data);
++ }
++ }
+ out:
+ if (oc->oc_stateowner)
+ nfs4_get_stateowner(oc->oc_stateowner);
+@@ -1912,6 +2407,22 @@
+ return status;
+ }
+
++int
++nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr)
++{
++ int status;
++
++ if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
++ goto out;
++
++ nfs4_lock_state();
++ status = nfs4_preprocess_stateid_op(current_fh, &dr->dr_stateid, DELEG_RET, NULL);
++ nfs4_unlock_state();
++out:
++ return status;
++}
++
+ /*
+ * Lock owner state (byte-range locks)
+ */
+@@ -1938,7 +2449,7 @@
+ unsigned int hashval;
+
+ dprintk("NFSD: find_stateid flags 0x%x\n",flags);
+- if ((flags & LOCK_STATE) || (flags & RDWR_STATE)) {
++ if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+ hashval = stateid_hashval(st_id, f_id);
+ list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
+ if ((local->st_stateid.si_stateownerid == st_id) &&
+@@ -1946,7 +2457,7 @@
+ return local;
+ }
+ }
+- if ((flags & OPEN_STATE) || (flags & RDWR_STATE)) {
++ if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+ hashval = stateid_hashval(st_id, f_id);
+ list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
+ if ((local->st_stateid.si_stateownerid == st_id) &&
+@@ -1958,6 +2469,30 @@
+ return NULL;
+ }
+
++static struct nfs4_delegation *
++find_delegation_stateid(struct inode *ino, stateid_t *stid)
++{
++ struct nfs4_delegation *dp = NULL;
++ struct nfs4_file *fp = NULL;
++ u32 st_id;
++ unsigned int fi_hashval;
++
++ dprintk("NFSD:find_delegation_stateid ino %p, stid %p\n",ino,stid);
++
++ if(!ino || !stid)
++ return NULL;
++ st_id = stid->si_stateownerid;
++ fi_hashval = file_hashval(ino);
++ if (find_file(fi_hashval, ino, &fp)) {
++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) {
++ if (dp->dl_stateid.si_stateownerid == st_id) {
++ dprintk("NFSD: find_delegation dp %p\n", dp);
++ return dp;
++ }
++ }
++ }
++ return NULL;
++}
+
+ /*
+ * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
+@@ -2085,7 +2620,6 @@
+ rp->rp_status = NFSERR_SERVERFAULT;
+ rp->rp_buflen = 0;
+ rp->rp_buf = rp->rp_ibuf;
+- alloc_lsowner++;
+ return sop;
+ }
+
+@@ -2558,22 +3092,22 @@
+ /*
+ * failure => all reset bets are off, nfserr_no_grace...
+ */
+-static int
+-nfs4_client_to_reclaim(struct nfs4_client *clp)
++int
++nfs4_client_to_reclaim(char *name, int namlen)
+ {
+ unsigned int strhashval;
+ struct nfs4_client_reclaim *crp = NULL;
+
+- crp = alloc_reclaim(clp->cl_name.len);
++ dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", namlen, name);
++ crp = alloc_reclaim(namlen);
+ if (!crp)
+ return 0;
+- strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len);
++ strhashval = clientstr_hashval(name, namlen);
+ INIT_LIST_HEAD(&crp->cr_strhash);
+ list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
+- memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len);
+- crp->cr_name.len = clp->cl_name.len;
+- crp->cr_first_state = clp->cl_first_state;
+- crp->cr_expired = 0;
++ memcpy(crp->cr_name.data, name, namlen);
++ crp->cr_name.len = namlen;
++ reclaim_str_hashtbl_size++;
+ return 1;
+ }
+
+@@ -2618,6 +3152,9 @@
+ if (!client)
+ return NULL;
+
++ dprintk("NFSD: nfs4_find_reclaim_client for %.*s\n",
++ clp->cl_name.len, clp->cl_name.data);
++
+ /* find clp->cl_name in reclaim_str_hashtbl */
+ strhashval = clientstr_hashval(client->cl_name.data,
+ client->cl_name.len);
+@@ -2639,8 +3176,6 @@
+
+ if ((crp = nfs4_find_reclaim_client(clid)) == NULL)
+ return nfserr_reclaim_bad;
+- if (crp->cr_expired)
+- return nfserr_no_grace;
+ return nfs_ok;
+ }
+
+@@ -2657,10 +3192,18 @@
+
+ if (nfs4_init)
+ return;
++ if (nfsd4_init_slabs())
++ BUG(); /* XXXXXX!!! */
+ if (!nfs4_reclaim_init) {
++ int status;
++
+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
+ reclaim_str_hashtbl_size = 0;
++ nfsd4_init_rec_dir(recovery_dirname);
++ status = nfsd4_list_rec_dir(0);
++ if (status)
++ printk("NFSD: Failure in reading recovery data\n");
+ nfs4_reclaim_init = 1;
+ }
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+@@ -2689,6 +3232,8 @@
+
+ INIT_LIST_HEAD(&close_lru);
+ INIT_LIST_HEAD(&client_lru);
++ INIT_LIST_HEAD(&del_recall_lru);
++ spin_lock_init(&recall_lock);
+ boot_time = get_seconds();
+ grace_time = max(old_lease_time, lease_time);
+ if (reclaim_str_hashtbl_size == 0)
+@@ -2725,6 +3270,15 @@
+ {
+ int i;
+ struct nfs4_client *clp = NULL;
++ struct nfs4_delegation *dp = NULL;
++ struct nfs4_stateowner *sop = NULL;
++ struct list_head *pos, *next;
++
++ list_for_each_safe(pos, next, &close_lru) {
++ sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
++ list_del(&sop->so_close_lru);
++ nfs4_put_stateowner(sop);
++ }
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ while (!list_empty(&conf_id_hashtbl[i])) {
+@@ -2736,20 +3290,31 @@
+ expire_client(clp);
+ }
+ }
++ spin_lock(&recall_lock);
++ list_for_each_safe(pos, next, &del_recall_lru) {
++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE);
++ release_delegation(dp);
++ }
++ spin_unlock(&recall_lock);
++
+ release_all_files();
+ cancel_delayed_work(&laundromat_work);
+ flush_scheduled_work();
+ nfs4_init = 0;
++ nfs4_reclaim_init = 0;
+ dprintk("NFSD: list_add_perfile %d list_del_perfile %d\n",
+ list_add_perfile, list_del_perfile);
+ dprintk("NFSD: add_perclient %d del_perclient %d\n",
+ add_perclient, del_perclient);
+ dprintk("NFSD: alloc_file %d free_file %d\n",
+ alloc_file, free_file);
+- dprintk("NFSD: alloc_sowner %d alloc_lsowner %d free_sowner %d\n",
+- alloc_sowner, alloc_lsowner, free_sowner);
+ dprintk("NFSD: vfsopen %d vfsclose %d\n",
+ vfsopen, vfsclose);
++ dprintk("NFSD: alloc_delegation %d free_delegation %d\n",
++ alloc_delegation, free_delegation);
++ if (nfsd4_free_slabs())
++ BUG(); /* XXX? */
+ }
+
+ void
+@@ -2801,11 +3366,10 @@
+ /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) {
+- if (!nfs4_client_to_reclaim(clp)) {
++ if (!nfs4_client_to_reclaim(clp->cl_name.data, clp->cl_name.len)) {
+ nfs4_release_reclaim();
+ goto init_state;
+ }
+- reclaim_str_hashtbl_size++;
+ }
+ }
+ init_state:
+Index: linux-2.6.10/fs/nfsd/nfsproc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfsproc.c 2004-12-25 05:34:30.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfsproc.c 2005-04-05 14:49:13.426688152 +0800
+@@ -586,7 +586,6 @@
+ { nfserr_dquot, -EDQUOT },
+ #endif
+ { nfserr_stale, -ESTALE },
+- { nfserr_jukebox, -EWOULDBLOCK },
+ { nfserr_jukebox, -ETIMEDOUT },
+ { nfserr_dropit, -EAGAIN },
+ { nfserr_dropit, -ENOMEM },
+Index: linux-2.6.10/fs/nfsd/nfs4acl.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4acl.c 2004-12-25 05:34:29.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfs4acl.c 2005-04-05 14:49:13.429687696 +0800
+@@ -89,6 +89,8 @@
+ return ret;
+ }
+
++/* modify functions to take NFS errors */
++
+ static int
+ mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+ {
+Index: linux-2.6.10/fs/nfsd/nfs4idmap.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4idmap.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfs4idmap.c 2005-04-05 14:49:13.414689976 +0800
+@@ -78,9 +78,9 @@
+
+ #define DefineSimpleCacheLookupMap(STRUCT, FUNC) \
+ DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \
+- (struct STRUCT *item, int set), /*no setup */, \
++ (struct STRUCT *item, int set), \
+ & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \
+- STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0)
++ STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+
+ /* Common entry handling */
+
+Index: linux-2.6.10/fs/nfsd/vfs.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/vfs.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/nfsd/vfs.c 2005-04-05 14:49:13.417689520 +0800
+@@ -304,6 +304,8 @@
+ * we need to break all leases.
+ */
+ err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
++ if (err == -EWOULDBLOCK)
++ err = -ETIMEDOUT;
+ if (err) /* ENOMEM or EWOULDBLOCK */
+ goto out_nfserr;
+
+@@ -678,6 +680,8 @@
+ * This may block while leases are broken.
+ */
+ err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
++ if (err == -EWOULDBLOCK)
++ err = -ETIMEDOUT;
+ if (err) /* NOMEM or WOULDBLOCK */
+ goto out_nfserr;
+
+@@ -822,21 +826,34 @@
+ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *count)
+ {
+- struct raparms *ra;
+- mm_segment_t oldfs;
+ int err;
+ struct file *file;
+- struct inode *inode;
+
+ err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
+ if (err)
+ goto out;
++ err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
++
++ nfsd_close(file);
++out:
++ return err;
++}
++
++int
++nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
++ loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
++{
++ struct inode *inode;
++ struct raparms *ra;
++ mm_segment_t oldfs;
++ int err;
++
+ err = nfserr_perm;
+ inode = file->f_dentry->d_inode;
+ #ifdef MSNFS
+ if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ (!lock_may_read(inode, offset, *count)))
+- goto out_close;
++ goto out;
+ #endif
+
+ /* Get readahead parameters */
+@@ -872,8 +889,6 @@
+ dnotify_parent(file->f_dentry, DN_ACCESS);
+ } else
+ err = nfserrno(err);
+-out_close:
+- nfsd_close(file);
+ out:
+ return err;
+ }
+@@ -888,25 +903,40 @@
+ struct kvec *vec, int vlen,
+ unsigned long cnt, int *stablep)
+ {
+- struct svc_export *exp;
+ struct file *file;
+- struct dentry *dentry;
+- struct inode *inode;
+- mm_segment_t oldfs;
+ int err = 0;
+- int stable = *stablep;
+
+ err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
+ if (err)
+ goto out;
+ if (!cnt)
+ goto out_close;
++
++ err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep);
++out_close:
++ nfsd_close(file);
++out:
++ return err;
++}
++
++int
++nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
++ loff_t offset, struct kvec *vec, int vlen,
++ unsigned long cnt, int *stablep)
++{
++ struct svc_export *exp;
++ struct dentry *dentry;
++ struct inode *inode;
++ mm_segment_t oldfs;
++ int err = 0;
++ int stable = *stablep;
++
+ err = nfserr_perm;
+
+ #ifdef MSNFS
+ if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ (!lock_may_write(file->f_dentry->d_inode, offset, cnt)))
+- goto out_close;
++ goto out;
+ #endif
+
+ dentry = file->f_dentry;
+@@ -993,13 +1023,10 @@
+ err = 0;
+ else
+ err = nfserrno(err);
+-out_close:
+- nfsd_close(file);
+ out:
+ return err;
+ }
+
+-
+ #ifdef CONFIG_NFSD_V3
+ /*
+ * Commit all pending writes to stable storage.
+Index: linux-2.6.10/fs/nfsd/nfs4callback.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4callback.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/nfsd/nfs4callback.c 2005-04-05 14:49:13.428687848 +0800
+@@ -0,0 +1,589 @@
++/*
++ * linux/fs/nfsd/nfs4callback.c
++ *
++ * Copyright (c) 2001 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Kendrick Smith <kmsmith@umich.edu>
++ * Andy Adamson <andros@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inet.h>
++#include <linux/errno.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfsd/state.h>
++#include <linux/sunrpc/sched.h>
++#include <linux/nfs4.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++#define NFSPROC4_CB_NULL 0
++#define NFSPROC4_CB_COMPOUND 1
++
++/* declarations */
++static void nfs4_cb_null(struct rpc_task *task);
++extern spinlock_t recall_lock;
++
++/* Index of predefined Linux callback client operations */
++
++enum {
++ NFSPROC4_CLNT_CB_NULL = 0,
++ NFSPROC4_CLNT_CB_RECALL,
++};
++
++enum nfs_cb_opnum4 {
++ OP_CB_RECALL = 4,
++};
++
++#define NFS4_MAXTAGLEN 20
++
++#define NFS4_enc_cb_null_sz 0
++#define NFS4_dec_cb_null_sz 0
++#define cb_compound_enc_hdr_sz 4
++#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
++#define op_enc_sz 1
++#define op_dec_sz 2
++#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
++#define enc_stateid_sz 16
++#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
++ 1 + enc_stateid_sz + \
++ enc_nfs4_fh_sz)
++
++#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
++ op_dec_sz)
++
++/*
++ * Generic encode routines from fs/nfs/nfs4xdr.c
++ */
++static inline u32 *
++xdr_writemem(u32 *p, const void *ptr, int nbytes)
++{
++ int tmp = XDR_QUADLEN(nbytes);
++ if (!tmp)
++ return p;
++ p[tmp-1] = 0;
++ memcpy(p, ptr, nbytes);
++ return p + tmp;
++}
++
++#define WRITE32(n) *p++ = htonl(n)
++#define WRITEMEM(ptr,nbytes) do { \
++ p = xdr_writemem(p, ptr, nbytes); \
++} while (0)
++#define RESERVE_SPACE(nbytes) do { \
++ p = xdr_reserve_space(xdr, nbytes); \
++ if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
++ BUG_ON(!p); \
++} while (0)
++
++/*
++ * Generic decode routines from fs/nfs/nfs4xdr.c
++ */
++#define DECODE_TAIL \
++ status = 0; \
++out: \
++ return status; \
++xdr_error: \
++ dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
++ status = -EIO; \
++ goto out
++
++#define READ32(x) (x) = ntohl(*p++)
++#define READ64(x) do { \
++ (x) = (u64)ntohl(*p++) << 32; \
++ (x) |= ntohl(*p++); \
++} while (0)
++#define READTIME(x) do { \
++ p++; \
++ (x.tv_sec) = ntohl(*p++); \
++ (x.tv_nsec) = ntohl(*p++); \
++} while (0)
++#define READ_BUF(nbytes) do { \
++ p = xdr_inline_decode(xdr, nbytes); \
++ if (!p) { \
++ dprintk("NFSD: %s: reply buffer overflowed in line %d.", \
++ __FUNCTION__, __LINE__); \
++ return -EIO; \
++ } \
++} while (0)
++
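++/*
++ * These macros mirror the client-side XDR helpers: RESERVE_SPACE() and
++ * WRITE32() build the request stream in 4-byte XDR words, READ_BUF() and
++ * READ32() consume the reply. Encoding the compound header below, for
++ * instance, reserves 16 bytes for four words: tag length (always 0),
++ * minorversion, callback ident, and op count.
++ */
++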
++struct nfs4_cb_compound_hdr {
++ int status;
++ u32 ident;
++ u32 nops;
++ u32 taglen;
++ char * tag;
++};
++
++static struct {
++ int stat;
++ int errno;
++} nfs_cb_errtbl[] = {
++ { NFS4_OK, 0 },
++ { NFS4ERR_PERM, EPERM },
++ { NFS4ERR_NOENT, ENOENT },
++ { NFS4ERR_IO, EIO },
++ { NFS4ERR_NXIO, ENXIO },
++ { NFS4ERR_ACCESS, EACCES },
++ { NFS4ERR_EXIST, EEXIST },
++ { NFS4ERR_XDEV, EXDEV },
++ { NFS4ERR_NOTDIR, ENOTDIR },
++ { NFS4ERR_ISDIR, EISDIR },
++ { NFS4ERR_INVAL, EINVAL },
++ { NFS4ERR_FBIG, EFBIG },
++ { NFS4ERR_NOSPC, ENOSPC },
++ { NFS4ERR_ROFS, EROFS },
++ { NFS4ERR_MLINK, EMLINK },
++ { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
++ { NFS4ERR_NOTEMPTY, ENOTEMPTY },
++ { NFS4ERR_DQUOT, EDQUOT },
++ { NFS4ERR_STALE, ESTALE },
++ { NFS4ERR_BADHANDLE, EBADHANDLE },
++ { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
++ { NFS4ERR_NOTSUPP, ENOTSUPP },
++ { NFS4ERR_TOOSMALL, ETOOSMALL },
++ { NFS4ERR_SERVERFAULT, ESERVERFAULT },
++ { NFS4ERR_BADTYPE, EBADTYPE },
++ { NFS4ERR_LOCKED, EAGAIN },
++ { NFS4ERR_RESOURCE, EREMOTEIO },
++ { NFS4ERR_SYMLINK, ELOOP },
++ { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
++ { NFS4ERR_DEADLOCK, EDEADLK },
++ { -1, EIO }
++};
++
++static int
++nfs_cb_stat_to_errno(int stat)
++{
++ int i;
++ for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
++ if (nfs_cb_errtbl[i].stat == stat)
++ return nfs_cb_errtbl[i].errno;
++ }
++ /* If we cannot translate the error, the recovery routines should
++ * handle it.
++ * Note: remaining NFSv4 error codes have values > 10000, so should
++ * not conflict with native Linux error codes.
++ */
++ return stat;
++}
++
++/*
++ * XDR encode
++ */
++
++static int
++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
++{
++ u32 * p;
++
++ RESERVE_SPACE(16);
++ WRITE32(0); /* tag length is always 0 */
++ WRITE32(NFS4_MINOR_VERSION);
++ WRITE32(hdr->ident);
++ WRITE32(hdr->nops);
++ return 0;
++}
++
++static int
++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
++{
++ u32 *p;
++ int len = cb_rec->cbr_fhlen;
++
++ RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
++ WRITE32(OP_CB_RECALL);
++ WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
++ WRITE32(cb_rec->cbr_trunc);
++ WRITE32(len);
++ WRITEMEM(cb_rec->cbr_fhval, len);
++ return 0;
++}
++
++static int
++nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
++{
++ struct xdr_stream xdrs, *xdr = &xdrs;
++
++ xdr_init_encode(&xdrs, &req->rq_snd_buf, p);
++ RESERVE_SPACE(0);
++ return 0;
++}
++
++static int
++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr = {
++ .nops = 1,
++ };
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_cb_compound_hdr(&xdr, &hdr);
++ return (encode_cb_recall(&xdr, args));
++}
++
++
++static int
++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
++{
++ u32 *p;
++
++ READ_BUF(8);
++ READ32(hdr->status);
++ READ32(hdr->taglen);
++ READ_BUF(hdr->taglen + 4);
++ hdr->tag = (char *)p;
++ p += XDR_QUADLEN(hdr->taglen);
++ READ32(hdr->nops);
++ return 0;
++}
++
++static int
++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
++{
++ u32 *p;
++ u32 op;
++ int32_t nfserr;
++
++ READ_BUF(8);
++ READ32(op);
++ if (op != expected) {
++ dprintk("NFSD: decode_cb_op_hdr: Callback server returned "
++ " operation %d but we issued a request for %d\n",
++ op, expected);
++ return -EIO;
++ }
++ READ32(nfserr);
++ if (nfserr != NFS_OK)
++ return -nfs_cb_stat_to_errno(nfserr);
++ return 0;
++}
++
++static int
++nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p)
++{
++ return 0;
++}
++
++static int
++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_cb_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
++out:
++ return status;
++}
++
++/*
++ * RPC procedure tables
++ */
++#ifndef MAX
++# define MAX(a, b) (((a) > (b))? (a) : (b))
++#endif
++
++#define PROC(proc, call, argtype, restype) \
++[NFSPROC4_CLNT_##proc] = { \
++ .p_proc = NFSPROC4_CB_##call, \
++ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
++ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
++ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \
++}
++
++struct rpc_procinfo nfs4_cb_procedures[] = {
++ PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null),
++ PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall),
++};
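++
++/*
++ * PROC() expands each entry to an rpc_procinfo slot: CB_NULL maps to RPC
++ * procedure 0 with the empty null encoder/decoder, while CB_RECALL maps
++ * to the COMPOUND procedure (1) carrying a single OP_CB_RECALL op.
++ */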
++
++struct rpc_version nfs_cb_version4 = {
++ .number = 1,
++ .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]),
++ .procs = nfs4_cb_procedures
++};
++
++static struct rpc_version * nfs_cb_version[] = {
++ NULL,
++ &nfs_cb_version4,
++};
++
++/*
++ * Use the SETCLIENTID credential
++ */
++struct rpc_cred *
++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags)
++{
++ struct auth_cred acred;
++ struct rpc_clnt *clnt = clp->cl_callback.cb_client;
++ struct rpc_cred *ret = NULL;
++
++ if (!clnt)
++ goto out;
++ get_group_info(clp->cl_cred.cr_group_info);
++ acred.uid = clp->cl_cred.cr_uid;
++ acred.gid = clp->cl_cred.cr_gid;
++ acred.group_info = clp->cl_cred.cr_group_info;
++
++ dprintk("NFSD: looking up %s cred\n",
++ clnt->cl_auth->au_ops->au_name);
++ ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags);
++ put_group_info(clp->cl_cred.cr_group_info);
++out:
++ return ret;
++}
++
++/*
++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
++ */
++void
++nfsd4_probe_callback(struct nfs4_client *clp)
++{
++ struct sockaddr_in addr;
++ struct nfs4_callback *cb = &clp->cl_callback;
++ struct rpc_timeout timeparms;
++ struct rpc_xprt * xprt;
++ struct rpc_program * program = &cb->cb_program;
++ struct rpc_stat * stat = &cb->cb_stat;
++ struct rpc_clnt * clnt;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
++ .rpc_argp = clp,
++ };
++ char hostname[32];
++ int status;
++
++ dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n",
++ cb->cb_parsed, atomic_read(&cb->cb_set));
++ if (!cb->cb_parsed || atomic_read(&cb->cb_set))
++ return;
++
++ /* Initialize address */
++ memset(&addr, 0, sizeof(addr));
++ addr.sin_family = AF_INET;
++ addr.sin_port = htons(cb->cb_port);
++ addr.sin_addr.s_addr = htonl(cb->cb_addr);
++
++ /* Initialize timeout */
++ timeparms.to_initval = (NFSD_LEASE_TIME/4) * HZ;
++ timeparms.to_retries = 5;
++ timeparms.to_maxval = (NFSD_LEASE_TIME/2) * HZ;
++ timeparms.to_exponential = 1;
++
++ /* Create RPC transport */
++ if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) {
++ dprintk("NFSD: couldn't create callback transport!\n");
++ goto out_err;
++ }
++
++ /* Initialize rpc_program */
++ program->name = "nfs4_cb";
++ program->number = cb->cb_prog;
++ program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]);
++ program->version = nfs_cb_version;
++ program->stats = stat;
++
++ /* Initialize rpc_stat */
++ memset(stat, 0, sizeof(struct rpc_stat));
++ stat->program = program;
++
++ /* Create RPC client
++ *
++ * XXX AUTH_UNIX only - need AUTH_GSS....
++ */
++ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr));
++ if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) {
++ dprintk("NFSD: couldn't create callback client\n");
++ goto out_xprt;
++ }
++ clnt->cl_intr = 1;
++ clnt->cl_softrtry = 1;
++ clnt->cl_chatty = 1;
++
++ /* Kick rpciod, put the call on the wire. */
++
++ if (rpciod_up() != 0) {
++ dprintk("nfsd: couldn't start rpciod for callbacks!\n");
++ goto out_clnt;
++ }
++
++ /* the task holds a reference to the nfs4_client struct */
++ cb->cb_client = clnt;
++ atomic_inc(&clp->cl_count);
++
++ msg.rpc_cred = nfsd4_lookupcred(clp,0);
++ status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL);
++
++ if (status != 0) {
++ dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n");
++ goto out_rpciod;
++ }
++ return;
++
++out_rpciod:
++ atomic_dec(&clp->cl_count);
++ rpciod_down();
++out_clnt:
++ rpc_shutdown_client(clnt);
++ goto out_err;
++out_xprt:
++ xprt_destroy(xprt);
++out_err:
++ dprintk("NFSD: warning: no callback path to client %.*s\n",
++ clp->cl_name.len, clp->cl_name.data);
++ cb->cb_client = NULL;
++}
++
++static void
++nfs4_cb_null(struct rpc_task *task)
++{
++ struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
++ struct nfs4_callback *cb = &clp->cl_callback;
++ u32 addr = htonl(cb->cb_addr);
++
++ dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
++
++ if (task->tk_status < 0) {
++ dprintk("NFSD: callback establishment to client %.*s failed\n",
++ clp->cl_name.len, clp->cl_name.data);
++ goto out;
++ }
++ atomic_set(&cb->cb_set, 1);
++ dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr));
++out:
++ put_nfs4_client(clp);
++}
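++
++/*
++ * cb_set is only raised once the NULL probe has round-tripped, so
++ * nfs4_open_delegation() never offers a delegation until the callback
++ * channel to this client is known to work.
++ */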
++
++/*
++ * Called with dp->dl_count incremented
++ */
++static void
++nfs4_cb_recall_done(struct rpc_task *task)
++{
++ struct nfs4_cb_recall *cbr = (struct nfs4_cb_recall *)task->tk_calldata;
++ struct nfs4_delegation *dp = cbr->cbr_dp;
++ int status;
++
++ spin_lock(&recall_lock);
++
++ /* all is well... */
++ if (task->tk_status == 0)
++ goto out;
++
++ /* network partition, retry nfsd4_cb_recall once. */
++ if (task->tk_status == -EIO) {
++ if (atomic_read(&dp->dl_recall_cnt) == 0)
++ goto retry;
++ else
++ /* callback channel no longer available */
++ atomic_set(&dp->dl_client->cl_callback.cb_set, 0);
++ }
++
++ /* Race: a recall occurred milliseconds after a delegation was granted.
++ * The client may have received the recall prior to the delegation.
++ * Retry the recall once.
++ * XXX what about nfserr_bad_stateid?
++ */
++ if (task->tk_status == -EBADHANDLE) {
++ if (atomic_read(&dp->dl_recall_cnt) == 0)
++ goto retry;
++ }
++
++ /* nfs4_laundromat will reap delegation */
++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE);
++
++out:
++ atomic_dec(&dp->dl_count);
++ BUG_ON(atomic_read(&dp->dl_count) < 0);
++ spin_unlock(&recall_lock);
++ return;
++
++retry:
++ atomic_inc(&dp->dl_recall_cnt);
++ spin_unlock(&recall_lock);
++ /* sleep 2 seconds before retrying recall */
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(2*HZ);
++ status = nfsd4_cb_recall(dp);
++ dprintk("NFSD: nfs4_cb_recall_done: retry status: %d dp %p dl_flock %p\n",status,dp, dp->dl_flock);
++}
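++
++/*
++ * The retry policy above allows exactly one re-send: dl_recall_cnt gates
++ * both the -EIO and -EBADHANDLE cases, so after a two-second pause the
++ * recall is retried at most once before the delegation is marked
++ * NFS4_RECALL_COMPLETE for the laundromat to reap.
++ */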
++
++/*
++ * called with dp->dl_count inc'ed.
++ * nfs4_lock_state() may or may not have been called.
++ */
++int
++nfsd4_cb_recall(struct nfs4_delegation *dp)
++{
++ struct nfs4_client *clp;
++ struct rpc_clnt *clnt;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
++ };
++ struct nfs4_cb_recall *cbr = &dp->dl_recall;
++ int status;
++
++ dprintk("NFSD: nfsd4_cb_recall NFS4_enc_cb_recall_sz %d NFS4_dec_cb_recall_sz %d \n",NFS4_enc_cb_recall_sz,NFS4_dec_cb_recall_sz);
++
++ clp = dp->dl_client;
++ clnt = clp->cl_callback.cb_client;
++ status = EIO;
++ if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
++ goto out_free;
++
++ msg.rpc_argp = cbr;
++ msg.rpc_resp = cbr;
++ msg.rpc_cred = nfsd4_lookupcred(clp,0);
++
++ cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
++ cbr->cbr_dp = dp;
++
++ if ((status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
++ nfs4_cb_recall_done, cbr))) {
++ dprintk("NFSD: recall_delegation: rpc_call_async failed %d\n",
++ status);
++ goto out_fail;
++ }
++out:
++ return status;
++out_fail:
++ status = nfserrno(status);
++out_free:
++ /* cbr is embedded in the delegation, so there is nothing to free */
++ goto out;
++}
+Index: linux-2.6.10/fs/nfsd/nfs4proc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4proc.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfs4proc.c 2005-04-05 14:49:13.432687240 +0800
+@@ -461,28 +461,12 @@
+ }
+
+ static inline int
+-access_bits_permit_read(unsigned long access_bmap)
+-{
+- return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) ||
+- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
+-}
+-
+-static inline int
+-access_bits_permit_write(unsigned long access_bmap)
+-{
+- return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) ||
+- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
+-}
+-
+-static inline int
+ nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read)
+ {
+- struct nfs4_stateid *stp;
+ int status;
++ struct file *filp;
+
+ /* no need to check permission - this will be done in nfsd_read() */
+- if (nfs4_in_grace())
+- return nfserr_grace;
+
+ if (read->rd_offset >= OFFSET_MAX)
+ return nfserr_inval;
+@@ -508,21 +492,17 @@
+ goto out;
+ }
+ /* check stateid */
+- if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid,
+- CHECK_FH | RDWR_STATE, &stp))) {
++ if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid,
++ CHECK_FH | RD_STATE, &filp))) {
+ dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
+ goto out;
+ }
+- status = nfserr_openmode;
+- if (!access_bits_permit_read(stp->st_access_bmap)) {
+- dprintk("NFSD: nfsd4_read: file not opened for read!\n");
+- goto out;
+- }
+ status = nfs_ok;
+ out:
+ nfs4_unlock_state();
+ read->rd_rqstp = rqstp;
+ read->rd_fhp = current_fh;
++ read->rd_filp = filp;
+ return status;
+ }
+
+@@ -562,6 +542,8 @@
+ {
+ int status;
+
++ if (nfs4_in_grace())
++ return nfserr_grace;
+ status = nfsd_unlink(rqstp, current_fh, 0, remove->rm_name, remove->rm_namelen);
+ if (status == nfserr_symlink)
+ return nfserr_notdir;
+@@ -580,6 +562,9 @@
+
+ if (!save_fh->fh_dentry)
+ return status;
++ if (nfs4_in_grace() && !(save_fh->fh_export->ex_flags
++ & NFSEXP_NOSUBTREECHECK))
++ return nfserr_grace;
+ status = nfsd_rename(rqstp, save_fh, rename->rn_sname,
+ rename->rn_snamelen, current_fh,
+ rename->rn_tname, rename->rn_tnamelen);
+@@ -605,12 +590,8 @@
+ static inline int
+ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr)
+ {
+- struct nfs4_stateid *stp;
+ int status = nfs_ok;
+
+- if (nfs4_in_grace())
+- return nfserr_grace;
+-
+ if (!current_fh->fh_dentry)
+ return nfserr_nofilehandle;
+
+@@ -626,15 +607,10 @@
+ nfs4_lock_state();
+ if ((status = nfs4_preprocess_stateid_op(current_fh,
+ &setattr->sa_stateid,
+- CHECK_FH | RDWR_STATE, &stp))) {
++ CHECK_FH | WR_STATE, NULL))) {
+ dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+ goto out_unlock;
+ }
+- status = nfserr_openmode;
+- if (!access_bits_permit_write(stp->st_access_bmap)) {
+- dprintk("NFSD: nfsd4_setattr: not opened for write!\n");
+- goto out_unlock;
+- }
+ nfs4_unlock_state();
+ }
+ status = nfs_ok;
+@@ -654,14 +630,11 @@
+ static inline int
+ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write)
+ {
+- struct nfs4_stateid *stp;
+ stateid_t *stateid = &write->wr_stateid;
++ struct file *filp;
+ u32 *p;
+ int status = nfs_ok;
+
+- if (nfs4_in_grace())
+- return nfserr_grace;
+-
+ /* no need to check permission - this will be done in nfsd_write() */
+
+ if (write->wr_offset >= OFFSET_MAX)
+@@ -677,18 +650,13 @@
+ goto zero_stateid;
+ }
+ if ((status = nfs4_preprocess_stateid_op(current_fh, stateid,
+- CHECK_FH | RDWR_STATE, &stp))) {
++ CHECK_FH | WR_STATE, &filp))) {
+ dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+ goto out;
+ }
+
+- status = nfserr_openmode;
+- if (!access_bits_permit_write(stp->st_access_bmap)) {
+- dprintk("NFSD: nfsd4_write: file not open for write!\n");
+- goto out;
+- }
+-
+ zero_stateid:
++
+ nfs4_unlock_state();
+ write->wr_bytes_written = write->wr_buflen;
+ write->wr_how_written = write->wr_stable_how;
+@@ -696,9 +664,16 @@
+ *p++ = nfssvc_boot.tv_sec;
+ *p++ = nfssvc_boot.tv_usec;
+
+- status = nfsd_write(rqstp, current_fh, write->wr_offset,
+- write->wr_vec, write->wr_vlen, write->wr_buflen,
+- &write->wr_how_written);
++ if (filp)
++ status = nfsd_vfs_write(rqstp, current_fh, filp,
++ write->wr_offset, write->wr_vec,
++ write->wr_vlen, write->wr_buflen,
++ &write->wr_how_written);
++ else
++ status = nfsd_write(rqstp, current_fh, write->wr_offset,
++ write->wr_vec, write->wr_vlen, write->wr_buflen,
++ &write->wr_how_written);
++
+ if (status == nfserr_symlink)
+ status = nfserr_inval;
+ return status;
+@@ -872,6 +847,9 @@
+ case OP_CREATE:
+ op->status = nfsd4_create(rqstp, current_fh, &op->u.create);
+ break;
++ case OP_DELEGRETURN:
++ op->status = nfsd4_delegreturn(rqstp, current_fh, &op->u.delegreturn);
++ break;
+ case OP_GETATTR:
+ op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr);
+ break;
+Index: linux-2.6.10/fs/nfsd/export.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/export.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/fs/nfsd/export.c 2005-04-05 14:49:13.415689824 +0800
+@@ -255,7 +255,7 @@
+ new->ek_export = item->ek_export;
+ }
+
+-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */
++static DefineSimpleCacheLookup(svc_expkey)
+
+ #define EXPORT_HASHBITS 8
+ #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
+@@ -492,8 +492,72 @@
+ new->ex_fsid = item->ex_fsid;
+ }
+
+-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */
++struct svc_export *
++svc_export_lookup(struct svc_export *item, int set)
++{
++ struct svc_export *tmp, *new = NULL;
++ struct cache_head **hp, **head;
+
++ head = &svc_export_cache.hash_table[svc_export_hash(item)];
++retry:
++ if (set||new)
++ write_lock(&svc_export_cache.hash_lock);
++ else
++ read_lock(&svc_export_cache.hash_lock);
++ for(hp=head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct svc_export, h);
++ if (svc_export_match(item, tmp)) { /* found a match */
++ cache_get(&tmp->h);
++ if (set) {
++ if (test_bit(CACHE_NEGATIVE, &item->h.flags))
++ set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ else {
++ clear_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ svc_export_update(tmp, item);
++ }
++ }
++ if (set||new)
++ write_unlock(&svc_export_cache.hash_lock);
++ else
++ read_unlock(&svc_export_cache.hash_lock);
++ if (set)
++ cache_fresh(&svc_export_cache, &tmp->h,
++ item->h.expiry_time);
++ if (new)
++ svc_export_put(&new->h, &svc_export_cache);
++ return tmp;
++ }
++ }
++ /* Didn't find anything */
++ if (new) {
++ svc_export_init(new, item);
++ new->h.next = *head;
++ *head = &new->h;
++ set_bit(CACHE_HASHED, &new->h.flags);
++ svc_export_cache.entries++;
++ if (set) {
++ tmp = new;
++ if (test_bit(CACHE_NEGATIVE, &item->h.flags))
++ set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ else
++ svc_export_update(tmp, item);
++ }
++ }
++ if (set||new)
++ write_unlock(&svc_export_cache.hash_lock);
++ else
++ read_unlock(&svc_export_cache.hash_lock);
++ if (new && set)
++ cache_fresh(&svc_export_cache, &new->h, item->h.expiry_time);
++ if (new)
++ return new;
++ new = kmalloc(sizeof(*new), GFP_KERNEL);
++ if (new) {
++ cache_init(&new->h);
++ goto retry;
++ }
++ return NULL;
++}
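++
++/*
++ * This open-codes the DefineSimpleCacheLookup template for svc_export so
++ * that the common read-only lookup can take the hash lock shared; the
++ * write lock (and a retry after the GFP_KERNEL allocation) is taken only
++ * when an entry must be inserted or updated.
++ */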
+
+ struct svc_expkey *
+ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
+Index: linux-2.6.10/fs/nfsd/nfssvc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfssvc.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/fs/nfsd/nfssvc.c 2005-04-05 14:49:13.422688760 +0800
+@@ -378,4 +378,6 @@
+ .pg_name = "nfsd", /* program name */
+ .pg_class = "nfsd", /* authentication class */
+ .pg_stats = &nfsd_svcstats, /* version table */
++ .pg_authenticate = &svc_set_client, /* export authentication */
++
+ };
+Index: linux-2.6.10/fs/nfsd/nfs4recover.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/nfs4recover.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/fs/nfsd/nfs4recover.c 2005-04-05 14:49:13.430687544 +0800
+@@ -0,0 +1,411 @@
++/*
++* linux/fs/nfsd/nfs4recover.c
++*
++* Copyright (c) 2004 The Regents of the University of Michigan.
++* All rights reserved.
++*
++* Andy Adamson <andros@umich.edu>
++*
++* Redistribution and use in source and binary forms, with or without
++* modification, are permitted provided that the following conditions
++* are met:
++*
++* 1. Redistributions of source code must retain the above copyright
++* notice, this list of conditions and the following disclaimer.
++* 2. Redistributions in binary form must reproduce the above copyright
++* notice, this list of conditions and the following disclaimer in the
++* documentation and/or other materials provided with the distribution.
++* 3. Neither the name of the University nor the names of its
++* contributors may be used to endorse or promote products derived
++* from this software without specific prior written permission.
++*
++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++
++
++#include <linux/param.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfs4.h>
++#include <linux/nfsd/state.h>
++#include <linux/nfsd/xdr4.h>
++#include <linux/file.h>
++#include <linux/namei.h>
++#include <asm/uaccess.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++/* MAX_FILE_LEN/2 = maximum client id name length, because the client
++ * name is hex-encoded to form the recovery filename
++ */
++#define MAX_FILE_LEN 256
++
++/* Globals */
++char recovery_dirname[] = "/var/lib/nfs/v4recovery";
++static uid_t saveuid;
++static gid_t savegid;
++static struct nameidata nd_rec_init;
++static int rec_dir_init = 0;
++
++void
++nfs4_save_set_user(void)
++{
++ saveuid = current->fsuid;
++ savegid = current->fsgid;
++ current->fsuid = 0;
++ current->fsgid = 0;
++}
++
++void
++nfs4_reset_user(void)
++{
++ current->fsuid = saveuid;
++ current->fsgid = savegid;
++}
++
++void
++nfs4_make_rec_filename(char **filename, struct nfs4_client *clp)
++{
++ char *fname = *filename;
++ int flen = MAX_FILE_LEN;
++
++ memset(fname, 0, flen);
++ qword_addhex(&fname, &flen, clp->cl_name.data, clp->cl_name.len);
++}
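++
++/*
++ * qword_addhex() writes two hex digits per byte of the client name, which
++ * is why MAX_FILE_LEN/2 bounds the maximum client id name length (see the
++ * comment at MAX_FILE_LEN above).
++ */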
++
++/* XXX need to check dput() mntput ?? */
++int
++nfsd4_create_clid_file(struct nfs4_client *clp)
++{
++ struct file *filp = NULL;
++ struct dentry *dentry;
++ mm_segment_t oldfs;
++ loff_t offset = 0;
++ char fbuf[MAX_FILE_LEN], *fname = fbuf;
++ int status;
++
++
++ if (!rec_dir_init)
++ return -EINVAL;
++ nfs4_save_set_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ /* lock the parent */
++ down(&nd_rec_init.dentry->d_inode->i_sem);
++
++ nfs4_make_rec_filename(&fname, clp);
++ /* dentry->d_count will be 1 */
++ dentry = lookup_one_len(fname, nd_rec_init.dentry, strlen(fname));
++ status = PTR_ERR(dentry);
++ if (IS_ERR(dentry))
++ goto out_unlock;
++
++ status = -EEXIST;
++ if (dentry->d_inode) {
++ dprintk("NFSD: nfsd4_create_clid_file: FILE EXISTS\n");
++ goto out_unlock;
++ }
++
++ /* nd_rec_init.dentry->d_count is bumped */
++ status = vfs_create(nd_rec_init.dentry->d_inode, dentry, S_IRWXU, NULL);
++ if (status < 0)
++ goto out_unlock;
++
++ up(&nd_rec_init.dentry->d_inode->i_sem);
++
++ filp = dentry_open(dget(dentry), mntget(nd_rec_init.mnt), O_RDWR);
++ status = PTR_ERR(filp);
++ if (IS_ERR(filp))
++ goto out_mnt;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ status = vfs_write(filp, clp->cl_name.data, clp->cl_name.len, &offset);
++ set_fs(oldfs);
++
++ dprintk("NFSD: nfsd4_create_clid_file vfs_write returns %d\n",status);
++ if (status >= 0)
++ status = nfs_ok;
++
++ if (filp->f_op && filp->f_op->flush) {
++ int err = filp->f_op->flush(filp);
++ dprintk("NFSD: nfsd4_create_clid_file called flush\n");
++ if (!status)
++ status = err;
++ }
++ /* dget and mntget in dentry_open call */
++ fput(filp);
++
++ /* dentry->d_count will be 0 */
++ dput(dentry);
++out_mnt:
++ /* dget in vfs_create call */
++ dput(nd_rec_init.dentry);
++
++out:
++ nfs4_reset_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++ dprintk("NFSD: nfsd4_create_clid_file returns %d\n",status);
++
++ return status;
++
++out_unlock:
++ up(&nd_rec_init.dentry->d_inode->i_sem);
++ goto out;
++}
++
++/*
++ * called with pdentry->d_inode->i_sem held ?
++ */
++int
++nfsd4_unlink_rec_file(char *name, int namlen)
++{
++ struct dentry *dentry;
++ int type, status;
++
++ dprintk("NFSD: nfsd4_unlink_rec_file. name %.*s\n", namlen, name);
++
++ dentry = lookup_one_len(name, nd_rec_init.dentry, namlen);
++ dprintk("NFSD: nfsd4_unlink_rec_file POST LOOKUP nd_rec d_count %d\n",
++ atomic_read(&nd_rec_init.dentry->d_count));
++ status = PTR_ERR(dentry);
++ if (IS_ERR(dentry))
++ goto out;
++
++ status = -ENOENT;
++ if (!dentry->d_inode) {
++ dput(dentry);
++ goto out;
++ }
++
++ /* should only be files here! */
++ type = dentry->d_inode->i_mode & S_IFMT;
++ status = -EISDIR;
++ if (!(type & S_IFREG)) {
++ dput(dentry);
++ goto out;
++ }
++
++ dprintk("NFSD: nfsd4_unlink_rec_file PRE VFS UNLINK [%d:%d]\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dentry);
++
++ dprintk("NFSD: nfsd4_unlink_rec_file POST VFS UNLINK [%d:%d]\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ dprintk("NFSD: nfsd4_unlink_rec_file FILE dentry->d_count %d\n",
++ atomic_read(&dentry->d_count));
++out:
++ dprintk("NFSD: nfsd4_unlink_rec_file returns %d\n",status);
++ return status;
++}
++
++void
++nfsd4_remove_clid_file(struct nfs4_client *clp)
++{
++ char fbuf[MAX_FILE_LEN], *fname = fbuf;
++ int status;
++
++ if (!rec_dir_init)
++ return;
++
++ dprintk("NFSD: nfsd4_remove_clid_file client %.*s\n",
++ clp->cl_name.len,clp->cl_name.data);
++
++ nfs4_save_set_user();
++
++ dprintk("NFSD: nfsd4_remove_clid_file IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ nfs4_make_rec_filename(&fname, clp);
++ status = nfsd4_unlink_rec_file(fname, strlen(fname));
++ nfs4_reset_user();
++ if (status != nfs_ok)
++ printk(KERN_ERR "NFSD: Failed to remove expired client "
++ "state file %.*s from %s\n", (int)strlen(fname),
++ fname, recovery_dirname);
++
++ dprintk("NFSD: nfsd4_remove_clid_file OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++ return;
++}
++
++struct rec_dirent {
++ int clear;
++};
++
++/*
++ * On reboot, stuff the reclaim hash with known client ids.
++ *
++ * The filename may not equal the clid; the clid might be the first
++ * (and so far only) line of data in the file.
++ *
++ * I will probably end up writing data such as the setclientid principal
++ * to each clid file. If I do, I will always put the clid as the
++ * first line of data.
++ */
++
++int
++nfsd4_get_recdir_dirent(struct rec_dirent *rdirent, const char *name,
++ int namlen, loff_t offset, ino_t ino, unsigned int d_type)
++{
++ struct dentry *dclid;
++ struct file *filp;
++ mm_segment_t oldfs;
++ int status = nfs_ok;
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent name %.*s, clear %d\n",
++ namlen, name, rdirent->clear);
++
++ if (name && isdotent(name, namlen))
++ goto out;
++
++ dclid = lookup_one_len(name, nd_rec_init.dentry, namlen);
++ status = PTR_ERR(dclid);
++ if (IS_ERR(dclid))
++ goto out;
++
++ if (rdirent->clear) {
++ dprintk("NFSD: nfsd4_get_recdir_dirent REMOVE\n");
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent PRE VFS_UNLINK [%d:%d]\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dclid);
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent POST VFS_UNLINK [%d:%d]\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ } else {
++ char buf[MAX_FILE_LEN];
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent READ\n");
++
++ filp = dentry_open(dclid, mntget(nd_rec_init.mnt), O_RDWR);
++ if (IS_ERR(filp)) {
++ status = PTR_ERR(filp);
++ goto out;
++ }
++
++ memset(buf, 0, MAX_FILE_LEN);
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ status = vfs_read(filp, buf, MAX_FILE_LEN, &filp->f_pos);
++ set_fs(oldfs);
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent vfs_read returns %d\n",
++ status);
++ if (status > 0)
++ status = nfs4_client_to_reclaim(buf, status);
++ fput(filp);
++ }
++out:
++ dprintk("NFSD:nfsd4_get_recdir_dirent OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ dprintk("NFSD: nfsd4_get_recdir_dirent returns %d\n",status);
++ return 0;
++}
++
++int
++nfsd4_list_rec_dir(int clear)
++{
++ struct file *filp;
++ struct rec_dirent rdirent;
++ int status;
++
++ if (!rec_dir_init)
++ return -EINVAL;
++
++ nfs4_save_set_user();
++
++ dprintk("NFSD: nfsd4_list_rec_dir IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ /* open directory */
++ filp = dentry_open(dget(nd_rec_init.dentry), mntget(nd_rec_init.mnt),
++ O_RDWR);
++ status = PTR_ERR(filp);
++ if (IS_ERR(filp))
++ goto out;
++ rdirent.clear = clear;
++
++ /* read the directory entries into memory */
++ status = vfs_readdir(filp, (filldir_t) nfsd4_get_recdir_dirent,
++ (void*)&rdirent);
++
++ fput(filp);
++out:
++ dprintk("NFSD: nfsd4_list_rec_dir OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ dprintk("NFSD: nfsd4_list_rec_dir DONE status: %d\n", status);
++
++ nfs4_reset_user();
++ return status;
++}
++
++
++/*
++ * Hold reference to the recovery directory.
++ */
++
++void
++nfsd4_init_rec_dir(char *rec_dirname)
++{
++ int status;
++
++ printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
++ rec_dirname);
++
++ nfs4_save_set_user();
++
++ status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &nd_rec_init);
++
++ printk("NFSD: nfsd4_init_rec_dir INITIAL recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ if (!status)
++ rec_dir_init = 1;
++ nfs4_reset_user();
++ printk("NFSD: nfsd4_init_rec_dir rec_dir_init %d\n", rec_dir_init);
++}
++
++void
++nfsd4_shutdown_rec_dir(void)
++{
++ rec_dir_init = 0;
++ path_release(&nd_rec_init);
++
++ printk("NFSD: nfsd4_shutdown_rec_dir FINAL recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++}
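For orientation, the recovery-directory scan implemented by nfsd4_list_rec_dir() above follows the stock 2.6-era VFS pattern: pin the directory, open it via dentry_open(), and let vfs_readdir() drive a filldir callback. Below is a minimal standalone sketch of that pattern, not the nfsd code itself; the sketch_* names are invented for illustration, and O_RDONLY suffices for a pure scan even though the code above opens O_RDWR.

#include <linux/fs.h>
#include <linux/file.h>
#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/namei.h>

static int sketch_filldir(void *opaque, const char *name, int namlen,
                          loff_t offset, ino_t ino, unsigned int d_type)
{
        /* Skip "." and ".."; a real callback would record or unlink here. */
        if (name[0] == '.' &&
            (namlen == 1 || (namlen == 2 && name[1] == '.')))
                return 0;
        /* Always return 0 so vfs_readdir() keeps iterating; a non-zero
         * return would abort the whole scan. */
        return 0;
}

static int sketch_scan_dir(struct nameidata *nd)
{
        struct file *filp;
        int status;

        /* dentry_open() consumes the dget/mntget references, even on error. */
        filp = dentry_open(dget(nd->dentry), mntget(nd->mnt), O_RDONLY);
        if (IS_ERR(filp))
                return PTR_ERR(filp);
        status = vfs_readdir(filp, sketch_filldir, NULL);
        fput(filp);
        return status;
}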
+Index: linux-2.6.10/fs/nfsd/Makefile
+===================================================================
+--- linux-2.6.10.orig/fs/nfsd/Makefile 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/fs/nfsd/Makefile 2005-04-05 14:49:13.431687392 +0800
+@@ -8,5 +8,5 @@
+ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
+ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+- nfs4acl.o
++ nfs4acl.o nfs4callback.o nfs4recover.o
+ nfsd-objs := $(nfsd-y)
+Index: linux-2.6.10/fs/nfs/nfs4xdr.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/nfs4xdr.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/fs/nfs/nfs4xdr.c 2005-04-05 14:49:13.452684200 +0800
+@@ -82,12 +82,16 @@
+ #define encode_getfh_maxsz (op_encode_hdr_maxsz)
+ #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
+ ((3+NFS4_FHSIZE) >> 2))
+-#define encode_getattr_maxsz (op_encode_hdr_maxsz + 3)
++#define nfs4_fattr_bitmap_maxsz 3
++#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+ #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+ #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+-#define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz)
+-#define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \
+- nfs4_fattr_bitmap_maxsz)
++/* This is based on getfattr, which uses the most attributes: */
++#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
++ 3 + 3 + 3 + 2 * nfs4_name_maxsz))
++#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
++ nfs4_fattr_value_maxsz)
++#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
+ #define encode_savefh_maxsz (op_encode_hdr_maxsz)
+ #define decode_savefh_maxsz (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2)
+@@ -122,11 +126,11 @@
+ #define encode_symlink_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_name_maxsz + \
+ nfs4_path_maxsz + \
+- nfs4_fattr_bitmap_maxsz)
++ nfs4_fattr_maxsz)
+ #define decode_symlink_maxsz (op_decode_hdr_maxsz + 8)
+ #define encode_create_maxsz (op_encode_hdr_maxsz + \
+ 2 + nfs4_name_maxsz + \
+- nfs4_fattr_bitmap_maxsz)
++ nfs4_fattr_maxsz)
+ #define decode_create_maxsz (op_decode_hdr_maxsz + 8)
+ #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
+ #define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+@@ -205,7 +209,7 @@
+ #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ op_encode_hdr_maxsz + 4 + \
+- nfs4_fattr_bitmap_maxsz + \
++ nfs4_fattr_maxsz + \
+ encode_getattr_maxsz)
+ #define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+@@ -360,6 +364,20 @@
+ encode_delegreturn_maxsz)
+ #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
+ decode_delegreturn_maxsz)
++#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_getattr_maxsz)
++#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ op_decode_hdr_maxsz + \
++ nfs4_fattr_bitmap_maxsz + 1)
++#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ op_encode_hdr_maxsz + 4 + \
++ nfs4_fattr_bitmap_maxsz + 1)
++#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+
+ static struct {
+ unsigned int mode;
+@@ -459,7 +477,7 @@
+ * In the worst-case, this would be
+ * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+ * = 36 bytes, plus any contribution from variable-length fields
+- * such as owner/group/acl's.
++ * such as owner/group.
+ */
+ len = 16;
+
+@@ -1083,6 +1101,27 @@
+ return 0;
+ }
+
++extern nfs4_stateid zero_stateid;
++
++static int
++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(4+sizeof(zero_stateid.data));
++ WRITE32(OP_SETATTR);
++ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
++ RESERVE_SPACE(2*4);
++ WRITE32(1);
++ WRITE32(FATTR4_WORD0_ACL);
++ if (arg->acl_len % 4)
++ return -EINVAL;
++ RESERVE_SPACE(4);
++ WRITE32(arg->acl_len);
++ xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
++ return 0;
++}
++
+ static int
+ encode_savefh(struct xdr_stream *xdr)
+ {
+@@ -1627,6 +1666,34 @@
+ }
+
+ /*
++ * Encode a GETACL request
++ */
++static int
++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,
++ struct nfs_getaclargs *args)
++{
++ struct xdr_stream xdr;
++ struct rpc_auth *auth = req->rq_task->tk_auth;
++ struct compound_hdr hdr = {
++ .nops = 2,
++ };
++ int replen, status;
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, &hdr);
++ status = encode_putfh(&xdr, args->fh);
++ if (status)
++ goto out;
++ status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
++ /* set up reply buffer: */
++ replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
++ xdr_inline_pages(&req->rq_rcv_buf, replen,
++ args->acl_pages, args->acl_pgbase, args->acl_len);
++out:
++ return status;
++}
++
++/*
+ * Encode a WRITE request
+ */
+ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+@@ -3122,6 +3189,46 @@
+ return decode_op_hdr(xdr, OP_RENEW);
+ }
+
++static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
++ ssize_t *acl_len)
++{
++ uint32_t *savep;
++ uint32_t attrlen,
++ bitmap[2] = {0};
++ struct kvec *iov = req->rq_rcv_buf.head;
++ int status;
++
++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
++ goto out;
++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
++ goto out;
++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
++ goto out;
++
++ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
++ return -EIO;
++ if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
++ int hdrlen, recvd;
++
++ /* We ignore &savep and don't do consistency checks on
++ * the attr length. Let userspace figure it out.... */
++ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
++ recvd = req->rq_rcv_buf.len - hdrlen;
++ if (attrlen > recvd) {
++ printk(KERN_WARNING "NFS: server cheating in getattr"
++ " acl reply: attrlen %u > recvd %u\n",
++ attrlen, recvd);
++ return -EINVAL;
++ }
++ if (attrlen <= *acl_len)
++ xdr_read_pages(xdr, attrlen);
++ *acl_len = attrlen;
++ }
++
++out:
++ return status;
++}
++
+ static int
+ decode_savefh(struct xdr_stream *xdr)
+ {
+@@ -3413,6 +3520,71 @@
+
+ }
+
++/*
++ * Encode a SETACL request
++ */
++static int
++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .nops = 2,
++ };
++ int status;
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, &hdr);
++ status = encode_putfh(&xdr, args->fh);
++ if (status)
++ goto out;
++ status = encode_setacl(&xdr, args);
++out:
++ return status;
++}
++
++/*
++ * Decode SETACL response
++ */
++static int
++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_setattr(&xdr, res);
++out:
++ return status;
++}
++
++/*
++ * Decode GETACL response
++ */
++static int
++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, ssize_t *acl_len)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_getacl(&xdr, rqstp, acl_len);
++
++out:
++ return status;
++}
+
+ /*
+ * Decode CLOSE response
+@@ -4009,6 +4181,8 @@
+ PROC(READDIR, enc_readdir, dec_readdir),
+ PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
+ PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
++ PROC(GETACL, enc_getacl, dec_getacl),
++ PROC(SETACL, enc_setacl, dec_setacl),
+ };
+
+ struct rpc_version nfs_version4 = {
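A note on the sizing macros in the nfs4xdr.c hunks above: every *_maxsz constant counts 32-bit XDR words, not bytes, which is why nfs4_xdr_enc_getacl() shifts the sum left by two before handing it to xdr_inline_pages(). A sketch of the arithmetic follows; getacl_replen_bytes is an invented helper name, but RPC_REPHDRSIZE, au_rslack, and NFS4_dec_getacl_sz are the quantities used in the encoder above.

#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xdr.h>

/* XDR quantities are 4-byte words; "<< 2" converts word counts to bytes.
 * The result is the byte offset at which the ACL page vector is spliced
 * into the reply buffer, after the fixed-size RPC and compound headers. */
static unsigned int getacl_replen_bytes(struct rpc_rqst *req)
{
        struct rpc_auth *auth = req->rq_task->tk_auth;

        return (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
}

The pages spliced in at that offset then receive the opaque ACL data directly, without an extra copy through the head iovec.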
+Index: linux-2.6.10/fs/nfs/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/inode.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/fs/nfs/inode.c 2005-04-05 14:49:13.445685264 +0800
+@@ -486,13 +486,27 @@
+ if (error < 0)
+ goto out_err;
+
+- buf->f_frsize = server->wtmult;
++ /*
++ * Current versions of glibc do not correctly handle the
++ * case where f_frsize != f_bsize. Eventually we want to
++ * report the value of wtmult in this field.
++ */
++ buf->f_frsize = sb->s_blocksize;
++
++ /*
++ * On most *nix systems, f_blocks, f_bfree, and f_bavail
++ * are reported in units of f_frsize. Linux hasn't had
++ * an f_frsize field in its statfs struct until recently,
++ * thus historically Linux's sys_statfs reports these
++ * fields in units of f_bsize.
++ */
+ buf->f_bsize = sb->s_blocksize;
+ blockbits = sb->s_blocksize_bits;
+ blockres = (1 << blockbits) - 1;
+ buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+ buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+ buf->f_bavail = (res.abytes + blockres) >> blockbits;
++
+ buf->f_files = res.tfiles;
+ buf->f_ffree = res.afiles;
+
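The statfs hunk above converts the byte totals returned by FSSTAT into f_bsize-sized blocks, rounding up so a partially used block still counts. The rounding idiom in isolation, as a hedged sketch (bytes_to_blocks is an invented name):

#include <linux/types.h>

/* Round bytes up to whole blocks of size (1 << blockbits), matching the
 * "(res.tbytes + blockres) >> blockbits" computation in the hunk above. */
static unsigned long bytes_to_blocks(u64 bytes, unsigned int blockbits)
{
        u64 blockres = (1ULL << blockbits) - 1;

        return (unsigned long)((bytes + blockres) >> blockbits);
}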
+@@ -565,9 +579,9 @@
+
+ memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+- nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS;
+ else
+- nfsi->flags |= NFS_INO_INVALID_ATTR;
++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+ }
+
+ /*
+@@ -605,7 +619,7 @@
+ return 0;
+ if (nfs_compare_fh(NFS_FH(inode), fh))
+ return 0;
+- if (is_bad_inode(inode))
++ if (is_bad_inode(inode) || NFS_STALE(inode))
+ return 0;
+ return 1;
+ }
+@@ -664,7 +678,7 @@
+ /* Why so? Because we want revalidate for devices/FIFOs, and
+ * that's precisely what we have in nfs_file_inode_operations.
+ */
+- inode->i_op = &nfs_file_inode_operations;
++ inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = &nfs_file_operations;
+ inode->i_data.a_ops = &nfs_file_aops;
+@@ -766,13 +780,8 @@
+ vmtruncate(inode, attr->ia_size);
+ }
+ }
+- if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred;
+- if (*cred) {
+- put_rpccred(*cred);
+- *cred = NULL;
+- }
+- }
++ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS;
+ nfs_end_data_update(inode);
+ unlock_kernel();
+ return error;
+@@ -949,14 +958,14 @@
+ lock_kernel();
+ if (!inode || is_bad_inode(inode))
+ goto out_nowait;
+- if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode)
++ if (NFS_STALE(inode))
+ goto out_nowait;
+
+ while (NFS_REVALIDATING(inode)) {
+ status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING);
+ if (status < 0)
+ goto out_nowait;
+- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOAC)
++ if (NFS_ATTRTIMEO(inode) == 0)
+ continue;
+ if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME))
+ continue;
+@@ -968,14 +977,14 @@
+ /* Protect against RPC races by saving the change attribute */
+ verifier = nfs_save_change_attribute(inode);
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
+- if (status) {
++ if (status != 0) {
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode), status);
+ if (status == -ESTALE) {
+- NFS_FLAGS(inode) |= NFS_INO_STALE;
+- if (inode != inode->i_sb->s_root->d_inode)
+- remove_inode_hash(inode);
++ nfs_zap_caches(inode);
++ if (!S_ISDIR(inode->i_mode))
++ NFS_FLAGS(inode) |= NFS_INO_STALE;
+ }
+ goto out;
+ }
+@@ -1014,7 +1023,6 @@
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode));
+
+- NFS_FLAGS(inode) &= ~NFS_INO_STALE;
+ out:
+ NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING;
+ wake_up(&nfsi->nfs_i_wait);
+@@ -1161,7 +1169,7 @@
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
+ || inode->i_uid != fattr->uid
+ || inode->i_gid != fattr->gid)
+- nfsi->flags |= NFS_INO_INVALID_ATTR;
++ nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+
+ /* Has the link count changed? */
+ if (inode->i_nlink != fattr->nlink)
+@@ -1270,7 +1278,7 @@
+ #endif
+ nfsi->change_attr = fattr->change_attr;
+ if (!data_unstable)
+- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS;
+ }
+
+ memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+@@ -1278,14 +1286,8 @@
+
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
+ inode->i_uid != fattr->uid ||
+- inode->i_gid != fattr->gid) {
+- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred;
+- if (*cred) {
+- put_rpccred(*cred);
+- *cred = NULL;
+- }
+- invalid |= NFS_INO_INVALID_ATTR;
+- }
++ inode->i_gid != fattr->gid)
++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+
+ inode->i_mode = fattr->mode;
+ inode->i_nlink = fattr->nlink;
+@@ -1335,7 +1337,8 @@
+ */
+ nfs_invalidate_inode(inode);
+ out_err:
+- return -EIO;
++ NFS_FLAGS(inode) |= NFS_INO_STALE;
++ return -ESTALE;
+ }
+
+ /*
+@@ -1449,8 +1452,6 @@
+
+ kill_anon_super(s);
+
+- nfs4_renewd_prepare_shutdown(server);
+-
+ if (server->client != NULL && !IS_ERR(server->client))
+ rpc_shutdown_client(server->client);
+ if (server->client_sys != NULL && !IS_ERR(server->client_sys))
+@@ -1461,8 +1462,6 @@
+
+ rpciod_down(); /* release rpciod */
+
+- destroy_nfsv4_state(server);
+-
+ if (server->hostname != NULL)
+ kfree(server->hostname);
+ kfree(server);
+@@ -1478,8 +1477,53 @@
+
+ #ifdef CONFIG_NFS_V4
+
+-static void nfs4_clear_inode(struct inode *);
++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
++
++int
++nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
++ size_t buflen, int flags)
++{
++ struct inode *inode = dentry->d_inode;
++
++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++ return -EINVAL;
++
++ if (!S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++
++ return nfs4_proc_set_acl(inode, buf, buflen);
++}
++
++/* The getxattr man page suggests returning -ENODATA for unknown attributes,
++ * and that's what we'll do for e.g. user attributes that haven't been set.
++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
++ * attributes in kernel-managed attribute namespaces. */
++ssize_t
++nfs4_getxattr(struct dentry *dentry, const char *key, void *buf,
++ size_t buflen)
++{
++ struct inode *inode = dentry->d_inode;
+
++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++ return -EOPNOTSUPP;
++
++ return nfs4_proc_get_acl(inode, buf, buflen);
++}
++
++ssize_t
++nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
++{
++ ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1;
++
++ if (buf && buflen < len)
++ return -ERANGE;
++ if (buf)
++ memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
++ return len;
++}
++
++static void nfs4_clear_inode(struct inode *);
+
+ static struct super_operations nfs4_sops = {
+ .alloc_inode = nfs_alloc_inode,
+@@ -1543,9 +1587,6 @@
+ server->wsize = nfs_block_size(data->wsize, NULL);
+ server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+
+- /* NFSv4 doesn't use NLM locking */
+- server->flags |= NFS_MOUNT_NONLM;
+-
+ server->acregmin = data->acregmin*HZ;
+ server->acregmax = data->acregmax*HZ;
+ server->acdirmin = data->acdirmin*HZ;
+@@ -1790,8 +1831,22 @@
+
+ static void nfs4_kill_super(struct super_block *sb)
+ {
++ struct nfs_server *server = NFS_SB(sb);
++
+ nfs_return_all_delegations(sb);
+- nfs_kill_super(sb);
++ kill_anon_super(sb);
++
++ nfs4_renewd_prepare_shutdown(server);
++
++ if (server->client != NULL && !IS_ERR(server->client))
++ rpc_shutdown_client(server->client);
++ rpciod_down(); /* release rpciod */
++
++ destroy_nfsv4_state(server);
++
++ if (server->hostname != NULL)
++ kfree(server->hostname);
++ kfree(server);
+ }
+
+ static struct file_system_type nfs4_fs_type = {
+@@ -1821,9 +1876,13 @@
+ extern int nfs_init_nfspagecache(void);
+ extern void nfs_destroy_nfspagecache(void);
+ extern int nfs_init_readpagecache(void);
+-extern int nfs_destroy_readpagecache(void);
++extern void nfs_destroy_readpagecache(void);
+ extern int nfs_init_writepagecache(void);
+-extern int nfs_destroy_writepagecache(void);
++extern void nfs_destroy_writepagecache(void);
++#ifdef CONFIG_NFS_DIRECTIO
++extern int nfs_init_directcache(void);
++extern void nfs_destroy_directcache(void);
++#endif
+
+ static kmem_cache_t * nfs_inode_cachep;
+
+@@ -1904,6 +1963,12 @@
+ if (err)
+ goto out1;
+
++#ifdef CONFIG_NFS_DIRECTIO
++ err = nfs_init_directcache();
++ if (err)
++ goto out0;
++#endif
++
+ #ifdef CONFIG_PROC_FS
+ rpc_proc_register(&nfs_rpcstat);
+ #endif
+@@ -1914,8 +1979,14 @@
+ goto out;
+ return 0;
+ out:
++#ifdef CONFIG_PROC_FS
+ rpc_proc_unregister("nfs");
++#endif
+ nfs_destroy_writepagecache();
++#ifdef CONFIG_NFS_DIRECTIO
++out0:
++ nfs_destroy_directcache();
++#endif
+ out1:
+ nfs_destroy_readpagecache();
+ out2:
+@@ -1928,6 +1999,9 @@
+
+ static void __exit exit_nfs_fs(void)
+ {
++#ifdef CONFIG_NFS_DIRECTIO
++ nfs_destroy_directcache();
++#endif
+ nfs_destroy_writepagecache();
+ nfs_destroy_readpagecache();
+ nfs_destroy_inodecache();
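The three xattr handlers added to inode.c above expose the raw NFSv4 ACL under the name "system.nfs4_acl", so from user space it maps straight onto getxattr(2). A minimal user-space sketch of the usual size-then-fetch pattern follows; read_nfs4_acl is an invented helper, and on 2005-era systems the prototype lives in <attr/xattr.h> rather than <sys/xattr.h>.

/* User-space sketch: fetch the raw ACL exposed as "system.nfs4_acl". */
#include <sys/types.h>
#include <sys/xattr.h>
#include <stdlib.h>

static void *read_nfs4_acl(const char *path, ssize_t *lenp)
{
        /* A zero-length first call sizes the attribute; the handler
         * returns -EOPNOTSUPP for any other name, matching the
         * ext2/ext3 convention noted in the comment above. */
        ssize_t len = getxattr(path, "system.nfs4_acl", NULL, 0);
        void *buf;

        if (len < 0)
                return NULL;
        buf = malloc(len);
        if (buf == NULL)
                return NULL;
        if (getxattr(path, "system.nfs4_acl", buf, len) < 0) {
                free(buf);
                return NULL;
        }
        *lenp = len;
        return buf;
}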
+Index: linux-2.6.10/fs/nfs/nfs4state.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/nfs4state.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/fs/nfs/nfs4state.c 2005-04-05 14:49:13.446685112 +0800
+@@ -445,7 +445,7 @@
+ state->owner = owner;
+ atomic_inc(&owner->so_count);
+ list_add(&state->inode_states, &nfsi->open_states);
+- state->inode = inode;
++ state->inode = igrab(inode);
+ spin_unlock(&inode->i_lock);
+ } else {
+ spin_unlock(&inode->i_lock);
+@@ -471,6 +471,7 @@
+ list_del(&state->inode_states);
+ spin_unlock(&inode->i_lock);
+ list_del(&state->open_states);
++ iput(inode);
+ BUG_ON (state->state != 0);
+ nfs4_free_open_state(state);
+ nfs4_put_state_owner(owner);
+@@ -486,7 +487,6 @@
+ struct nfs4_state_owner *owner = state->owner;
+ struct nfs4_client *clp = owner->so_client;
+ int newstate;
+- int status = 0;
+
+ atomic_inc(&owner->so_count);
+ down_read(&clp->cl_sem);
+@@ -508,10 +508,8 @@
+ newstate |= FMODE_WRITE;
+ if (state->state == newstate)
+ goto out;
+- if (newstate != 0)
+- status = nfs4_do_downgrade(inode, state, newstate);
+- else
+- status = nfs4_do_close(inode, state);
++ if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS)
++ return;
+ }
+ out:
+ nfs4_put_open_state(state);
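The igrab()/iput() pair added above exists because CLOSE becomes asynchronous later in this patch (see the nfs4proc.c changes): the open state can now outlive the last fput(), so it must hold its own inode reference. A generic sketch of the rule follows; the pinned_state names are invented, and note that igrab() can return NULL for an inode already being freed, which the sketch checks even though the code above does not.

#include <linux/fs.h>

/* An object whose release is deferred to an async callback must pin
 * the inode it points at for its whole lifetime. */
struct pinned_state {
        struct inode *inode;
};

static int pinned_state_init(struct pinned_state *p, struct inode *inode)
{
        p->inode = igrab(inode);  /* NULL if the inode is being torn down */
        return p->inode != NULL ? 0 : -ESTALE;
}

static void pinned_state_free(struct pinned_state *p)
{
        iput(p->inode);           /* iput(NULL) is a no-op */
        p->inode = NULL;
}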
+Index: linux-2.6.10/fs/nfs/idmap.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/idmap.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/fs/nfs/idmap.c 2005-04-05 14:49:13.454683896 +0800
+@@ -80,6 +80,7 @@
+ static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+ size_t);
+ void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
++static void idmap_pipe_release(struct inode *inode);
+
+ static unsigned int fnvhash32(const void *, size_t);
+
+@@ -87,6 +88,7 @@
+ .upcall = idmap_pipe_upcall,
+ .downcall = idmap_pipe_downcall,
+ .destroy_msg = idmap_pipe_destroy_msg,
++ .release_pipe = idmap_pipe_release,
+ };
+
+ void
+@@ -448,6 +450,19 @@
+ up(&idmap->idmap_im_lock);
+ }
+
++static void
++idmap_pipe_release(struct inode *inode)
++{
++ struct rpc_inode *rpci = RPC_I(inode);
++ struct idmap *idmap = (struct idmap *)rpci->private;
++ struct idmap_msg *im = &idmap->idmap_im;
++
++ down(&idmap->idmap_im_lock);
++ im->im_status = IDMAP_STATUS_LOOKUPFAIL;
++ wake_up(&idmap->idmap_wq);
++ up(&idmap->idmap_im_lock);
++}
++
+ /*
+ * Fowler/Noll/Vo hash
+ * http://www.isthe.com/chongo/tech/comp/fnv/
+Index: linux-2.6.10/fs/nfs/dir.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/dir.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/nfs/dir.c 2005-04-05 14:49:13.439686176 +0800
+@@ -40,8 +40,6 @@
+ static int nfs_opendir(struct inode *, struct file *);
+ static int nfs_readdir(struct file *, void *, filldir_t);
+ static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
+-static int nfs_cached_lookup(struct inode *, struct dentry *,
+- struct nfs_fh *, struct nfs_fattr *);
+ static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
+ static int nfs_mkdir(struct inode *, struct dentry *, int);
+ static int nfs_rmdir(struct inode *, struct dentry *);
+@@ -92,6 +90,9 @@
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
++ .getxattr = nfs4_getxattr,
++ .setxattr = nfs4_setxattr,
++ .listxattr = nfs4_listxattr,
+ };
+
+ #endif /* CONFIG_NFS_V4 */
+@@ -294,24 +295,13 @@
+ return res;
+ }
+
+-static unsigned int nfs_type2dtype[] = {
+- DT_UNKNOWN,
+- DT_REG,
+- DT_DIR,
+- DT_BLK,
+- DT_CHR,
+- DT_LNK,
+- DT_SOCK,
+- DT_UNKNOWN,
+- DT_FIFO
+-};
+-
+-static inline
+-unsigned int nfs_type_to_d_type(enum nfs_ftype type)
++static inline unsigned int dt_type(struct inode *inode)
+ {
+- return nfs_type2dtype[type];
++ return (inode->i_mode >> 12) & 15;
+ }
+
++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
++
+ /*
+ * Once we've found the start of the dirent within a page: fill 'er up...
+ */
+@@ -321,6 +311,7 @@
+ {
+ struct file *file = desc->file;
+ struct nfs_entry *entry = desc->entry;
++ struct dentry *dentry = NULL;
+ unsigned long fileid;
+ int loop_count = 0,
+ res;
+@@ -333,9 +324,16 @@
+ * retrieving the current dirent on the server */
+ fileid = nfs_fileid_to_ino_t(entry->ino);
+
++ /* Get a dentry if we have one */
++ if (dentry != NULL)
++ dput(dentry);
++ dentry = nfs_readdir_lookup(desc);
++
+ /* Use readdirplus info */
+- if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR))
+- d_type = nfs_type_to_d_type(entry->fattr->type);
++ if (dentry != NULL && dentry->d_inode != NULL) {
++ d_type = dt_type(dentry->d_inode);
++ fileid = dentry->d_inode->i_ino;
++ }
+
+ res = filldir(dirent, entry->name, entry->len,
+ entry->prev_cookie, fileid, d_type);
+@@ -352,7 +350,8 @@
+ }
+ }
+ dir_page_release(desc);
+-
++ if (dentry != NULL)
++ dput(dentry);
+ dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res);
+ return res;
+ }
+@@ -615,24 +614,10 @@
+ goto out_valid;
+ }
+
+- /*
+- * Note: we're not holding inode->i_sem and so may be racing with
+- * operations that change the directory. We therefore save the
+- * change attribute *before* we do the RPC call.
+- */
+- verifier = nfs_save_change_attribute(dir);
+- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr);
+- if (!error) {
+- if (nfs_compare_fh(NFS_FH(inode), &fhandle))
+- goto out_bad;
+- if (nfs_lookup_verify_inode(inode, isopen))
+- goto out_zap_parent;
+- goto out_valid_renew;
+- }
+-
+ if (NFS_STALE(inode))
+ goto out_bad;
+
++ verifier = nfs_save_change_attribute(dir);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+ if (error)
+ goto out_bad;
+@@ -641,7 +626,6 @@
+ if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
+ goto out_bad;
+
+- out_valid_renew:
+ nfs_renew_times(dentry);
+ nfs_set_verifier(dentry, verifier);
+ out_valid:
+@@ -723,6 +707,7 @@
+
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+ {
++ struct dentry *res;
+ struct inode *inode = NULL;
+ int error;
+ struct nfs_fh fhandle;
+@@ -731,11 +716,11 @@
+ dfprintk(VFS, "NFS: lookup(%s/%s)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name);
+
+- error = -ENAMETOOLONG;
++ res = ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ goto out;
+
+- error = -ENOMEM;
++ res = ERR_PTR(-ENOMEM);
+ dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+
+ lock_kernel();
+@@ -746,29 +731,27 @@
+ if (nfs_is_exclusive_create(dir, nd))
+ goto no_entry;
+
+- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr);
+- if (error != 0) {
+- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name,
+- &fhandle, &fattr);
+- if (error == -ENOENT)
+- goto no_entry;
+- if (error != 0)
+- goto out_unlock;
++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
++ if (error == -ENOENT)
++ goto no_entry;
++ if (error < 0) {
++ res = ERR_PTR(error);
++ goto out_unlock;
+ }
+- error = -EACCES;
++ res = ERR_PTR(-EACCES);
+ inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
+ if (!inode)
+ goto out_unlock;
+ no_entry:
+- error = 0;
+- d_add(dentry, inode);
++ res = d_add_unique(dentry, inode);
++ if (res != NULL)
++ dentry = res;
+ nfs_renew_times(dentry);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ out_unlock:
+ unlock_kernel();
+ out:
+- BUG_ON(error > 0);
+- return ERR_PTR(error);
++ return res;
+ }
+
+ #ifdef CONFIG_NFS_V4
+@@ -798,15 +781,15 @@
+
+ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ {
++ struct dentry *res = NULL;
+ struct inode *inode = NULL;
+- int error = 0;
+
+ /* Check that we are indeed trying to open this file */
+ if (!is_atomic_open(dir, nd))
+ goto no_open;
+
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
+- error = -ENAMETOOLONG;
++ res = ERR_PTR(-ENAMETOOLONG);
+ goto out;
+ }
+ dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+@@ -828,7 +811,7 @@
+ inode = nfs4_atomic_open(dir, dentry, nd);
+ unlock_kernel();
+ if (IS_ERR(inode)) {
+- error = PTR_ERR(inode);
++ int error = PTR_ERR(inode);
+ switch (error) {
+ /* Make a negative dentry */
+ case -ENOENT:
+@@ -841,16 +824,18 @@
+ /* case -EISDIR: */
+ /* case -EINVAL: */
+ default:
++ res = ERR_PTR(error);
+ goto out;
+ }
+ }
+ no_entry:
+- d_add(dentry, inode);
++ res = d_add_unique(dentry, inode);
++ if (res != NULL)
++ dentry = res;
+ nfs_renew_times(dentry);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ out:
+- BUG_ON(error > 0);
+- return ERR_PTR(error);
++ return res;
+ no_open:
+ return nfs_lookup(dir, dentry, nd);
+ }
+@@ -906,83 +891,51 @@
+ }
+ #endif /* CONFIG_NFSV4 */
+
+-static inline
+-int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry)
++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+ {
++ struct dentry *parent = desc->file->f_dentry;
++ struct inode *dir = parent->d_inode;
+ struct nfs_entry *entry = desc->entry;
+- int status;
+-
+- while((status = dir_decode(desc)) == 0) {
+- if (entry->len != dentry->d_name.len)
+- continue;
+- if (memcmp(entry->name, dentry->d_name.name, entry->len))
+- continue;
+- if (!(entry->fattr->valid & NFS_ATTR_FATTR))
+- continue;
+- break;
+- }
+- return status;
+-}
+-
+-/*
+- * Use the cached Readdirplus results in order to avoid a LOOKUP call
+- * whenever we believe that the parent directory has not changed.
+- *
+- * We assume that any file creation/rename changes the directory mtime.
+- * As this results in a page cache invalidation whenever it occurs,
+- * we don't require any other tests for cache coherency.
+- */
+-static
+-int nfs_cached_lookup(struct inode *dir, struct dentry *dentry,
+- struct nfs_fh *fh, struct nfs_fattr *fattr)
+-{
+- nfs_readdir_descriptor_t desc;
+- struct nfs_server *server;
+- struct nfs_entry entry;
+- struct page *page;
+- unsigned long timestamp;
+- int res;
+-
+- if (!NFS_USE_READDIRPLUS(dir))
+- return -ENOENT;
+- server = NFS_SERVER(dir);
+- /* Don't use readdirplus unless the cache is stable */
+- if ((server->flags & NFS_MOUNT_NOAC) != 0
+- || nfs_caches_unstable(dir)
+- || nfs_attribute_timeout(dir))
+- return -ENOENT;
+- if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0)
+- return -ENOENT;
+- timestamp = NFS_I(dir)->readdir_timestamp;
+-
+- entry.fh = fh;
+- entry.fattr = fattr;
+-
+- desc.decode = NFS_PROTO(dir)->decode_dirent;
+- desc.entry = &entry;
+- desc.page_index = 0;
+- desc.plus = 1;
+-
+- for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) {
+-
+- res = -EIO;
+- if (PageUptodate(page)) {
+- void * kaddr = kmap_atomic(page, KM_USER0);
+- desc.ptr = kaddr;
+- res = find_dirent_name(&desc, page, dentry);
+- kunmap_atomic(kaddr, KM_USER0);
+- }
+- page_cache_release(page);
++ struct dentry *dentry, *alias;
++ struct qstr name = {
++ .name = entry->name,
++ .len = entry->len,
++ };
++ struct inode *inode;
+
+- if (res == 0)
+- goto out_found;
+- if (res != -EAGAIN)
++ switch (name.len) {
++ case 2:
++ if (name.name[0] == '.' && name.name[1] == '.')
++ return dget_parent(parent);
+ break;
++ case 1:
++ if (name.name[0] == '.')
++ return dget(parent);
++ }
++ name.hash = full_name_hash(name.name, name.len);
++ dentry = d_lookup(parent, &name);
++ if (dentry != NULL)
++ return dentry;
++ if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
++ return NULL;
++ /* Note: caller is already holding the dir->i_sem! */
++ dentry = d_alloc(parent, &name);
++ if (dentry == NULL)
++ return NULL;
++ dentry->d_op = NFS_PROTO(dir)->dentry_ops;
++ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
++ if (!inode) {
++ dput(dentry);
++ return NULL;
+ }
+- return -ENOENT;
+- out_found:
+- fattr->timestamp = timestamp;
+- return 0;
++ alias = d_add_unique(dentry, inode);
++ if (alias != NULL) {
++ dput(dentry);
++ dentry = alias;
++ }
++ nfs_renew_times(dentry);
++ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
++ return dentry;
+ }
+
+ /*
+@@ -1045,15 +998,9 @@
+ if (nd && (nd->flags & LOOKUP_CREATE))
+ open_flags = nd->intent.open.flags;
+
+- /*
+- * The 0 argument passed into the create function should one day
+- * contain the O_EXCL flag if requested. This allows NFSv3 to
+- * select the appropriate create strategy. Currently open_namei
+- * does not pass the create flags.
+- */
+ lock_kernel();
+ nfs_begin_data_update(dir);
+- inode = NFS_PROTO(dir)->create(dir, &dentry->d_name, &attr, open_flags);
++ inode = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+ nfs_end_data_update(dir);
+ if (!IS_ERR(inode)) {
+ d_instantiate(dentry, inode);
+@@ -1508,7 +1455,7 @@
+
+ if (cache->cred != cred
+ || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
+- || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR))
++ || (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS))
+ return -ENOENT;
+ memcpy(res, cache, sizeof(*res));
+ return 0;
+@@ -1522,6 +1469,7 @@
+ if (cache->cred)
+ put_rpccred(cache->cred);
+ cache->cred = get_rpccred(set->cred);
++ NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS;
+ }
+ cache->jiffies = set->jiffies;
+ cache->mask = set->mask;
+Index: linux-2.6.10/fs/nfs/unlink.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/unlink.c 2004-12-25 05:35:29.000000000 +0800
++++ linux-2.6.10/fs/nfs/unlink.c 2005-04-05 14:49:13.435686784 +0800
+@@ -215,7 +215,6 @@
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ spin_unlock(&dentry->d_lock);
+- if (data->task.tk_rpcwait == &nfs_delete_queue)
+- rpc_wake_up_task(&data->task);
++ rpc_wake_up_task(&data->task);
+ nfs_put_unlinkdata(data);
+ }
+Index: linux-2.6.10/fs/nfs/write.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/write.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/fs/nfs/write.c 2005-04-05 14:49:13.443685568 +0800
+@@ -61,7 +61,6 @@
+ #include <linux/nfs_page.h>
+ #include <asm/uaccess.h>
+ #include <linux/smp_lock.h>
+-#include <linux/mempool.h>
+
+ #include "delegation.h"
+
+@@ -83,49 +82,17 @@
+ static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
+
+ static kmem_cache_t *nfs_wdata_cachep;
+-static mempool_t *nfs_wdata_mempool;
+-static mempool_t *nfs_commit_mempool;
++mempool_t *nfs_wdata_mempool;
++mempool_t *nfs_commit_mempool;
+
+ static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
+
+-static __inline__ struct nfs_write_data *nfs_writedata_alloc(void)
+-{
+- struct nfs_write_data *p;
+- p = (struct nfs_write_data *)mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+- if (p) {
+- memset(p, 0, sizeof(*p));
+- INIT_LIST_HEAD(&p->pages);
+- }
+- return p;
+-}
+-
+-static __inline__ void nfs_writedata_free(struct nfs_write_data *p)
+-{
+- mempool_free(p, nfs_wdata_mempool);
+-}
+-
+-static void nfs_writedata_release(struct rpc_task *task)
++void nfs_writedata_release(struct rpc_task *task)
+ {
+ struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata;
+ nfs_writedata_free(wdata);
+ }
+
+-static __inline__ struct nfs_write_data *nfs_commit_alloc(void)
+-{
+- struct nfs_write_data *p;
+- p = (struct nfs_write_data *)mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
+- if (p) {
+- memset(p, 0, sizeof(*p));
+- INIT_LIST_HEAD(&p->pages);
+- }
+- return p;
+-}
+-
+-static __inline__ void nfs_commit_free(struct nfs_write_data *p)
+-{
+- mempool_free(p, nfs_commit_mempool);
+-}
+-
+ /* Adjust the file length if we're writing beyond the end */
+ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
+ {
+@@ -184,11 +151,10 @@
+ int result, written = 0;
+ struct nfs_write_data *wdata;
+
+- wdata = kmalloc(sizeof(*wdata), GFP_NOFS);
++ wdata = nfs_writedata_alloc();
+ if (!wdata)
+ return -ENOMEM;
+
+- memset(wdata, 0, sizeof(*wdata));
+ wdata->flags = how;
+ wdata->cred = ctx->cred;
+ wdata->inode = inode;
+@@ -238,8 +204,7 @@
+
+ io_error:
+ nfs_end_data_update_defer(inode);
+-
+- kfree(wdata);
++ nfs_writedata_free(wdata);
+ return written ? written : result;
+ }
+
+@@ -1199,7 +1164,8 @@
+ }
+ if (time_before(complain, jiffies)) {
+ printk(KERN_WARNING
+- "NFS: Server wrote less than requested.\n");
++ "NFS: Server wrote zero bytes, expected %u.\n",
++ argp->count);
+ complain = jiffies + 300 * HZ;
+ }
+ /* Can't do anything about it except throw an error. */
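The helpers this write.c hunk removes (and re-exports as shared mempools) follow the standard mempool recipe, which guarantees writeback can always obtain a descriptor even under memory pressure. A condensed sketch against the 2.6.10 API follows; the sketch_* names and the minimum of 16 preallocated elements are invented for illustration.

#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/nfs_fs.h>

static kmem_cache_t *wdata_cachep;
static mempool_t *wdata_mempool;

static int sketch_pool_init(void)
{
        wdata_cachep = kmem_cache_create("sketch_wdata",
                                         sizeof(struct nfs_write_data),
                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (wdata_cachep == NULL)
                return -ENOMEM;
        /* Keep a reserve of preallocated elements for writeback. */
        wdata_mempool = mempool_create(16, mempool_alloc_slab,
                                       mempool_free_slab, wdata_cachep);
        return wdata_mempool != NULL ? 0 : -ENOMEM;
}

static struct nfs_write_data *sketch_alloc(void)
{
        /* SLAB_NOFS: never recurse into the filesystem from writeback. */
        struct nfs_write_data *p = mempool_alloc(wdata_mempool, SLAB_NOFS);

        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
        }
        return p;
}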
+Index: linux-2.6.10/fs/nfs/proc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/proc.c 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/fs/nfs/proc.c 2005-04-05 14:49:13.440686024 +0800
+@@ -63,12 +63,12 @@
+ dprintk("%s: call getattr\n", __FUNCTION__);
+ fattr->valid = 0;
+ status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0);
+- dprintk("%s: reply getattr %d\n", __FUNCTION__, status);
++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
+ if (status)
+ return status;
+ dprintk("%s: call statfs\n", __FUNCTION__);
+ status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0);
+- dprintk("%s: reply statfs %d\n", __FUNCTION__, status);
++ dprintk("%s: reply statfs: %d\n", __FUNCTION__, status);
+ if (status)
+ return status;
+ info->rtmax = NFS_MAXDATA;
+@@ -96,7 +96,7 @@
+ fattr->valid = 0;
+ status = rpc_call(server->client, NFSPROC_GETATTR,
+ fhandle, fattr, 0);
+- dprintk("NFS reply getattr\n");
++ dprintk("NFS reply getattr: %d\n", status);
+ return status;
+ }
+
+@@ -114,7 +114,7 @@
+ dprintk("NFS call setattr\n");
+ fattr->valid = 0;
+ status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
+- dprintk("NFS reply setattr\n");
++ dprintk("NFS reply setattr: %d\n", status);
+ return status;
+ }
+
+@@ -213,15 +213,15 @@
+ }
+
+ static struct inode *
+-nfs_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr,
++nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+ {
+ struct nfs_fh fhandle;
+ struct nfs_fattr fattr;
+ struct nfs_createargs arg = {
+ .fh = NFS_FH(dir),
+- .name = name->name,
+- .len = name->len,
++ .name = dentry->d_name.name,
++ .len = dentry->d_name.len,
+ .sattr = sattr
+ };
+ struct nfs_diropok res = {
+@@ -231,7 +231,7 @@
+ int status;
+
+ fattr.valid = 0;
+- dprintk("NFS call create %s\n", name->name);
++ dprintk("NFS call create %s\n", dentry->d_name.name);
+ status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
+ dprintk("NFS reply create: %d\n", status);
+ if (status == 0) {
+@@ -620,6 +620,7 @@
+ .version = 2, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
++ .file_inode_ops = &nfs_file_inode_operations,
+ .getroot = nfs_proc_get_root,
+ .getattr = nfs_proc_getattr,
+ .setattr = nfs_proc_setattr,
+Index: linux-2.6.10/fs/nfs/callback.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/callback.c 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/fs/nfs/callback.c 2005-04-05 14:49:13.436686632 +0800
+@@ -139,133 +139,10 @@
+ return ret;
+ }
+
+-/*
+- * AUTH_NULL authentication
+- */
+-static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp)
+-{
+- struct kvec *argv = &rqstp->rq_arg.head[0];
+- struct kvec *resv = &rqstp->rq_res.head[0];
+-
+- if (argv->iov_len < 3*4)
+- return SVC_GARBAGE;
+-
+- if (svc_getu32(argv) != 0) {
+- dprintk("svc: bad null cred\n");
+- *authp = rpc_autherr_badcred;
+- return SVC_DENIED;
+- }
+- if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
+- dprintk("svc: bad null verf\n");
+- *authp = rpc_autherr_badverf;
+- return SVC_DENIED;
+- }
+-
+- /* Signal that mapping to nobody uid/gid is required */
+- rqstp->rq_cred.cr_uid = (uid_t) -1;
+- rqstp->rq_cred.cr_gid = (gid_t) -1;
+- rqstp->rq_cred.cr_group_info = groups_alloc(0);
+- if (rqstp->rq_cred.cr_group_info == NULL)
+- return SVC_DROP; /* kmalloc failure - client must retry */
+-
+- /* Put NULL verifier */
+- svc_putu32(resv, RPC_AUTH_NULL);
+- svc_putu32(resv, 0);
+- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
+- return SVC_OK;
+-}
+-
+-static int nfs_callback_null_release(struct svc_rqst *rqstp)
+-{
+- if (rqstp->rq_cred.cr_group_info)
+- put_group_info(rqstp->rq_cred.cr_group_info);
+- rqstp->rq_cred.cr_group_info = NULL;
+- return 0; /* don't drop */
+-}
+-
+-static struct auth_ops nfs_callback_auth_null = {
+- .name = "null",
+- .flavour = RPC_AUTH_NULL,
+- .accept = nfs_callback_null_accept,
+- .release = nfs_callback_null_release,
+-};
+-
+-/*
+- * AUTH_SYS authentication
+- */
+-static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp)
+-{
+- struct kvec *argv = &rqstp->rq_arg.head[0];
+- struct kvec *resv = &rqstp->rq_res.head[0];
+- struct svc_cred *cred = &rqstp->rq_cred;
+- u32 slen, i;
+- int len = argv->iov_len;
+-
+- dprintk("%s: start\n", __FUNCTION__);
+- cred->cr_group_info = NULL;
+- rqstp->rq_client = NULL;
+- if ((len -= 3*4) < 0)
+- return SVC_GARBAGE;
+-
+- /* Get length, time stamp and machine name */
+- svc_getu32(argv);
+- svc_getu32(argv);
+- slen = XDR_QUADLEN(ntohl(svc_getu32(argv)));
+- if (slen > 64 || (len -= (slen + 3)*4) < 0)
+- goto badcred;
+- argv->iov_base = (void*)((u32*)argv->iov_base + slen);
+- argv->iov_len -= slen*4;
+-
+- cred->cr_uid = ntohl(svc_getu32(argv));
+- cred->cr_gid = ntohl(svc_getu32(argv));
+- slen = ntohl(svc_getu32(argv));
+- if (slen > 16 || (len -= (slen + 2)*4) < 0)
+- goto badcred;
+- cred->cr_group_info = groups_alloc(slen);
+- if (cred->cr_group_info == NULL)
+- return SVC_DROP;
+- for (i = 0; i < slen; i++)
+- GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv));
+-
+- if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
+- *authp = rpc_autherr_badverf;
+- return SVC_DENIED;
+- }
+- /* Put NULL verifier */
+- svc_putu32(resv, RPC_AUTH_NULL);
+- svc_putu32(resv, 0);
+- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
+- return SVC_OK;
+-badcred:
+- *authp = rpc_autherr_badcred;
+- return SVC_DENIED;
+-}
+-
+-static int nfs_callback_unix_release(struct svc_rqst *rqstp)
+-{
+- if (rqstp->rq_cred.cr_group_info)
+- put_group_info(rqstp->rq_cred.cr_group_info);
+- rqstp->rq_cred.cr_group_info = NULL;
+- return 0;
+-}
+-
+-static struct auth_ops nfs_callback_auth_unix = {
+- .name = "unix",
+- .flavour = RPC_AUTH_UNIX,
+- .accept = nfs_callback_unix_accept,
+- .release = nfs_callback_unix_release,
+-};
+-
+-/*
+- * Hook the authentication protocol
+- */
+-static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp)
++static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+ {
+ struct in_addr *addr = &rqstp->rq_addr.sin_addr;
+ struct nfs4_client *clp;
+- struct kvec *argv = &rqstp->rq_arg.head[0];
+- int flavour;
+- int retval;
+
+ /* Don't talk to strangers */
+ clp = nfs4_find_client(addr);
+@@ -273,34 +150,19 @@
+ return SVC_DROP;
+ dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr));
+ nfs4_put_client(clp);
+- flavour = ntohl(svc_getu32(argv));
+- switch(flavour) {
++ switch (rqstp->rq_authop->flavour) {
+ case RPC_AUTH_NULL:
+- if (rqstp->rq_proc != CB_NULL) {
+- *authp = rpc_autherr_tooweak;
+- retval = SVC_DENIED;
+- break;
+- }
+- rqstp->rq_authop = &nfs_callback_auth_null;
+- retval = nfs_callback_null_accept(rqstp, authp);
++ if (rqstp->rq_proc != CB_NULL)
++ return SVC_DENIED;
+ break;
+ case RPC_AUTH_UNIX:
+- /* Eat the authentication flavour */
+- rqstp->rq_authop = &nfs_callback_auth_unix;
+- retval = nfs_callback_unix_accept(rqstp, authp);
+ break;
++ case RPC_AUTH_GSS:
++ /* FIXME: RPCSEC_GSS handling? */
+ default:
+- /* FIXME: need to add RPCSEC_GSS upcalls */
+-#if 0
+- svc_ungetu32(argv);
+- retval = svc_authenticate(rqstp, authp);
+-#else
+- *authp = rpc_autherr_rejectedcred;
+- retval = SVC_DENIED;
+-#endif
++ return SVC_DENIED;
+ }
+- dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval);
+- return retval;
++ return SVC_OK;
+ }
+
+ /*
+@@ -321,5 +183,5 @@
+ .pg_name = "NFSv4 callback", /* service name */
+ .pg_class = "nfs", /* authentication class */
+ .pg_stats = &nfs4_callback_stats,
+- .pg_authenticate = nfs_callback_auth,
++ .pg_authenticate = nfs_callback_authenticate,
+ };
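With the hand-rolled AUTH_NULL/AUTH_UNIX parsers deleted above, the generic sunrpc layer verifies the credential first and pg_authenticate becomes a pure policy hook that answers with an SVC_* verdict. A sketch of that contract (sketch_authenticate is an invented name; the client-address check done by the real hook is elided):

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcauth.h>

/* Runs after generic credential parsing; only picks accept/reject/drop. */
static int sketch_authenticate(struct svc_rqst *rqstp)
{
        switch (rqstp->rq_authop->flavour) {
        case RPC_AUTH_NULL:             /* weak, but acceptable for CB_NULL */
        case RPC_AUTH_UNIX:
                return SVC_OK;
        default:                        /* e.g. RPCSEC_GSS, not wired up yet */
                return SVC_DENIED;
        }
}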
+Index: linux-2.6.10/fs/nfs/file.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/file.c 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/fs/nfs/file.c 2005-04-05 14:49:13.453684048 +0800
+@@ -67,6 +67,19 @@
+ .setattr = nfs_setattr,
+ };
+
++#ifdef CONFIG_NFS_V4
++
++struct inode_operations nfs4_file_inode_operations = {
++ .permission = nfs_permission,
++ .getattr = nfs_getattr,
++ .setattr = nfs_setattr,
++ .getxattr = nfs4_getxattr,
++ .setxattr = nfs4_setxattr,
++ .listxattr = nfs4_listxattr,
++};
++
++#endif /* CONFIG_NFS_V4 */
++
+ /* Hack for future NFS swap support */
+ #ifndef IS_SWAPFILE
+ # define IS_SWAPFILE(inode) (0)
+@@ -295,10 +308,19 @@
+ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
+ {
+ struct inode *inode = filp->f_mapping->host;
+- int status;
++ int status = 0;
+
+ lock_kernel();
+- status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ /* Use local locking if mounted with "-onolock" */
++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ else {
++ struct file_lock *cfl = posix_test_lock(filp, fl);
++ if (cfl != NULL) {
++ memcpy(fl, cfl, sizeof(*fl));
++ fl->fl_type = F_UNLCK;
++ }
++ }
+ unlock_kernel();
+ return status;
+ }
+@@ -325,7 +347,11 @@
+ * still need to complete the unlock.
+ */
+ lock_kernel();
+- status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ /* Use local locking if mounted with "-onolock" */
++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ else
++ status = posix_lock_file_wait(filp, fl);
+ rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset);
+ return status;
+ }
+@@ -351,15 +377,19 @@
+ return status;
+
+ lock_kernel();
+- status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+- /* If we were signalled we still need to ensure that
+- * we clean up any state on the server. We therefore
+- * record the lock call as having succeeded in order to
+- * ensure that locks_remove_posix() cleans it out when
+- * the process exits.
+- */
+- if (status == -EINTR || status == -ERESTARTSYS)
+- posix_lock_file(filp, fl);
++ /* Use local locking if mounted with "-onolock" */
++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) {
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ /* If we were signalled we still need to ensure that
++ * we clean up any state on the server. We therefore
++ * record the lock call as having succeeded in order to
++ * ensure that locks_remove_posix() cleans it out when
++ * the process exits.
++ */
++ if (status == -EINTR || status == -ERESTARTSYS)
++ posix_lock_file(filp, fl);
++ } else
++ status = posix_lock_file_wait(filp, fl);
+ unlock_kernel();
+ if (status < 0)
+ return status;
+@@ -396,15 +426,6 @@
+ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ return -ENOLCK;
+
+- if (NFS_PROTO(inode)->version != 4) {
+- /* Fake OK code if mounted without NLM support */
+- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) {
+- if (IS_GETLK(cmd))
+- return LOCK_USE_CLNT;
+- return 0;
+- }
+- }
+-
+ /*
+ * No BSD flocks over NFS allowed.
+ * Note: we could try to fake a POSIX lock request here by
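Taken together, the file.c hunks above make "-o nolock" (NFS_MOUNT_NONLM) mean real client-local POSIX locking rather than unconditionally faked success, so applications on the same client still exclude each other. The dispatch open-coded in do_getlk/do_setlk/do_unlk boils down to the following sketch; sketch_setlk is an invented name, and the BKL and signal-mask handling shown above are elided.

#include <linux/fs.h>
#include <linux/nfs_fs.h>

static int sketch_setlk(struct file *filp, int cmd, struct file_lock *fl)
{
        struct inode *inode = filp->f_mapping->host;

        /* "-o nolock": arbitrate only among processes on this client. */
        if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
                return posix_lock_file_wait(filp, fl);

        /* Otherwise go over the wire (NLM for v2/v3, native v4 locking). */
        return NFS_PROTO(inode)->lock(filp, cmd, fl);
}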
+Index: linux-2.6.10/fs/nfs/nfs3proc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/nfs3proc.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/fs/nfs/nfs3proc.c 2005-04-05 14:49:13.441685872 +0800
+@@ -80,10 +80,10 @@
+ dprintk("%s: call fsinfo\n", __FUNCTION__);
+ info->fattr->valid = 0;
+ status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
+- dprintk("%s: reply fsinfo %d\n", __FUNCTION__, status);
++ dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
+ if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
+ status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0);
+- dprintk("%s: reply getattr %d\n", __FUNCTION__, status);
++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
+ }
+ return status;
+ }
+@@ -101,7 +101,7 @@
+ fattr->valid = 0;
+ status = rpc_call(server->client, NFS3PROC_GETATTR,
+ fhandle, fattr, 0);
+- dprintk("NFS reply getattr\n");
++ dprintk("NFS reply getattr: %d\n", status);
+ return status;
+ }
+
+@@ -119,7 +119,7 @@
+ dprintk("NFS call setattr\n");
+ fattr->valid = 0;
+ status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0);
+- dprintk("NFS reply setattr\n");
++ dprintk("NFS reply setattr: %d\n", status);
+ return status;
+ }
+
+@@ -198,7 +198,7 @@
+ if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
+ entry->mask |= MAY_EXEC;
+ }
+- dprintk("NFS reply access, status = %d\n", status);
++ dprintk("NFS reply access: %d\n", status);
+ return status;
+ }
+
+@@ -296,7 +296,7 @@
+ * For now, we don't implement O_EXCL.
+ */
+ static struct inode *
+-nfs3_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr,
++nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+ {
+ struct nfs_fh fhandle;
+@@ -304,8 +304,8 @@
+ struct nfs_fattr dir_attr;
+ struct nfs3_createargs arg = {
+ .fh = NFS_FH(dir),
+- .name = name->name,
+- .len = name->len,
++ .name = dentry->d_name.name,
++ .len = dentry->d_name.len,
+ .sattr = sattr,
+ };
+ struct nfs3_diropres res = {
+@@ -315,7 +315,7 @@
+ };
+ int status;
+
+- dprintk("NFS call create %s\n", name->name);
++ dprintk("NFS call create %s\n", dentry->d_name.name);
+ arg.createmode = NFS3_CREATE_UNCHECKED;
+ if (flags & O_EXCL) {
+ arg.createmode = NFS3_CREATE_EXCLUSIVE;
+@@ -353,7 +353,7 @@
+ if (status != 0)
+ goto out;
+ if (fhandle.size == 0 || !(fattr.valid & NFS_ATTR_FATTR)) {
+- status = nfs3_proc_lookup(dir, name, &fhandle, &fattr);
++ status = nfs3_proc_lookup(dir, &dentry->d_name, &fhandle, &fattr);
+ if (status != 0)
+ goto out;
+ }
+@@ -838,6 +838,7 @@
+ .version = 3, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
++ .file_inode_ops = &nfs_file_inode_operations,
+ .getroot = nfs3_proc_get_root,
+ .getattr = nfs3_proc_getattr,
+ .setattr = nfs3_proc_setattr,
+Index: linux-2.6.10/fs/nfs/nfs4proc.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/nfs4proc.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/fs/nfs/nfs4proc.c 2005-04-05 14:49:13.456683592 +0800
+@@ -477,7 +477,7 @@
+ /*
+ * Returns an nfs4_state + a referenced inode
+ */
+-static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
++static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+ {
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state = NULL;
+@@ -491,7 +491,7 @@
+ struct nfs_openargs o_arg = {
+ .fh = NFS_FH(dir),
+ .open_flags = flags,
+- .name = name,
++ .name = &dentry->d_name,
+ .server = server,
+ .bitmask = server->attr_bitmask,
+ .claim = NFS4_OPEN_CLAIM_NULL,
+@@ -581,14 +581,14 @@
+ }
+
+
+-struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred)
++struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred)
+ {
+ struct nfs4_exception exception = { };
+ struct nfs4_state *res;
+ int status;
+
+ do {
+- status = _nfs4_do_open(dir, name, flags, sattr, cred, &res);
++ status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res);
+ if (status == 0)
+ break;
+ /* NOTE: BAD_SEQID means the server and client disagree about the
+@@ -635,6 +635,8 @@
+
+ fattr->valid = 0;
+
++ if (state != NULL)
++ msg.rpc_cred = state->owner->so_cred;
+ if (sattr->ia_valid & ATTR_SIZE)
+ nfs4_copy_stateid(&arg.stateid, state, NULL);
+ else
+@@ -658,6 +660,61 @@
+ return err;
+ }
+
++struct nfs4_closedata {
++ struct inode *inode;
++ struct nfs4_state *state;
++ struct nfs_closeargs arg;
++ struct nfs_closeres res;
++};
++
++static void nfs4_close_done(struct rpc_task *task)
++{
++ struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
++ struct nfs4_state *state = calldata->state;
++ struct nfs4_state_owner *sp = state->owner;
++ struct nfs_server *server = NFS_SERVER(calldata->inode);
++
++ /* hmm. we are done with the inode, and in the process of freeing
++ * the state_owner. we keep this around to process errors
++ */
++ nfs4_increment_seqid(task->tk_status, sp);
++ switch (task->tk_status) {
++ case 0:
++ state->state = calldata->arg.open_flags;
++ memcpy(&state->stateid, &calldata->res.stateid,
++ sizeof(state->stateid));
++ break;
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ state->state = calldata->arg.open_flags;
++ nfs4_schedule_state_recovery(server->nfs4_state);
++ break;
++ default:
++ if (nfs4_async_handle_error(task, server) == -EAGAIN) {
++ rpc_restart_call(task);
++ return;
++ }
++ }
++ nfs4_put_open_state(state);
++ up(&sp->so_sema);
++ nfs4_put_state_owner(sp);
++ up_read(&server->nfs4_state->cl_sem);
++ kfree(calldata);
++}
++
++static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata)
++{
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
++ .rpc_argp = &calldata->arg,
++ .rpc_resp = &calldata->res,
++ .rpc_cred = calldata->state->owner->so_cred,
++ };
++ if (calldata->arg.open_flags != 0)
++ msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
++ return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata);
++}
++
+ /*
+ * It is possible for data to be read/written from a mem-mapped file
+ * after the sys_close call (which hits the vfs layer as a flush).
+@@ -669,102 +726,34 @@
+ *
+ * NOTE: Caller must be holding the sp->so_owner semaphore!
+ */
+-static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state)
++int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode)
+ {
+- struct nfs4_state_owner *sp = state->owner;
+- int status = 0;
+- struct nfs_closeargs arg = {
+- .fh = NFS_FH(inode),
+- };
+- struct nfs_closeres res;
+- struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+- .rpc_argp = &arg,
+- .rpc_resp = &res,
+- };
++ struct nfs4_closedata *calldata;
++ int status;
+
+- if (test_bit(NFS_DELEGATED_STATE, &state->flags))
++ /* Tell caller we're done */
++ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
++ state->state = mode;
+ return 0;
+- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
++ }
++ calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL);
++ if (calldata == NULL)
++ return -ENOMEM;
++ calldata->inode = inode;
++ calldata->state = state;
++ calldata->arg.fh = NFS_FH(inode);
+ /* Serialization for the sequence id */
+- arg.seqid = sp->so_seqid,
+- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR);
+-
+- /* hmm. we are done with the inode, and in the process of freeing
+- * the state_owner. we keep this around to process errors
++ calldata->arg.seqid = state->owner->so_seqid;
++ calldata->arg.open_flags = mode;
++ memcpy(&calldata->arg.stateid, &state->stateid,
++ sizeof(calldata->arg.stateid));
++ status = nfs4_close_call(NFS_SERVER(inode)->client, calldata);
++ /*
++ * Return -EINPROGRESS on success in order to indicate to the
++ * caller that an asynchronous RPC call has been launched, and
++ * that it will release the semaphores on completion.
+ */
+- nfs4_increment_seqid(status, sp);
+- if (!status)
+- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid));
+-
+- return status;
+-}
+-
+-int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
+-{
+- struct nfs_server *server = NFS_SERVER(state->inode);
+- struct nfs4_exception exception = { };
+- int err;
+- do {
+- err = _nfs4_do_close(inode, state);
+- switch (err) {
+- case -NFS4ERR_STALE_STATEID:
+- case -NFS4ERR_EXPIRED:
+- nfs4_schedule_state_recovery(server->nfs4_state);
+- err = 0;
+- default:
+- state->state = 0;
+- }
+- err = nfs4_handle_exception(server, err, &exception);
+- } while (exception.retry);
+- return err;
+-}
+-
+-static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode)
+-{
+- struct nfs4_state_owner *sp = state->owner;
+- int status = 0;
+- struct nfs_closeargs arg = {
+- .fh = NFS_FH(inode),
+- .seqid = sp->so_seqid,
+- .open_flags = mode,
+- };
+- struct nfs_closeres res;
+- struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE],
+- .rpc_argp = &arg,
+- .rpc_resp = &res,
+- };
+-
+- if (test_bit(NFS_DELEGATED_STATE, &state->flags))
+- return 0;
+- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
+- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR);
+- nfs4_increment_seqid(status, sp);
+- if (!status)
+- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid));
+-
+- return status;
+-}
+-
+-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode)
+-{
+- struct nfs_server *server = NFS_SERVER(state->inode);
+- struct nfs4_exception exception = { };
+- int err;
+- do {
+- err = _nfs4_do_downgrade(inode, state, mode);
+- switch (err) {
+- case -NFS4ERR_STALE_STATEID:
+- case -NFS4ERR_EXPIRED:
+- nfs4_schedule_state_recovery(server->nfs4_state);
+- err = 0;
+- default:
+- state->state = mode;
+- }
+- err = nfs4_handle_exception(server, err, &exception);
+- } while (exception.retry);
+- return err;
++ return (status == 0) ? -EINPROGRESS : status;
+ }
+
+ struct inode *
+@@ -785,7 +774,7 @@
+ }
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, &dentry->d_name, nd->intent.open.flags, &attr, cred);
++ state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
+ put_rpccred(cred);
+ if (IS_ERR(state))
+ return (struct inode *)state;
+@@ -802,7 +791,7 @@
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+ state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
+ if (IS_ERR(state))
+- state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
++ state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
+ put_rpccred(cred);
+ if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
+ return 1;
+@@ -1026,7 +1015,7 @@
+ FMODE_WRITE, cred);
+ if (IS_ERR(state))
+ state = nfs4_do_open(dentry->d_parent->d_inode,
+- &dentry->d_name, FMODE_WRITE,
++ dentry, FMODE_WRITE,
+ NULL, cred);
+ need_iput = 1;
+ }
+@@ -1327,7 +1316,7 @@
+ */
+
+ static struct inode *
+-nfs4_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr,
++nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+ {
+ struct inode *inode;
+@@ -1335,7 +1324,7 @@
+ struct rpc_cred *cred;
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, name, flags, sattr, cred);
++ state = nfs4_do_open(dir, dentry, flags, sattr, cred);
+ put_rpccred(cred);
+ if (!IS_ERR(state)) {
+ inode = state->inode;
+@@ -2049,6 +2038,86 @@
+ }
+
+ static int
++nfs4_server_supports_acls(struct nfs_server *server)
++{
++ return (server->caps & NFS_CAP_ACLS)
++ && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
++ && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
++}
++
++/* XXX: assuming XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE,
++ * and that it's OK to put sizeof(void *) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE)
++ * bytes on the stack. (Currently probably both true.)
++ */
++#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
++
++static void buf_to_pages(const void *buf, ssize_t buflen,
++ struct page **pages, unsigned int *pgbase)
++{
++ const void *p = buf;
++
++ *pgbase = offset_in_page(buf);
++ p -= *pgbase;
++ while (p < buf + buflen) {
++ *(pages++) = virt_to_page(p);
++ p += PAGE_CACHE_SIZE;
++ }
++}
++
++ssize_t
++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct page *pages[NFS4ACL_MAXPAGES];
++ struct nfs_getaclargs args = {
++ .fh = NFS_FH(inode),
++ .acl_pages = pages,
++ .acl_len = buflen,
++ };
++ ssize_t acl_len = buflen;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
++ .rpc_argp = &args,
++ .rpc_resp = &acl_len,
++ };
++ int ret;
++
++ if (!nfs4_server_supports_acls(server))
++ return -EOPNOTSUPP;
++ buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
++ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++ if (buflen && acl_len > buflen)
++ return -ERANGE;
++ if (ret == 0)
++ ret = acl_len;
++ return ret;
++}
++
++int
++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct page *pages[NFS4ACL_MAXPAGES];
++ struct nfs_setaclargs arg = {
++ .fh = NFS_FH(inode),
++ .acl_pages = pages,
++ .acl_len = buflen,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
++ .rpc_argp = &arg,
++ .rpc_resp = NULL,
++ };
++ int ret;
++
++ if (!nfs4_server_supports_acls(server))
++ return -EOPNOTSUPP;
++ buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
++ ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
++ return ret;
++}
++
++static int
+ nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server)
+ {
+ struct nfs4_client *clp = server->nfs4_state;
+@@ -2589,6 +2658,7 @@
+ .version = 4, /* protocol version */
+ .dentry_ops = &nfs4_dentry_operations,
+ .dir_inode_ops = &nfs4_dir_inode_operations,
++ .file_inode_ops = &nfs4_file_inode_operations,
+ .getroot = nfs4_proc_get_root,
+ .getattr = nfs4_proc_getattr,
+ .setattr = nfs4_proc_setattr,
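
The buf_to_pages() helper above is pure address arithmetic: it records the
buffer's offset within its first page, rounds the pointer down to a page
boundary, and collects one page pointer per page the buffer touches. A
minimal userspace sketch of the same arithmetic (4 KiB pages assumed;
printing page addresses stands in for the kernel's virt_to_page()):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	char buf[10000];
	unsigned long p = (unsigned long)buf;
	unsigned long pgbase = p & ~PAGE_MASK;	/* offset_in_page(buf) */
	unsigned long start = p - pgbase;	/* first backing page */
	unsigned long end = p + sizeof(buf);	/* one past the buffer */
	int npages = 0;

	/* same loop shape as buf_to_pages(): one step per page touched */
	for (unsigned long q = start; q < end; q += PAGE_SIZE) {
		printf("page %d backs %#lx\n", npages, q);
		npages++;
	}
	printf("pgbase=%lu, %d pages for %zu bytes\n",
	       pgbase, npages, sizeof(buf));
	return 0;
}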
+Index: linux-2.6.10/fs/nfs/direct.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/direct.c 2005-03-31 15:35:23.000000000 +0800
++++ linux-2.6.10/fs/nfs/direct.c 2005-04-05 14:49:13.448684808 +0800
+@@ -33,6 +33,7 @@
+ * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
+ * 08 Jun 2003 Port to 2.5 APIs --cel
+ * 31 Mar 2004 Handle direct I/O without VFS support --cel
++ * 15 Sep 2004 Parallel async reads --cel
+ *
+ */
+
+@@ -43,6 +44,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/file.h>
+ #include <linux/pagemap.h>
++#include <linux/kref.h>
+
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_page.h>
+@@ -50,11 +52,27 @@
+
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include <asm/atomic.h>
+
+ #define NFSDBG_FACILITY NFSDBG_VFS
+-#define VERF_SIZE (2 * sizeof(__u32))
+ #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT)
+
++static kmem_cache_t *nfs_direct_cachep;
++
++/*
++ * This represents a set of asynchronous requests that we're waiting on
++ */
++struct nfs_direct_req {
++ struct kref kref; /* release manager */
++ struct list_head list; /* nfs_read_data structs */
++ wait_queue_head_t wait; /* wait for i/o completion */
++ struct page ** pages; /* pages in our buffer */
++ unsigned int npages; /* count of pages */
++ atomic_t complete, /* i/os we're waiting for */
++ count, /* bytes actually processed */
++ error; /* any reported error */
++};
++
+
+ /**
+ * nfs_get_user_pages - find and set up pages underlying user's buffer
+@@ -71,7 +89,8 @@
+ unsigned long page_count;
+ size_t array_size;
+
+- /* set an arbitrary limit to prevent arithmetic overflow */
++ /* set an arbitrary limit to prevent type overflow */
++ /* XXX: this can probably be as large as INT_MAX */
+ if (size > MAX_DIRECTIO_SIZE) {
+ *pages = NULL;
+ return -EFBIG;
+@@ -95,6 +114,8 @@
+ /**
+ * nfs_free_user_pages - tear down page struct array
+ * @pages: array of page struct pointers underlying target buffer
++ * @npages: number of pages in the array
++ * @do_dirty: dirty the pages as we release them
+ */
+ static void
+ nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
+@@ -109,77 +130,231 @@
+ }
+
+ /**
+- * nfs_direct_read_seg - Read in one iov segment. Generate separate
+- * read RPCs for each "rsize" bytes.
++ * nfs_direct_req_release - release nfs_direct_req structure for direct read
++ * @kref: kref object embedded in an nfs_direct_req structure
++ *
++ */
++static void nfs_direct_req_release(struct kref *kref)
++{
++ struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
++ kmem_cache_free(nfs_direct_cachep, dreq);
++}
++
++/**
++ * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read
++ * @count: count of bytes for the read request
++ * @rsize: local rsize setting
++ *
++ * Note we also set the number of requests we have in the dreq when we are
++ * done. This prevents races with I/O completion so we will always wait
++ * until all requests have been dispatched and completed.
++ */
++static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize)
++{
++ struct list_head *list;
++ struct nfs_direct_req *dreq;
++ unsigned int reads = 0;
++
++ dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
++ if (!dreq)
++ return NULL;
++
++ kref_init(&dreq->kref);
++ init_waitqueue_head(&dreq->wait);
++ INIT_LIST_HEAD(&dreq->list);
++ atomic_set(&dreq->count, 0);
++ atomic_set(&dreq->error, 0);
++
++ list = &dreq->list;
++ for(;;) {
++ struct nfs_read_data *data = nfs_readdata_alloc();
++
++ if (unlikely(!data)) {
++ while (!list_empty(list)) {
++ data = list_entry(list->next,
++ struct nfs_read_data, pages);
++ list_del(&data->pages);
++ nfs_readdata_free(data);
++ }
++ kref_put(&dreq->kref, nfs_direct_req_release);
++ return NULL;
++ }
++
++ INIT_LIST_HEAD(&data->pages);
++ list_add(&data->pages, list);
++
++ data->req = (struct nfs_page *) dreq;
++ reads++;
++ if (nbytes <= rsize)
++ break;
++ nbytes -= rsize;
++ }
++ kref_get(&dreq->kref);
++ atomic_set(&dreq->complete, reads);
++ return dreq;
++}
++
++/**
++ * nfs_direct_read_result - handle a read reply for a direct read request
++ * @data: address of NFS READ operation control block
++ * @status: status of this NFS READ operation
++ *
++ * We must hold a reference to all the pages in this direct read request
++ * until the RPCs complete. This could be long *after* we are woken up in
++ * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server).
++ */
++static void nfs_direct_read_result(struct nfs_read_data *data, int status)
++{
++ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
++
++ if (likely(status >= 0))
++ atomic_add(data->res.count, &dreq->count);
++ else
++ atomic_set(&dreq->error, status);
++
++ if (unlikely(atomic_dec_and_test(&dreq->complete))) {
++ nfs_free_user_pages(dreq->pages, dreq->npages, 1);
++ wake_up(&dreq->wait);
++ kref_put(&dreq->kref, nfs_direct_req_release);
++ }
++}
++
++/**
++ * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read
++ * @dreq: address of nfs_direct_req struct for this request
+ * @inode: target inode
+ * @ctx: target file open context
+- * user_addr: starting address of this segment of user's buffer
+- * count: size of this segment
+- * file_offset: offset in file to begin the operation
+- * @pages: array of addresses of page structs defining user's buffer
+- * nr_pages: size of pages array
++ * @user_addr: starting address of this segment of user's buffer
++ * @count: size of this segment
++ * @file_offset: offset in file to begin the operation
++ *
++ * For each nfs_read_data struct that was allocated on the list, dispatch
++ * an NFS READ operation
+ */
+-static int
+-nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx,
+- unsigned long user_addr, size_t count, loff_t file_offset,
+- struct page **pages, int nr_pages)
+-{
+- const unsigned int rsize = NFS_SERVER(inode)->rsize;
+- int tot_bytes = 0;
+- int curpage = 0;
+- struct nfs_read_data rdata = {
+- .inode = inode,
+- .cred = ctx->cred,
+- .args = {
+- .fh = NFS_FH(inode),
+- .context = ctx,
+- },
+- .res = {
+- .fattr = &rdata.fattr,
+- },
+- };
++static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
++ struct inode *inode, struct nfs_open_context *ctx,
++ unsigned long user_addr, size_t count, loff_t file_offset)
++{
++ struct list_head *list = &dreq->list;
++ struct page **pages = dreq->pages;
++ unsigned int curpage, pgbase;
++ unsigned int rsize = NFS_SERVER(inode)->rsize;
+
+- rdata.args.pgbase = user_addr & ~PAGE_MASK;
+- rdata.args.offset = file_offset;
+- do {
+- int result;
+-
+- rdata.args.count = count;
+- if (rdata.args.count > rsize)
+- rdata.args.count = rsize;
+- rdata.args.pages = &pages[curpage];
+-
+- dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+- rdata.args.count, (long long) rdata.args.offset,
+- user_addr + tot_bytes, rdata.args.pgbase, curpage);
++ curpage = 0;
++ pgbase = user_addr & ~PAGE_MASK;
++ do {
++ struct nfs_read_data *data;
++ unsigned int bytes;
++
++ bytes = rsize;
++ if (count < rsize)
++ bytes = count;
++
++ data = list_entry(list->next, struct nfs_read_data, pages);
++ list_del_init(&data->pages);
++
++ data->inode = inode;
++ data->cred = ctx->cred;
++ data->args.fh = NFS_FH(inode);
++ data->args.context = ctx;
++ data->args.offset = file_offset;
++ data->args.pgbase = pgbase;
++ data->args.pages = &pages[curpage];
++ data->args.count = bytes;
++ data->res.fattr = &data->fattr;
++ data->res.eof = 0;
++ data->res.count = bytes;
++
++ NFS_PROTO(inode)->read_setup(data);
++
++ data->task.tk_cookie = (unsigned long) inode;
++ data->task.tk_calldata = data;
++ data->task.tk_release = nfs_readdata_release;
++ data->complete = nfs_direct_read_result;
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->read(&rdata);
++ rpc_execute(&data->task);
+ unlock_kernel();
+
+- if (result <= 0) {
+- if (tot_bytes > 0)
+- break;
+- if (result == -EISDIR)
+- result = -EINVAL;
+- return result;
+- }
++ dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
++ data->task.tk_pid,
++ inode->i_sb->s_id,
++ (long long)NFS_FILEID(inode),
++ bytes,
++ (unsigned long long)data->args.offset);
++
++ file_offset += bytes;
++ pgbase += bytes;
++ curpage += pgbase >> PAGE_SHIFT;
++ pgbase &= ~PAGE_MASK;
+
+- tot_bytes += result;
+- if (rdata.res.eof)
+- break;
+-
+- rdata.args.offset += result;
+- rdata.args.pgbase += result;
+- curpage += rdata.args.pgbase >> PAGE_SHIFT;
+- rdata.args.pgbase &= ~PAGE_MASK;
+- count -= result;
++ count -= bytes;
+ } while (count != 0);
++}
+
+- /* XXX: should we zero the rest of the user's buffer if we
+- * hit eof? */
++/**
++ * nfs_direct_read_wait - wait for I/O completion for direct reads
++ * @dreq: request on which we are to wait
++ * @intr: whether or not this wait can be interrupted
++ *
++ * Collects and returns the final error value/byte-count.
++ */
++static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr)
++{
++ int result = 0;
+
+- return tot_bytes;
++ if (intr) {
++ result = wait_event_interruptible(dreq->wait,
++ (atomic_read(&dreq->complete) == 0));
++ } else {
++ wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0));
++ }
++
++ if (!result)
++ result = atomic_read(&dreq->error);
++ if (!result)
++ result = atomic_read(&dreq->count);
++
++ kref_put(&dreq->kref, nfs_direct_req_release);
++ return (ssize_t) result;
++}
++
++/**
++ * nfs_direct_read_seg - Read in one iov segment. Generate separate
++ * read RPCs for each "rsize" bytes.
++ * @inode: target inode
++ * @ctx: target file open context
++ * @user_addr: starting address of this segment of user's buffer
++ * @count: size of this segment
++ * @file_offset: offset in file to begin the operation
++ * @pages: array of addresses of page structs defining user's buffer
++ * @nr_pages: number of pages in the array
++ *
++ */
++static ssize_t nfs_direct_read_seg(struct inode *inode,
++ struct nfs_open_context *ctx, unsigned long user_addr,
++ size_t count, loff_t file_offset, struct page **pages,
++ unsigned int nr_pages)
++{
++ ssize_t result;
++ sigset_t oldset;
++ struct rpc_clnt *clnt = NFS_CLIENT(inode);
++ struct nfs_direct_req *dreq;
++
++ dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
++ if (!dreq)
++ return -ENOMEM;
++
++ dreq->pages = pages;
++ dreq->npages = nr_pages;
++
++ rpc_clnt_sigmask(clnt, &oldset);
++ nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count,
++ file_offset);
++ result = nfs_direct_read_wait(dreq, clnt->cl_intr);
++ rpc_clnt_sigunmask(clnt, &oldset);
++
++ return result;
+ }
+
+ /**
+@@ -191,9 +366,8 @@
+ * file_offset: offset in file to begin the operation
+ * nr_segs: size of iovec array
+ *
+- * generic_file_direct_IO has already pushed out any non-direct
+- * writes so that this read will see them when we read from the
+- * server.
++ * We've already pushed out any non-direct writes so that this read
++ * will see them when we read from the server.
+ */
+ static ssize_t
+ nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
+@@ -222,8 +396,6 @@
+ result = nfs_direct_read_seg(inode, ctx, user_addr, size,
+ file_offset, pages, page_count);
+
+- nfs_free_user_pages(pages, page_count, 1);
+-
+ if (result <= 0) {
+ if (tot_bytes > 0)
+ break;
+@@ -249,31 +421,31 @@
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+-static int
+-nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
+- unsigned long user_addr, size_t count, loff_t file_offset,
+- struct page **pages, int nr_pages)
++static ssize_t nfs_direct_write_seg(struct inode *inode,
++ struct nfs_open_context *ctx, unsigned long user_addr,
++ size_t count, loff_t file_offset, struct page **pages,
++ int nr_pages)
+ {
+ const unsigned int wsize = NFS_SERVER(inode)->wsize;
+ size_t request;
+- int curpage, need_commit, result, tot_bytes;
++ int curpage, need_commit;
++ ssize_t result, tot_bytes;
+ struct nfs_writeverf first_verf;
+- struct nfs_write_data wdata = {
+- .inode = inode,
+- .cred = ctx->cred,
+- .args = {
+- .fh = NFS_FH(inode),
+- .context = ctx,
+- },
+- .res = {
+- .fattr = &wdata.fattr,
+- .verf = &wdata.verf,
+- },
+- };
++ struct nfs_write_data *wdata;
+
+- wdata.args.stable = NFS_UNSTABLE;
++ wdata = nfs_writedata_alloc();
++ if (!wdata)
++ return -ENOMEM;
++
++ wdata->inode = inode;
++ wdata->cred = ctx->cred;
++ wdata->args.fh = NFS_FH(inode);
++ wdata->args.context = ctx;
++ wdata->args.stable = NFS_UNSTABLE;
+ if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+- wdata.args.stable = NFS_FILE_SYNC;
++ wdata->args.stable = NFS_FILE_SYNC;
++ wdata->res.fattr = &wdata->fattr;
++ wdata->res.verf = &wdata->verf;
+
+ nfs_begin_data_update(inode);
+ retry:
+@@ -281,20 +453,20 @@
+ tot_bytes = 0;
+ curpage = 0;
+ request = count;
+- wdata.args.pgbase = user_addr & ~PAGE_MASK;
+- wdata.args.offset = file_offset;
+- do {
+- wdata.args.count = request;
+- if (wdata.args.count > wsize)
+- wdata.args.count = wsize;
+- wdata.args.pages = &pages[curpage];
++ wdata->args.pgbase = user_addr & ~PAGE_MASK;
++ wdata->args.offset = file_offset;
++ do {
++ wdata->args.count = request;
++ if (wdata->args.count > wsize)
++ wdata->args.count = wsize;
++ wdata->args.pages = &pages[curpage];
+
+ dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+- wdata.args.count, (long long) wdata.args.offset,
+- user_addr + tot_bytes, wdata.args.pgbase, curpage);
++ wdata->args.count, (long long) wdata->args.offset,
++ user_addr + tot_bytes, wdata->args.pgbase, curpage);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->write(&wdata);
++ result = NFS_PROTO(inode)->write(wdata);
+ unlock_kernel();
+
+ if (result <= 0) {
+@@ -304,20 +476,25 @@
+ }
+
+ if (tot_bytes == 0)
+- memcpy(&first_verf.verifier, &wdata.verf.verifier,
+- VERF_SIZE);
+- if (wdata.verf.committed != NFS_FILE_SYNC) {
++ memcpy(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier));
++ if (wdata->verf.committed != NFS_FILE_SYNC) {
+ need_commit = 1;
+- if (memcmp(&first_verf.verifier,
+- &wdata.verf.verifier, VERF_SIZE))
++ if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier)))
+ goto sync_retry;
+ }
+
+- tot_bytes += result;
+- wdata.args.offset += result;
+- wdata.args.pgbase += result;
+- curpage += wdata.args.pgbase >> PAGE_SHIFT;
+- wdata.args.pgbase &= ~PAGE_MASK;
++ tot_bytes += result;
++
++ /* in case of a short write: stop now, let the app recover */
++ if (result < wdata->args.count)
++ break;
++
++ wdata->args.offset += result;
++ wdata->args.pgbase += result;
++ curpage += wdata->args.pgbase >> PAGE_SHIFT;
++ wdata->args.pgbase &= ~PAGE_MASK;
+ request -= result;
+ } while (request != 0);
+
+@@ -325,27 +502,27 @@
+ * Commit data written so far, even in the event of an error
+ */
+ if (need_commit) {
+- wdata.args.count = tot_bytes;
+- wdata.args.offset = file_offset;
++ wdata->args.count = tot_bytes;
++ wdata->args.offset = file_offset;
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->commit(&wdata);
++ result = NFS_PROTO(inode)->commit(wdata);
+ unlock_kernel();
+
+ if (result < 0 || memcmp(&first_verf.verifier,
+- &wdata.verf.verifier,
+- VERF_SIZE) != 0)
++ &wdata->verf.verifier,
++ sizeof(first_verf.verifier)) != 0)
+ goto sync_retry;
+ }
+ result = tot_bytes;
+
+ out:
+ nfs_end_data_update_defer(inode);
+-
++ nfs_writedata_free(wdata);
+ return result;
+
+ sync_retry:
+- wdata.args.stable = NFS_FILE_SYNC;
++ wdata->args.stable = NFS_FILE_SYNC;
+ goto retry;
+ }
+
+@@ -362,9 +539,9 @@
+ * that non-direct readers might access, so they will pick up these
+ * writes immediately.
+ */
+-static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx,
+- const struct iovec *iov, loff_t file_offset,
+- unsigned long nr_segs)
++static ssize_t nfs_direct_write(struct inode *inode,
++ struct nfs_open_context *ctx, const struct iovec *iov,
++ loff_t file_offset, unsigned long nr_segs)
+ {
+ ssize_t tot_bytes = 0;
+ unsigned long seg = 0;
+@@ -504,6 +681,8 @@
+ if (mapping->nrpages) {
+ retval = filemap_fdatawrite(mapping);
+ if (retval == 0)
++ retval = nfs_wb_all(inode);
++ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
+ if (retval)
+ goto out;
+@@ -593,6 +772,8 @@
+ if (mapping->nrpages) {
+ retval = filemap_fdatawrite(mapping);
+ if (retval == 0)
++ retval = nfs_wb_all(inode);
++ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
+ if (retval)
+ goto out;
+@@ -607,3 +788,21 @@
+ out:
+ return retval;
+ }
++
++int nfs_init_directcache(void)
++{
++ nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
++ sizeof(struct nfs_direct_req),
++ 0, SLAB_RECLAIM_ACCOUNT,
++ NULL, NULL);
++ if (nfs_direct_cachep == NULL)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void nfs_destroy_directcache(void)
++{
++ if (kmem_cache_destroy(nfs_direct_cachep))
++ printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
++}
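
The dispatch loop in nfs_direct_read_schedule() above tracks its position
with two variables: curpage indexes the page array and pgbase is the byte
offset within pages[curpage]. After each slice of at most rsize bytes, the
offset is advanced, whole pages are folded into curpage, and the remainder
becomes the new in-page offset. The same arithmetic as a runnable
userspace sketch (user_addr and rsize are made-up values):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long user_addr = 0x1000f00;	/* not page aligned */
	unsigned long count = 70000, rsize = 32768;
	unsigned int curpage = 0, pgbase = user_addr & ~PAGE_MASK;

	while (count) {
		unsigned long bytes = count < rsize ? count : rsize;

		printf("READ %5lu bytes: pages[%u..], pgbase %u\n",
		       bytes, curpage, pgbase);
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;	/* whole pages consumed */
		pgbase &= ~PAGE_MASK;			/* offset into new page */
		count -= bytes;
	}
	return 0;
}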
+Index: linux-2.6.10/fs/nfs/read.c
+===================================================================
+--- linux-2.6.10.orig/fs/nfs/read.c 2004-12-25 05:33:47.000000000 +0800
++++ linux-2.6.10/fs/nfs/read.c 2005-04-05 14:49:13.437686480 +0800
+@@ -24,7 +24,6 @@
+ #include <linux/mm.h>
+ #include <linux/slab.h>
+ #include <linux/pagemap.h>
+-#include <linux/mempool.h>
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_page.h>
+@@ -39,25 +38,11 @@
+ static void nfs_readpage_result_full(struct nfs_read_data *, int);
+
+ static kmem_cache_t *nfs_rdata_cachep;
+-static mempool_t *nfs_rdata_mempool;
++mempool_t *nfs_rdata_mempool;
+
+ #define MIN_POOL_READ (32)
+
+-static struct nfs_read_data *nfs_readdata_alloc(void)
+-{
+- struct nfs_read_data *p;
+- p = (struct nfs_read_data *)mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
+- if (p)
+- memset(p, 0, sizeof(*p));
+- return p;
+-}
+-
+-static __inline__ void nfs_readdata_free(struct nfs_read_data *p)
+-{
+- mempool_free(p, nfs_rdata_mempool);
+-}
+-
+-static void nfs_readdata_release(struct rpc_task *task)
++void nfs_readdata_release(struct rpc_task *task)
+ {
+ struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata;
+ nfs_readdata_free(data);
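
The read.c change above only exports the nfs_read_data helpers so that
direct.c can drive its own RPCs; what makes that safe is the counting
scheme in direct.c: dreq->complete is preloaded with the number of
requests before any of them is dispatched, every completion decrements
it, and the reply that brings it to zero releases the pages and does the
wakeup. A C11 sketch of that pattern (threads stand in for RPC
completions, printf for wake_up(); build with -pthread):

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_int complete;	/* like dreq->complete */
static atomic_long count;	/* like dreq->count (bytes done) */

static void *rpc_done(void *arg)
{
	atomic_fetch_add(&count, (long)(uintptr_t)arg);
	/* fetch_sub returns the old value: 1 means we were last */
	if (atomic_fetch_sub(&complete, 1) == 1)
		printf("last reply: wake waiter, %ld bytes total\n",
		       (long)atomic_load(&count));
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	uintptr_t bytes[3] = { 32768, 32768, 4464 };

	/* set before dispatch, as nfs_direct_read_alloc() does */
	atomic_store(&complete, 3);
	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, rpc_done, (void *)bytes[i]);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}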
--- /dev/null
+Index: linux-2.6.10/arch/i386/kernel/asm-offsets.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/asm-offsets.c 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/asm-offsets.c 2005-04-05 16:34:18.173220992 +0800
+@@ -52,6 +52,7 @@
+ OFFSET(TI_preempt_count, thread_info, preempt_count);
+ OFFSET(TI_addr_limit, thread_info, addr_limit);
+ OFFSET(TI_restart_block, thread_info, restart_block);
++ OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+ BLANK();
+
+ OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
+Index: linux-2.6.10/arch/i386/kernel/cpu/common.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/cpu/common.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/cpu/common.c 2005-04-05 16:34:18.174220840 +0800
+@@ -384,6 +384,12 @@
+ if (disable_pse)
+ clear_bit(X86_FEATURE_PSE, c->x86_capability);
+
++ /* hack: disable SEP for non-NX cpus; SEP breaks Execshield. */
++ #ifdef CONFIG_HIGHMEM64G
++ if (!test_bit(X86_FEATURE_NX, c->x86_capability))
++ #endif
++ clear_bit(X86_FEATURE_SEP, c->x86_capability);
++
+ /* If the model name is still unset, do table lookup. */
+ if ( !c->x86_model_id[0] ) {
+ char *p;
+Index: linux-2.6.10/arch/i386/kernel/entry.S
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 16:29:30.192000792 +0800
++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 16:34:18.167221904 +0800
+@@ -218,8 +218,12 @@
+ pushl %ebp
+ pushfl
+ pushl $(__USER_CS)
+- pushl $SYSENTER_RETURN
+-
++ /*
++ * Push current_thread_info()->sysenter_return to the stack.
++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
++ * pushed above, and the word being pushed now:
++ */
++ pushl (TI_sysenter_return-THREAD_SIZE+4*4)(%esp)
+ /*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
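
That pushl operand rewards a worked example. On 2.6.10 four words (the
user %ss selector, %ebp, %eflags, and the user %cs selector) have been
pushed by this point, so %esp sits 4*4 bytes below the top of the kernel
stack, while thread_info lives THREAD_SIZE below the top. A sketch of the
address calculation (the THREAD_SIZE and asm-offsets values here are
assumptions):

#include <stdio.h>

#define THREAD_SIZE (8 * 1024UL)	/* assumed: 8 KiB kernel stacks */
#define TI_SYSENTER_RETURN 20UL		/* assumed asm-offsets value */

int main(void)
{
	unsigned long stack_top = 0xc0400000UL + THREAD_SIZE; /* made up */
	unsigned long esp = stack_top - 4 * 4;	/* after the four pushes */
	unsigned long ti = stack_top - THREAD_SIZE; /* thread_info base */

	/* the operand (TI_sysenter_return-THREAD_SIZE+4*4)(%esp) */
	unsigned long addr = esp + TI_SYSENTER_RETURN - THREAD_SIZE + 4 * 4;

	printf("computed %#lx, &ti->sysenter_return is %#lx\n",
	       addr, ti + TI_SYSENTER_RETURN);
	return 0;
}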
+Index: linux-2.6.10/arch/i386/kernel/process.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/process.c 2004-12-25 05:33:47.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/process.c 2005-04-05 16:34:18.173220992 +0800
+@@ -36,6 +36,8 @@
+ #include <linux/module.h>
+ #include <linux/kallsyms.h>
+ #include <linux/ptrace.h>
++#include <linux/mman.h>
++#include <linux/random.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -565,6 +567,8 @@
+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+ __unlazy_fpu(prev_p);
++ if (next_p->mm)
++ load_user_cs_desc(cpu, next_p->mm);
+
+ /*
+ * Reload esp0, LDT and the page table pointer:
+@@ -812,3 +816,62 @@
+ return 0;
+ }
+
++
++unsigned long arch_align_stack(unsigned long sp)
++{
++ if (current->flags & PF_RELOCEXEC)
++ sp -= ((get_random_int() % 65536) << 4);
++ return sp & ~0xf;
++}
++
++
++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
++{
++ if (limit > mm->context.exec_limit) {
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
++{
++ struct vm_area_struct *vma;
++ unsigned long limit = 0;
++
++ if (old_end == mm->context.exec_limit) {
++ for (vma = mm->mmap; vma; vma = vma->vm_next)
++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
++ limit = vma->vm_end;
++
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_flush_exec_range(struct mm_struct *mm)
++{
++ mm->context.exec_limit = 0;
++ set_user_cs(&mm->context.user_cs, 0);
++}
++
++/*
++ * Generate a random brk address between 128MB and 160MB (if the
++ * layout allows it).
++ */
++void randomize_brk(unsigned long old_brk)
++{
++ unsigned long new_brk, range_start, range_end;
++
++ range_start = 0x08000000;
++ if (current->mm->brk >= range_start)
++ range_start = current->mm->brk;
++ range_end = range_start + 0x02000000;
++ new_brk = randomize_range(range_start, range_end, 0);
++ if (new_brk)
++ current->mm->brk = new_brk;
++}
++
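
arch_align_stack() above shifts the pre-argument stack top downwards by a
random multiple of 16 bytes, up to just under 1 MiB, and keeps 16-byte
alignment either way. A userspace sketch (rand() stands in for
get_random_int(), and the PF_RELOCEXEC gate is dropped):

#include <stdio.h>
#include <stdlib.h>

static unsigned long align_stack(unsigned long sp, unsigned int rnd)
{
	sp -= (unsigned long)(rnd % 65536) << 4; /* 0..0xffff0 of jitter */
	return sp & ~0xfUL;			 /* 16-byte aligned */
}

int main(void)
{
	unsigned long top = 0xc0000000UL;	/* hypothetical STACK_TOP */

	for (int i = 0; i < 4; i++)
		printf("sp = %#lx\n", align_stack(top, rand()));
	printf("max jitter = %#lx bytes\n", 65535UL << 4);
	return 0;
}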
+Index: linux-2.6.10/arch/i386/kernel/signal.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/signal.c 2005-04-05 16:29:23.290050048 +0800
++++ linux-2.6.10/arch/i386/kernel/signal.c 2005-04-05 16:34:18.170221448 +0800
+@@ -390,7 +390,7 @@
+ if (err)
+ goto give_sigsegv;
+
+- restorer = &__kernel_sigreturn;
++ restorer = current->mm->context.vdso + (long)&__kernel_sigreturn;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+
+@@ -487,9 +487,10 @@
+ goto give_sigsegv;
+
+ /* Set up to return from userspace. */
+- restorer = &__kernel_rt_sigreturn;
++ restorer = current->mm->context.vdso + (long)&__kernel_rt_sigreturn;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
++
+ err |= __put_user(restorer, &frame->pretcode);
+
+ /*
+Index: linux-2.6.10/arch/i386/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:29:30.198999728 +0800
++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:34:18.172221144 +0800
+@@ -22,6 +22,7 @@
+
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
++#include <asm/desc.h>
+ #include <mach_apic.h>
+
+ /*
+@@ -313,6 +314,8 @@
+ unsigned long cpu;
+
+ cpu = get_cpu();
++ if (current->active_mm)
++ load_user_cs_desc(cpu, current->active_mm);
+
+ if (!cpu_isset(cpu, flush_cpumask))
+ goto out;
+Index: linux-2.6.10/arch/i386/kernel/sysenter.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/sysenter.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/sysenter.c 2005-04-05 16:34:18.171221296 +0800
+@@ -13,6 +13,7 @@
+ #include <linux/gfp.h>
+ #include <linux/string.h>
+ #include <linux/elf.h>
++#include <linux/mman.h>
+
+ #include <asm/cpufeature.h>
+ #include <asm/msr.h>
+@@ -41,11 +42,14 @@
+ extern const char vsyscall_int80_start, vsyscall_int80_end;
+ extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+
++struct page *sysenter_page;
++
+ static int __init sysenter_setup(void)
+ {
+ void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+
+- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
++ __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_KERNEL_RO);
++ sysenter_page = virt_to_page(page);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ memcpy(page,
+@@ -59,7 +63,51 @@
+ &vsyscall_sysenter_end - &vsyscall_sysenter_start);
+
+ on_each_cpu(enable_sep_cpu, NULL, 1, 1);
++
+ return 0;
+ }
+
+ __initcall(sysenter_setup);
++
++extern void SYSENTER_RETURN_OFFSET;
++
++unsigned int vdso_enabled = 0;
++
++void map_vsyscall(void)
++{
++ struct thread_info *ti = current_thread_info();
++ struct vm_area_struct *vma;
++ unsigned long addr;
++
++ if (unlikely(!vdso_enabled)) {
++ current->mm->context.vdso = NULL;
++ return;
++ }
++
++ /*
++ * Map the vDSO (it will be randomized):
++ */
++ down_write(&current->mm->mmap_sem);
++ addr = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, 0);
++ current->mm->context.vdso = (void *)addr;
++ ti->sysenter_return = (void *)addr + (long)&SYSENTER_RETURN_OFFSET;
++ if (addr != -1) {
++ vma = find_vma(current->mm, addr);
++ if (vma) {
++ pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW;
++ get_page(sysenter_page);
++ install_page(current->mm, vma, addr,
++ sysenter_page, vma->vm_page_prot);
++
++ }
++ }
++ up_write(&current->mm->mmap_sem);
++}
++
++static int __init vdso_setup(char *str)
++{
++ vdso_enabled = simple_strtoul(str, NULL, 0);
++ return 1;
++}
++__setup("vdso=", vdso_setup);
++
+Index: linux-2.6.10/arch/i386/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:29:30.193000640 +0800
++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:43:17.073295728 +0800
+@@ -497,6 +497,10 @@
+ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+ DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+
++/*
++ * the original non-exec stack patch was written by
++ * Solar Designer <solar at openwall.com>. Thanks!
++ */
+ fastcall void do_general_protection(struct pt_regs * regs, long error_code)
+ {
+ int cpu = get_cpu();
+@@ -535,6 +539,46 @@
+ if (!(regs->xcs & 3))
+ goto gp_in_kernel;
+
++ /*
++ * lazy-check for CS validity on exec-shield binaries:
++ */
++ if (current->mm) {
++ int cpu = smp_processor_id();
++ struct desc_struct *desc1, *desc2;
++ struct vm_area_struct *vma;
++ unsigned long limit = 0;
++
++ spin_lock(&current->mm->page_table_lock);
++ for (vma = current->mm->mmap; vma; vma = vma->vm_next)
++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
++ limit = vma->vm_end;
++ spin_unlock(&current->mm->page_table_lock);
++
++ current->mm->context.exec_limit = limit;
++ set_user_cs(&current->mm->context.user_cs, limit);
++
++ desc1 = &current->mm->context.user_cs;
++ desc2 = per_cpu(cpu_gdt_table, cpu) + GDT_ENTRY_DEFAULT_USER_CS;
++
++ /*
++ * The CS was not in sync - reload it and retry the
++ * instruction. If the instruction still faults then
++ * we won't hit this branch next time around.
++ */
++ if (desc1->a != desc2->a || desc1->b != desc2->b) {
++ if (print_fatal_signals >= 2) {
++ printk("#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id());
++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx, CPU_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b);
++ }
++ load_user_cs_desc(cpu, current->mm);
++ return;
++ }
++ }
++ if (print_fatal_signals) {
++ printk("#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id());
++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, current->mm->context.user_cs.a, current->mm->context.user_cs.b);
++ }
++
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
+ force_sig(SIGSEGV, current);
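
The lazy fixup above recomputes the exec limit exactly the way
arch_remove_exec_range() in process.c does: walk every VMA and keep the
highest end address of an executable mapping. The scan in isolation, with
made-up address ranges:

#include <stdio.h>

struct vma { unsigned long start, end; int exec; };

static unsigned long exec_limit(const struct vma *v, int n)
{
	unsigned long limit = 0;

	for (int i = 0; i < n; i++)
		if (v[i].exec && v[i].end > limit)
			limit = v[i].end;
	return limit;
}

int main(void)
{
	struct vma map[] = {
		{ 0x08048000, 0x080a0000, 1 },	/* program text */
		{ 0x080a0000, 0x08100000, 0 },	/* heap */
		{ 0xb7e00000, 0xb7f00000, 1 },	/* shared library text */
	};

	printf("exec_limit = %#lx\n", exec_limit(map, 3));
	return 0;
}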
+Index: linux-2.6.10/arch/i386/kernel/vsyscall.lds.S
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/vsyscall.lds.S 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/vsyscall.lds.S 2005-04-05 16:34:18.169221600 +0800
+@@ -7,7 +7,7 @@
+
+ SECTIONS
+ {
+- . = VSYSCALL_BASE + SIZEOF_HEADERS;
++ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .dynsym : { *(.dynsym) }
+@@ -20,7 +20,7 @@
+ For the layouts to match, we need to skip more than enough
+ space for the dynamic symbol table et al. If this amount
+ is insufficient, ld -shared will barf. Just increase it here. */
+- . = VSYSCALL_BASE + 0x400;
++ . = 0x400;
+
+ .text : { *(.text) } :text =0x90909090
+
+Index: linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/vsyscall-sysenter.S 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S 2005-04-05 16:34:18.170221448 +0800
+@@ -24,11 +24,11 @@
+ /* 7: align return point with nop's to make disassembly easier */
+ .space 7,0x90
+
+- /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
++ /* 14: System call restart point is here! (SYSENTER_RETURN_OFFSET-2) */
+ jmp .Lenter_kernel
+ /* 16: System call normal return point is here! */
+- .globl SYSENTER_RETURN /* Symbol used by entry.S. */
+-SYSENTER_RETURN:
++ .globl SYSENTER_RETURN_OFFSET /* Symbol used by sysenter.c */
++SYSENTER_RETURN_OFFSET:
+ pop %ebp
+ .Lpop_ebp:
+ pop %edx
+Index: linux-2.6.10/arch/i386/mm/init.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:29:28.016331544 +0800
++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:34:18.167221904 +0800
+@@ -518,7 +518,10 @@
+ set_nx();
+ if (nx_enabled)
+ printk("NX (Execute Disable) protection: active\n");
++ else
+ #endif
++ if (exec_shield)
++ printk("Using x86 segment limits to approximate NX protection\n");
+
+ pagetable_init();
+
+Index: linux-2.6.10/arch/i386/mm/mmap.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/mm/mmap.c 2004-12-25 05:34:33.000000000 +0800
++++ linux-2.6.10/arch/i386/mm/mmap.c 2005-04-05 16:43:44.365146736 +0800
+@@ -26,6 +26,7 @@
+
+ #include <linux/personality.h>
+ #include <linux/mm.h>
++#include <linux/random.h>
+
+ /*
+ * Top of mmap area (just below the process stack).
+@@ -38,13 +39,17 @@
+ static inline unsigned long mmap_base(struct mm_struct *mm)
+ {
+ unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
++ unsigned long random_factor = 0;
++
++ if (current->flags & PF_RELOCEXEC)
++ random_factor = get_random_int() % (1024*1024);
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+- return TASK_SIZE - (gap & PAGE_MASK);
++ return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
+ }
+
+ /*
+@@ -57,15 +62,17 @@
+ * Fall back to the standard layout if the personality
+ * bit is set, or if the expected stack growth is unlimited:
+ */
+- if (sysctl_legacy_va_layout ||
++ if ((exec_shield != 2) && (sysctl_legacy_va_layout ||
+ (current->personality & ADDR_COMPAT_LAYOUT) ||
+- current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
++ current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)){
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
+ } else {
+ mm->mmap_base = mmap_base(mm);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
++ if (current->flags & PF_RELOCEXEC)
++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area;
+ mm->unmap_area = arch_unmap_area_topdown;
+ }
+ }
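
With this change mmap_base() subtracts both the stack gap and up to
~1 MiB of randomness before page-aligning the result. A sketch of the
arithmetic; MIN_GAP and MAX_GAP are outside the hunk, so the values below
are assumptions matching the usual i386 defaults:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE 0xc0000000UL
#define MIN_GAP (128UL * 1024 * 1024)		/* assumed */
#define MAX_GAP (TASK_SIZE / 6 * 5)		/* assumed */

static unsigned long mmap_base(unsigned long gap, unsigned long rnd)
{
	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;
	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

int main(void)
{
	unsigned long stack_rlim = 8UL << 20;	/* 8 MiB RLIMIT_STACK */

	printf("fixed base:      %#lx\n", mmap_base(stack_rlim, 0));
	printf("randomized base: %#lx\n",
	       mmap_base(stack_rlim, 987654UL % (1024 * 1024)));
	return 0;
}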
+Index: linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/ia32/binfmt_elf32.c 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c 2005-04-05 16:34:18.174220840 +0800
+@@ -272,7 +272,7 @@
+ }
+
+ static unsigned long
+-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused)
+ {
+ unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK;
+
+Index: linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/ia32/ia32_binfmt.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c 2005-04-05 16:34:18.175220688 +0800
+@@ -390,7 +390,7 @@
+ }
+
+ static unsigned long
+-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused)
+ {
+ unsigned long map_addr;
+ struct task_struct *me = current;
+Index: linux-2.6.10/drivers/char/random.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/random.c 2005-04-05 16:29:24.214909448 +0800
++++ linux-2.6.10/drivers/char/random.c 2005-04-05 16:34:18.197217344 +0800
+@@ -2469,3 +2469,37 @@
+ }
+ #endif
+ #endif /* CONFIG_INET */
++
++/*
++ * Get a random word:
++ */
++unsigned int get_random_int(void)
++{
++ unsigned int val = 0;
++
++ if (!exec_shield_randomize)
++ return 0;
++
++#ifdef CONFIG_X86_HAS_TSC
++ rdtscl(val);
++#endif
++ val += current->pid + jiffies + (int)val;
++
++ /*
++ * Use IP's RNG. It suits our purpose perfectly: it re-keys itself
++ * every second, from the entropy pool (and thus creates a limited
++ * drain on it), and uses halfMD4Transform within the second. We
++ * also spice it with the TSC (if available), jiffies, PID and the
++ * stack address:
++ */
++ return secure_ip_id(val);
++}
++
++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len)
++{
++ unsigned long range = end - len - start;
++ if (end <= start + len)
++ return 0;
++ return PAGE_ALIGN(get_random_int() % range + start);
++}
++
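
randomize_range() picks a page-aligned address in [start, end - len);
since PAGE_ALIGN rounds up, the raw pick lands on the next page boundary.
A sketch using the randomize_brk() window from process.c, with a fixed
value standing in for get_random_int():

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long randomize_range(unsigned long start, unsigned long end,
				     unsigned long len, unsigned long rnd)
{
	unsigned long range = end - len - start;

	if (end <= start + len)
		return 0;
	return PAGE_ALIGN(rnd % range + start);
}

int main(void)
{
	/* the brk window: 128 MiB to 160 MiB, len 0 */
	unsigned long brk = randomize_range(0x08000000UL, 0x0a000000UL,
					    0, 123456789UL);

	printf("new brk candidate: %#lx\n", brk);
	return 0;
}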
+Index: linux-2.6.10/fs/binfmt_elf.c
+===================================================================
+--- linux-2.6.10.orig/fs/binfmt_elf.c 2005-04-05 16:29:24.353888320 +0800
++++ linux-2.6.10/fs/binfmt_elf.c 2005-04-05 16:39:25.042569760 +0800
+@@ -494,7 +494,7 @@
+ unsigned long reloc_func_desc = 0;
+ char passed_fileno[6];
+ struct files_struct *files;
+- int have_pt_gnu_stack, executable_stack = EXSTACK_DEFAULT;
++ int have_pt_gnu_stack, relocexec, executable_stack = EXSTACK_DEFAULT;
+ unsigned long def_flags = 0;
+ struct {
+ struct elfhdr elf_ex;
+@@ -660,6 +660,24 @@
+ }
+ have_pt_gnu_stack = (i < loc->elf_ex.e_phnum);
+
++ relocexec = 0;
++
++ if (current->personality == PER_LINUX)
++ switch (exec_shield) {
++ case 1:
++ if (executable_stack == EXSTACK_DISABLE_X) {
++ current->flags |= PF_RELOCEXEC;
++ relocexec = PF_RELOCEXEC;
++ }
++ break;
++
++ case 2:
++ executable_stack = EXSTACK_DISABLE_X;
++ current->flags |= PF_RELOCEXEC;
++ relocexec = PF_RELOCEXEC;
++ break;
++ }
++
+ /* Some simple consistency checks for the interpreter */
+ if (elf_interpreter) {
+ interpreter_type = INTERPRETER_ELF | INTERPRETER_AOUT;
+@@ -713,6 +731,15 @@
+ if (retval)
+ goto out_free_dentry;
+
++ current->flags |= relocexec;
++#ifdef __i386__
++ /*
++ * Turn off the CS limit completely if exec-shield disabled or
++ * NX active:
++ */
++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled)
++ arch_add_exec_range(current->mm, -1);
++#endif
+ /* Discard our unneeded old files struct */
+ if (files) {
+ steal_locks(files);
+@@ -731,7 +758,8 @@
+ /* Do this immediately, since STACK_TOP as used in setup_arg_pages
+ may depend on the personality. */
+ SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+- if (elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack))
++ if (exec_shield != 2 &&
++ elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack))
+ current->personality |= READ_IMPLIES_EXEC;
+
+ arch_pick_mmap_layout(current->mm);
+@@ -894,6 +922,14 @@
+
+ set_binfmt(&elf_format);
+
++ /*
++ * Map the vsyscall trampoline. This address is then passed via
++ * AT_SYSINFO.
++ */
++#ifdef __HAVE_ARCH_VSYSCALL
++ map_vsyscall();
++#endif
++
+ compute_creds(bprm);
+ current->flags &= ~PF_FORKNOEXEC;
+ create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT),
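
For PER_LINUX binaries the exec_shield sysctl thus has two active modes:
1 relocates only binaries whose PT_GNU_STACK marking requests a
non-executable stack, while 2 forces the non-executable stack and the
relocated layout on everything. The decision restated in isolation (the
EXSTACK_* values here are illustrative, not the kernel's):

#include <stdio.h>

enum { EXSTACK_DEFAULT, EXSTACK_DISABLE_X, EXSTACK_ENABLE_X };

/* returns nonzero when the process should get PF_RELOCEXEC */
static int wants_relocexec(int exec_shield, int *executable_stack)
{
	switch (exec_shield) {
	case 1:	/* honour the binary's marking */
		return *executable_stack == EXSTACK_DISABLE_X;
	case 2:	/* forced mode: override the marking */
		*executable_stack = EXSTACK_DISABLE_X;
		return 1;
	}
	return 0;
}

int main(void)
{
	int stack = EXSTACK_ENABLE_X;

	printf("shield=1, exec stack:   %d\n", wants_relocexec(1, &stack));
	stack = EXSTACK_DISABLE_X;
	printf("shield=1, noexec stack: %d\n", wants_relocexec(1, &stack));
	stack = EXSTACK_ENABLE_X;
	int forced = wants_relocexec(2, &stack);
	printf("shield=2:               %d (stack forced to %d)\n",
	       forced, stack);
	return 0;
}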
+Index: linux-2.6.10/fs/exec.c
+===================================================================
+--- linux-2.6.10.orig/fs/exec.c 2005-04-05 16:29:30.270988784 +0800
++++ linux-2.6.10/fs/exec.c 2005-04-05 16:34:18.177220384 +0800
+@@ -396,7 +396,12 @@
+ while (i < MAX_ARG_PAGES)
+ bprm->page[i++] = NULL;
+ #else
++#ifdef __HAVE_ARCH_ALIGN_STACK
++ stack_base = arch_align_stack(STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE);
++ stack_base = PAGE_ALIGN(stack_base);
++#else
+ stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
++#endif
+ bprm->p += stack_base;
+ mm->arg_start = bprm->p;
+ arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start);
+@@ -854,6 +859,7 @@
+ tcomm[i] = '\0';
+ set_task_comm(current, tcomm);
+
++ current->flags &= ~PF_RELOCEXEC;
+ flush_thread();
+
+ if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
+Index: linux-2.6.10/fs/proc/array.c
+===================================================================
+--- linux-2.6.10.orig/fs/proc/array.c 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/fs/proc/array.c 2005-04-05 16:34:18.180219928 +0800
+@@ -373,8 +373,12 @@
+ ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
+ read_unlock(&tasklist_lock);
+
+- if (!whole || num_threads<2)
+- wchan = get_wchan(task);
++ if (!whole || num_threads<2) {
++ wchan = 0;
++ if (current->uid == task->uid || current->euid == task->uid ||
++ capable(CAP_SYS_NICE))
++ wchan = get_wchan(task);
++ }
+ if (!whole) {
+ min_flt = task->min_flt;
+ maj_flt = task->maj_flt;
+Index: linux-2.6.10/fs/proc/base.c
+===================================================================
+--- linux-2.6.10.orig/fs/proc/base.c 2005-04-05 16:29:24.361887104 +0800
++++ linux-2.6.10/fs/proc/base.c 2005-04-05 16:34:18.179220080 +0800
+@@ -117,7 +117,7 @@
+ E(PROC_TGID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
+ E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO),
+ E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO),
+- E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO),
++ E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUSR),
+ E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+ E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
+ E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
+@@ -142,7 +142,7 @@
+ E(PROC_TID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
+ E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO),
+ E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO),
+- E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO),
++ E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUSR),
+ E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+ E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
+ E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
+Index: linux-2.6.10/fs/proc/task_mmu.c
+===================================================================
+--- linux-2.6.10.orig/fs/proc/task_mmu.c 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/fs/proc/task_mmu.c 2005-04-05 16:41:11.796340720 +0800
+@@ -14,19 +14,27 @@
+ buffer += sprintf(buffer,
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+- "VmRSS:\t%8lu kB\n"
+- "VmData:\t%8lu kB\n"
+- "VmStk:\t%8lu kB\n"
+- "VmExe:\t%8lu kB\n"
+- "VmLib:\t%8lu kB\n"
+- "VmPTE:\t%8lu kB\n",
+- (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+- mm->locked_vm << (PAGE_SHIFT-10),
+- mm->rss << (PAGE_SHIFT-10),
+- data << (PAGE_SHIFT-10),
+- mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+- return buffer;
++ "VmData:\t%8lu kB\n"
++ "VmStk:\t%8lu kB\n"
++ "VmExe:\t%8lu kB\n"
++ "VmLib:\t%8lu kB\n"
++ "VmPTE:\t%8lu kB\n"
++ "StaBrk:\t%08lx kB\n"
++ "Brk:\t%08lx kB\n"
++ "StaStk:\t%08lx kB\n" ,
++ (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
++ mm->locked_vm << (PAGE_SHIFT-10),
++ mm->rss << (PAGE_SHIFT-10),
++ data << (PAGE_SHIFT-10),
++ mm->stack_vm << (PAGE_SHIFT-10), text, lib,
++ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
++ mm->start_brk, mm->brk, mm->start_stack);
++#ifdef __i386__
++ if (!nx_enabled)
++ buffer += sprintf(buffer,
++ "ExecLim:\t%08lx\n", mm->context.exec_limit);
++#endif
++ return buffer;
+ }
+
+ unsigned long task_vsize(struct mm_struct *mm)
+@@ -47,6 +55,9 @@
+
+ static int show_map(struct seq_file *m, void *v)
+ {
++#ifdef __i386__
++ struct task_struct *task = m->private;
++#endif
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+@@ -65,7 +76,13 @@
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+- flags & VM_EXEC ? 'x' : '-',
++ (flags & VM_EXEC
++#ifdef __i386__
++ || (!nx_enabled &&
++ (map->vm_start < task->mm->context.exec_limit))
++#endif
++ )
++ ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ map->vm_pgoff << PAGE_SHIFT,
+ MAJOR(dev), MINOR(dev), ino, &len);
+Index: linux-2.6.10/include/asm-i386/desc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-04-05 16:29:30.129010368 +0800
++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 16:34:18.188218712 +0800
+@@ -129,6 +129,20 @@
+ extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr,
+ unsigned long bytecount);
+
++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
++{
++ limit = (limit - 1) / PAGE_SIZE;
++ desc->a = limit & 0xffff;
++ desc->b = (limit & 0xf0000) | 0x00c0fb00;
++}
++
++#define load_user_cs_desc(cpu, mm) \
++ per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs
++
++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_flush_exec_range(struct mm_struct *mm);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif
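
set_user_cs() above packs a byte-granular limit into the descriptor's
20-bit, page-granular limit field: bits 15..0 of the page count land in
the low word, bits 19..16 in the high word next to the fixed type/flag
bits 0x00c0fb00 (4 KiB granularity, 32-bit, DPL 3, present code segment).
The encoding in isolation:

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct desc_struct { unsigned long a, b; };

static void set_user_cs(struct desc_struct *desc, unsigned long limit)
{
	limit = (limit - 1) / PAGE_SIZE;	/* limit counted in pages */
	desc->a = limit & 0xffff;		/* limit bits 15..0 */
	desc->b = (limit & 0xf0000) | 0x00c0fb00; /* bits 19..16 + flags */
}

int main(void)
{
	struct desc_struct cs;

	/* hypothetical exec_limit just past a program's text segment */
	set_user_cs(&cs, 0x08049000UL);
	printf("user_cs = %08lx:%08lx\n", cs.b, cs.a);
	return 0;
}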
+Index: linux-2.6.10/include/asm-i386/elf.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/elf.h 2004-12-25 05:35:15.000000000 +0800
++++ linux-2.6.10/include/asm-i386/elf.h 2005-04-05 16:34:18.188218712 +0800
+@@ -9,6 +9,7 @@
+ #include <asm/user.h>
+ #include <asm/processor.h>
+ #include <asm/system.h> /* for savesegment */
++#include <asm/desc.h>
+
+ #include <linux/utsname.h>
+
+@@ -133,15 +134,22 @@
+ #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+ #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
+
+-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
+-#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
+-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+ extern void __kernel_vsyscall;
++#define VSYSCALL_BASE ((unsigned long)current->mm->context.vdso)
++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
++#define VSYSCALL_OFFSET ((unsigned long) &__kernel_vsyscall)
++#define VSYSCALL_ENTRY (VSYSCALL_BASE + VSYSCALL_OFFSET)
+
+-#define ARCH_DLINFO \
+-do { \
+- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
+- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++/* kernel-internal fixmap address: */
++#define __VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
++#define __VSYSCALL_EHDR ((const struct elfhdr *) __VSYSCALL_BASE)
++
++#define ARCH_DLINFO \
++do { \
++ if (VSYSCALL_BASE) { \
++ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
++ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++ } \
+ } while (0)
+
+ /*
+@@ -152,15 +160,15 @@
+ * Dumping its extra ELF program headers includes all the other information
+ * a debugger needs to easily find how the vsyscall DSO was being used.
+ */
+-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum)
++#define ELF_CORE_EXTRA_PHDRS (__VSYSCALL_EHDR->e_phnum)
+ #define ELF_CORE_WRITE_EXTRA_PHDRS \
+ do { \
+ const struct elf_phdr *const vsyscall_phdrs = \
+- (const struct elf_phdr *) (VSYSCALL_BASE \
+- + VSYSCALL_EHDR->e_phoff); \
++ (const struct elf_phdr *) (__VSYSCALL_BASE \
++ + __VSYSCALL_EHDR->e_phoff); \
+ int i; \
+ Elf32_Off ofs = 0; \
+- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \
+ struct elf_phdr phdr = vsyscall_phdrs[i]; \
+ if (phdr.p_type == PT_LOAD) { \
+ BUG_ON(ofs != 0); \
+@@ -178,10 +186,10 @@
+ #define ELF_CORE_WRITE_EXTRA_DATA \
+ do { \
+ const struct elf_phdr *const vsyscall_phdrs = \
+- (const struct elf_phdr *) (VSYSCALL_BASE \
+- + VSYSCALL_EHDR->e_phoff); \
++ (const struct elf_phdr *) (__VSYSCALL_BASE \
++ + __VSYSCALL_EHDR->e_phoff); \
+ int i; \
+- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \
+ if (vsyscall_phdrs[i].p_type == PT_LOAD) \
+ DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \
+ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \
+@@ -190,4 +198,10 @@
+
+ #endif
+
++#define __HAVE_ARCH_RANDOMIZE_BRK
++extern void randomize_brk(unsigned long old_brk);
++
++#define __HAVE_ARCH_VSYSCALL
++extern void map_vsyscall(void);
++
+ #endif
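
The net effect of these elf.h changes: the vsyscall entry point passed to
userspace in the auxiliary vector is no longer a fixed fixmap address but
the per-process vDSO base plus the entry's offset within the page.
Illustrated with made-up numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical: a randomized vDSO base (mm->context.vdso) and
	 * the offset of __kernel_vsyscall inside the vDSO page */
	unsigned long vdso_base = 0xb7f12000UL;
	unsigned long entry_offset = 0x400UL;

	/* what ARCH_DLINFO now publishes (skipped if the base is 0) */
	printf("AT_SYSINFO_EHDR = %#lx\n", vdso_base);
	printf("AT_SYSINFO      = %#lx\n", vdso_base + entry_offset);
	return 0;
}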
+Index: linux-2.6.10/include/asm-i386/mmu.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/mmu.h 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/include/asm-i386/mmu.h 2005-04-05 16:34:18.189218560 +0800
+@@ -7,11 +7,17 @@
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
++ *
++ * exec_limit is used to track the range PROT_EXEC
++ * mappings span.
+ */
+ typedef struct {
+ int size;
+ struct semaphore sem;
+ void *ldt;
++ struct desc_struct user_cs;
++ unsigned long exec_limit;
++ void *vdso;
+ } mm_context_t;
+
+ #endif
+Index: linux-2.6.10/include/asm-i386/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/pgalloc.h 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/include/asm-i386/pgalloc.h 2005-04-05 16:34:18.190218408 +0800
+@@ -4,6 +4,7 @@
+ #include <linux/config.h>
+ #include <asm/processor.h>
+ #include <asm/fixmap.h>
++#include <asm/desc.h>
+ #include <linux/threads.h>
+ #include <linux/mm.h> /* for struct page */
+
+Index: linux-2.6.10/include/asm-i386/processor.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/processor.h 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/include/asm-i386/processor.h 2005-04-05 16:34:18.189218560 +0800
+@@ -296,7 +296,10 @@
+ /* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
++#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
++
++#define __HAVE_ARCH_ALIGN_STACK
++extern unsigned long arch_align_stack(unsigned long sp);
+
+ #define HAVE_ARCH_PICK_MMAP_LAYOUT
+
+@@ -478,6 +481,7 @@
+ regs->xcs = __USER_CS; \
+ regs->eip = new_eip; \
+ regs->esp = new_esp; \
++ load_user_cs_desc(smp_processor_id(), current->mm); \
+ } while (0)
+
+ /* Forward declaration, a strange C thing */
+Index: linux-2.6.10/include/asm-i386/thread_info.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-04-05 16:29:30.127010672 +0800
++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 16:34:18.190218408 +0800
+@@ -38,6 +38,7 @@
+ 0-0xBFFFFFFF for user-thead
+ 0-0xFFFFFFFF for kernel-thread
+ */
++ void *sysenter_return;
+ struct restart_block restart_block;
+
+ unsigned long previous_esp; /* ESP of the previous stack in case
+Index: linux-2.6.10/include/asm-ia64/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/pgalloc.h 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/include/asm-ia64/pgalloc.h 2005-04-05 16:34:18.184219320 +0800
+@@ -23,6 +23,10 @@
+ #include <asm/mmu_context.h>
+ #include <asm/processor.h>
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
+ /*
+ * Very stupidly, we used to get new pgd's and pmd's, init their contents
+ * to point to the NULL versions of the next level page table, later on
+Index: linux-2.6.10/include/asm-ppc64/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc64/pgalloc.h 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/include/asm-ppc64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800
+@@ -11,6 +11,11 @@
+
+ extern kmem_cache_t *zero_cache;
+
++/* Dummy functions since we don't support execshield on ppc */
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
+ /*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+Index: linux-2.6.10/include/asm-ppc/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc/pgalloc.h 2004-12-25 05:33:48.000000000 +0800
++++ linux-2.6.10/include/asm-ppc/pgalloc.h 2005-04-05 16:34:18.183219472 +0800
+@@ -40,5 +40,10 @@
+
+ #define check_pgt_cache() do { } while (0)
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
++
+ #endif /* _PPC_PGALLOC_H */
+ #endif /* __KERNEL__ */
+Index: linux-2.6.10/include/asm-s390/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-s390/pgalloc.h 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/include/asm-s390/pgalloc.h 2005-04-05 16:34:18.186219016 +0800
+@@ -19,6 +19,10 @@
+ #include <linux/gfp.h>
+ #include <linux/mm.h>
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
+ #define check_pgt_cache() do {} while (0)
+
+ extern void diag10(unsigned long addr);
+Index: linux-2.6.10/include/asm-sparc64/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-sparc64/pgalloc.h 2004-12-25 05:35:29.000000000 +0800
++++ linux-2.6.10/include/asm-sparc64/pgalloc.h 2005-04-05 16:34:18.187218864 +0800
+@@ -261,4 +261,8 @@
+ #define pgd_free(pgd) free_pgd_fast(pgd)
+ #define pgd_alloc(mm) get_pgd_fast()
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
+ #endif /* _SPARC64_PGALLOC_H */
+Index: linux-2.6.10/include/asm-sparc/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-sparc/pgalloc.h 2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/include/asm-sparc/pgalloc.h 2005-04-05 16:34:18.191218256 +0800
+@@ -66,4 +66,8 @@
+ #define pte_free(pte) BTFIXUP_CALL(pte_free)(pte)
+ #define __pte_free_tlb(tlb, pte) pte_free(pte)
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
+ #endif /* _SPARC_PGALLOC_H */
+Index: linux-2.6.10/include/asm-x86_64/pgalloc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/pgalloc.h 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/include/asm-x86_64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800
+@@ -7,6 +7,11 @@
+ #include <linux/threads.h>
+ #include <linux/mm.h>
+
++#define arch_add_exec_range(mm, limit) do { ; } while (0)
++#define arch_flush_exec_range(mm) do { ; } while (0)
++#define arch_remove_exec_range(mm, limit) do { ; } while (0)
++
++
+ #define pmd_populate_kernel(mm, pmd, pte) \
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
+ #define pgd_populate(mm, pgd, pmd) \
+Index: linux-2.6.10/include/linux/mm.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mm.h 2005-04-05 16:29:30.250991824 +0800
++++ linux-2.6.10/include/linux/mm.h 2005-04-05 16:43:44.366146584 +0800
+@@ -685,7 +685,14 @@
+ unsigned long addr, unsigned long len, pgoff_t pgoff);
+ extern void exit_mmap(struct mm_struct *);
+
+-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int);
++
++
++static inline unsigned long get_unmapped_area(struct file * file, unsigned long addr,
++ unsigned long len, unsigned long pgoff, unsigned long flags)
++{
++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0);
++}
+
+ extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file,
+ unsigned long addr, unsigned long len,
+Index: linux-2.6.10/include/linux/random.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/random.h 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/include/linux/random.h 2005-04-05 16:34:18.183219472 +0800
+@@ -69,6 +69,9 @@
+ extern struct file_operations random_fops, urandom_fops;
+ #endif
+
++unsigned int get_random_int(void);
++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
++
+ #endif /* __KERNEL___ */
+
+ #endif /* _LINUX_RANDOM_H */
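The two declarations above are the randomization hooks exec-shield relies on; the
drivers/char/random.c hunks are not included in this excerpt. A plausible
implementation of randomize_range(), consistent with how the mm/mmap.c code below
uses it (a return of 0 means "no randomized address available"), assuming the
kernel's get_random_int() and PAGE_ALIGN():

/* Sketch: pick a page-aligned address in [start, end - len], or 0 if the
 * range cannot hold len bytes. */
unsigned long
randomize_range(unsigned long start, unsigned long end, unsigned long len)
{
	unsigned long range = end - len - start;

	if (end <= start + len)
		return 0;
	return PAGE_ALIGN(get_random_int() % range + start);
}
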
+Index: linux-2.6.10/include/linux/resource.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/resource.h 2004-12-25 05:33:52.000000000 +0800
++++ linux-2.6.10/include/linux/resource.h 2005-04-05 16:34:18.182219624 +0800
+@@ -52,8 +52,11 @@
+ /*
+ * Limit the stack by to some sane default: root can always
+ * increase this limit if needed.. 8MB seems reasonable.
++ *
++ * (2MB more to cover randomization effects.)
+ */
+-#define _STK_LIM (8*1024*1024)
++#define _STK_LIM (10*1024*1024)
++#define EXEC_STACK_BIAS (2*1024*1024)
+
+ /*
+ * GPG wants 32kB of mlocked memory, to make sure pass phrases
+Index: linux-2.6.10/include/linux/sched.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:29:27.971338384 +0800
++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:43:44.367146432 +0800
+@@ -32,6 +32,9 @@
+ #include <linux/topology.h>
+
+ struct exec_domain;
++extern int exec_shield;
++extern int exec_shield_randomize;
++extern int print_fatal_signals;
+
+ /*
+ * cloning flags:
+@@ -193,6 +196,10 @@
+ extern unsigned long
+ arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long);
++
++extern unsigned long
++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long,
++ unsigned long, unsigned long);
+ extern unsigned long
+ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+@@ -208,6 +215,9 @@
+ unsigned long (*get_unmapped_area) (struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags);
++ unsigned long (*get_unmapped_exec_area) (struct file *filp,
++ unsigned long addr, unsigned long len,
++ unsigned long pgoff, unsigned long flags);
+ void (*unmap_area) (struct vm_area_struct *area);
+ unsigned long mmap_base; /* base of mmap area */
+ unsigned long free_area_cache; /* first hole */
+@@ -720,6 +730,7 @@
+ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
+ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
+ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
++#define PF_RELOCEXEC 0x00800000 /* relocate shared libraries */
+
+ #ifdef CONFIG_SMP
+ extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
+Index: linux-2.6.10/kernel/signal.c
+===================================================================
+--- linux-2.6.10.orig/kernel/signal.c 2005-04-05 16:29:27.951341424 +0800
++++ linux-2.6.10/kernel/signal.c 2005-04-05 16:43:17.077295120 +0800
+@@ -1608,6 +1608,35 @@
+ spin_unlock_irq(¤t->sighand->siglock);
+ }
+
++int print_fatal_signals = 0;
++
++static void print_fatal_signal(struct pt_regs *regs, int signr)
++{
++ int i;
++ unsigned char insn;
++ printk("%s/%d: potentially unexpected fatal signal %d.\n",
++ current->comm, current->pid, signr);
++
++#ifdef __i386__
++ printk("code at %08lx: ", regs->eip);
++ for (i = 0; i < 16; i++) {
++ __get_user(insn, (unsigned char *)(regs->eip + i));
++ printk("%02x ", insn);
++ }
++#endif
++ printk("\n");
++ show_regs(regs);
++}
++
++static int __init setup_print_fatal_signals(char *str)
++{
++ get_option (&str, &print_fatal_signals);
++
++ return 1;
++}
++
++__setup("print-fatal-signals=", setup_print_fatal_signals);
++
+ #ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER
+
+ static void
+@@ -1808,6 +1837,12 @@
+ if (!signr)
+ break; /* will return 0 */
+
++ if ((signr == SIGSEGV) && print_fatal_signals) {
++ spin_unlock_irq(¤t->sighand->siglock);
++ print_fatal_signal(regs, signr);
++ spin_lock_irq(¤t->sighand->siglock);
++ }
++
+ if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
+ ptrace_signal_deliver(regs, cookie);
+
+@@ -1904,6 +1939,8 @@
+ * Anything else is fatal, maybe with a core dump.
+ */
+ current->flags |= PF_SIGNALED;
++ if (print_fatal_signals)
++ print_fatal_signal(regs, signr);
+ if (sig_kernel_coredump(signr)) {
+ /*
+ * If it was able to dump core, this kills all
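With the above in place, booting with print-fatal-signals=1 (or writing 1 to the
sysctl added in kernel/sysctl.c below) logs the faulting context for every fatal
signal, and for SIGSEGV already at delivery time. From the printk formats above,
the i386 output has roughly this shape (all values hypothetical):

	segv-test/1234: potentially unexpected fatal signal 11.
	code at 08048394: c7 05 00 00 00 00 01 00 00 00 b8 00 00 00 00 c3
	... register dump from show_regs() ...
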
+Index: linux-2.6.10/kernel/sysctl.c
+===================================================================
+--- linux-2.6.10.orig/kernel/sysctl.c 2005-04-05 16:29:24.394882088 +0800
++++ linux-2.6.10/kernel/sysctl.c 2005-04-05 16:43:17.078294968 +0800
+@@ -75,6 +75,29 @@
+ void __user *, size_t *, loff_t *);
+ #endif
+
++extern unsigned int vdso_enabled;
++
++int exec_shield = 1;
++int exec_shield_randomize = 1;
++
++static int __init setup_exec_shield(char *str)
++{
++ get_option (&str, &exec_shield);
++
++ return 1;
++}
++
++__setup("exec-shield=", setup_exec_shield);
++
++static int __init setup_exec_shield_randomize(char *str)
++{
++ get_option (&str, &exec_shield_randomize);
++
++ return 1;
++}
++
++__setup("exec-shield-randomize=", setup_exec_shield_randomize);
++
+ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+ static int maxolduid = 65535;
+ static int minolduid;
+@@ -276,6 +299,40 @@
+ .proc_handler = &proc_dointvec,
+ },
+ {
++ .ctl_name = KERN_PANIC,
++ .procname = "exec-shield",
++ .data = &exec_shield,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_PANIC,
++ .procname = "exec-shield-randomize",
++ .data = &exec_shield_randomize,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_PANIC,
++ .procname = "print-fatal-signals",
++ .data = &print_fatal_signals,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++#if __i386__
++ {
++ .ctl_name = KERN_PANIC,
++ .procname = "vdso",
++ .data = &vdso_enabled,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++#endif
++ {
+ .ctl_name = KERN_CORE_USES_PID,
+ .procname = "core_uses_pid",
+ .data = &core_uses_pid,
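The new entries surface as /proc/sys/kernel/exec-shield,
/proc/sys/kernel/exec-shield-randomize and /proc/sys/kernel/print-fatal-signals
(procfs names follow the .procname fields). A minimal user-space sketch of
toggling one of them at run time, assuming the path derived above:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/exec-shield", "w");

	if (!f) {
		perror("exec-shield");
		return 1;
	}
	fputs("0\n", f);	/* 0 = disable, 1 = enable */
	return fclose(f) ? 1 : 0;
}
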
+Index: linux-2.6.10/mm/mmap.c
+===================================================================
+--- linux-2.6.10.orig/mm/mmap.c 2005-04-05 16:29:30.134009608 +0800
++++ linux-2.6.10/mm/mmap.c 2005-04-05 16:43:44.369146128 +0800
+@@ -23,6 +23,7 @@
+ #include <linux/mount.h>
+ #include <linux/mempolicy.h>
+ #include <linux/rmap.h>
++#include <linux/random.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -245,6 +246,8 @@
+ __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent)
+ {
++ if (vma->vm_flags & VM_EXEC)
++ arch_add_exec_range(mm, vma->vm_end);
+ if (prev) {
+ vma->vm_next = prev->vm_next;
+ prev->vm_next = vma;
+@@ -347,6 +350,8 @@
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mm->mmap_cache == vma)
+ mm->mmap_cache = prev;
++ if (vma->vm_flags & VM_EXEC)
++ arch_remove_exec_range(mm, vma->vm_end);
+ }
+
+ /*
+@@ -642,6 +647,8 @@
+ } else /* cases 2, 5, 7 */
+ vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL);
++ if (prev->vm_flags & VM_EXEC)
++ arch_add_exec_range(mm, prev->vm_end);
+ return prev;
+ }
+
+@@ -813,7 +820,7 @@
+ /* Obtain the address to map to. we verify (or select) it and ensure
+ * that it represents a valid section of the address space.
+ */
+- addr = get_unmapped_area(file, addr, len, pgoff, flags);
++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, prot & PROT_EXEC);
+ if (addr & ~PAGE_MASK)
+ return addr;
+
+@@ -1207,9 +1214,10 @@
+ area->vm_mm->free_area_cache = area->vm_end;
+ }
+
++
+ unsigned long
+-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+- unsigned long pgoff, unsigned long flags)
++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
++ unsigned long pgoff, unsigned long flags, int exec)
+ {
+ if (flags & MAP_FIXED) {
+ unsigned long ret;
+@@ -1241,10 +1249,80 @@
+ return file->f_op->get_unmapped_area(file, addr, len,
+ pgoff, flags);
+
+- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
++ if (exec && current->mm->get_unmapped_exec_area)
++ return current->mm->get_unmapped_exec_area(file, addr, len, pgoff, flags);
++ else
++ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ }
+
+-EXPORT_SYMBOL(get_unmapped_area);
++EXPORT_SYMBOL(get_unmapped_area_prot);
++
++
++#define SHLIB_BASE 0x00111000
++
++unsigned long arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
++ unsigned long len0, unsigned long pgoff, unsigned long flags)
++{
++ unsigned long addr = addr0, len = len0;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long tmp;
++
++ if (len > TASK_SIZE)
++ return -ENOMEM;
++
++ if (!addr && !(flags & MAP_FIXED))
++ addr = randomize_range(SHLIB_BASE, 0x01000000, len);
++
++ if (addr) {
++ addr = PAGE_ALIGN(addr);
++ vma = find_vma(mm, addr);
++ if (TASK_SIZE - len >= addr &&
++ (!vma || addr + len <= vma->vm_start)) {
++ return addr;
++ }
++ }
++
++ addr = SHLIB_BASE;
++
++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
++ /* At this point: (!vma || addr < vma->vm_end). */
++ if (TASK_SIZE - len < addr) {
++ return -ENOMEM;
++ }
++ if (!vma || addr + len <= vma->vm_start) {
++ /*
++ * Must not let a PROT_EXEC mapping get into the
++ * brk area:
++ */
++ if (addr + len > mm->brk)
++ goto failed;
++
++ /*
++ * Up until the brk area we randomize addresses
++ * as much as possible:
++ */
++ if (addr >= 0x01000000) {
++ tmp = randomize_range(0x01000000, mm->brk, len);
++ vma = find_vma(mm, tmp);
++ if (TASK_SIZE - len >= tmp &&
++ (!vma || tmp + len <= vma->vm_start))
++ return tmp;
++ }
++ /*
++ * Ok, randomization didn't work out - return
++ * the result of the linear search:
++ */
++ return addr;
++ }
++ addr = vma->vm_end;
++ }
++
++failed:
++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags);
++}
++
++
+
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+@@ -1319,6 +1397,14 @@
+ return prev ? prev->vm_next : vma;
+ }
+
++
++static int over_stack_limit(unsigned long sz)
++{
++ if (sz < EXEC_STACK_BIAS)
++ return 0;
++ return (sz - EXEC_STACK_BIAS) > current->signal->rlim[RLIMIT_STACK].rlim_cur;
++}
++
+ #ifdef CONFIG_STACK_GROWSUP
+ /*
+ * vma is the first one with address > vma->vm_end. Have to extend vma.
+@@ -1358,7 +1444,7 @@
+ return -ENOMEM;
+ }
+
+- if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur ||
++ if (over_stack_limit(address - vma->vm_start) ||
+ ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
+ current->signal->rlim[RLIMIT_AS].rlim_cur) {
+ anon_vma_unlock(vma);
+@@ -1432,7 +1518,7 @@
+ return -ENOMEM;
+ }
+
+- if (vma->vm_end - address > current->signal->rlim[RLIMIT_STACK].rlim_cur ||
++ if (over_stack_limit(vma->vm_end - address) ||
+ ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
+ current->signal->rlim[RLIMIT_AS].rlim_cur) {
+ anon_vma_unlock(vma);
+@@ -1668,10 +1754,14 @@
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+- if (new_below)
++ if (new_below) {
++ unsigned long old_end = vma->vm_end;
++
+ vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ ((addr - new->vm_start) >> PAGE_SHIFT), new);
+- else
++ if (vma->vm_flags & VM_EXEC)
++ arch_remove_exec_range(mm, old_end);
++ } else
+ vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
+ return 0;
+@@ -1890,6 +1980,7 @@
+ mm->rss = 0;
+ mm->total_vm = 0;
+ mm->locked_vm = 0;
++ arch_flush_exec_range(mm);
+
+ spin_unlock(&mm->page_table_lock);
+
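A note on the stack arithmetic above: over_stack_limit() subtracts
EXEC_STACK_BIAS (2MB, added to include/linux/resource.h earlier in this series)
before comparing against RLIMIT_STACK, so randomization of the stack base does
not eat into the user-visible limit. A small user-space check of the same
arithmetic, with hypothetical sizes (not kernel code):

#include <stdio.h>

#define EXEC_STACK_BIAS (2*1024*1024)

static int over_stack_limit(unsigned long sz, unsigned long rlim_cur)
{
	if (sz < EXEC_STACK_BIAS)
		return 0;
	return (sz - EXEC_STACK_BIAS) > rlim_cur;
}

int main(void)
{
	unsigned long rlim = 8*1024*1024;	/* RLIMIT_STACK = 8MB */

	/* 9MB of stack: 9 - 2 = 7MB <= 8MB, allowed (prints 0) */
	printf("%d\n", over_stack_limit(9*1024*1024, rlim));
	/* 11MB of stack: 11 - 2 = 9MB > 8MB, refused (prints 1) */
	printf("%d\n", over_stack_limit(11*1024*1024, rlim));
	return 0;
}
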
+Index: linux-2.6.10/mm/mprotect.c
+===================================================================
+--- linux-2.6.10.orig/mm/mprotect.c 2005-04-05 16:29:30.135009456 +0800
++++ linux-2.6.10/mm/mprotect.c 2005-04-05 16:34:18.193217952 +0800
+@@ -22,6 +22,7 @@
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
++#include <asm/pgalloc.h>
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+
+@@ -117,7 +118,7 @@
+ struct mm_struct * mm = vma->vm_mm;
+ unsigned long oldflags = vma->vm_flags;
+ long nrpages = (end - start) >> PAGE_SHIFT;
+- unsigned long charged = 0;
++ unsigned long charged = 0, old_end = vma->vm_end;
+ pgprot_t newprot;
+ pgoff_t pgoff;
+ int error;
+@@ -179,8 +180,11 @@
+ * vm_flags and vm_page_prot are protected by the mmap_sem
+ * held in write mode.
+ */
++ oldflags = vma->vm_flags;
+ vma->vm_flags = newflags;
+ vma->vm_page_prot = newprot;
++ if (oldflags & VM_EXEC)
++ arch_remove_exec_range(current->mm, old_end);
+ change_protection(vma, start, end, newprot);
+ __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ __vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+Index: linux-2.6.10/mm/mremap.c
+===================================================================
+--- linux-2.6.10.orig/mm/mremap.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/mm/mremap.c 2005-04-05 16:43:44.370145976 +0800
+@@ -385,8 +385,8 @@
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+- vma->vm_pgoff, map_flags);
++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len,
++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC);
+ ret = new_addr;
+ if (new_addr & ~PAGE_MASK)
+ goto out;
--- /dev/null
+Index: linux-2.6.10/arch/i386/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2005-04-05 16:29:30.191000944 +0800
++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 16:47:53.904211032 +0800
+@@ -2,6 +2,63 @@
+
+ source "lib/Kconfig.debug"
+
++config CRASH_DUMP
++ tristate "Crash dump support (EXPERIMENTAL)"
++ depends on EXPERIMENTAL
++ default n
++ ---help---
++ Say Y here to enable saving an image of system memory when a panic
++ or other error occurs. Dumps can also be forced with the SysRq+d
++ key if MAGIC_SYSRQ is enabled.
++
++config KERNTYPES
++ bool
++ depends on CRASH_DUMP
++ default y
++
++config CRASH_DUMP_BLOCKDEV
++ tristate "Crash dump block device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps directly to a disk device.
++
++config CRASH_DUMP_NETDEV
++ tristate "Crash dump network device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps over a network device.
++
++config CRASH_DUMP_MEMDEV
++ bool "Crash dump staged memory driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow crash dumps to be staged in spare
++ memory pages, which are then written out to disk
++ later.
++
++config CRASH_DUMP_SOFTBOOT
++ bool "Save crash dump across a soft reboot"
++ depends on CRASH_DUMP_MEMDEV
++ help
++ Say Y to allow a crash dump to be preserved in memory
++ pages across a soft reboot and written out to disk
++ thereafter. For this to work, CRASH_DUMP must be
++ configured as part of the kernel (not as a module).
++
++config CRASH_DUMP_COMPRESS_RLE
++ tristate "Crash dump RLE compression"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Run Length Encoding compression.
++
++config CRASH_DUMP_COMPRESS_GZIP
++ tristate "Crash dump GZIP compression"
++ select ZLIB_INFLATE
++ select ZLIB_DEFLATE
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Gnu Zip compression.
++
+ config EARLY_PRINTK
+ bool "Early printk" if EMBEDDED
+ default y
+@@ -15,8 +72,8 @@
+ with klogd/syslogd or the X server. You should normally N here,
+ unless you want to debug such a crash.
+
+-config DEBUG_STACKOVERFLOW
+- bool "Check for stack overflows"
++config DEBUG_STACKOVERFLOW
++ bool "Check for stack overflows"
+ depends on DEBUG_KERNEL
+
+ config KPROBES
+Index: linux-2.6.10/arch/i386/mm/init.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:47:05.157621640 +0800
++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:47:53.909210272 +0800
+@@ -244,6 +244,13 @@
+ return 0;
+ }
+
++/* To enable modules to check if a page is in RAM */
++int pfn_is_ram(unsigned long pfn)
++{
++ return (page_is_ram(pfn));
++}
++
++
+ #ifdef CONFIG_HIGHMEM
+ pte_t *kmap_pte;
+ pgprot_t kmap_prot;
+Index: linux-2.6.10/arch/i386/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:47:05.156621792 +0800
++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:47:53.906210728 +0800
+@@ -27,6 +27,7 @@
+ #include <linux/ptrace.h>
+ #include <linux/utsname.h>
+ #include <linux/kprobes.h>
++#include <linux/dump.h>
+
+ #ifdef CONFIG_EISA
+ #include <linux/ioport.h>
+@@ -382,6 +383,7 @@
+ bust_spinlocks(0);
+ die.lock_owner = -1;
+ spin_unlock_irq(&die.lock);
++ dump((char *)str, regs);
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+
+@@ -654,6 +656,7 @@
+ printk(" on CPU%d, eip %08lx, registers:\n",
+ smp_processor_id(), regs->eip);
+ show_registers(regs);
++ dump((char *)msg, regs);
+ printk("console shuts up ...\n");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+Index: linux-2.6.10/arch/i386/kernel/setup.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/setup.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/setup.c 2005-04-05 16:47:53.905210880 +0800
+@@ -662,6 +662,10 @@
+ */
+ #define LOWMEMSIZE() (0x9f000)
+
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++unsigned long crashdump_addr = 0xdeadbeef;
++#endif
++
+ static void __init parse_cmdline_early (char ** cmdline_p)
+ {
+ char c = ' ', *to = command_line, *from = saved_command_line;
+@@ -823,6 +827,11 @@
+ if (c == ' ' && !memcmp(from, "vmalloc=", 8))
+ __VMALLOC_RESERVE = memparse(from+8, &from);
+
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++ if (c == ' ' && !memcmp(from, "crashdump=", 10))
++ crashdump_addr = memparse(from+10, &from);
++#endif
++
+ c = *(from++);
+ if (!c)
+ break;
+@@ -1288,6 +1297,10 @@
+
+ static char * __init machine_specific_memory_setup(void);
+
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++extern void crashdump_reserve(void);
++#endif
++
+ /*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+@@ -1393,6 +1406,10 @@
+ #endif
+
+
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++ crashdump_reserve(); /* Preserve crash dump state from prev boot */
++#endif
++
+ dmi_scan_machine();
+
+ #ifdef CONFIG_X86_GENERICARCH
+Index: linux-2.6.10/arch/i386/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:47:05.154622096 +0800
++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:47:53.908210424 +0800
+@@ -19,6 +19,7 @@
+ #include <linux/mc146818rtc.h>
+ #include <linux/cache.h>
+ #include <linux/interrupt.h>
++#include <linux/dump.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
+@@ -143,6 +144,13 @@
+ */
+ cfg = __prepare_ICR(shortcut, vector);
+
++ if (vector == DUMP_VECTOR) {
++ /*
++ * Setup DUMP IPI to be delivered as an NMI
++ */
++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI;
++ }
++
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+@@ -220,6 +228,13 @@
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector);
++
++ if (vector == DUMP_VECTOR) {
++ /*
++ * Setup DUMP IPI to be delivered as an NMI
++ */
++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI;
++ }
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+@@ -506,6 +521,11 @@
+
+ static struct call_data_struct * call_data;
+
++void dump_send_ipi(void)
++{
++ send_IPI_allbutself(DUMP_VECTOR);
++}
++
+ /*
+ * this function sends a 'generic call function' IPI to all other CPUs
+ * in the system.
+@@ -561,7 +581,7 @@
+ return 0;
+ }
+
+-static void stop_this_cpu (void * dummy)
++void stop_this_cpu (void * dummy)
+ {
+ /*
+ * Remove this CPU:
+@@ -622,4 +642,3 @@
+ atomic_inc(&call_data->finished);
+ }
+ }
+-
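The __prepare_ICR() tweaks above are the heart of the dump IPI path: rewriting
the ICR value so the interrupt is delivered as an NMI lets it reach CPUs that are
spinning with interrupts disabled. A sketch of the transformation
(APIC_VECTOR_MASK and APIC_DM_NMI are the existing <asm/apicdef.h> constants; the
helper function itself is illustrative only):

#include <asm/apicdef.h>

static unsigned int dump_ipi_icr(unsigned int cfg)
{
	/* clear the vector field, select NMI delivery mode */
	return (cfg & ~APIC_VECTOR_MASK) | APIC_DM_NMI;
}
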
+Index: linux-2.6.10/arch/i386/kernel/i386_ksyms.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/i386_ksyms.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/i386_ksyms.c 2005-04-05 16:47:53.907210576 +0800
+@@ -16,6 +16,7 @@
+ #include <linux/tty.h>
+ #include <linux/highmem.h>
+ #include <linux/time.h>
++#include <linux/nmi.h>
+
+ #include <asm/semaphore.h>
+ #include <asm/processor.h>
+@@ -31,6 +32,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/nmi.h>
+ #include <asm/ist.h>
++#include <asm/e820.h>
+ #include <asm/kdebug.h>
+
+ extern void dump_thread(struct pt_regs *, struct user *);
+@@ -192,3 +194,20 @@
+ #endif
+
+ EXPORT_SYMBOL(csum_partial);
++
++#ifdef CONFIG_CRASH_DUMP_MODULE
++#ifdef CONFIG_SMP
++extern irq_desc_t irq_desc[NR_IRQS];
++extern cpumask_t irq_affinity[NR_IRQS];
++extern void stop_this_cpu(void *);
++EXPORT_SYMBOL(irq_desc);
++EXPORT_SYMBOL(irq_affinity);
++EXPORT_SYMBOL(stop_this_cpu);
++EXPORT_SYMBOL(dump_send_ipi);
++#endif
++extern int pfn_is_ram(unsigned long);
++EXPORT_SYMBOL(pfn_is_ram);
++#ifdef ARCH_HAS_NMI_WATCHDOG
++EXPORT_SYMBOL(touch_nmi_watchdog);
++#endif
++#endif
+Index: linux-2.6.10/arch/s390/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/s390/Kconfig.debug 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/arch/s390/Kconfig.debug 2005-04-05 16:47:53.921208448 +0800
+@@ -2,4 +2,13 @@
+
+ source "lib/Kconfig.debug"
+
++config KERNTYPES
++ bool "Kerntypes debugging information"
++ default y
++ ---help---
++ Say Y here to save additional kernel debugging information in the
++ file init/kerntypes.o. This information is used by crash analysis
++ tools such as lcrash to assign structures to kernel addresses.
++
++
+ endmenu
+Index: linux-2.6.10/arch/s390/boot/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/s390/boot/Makefile 2004-12-25 05:35:49.000000000 +0800
++++ linux-2.6.10/arch/s390/boot/Makefile 2005-04-05 16:47:53.922208296 +0800
+@@ -15,4 +15,4 @@
+
+ install: $(CONFIGURE) $(obj)/image
+ sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \
+- System.map Kerntypes "$(INSTALL_PATH)"
++ System.map init/Kerntypes "$(INSTALL_PATH)"
+Index: linux-2.6.10/arch/s390/boot/install.sh
+===================================================================
+--- linux-2.6.10.orig/arch/s390/boot/install.sh 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/s390/boot/install.sh 2005-04-05 16:47:53.921208448 +0800
+@@ -16,7 +16,8 @@
+ # $1 - kernel version
+ # $2 - kernel image file
+ # $3 - kernel map file
+-# $4 - default install path (blank if root directory)
++# $4 - kernel type file
++# $5 - default install path (blank if root directory)
+ #
+
+ # User may have a custom install script
+@@ -26,13 +27,13 @@
+
+ # Default install - same as make zlilo
+
+-if [ -f $4/vmlinuz ]; then
+- mv $4/vmlinuz $4/vmlinuz.old
++if [ -f $5/vmlinuz ]; then
++ mv $5/vmlinuz $5/vmlinuz.old
+ fi
+
+-if [ -f $4/System.map ]; then
+- mv $4/System.map $4/System.old
++if [ -f $5/System.map ]; then
++ mv $5/System.map $5/System.old
+ fi
+
+-cat $2 > $4/vmlinuz
+-cp $3 $4/System.map
++cat $2 > $5/vmlinuz
++cp $3 $5/System.map
+Index: linux-2.6.10/arch/ia64/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/Kconfig.debug 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/ia64/Kconfig.debug 2005-04-05 16:47:53.917209056 +0800
+@@ -2,6 +2,65 @@
+
+ source "lib/Kconfig.debug"
+
++config CRASH_DUMP
++ tristate "Crash dump support (EXPERIMENTAL)"
++ depends on EXPERIMENTAL
++ default n
++ ---help---
++ Say Y here to enable saving an image of system memory when a panic
++ or other error occurs. Dumps can also be forced with the SysRq+d
++ key if MAGIC_SYSRQ is enabled.
++
++config KERNTYPES
++ bool
++ depends on CRASH_DUMP
++ default y
++
++config CRASH_DUMP_BLOCKDEV
++ tristate "Crash dump block device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps directly to a disk device.
++
++config CRASH_DUMP_NETDEV
++ tristate "Crash dump network device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps over a network device.
++
++config CRASH_DUMP_MEMDEV
++ bool "Crash dump staged memory driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow crash dumps to be staged in spare
++ memory pages, which are then written out to disk
++ later.
++
++config CRASH_DUMP_SOFTBOOT
++ bool "Save crash dump across a soft reboot"
++ depends on CRASH_DUMP_MEMDEV
++ help
++ Say Y to allow a crash dump to be preserved in memory
++ pages across a soft reboot and written out to disk
++ thereafter. For this to work, CRASH_DUMP must be
++ configured as part of the kernel (not as a module).
++
++config CRASH_DUMP_COMPRESS_RLE
++ tristate "Crash dump RLE compression"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Run Length Encoding compression.
++
++config CRASH_DUMP_COMPRESS_GZIP
++ tristate "Crash dump GZIP compression"
++ select ZLIB_INFLATE
++ select ZLIB_DEFLATE
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Gnu Zip compression.
++
++
++
+ choice
+ prompt "Physical memory granularity"
+ default IA64_GRANULE_64MB
+Index: linux-2.6.10/arch/ia64/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/kernel/traps.c 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/arch/ia64/kernel/traps.c 2005-04-05 16:47:53.918208904 +0800
+@@ -21,6 +21,8 @@
+ #include <asm/intrinsics.h>
+ #include <asm/processor.h>
+ #include <asm/uaccess.h>
++#include <asm/nmi.h>
++#include <linux/dump.h>
+
+ extern spinlock_t timerlist_lock;
+
+@@ -89,6 +91,7 @@
+ printk("%s[%d]: %s %ld [%d]\n",
+ current->comm, current->pid, str, err, ++die_counter);
+ show_regs(regs);
++ dump((char *)str, regs);
+ } else
+ printk(KERN_ERR "Recursive die() failure, output suppressed\n");
+
+Index: linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:29:27.954340968 +0800
++++ linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:47:53.917209056 +0800
+@@ -7,7 +7,6 @@
+
+ #include <linux/config.h>
+ #include <linux/module.h>
+-
+ #include <linux/string.h>
+ EXPORT_SYMBOL(memset);
+ EXPORT_SYMBOL(memchr);
+@@ -28,6 +27,9 @@
+ EXPORT_SYMBOL(strstr);
+ EXPORT_SYMBOL(strpbrk);
+
++#include <linux/syscalls.h>
++EXPORT_SYMBOL(sys_ioctl);
++
+ #include <asm/checksum.h>
+ EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */
+
+@@ -125,3 +127,21 @@
+ # endif
+ # endif
+ #endif
++
++#include <asm/hw_irq.h>
++
++#ifdef CONFIG_CRASH_DUMP_MODULE
++#ifdef CONFIG_SMP
++extern irq_desc_t _irq_desc[NR_IRQS];
++extern cpumask_t irq_affinity[NR_IRQS];
++extern void stop_this_cpu(void *);
++extern int (*dump_ipi_function_ptr)(struct pt_regs *);
++extern void dump_send_ipi(void);
++EXPORT_SYMBOL(_irq_desc);
++EXPORT_SYMBOL(irq_affinity);
++EXPORT_SYMBOL(stop_this_cpu);
++EXPORT_SYMBOL(dump_send_ipi);
++EXPORT_SYMBOL(dump_ipi_function_ptr);
++#endif
++#endif
++
+Index: linux-2.6.10/arch/ia64/kernel/irq.c
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/kernel/irq.c 2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/arch/ia64/kernel/irq.c 2005-04-05 16:47:53.919208752 +0800
+@@ -933,7 +933,11 @@
+
+ static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];
+
++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE)
++cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
++#else
+ static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
++#endif
+
+ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
+
+Index: linux-2.6.10/arch/ia64/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/ia64/kernel/smp.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/arch/ia64/kernel/smp.c 2005-04-05 16:47:53.920208600 +0800
+@@ -31,6 +31,10 @@
+ #include <linux/efi.h>
+ #include <linux/bitops.h>
+
++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE)
++#include <linux/dump.h>
++#endif
++
+ #include <asm/atomic.h>
+ #include <asm/current.h>
+ #include <asm/delay.h>
+@@ -67,6 +71,11 @@
+ #define IPI_CALL_FUNC 0
+ #define IPI_CPU_STOP 1
+
++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE)
++#define IPI_DUMP_INTERRUPT 4
++ int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL;
++#endif
++
+ /* This needs to be cacheline aligned because it is written to by *other* CPUs. */
+ static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+
+@@ -84,7 +93,9 @@
+ spin_unlock_irq(&call_lock);
+ }
+
+-static void
++
++/* Changed stop_this_cpu() from static to non-static so dump code can call it. */
++void
+ stop_this_cpu (void)
+ {
+ /*
+@@ -155,6 +166,15 @@
+ case IPI_CPU_STOP:
+ stop_this_cpu();
+ break;
++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE)
++ case IPI_DUMP_INTERRUPT:
++ if (dump_ipi_function_ptr != NULL) {
++ if (!dump_ipi_function_ptr(regs)) {
++ printk(KERN_ERR "(*dump_ipi_function_ptr)(): rejected IPI_DUMP_INTERRUPT\n");
++ }
++ }
++ break;
++#endif
+
+ default:
+ printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
+@@ -369,9 +389,17 @@
+ {
+ send_IPI_allbutself(IPI_CPU_STOP);
+ }
++EXPORT_SYMBOL(smp_send_stop);
+
+ int __init
+ setup_profiling_timer (unsigned int multiplier)
+ {
+ return -EINVAL;
+ }
++
++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE)
++void dump_send_ipi(void)
++{
++ send_IPI_allbutself(IPI_DUMP_INTERRUPT);
++}
++#endif
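On ia64 the dump module gets no dedicated vector; instead it installs a callback
through dump_ipi_function_ptr (exported from ia64_ksyms.c above) and raises
IPI_DUMP_INTERRUPT via dump_send_ipi(). A sketch of the module side, under the
assumption that the handler name and body are hypothetical:

#include <linux/ptrace.h>	/* struct pt_regs */
#include <asm/system.h>		/* mb() */

extern int (*dump_ipi_function_ptr)(struct pt_regs *);
extern void dump_send_ipi(void);

static int my_dump_handler(struct pt_regs *regs)
{
	/* save this CPU's register state for the dump ... */
	return 1;	/* nonzero: IPI accepted */
}

static void dump_freeze_cpus(void)
{
	dump_ipi_function_ptr = my_dump_handler;
	mb();		/* publish the pointer before raising the IPI */
	dump_send_ipi();
}
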
+Index: linux-2.6.10/arch/ppc64/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/Kconfig.debug 2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/arch/ppc64/Kconfig.debug 2005-04-05 16:47:53.922208296 +0800
+@@ -2,6 +2,64 @@
+
+ source "lib/Kconfig.debug"
+
++config KERNTYPES
++ bool
++ depends on CRASH_DUMP
++ default y
++
++config CRASH_DUMP
++ tristate "Crash dump support"
++ default n
++ ---help---
++ Say Y here to enable saving an image of system memory when a panic
++ or other error occurs. Dumps can also be forced with the SysRq+d
++ key if MAGIC_SYSRQ is enabled.
++
++config CRASH_DUMP_BLOCKDEV
++ tristate "Crash dump block device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps directly to a disk device.
++
++config CRASH_DUMP_NETDEV
++ tristate "Crash dump network device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps over a network device.
++
++config CRASH_DUMP_MEMDEV
++ bool "Crash dump staged memory driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow crash dumps to be staged in spare
++ memory pages, which are then written out to disk
++ later. This requires 'kexec' support to work.
++ **** Not supported at present ****
++
++config CRASH_DUMP_SOFTBOOT
++ bool "Save crash dump across a soft reboot"
++ help
++ Say Y to allow a crash dump to be preserved in memory
++ pages across a soft reboot and written out to disk
++ thereafter. For this to work, CRASH_DUMP must be
++ configured as part of the kernel (not as a module).
++ Need 'kexec' support to use this option.
++ **** Not supported at present ****
++
++config CRASH_DUMP_COMPRESS_RLE
++ tristate "Crash dump RLE compression"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Run Length Encoding compression.
++
++config CRASH_DUMP_COMPRESS_GZIP
++ tristate "Crash dump GZIP compression"
++ select ZLIB_INFLATE
++ select ZLIB_DEFLATE
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Gnu Zip compression.
++
+ config DEBUG_STACKOVERFLOW
+ bool "Check for stack overflows"
+ depends on DEBUG_KERNEL
+Index: linux-2.6.10/arch/ppc64/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/kernel/traps.c 2004-12-25 05:34:47.000000000 +0800
++++ linux-2.6.10/arch/ppc64/kernel/traps.c 2005-04-05 16:47:53.923208144 +0800
+@@ -29,6 +29,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
++#include <linux/dump.h>
+
+ #include <asm/pgtable.h>
+ #include <asm/uaccess.h>
+@@ -116,6 +117,7 @@
+ if (nl)
+ printk("\n");
+ show_regs(regs);
++ dump((char *)str, regs);
+ bust_spinlocks(0);
+ spin_unlock_irq(&die_lock);
+
+Index: linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/kernel/ppc_ksyms.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c 2005-04-05 16:47:53.925207840 +0800
+@@ -159,6 +159,17 @@
+ EXPORT_SYMBOL(get_wchan);
+ EXPORT_SYMBOL(console_drivers);
+
++#ifdef CONFIG_CRASH_DUMP_MODULE
++extern int dump_page_is_ram(unsigned long);
++EXPORT_SYMBOL(dump_page_is_ram);
++#ifdef CONFIG_SMP
++EXPORT_SYMBOL(irq_affinity);
++extern void stop_this_cpu(void *);
++EXPORT_SYMBOL(stop_this_cpu);
++EXPORT_SYMBOL(dump_send_ipi);
++#endif
++#endif
++
+ EXPORT_SYMBOL(tb_ticks_per_usec);
+ EXPORT_SYMBOL(paca);
+ EXPORT_SYMBOL(cur_cpu_spec);
+Index: linux-2.6.10/arch/ppc64/kernel/lmb.c
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/kernel/lmb.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/arch/ppc64/kernel/lmb.c 2005-04-05 16:47:53.924207992 +0800
+@@ -344,3 +344,31 @@
+
+ return pa;
+ }
++
++
++/*
++ * This is a copy of page_is_ram() from mm/init.c. The difference
++ * is that it identifies all memory holes.
++ */
++int dump_page_is_ram(unsigned long pfn)
++{
++ int i;
++ unsigned long paddr = (pfn << PAGE_SHIFT);
++
++ for (i = 0; i < lmb.memory.cnt; i++) {
++ unsigned long base;
++
++#ifdef CONFIG_MSCHUNKS
++ base = lmb.memory.region[i].physbase;
++#else
++ base = lmb.memory.region[i].base;
++#endif
++ if ((paddr >= base) &&
++ (paddr < (base + lmb.memory.region[i].size))) {
++ return 1;
++ }
++ }
++
++ return 0;
++}
++
+Index: linux-2.6.10/arch/ppc64/kernel/xics.c
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/kernel/xics.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/arch/ppc64/kernel/xics.c 2005-04-05 16:47:53.925207840 +0800
+@@ -421,7 +421,8 @@
+ smp_message_recv(PPC_MSG_MIGRATE_TASK, regs);
+ }
+ #endif
+-#ifdef CONFIG_DEBUGGER
++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \
++ || defined(CONFIG_CRASH_DUMP_MODULE)
+ if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK,
+ &xics_ipi_message[cpu].value)) {
+ mb();
+Index: linux-2.6.10/arch/ppc64/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/ppc64/kernel/smp.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/arch/ppc64/kernel/smp.c 2005-04-05 16:47:53.926207688 +0800
+@@ -30,6 +30,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/cache.h>
+ #include <linux/err.h>
++#include <linux/dump.h>
+ #include <linux/sysdev.h>
+ #include <linux/cpu.h>
+
+@@ -71,6 +72,7 @@
+ struct smp_ops_t *smp_ops;
+
+ static volatile unsigned int cpu_callin_map[NR_CPUS];
++static int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL;
+
+ extern unsigned char stab_array[];
+
+@@ -177,9 +179,16 @@
+ /* spare */
+ break;
+ #endif
+-#ifdef CONFIG_DEBUGGER
++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \
++ || defined(CONFIG_CRASH_DUMP_MODULE)
+ case PPC_MSG_DEBUGGER_BREAK:
+- debugger_ipi(regs);
++ if (dump_ipi_function_ptr) {
++ dump_ipi_function_ptr(regs);
++ }
++#ifdef CONFIG_DEBUGGER
++ else
++ debugger_ipi(regs);
++#endif
+ break;
+ #endif
+ default:
+@@ -201,7 +210,16 @@
+ }
+ #endif
+
+-static void stop_this_cpu(void *dummy)
++void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *))
++{
++ dump_ipi_function_ptr = dump_ipi_callback;
++ if (dump_ipi_callback) {
++ mb();
++ smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK);
++ }
++}
++
++void stop_this_cpu(void *dummy)
+ {
+ local_irq_disable();
+ while (1)
+Index: linux-2.6.10/arch/x86_64/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/Kconfig.debug 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/arch/x86_64/Kconfig.debug 2005-04-05 16:47:53.909210272 +0800
+@@ -2,6 +2,66 @@
+
+ source "lib/Kconfig.debug"
+
++config CRASH_DUMP
++ tristate "Crash dump support (EXPERIMENTAL)"
++ depends on EXPERIMENTAL
++ default n
++ ---help---
++ Say Y here to enable saving an image of system memory when a panic
++ or other error occurs. Dumps can also be forced with the SysRq+d
++ key if MAGIC_SYSRQ is enabled.
++
++config KERNTYPES
++ bool
++ depends on CRASH_DUMP
++ default y
++
++config CRASH_DUMP_BLOCKDEV
++ tristate "Crash dump block device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps directly to a disk device.
++
++config CRASH_DUMP_NETDEV
++ tristate "Crash dump network device driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving crash dumps over a network device.
++
++config CRASH_DUMP_MEMDEV
++ bool "Crash dump staged memory driver"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow crash dumps to be staged in spare
++ memory pages, which are then written out to disk
++ later.
++
++config CRASH_DUMP_SOFTBOOT
++ bool "Save crash dump across a soft reboot"
++ depends on CRASH_DUMP_MEMDEV
++ help
++ Say Y to allow a crash dump to be preserved in memory
++ pages across a soft reboot and written out to disk
++ thereafter. For this to work, CRASH_DUMP must be
++ configured as part of the kernel (not as a module).
++
++config CRASH_DUMP_COMPRESS_RLE
++ tristate "Crash dump RLE compression"
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Run Length Encoding compression.
++
++
++config CRASH_DUMP_COMPRESS_GZIP
++ tristate "Crash dump GZIP compression"
++ select ZLIB_INFLATE
++ select ZLIB_DEFLATE
++ depends on CRASH_DUMP
++ help
++ Say Y to allow saving dumps with Gnu Zip compression.
++
++
++
+ # !SMP for now because the context switch early causes GPF in segment reloading
+ # and the GS base checking does the wrong thing then, causing a hang.
+ config CHECKING
+Index: linux-2.6.10/arch/x86_64/mm/init.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/mm/init.c 2005-04-05 16:29:30.040023896 +0800
++++ linux-2.6.10/arch/x86_64/mm/init.c 2005-04-05 16:47:53.916209208 +0800
+@@ -378,7 +378,7 @@
+ __flush_tlb_all();
+ }
+
+-static inline int page_is_ram (unsigned long pagenr)
++inline int page_is_ram (unsigned long pagenr)
+ {
+ int i;
+
+Index: linux-2.6.10/arch/x86_64/kernel/traps.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/kernel/traps.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/x86_64/kernel/traps.c 2005-04-05 16:47:53.915209360 +0800
+@@ -27,6 +27,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
+ #include <linux/module.h>
++#include <linux/dump.h>
+ #include <linux/moduleparam.h>
+
+ #include <asm/system.h>
+@@ -369,6 +370,7 @@
+ printk("\n");
+ notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
+ show_registers(regs);
++ dump((char *)str, regs);
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->rip);
+Index: linux-2.6.10/arch/x86_64/kernel/setup.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/kernel/setup.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/arch/x86_64/kernel/setup.c 2005-04-05 16:47:53.911209968 +0800
+@@ -221,6 +221,8 @@
+ }
+ }
+
++unsigned long crashdump_addr = 0xdeadbeef;
++
+ static __init void parse_cmdline_early (char ** cmdline_p)
+ {
+ char c = ' ', *to = command_line, *from = COMMAND_LINE;
+@@ -311,6 +313,9 @@
+
+ if (!memcmp(from,"oops=panic", 10))
+ panic_on_oops = 1;
++
++ if (c == ' ' && !memcmp(from, "crashdump=", 10))
++ crashdump_addr = memparse(from+10, &from);
+
+ next_char:
+ c = *(from++);
+@@ -441,6 +446,10 @@
+ reserve_bootmem_generic(addr, PAGE_SIZE);
+ }
+
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++extern void crashdump_reserve(void);
++#endif
++
+ void __init setup_arch(char **cmdline_p)
+ {
+ unsigned long low_mem_size;
+@@ -550,6 +559,9 @@
+ }
+ #endif
+ paging_init();
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++ crashdump_reserve(); /* Preserve crash dump state from prev boot */
++#endif
+
+ check_ioapic();
+ #ifdef CONFIG_ACPI_BOOT
+Index: linux-2.6.10/arch/x86_64/kernel/smp.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/kernel/smp.c 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/x86_64/kernel/smp.c 2005-04-05 16:47:53.915209360 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/mc146818rtc.h>
+ #include <linux/interrupt.h>
++#include <linux/dump.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/pgalloc.h>
+@@ -151,6 +152,13 @@
+ if (!mm)
+ BUG();
+
++ if (vector == DUMP_VECTOR) {
++ /*
++ * Setup DUMP IPI to be delivered as an NMI
++ */
++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI;
++ }
++
+ /*
+ * I'm not happy about this global shared spinlock in the
+ * MM hot path, but we'll see how contended it is.
+@@ -253,6 +261,13 @@
+ send_IPI_allbutself(KDB_VECTOR);
+ }
+
++
++/* unlike the ppc64 dump_send_ipi(), this variant takes no callback argument */
++void dump_send_ipi(void)
++{
++ send_IPI_allbutself(DUMP_VECTOR);
++}
++
+ /*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+@@ -340,6 +355,18 @@
+ return 0;
+ }
+
++void stop_this_cpu(void* dummy)
++{
++ /*
++ * Remove this CPU:
++ */
++ cpu_clear(smp_processor_id(), cpu_online_map);
++ local_irq_disable();
++ disable_local_APIC();
++ for (;;)
++ asm("hlt");
++}
++
+ void smp_stop_cpu(void)
+ {
+ /*
+Index: linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/kernel/x8664_ksyms.c 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c 2005-04-05 16:47:53.914209512 +0800
+@@ -32,6 +32,7 @@
+ #include <asm/unistd.h>
+ #include <asm/delay.h>
+ #include <asm/tlbflush.h>
++#include <asm/e820.h>
+ #include <asm/kdebug.h>
+
+ extern spinlock_t rtc_lock;
+@@ -216,6 +217,20 @@
+ extern unsigned long __supported_pte_mask;
+ EXPORT_SYMBOL(__supported_pte_mask);
+
++#ifdef CONFIG_CRASH_DUMP_MODULE
++#ifdef CONFIG_SMP
++extern irq_desc_t irq_desc[NR_IRQS];
++extern cpumask_t irq_affinity[NR_IRQS];
++extern void stop_this_cpu(void *);
++EXPORT_SYMBOL(irq_desc);
++EXPORT_SYMBOL(irq_affinity);
++EXPORT_SYMBOL(dump_send_ipi);
++EXPORT_SYMBOL(stop_this_cpu);
++#endif
++extern int page_is_ram(unsigned long);
++EXPORT_SYMBOL(page_is_ram);
++#endif
++
+ #ifdef CONFIG_SMP
+ EXPORT_SYMBOL(flush_tlb_page);
+ EXPORT_SYMBOL_GPL(flush_tlb_all);
+Index: linux-2.6.10/arch/x86_64/kernel/pci-gart.c
+===================================================================
+--- linux-2.6.10.orig/arch/x86_64/kernel/pci-gart.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/x86_64/kernel/pci-gart.c 2005-04-05 16:47:53.913209664 +0800
+@@ -34,7 +34,7 @@
+ dma_addr_t bad_dma_address;
+
+ unsigned long iommu_bus_base; /* GART remapping area (physical) */
+-static unsigned long iommu_size; /* size of remapping area bytes */
++unsigned long iommu_size; /* size of remapping area bytes */
+ static unsigned long iommu_pages; /* .. and in pages */
+
+ u32 *iommu_gatt_base; /* Remapping table */
+Index: linux-2.6.10/init/version.c
+===================================================================
+--- linux-2.6.10.orig/init/version.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/init/version.c 2005-04-05 16:47:53.896212248 +0800
+@@ -11,6 +11,7 @@
+ #include <linux/uts.h>
+ #include <linux/utsname.h>
+ #include <linux/version.h>
++#include <linux/stringify.h>
+
+ #define version(a) Version_ ## a
+ #define version_string(a) version(a)
+@@ -31,3 +32,6 @@
+ const char *linux_banner =
+ "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n";
++
++const char *LINUX_COMPILE_VERSION_ID = __stringify(LINUX_COMPILE_VERSION_ID);
++LINUX_COMPILE_VERSION_ID_TYPE;
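The construct above relies on <linux/stringify.h> expanding its argument before
stringizing it. Assuming scripts/mkcompile_h defines LINUX_COMPILE_VERSION_ID as
an identifier encoding the build (the value below is hypothetical), the expansion
works like this:

#define __stringify_1(x) #x
#define __stringify(x)   __stringify_1(x)	/* expand, then stringize */

/* hypothetical output of scripts/mkcompile_h: */
#define LINUX_COMPILE_VERSION_ID __linux_compile_version_id_2_6_10

/* __stringify(LINUX_COMPILE_VERSION_ID)
 *   -> __stringify_1(__linux_compile_version_id_2_6_10)
 *   -> "__linux_compile_version_id_2_6_10"
 * while LINUX_COMPILE_VERSION_ID_TYPE; declares a variable whose name
 * encodes the same identifier, letting lcrash match a dump against the
 * kerntypes file built for the same kernel. */
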
+Index: linux-2.6.10/init/kerntypes.c
+===================================================================
+--- linux-2.6.10.orig/init/kerntypes.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/init/kerntypes.c 2005-04-05 16:47:53.895212400 +0800
+@@ -0,0 +1,40 @@
++/*
++ * kerntypes.c
++ *
++ * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and
++ * Matt D. Robinson (yakker@alacritech.com)
++ *
++ * Dummy module that includes headers for all kernel types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under version 2 of the GNU GPL.
++ */
++
++#include <linux/compile.h>
++#include <linux/module.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <linux/config.h>
++#include <linux/utsname.h>
++#include <linux/kernel_stat.h>
++#include <linux/dump.h>
++
++#include <asm/kerntypes.h>
++
++#ifdef LINUX_COMPILE_VERSION_ID_TYPE
++/* Define version type for version validation of dump and kerntypes */
++LINUX_COMPILE_VERSION_ID_TYPE;
++#endif
++#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP)
++extern struct runqueue runqueues;
++struct runqueue rn;
++#endif
++
++struct new_utsname *p;
++void
++kerntypes_dummy(void)
++{
++}
+Index: linux-2.6.10/init/main.c
+===================================================================
+--- linux-2.6.10.orig/init/main.c 2005-04-05 16:29:30.028025720 +0800
++++ linux-2.6.10/init/main.c 2005-04-05 16:47:53.897212096 +0800
+@@ -109,6 +109,16 @@
+ EXPORT_SYMBOL(system_state);
+
+ /*
++ * The kernel_magic value represents the address of _end, which allows
++ * namelist tools to cross-check one another. That way a tool
++ * that looks at /dev/mem can verify that it is using the right System.map
++ * file -- if kernel_magic doesn't equal the namelist value of _end,
++ * something's wrong.
++ */
++extern unsigned long _end;
++unsigned long *kernel_magic = &_end;
++
++/*
+ * Boot command-line arguments
+ */
+ #define MAX_INIT_ARGS 32
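A sketch of the consistency check the comment above describes. Everything here
is an assumption rather than part of the patch: i386 with PAGE_OFFSET 0xc0000000,
System.map in the current directory, and root access to /dev/mem; the helper
names are made up for the example.

#define _XOPEN_SOURCE 500
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define PAGE_OFFSET 0xc0000000UL

/* look up a symbol's address in System.map ("addr type name" lines) */
static unsigned long lookup(const char *map, const char *sym)
{
	FILE *f = fopen(map, "r");
	char type, name[128];
	unsigned long addr;

	if (!f)
		exit(1);
	while (fscanf(f, "%lx %c %127s", &addr, &type, name) == 3)
		if (strcmp(name, sym) == 0) {
			fclose(f);
			return addr;
		}
	fclose(f);
	exit(1);
}

int main(void)
{
	unsigned long magic_addr = lookup("System.map", "kernel_magic");
	unsigned long end_addr = lookup("System.map", "_end");
	unsigned long magic;
	int fd = open("/dev/mem", O_RDONLY);

	if (fd < 0)
		return 1;
	/* read the pointer the running kernel stored in kernel_magic */
	if (pread(fd, &magic, sizeof(magic), magic_addr - PAGE_OFFSET)
	    != sizeof(magic))
		return 1;
	printf("System.map %s the running kernel\n",
	       magic == end_addr ? "matches" : "does NOT match");
	return 0;
}
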
+Index: linux-2.6.10/init/Makefile
+===================================================================
+--- linux-2.6.10.orig/init/Makefile 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/init/Makefile 2005-04-05 16:47:53.897212096 +0800
+@@ -9,12 +9,20 @@
+ mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
+ mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
+
++extra-$(CONFIG_KERNTYPES) += kerntypes.o
++#For IA64, compile kerntypes in dwarf-2 format.
++ifeq ($(CONFIG_IA64),y)
++CFLAGS_kerntypes.o := -gdwarf-2
++else
++CFLAGS_kerntypes.o := -gstabs
++endif
++
+ # files to be removed upon make clean
+ clean-files := ../include/linux/compile.h
+
+ # dependencies on generated files need to be listed explicitly
+
+-$(obj)/version.o: include/linux/compile.h
++$(obj)/version.o $(obj)/kerntypes.o: include/linux/compile.h
+
+ # compile.h changes depending on hostname, generation number, etc,
+ # so we regenerate it always.
+@@ -24,3 +32,4 @@
+ include/linux/compile.h: FORCE
+ @echo ' CHK $@'
+ @$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)"
++
+Index: linux-2.6.10/include/asm-um/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-um/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-um/kerntypes.h 2005-04-05 16:47:53.864217112 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-um/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* Usermode-Linux-specific header files */
++#ifndef _UM_KERNTYPES_H
++#define _UM_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _UM_KERNTYPES_H */
+Index: linux-2.6.10/include/linux/sysctl.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sysctl.h 2005-04-05 16:29:27.969338688 +0800
++++ linux-2.6.10/include/linux/sysctl.h 2005-04-05 16:47:53.894212552 +0800
+@@ -135,6 +135,7 @@
+ KERN_HZ_TIMER=65, /* int: hz timer on or off */
+ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
+ KERN_SETUID_DUMPABLE=67, /* int: behaviour of dumps for setuid core */
++ KERN_DUMP=68, /* directory: dump parameters */
+ };
+
+
+Index: linux-2.6.10/include/linux/sched.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:47:05.178618448 +0800
++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:47:53.891213008 +0800
+@@ -94,6 +94,7 @@
+ extern int nr_threads;
+ extern int last_pid;
+ DECLARE_PER_CPU(unsigned long, process_counts);
++DECLARE_PER_CPU(struct runqueue, runqueues);
+ extern int nr_processes(void);
+ extern unsigned long nr_running(void);
+ extern unsigned long nr_uninterruptible(void);
+@@ -760,6 +761,110 @@
+ void yield(void);
+
+ /*
++ * These are the runqueue data structures:
++ */
++
++#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
++
++typedef struct runqueue runqueue_t;
++
++struct prio_array {
++ unsigned int nr_active;
++ unsigned long bitmap[BITMAP_SIZE];
++ struct list_head queue[MAX_PRIO];
++};
++
++/*
++ * This is the main, per-CPU runqueue data structure.
++ *
++ * Locking rule: those places that want to lock multiple runqueues
++ * (such as the load balancing or the thread migration code), lock
++ * acquire operations must be ordered by ascending &runqueue.
++ */
++struct runqueue {
++ spinlock_t lock;
++
++ /*
++ * nr_running and cpu_load should be in the same cacheline because
++ * remote CPUs use both these fields when doing load calculation.
++ */
++ unsigned long nr_running;
++#ifdef CONFIG_SMP
++ unsigned long cpu_load;
++#endif
++ unsigned long long nr_switches;
++
++ /*
++ * This is part of a global counter where only the total sum
++ * over all CPUs matters. A task can increase this counter on
++ * one CPU and if it got migrated afterwards it may decrease
++ * it on another CPU. Always updated under the runqueue lock:
++ */
++ unsigned long nr_uninterruptible;
++
++ unsigned long expired_timestamp;
++ unsigned long long timestamp_last_tick;
++ task_t *curr, *idle;
++ struct mm_struct *prev_mm;
++ prio_array_t *active, *expired, arrays[2];
++ int best_expired_prio;
++ atomic_t nr_iowait;
++
++#ifdef CONFIG_SMP
++ struct sched_domain *sd;
++
++ /* For active balancing */
++ int active_balance;
++ int push_cpu;
++
++ task_t *migration_thread;
++ struct list_head migration_queue;
++#endif
++
++#ifdef CONFIG_SCHEDSTATS
++ /* latency stats */
++ struct sched_info rq_sched_info;
++
++ /* sys_sched_yield() stats */
++ unsigned long yld_exp_empty;
++ unsigned long yld_act_empty;
++ unsigned long yld_both_empty;
++ unsigned long yld_cnt;
++
++ /* schedule() stats */
++ unsigned long sched_noswitch;
++ unsigned long sched_switch;
++ unsigned long sched_cnt;
++ unsigned long sched_goidle;
++
++ /* pull_task() stats */
++ unsigned long pt_gained[MAX_IDLE_TYPES];
++ unsigned long pt_lost[MAX_IDLE_TYPES];
++
++ /* active_load_balance() stats */
++ unsigned long alb_cnt;
++ unsigned long alb_lost;
++ unsigned long alb_gained;
++ unsigned long alb_failed;
++
++ /* try_to_wake_up() stats */
++ unsigned long ttwu_cnt;
++ unsigned long ttwu_attempts;
++ unsigned long ttwu_moved;
++
++ /* wake_up_new_task() stats */
++ unsigned long wunt_cnt;
++ unsigned long wunt_moved;
++
++ /* sched_migrate_task() stats */
++ unsigned long smt_cnt;
++
++ /* sched_balance_exec() stats */
++ unsigned long sbe_cnt;
++#endif
++};
++
++/*
+ * The default (Linux) execution domain.
+ */
+ extern struct exec_domain default_exec_domain;
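Exposing struct runqueue (normally private to kernel/sched.c) together with the
DECLARE_PER_CPU above gives the dump/kerntypes machinery a complete type for the
per-CPU runqueues. A sketch of what that enables in a dump module, using the
2.6-era per-CPU API (the function itself is hypothetical):

#include <linux/sched.h>
#include <linux/percpu.h>

static unsigned long tasks_running_on(int cpu)
{
	struct runqueue *rq = &per_cpu(runqueues, cpu);

	return rq->nr_running;
}
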
+Index: linux-2.6.10/include/linux/miscdevice.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/miscdevice.h 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/include/linux/miscdevice.h 2005-04-05 16:47:53.893212704 +0800
+@@ -25,6 +25,7 @@
+ #define MICROCODE_MINOR 184
+ #define MWAVE_MINOR 219 /* ACP/Mwave Modem */
+ #define MPT_MINOR 220
++#define CRASH_DUMP_MINOR 230 /* LKCD */
+ #define MISC_DYNAMIC_MINOR 255
+
+ #define TUN_MINOR 200
+Index: linux-2.6.10/include/linux/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dump.h 2005-04-05 16:47:53.893212704 +0800
+@@ -0,0 +1,406 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ *
++ * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net)
++ * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved.
++ *
++ * Most of this is the same old stuff from vmdump.h, except now we're
++ * actually a stand-alone driver plugged into the block layer interface,
++ * with the exception that we now allow for compression modes externally
++ * loaded (e.g., someone can come up with their own).
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file includes all structure definitions for crash dumps. */
++#ifndef _DUMP_H
++#define _DUMP_H
++
++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE)
++
++#include <linux/list.h>
++#include <linux/notifier.h>
++#include <linux/dumpdev.h>
++#include <asm/ioctl.h>
++
++/*
++ * Predefine default DUMP_PAGE constants, asm header may override.
++ *
++ * On ia64 discontinuous memory systems it's possible for the memory
++ * banks to stop at 2**12 page alignments, the smallest possible page
++ * size. But the system page size, PAGE_SIZE, is in fact larger.
++ */
++#define DUMP_PAGE_SHIFT PAGE_SHIFT
++#define DUMP_PAGE_MASK PAGE_MASK
++#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr)
++
++/*
++ * The dump offset was changed from 4KB to 64KB to support multiple
++ * kernel page sizes (PAGE_SIZE). The assumption is that 64KB is the
++ * largest page size supported.
++ */
++
++#define DUMP_HEADER_OFFSET (1ULL << 16)
++
++#define OLDMINORBITS 8
++#define OLDMINORMASK ((1U << OLDMINORBITS) -1)
++
++/* DUMP_PAGE_SIZE is set equal to PAGE_SIZE to support dumping on
++ * architectures whose page size (PAGE_SIZE) is greater than 4KB.
++ * Open question: does this affect ia64 discontiguous memory systems?
++ */
++#define DUMP_PAGE_SIZE PAGE_SIZE
++
++/* thread_info lies at the bottom of the stack (except on IA64). */
++#define STACK_START_POSITION(tsk) (tsk->thread_info)
++/*
++ * Predefined default memcpy() to use when copying memory to the dump buffer.
++ *
++ * On ia64 there is a heads-up function that can be called to let the PROM
++ * machine-check monitor know that the current activity is risky and that it
++ * should ignore the fault (nofault). In this case the ia64 header redefines
++ * this macro to __dump_memcpy() and uses its arch-specific version.
++ */
++#define DUMP_memcpy memcpy
++#define bzero(a,b) memset(a, 0, b)
++
++/* necessary header files */
++#include <asm/dump.h> /* for architecture-specific header */
++
++/*
++ * Size of the buffer that's used to hold:
++ *
++ * 1. the dump header (padded to fill the complete buffer)
++ * 2. the possibly compressed page headers and data
++ *
++ * = 256k for page size >= 64k
++ * = 64k for page size < 64k
++ */
++#if (PAGE_SHIFT >= 16)
++#define DUMP_BUFFER_SIZE (256 * 1024) /* size of dump buffer */
++#else
++#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */
++#endif
++
++#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE
++
++/* standard header definitions */
++#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */
++#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */
++#define DUMP_VERSION_NUMBER 0x8 /* dump version number */
++#define DUMP_PANIC_LEN 0x100 /* dump panic string length */
++
++/* dump levels - type specific stuff added later -- add as necessary */
++#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */
++#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */
++#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */
++#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */
++#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */
++#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */
++
++
++/* dump compression options -- add as necessary */
++#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */
++#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */
++#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */
++
++/* dump flags - any dump-type specific flags -- add as necessary */
++#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */
++#define DUMP_FLAGS_SOFTBOOT 0x2 /* 2 stage soft-boot based dump */
++#define DUMP_FLAGS_NONDISRUPT 0x1 /* non-disruptive dumping */
++
++#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */
++#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */
++#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */
++
++/* dump header flags -- add as necessary */
++#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) */
++#define DUMP_DH_RAW 0x1 /* raw page (no compression) */
++#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */
++#define DUMP_DH_END 0x4 /* end marker on a full dump */
++#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */
++#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */
++#define DUMP_DH_NOT_USED 0x20 /* 1st bit not used in flags */
++
++/* names for various dump parameters in /proc/sys/dump */
++#define DUMP_ROOT_NAME "sys/dump"
++#define DUMP_DEVICE_NAME "device"
++#define DUMP_COMPRESS_NAME "compress"
++#define DUMP_LEVEL_NAME "level"
++#define DUMP_FLAGS_NAME "flags"
++#define DUMP_ADDR_NAME "addr"
++
++#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */
++
++/* CTL_DUMP names: */
++enum
++{
++ CTL_DUMP_DEVICE=1,
++ CTL_DUMP_COMPRESS=2,
++ CTL_DUMP_LEVEL=3,
++ CTL_DUMP_FLAGS=4,
++ CTL_DUMP_ADDR=5,
++ CTL_DUMP_TEST=6,
++};
++
++
++/* buffer size for gzip compression -- slightly larger than the hardware DUMP_PAGE_SIZE to allow for incompressible data */
++#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512)
++
++/* dump ioctl() control options */
++#define DIOSDUMPDEV _IOW('p', 0xA0, unsigned int) /* set the dump device */
++#define DIOGDUMPDEV _IOR('p', 0xA1, unsigned int) /* get the dump device */
++#define DIOSDUMPLEVEL _IOW('p', 0xA2, unsigned int) /* set the dump level */
++#define DIOGDUMPLEVEL _IOR('p', 0xA3, unsigned int) /* get the dump level */
++#define DIOSDUMPFLAGS _IOW('p', 0xA4, unsigned int) /* set the dump flag parameters */
++#define DIOGDUMPFLAGS _IOR('p', 0xA5, unsigned int) /* get the dump flag parameters */
++#define DIOSDUMPCOMPRESS _IOW('p', 0xA6, unsigned int) /* set the dump compress level */
++#define DIOGDUMPCOMPRESS _IOR('p', 0xA7, unsigned int) /* get the dump compress level */
++
++/* these ioctls are used only by netdump module */
++#define DIOSTARGETIP _IOW('p', 0xA8, unsigned int) /* set the target m/c's ip */
++#define DIOGTARGETIP _IOR('p', 0xA9, unsigned int) /* get the target m/c's ip */
++#define DIOSTARGETPORT _IOW('p', 0xAA, unsigned int) /* set the target m/c's port */
++#define DIOGTARGETPORT _IOR('p', 0xAB, unsigned int) /* get the target m/c's port */
++#define DIOSSOURCEPORT _IOW('p', 0xAC, unsigned int) /* set the source m/c's port */
++#define DIOGSOURCEPORT _IOR('p', 0xAD, unsigned int) /* get the source m/c's port */
++#define DIOSETHADDR _IOW('p', 0xAE, unsigned int) /* set ethernet address */
++#define DIOGETHADDR _IOR('p', 0xAF, unsigned int) /* get ethernet address */
++#define DIOGDUMPOKAY _IOR('p', 0xB0, unsigned int) /* check if dump is configured */
++#define DIOSDUMPTAKE _IOW('p', 0xB1, unsigned int) /* Take a manual dump */
++
++/*
++ * Structure: __dump_header
++ * Function: This is the header dumped at the top of every valid crash
++ * dump.
++ */
++struct __dump_header {
++ /* the dump magic number -- unique to verify dump is valid */
++ u64 dh_magic_number;
++
++ /* the version number of this dump */
++ u32 dh_version;
++
++ /* the size of this header (in case we can't read it) */
++ u32 dh_header_size;
++
++ /* the level of this dump (just a header?) */
++ u32 dh_dump_level;
++
++ /*
++ * We assume dump_page_size to be 4K in every case.
++ * Store here the configurable system page size (4K, 8K, 16K, etc.)
++ */
++ u32 dh_page_size;
++
++ /* the size of all physical memory */
++ u64 dh_memory_size;
++
++ /* the start of physical memory */
++ u64 dh_memory_start;
++
++ /* the end of physical memory */
++ u64 dh_memory_end;
++
++ /* the number of hardware/physical pages in this dump specifically */
++ u32 dh_num_dump_pages;
++
++ /* the panic string, if available */
++ char dh_panic_string[DUMP_PANIC_LEN];
++
++ /* timeval depends on architecture, two long values */
++ struct {
++ u64 tv_sec;
++ u64 tv_usec;
++ } dh_time; /* the time of the system crash */
++
++ /* the NEW utsname (uname) information -- in character form */
++ /* we do this so we don't have to include utsname.h */
++ /* plus it helps us be more architecture independent */
++ /* now maybe one day soon they'll make the [65] a #define! */
++ char dh_utsname_sysname[65];
++ char dh_utsname_nodename[65];
++ char dh_utsname_release[65];
++ char dh_utsname_version[65];
++ char dh_utsname_machine[65];
++ char dh_utsname_domainname[65];
++
++ /* the address of current task (OLD = void *, NEW = u64) */
++ u64 dh_current_task;
++
++ /* what type of compression we're using in this dump (if any) */
++ u32 dh_dump_compress;
++
++ /* any additional flags */
++ u32 dh_dump_flags;
++
++ /* the dump device */
++ u32 dh_dump_device;
++} __attribute__((packed));
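
For illustration, a userspace tool can sanity-check a dump image by reading the leading fields of this header and comparing the magic number. The sketch below assumes the header sits at the start of the image and is read on a machine with the same endianness; struct dump_header_prefix is a hypothetical local mirror of the fields above, not an LKCD-provided type.

/*
 * Hedged sketch: validate a dump image header from userspace.
 * Assumes the header sits at the start of the image; the struct
 * mirrors the leading fields of the packed __dump_header above.
 */
#include <stdio.h>
#include <stdint.h>

#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL

struct dump_header_prefix {		/* hypothetical local mirror */
	uint64_t dh_magic_number;
	uint32_t dh_version;
	uint32_t dh_header_size;
	uint32_t dh_dump_level;
	uint32_t dh_page_size;
} __attribute__((packed));

int main(int argc, char **argv)
{
	struct dump_header_prefix h;
	FILE *f = fopen(argc > 1 ? argv[1] : "dump.img", "rb");

	if (!f || fread(&h, sizeof(h), 1, f) != 1) {
		perror("read header");
		return 1;
	}
	if (h.dh_magic_number != DUMP_MAGIC_NUMBER) {
		fprintf(stderr, "bad magic\n");
		return 1;
	}
	printf("dump v%u, level 0x%x, page size %u\n",
	       h.dh_version, h.dh_dump_level, h.dh_page_size);
	fclose(f);
	return 0;
}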
++
++/*
++ * Structure: __dump_page
++ * Function: To act as the header associated with each physical page of
++ * memory saved in the system crash dump. This allows for
++ * easy reassembly of each crash dump page. The address bits
++ * are split to make things easier for 64-bit/32-bit system
++ * conversions.
++ *
++ * The dp_address and dp_size fields are landmarks that are helpful when
++ * looking at a hex dump of /dev/vmdump.
++ */
++struct __dump_page {
++ /* the address of this dump page */
++ u64 dp_address;
++
++ /* the size of this dump page */
++ u32 dp_size;
++
++ /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */
++ u32 dp_flags;
++} __attribute__((packed));
++
++/*
++ * Structure: __lkcdinfo
++ * Function: This structure contains information needed for the lkcdutils
++ * package (particularly lcrash) to determine what information is
++ * associated to this kernel, specifically.
++ */
++struct __lkcdinfo {
++ int arch;
++ int ptrsz;
++ int byte_order;
++ int linux_release;
++ int page_shift;
++ int page_size;
++ u64 page_mask;
++ u64 page_offset;
++ int stack_offset;
++};
++
++#ifdef __KERNEL__
++
++/*
++ * Structure: __dump_compress
++ * Function: This is what an individual compression mechanism can use
++ * to plug in their own compression techniques. It's always
++ * best to build these as individual modules so that people
++ * can put in whatever they want.
++ */
++struct __dump_compress {
++ /* the list_head structure for list storage */
++ struct list_head list;
++
++ /* the type of compression to use (DUMP_COMPRESS_XXX) */
++ int compress_type;
++ const char *compress_name;
++
++ /* the compression function to call */
++ u32 (*compress_func)(const u8 *, u32, u8 *, u32, unsigned long);
++};
++
++/* functions for dump compression registration */
++extern void dump_register_compression(struct __dump_compress *);
++extern void dump_unregister_compression(int);
++
++/*
++ * Structure dump_mbank[]:
++ *
++ * For CONFIG_DISCONTIGMEM systems this array specifies the
++ * memory banks/chunks that need to be dumped after a panic.
++ *
++ * For classic systems it specifies a single set of pages from
++ * 0 to max_mapnr.
++ */
++struct __dump_mbank {
++ u64 start;
++ u64 end;
++ int type;
++ int pad1;
++ long pad2;
++};
++
++#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1
++#define DUMP_MBANK_TYPE_OTHER 2
++
++#define MAXCHUNKS 256
++extern int dump_mbanks;
++extern struct __dump_mbank dump_mbank[MAXCHUNKS];
++
++/* notification event codes */
++#define DUMP_BEGIN 0x0001 /* dump beginning */
++#define DUMP_END 0x0002 /* dump ending */
++
++/* Scheduler soft spin control.
++ *
++ * 0 - no dump in progress
++ * 1 - cpu0 is dumping, ...
++ */
++extern unsigned long dump_oncpu;
++extern void dump_execute(const char *, const struct pt_regs *);
++
++/*
++ * Notifier list for kernel code which wants to be called
++ * at kernel dump.
++ */
++extern struct notifier_block *dump_notifier_list;
++static inline int register_dump_notifier(struct notifier_block *nb)
++{
++ return notifier_chain_register(&dump_notifier_list, nb);
++}
++static inline int unregister_dump_notifier(struct notifier_block * nb)
++{
++ return notifier_chain_unregister(&dump_notifier_list, nb);
++}
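
For illustration, a driver that needs to quiesce hardware around a crash dump might use these hooks as in the following sketch; the mydrv_* names and the callback body are hypothetical, not part of LKCD.

/* Hedged sketch: hypothetical driver quiescing DMA around a dump. */
#include <linux/notifier.h>

static int mydrv_dump_event(struct notifier_block *nb,
			    unsigned long event, void *unused)
{
	switch (event) {
	case DUMP_BEGIN:
		/* stop DMA so the dump sees a stable memory image */
		break;
	case DUMP_END:
		/* restart whatever was stopped above */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block mydrv_dump_nb = {
	.notifier_call = mydrv_dump_event,
};

/* at driver init time: register_dump_notifier(&mydrv_dump_nb); */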
++
++extern void (*dump_function_ptr)(const char *, const struct pt_regs *);
++static inline void dump(char * str, struct pt_regs * regs)
++{
++ if (dump_function_ptr)
++ dump_function_ptr(str, regs);
++}
++
++/*
++ * Common Arch Specific Functions should be declared here.
++ * This allows the C compiler to detect discrepancies.
++ */
++extern void __dump_open(void);
++extern void __dump_cleanup(void);
++extern void __dump_clean_irq_state(void);
++extern void __dump_init(u64);
++extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *);
++extern void __dump_save_context(int cpu, const struct pt_regs *, struct task_struct *tsk);
++extern int __dump_configure_header(const struct pt_regs *);
++extern int __dump_irq_enable(void);
++extern void __dump_irq_restore(void);
++extern int __dump_page_valid(unsigned long index);
++#ifdef CONFIG_SMP
++extern void __dump_save_other_cpus(void);
++#else
++#define __dump_save_other_cpus()
++#endif
++
++extern int manual_handle_crashdump(void);
++
++/* to track all used (compound + zero order) pages */
++#define PageInuse(p) (PageCompound(p) || page_count(p))
++
++#endif /* __KERNEL__ */
++
++#else /* !CONFIG_CRASH_DUMP */
++
++/* If not configured then make code disappear! */
++#define register_dump_watchdog(x) do { } while(0)
++#define unregister_dump_watchdog(x) do { } while(0)
++#define register_dump_notifier(x) do { } while(0)
++#define unregister_dump_notifier(x) do { } while(0)
++#define dump_in_progress() 0
++#define dump(x, y) do { } while(0)
++
++#endif /* !CONFIG_CRASH_DUMP */
++
++#endif /* _DUMP_H */
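
For illustration, the following userspace sketch configures the dump level and compression through the ioctls above. The /dev/dump node name is an assumption (the driver registers a misc device; naming depends on the system), and whether each handler takes its argument by value, as shown, or through a pointer is driver-defined; the constants are re-declared locally because the header guards them behind CONFIG_CRASH_DUMP.

/* Hedged sketch: userspace dump configuration via the DIOS* ioctls. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* mirrors of the definitions in linux/dump.h above */
#define DIOSDUMPLEVEL		_IOW('p', 0xA2, unsigned int)
#define DIOSDUMPCOMPRESS	_IOW('p', 0xA6, unsigned int)
#define DUMP_LEVEL_KERN		0x2
#define DUMP_COMPRESS_GZIP	0x2

int main(void)
{
	int fd = open("/dev/dump", O_RDWR);	/* node name is an assumption */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, DIOSDUMPLEVEL, DUMP_LEVEL_KERN) < 0)
		perror("DIOSDUMPLEVEL");
	if (ioctl(fd, DIOSDUMPCOMPRESS, DUMP_COMPRESS_GZIP) < 0)
		perror("DIOSDUMPCOMPRESS");
	close(fd);
	return 0;
}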
+Index: linux-2.6.10/include/linux/dumpdev.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dumpdev.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dumpdev.h 2005-04-05 16:47:53.890213160 +0800
+@@ -0,0 +1,163 @@
++/*
++ * Generic dump device interfaces for flexible system dump
++ * (Enables variation of dump target types e.g disk, network, memory)
++ *
++ * These interfaces have evolved based on discussions on lkcd-devel.
++ * Eventually the intent is to support primary and secondary or
++ * alternate targets registered at the same time, with scope for
++ * situation based failover or multiple dump devices used for parallel
++ * dump i/o.
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com)
++ *
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#ifndef _LINUX_DUMPDEV_H
++#define _LINUX_DUMPDEV_H
++
++#include <linux/kernel.h>
++#include <linux/wait.h>
++#include <linux/netpoll.h>
++#include <linux/bio.h>
++
++/* Determined by the dump target (device) type */
++
++struct dump_dev;
++
++struct dump_dev_ops {
++ int (*open)(struct dump_dev *, unsigned long); /* configure */
++ int (*release)(struct dump_dev *); /* unconfigure */
++ int (*silence)(struct dump_dev *); /* when dump starts */
++ int (*resume)(struct dump_dev *); /* when dump is over */
++ int (*seek)(struct dump_dev *, loff_t);
++ /* trigger a write (typically asynchronous) */
++ int (*write)(struct dump_dev *, void *, unsigned long);
++ /* not usually used during a dump, but the option is available */
++ int (*read)(struct dump_dev *, void *, unsigned long);
++ /* used to poll for completion */
++ int (*ready)(struct dump_dev *, void *);
++ int (*ioctl)(struct dump_dev *, unsigned int, unsigned long);
++};
++
++struct dump_dev {
++ char type_name[32]; /* block, net-poll etc */
++ unsigned long device_id; /* interpreted differently for various types */
++ struct dump_dev_ops *ops;
++ struct list_head list;
++ loff_t curr_offset;
++ struct netpoll np;
++};
++
++/*
++ * dump_dev type variations:
++ */
++
++/* block */
++struct dump_blockdev {
++ struct dump_dev ddev;
++ dev_t dev_id;
++ struct block_device *bdev;
++ struct bio *bio;
++ loff_t start_offset;
++ loff_t limit;
++ int err;
++};
++
++static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev)
++{
++ return container_of(dev, struct dump_blockdev, ddev);
++}
++
++
++/* mem - for internal use by soft-boot based dumper */
++struct dump_memdev {
++ struct dump_dev ddev;
++ unsigned long indirect_map_root;
++ unsigned long nr_free;
++ struct page *curr_page;
++ unsigned long *curr_map;
++ unsigned long curr_map_offset;
++ unsigned long last_offset;
++ unsigned long last_used_offset;
++ unsigned long last_bs_offset;
++};
++
++static inline struct dump_memdev *DUMP_MDEV(struct dump_dev *dev)
++{
++ return container_of(dev, struct dump_memdev, ddev);
++}
++
++/* Todo/future - meant for raw dedicated interfaces e.g. mini-ide driver */
++struct dump_rdev {
++ struct dump_dev ddev;
++ char name[32];
++ int (*reset)(struct dump_rdev *, unsigned int,
++ unsigned long);
++ /* ... to do ... */
++};
++
++/* just to get the size right when saving config across a soft-reboot */
++struct dump_anydev {
++ union {
++ struct dump_blockdev bddev;
++ /* .. add other types here .. */
++ };
++};
++
++
++
++/* Dump device / target operation wrappers */
++/* These assume that dump_dev is initialized to dump_config.dumper->dev */
++
++extern struct dump_dev *dump_dev;
++
++static inline int dump_dev_open(unsigned long arg)
++{
++ return dump_dev->ops->open(dump_dev, arg);
++}
++
++static inline int dump_dev_release(void)
++{
++ return dump_dev->ops->release(dump_dev);
++}
++
++static inline int dump_dev_silence(void)
++{
++ return dump_dev->ops->silence(dump_dev);
++}
++
++static inline int dump_dev_resume(void)
++{
++ return dump_dev->ops->resume(dump_dev);
++}
++
++static inline int dump_dev_seek(loff_t offset)
++{
++ return dump_dev->ops->seek(dump_dev, offset);
++}
++
++static inline int dump_dev_write(void *buf, unsigned long len)
++{
++ return dump_dev->ops->write(dump_dev, buf, len);
++}
++
++static inline int dump_dev_ready(void *buf)
++{
++ return dump_dev->ops->ready(dump_dev, buf);
++}
++
++static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg)
++{
++ if (!dump_dev || !dump_dev->ops->ioctl)
++ return -EINVAL;
++ return dump_dev->ops->ioctl(dump_dev, cmd, arg);
++}
++
++extern int dump_register_device(struct dump_dev *);
++extern void dump_unregister_device(struct dump_dev *);
++
++#endif /* _LINUX_DUMPDEV_H */
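
For illustration, a minimal dump target built on this interface might look like the sketch below: a hypothetical "null" device that simply discards writes. All operations that the wrappers above call unconditionally are provided; error handling and the optional read/ioctl methods are elided.

/* Hedged sketch: hypothetical "null" dump target, not part of LKCD. */
#include <linux/dumpdev.h>
#include <linux/init.h>
#include <linux/module.h>

static int null_open(struct dump_dev *dev, unsigned long arg) { return 0; }
static int null_release(struct dump_dev *dev) { return 0; }
static int null_silence(struct dump_dev *dev) { return 0; }
static int null_resume(struct dump_dev *dev) { return 0; }

static int null_seek(struct dump_dev *dev, loff_t off)
{
	dev->curr_offset = off;		/* track position locally */
	return 0;
}

static int null_write(struct dump_dev *dev, void *buf, unsigned long len)
{
	dev->curr_offset += len;	/* pretend the data went somewhere */
	return len;
}

static int null_ready(struct dump_dev *dev, void *buf)
{
	return 0;	/* always ready: writes "complete" synchronously */
}

static struct dump_dev_ops null_ops = {
	.open		= null_open,
	.release	= null_release,
	.silence	= null_silence,
	.resume		= null_resume,
	.seek		= null_seek,
	.write		= null_write,
	.ready		= null_ready,
};

static struct dump_dev null_dumpdev = {
	.type_name	= "null",
	.ops		= &null_ops,
};

static int __init null_dump_init(void)
{
	return dump_register_device(&null_dumpdev);
}
module_init(null_dump_init);
MODULE_LICENSE("GPL");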
+Index: linux-2.6.10/include/linux/dump_netdev.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dump_netdev.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dump_netdev.h 2005-04-05 16:47:53.889213312 +0800
+@@ -0,0 +1,80 @@
++/*
++ * linux/drivers/net/netconsole.h
++ *
++ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
++ *
++ * This file contains the interface of an IRQ-safe, crash-safe
++ * kernel console implementation that outputs kernel messages to
++ * the network.
++ *
++ * Modification history:
++ *
++ * 2001-09-17 started by Ingo Molnar.
++ */
++
++/****************************************************************
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ *
++ ****************************************************************/
++
++#define NETCONSOLE_VERSION 0x03
++
++enum netdump_commands {
++ COMM_NONE = 0,
++ COMM_SEND_MEM = 1,
++ COMM_EXIT = 2,
++ COMM_REBOOT = 3,
++ COMM_HELLO = 4,
++ COMM_GET_NR_PAGES = 5,
++ COMM_GET_PAGE_SIZE = 6,
++ COMM_START_NETDUMP_ACK = 7,
++ COMM_GET_REGS = 8,
++ COMM_GET_MAGIC = 9,
++ COMM_START_WRITE_NETDUMP_ACK = 10,
++};
++
++typedef struct netdump_req_s {
++ u64 magic;
++ u32 nr;
++ u32 command;
++ u32 from;
++ u32 to;
++} req_t;
++
++enum netdump_replies {
++ REPLY_NONE = 0,
++ REPLY_ERROR = 1,
++ REPLY_LOG = 2,
++ REPLY_MEM = 3,
++ REPLY_RESERVED = 4,
++ REPLY_HELLO = 5,
++ REPLY_NR_PAGES = 6,
++ REPLY_PAGE_SIZE = 7,
++ REPLY_START_NETDUMP = 8,
++ REPLY_END_NETDUMP = 9,
++ REPLY_REGS = 10,
++ REPLY_MAGIC = 11,
++ REPLY_START_WRITE_NETDUMP = 12,
++};
++
++typedef struct netdump_reply_s {
++ u32 nr;
++ u32 code;
++ u32 info;
++} reply_t;
++
++#define HEADER_LEN (1 + sizeof(reply_t))
++
++
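
For illustration, HEADER_LEN suggests that a reply frame is a one-byte prefix (presumably NETCONSOLE_VERSION) followed by the reply struct. The client-side sketch below decodes such a frame under that assumption; the framing and field endianness are assumptions, and nd_reply/parse_reply are hypothetical names.

/* Hedged sketch: decoding a netdump reply frame on the client side. */
#include <stdint.h>
#include <string.h>

struct nd_reply {		/* hypothetical userspace mirror of reply_t */
	uint32_t nr;
	uint32_t code;
	uint32_t info;
};

static int parse_reply(const uint8_t *buf, size_t len, struct nd_reply *r)
{
	if (len < 1 + sizeof(*r))
		return -1;
	if (buf[0] != 0x03)		/* NETCONSOLE_VERSION: an assumption */
		return -1;
	/* fields are taken as host-endian here: another assumption */
	memcpy(r, buf + 1, sizeof(*r));
	return 0;
}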
+Index: linux-2.6.10/include/asm-parisc/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-parisc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-parisc/kerntypes.h 2005-04-05 16:47:53.870216200 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-parisc/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* PA-RISC-specific header files */
++#ifndef _PARISC_KERNTYPES_H
++#define _PARISC_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _PARISC_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-h8300/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-h8300/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-h8300/kerntypes.h 2005-04-05 16:47:53.880214680 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-h8300/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* H8300-specific header files */
++#ifndef _H8300_KERNTYPES_H
++#define _H8300_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _H8300_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ppc/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ppc/kerntypes.h 2005-04-05 16:47:53.882214376 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-ppc/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* PowerPC-specific header files */
++#ifndef _PPC_KERNTYPES_H
++#define _PPC_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _PPC_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-alpha/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-alpha/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-alpha/kerntypes.h 2005-04-05 16:47:53.876215288 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-alpha/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* Alpha-specific header files */
++#ifndef _ALPHA_KERNTYPES_H
++#define _ALPHA_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _ALPHA_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-arm26/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-arm26/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-arm26/kerntypes.h 2005-04-05 16:47:53.865216960 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-arm26/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* ARM26-specific header files */
++#ifndef _ARM26_KERNTYPES_H
++#define _ARM26_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _ARM26_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-sh/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-sh/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-sh/kerntypes.h 2005-04-05 16:47:53.877215136 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-sh/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* Super-H-specific header files */
++#ifndef _SH_KERNTYPES_H
++#define _SH_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _SH_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ia64/nmi.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/nmi.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/nmi.h 2005-04-05 16:47:53.883214224 +0800
+@@ -0,0 +1,28 @@
++/*
++ * linux/include/asm-ia64/nmi.h
++ */
++#ifndef ASM_NMI_H
++#define ASM_NMI_H
++
++#include <linux/pm.h>
++
++struct pt_regs;
++
++typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
++
++/**
++ * set_nmi_callback
++ *
++ * Set a handler for an NMI. Only one handler may be
++ * set. The handler should return 1 if the NMI was handled.
++ */
++void set_nmi_callback(nmi_callback_t callback);
++
++/**
++ * unset_nmi_callback
++ *
++ * Remove the handler previously set.
++ */
++void unset_nmi_callback(void);
++
++#endif /* ASM_NMI_H */
+Index: linux-2.6.10/include/asm-ia64/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/dump.h 2005-04-05 16:47:53.884214072 +0800
+@@ -0,0 +1,201 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */
++#define DUMP_ASM_VERSION_NUMBER 0x4 /* version number */
++
++#ifdef __KERNEL__
++#include <linux/efi.h>
++#include <asm/pal.h>
++#include <asm/ptrace.h>
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern int (*dump_ipi_function_ptr)(struct pt_regs *);
++extern void dump_send_ipi(void);
++#else /* !CONFIG_SMP */
++#define dump_send_ipi() do { } while(0)
++#endif
++
++#else /* !__KERNEL__ */
++/* necessary header files */
++#include <asm/ptrace.h> /* for pt_regs */
++#include <linux/threads.h>
++#endif /* __KERNEL__ */
++
++/*
++ * mkswap.c calls getpagesize() to get the system page size,
++ * which is not necessarily the same as the hardware page size.
++ *
++ * For ia64 the kernel PAGE_SIZE can be configured from 4KB ... 16KB.
++ *
++ * Physical memory is laid out in hardware/minimal pages.
++ * This is the size we need to use for dumping physical pages.
++ *
++ * Note the hardware/minimal page size being used in
++ * arch/ia64/kernel/efi.c:efi_memmap_walk():
++ * curr.end = curr.start + (md->num_pages << 12);
++ *
++ * Since the system page size could change between the kernel we boot
++ * on and the kernel that caused the core dump, we may want to use something
++ * more constant, like the maximum system page size (see include/asm-ia64/page.h).
++ */
++/* IA64 manages the stack in a different manner from other architectures:
++ * the task_struct lies at the bottom of the stack.
++ */
++#undef STACK_START_POSITION
++#define STACK_START_POSITION(tsk) (tsk)
++#define DUMP_MIN_PAGE_SHIFT 12
++#define DUMP_MIN_PAGE_SIZE (1UL << DUMP_MIN_PAGE_SHIFT)
++#define DUMP_MIN_PAGE_MASK (~(DUMP_MIN_PAGE_SIZE - 1))
++#define DUMP_MIN_PAGE_ALIGN(addr) (((addr) + DUMP_MIN_PAGE_SIZE - 1) & DUMP_MIN_PAGE_MASK)
++
++#define DUMP_MAX_PAGE_SHIFT 16
++#define DUMP_MAX_PAGE_SIZE (1UL << DUMP_MAX_PAGE_SHIFT)
++#define DUMP_MAX_PAGE_MASK (~(DUMP_MAX_PAGE_SIZE - 1))
++#define DUMP_MAX_PAGE_ALIGN(addr) (((addr) + DUMP_MAX_PAGE_SIZE - 1) & DUMP_MAX_PAGE_MASK)
++
++#define DUMP_EF_PAGE_SHIFT DUMP_MIN_PAGE_SHIFT
++
++extern int _end, _start;
++
++/*
++ * Structure: dump_header_asm_t
++ * Function: This is the header for architecture-specific stuff. It
++ * follows right after the dump header.
++ */
++
++typedef struct __dump_header_asm {
++
++ /* the dump magic number -- unique to verify dump is valid */
++ uint64_t dha_magic_number;
++
++ /* the version number of this dump */
++ uint32_t dha_version;
++
++ /* the size of this header (in case we can't read it) */
++ uint32_t dha_header_size;
++
++ /* pointer to pt_regs (OLD: struct pt_regs *, NEW: uint64_t) */
++ uint64_t dha_pt_regs;
++
++ /* the dump registers */
++ struct pt_regs dha_regs;
++
++ /* the rnat register saved after flushrs */
++ uint64_t dha_rnat;
++
++ /* the pfs register saved after flushrs */
++ uint64_t dha_pfs;
++
++ /* the bspstore register saved after flushrs */
++ uint64_t dha_bspstore;
++
++ /* smp specific */
++ uint32_t dha_smp_num_cpus;
++ uint32_t dha_dumping_cpu;
++ struct pt_regs dha_smp_regs[NR_CPUS];
++ uint64_t dha_smp_current_task[NR_CPUS];
++ uint64_t dha_stack[NR_CPUS];
++ uint64_t dha_stack_ptr[NR_CPUS];
++
++} __attribute__((packed)) dump_header_asm_t;
++
++
++extern struct __dump_header_asm dump_header_asm;
++
++#ifdef __KERNEL__
++static inline void get_current_regs(struct pt_regs *regs)
++{
++ /*
++ * REMIND: Looking at functions/macros like:
++ * DO_SAVE_SWITCH_STACK
++ * ia64_switch_to()
++ * ia64_save_extra()
++ * switch_to()
++ * to implement this new feature that Matt seems to have added
++ * to panic.c; it seems all platforms are now expected to provide
++ * this function to dump the current registers into the pt_regs
++ * structure.
++ */
++ volatile unsigned long rsc_value; /* for storing the rsc value */
++ volatile unsigned long ic_value;
++
++ __asm__ __volatile__("mov %0=b6;;":"=r"(regs->b6));
++ __asm__ __volatile__("mov %0=b7;;":"=r"(regs->b7));
++
++ __asm__ __volatile__("mov %0=ar.csd;;":"=r"(regs->ar_csd));
++ __asm__ __volatile__("mov %0=ar.ssd;;":"=r"(regs->ar_ssd));
++ __asm__ __volatile__("mov %0=psr;;":"=r"(ic_value));
++ if (ic_value & 0x1000) /* within an interrupt */
++ {
++ __asm__ __volatile__("mov %0=cr.ipsr;;":"=r"(regs->cr_ipsr));
++ __asm__ __volatile__("mov %0=cr.iip;;":"=r"(regs->cr_iip));
++ __asm__ __volatile__("mov %0=cr.ifs;;":"=r"(regs->cr_ifs));
++ }
++ else
++ {
++ regs->cr_ipsr=regs->cr_iip=regs->cr_ifs=(unsigned long)-1;
++ }
++ __asm__ __volatile__("mov %0=ar.unat;;":"=r"(regs->ar_unat));
++ __asm__ __volatile__("mov %0=ar.pfs;;":"=r"(regs->ar_pfs));
++ __asm__ __volatile__("mov %0=ar.rsc;;":"=r"(rsc_value));
++ regs->ar_rsc = rsc_value;
++ /* loadrs is bits 16 through 29 of rsc */
++ regs->loadrs = (rsc_value >> 16) & (unsigned long)0x3fff;
++ /* set rsc.mode to 0 (rsc.mode is the last two bits of rsc) */
++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value & (unsigned long)(~3)));
++ __asm__ __volatile__("mov %0=ar.rnat;;":"=r"(regs->ar_rnat));
++ __asm__ __volatile__("mov %0=ar.bspstore;;":"=r"(regs->ar_bspstore));
++ /* copy the original value back */
++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value));
++ __asm__ __volatile__("mov %0=pr;;":"=r"(regs->pr));
++ __asm__ __volatile__("mov %0=ar.fpsr;;":"=r"(regs->ar_fpsr));
++ __asm__ __volatile__("mov %0=ar.ccv;;":"=r"(regs->ar_ccv));
++
++ __asm__ __volatile__("mov %0=r2;;":"=r"(regs->r2));
++ __asm__ __volatile__("mov %0=r3;;":"=r"(regs->r3));
++ __asm__ __volatile__("mov %0=r8;;":"=r"(regs->r8));
++ __asm__ __volatile__("mov %0=r9;;":"=r"(regs->r9));
++ __asm__ __volatile__("mov %0=r10;;":"=r"(regs->r10));
++ __asm__ __volatile__("mov %0=r11;;":"=r"(regs->r11));
++ __asm__ __volatile__("mov %0=r12;;":"=r"(regs->r12));
++ __asm__ __volatile__("mov %0=r13;;":"=r"(regs->r13));
++ __asm__ __volatile__("mov %0=r14;;":"=r"(regs->r14));
++ __asm__ __volatile__("mov %0=r15;;":"=r"(regs->r15));
++ __asm__ __volatile__("mov %0=r16;;":"=r"(regs->r16));
++ __asm__ __volatile__("mov %0=r17;;":"=r"(regs->r17));
++ __asm__ __volatile__("mov %0=r18;;":"=r"(regs->r18));
++ __asm__ __volatile__("mov %0=r19;;":"=r"(regs->r19));
++ __asm__ __volatile__("mov %0=r20;;":"=r"(regs->r20));
++ __asm__ __volatile__("mov %0=r21;;":"=r"(regs->r21));
++ __asm__ __volatile__("mov %0=r22;;":"=r"(regs->r22));
++ __asm__ __volatile__("mov %0=r23;;":"=r"(regs->r23));
++ __asm__ __volatile__("mov %0=r24;;":"=r"(regs->r24));
++ __asm__ __volatile__("mov %0=r25;;":"=r"(regs->r25));
++ __asm__ __volatile__("mov %0=r26;;":"=r"(regs->r26));
++ __asm__ __volatile__("mov %0=r27;;":"=r"(regs->r27));
++ __asm__ __volatile__("mov %0=r28;;":"=r"(regs->r28));
++ __asm__ __volatile__("mov %0=r29;;":"=r"(regs->r29));
++ __asm__ __volatile__("mov %0=r30;;":"=r"(regs->r30));
++ __asm__ __volatile__("mov %0=r31;;":"=r"(regs->r31));
++}
++
++/* Perhaps added to Common Arch Specific Functions and moved to dump.h some day */
++extern void * __dump_memcpy(void *, const void *, size_t);
++#endif /* __KERNEL__ */
++
++#endif /* _ASM_DUMP_H */
+Index: linux-2.6.10/include/asm-ia64/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/kerntypes.h 2005-04-05 16:47:53.884214072 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-ia64/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* IA64-specific header files */
++#ifndef _IA64_KERNTYPES_H
++#define _IA64_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _IA64_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ppc64/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc64/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ppc64/dump.h 2005-04-05 16:47:53.878214984 +0800
+@@ -0,0 +1,115 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Todd Inglett <tinglett@vnet.ibm.com>
++ *
++ * Copyright 2002 - 2004 International Business Machines
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* necessary header files */
++#include <asm/ptrace.h> /* for pt_regs */
++#include <asm/kmap_types.h>
++#include <linux/threads.h>
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */
++#define DUMP_ASM_VERSION_NUMBER 0x5 /* version number */
++
++/*
++ * Structure: __dump_header_asm
++ * Function: This is the header for architecture-specific stuff. It
++ * follows right after the dump header.
++ */
++struct __dump_header_asm {
++
++ /* the dump magic number -- unique to verify dump is valid */
++ uint64_t dha_magic_number;
++
++ /* the version number of this dump */
++ uint32_t dha_version;
++
++ /* the size of this header (in case we can't read it) */
++ uint32_t dha_header_size;
++
++ /* the dump registers */
++ struct pt_regs dha_regs;
++
++ /* smp specific */
++ uint32_t dha_smp_num_cpus;
++ int dha_dumping_cpu;
++ struct pt_regs dha_smp_regs[NR_CPUS];
++ uint64_t dha_smp_current_task[NR_CPUS];
++ uint64_t dha_stack[NR_CPUS];
++ uint64_t dha_stack_ptr[NR_CPUS];
++} __attribute__((packed));
++
++#ifdef __KERNEL__
++static inline void get_current_regs(struct pt_regs *regs)
++{
++ unsigned long tmp1, tmp2;
++
++ __asm__ __volatile__ (
++ "std 0,0(%2)\n"
++ "std 1,8(%2)\n"
++ "std 2,16(%2)\n"
++ "std 3,24(%2)\n"
++ "std 4,32(%2)\n"
++ "std 5,40(%2)\n"
++ "std 6,48(%2)\n"
++ "std 7,56(%2)\n"
++ "std 8,64(%2)\n"
++ "std 9,72(%2)\n"
++ "std 10,80(%2)\n"
++ "std 11,88(%2)\n"
++ "std 12,96(%2)\n"
++ "std 13,104(%2)\n"
++ "std 14,112(%2)\n"
++ "std 15,120(%2)\n"
++ "std 16,128(%2)\n"
++ "std 17,136(%2)\n"
++ "std 18,144(%2)\n"
++ "std 19,152(%2)\n"
++ "std 20,160(%2)\n"
++ "std 21,168(%2)\n"
++ "std 22,176(%2)\n"
++ "std 23,184(%2)\n"
++ "std 24,192(%2)\n"
++ "std 25,200(%2)\n"
++ "std 26,208(%2)\n"
++ "std 27,216(%2)\n"
++ "std 28,224(%2)\n"
++ "std 29,232(%2)\n"
++ "std 30,240(%2)\n"
++ "std 31,248(%2)\n"
++ "mfmsr %0\n"
++ "std %0, 264(%2)\n"
++ "mfctr %0\n"
++ "std %0, 280(%2)\n"
++ "mflr %0\n"
++ "std %0, 288(%2)\n"
++ "bl 1f\n"
++ "1: mflr %1\n"
++ "std %1, 256(%2)\n"
++ "mtlr %0\n"
++ "mfxer %0\n"
++ "std %0, 296(%2)\n"
++ : "=&r" (tmp1), "=&r" (tmp2)
++ : "b" (regs));
++}
++
++extern struct __dump_header_asm dump_header_asm;
++
++#ifdef CONFIG_SMP
++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *));
++#else
++#define dump_send_ipi() do { } while(0)
++#endif
++#endif /* __KERNEL__ */
++
++#endif /* _ASM_DUMP_H */
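
The inline assembly above hard-codes pt_regs field offsets (gpr[] at 0, nip at 256, msr at 264, ctr at 280, link at 288, xer at 296 on ppc64). As a sketch, a compile-time check like the following would catch layout drift, assuming BUILD_BUG_ON is available in this tree; dump_check_pt_regs_layout is a hypothetical helper, not part of the patch.

/* Hedged sketch: compile-time check of the offsets the asm relies on. */
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <asm/ptrace.h>

static inline void dump_check_pt_regs_layout(void)
{
	BUILD_BUG_ON(offsetof(struct pt_regs, gpr[0]) != 0);
	BUILD_BUG_ON(offsetof(struct pt_regs, nip) != 256);
	BUILD_BUG_ON(offsetof(struct pt_regs, msr) != 264);
	BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != 280);
	BUILD_BUG_ON(offsetof(struct pt_regs, link) != 288);
	BUILD_BUG_ON(offsetof(struct pt_regs, xer) != 296);
}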
+Index: linux-2.6.10/include/asm-ppc64/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ppc64/kerntypes.h 2005-04-05 16:47:53.879214832 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-ppc64/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* PPC64-specific header files */
++#ifndef _PPC64_KERNTYPES_H
++#define _PPC64_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _PPC64_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ppc64/kmap_types.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc64/kmap_types.h 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/include/asm-ppc64/kmap_types.h 2005-04-05 16:47:53.878214984 +0800
+@@ -16,7 +16,8 @@
+ KM_IRQ1,
+ KM_SOFTIRQ0,
+ KM_SOFTIRQ1,
+- KM_TYPE_NR
++ KM_DUMP,
++ KM_TYPE_NR
+ };
+
+ #endif
+Index: linux-2.6.10/include/asm-ppc64/smp.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ppc64/smp.h 2004-12-25 05:33:47.000000000 +0800
++++ linux-2.6.10/include/asm-ppc64/smp.h 2005-04-05 16:47:53.877215136 +0800
+@@ -36,7 +36,7 @@
+ extern void smp_send_debugger_break(int cpu);
+ struct pt_regs;
+ extern void smp_message_recv(int, struct pt_regs *);
+-
++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *));
+
+ #define smp_processor_id() (get_paca()->paca_index)
+ #define hard_smp_processor_id() (get_paca()->hw_cpu_id)
+Index: linux-2.6.10/include/asm-cris/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-cris/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-cris/kerntypes.h 2005-04-05 16:47:53.874215592 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-cris/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* CRIS-specific header files */
++#ifndef _CRIS_KERNTYPES_H
++#define _CRIS_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _CRIS_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-m68knommu/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-m68knommu/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-m68knommu/kerntypes.h 2005-04-05 16:47:53.870216200 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-m68knommu/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* m68k/no-MMU-specific header files */
++#ifndef _M68KNOMMU_KERNTYPES_H
++#define _M68KNOMMU_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _M68KNOMMU_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-v850/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-v850/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-v850/kerntypes.h 2005-04-05 16:47:53.888213464 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-v850/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* V850-specific header files */
++#ifndef _V850_KERNTYPES_H
++#define _V850_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _V850_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-x86_64/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-x86_64/dump.h 2005-04-05 16:47:53.868216504 +0800
+@@ -0,0 +1,93 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ * x86_64 lkcd port Sachin Sant ( sachinp@in.ibm.com)
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* necessary header files */
++#include <asm/ptrace.h> /* for pt_regs */
++#include <linux/threads.h>
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */
++#define DUMP_ASM_VERSION_NUMBER 0x2 /* version number */
++
++
++/*
++ * Structure: __dump_header_asm
++ * Function: This is the header for architecture-specific stuff. It
++ * follows right after the dump header.
++ */
++struct __dump_header_asm {
++
++ /* the dump magic number -- unique to verify dump is valid */
++ uint64_t dha_magic_number;
++
++ /* the version number of this dump */
++ uint32_t dha_version;
++
++ /* the size of this header (in case we can't read it) */
++ uint32_t dha_header_size;
++
++ /* the dump registers */
++ struct pt_regs dha_regs;
++
++ /* smp specific */
++ uint32_t dha_smp_num_cpus;
++ int dha_dumping_cpu;
++ struct pt_regs dha_smp_regs[NR_CPUS];
++ uint64_t dha_smp_current_task[NR_CPUS];
++ uint64_t dha_stack[NR_CPUS];
++ uint64_t dha_stack_ptr[NR_CPUS];
++} __attribute__((packed));
++
++#ifdef __KERNEL__
++static inline void get_current_regs(struct pt_regs *regs)
++{
++ unsigned seg;
++ __asm__ __volatile__("movq %%r15,%0" : "=m"(regs->r15));
++ __asm__ __volatile__("movq %%r14,%0" : "=m"(regs->r14));
++ __asm__ __volatile__("movq %%r13,%0" : "=m"(regs->r13));
++ __asm__ __volatile__("movq %%r12,%0" : "=m"(regs->r12));
++ __asm__ __volatile__("movq %%r11,%0" : "=m"(regs->r11));
++ __asm__ __volatile__("movq %%r10,%0" : "=m"(regs->r10));
++ __asm__ __volatile__("movq %%r9,%0" : "=m"(regs->r9));
++ __asm__ __volatile__("movq %%r8,%0" : "=m"(regs->r8));
++ __asm__ __volatile__("movq %%rbx,%0" : "=m"(regs->rbx));
++ __asm__ __volatile__("movq %%rcx,%0" : "=m"(regs->rcx));
++ __asm__ __volatile__("movq %%rdx,%0" : "=m"(regs->rdx));
++ __asm__ __volatile__("movq %%rsi,%0" : "=m"(regs->rsi));
++ __asm__ __volatile__("movq %%rdi,%0" : "=m"(regs->rdi));
++ __asm__ __volatile__("movq %%rbp,%0" : "=m"(regs->rbp));
++ __asm__ __volatile__("movq %%rax,%0" : "=m"(regs->rax));
++ __asm__ __volatile__("movq %%rsp,%0" : "=m"(regs->rsp));
++ __asm__ __volatile__("movl %%ss, %0" :"=r"(seg));
++ regs->ss = (unsigned long)seg;
++ __asm__ __volatile__("movl %%cs, %0" :"=r"(seg));
++ regs->cs = (unsigned long)seg;
++ __asm__ __volatile__("pushfq; popq %0" :"=m"(regs->eflags));
++ regs->rip = (unsigned long)current_text_addr();
++}
++
++extern volatile int dump_in_progress;
++extern struct __dump_header_asm dump_header_asm;
++
++#ifdef CONFIG_SMP
++extern void dump_send_ipi(void);
++#else
++#define dump_send_ipi() do { } while(0)
++#endif
++#endif /* __KERNEL__ */
++
++#endif /* _ASM_DUMP_H */
+Index: linux-2.6.10/include/asm-x86_64/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-x86_64/kerntypes.h 2005-04-05 16:47:53.869216352 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-x86_64/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* x86_64-specific header files */
++#ifndef _X86_64_KERNTYPES_H
++#define _X86_64_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _X86_64_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-x86_64/hw_irq.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/hw_irq.h 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/include/asm-x86_64/hw_irq.h 2005-04-05 16:47:53.869216352 +0800
+@@ -34,7 +34,6 @@
+
+ #define IA32_SYSCALL_VECTOR 0x80
+
+-
+ /*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+@@ -55,6 +54,7 @@
+ #define TASK_MIGRATION_VECTOR 0xfb
+ #define CALL_FUNCTION_VECTOR 0xfa
+ #define KDB_VECTOR 0xf9
++#define DUMP_VECTOR 0xf8
+
+ #define THERMAL_APIC_VECTOR 0xf0
+
+Index: linux-2.6.10/include/asm-x86_64/kmap_types.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/kmap_types.h 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/include/asm-x86_64/kmap_types.h 2005-04-05 16:47:53.868216504 +0800
+@@ -13,7 +13,8 @@
+ KM_IRQ1,
+ KM_SOFTIRQ0,
+ KM_SOFTIRQ1,
+- KM_TYPE_NR
++ KM_DUMP,
++ KM_TYPE_NR,
+ };
+
+ #endif
+Index: linux-2.6.10/include/asm-x86_64/smp.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-x86_64/smp.h 2004-12-25 05:33:48.000000000 +0800
++++ linux-2.6.10/include/asm-x86_64/smp.h 2005-04-05 16:47:53.867216656 +0800
+@@ -41,6 +41,7 @@
+ extern int pic_mode;
+ extern int smp_num_siblings;
+ extern void smp_flush_tlb(void);
++extern void dump_send_ipi(void);
+ extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
+ extern void smp_send_reschedule(int cpu);
+ extern void smp_invalidate_rcv(void); /* Process an NMI */
+Index: linux-2.6.10/include/asm-s390/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-s390/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-s390/dump.h 2005-04-05 16:47:53.865216960 +0800
+@@ -0,0 +1,10 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ */
++
++/* Nothing to be done here, we have proper hardware support */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++#endif
++
+Index: linux-2.6.10/include/asm-s390/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-s390/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-s390/kerntypes.h 2005-04-05 16:47:53.866216808 +0800
+@@ -0,0 +1,46 @@
++/*
++ * asm-s390/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* S/390 specific header files */
++#ifndef _S390_KERNTYPES_H
++#define _S390_KERNTYPES_H
++
++#include <asm/lowcore.h>
++#include <asm/debug.h>
++#include <asm/ccwdev.h>
++#include <asm/ccwgroup.h>
++#include <asm/qdio.h>
++
++/* channel subsystem driver */
++#include "../../drivers/s390/cio/cio.h"
++#include "../../drivers/s390/cio/chsc.h"
++#include "../../drivers/s390/cio/css.h"
++#include "../../drivers/s390/cio/device.h"
++#include "../../drivers/s390/cio/qdio.h"
++
++/* dasd device driver */
++#include "../../drivers/s390/block/dasd_int.h"
++#include "../../drivers/s390/block/dasd_diag.h"
++#include "../../drivers/s390/block/dasd_eckd.h"
++#include "../../drivers/s390/block/dasd_fba.h"
++
++/* networking drivers */
++#include "../../drivers/s390/net/fsm.h"
++#include "../../drivers/s390/net/iucv.h"
++#include "../../drivers/s390/net/lcs.h"
++
++/* zfcp device driver */
++#include "../../drivers/s390/scsi/zfcp_def.h"
++#include "../../drivers/s390/scsi/zfcp_fsf.h"
++
++#endif /* _S390_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-sparc64/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-sparc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-sparc64/kerntypes.h 2005-04-05 16:47:53.872215896 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-sparc64/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* SPARC64-specific header files */
++#ifndef _SPARC64_KERNTYPES_H
++#define _SPARC64_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _SPARC64_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-mips/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-mips/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-mips/kerntypes.h 2005-04-05 16:47:53.881214528 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-mips/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* MIPS-specific header files */
++#ifndef _MIPS_KERNTYPES_H
++#define _MIPS_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _MIPS_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-m68k/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-m68k/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-m68k/kerntypes.h 2005-04-05 16:47:53.875215440 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-m68k/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* m68k-specific header files */
++#ifndef _M68K_KERNTYPES_H
++#define _M68K_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _M68K_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-generic/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-generic/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-generic/kerntypes.h 2005-04-05 16:47:53.871216048 +0800
+@@ -0,0 +1,20 @@
++/*
++ * asm-generic/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* Arch-independent header files */
++#ifndef _GENERIC_KERNTYPES_H
++#define _GENERIC_KERNTYPES_H
++
++#include <linux/pci.h>
++
++#endif /* _GENERIC_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-i386/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/dump.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/dump.h 2005-04-05 16:47:53.886213768 +0800
+@@ -0,0 +1,90 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* necessary header files */
++#include <asm/ptrace.h>
++#include <asm/page.h>
++#include <linux/threads.h>
++#include <linux/mm.h>
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */
++#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */
++
++/*
++ * Structure: __dump_header_asm
++ * Function: This is the header for architecture-specific stuff. It
++ * follows right after the dump header.
++ */
++struct __dump_header_asm {
++ /* the dump magic number -- unique to verify dump is valid */
++ u64 dha_magic_number;
++
++ /* the version number of this dump */
++ u32 dha_version;
++
++ /* the size of this header (in case we can't read it) */
++ u32 dha_header_size;
++
++ /* the esp for i386 systems */
++ u32 dha_esp;
++
++ /* the eip for i386 systems */
++ u32 dha_eip;
++
++ /* the dump registers */
++ struct pt_regs dha_regs;
++
++ /* smp specific */
++ u32 dha_smp_num_cpus;
++ u32 dha_dumping_cpu;
++ struct pt_regs dha_smp_regs[NR_CPUS];
++ u32 dha_smp_current_task[NR_CPUS];
++ u32 dha_stack[NR_CPUS];
++ u32 dha_stack_ptr[NR_CPUS];
++} __attribute__((packed));
++
++#ifdef __KERNEL__
++
++extern struct __dump_header_asm dump_header_asm;
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern int (*dump_ipi_function_ptr)(struct pt_regs *);
++extern void dump_send_ipi(void);
++#else
++#define dump_send_ipi() do { } while(0)
++#endif
++
++static inline void get_current_regs(struct pt_regs *regs)
++{
++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx));
++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx));
++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx));
++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi));
++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi));
++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp));
++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax));
++ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp));
++ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss));
++ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs));
++ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds));
++ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes));
++ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags));
++ regs->eip = (unsigned long)current_text_addr();
++}
++
++#endif /* __KERNEL__ */
++
++#endif /* _ASM_DUMP_H */
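On the consumer side, a dump reader is expected to verify dha_magic_number and dha_version before trusting the rest of the structure; dha_header_size exists so a reader built against one version can still skip a header written by another. A minimal userspace sketch of that check (the buffer is assumed to already hold the architecture header; nothing here is part of the patch itself):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DUMP_ASM_MAGIC_NUMBER   0xdeaddeadULL
#define DUMP_ASM_VERSION_NUMBER 0x3

/* leading fields of struct __dump_header_asm, as laid out above */
struct dha_prefix {
        uint64_t dha_magic_number;
        uint32_t dha_version;
        uint32_t dha_header_size;
} __attribute__((packed));

static int check_dha(const void *buf, size_t len)
{
        struct dha_prefix p;

        if (len < sizeof(p))
                return -1;
        memcpy(&p, buf, sizeof(p));     /* header may be unaligned */
        if (p.dha_magic_number != DUMP_ASM_MAGIC_NUMBER)
                return -1;              /* not a valid asm header */
        if (p.dha_version != DUMP_ASM_VERSION_NUMBER)
                fprintf(stderr, "asm header version %u (expected %u), "
                        "skipping %u bytes\n", p.dha_version,
                        DUMP_ASM_VERSION_NUMBER, p.dha_header_size);
        return 0;
}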
+Index: linux-2.6.10/include/asm-i386/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/kerntypes.h 2005-04-05 16:47:53.887213616 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-i386/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* ix86-specific header files */
++#ifndef _I386_KERNTYPES_H
++#define _I386_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _I386_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-i386/kmap_types.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kmap_types.h 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/include/asm-i386/kmap_types.h 2005-04-05 16:47:53.886213768 +0800
+@@ -23,7 +23,8 @@
+ D(10) KM_IRQ1,
+ D(11) KM_SOFTIRQ0,
+ D(12) KM_SOFTIRQ1,
+-D(13) KM_TYPE_NR
++D(13) KM_DUMP,
++D(14) KM_TYPE_NR
+ };
+
+ #undef D
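The new KM_DUMP slot gives dump i/o its own atomic-kmap type, so mapping a highmem page at dump time cannot stomp on whatever slot the interrupted code was using. A sketch of the intended use (the helper name is made up; kmap_atomic/kunmap_atomic are the stock 2.6.10 interfaces):

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

/* copy one (possibly highmem) page into the dump buffer */
static void dump_copy_page(struct page *page, void *buf)
{
        void *vaddr = kmap_atomic(page, KM_DUMP);       /* private slot */

        memcpy(buf, vaddr, PAGE_SIZE);
        kunmap_atomic(vaddr, KM_DUMP);
}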
+Index: linux-2.6.10/include/asm-i386/smp.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/smp.h 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/include/asm-i386/smp.h 2005-04-05 16:47:53.885213920 +0800
+@@ -37,6 +37,7 @@
+ extern cpumask_t cpu_sibling_map[];
+
+ extern void smp_flush_tlb(void);
++extern void dump_send_ipi(void);
+ extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
+ extern void smp_invalidate_rcv(void); /* Process an NMI */
+ extern void (*mtrr_hook) (void);
+Index: linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/mach-default/irq_vectors.h 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h 2005-04-05 16:47:53.887213616 +0800
+@@ -48,6 +48,7 @@
+ #define INVALIDATE_TLB_VECTOR 0xfd
+ #define RESCHEDULE_VECTOR 0xfc
+ #define CALL_FUNCTION_VECTOR 0xfb
++#define DUMP_VECTOR 0xfa
+
+ #define THERMAL_APIC_VECTOR 0xf0
+ /*
+Index: linux-2.6.10/include/asm-arm/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-arm/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-arm/kerntypes.h 2005-04-05 16:47:53.873215744 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-arm/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* ARM-specific header files */
++#ifndef _ARM_KERNTYPES_H
++#define _ARM_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _ARM_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-sparc/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-sparc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-sparc/kerntypes.h 2005-04-05 16:47:53.874215592 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-sparc/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* SPARC-specific header files */
++#ifndef _SPARC_KERNTYPES_H
++#define _SPARC_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _SPARC_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-mips64/kerntypes.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-mips64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-mips64/kerntypes.h 2005-04-05 16:47:53.881214528 +0800
+@@ -0,0 +1,21 @@
++/*
++ * asm-mips64/kerntypes.h
++ *
++ * Arch-dependent header file that includes headers for all arch-specific
++ * types of interest.
++ * The kernel type information is used by the lcrash utility when
++ * analyzing system crash dumps or the live system. Using the type
++ * information for the running system, rather than kernel header files,
++ * makes for a more flexible and robust analysis tool.
++ *
++ * This source code is released under the GNU GPL.
++ */
++
++/* MIPS64-specific header files */
++#ifndef _MIPS64_KERNTYPES_H
++#define _MIPS64_KERNTYPES_H
++
++/* Use the default */
++#include <asm-generic/kerntypes.h>
++
++#endif /* _MIPS64_KERNTYPES_H */
+Index: linux-2.6.10/net/Kconfig
+===================================================================
+--- linux-2.6.10.orig/net/Kconfig 2005-04-05 16:29:27.896349784 +0800
++++ linux-2.6.10/net/Kconfig 2005-04-05 16:47:53.895212400 +0800
+@@ -632,7 +632,7 @@
+ endmenu
+
+ config NETPOLL
+- def_bool NETCONSOLE
++ def_bool NETCONSOLE || CRASH_DUMP_NETDEV
+
+ config NETPOLL_RX
+ bool "Netpoll support for trapping incoming packets"
+Index: linux-2.6.10/scripts/mkcompile_h
+===================================================================
+--- linux-2.6.10.orig/scripts/mkcompile_h 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/scripts/mkcompile_h 2005-04-05 16:47:53.950204040 +0800
+@@ -33,7 +33,7 @@
+
+ UTS_LEN=64
+ UTS_TRUNCATE="sed -e s/\(.\{1,$UTS_LEN\}\).*/\1/"
+-
++LINUX_COMPILE_VERSION_ID="__linux_compile_version_id__`hostname | tr -c '[0-9A-Za-z\n]' '__'`_`LANG=C date | tr -c '[0-9A-Za-z\n]' '_'`"
+ # Generate a temporary compile.h
+
+ ( echo /\* This file is auto generated, version $VERSION \*/
+@@ -55,6 +55,8 @@
+ fi
+
+ echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | tail -n 1`\"
++ echo \#define LINUX_COMPILE_VERSION_ID $LINUX_COMPILE_VERSION_ID
++ echo \#define LINUX_COMPILE_VERSION_ID_TYPE typedef char* "$LINUX_COMPILE_VERSION_ID""_t"
+ ) > .tmpcompile
+
+ # Only replace the real compile.h if the new one is different,
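The net effect of the mkcompile_h change is a typedef whose name encodes the build host and build time; the kerntypes object carries the same symbol, so lcrash can refuse a kerntypes file that was not built with the running kernel. The generated compile.h then carries lines of roughly this shape (hostname, date and compiler string are invented for illustration):

/* excerpt of a generated include/linux/compile.h -- illustrative values */
#define LINUX_COMPILER "gcc version 3.3.3"
#define LINUX_COMPILE_VERSION_ID __linux_compile_version_id__buildhost_Tue_Apr__5_15_42_58_CST_2005
#define LINUX_COMPILE_VERSION_ID_TYPE typedef char* __linux_compile_version_id__buildhost_Tue_Apr__5_15_42_58_CST_2005_t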
+Index: linux-2.6.10/mm/bootmem.c
+===================================================================
+--- linux-2.6.10.orig/mm/bootmem.c 2004-12-25 05:34:30.000000000 +0800
++++ linux-2.6.10/mm/bootmem.c 2005-04-05 16:47:53.903211184 +0800
+@@ -26,6 +26,7 @@
+ */
+ unsigned long max_low_pfn;
+ unsigned long min_low_pfn;
++EXPORT_SYMBOL(min_low_pfn);
+ unsigned long max_pfn;
+
+ EXPORT_SYMBOL(max_pfn); /* This is exported so
+@@ -284,6 +285,7 @@
+ if (j + 16 < BITS_PER_LONG)
+ prefetchw(page + j + 16);
+ __ClearPageReserved(page + j);
++ set_page_count(page + j, 1);
+ }
+ __free_pages(page, ffs(BITS_PER_LONG)-1);
+ i += BITS_PER_LONG;
+Index: linux-2.6.10/mm/page_alloc.c
+===================================================================
+--- linux-2.6.10.orig/mm/page_alloc.c 2005-04-05 16:29:28.218300840 +0800
++++ linux-2.6.10/mm/page_alloc.c 2005-04-05 16:47:53.902211336 +0800
+@@ -47,6 +47,11 @@
+ EXPORT_SYMBOL(totalram_pages);
+ EXPORT_SYMBOL(nr_swap_pages);
+
++#ifdef CONFIG_CRASH_DUMP_MODULE
++/* This symbol has to be exported so that modules can use the 'for_each_pgdat' macro. */
++EXPORT_SYMBOL(pgdat_list);
++#endif
++
+ /*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+@@ -281,8 +286,11 @@
+ arch_free_page(page, order);
+
+ mod_page_state(pgfree, 1 << order);
+- for (i = 0 ; i < (1 << order) ; ++i)
++ for (i = 0; i < (1 << order); ++i) {
++ if (unlikely(i))
++ __put_page(page + i);
+ free_pages_check(__FUNCTION__, page + i);
++ }
+ list_add(&page->lru, &list);
+ kernel_map_pages(page, 1<<order, 0);
+ free_pages_bulk(page_zone(page), 1, &list, order);
+@@ -322,44 +330,34 @@
+ return page;
+ }
+
+-static inline void set_page_refs(struct page *page, int order)
+-{
+-#ifdef CONFIG_MMU
+- set_page_count(page, 1);
+-#else
+- int i;
+-
+- /*
+- * We need to reference all the pages for this order, otherwise if
+- * anyone accesses one of the pages with (get/put) it will be freed.
+- */
+- for (i = 0; i < (1 << order); i++)
+- set_page_count(page+i, 1);
+-#endif /* CONFIG_MMU */
+-}
+-
+ /*
+ * This page is about to be returned from the page allocator
+ */
+-static void prep_new_page(struct page *page, int order)
++static void prep_new_page(struct page *_page, int order)
+ {
+- if (page->mapping || page_mapped(page) ||
+- (page->flags & (
+- 1 << PG_private |
+- 1 << PG_locked |
+- 1 << PG_lru |
+- 1 << PG_active |
+- 1 << PG_dirty |
+- 1 << PG_reclaim |
+- 1 << PG_swapcache |
+- 1 << PG_writeback )))
++ int i;
++
++ for (i = 0; i < (1 << order); i++) {
++ struct page *page = _page + i;
++
++ if (page->mapping || page_mapped(page) ||
++ (page->flags & (
++ 1 << PG_private |
++ 1 << PG_locked |
++ 1 << PG_lru |
++ 1 << PG_active |
++ 1 << PG_dirty |
++ 1 << PG_reclaim |
++ 1 << PG_swapcache |
++ 1 << PG_writeback )))
+ bad_page(__FUNCTION__, page);
+
+- page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+- 1 << PG_referenced | 1 << PG_arch_1 |
+- 1 << PG_checked | 1 << PG_mappedtodisk);
+- page->private = 0;
+- set_page_refs(page, order);
++ page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
++ 1 << PG_referenced | 1 << PG_arch_1 |
++ 1 << PG_checked | 1 << PG_mappedtodisk);
++ page->private = 0;
++ set_page_count(page, 1);
++ }
+ }
+
+ /*
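The two mm changes belong together: prep_new_page() now gives every constituent page of a high-order allocation a reference count of one (previously, under CONFIG_MMU, only the head page was counted), __free_pages_ok() drops the extra counts again, and the bootmem change initializes the counts this new free path expects. The payoff is that any valid pfn carries a meaningful count, which a dump page filter can consult. A sketch of that kind of filter (the helper is hypothetical; pfn_valid/PageReserved/page_count are the stock predicates):

#include <linux/mm.h>

/* decide whether a pfn is worth writing to the dump -- sketch only */
static int dump_page_worth_saving(unsigned long pfn)
{
        struct page *page;

        if (!pfn_valid(pfn))
                return 0;
        page = pfn_to_page(pfn);
        if (PageReserved(page))
                return 1;               /* kernel text/data etc. */
        return page_count(page) > 0;    /* free pages now read as 0 */
}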
+Index: linux-2.6.10/kernel/sched.c
+===================================================================
+--- linux-2.6.10.orig/kernel/sched.c 2005-04-05 16:29:30.335978904 +0800
++++ linux-2.6.10/kernel/sched.c 2005-04-05 16:47:53.901211488 +0800
+@@ -54,6 +54,10 @@
+ #define cpu_to_node_mask(cpu) (cpu_online_map)
+ #endif
+
++/* used to soft spin in sched while dump is in progress */
++unsigned long dump_oncpu;
++EXPORT_SYMBOL(dump_oncpu);
++
+ /*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+@@ -184,109 +188,6 @@
+ #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
+ < (long long) (sd)->cache_hot_time)
+
+-/*
+- * These are the runqueue data structures:
+- */
+-
+-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+-
+-typedef struct runqueue runqueue_t;
+-
+-struct prio_array {
+- unsigned int nr_active;
+- unsigned long bitmap[BITMAP_SIZE];
+- struct list_head queue[MAX_PRIO];
+-};
+-
+-/*
+- * This is the main, per-CPU runqueue data structure.
+- *
+- * Locking rule: those places that want to lock multiple runqueues
+- * (such as the load balancing or the thread migration code), lock
+- * acquire operations must be ordered by ascending &runqueue.
+- */
+-struct runqueue {
+- spinlock_t lock;
+-
+- /*
+- * nr_running and cpu_load should be in the same cacheline because
+- * remote CPUs use both these fields when doing load calculation.
+- */
+- unsigned long nr_running;
+-#ifdef CONFIG_SMP
+- unsigned long cpu_load;
+-#endif
+- unsigned long long nr_switches;
+-
+- /*
+- * This is part of a global counter where only the total sum
+- * over all CPUs matters. A task can increase this counter on
+- * one CPU and if it got migrated afterwards it may decrease
+- * it on another CPU. Always updated under the runqueue lock:
+- */
+- unsigned long nr_uninterruptible;
+-
+- unsigned long expired_timestamp;
+- unsigned long long timestamp_last_tick;
+- task_t *curr, *idle;
+- struct mm_struct *prev_mm;
+- prio_array_t *active, *expired, arrays[2];
+- int best_expired_prio;
+- atomic_t nr_iowait;
+-
+-#ifdef CONFIG_SMP
+- struct sched_domain *sd;
+-
+- /* For active balancing */
+- int active_balance;
+- int push_cpu;
+-
+- task_t *migration_thread;
+- struct list_head migration_queue;
+-#endif
+-
+-#ifdef CONFIG_SCHEDSTATS
+- /* latency stats */
+- struct sched_info rq_sched_info;
+-
+- /* sys_sched_yield() stats */
+- unsigned long yld_exp_empty;
+- unsigned long yld_act_empty;
+- unsigned long yld_both_empty;
+- unsigned long yld_cnt;
+-
+- /* schedule() stats */
+- unsigned long sched_noswitch;
+- unsigned long sched_switch;
+- unsigned long sched_cnt;
+- unsigned long sched_goidle;
+-
+- /* pull_task() stats */
+- unsigned long pt_gained[MAX_IDLE_TYPES];
+- unsigned long pt_lost[MAX_IDLE_TYPES];
+-
+- /* active_load_balance() stats */
+- unsigned long alb_cnt;
+- unsigned long alb_lost;
+- unsigned long alb_gained;
+- unsigned long alb_failed;
+-
+- /* try_to_wake_up() stats */
+- unsigned long ttwu_cnt;
+- unsigned long ttwu_attempts;
+- unsigned long ttwu_moved;
+-
+- /* wake_up_new_task() stats */
+- unsigned long wunt_cnt;
+- unsigned long wunt_moved;
+-
+- /* sched_migrate_task() stats */
+- unsigned long smt_cnt;
+-
+- /* sched_balance_exec() stats */
+- unsigned long sbe_cnt;
+-#endif
+-};
+
+ static DEFINE_PER_CPU(struct runqueue, runqueues);
+
+@@ -2535,6 +2436,15 @@
+ unsigned long run_time;
+ int cpu, idx;
+
++ /*
++ * If a crash dump is in progress, the other cpus
++ * need to wait until it completes.
++ * NB: this code is optimized away for kernels without
++ * dumping enabled.
++ */
++ if (unlikely(dump_oncpu))
++ goto dump_scheduling_disabled;
++
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+ * schedule() atomically, we ignore that path for now.
+@@ -2698,6 +2608,16 @@
+ preempt_enable_no_resched();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
++
++ return;
++
++ dump_scheduling_disabled:
++ /* allow scheduling only if this is the dumping cpu */
++ if (dump_oncpu != smp_processor_id()+1) {
++ while (dump_oncpu)
++ cpu_relax();
++ }
++ return;
+ }
+
+ EXPORT_SYMBOL(schedule);
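dump_oncpu is 1-based (cpu id + 1) so that 0 can mean "no dump in progress": the dumping cpu stores its own id + 1, and every other cpu that enters schedule() soft-spins in the dump_scheduling_disabled path until the value drops back to 0. The dump core would be expected to bracket a dump roughly like this (a sketch only; the real sequencing lives in the dump driver):

#include <linux/smp.h>

extern unsigned long dump_oncpu;

static int dump_bracket(int (*do_dump)(void))
{
        int ret;

        dump_oncpu = smp_processor_id() + 1;    /* claim the system */
        ret = do_dump();        /* other cpus soft-spin in schedule() */
        dump_oncpu = 0;         /* release them */
        return ret;
}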
+Index: linux-2.6.10/kernel/panic.c
+===================================================================
+--- linux-2.6.10.orig/kernel/panic.c 2004-12-25 05:35:29.000000000 +0800
++++ linux-2.6.10/kernel/panic.c 2005-04-05 16:47:53.898211944 +0800
+@@ -18,12 +18,17 @@
+ #include <linux/sysrq.h>
+ #include <linux/interrupt.h>
+ #include <linux/nmi.h>
++#ifdef CONFIG_KEXEC
++#include <linux/kexec.h>
++#endif
+
+ int panic_timeout;
+ int panic_on_oops;
+ int tainted;
++void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0;
+
+ EXPORT_SYMBOL(panic_timeout);
++EXPORT_SYMBOL(dump_function_ptr);
+
+ struct notifier_block *panic_notifier_list;
+
+@@ -71,11 +76,12 @@
+ printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ bust_spinlocks(0);
+
++ notifier_call_chain(&panic_notifier_list, 0, buf);
++
+ #ifdef CONFIG_SMP
+ smp_send_stop();
+ #endif
+
+- notifier_call_chain(&panic_notifier_list, 0, buf);
+
+ if (!panic_blink)
+ panic_blink = no_blink;
+@@ -87,6 +93,18 @@
+ * We can't use the "normal" timers since we just panicked..
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
++#ifdef CONFIG_KEXEC
++{
++ struct kimage *image;
++ image = xchg(&kexec_image, 0);
++ if (image) {
++ printk(KERN_EMERG "by starting a new kernel ..\n");
++ mdelay(panic_timeout*1000);
++ machine_kexec(image);
++ }
++}
++#endif
++
+ for (i = 0; i < panic_timeout*1000; ) {
+ touch_nmi_watchdog();
+ i += panic_blink(i);
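Two things happen in panic.c: the notifier chain now runs before smp_send_stop(), so a dump notifier still sees the other cpus alive, and dump_function_ptr is exported as the hook a (possibly modular) dump driver installs; its call site is elsewhere in this series. A sketch of how a module would install itself (the handler name is made up):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/ptrace.h>

extern void (*dump_function_ptr)(const char *, const struct pt_regs *);

static void my_dump_handler(const char *panic_str,
                            const struct pt_regs *regs)
{
        /* capture cpu state, drive the configured dump device, ... */
}

static int __init my_dump_init(void)
{
        dump_function_ptr = my_dump_handler;
        return 0;
}

static void __exit my_dump_exit(void)
{
        dump_function_ptr = NULL;
}

module_init(my_dump_init);
module_exit(my_dump_exit);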
+Index: linux-2.6.10/drivers/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c 2005-04-05 16:29:30.310982704 +0800
++++ linux-2.6.10/drivers/block/ll_rw_blk.c 2005-04-05 16:47:53.949204192 +0800
+@@ -28,6 +28,7 @@
+ #include <linux/slab.h>
+ #include <linux/swap.h>
+ #include <linux/writeback.h>
++#include <linux/dump.h>
+
+ /*
+ * for max sense size
+@@ -2628,7 +2629,8 @@
+ sector_t maxsector;
+ int ret, nr_sectors = bio_sectors(bio);
+
+- might_sleep();
++ if (likely(!dump_oncpu))
++ might_sleep();
+ /* Test device or partition size, when known. */
+ maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+ if (maxsector) {
+Index: linux-2.6.10/drivers/dump/dump_i386.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_i386.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_i386.c 2005-04-05 16:47:53.940205560 +0800
+@@ -0,0 +1,372 @@
++/*
++ * Architecture specific (i386) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com)
++ * Copyright 2000 TurboLinux, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file. Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/smp.h>
++#include <linux/fs.h>
++#include <linux/vmalloc.h>
++#include <linux/mm.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++#include <linux/irq.h>
++
++#include <asm/processor.h>
++#include <asm/e820.h>
++#include <asm/hardirq.h>
++#include <asm/nmi.h>
++
++static __s32 saved_irq_count; /* saved preempt_count() flags */
++
++static int
++alloc_dha_stack(void)
++{
++ int i;
++ void *ptr;
++
++ if (dump_header_asm.dha_stack[0])
++ return 0;
++
++ ptr = vmalloc(THREAD_SIZE * num_online_cpus());
++ if (!ptr) {
++ printk("vmalloc for dha_stacks failed\n");
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < num_online_cpus(); i++) {
++ dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr +
++ (i * THREAD_SIZE));
++ }
++ return 0;
++}
++
++static int
++free_dha_stack(void)
++{
++ if (dump_header_asm.dha_stack[0]) {
++ vfree((void *)dump_header_asm.dha_stack[0]);
++ dump_header_asm.dha_stack[0] = 0;
++ }
++ return 0;
++}
++
++
++void
++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs)
++{
++ *dest_regs = *regs;
++
++ /* In the case of panic dumps, we collect regs on entry to panic,
++ * so we shouldn't 'fix' ss/esp here again. But it is hard to
++ * tell, just looking at regs, whether ss/esp need fixing. We make
++ * this decision by looking at xss in regs. If we had a better
++ * means to determine that ss/esp are valid (some flag which
++ * tells us that we are here due to a panic dump), we could use
++ * that instead of this kludge.
++ */
++ if (!user_mode(regs)) {
++ if ((0xffff & regs->xss) == __KERNEL_DS)
++ /* already fixed up */
++ return;
++ dest_regs->esp = (unsigned long)&(regs->esp);
++ __asm__ __volatile__ ("movw %%ss, %%ax;"
++ :"=a"(dest_regs->xss));
++ }
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
++
++ /* take a snapshot of the stack */
++ /* doing this enables us to tolerate slight drifts on this cpu */
++
++ if (dump_header_asm.dha_stack[cpu]) {
++ memcpy((void *)dump_header_asm.dha_stack[cpu],
++ STACK_START_POSITION(tsk),
++ THREAD_SIZE);
++ }
++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info);
++}
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern irq_desc_t irq_desc[];
++extern void dump_send_ipi(void);
++
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++static cpumask_t saved_affinity[NR_IRQS];
++
++extern void stop_this_cpu(void *); /* exported by i386 kernel */
++
++static int
++dump_nmi_callback(struct pt_regs *regs, int cpu)
++{
++ if (!dump_expect_ipi[cpu])
++ return 0;
++
++ dump_expect_ipi[cpu] = 0;
++
++ dump_save_this_cpu(regs);
++ atomic_dec(&waiting_for_dump_ipi);
++
++ level_changed:
++ switch (dump_silence_level) {
++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */
++ while (dump_oncpu) {
++ barrier(); /* paranoia */
++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++ goto level_changed;
++
++ cpu_relax(); /* kill time nicely */
++ }
++ break;
++
++ case DUMP_HALT_CPUS: /* Execute halt */
++ stop_this_cpu(NULL);
++ break;
++
++ case DUMP_SOFT_SPIN_CPUS:
++ /* Mark the task so it spins in schedule */
++ set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++ break;
++ }
++
++ return 1;
++}
++
++/* save registers on other processors */
++void
++__dump_save_other_cpus(void)
++{
++ int i, cpu = smp_processor_id();
++ int other_cpus = num_online_cpus()-1;
++
++ if (other_cpus > 0) {
++ atomic_set(&waiting_for_dump_ipi, other_cpus);
++
++ for (i = 0; i < NR_CPUS; i++) {
++ dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++ }
++
++ /* short circuit normal NMI handling temporarily */
++ set_nmi_callback(dump_nmi_callback);
++ wmb();
++
++ dump_send_ipi();
++ /* Maybe we don't need to wait for the NMI to be processed:
++ just write out the header at the end of dumping; if
++ this IPI has not been processed by then, there probably
++ is a problem and we just fail to capture the state of the
++ other cpus. */
++ while(atomic_read(&waiting_for_dump_ipi) > 0) {
++ cpu_relax();
++ }
++
++ unset_nmi_callback();
++ }
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++ int i;
++ cpumask_t cpu = CPU_MASK_NONE;
++
++ cpu_set(smp_processor_id(), cpu);
++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(cpumask_t));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ irq_affinity[i] = cpu;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++ }
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++reset_irq_affinity(void)
++{
++ int i;
++
++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(cpumask_t));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]);
++ }
++}
++
++#else /* !CONFIG_SMP */
++#define set_irq_affinity() do { } while (0)
++#define reset_irq_affinity() do { } while (0)
++#define save_other_cpu_states() do { } while (0)
++#endif /* !CONFIG_SMP */
++
++/*
++ * Kludge - dump from interrupt context is unreliable (Fixme)
++ *
++ * We do this so that softirqs initiated for dump i/o
++ * get processed and we don't hang while waiting for i/o
++ * to complete or in any irq synchronization attempt.
++ *
++ * This is not quite legal of course, as it has the side
++ * effect of making all interrupts & softirqs triggered
++ * while dump is in progress complete before currently
++ * pending softirqs and the currently executing interrupt
++ * code.
++ */
++static inline void
++irq_bh_save(void)
++{
++ saved_irq_count = irq_count();
++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++ preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ * This is used for dump methods that require interrupts
++ * Eventually, all methods will have interrupts disabled
++ * and this code can be removed.
++ *
++ * Change irq affinities
++ * Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++ set_irq_affinity();
++ irq_bh_save();
++ local_irq_enable();
++ return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ */
++void
++__dump_irq_restore(void)
++{
++ local_irq_disable();
++ reset_irq_affinity();
++ irq_bh_restore();
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Meant to fill in arch specific header fields except per-cpu state
++ * already captured via __dump_save_context for all CPUs.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++ return (0);
++}
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++ return;
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific).
++ */
++void
++__dump_open(void)
++{
++ alloc_dha_stack();
++}
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ * when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++ free_dha_stack();
++}
++
++extern int pfn_is_ram(unsigned long);
++
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++ if (!pfn_valid(index))
++ return 0;
++
++ return pfn_is_ram(index);
++}
++
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command. Calls dump_execute()
++ */
++int
++manual_handle_crashdump(void)
++{
++ struct pt_regs regs;
++
++ get_current_regs(&regs);
++ dump_execute("manual", &regs);
++ return 0;
++}
++
++/*
++ * Name: __dump_clean_irq_state()
++ * Func: Clean up from the previous IRQ handling state. Such as oops from
++ * interrupt handler or bottom half.
++ */
++void
++__dump_clean_irq_state(void)
++{
++ return;
++}
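The irq helpers above pair up: __dump_irq_enable() moves every irq to the dumping cpu, clears the hardirq/softirq bits from preempt_count() (the kludge documented above) and re-enables interrupts so disk i/o can complete; __dump_irq_restore() undoes all of it. The generic dump layer would be expected to bracket each i/o burst with them, roughly as follows (a sketch; write_block stands in for the real device write):

extern int __dump_irq_enable(void);
extern void __dump_irq_restore(void);

static int dump_do_io(int (*write_block)(void))
{
        int err;

        err = __dump_irq_enable();      /* reroute + enable irqs */
        if (err)
                return err;
        err = write_block();            /* completion arrives via irqs */
        __dump_irq_restore();           /* disable irqs, restore affinity */
        return err;
}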
+Index: linux-2.6.10/drivers/dump/dump_ia64.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_ia64.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_ia64.c 2005-04-05 16:47:53.928207384 +0800
+@@ -0,0 +1,458 @@
++/*
++ * Architecture specific (ia64) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ * Contributions from SGI, IBM, and others.
++ *
++ * 2.4 kernel modifications by: Matt D. Robinson (yakker@alacritech.com)
++ * ia64 kernel modifications by: Piet Delaney (piet@www.piet.net)
++ *
++ * Copyright (C) 2001 - 2002 Matt D. Robinson (yakker@alacritech.com)
++ * Copyright (C) 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file. Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/smp.h>
++#include <linux/fs.h>
++#include <linux/vmalloc.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++#include <linux/mm.h>
++#include <asm/processor.h>
++#include <asm-ia64/dump.h>
++#include <asm/hardirq.h>
++#include <linux/irq.h>
++#include <linux/delay.h>
++
++static __s32 saved_irq_count; /* saved preempt_count() flags */
++
++
++static int alloc_dha_stack(void)
++{
++ int i;
++ void *ptr;
++
++ if (dump_header_asm.dha_stack[0])
++ {
++ return 0;
++ }
++ ptr = vmalloc(THREAD_SIZE * num_online_cpus());
++ if (!ptr) {
++ printk("vmalloc for dha_stacks failed\n");
++ return -ENOMEM;
++ }
++ memset(ptr, 0, THREAD_SIZE * num_online_cpus());
++
++ for (i = 0; i < num_online_cpus(); i++) {
++ dump_header_asm.dha_stack[i] = (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE));
++ }
++ return 0;
++}
++
++static int free_dha_stack(void)
++{
++ if (dump_header_asm.dha_stack[0])
++ {
++ vfree((void*)dump_header_asm.dha_stack[0]);
++ dump_header_asm.dha_stack[0] = 0;
++ }
++ return 0;
++}
++
++/* a structure to get arguments into the following callback routine */
++struct unw_args {
++ int cpu;
++ struct task_struct *tsk;
++};
++
++static void
++do_save_sw(struct unw_frame_info *info, void *arg)
++{
++ struct unw_args *uwargs = (struct unw_args *)arg;
++ int cpu = uwargs->cpu;
++ struct task_struct *tsk = uwargs->tsk;
++
++ dump_header_asm.dha_stack_ptr[cpu] = (uint64_t)info->sw;
++
++ if (tsk && dump_header_asm.dha_stack[cpu]) {
++ memcpy((void *)dump_header_asm.dha_stack[cpu],
++ STACK_START_POSITION(tsk),
++ THREAD_SIZE);
++ }
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ struct unw_args uwargs;
++
++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++
++ if (regs) {
++ dump_header_asm.dha_smp_regs[cpu] = *regs;
++ }
++
++ /* save a snapshot of the stack in a nice state for unwinding */
++ uwargs.cpu = cpu;
++ uwargs.tsk = tsk;
++
++ unw_init_running(do_save_sw, (void *)&uwargs);
++}
++
++#ifdef CONFIG_SMP
++
++extern cpumask_t irq_affinity[];
++#define irq_desc _irq_desc
++extern irq_desc_t irq_desc[];
++extern void dump_send_ipi(void);
++static cpumask_t saved_affinity[NR_IRQS];
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++ int i;
++ cpumask_t cpu = CPU_MASK_NONE;
++
++ cpu_set(smp_processor_id(), cpu);
++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(cpumask_t));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ irq_affinity[i] = cpu;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++ }
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++reset_irq_affinity(void)
++{
++ int i;
++
++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(cpumask_t));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]);
++ }
++}
++
++#else /* !CONFIG_SMP */
++#define set_irq_affinity() do { } while (0)
++#define reset_irq_affinity() do { } while (0)
++#define save_other_cpu_states() do { } while (0)
++#endif /* !CONFIG_SMP */
++
++#ifdef CONFIG_SMP
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++static int wait_for_dump_ipi = 2000; /* wait 2000 ms for ipi to be handled */
++extern void (*dump_trace_ptr)(struct pt_regs *);
++
++
++extern void stop_this_cpu(void);
++
++static int
++dump_nmi_callback(struct pt_regs *regs, int cpu)
++{
++ if (!dump_expect_ipi[cpu])
++ return 0;
++
++ dump_expect_ipi[cpu] = 0;
++
++ dump_save_this_cpu(regs);
++ atomic_dec(&waiting_for_dump_ipi);
++
++ level_changed:
++ switch (dump_silence_level) {
++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */
++ while (dump_oncpu) {
++ barrier(); /* paranoia */
++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++ goto level_changed;
++
++ cpu_relax(); /* kill time nicely */
++ }
++ break;
++
++ case DUMP_HALT_CPUS: /* Execute halt */
++ stop_this_cpu();
++ break;
++
++ case DUMP_SOFT_SPIN_CPUS:
++ /* Mark the task so it spins in schedule */
++ set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++ break;
++ }
++
++ return 1;
++}
++
++int IPI_handler(struct pt_regs *regs)
++{
++ int cpu;
++ cpu = task_cpu(current);
++ return(dump_nmi_callback(regs, cpu));
++}
++
++/* save registers on other processors */
++void
++__dump_save_other_cpus(void)
++{
++ int i, cpu = smp_processor_id();
++ int other_cpus = num_online_cpus()-1;
++ int wait_time = wait_for_dump_ipi;
++
++ if (other_cpus > 0) {
++ atomic_set(&waiting_for_dump_ipi, other_cpus);
++
++ for (i = 0; i < NR_CPUS; i++) {
++ dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++ }
++
++ dump_ipi_function_ptr = IPI_handler;
++
++ wmb();
++
++ dump_send_ipi();
++ /* Maybe we don't need to wait for the IPI to be processed:
++ * just write out the header at the end of dumping; if
++ * this IPI has not been processed by then, there probably
++ * is a problem and we just fail to capture the state of
++ * the other cpus. */
++ while(wait_time-- && (atomic_read(&waiting_for_dump_ipi) > 0)) {
++ barrier();
++ mdelay(1);
++ }
++ if (wait_time <= 0) {
++ printk("dump ipi timeout, proceeding...\n");
++ }
++ }
++}
++#endif
++/*
++ * Kludge - dump from interrupt context is unreliable (Fixme)
++ *
++ * We do this so that softirqs initiated for dump i/o
++ * get processed and we don't hang while waiting for i/o
++ * to complete or in any irq synchronization attempt.
++ *
++ * This is not quite legal of course, as it has the side
++ * effect of making all interrupts & softirqs triggered
++ * while dump is in progress complete before currently
++ * pending softirqs and the currently executing interrupt
++ * code.
++ */
++static inline void
++irq_bh_save(void)
++{
++ saved_irq_count = irq_count();
++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++ preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++ return (0);
++}
++
++
++#define dim(x) (sizeof(x)/sizeof(*(x)))
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ * This is used for dump methods that require interrupts
++ * Eventually, all methods will have interrupts disabled
++ * and this code can be removed.
++ *
++ * Change irq affinities
++ * Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++ set_irq_affinity();
++ irq_bh_save();
++ ia64_srlz_d();
++ /*
++ * reduce the task priority level
++ * to get disk interrupts
++ */
++ ia64_setreg(_IA64_REG_CR_TPR, 0);
++ ia64_srlz_d();
++ local_irq_enable();
++ return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ */
++void
++__dump_irq_restore(void)
++{
++ local_irq_disable();
++ reset_irq_affinity();
++ irq_bh_restore();
++}
++
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++ if (!pfn_valid(index))
++ {
++ return 0;
++ }
++ return 1;
++}
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process. This is in case
++ * it's necessary in the future.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++ return;
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ * case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++ alloc_dha_stack();
++ return;
++}
++
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ * when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++ free_dha_stack();
++
++ return;
++}
++
++
++
++int __dump_memcpy_mc_expected = 0; /* Doesn't help yet */
++
++/*
++ * An ia64 version of memcpy() that tries to avoid machine checks.
++ *
++ * NB:
++ * By itself __dump_memcpy_mc_expected isn't providing any
++ * protection against Machine Checks. We are looking into the
++ * possibility of adding code to the arch/ia64/kernel/mca.c function
++ * ia64_mca_ucmc_handler() to restore state so that IA64_MCA_CORRECTED
++ * can be returned to the firmware. Currently it always returns
++ * IA64_MCA_COLD_BOOT and reboots the machine.
++ */
++/*
++void * __dump_memcpy(void * dest, const void *src, size_t count)
++{
++ void *vp;
++
++ if (__dump_memcpy_mc_expected) {
++ ia64_pal_mc_expected((u64) 1, 0);
++ }
++
++ vp = memcpy(dest, src, count);
++
++ if (__dump_memcpy_mc_expected) {
++ ia64_pal_mc_expected((u64) 0, 0);
++ }
++ return(vp);
++}
++*/
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command. Calls dump_execute()
++ */
++int
++manual_handle_crashdump(void)
++{
++ struct pt_regs regs;
++
++ get_current_regs(&regs);
++ dump_execute("manual", &regs);
++ return 0;
++}
++
++/*
++ * Name: __dump_clean_irq_state()
++ * Func: Clean up from the previous IRQ handling state. Such as oops from
++ * interrupt handler or bottom half.
++ */
++void
++__dump_clean_irq_state(void)
++{
++ unsigned long saved_tpr;
++ unsigned long TPR_MASK = 0xFFFFFFFFFFFEFF0F;
++
++
++ /* Get the processor's task priority register */
++ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
++ /* clear the mmi and mic bits of the TPR to unmask interrupts */
++ saved_tpr = saved_tpr & TPR_MASK;
++ ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
++ ia64_srlz_d();
++
++ /* Tell the processor we're done with the interrupt
++ * that got us here.
++ */
++
++ ia64_eoi();
++
++ /* local implementation of irq_exit(); */
++ preempt_count() -= IRQ_EXIT_OFFSET;
++ preempt_enable_no_resched();
++
++ return;
++}
++
+Index: linux-2.6.10/drivers/dump/dump_rle.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_rle.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_rle.c 2005-04-05 16:47:53.935206320 +0800
+@@ -0,0 +1,176 @@
++/*
++ * RLE Compression functions for kernel crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sourceforge.net)
++ * Copyright 2001 Matt D. Robinson. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* header files */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/init.h>
++#include <linux/dump.h>
++
++/*
++ * Name: dump_compress_rle()
++ * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more
++ * reasonable, if possible. This is the same routine we use in IRIX.
++ */
++static u32
++dump_compress_rle(const u8 *old, u32 oldsize, u8 *new, u32 newsize,
++ unsigned long loc)
++{
++ u16 ri, wi, count = 0;
++ u_char value = 0, cur_byte;
++
++ /*
++ * If the block should happen to "compress" to larger than the
++ * buffer size, allocate a larger one and change cur_buf_size.
++ */
++
++ wi = ri = 0;
++
++ while (ri < oldsize) {
++ if (!ri) {
++ cur_byte = value = old[ri];
++ count = 0;
++ } else {
++ if (count == 255) {
++ if (wi + 3 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = 0;
++ new[wi++] = count;
++ new[wi++] = value;
++ value = cur_byte = old[ri];
++ count = 0;
++ } else {
++ if ((cur_byte = old[ri]) == value) {
++ count++;
++ } else {
++ if (count > 1) {
++ if (wi + 3 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = 0;
++ new[wi++] = count;
++ new[wi++] = value;
++ } else if (count == 1) {
++ if (value == 0) {
++ if (wi + 3 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = 0;
++ new[wi++] = 1;
++ new[wi++] = 0;
++ } else {
++ if (wi + 2 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = value;
++ new[wi++] = value;
++ }
++ } else { /* count == 0 */
++ if (value == 0) {
++ if (wi + 2 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = value;
++ new[wi++] = value;
++ } else {
++ if (wi + 1 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = value;
++ }
++ } /* if count > 1 */
++
++ value = cur_byte;
++ count = 0;
++
++ } /* if byte == value */
++
++ } /* if count == 255 */
++
++ } /* if ri == 0 */
++ ri++;
++
++ }
++ if (count > 1) {
++ if (wi + 3 > oldsize) {
++ return oldsize;
++ }
++ new[wi++] = 0;
++ new[wi++] = count;
++ new[wi++] = value;
++ } else if (count == 1) {
++ if (value == 0) {
++ if (wi + 3 > oldsize)
++ return oldsize;
++ new[wi++] = 0;
++ new[wi++] = 1;
++ new[wi++] = 0;
++ } else {
++ if (wi + 2 > oldsize)
++ return oldsize;
++ new[wi++] = value;
++ new[wi++] = value;
++ }
++ } else { /* count == 0 */
++ if (value == 0) {
++ if (wi + 2 > oldsize)
++ return oldsize;
++ new[wi++] = value;
++ new[wi++] = value;
++ } else {
++ if (wi + 1 > oldsize)
++ return oldsize;
++ new[wi++] = value;
++ }
++ } /* if count > 1 */
++
++ return wi;
++}
++
++/* setup the rle compression functionality */
++static struct __dump_compress dump_rle_compression = {
++ .compress_type = DUMP_COMPRESS_RLE,
++ .compress_func = dump_compress_rle,
++ .compress_name = "RLE",
++};
++
++/*
++ * Name: dump_compress_rle_init()
++ * Func: Initialize rle compression for dumping.
++ */
++static int __init
++dump_compress_rle_init(void)
++{
++ dump_register_compression(&dump_rle_compression);
++ return 0;
++}
++
++/*
++ * Name: dump_compress_rle_cleanup()
++ * Func: Remove rle compression for dumping.
++ */
++static void __exit
++dump_compress_rle_cleanup(void)
++{
++ dump_unregister_compression(DUMP_COMPRESS_RLE);
++}
++
++/* module initialization */
++module_init(dump_compress_rle_init);
++module_exit(dump_compress_rle_cleanup);
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("LKCD Development Team <lkcd-devel@lists.sourceforge.net>");
++MODULE_DESCRIPTION("RLE compression module for crash dump driver");
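The encoding above uses 0 as an escape byte: a run of count+1 equal bytes is written as the triple (0, count, value), a lone zero byte as the pair (0, 0), and any other lone byte as itself; whenever the "compressed" form would exceed the input, the function bails out by returning oldsize. For example, the input 61 61 61 61 62 00 00 encodes as 00 03 61 62 00 01 00. A userspace sketch of the matching decoder, as implied by the encoder (not part of the patch):

#include <stddef.h>

static size_t rle_decode(const unsigned char *in, size_t inlen,
                         unsigned char *out, size_t outlen)
{
        size_t ri = 0, wi = 0;

        while (ri < inlen && wi < outlen) {
                unsigned char b = in[ri++];

                if (b != 0) {
                        out[wi++] = b;          /* literal byte */
                        continue;
                }
                if (ri >= inlen)                /* truncated input */
                        break;
                b = in[ri++];                   /* count byte */
                if (b == 0) {
                        out[wi++] = 0;          /* escaped literal zero */
                } else {
                        unsigned int n;
                        unsigned char value;

                        if (ri >= inlen)
                                break;
                        value = in[ri++];
                        for (n = 0; n <= b && wi < outlen; n++)
                                out[wi++] = value;      /* count+1 copies */
                }
        }
        return wi;
}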
+Index: linux-2.6.10/drivers/dump/dump_execute.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_execute.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_execute.c 2005-04-05 16:47:53.943205104 +0800
+@@ -0,0 +1,144 @@
++/*
++ * The file has the common/generic dump execution code
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ * Split and rewrote high level dump execute code to make use
++ * of dump method interfaces.
++ *
++ * Derived from original code in dump_base.c created by
++ * Matt Robinson <yakker@sourceforge.net>)
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * Assumes dumper and dump config settings are in place
++ * (invokes corresponding dumper specific routines as applicable)
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++#include <linux/kernel.h>
++#include <linux/notifier.h>
++#include <linux/dump.h>
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include "dump_methods.h"
++
++struct notifier_block *dump_notifier_list; /* dump started/ended callback */
++
++extern int panic_timeout;
++
++/* Dump progress indicator */
++void
++dump_speedo(int i)
++{
++ static const char twiddle[4] = { '|', '\\', '-', '/' };
++ printk("%c\b", twiddle[i&3]);
++}
++
++/* Make the device ready and write out the header */
++int dump_begin(void)
++{
++ int err = 0;
++
++ /* dump_dev = dump_config.dumper->dev; */
++ dumper_reset();
++ if ((err = dump_dev_silence())) {
++ /* quiesce failed, can't risk continuing */
++ /* Todo/Future: switch to alternate dump scheme if possible */
++ printk("dump silence dev failed ! error %d\n", err);
++ return err;
++ }
++
++ pr_debug("Writing dump header\n");
++ if ((err = dump_update_header())) {
++ printk("dump update header failed ! error %d\n", err);
++ dump_dev_resume();
++ return err;
++ }
++
++ dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE;
++
++ return 0;
++}
++
++/*
++ * Write the dump terminator, a final header update and let go of
++ * exclusive use of the device for dump.
++ */
++int dump_complete(void)
++{
++ int ret = 0;
++
++ if (dump_config.level != DUMP_LEVEL_HEADER) {
++ if ((ret = dump_update_end_marker())) {
++ printk("dump update end marker error %d\n", ret);
++ }
++ if ((ret = dump_update_header())) {
++ printk("dump update header error %d\n", ret);
++ }
++ }
++ ret = dump_dev_resume();
++
++ if ((panic_timeout > 0) && (!(dump_config.flags & (DUMP_FLAGS_SOFTBOOT | DUMP_FLAGS_NONDISRUPT)))) {
++ mdelay(panic_timeout * 1000);
++ machine_restart(NULL);
++ }
++
++ return ret;
++}
++
++/* Saves all dump data */
++int dump_execute_savedump(void)
++{
++ int ret = 0, err = 0;
++
++ if ((ret = dump_begin())) {
++ return ret;
++ }
++
++ if (dump_config.level != DUMP_LEVEL_HEADER) {
++ ret = dump_sequencer();
++ }
++ if ((err = dump_complete())) {
++ printk("Dump complete failed. Error %d\n", err);
++ }
++
++ return ret;
++}
++
++extern void dump_calc_bootmap_pages(void);
++
++/* Does all the real work: Capture and save state */
++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs)
++{
++ int ret = 0;
++
++#ifdef CONFIG_DISCONTIGMEM
++ printk(KERN_INFO "Reconfiguring memory bank information....\n");
++ printk(KERN_INFO "This may take a while....\n");
++ dump_reconfigure_mbanks();
++#endif
++
++ if ((ret = dump_configure_header(panic_str, regs))) {
++ printk("dump config header failed ! error %d\n", ret);
++ return ret;
++ }
++
++ dump_calc_bootmap_pages();
++ /* tell interested parties that a dump is about to start */
++ notifier_call_chain(&dump_notifier_list, DUMP_BEGIN,
++ &dump_config.dump_device);
++
++ if (dump_config.level != DUMP_LEVEL_NONE)
++ ret = dump_execute_savedump();
++
++ pr_debug("dumped %ld blocks of %d bytes each\n",
++ dump_config.dumper->count, DUMP_BUFFER_SIZE);
++
++ /* tell interested parties that a dump has completed */
++ notifier_call_chain(&dump_notifier_list, DUMP_END,
++ &dump_config.dump_device);
++
++ return ret;
++}
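So the overall flow is dump_generic_execute() -> dump_begin() (quiesce the device, write the header) -> dump_sequencer() (walk, compress and write memory) -> dump_complete() (end marker, final header, resume), with DUMP_BEGIN/DUMP_END notifications around it. Other code can watch those events through dump_notifier_list; a sketch using the 2.6.10 notifier API (the callback name is made up):

#include <linux/notifier.h>
#include <linux/dump.h>

extern struct notifier_block *dump_notifier_list;

static int my_dump_event(struct notifier_block *nb, unsigned long event,
                         void *dev)
{
        switch (event) {
        case DUMP_BEGIN:
                /* e.g. quiesce other users of the dump device */
                break;
        case DUMP_END:
                /* undo the above */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block my_dump_nb = {
        .notifier_call = my_dump_event,
};

/* from module init: notifier_chain_register(&dump_notifier_list, &my_dump_nb); */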
+Index: linux-2.6.10/drivers/dump/dump_netdev.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_netdev.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_netdev.c 2005-04-05 16:47:53.936206168 +0800
+@@ -0,0 +1,566 @@
++/*
++ * Implements the dump driver interface for saving a dump via network
++ * interface.
++ *
++ * Some of this code has been taken/adapted from Ingo Molnar's netconsole
++ * code. The LKCD team expresses its thanks to Ingo.
++ *
++ * Started: June 2002 - Mohamed Abbas <mohamed.abbas@intel.com>
++ * Adapted netconsole code to implement LKCD dump over the network.
++ *
++ * Nov 2002 - Bharata B. Rao <bharata@in.ibm.com>
++ * Innumerable code cleanups, simplification and some fixes.
++ * Netdump configuration done by ioctl instead of using module parameters.
++ * Oct 2003 - Prasanna S Panchamukhi <prasanna@in.ibm.com>
++ * Netdump code modified to use Netpoll API's.
++ *
++ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#include <net/tcp.h>
++#include <net/udp.h>
++#include <linux/delay.h>
++#include <linux/random.h>
++#include <linux/reboot.h>
++#include <linux/module.h>
++#include <linux/dump.h>
++#include <linux/dump_netdev.h>
++
++#include <asm/unaligned.h>
++
++static int startup_handshake;
++static int page_counter;
++static unsigned long flags_global;
++static int netdump_in_progress;
++
++/*
++ * security depends on the trusted path between the netconsole
++ * server and netconsole client, since none of the packets are
++ * encrypted. The random magic number protects the protocol
++ * against spoofing.
++ */
++static u64 dump_magic;
++
++/*
++ * We maintain a small pool of fully-sized skbs,
++ * to make sure the message gets out even in
++ * extreme OOM situations.
++ */
++
++static void rx_hook(struct netpoll *np, int port, char *msg, int size);
++int new_req = 0;
++static req_t req;
++
++static void rx_hook(struct netpoll *np, int port, char *msg, int size)
++{
++ req_t * __req = (req_t *) msg;
++ /*
++ * First check if we are dumping or doing the startup handshake;
++ * if not, return quickly.
++ */
++
++ if (!netdump_in_progress)
++ return;
++
++ if ((ntohl(__req->command) != COMM_GET_MAGIC) &&
++ (ntohl(__req->command) != COMM_HELLO) &&
++ (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) &&
++ (ntohl(__req->command) != COMM_START_NETDUMP_ACK) &&
++ (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0))
++ goto out;
++
++ req.magic = ntohl(__req->magic);
++ req.command = ntohl(__req->command);
++ req.from = ntohl(__req->from);
++ req.to = ntohl(__req->to);
++ req.nr = ntohl(__req->nr);
++ new_req = 1;
++out:
++ return;
++}
++static char netdump_membuf[1024 + HEADER_LEN + 1];
++/*
++ * Fill the netdump_membuf with the header information from reply_t structure
++ * and send it down to netpoll_send_udp() routine.
++ */
++static void
++netdump_send_packet(struct netpoll *np, reply_t *reply, size_t data_len) {
++ char *b;
++
++ b = &netdump_membuf[1];
++ netdump_membuf[0] = NETCONSOLE_VERSION;
++ put_unaligned(htonl(reply->nr), (u32 *) b);
++ put_unaligned(htonl(reply->code), (u32 *) (b + sizeof(reply->code)));
++ put_unaligned(htonl(reply->info), (u32 *) (b + sizeof(reply->code) +
++ sizeof(reply->info)));
++ netpoll_send_udp(np, netdump_membuf, data_len + HEADER_LEN);
++}
++
++static void
++dump_send_mem(struct netpoll *np, req_t *req, const char* buff, size_t len)
++{
++ int i;
++
++ int nr_chunks = len / 1024;
++ reply_t reply;
++
++ reply.nr = req->nr;
++ reply.code = REPLY_MEM;
++ if (nr_chunks <= 0)
++ nr_chunks = 1;
++ for (i = 0; i < nr_chunks; i++) {
++ unsigned int offset = i*1024;
++ reply.info = offset;
++ memcpy((netdump_membuf + HEADER_LEN), (buff + offset), 1024);
++ netdump_send_packet(np, &reply, 1024);
++ }
++}
++
++/*
++ * This function waits for the client to acknowledge the receipt
++ * of the netdump startup reply, with the possibility of packets
++ * getting lost. We resend the startup packet if no ACK is received,
++ * after a 1 second delay.
++ *
++ * (The client can test the success of the handshake via the HELLO
++ * command, and send ACKs until we enter netdump mode.)
++ */
++static int
++dump_handshake(struct dump_dev *net_dev)
++{
++ reply_t reply;
++ int i, j;
++ size_t str_len;
++
++ if (startup_handshake) {
++ sprintf((netdump_membuf + HEADER_LEN),
++ "NETDUMP start, waiting for start-ACK.\n");
++ reply.code = REPLY_START_NETDUMP;
++ reply.nr = 0;
++ reply.info = 0;
++ } else {
++ sprintf((netdump_membuf + HEADER_LEN),
++ "NETDUMP start, waiting for start-ACK.\n");
++ reply.code = REPLY_START_WRITE_NETDUMP;
++ reply.nr = net_dev->curr_offset;
++ reply.info = net_dev->curr_offset;
++ }
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++
++ /* send 300 handshake packets before declaring failure */
++ for (i = 0; i < 300; i++) {
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++
++ /* wait 1 sec */
++ for (j = 0; j < 10000; j++) {
++ udelay(100);
++ netpoll_poll(&net_dev->np);
++ if (new_req)
++ break;
++ }
++
++ /*
++ * if there is no new request, try sending the handshaking
++ * packet again
++ */
++ if (!new_req)
++ continue;
++
++ /*
++ * check if the new request is of the expected type,
++ * if so, return, else try sending the handshaking
++ * packet again
++ */
++ if (startup_handshake) {
++ if (req.command == COMM_HELLO || req.command ==
++ COMM_START_NETDUMP_ACK) {
++ return 0;
++ } else {
++ new_req = 0;
++ continue;
++ }
++ } else {
++ if (req.command == COMM_SEND_MEM) {
++ return 0;
++ } else {
++ new_req = 0;
++ continue;
++ }
++ }
++ }
++ return -1;
++}
++
++static ssize_t
++do_netdump(struct dump_dev *net_dev, const char* buff, size_t len)
++{
++ reply_t reply;
++ ssize_t ret = 0;
++ int repeatCounter, counter, total_loop;
++ size_t str_len;
++
++ netdump_in_progress = 1;
++
++ if (dump_handshake(net_dev) < 0) {
++ printk("network dump failed due to handshake failure\n");
++ goto out;
++ }
++
++ /*
++ * Ideally the startup handshake should be done during dump
++ * configuration, i.e., in dump_net_open(). This will be done when I
++ * figure out the dependency between the startup handshake, subsequent
++ * writes and the various commands with respect to the net-server.
++ */
++ if (startup_handshake)
++ startup_handshake = 0;
++
++ counter = 0;
++ repeatCounter = 0;
++ total_loop = 0;
++ while (1) {
++ if (!new_req) {
++ netpoll_poll(&net_dev->np);
++ }
++ if (!new_req) {
++ repeatCounter++;
++
++ if (repeatCounter > 5) {
++ counter++;
++ if (counter > 10000) {
++ if (total_loop >= 100000) {
++ printk("Time OUT LEAVE NOW\n");
++ goto out;
++ } else {
++ total_loop++;
++ printk("Try number %d out of "
++ "10 before Time Out\n",
++ total_loop);
++ }
++ }
++ mdelay(1);
++ repeatCounter = 0;
++ }
++ continue;
++ }
++ repeatCounter = 0;
++ counter = 0;
++ total_loop = 0;
++ new_req = 0;
++ switch (req.command) {
++ case COMM_NONE:
++ break;
++
++ case COMM_SEND_MEM:
++ dump_send_mem(&net_dev->np, &req, buff, len);
++ break;
++
++ case COMM_EXIT:
++ case COMM_START_WRITE_NETDUMP_ACK:
++ ret = len;
++ goto out;
++
++ case COMM_HELLO:
++ sprintf((netdump_membuf + HEADER_LEN),
++ "Hello, this is netdump version " "0.%02d\n",
++ NETCONSOLE_VERSION);
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++ reply.code = REPLY_HELLO;
++ reply.nr = req.nr;
++ reply.info = net_dev->curr_offset;
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ break;
++
++ case COMM_GET_PAGE_SIZE:
++ sprintf((netdump_membuf + HEADER_LEN),
++ "PAGE_SIZE: %ld\n", PAGE_SIZE);
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++ reply.code = REPLY_PAGE_SIZE;
++ reply.nr = req.nr;
++ reply.info = PAGE_SIZE;
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ break;
++
++ case COMM_GET_NR_PAGES:
++ reply.code = REPLY_NR_PAGES;
++ reply.nr = req.nr;
++ reply.info = page_counter;
++ sprintf((netdump_membuf + HEADER_LEN),
++ "Number of pages: %ld\n", num_physpages);
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ break;
++
++ case COMM_GET_MAGIC:
++ reply.code = REPLY_MAGIC;
++ reply.nr = req.nr;
++ reply.info = NETCONSOLE_VERSION;
++ /* the magic is raw binary data, not a format string */
++ memcpy((netdump_membuf + HEADER_LEN),
++ &dump_magic, sizeof(dump_magic));
++ str_len = sizeof(dump_magic);
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ break;
++
++ default:
++ reply.code = REPLY_ERROR;
++ reply.nr = req.nr;
++ reply.info = req.command;
++ sprintf((netdump_membuf + HEADER_LEN),
++ "Got unknown command code %d!\n", req.command);
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ break;
++ }
++ }
++out:
++ netdump_in_progress = 0;
++ return ret;
++}
++
++static int
++dump_validate_config(struct netpoll *np)
++{
++ if (!np->local_ip) {
++ printk("network device %s has no local address, "
++ "aborting.\n", np->name);
++ return -1;
++ }
++
++#define IP(x) ((unsigned char *)&np->local_ip)[x]
++ printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++ if (!np->local_port) {
++ printk("source_port parameter not specified, aborting.\n");
++ return -1;
++ }
++
++ if (!np->remote_ip) {
++ printk("target_ip parameter not specified, aborting.\n");
++ return -1;
++ }
++
++ np->remote_ip = ntohl(np->remote_ip);
++#define IP(x) ((unsigned char *)&np->remote_ip)[x]
++ printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++ if (!np->remote_port) {
++ printk("target_port parameter not specified, aborting.\n");
++ return -1;
++ }
++ printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x",
++ np->remote_mac[0], np->remote_mac[1], np->remote_mac[2],
++ np->remote_mac[3], np->remote_mac[4], np->remote_mac[5]);
++
++ if ((np->remote_mac[0] & np->remote_mac[1] & np->remote_mac[2] &
++ np->remote_mac[3] & np->remote_mac[4] & np->remote_mac[5]) == 255)
++ printk("(Broadcast)");
++ printk("\n");
++ return 0;
++}
++
++/*
++ * Prepares the dump device so we can take a dump later.
++ * Validates the netdump configuration parameters.
++ *
++ * TODO: Network connectivity check should be done here.
++ */
++static int
++dump_net_open(struct dump_dev *net_dev, unsigned long arg)
++{
++ int retval = 0;
++
++ /* get the interface name */
++ if (copy_from_user(net_dev->np.dev_name, (void *)arg, IFNAMSIZ))
++ return -EFAULT;
++ net_dev->np.rx_hook = rx_hook;
++ retval = netpoll_setup(&net_dev->np);
++ if (retval)
++ return retval;
++
++ if (dump_validate_config(&net_dev->np) < 0)
++ return -EINVAL;
++
++ net_dev->curr_offset = 0;
++ printk("Network device %s successfully configured for dumping\n",
++ net_dev->np.dev_name);
++ return retval;
++}
++
++/*
++ * Close the dump device and release associated resources
++ * Invoked when unconfiguring the dump device.
++ */
++static int
++dump_net_release(struct dump_dev *net_dev)
++{
++ netpoll_cleanup(&net_dev->np);
++ return 0;
++}
++
++/*
++ * Prepare the dump device for use (silence any ongoing activity
++ * and quiesce state) when the system crashes.
++ */
++static int
++dump_net_silence(struct dump_dev *net_dev)
++{
++ netpoll_set_trap(1);
++ local_irq_save(flags_global);
++ startup_handshake = 1;
++ net_dev->curr_offset = 0;
++ printk("Dumping to network device %s on CPU %d ...\n", net_dev->np.name,
++ smp_processor_id());
++ return 0;
++}
++
++/*
++ * Invoked when dumping is done. This is the time to put things back
++ * (i.e. undo the effects of dump_block_silence) so the device is
++ * available for normal use.
++ */
++static int
++dump_net_resume(struct dump_dev *net_dev)
++{
++ int indx;
++ size_t str_len;
++ reply_t reply;
++
++ sprintf((netdump_membuf + HEADER_LEN), "NETDUMP end.\n");
++ str_len = strlen(netdump_membuf + HEADER_LEN);
++ for (indx = 0; indx < 6; indx++) {
++ reply.code = REPLY_END_NETDUMP;
++ reply.nr = 0;
++ reply.info = 0;
++ netdump_send_packet(&net_dev->np, &reply, str_len);
++ }
++ printk("NETDUMP END!\n");
++ local_irq_restore(flags_global);
++ netpoll_set_trap(0);
++ startup_handshake = 0;
++ return 0;
++}
++
++/*
++ * Seek to the specified offset in the dump device.
++ * Makes sure this is a valid offset, otherwise returns an error.
++ */
++static int
++dump_net_seek(struct dump_dev *net_dev, loff_t off)
++{
++ net_dev->curr_offset = off;
++ return 0;
++}
++
++/*
++ * Write the given buffer out to the network target, one page at a time.
++ */
++static int
++dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len)
++{
++ int cnt, i, off;
++ ssize_t ret;
++
++ cnt = len / PAGE_SIZE;
++
++ for (i = 0; i < cnt; i++) {
++ off = i * PAGE_SIZE;
++ ret = do_netdump(net_dev, buf+off, PAGE_SIZE);
++ if (ret <= 0)
++ return -1;
++ net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE;
++ }
++ return len;
++}
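++
++/*
++ * Worked example: with PAGE_SIZE 4096, a 16384-byte buffer becomes four
++ * do_netdump() calls of one page each, advancing curr_offset by PAGE_SIZE
++ * per page; note cnt = len / PAGE_SIZE truncates, so a non-page-aligned
++ * tail is silently dropped.
++ */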
++
++/*
++ * check if the last dump i/o is over and ready for next request
++ */
++static int
++dump_net_ready(struct dump_dev *net_dev, void *buf)
++{
++ return 0;
++}
++
++/*
++ * ioctl function used for configuring network dump
++ */
++static int
++dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg)
++{
++ switch (cmd) {
++ case DIOSTARGETIP:
++ net_dev->np.remote_ip = arg;
++ break;
++ case DIOSTARGETPORT:
++ net_dev->np.remote_port = (u16)arg;
++ break;
++ case DIOSSOURCEPORT:
++ net_dev->np.local_port = (u16)arg;
++ break;
++ case DIOSETHADDR:
++ if (copy_from_user(net_dev->np.remote_mac, (void *)arg, 6))
++ return -EFAULT;
++ break;
++ case DIOGTARGETIP:
++ case DIOGTARGETPORT:
++ case DIOGSOURCEPORT:
++ case DIOGETHADDR:
++ break;
++ default:
++ return -EINVAL;
++ }
++ return 0;
++}
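++
++/*
++ * Illustration only (not built): a sketch of how userspace might drive
++ * the DIOS* set-ioctls above before arming the dumper. The device node
++ * path and variable names are assumptions; only the command codes and
++ * argument types mirror the handler above.
++ */
++#if 0
++ fd = open("/dev/dump", O_RDWR); /* hypothetical node */
++ ioctl(fd, DIOSTARGETIP, target_ip); /* host-order IPv4 */
++ ioctl(fd, DIOSTARGETPORT, 6688); /* target UDP port */
++ ioctl(fd, DIOSSOURCEPORT, 6688); /* local UDP port */
++ ioctl(fd, DIOSETHADDR, (unsigned long)mac); /* ptr to 6-byte MAC */
++#endif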
++
++struct dump_dev_ops dump_netdev_ops = {
++ .open = dump_net_open,
++ .release = dump_net_release,
++ .silence = dump_net_silence,
++ .resume = dump_net_resume,
++ .seek = dump_net_seek,
++ .write = dump_net_write,
++ /* .read not implemented */
++ .ready = dump_net_ready,
++ .ioctl = dump_net_ioctl
++};
++
++static struct dump_dev default_dump_netdev = {
++ .type_name = "networkdev",
++ .ops = &dump_netdev_ops,
++ .curr_offset = 0,
++ .np.name = "netdump",
++ .np.dev_name = "eth0",
++ .np.rx_hook = rx_hook,
++ .np.local_port = 6688,
++ .np.remote_port = 6688,
++ .np.remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
++};
++
++static int __init
++dump_netdev_init(void)
++{
++ default_dump_netdev.curr_offset = 0;
++
++ if (dump_register_device(&default_dump_netdev) < 0) {
++ printk("network dump device driver registration failed\n");
++ return -1;
++ }
++ printk("network device driver for LKCD registered\n");
++
++ get_random_bytes(&dump_magic, sizeof(dump_magic));
++ return 0;
++}
++
++static void __exit
++dump_netdev_cleanup(void)
++{
++ dump_unregister_device(&default_dump_netdev);
++}
++
++MODULE_AUTHOR("LKCD Development Team <lkcd-devel@lists.sourceforge.net>");
++MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)");
++MODULE_LICENSE("GPL");
++
++module_init(dump_netdev_init);
++module_exit(dump_netdev_cleanup);
+Index: linux-2.6.10/drivers/dump/dump_x8664.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_x8664.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_x8664.c 2005-04-05 16:47:53.932206776 +0800
+@@ -0,0 +1,362 @@
++/*
++ * Architecture specific (x86-64) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com)
++ * Copyright 2000 TurboLinux, Inc. All rights reserved.
++ *
++ * x86-64 port Copyright 2002 Andi Kleen, SuSE Labs
++ * x86-64 port Sachin Sant ( sachinp@in.ibm.com )
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file. Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/smp.h>
++#include <linux/fs.h>
++#include <linux/vmalloc.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++#include <linux/mm.h>
++#include <linux/rcupdate.h>
++#include <asm/processor.h>
++#include <asm/hardirq.h>
++#include <asm/kdebug.h>
++#include <asm/uaccess.h>
++#include <asm/nmi.h>
++#include <asm/kdebug.h>
++
++static __s32 saved_irq_count; /* saved preempt_count() flag */
++
++void (*dump_trace_ptr)(struct pt_regs *);
++
++static int alloc_dha_stack(void)
++{
++ int i;
++ void *ptr;
++
++ if (dump_header_asm.dha_stack[0])
++ return 0;
++
++ ptr = vmalloc(THREAD_SIZE * num_online_cpus());
++ if (!ptr) {
++ printk("vmalloc for dha_stacks failed\n");
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < num_online_cpus(); i++) {
++ dump_header_asm.dha_stack[i] =
++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE));
++ }
++ return 0;
++}
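++
++/*
++ * Example: on a two-cpu box with an 8K THREAD_SIZE (value illustrative)
++ * this vmallocs 16K and points dha_stack[0] and dha_stack[1] at the two
++ * halves, giving each cpu a private slot for its stack snapshot.
++ */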
++
++static int free_dha_stack(void)
++{
++ if (dump_header_asm.dha_stack[0]) {
++ vfree((void *)dump_header_asm.dha_stack[0]);
++ dump_header_asm.dha_stack[0] = 0;
++ }
++ return 0;
++}
++
++void
++__dump_save_regs(struct pt_regs* dest_regs, const struct pt_regs* regs)
++{
++ if (regs)
++ memcpy(dest_regs, regs, sizeof(struct pt_regs));
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
++
++ /* take a snapshot of the stack */
++ /* doing this enables us to tolerate slight drifts on this cpu */
++
++ if (dump_header_asm.dha_stack[cpu]) {
++ memcpy((void *)dump_header_asm.dha_stack[cpu],
++ STACK_START_POSITION(tsk),
++ THREAD_SIZE);
++ }
++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info);
++}
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern irq_desc_t irq_desc[];
++extern void dump_send_ipi(void);
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++static unsigned long saved_affinity[NR_IRQS];
++
++extern void stop_this_cpu(void *);
++
++static int
++dump_nmi_callback(struct pt_regs *regs, int cpu)
++{
++ if (!dump_expect_ipi[cpu]) {
++ return 0;
++ }
++
++ dump_expect_ipi[cpu] = 0;
++
++ dump_save_this_cpu(regs);
++ atomic_dec(&waiting_for_dump_ipi);
++
++level_changed:
++
++ switch (dump_silence_level) {
++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */
++ while (dump_oncpu) {
++ barrier(); /* paranoia */
++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++ goto level_changed;
++
++ cpu_relax(); /* kill time nicely */
++ }
++ break;
++
++ case DUMP_HALT_CPUS: /* Execute halt */
++ stop_this_cpu(NULL);
++ break;
++
++ case DUMP_SOFT_SPIN_CPUS:
++ /* Mark the task so it spins in schedule */
++ set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++ break;
++ }
++
++ return 1;
++}
++
++/* save registers on other processors */
++void
++__dump_save_other_cpus(void)
++{
++ int i, cpu = smp_processor_id();
++ int other_cpus = num_online_cpus() - 1;
++
++ if (other_cpus > 0) {
++ atomic_set(&waiting_for_dump_ipi, other_cpus);
++
++ for (i = 0; i < NR_CPUS; i++)
++ dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++
++ set_nmi_callback(dump_nmi_callback);
++ wmb();
++
++ dump_send_ipi();
++
++ /*
++ * Maybe we don't need to wait for the NMIs to be
++ * processed: just write out the header at the end of
++ * dumping; if an IPI is still unprocessed by then, there
++ * probably is a problem and we simply fail to capture
++ * the state of the other cpus.
++ */
++ while(atomic_read(&waiting_for_dump_ipi) > 0)
++ cpu_relax();
++
++ unset_nmi_callback();
++ }
++ return;
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++ int i;
++ cpumask_t cpu = CPU_MASK_NONE;
++
++ cpu_set(smp_processor_id(), cpu);
++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ irq_affinity[i] = cpu;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++ }
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++reset_irq_affinity(void)
++{
++ int i;
++
++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long));
++ for (i = 0; i < NR_IRQS; i++) {
++ if (irq_desc[i].handler == NULL)
++ continue;
++ if (irq_desc[i].handler->set_affinity != NULL)
++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]);
++ }
++}
++
++#else /* !CONFIG_SMP */
++#define set_irq_affinity() do { } while (0)
++#define reset_irq_affinity() do { } while (0)
++#define save_other_cpu_states() do { } while (0)
++#endif /* !CONFIG_SMP */
++
++static inline void
++irq_bh_save(void)
++{
++ saved_irq_count = irq_count();
++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++ preempt_count() |= saved_irq_count;
++}
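++
++/*
++ * Example: if the crash happened one hardirq level deep, irq_count()
++ * is HARDIRQ_OFFSET (0x10000); irq_bh_save() masks that out so the dump
++ * path runs as if in process context, and irq_bh_restore() ORs the
++ * saved bits back into preempt_count() afterwards.
++ */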
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ * This is used for dump methods that require interrupts
++ * Eventually, all methods will have interrupts disabled
++ * and this code can be removed.
++ *
++ * Change irq affinities
++ * Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++ set_irq_affinity();
++ irq_bh_save();
++ local_irq_enable();
++ return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ *
++ */
++void
++__dump_irq_restore(void)
++{
++ local_irq_disable();
++ reset_irq_affinity();
++ irq_bh_restore();
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++ /* Dummy function - return */
++ return (0);
++}
++
++static int notify(struct notifier_block *nb, unsigned long code, void *data)
++{
++ if (code == DIE_NMI_IPI && dump_oncpu)
++ return NOTIFY_BAD;
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block dump_notifier = {
++ .notifier_call = notify,
++};
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++ notifier_chain_register(&die_chain, &dump_notifier);
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ * case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++ alloc_dha_stack();
++ /* return */
++ return;
++}
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ * when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++ free_dha_stack();
++ notifier_chain_unregister(&die_chain, &dump_notifier);
++ synchronize_kernel();
++ return;
++}
++
++extern int page_is_ram(unsigned long);
++
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++ if (!pfn_valid(index))
++ return 0;
++
++ return page_is_ram(index);
++}
++
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command. Calls dump_execute()
++ */
++int
++manual_handle_crashdump(void)
++{
++ struct pt_regs regs;
++
++ get_current_regs(&regs);
++ dump_execute("manual", &regs);
++ return 0;
++}
++
++/*
++ * Name: __dump_clean_irq_state()
++ * Func: Clean up from the previous IRQ handling state. Such as oops from
++ * interrupt handler or bottom half.
++ */
++void
++__dump_clean_irq_state(void)
++{
++ return;
++}
+Index: linux-2.6.10/drivers/dump/dump_overlay.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_overlay.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_overlay.c 2005-04-05 16:47:53.934206472 +0800
+@@ -0,0 +1,890 @@
++/*
++ * Two-stage soft-boot based dump scheme methods (memory overlay
++ * with post soft-boot writeout)
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ *
++ * This approach of saving the dump in memory and writing it
++ * out after a softboot without clearing memory is derived from the
++ * Mission Critical Linux dump implementation. Credits and a big
++ * thanks for letting the lkcd project make use of the excellent
++ * piece of work and also for helping with clarifications and
++ * tips along the way are due to:
++ * Dave Winchell <winchell@mclx.com> (primary author of mcore)
++ * and also to
++ * Jeff Moyer <moyer@mclx.com>
++ * Josh Huber <huber@mclx.com>
++ *
++ * For those familiar with the mcore implementation, the key
++ * differences/extensions here are in allowing entire memory to be
++ * saved (in compressed form) through a careful ordering scheme
++ * on both the way down as well on the way up after boot, the latter
++ * for supporting the LKCD notion of passes in which most critical
++ * data is the first to be saved to the dump device. Also the post
++ * boot writeout happens from within the kernel rather than driven
++ * from userspace.
++ *
++ * The sequence is orchestrated through the abstraction of "dumpers",
++ * one for the first stage which then sets up the dumper for the next
++ * stage, providing for a smooth and flexible reuse of the singlestage
++ * dump scheme methods and a handle to pass dump device configuration
++ * information across the soft boot.
++ *
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * Disruptive dumping using the second kernel soft-boot option
++ * for issuing dump i/o operates in 2 stages:
++ *
++ * (1) - Saves the (compressed & formatted) dump in memory using a
++ * carefully ordered overlay scheme designed to capture the
++ * entire physical memory or selective portions depending on
++ * dump config settings,
++ * - Registers the stage 2 dumper and
++ * - Issues a soft reboot w/o clearing memory.
++ *
++ * The overlay scheme starts with a small bootstrap free area
++ * and follows a reverse ordering of passes wherein it
++ * compresses and saves data starting with the least critical
++ * areas first, thus freeing up the corresponding pages to
++ * serve as destination for subsequent data to be saved, and
++ * so on. With a good compression ratio, this makes it feasible
++ * to capture an entire physical memory dump without significantly
++ * reducing memory available during regular operation.
++ *
++ * (2) Post soft-reboot, runs through the saved memory dump and
++ * writes it out to disk, this time around, taking care to
++ * save the more critical data first (i.e. pages which figure
++ * in early passes for a regular dump). Finally issues a
++ * clean reboot.
++ *
++ * Since the data was saved in memory after selection/filtering
++ * and formatted as per the chosen output dump format, at this
++ * stage the filter and format actions are just dummy (or
++ * passthrough) actions, except for influence on ordering of
++ * passes.
++ */
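++
++/*
++ * Sketch of the resulting order (pass numbers illustrative): stage 1
++ * saves passes in reverse, e.g. "unused" pages first so those very
++ * frames are freed for reuse as destinations, kernel pages last;
++ * stage 2 then writes the recorded pass ranges back out in forward,
++ * most-critical-first order.
++ */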
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/highmem.h>
++#include <linux/bootmem.h>
++#include <linux/dump.h>
++#ifdef CONFIG_KEXEC
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include <linux/kexec.h>
++#endif
++#include "dump_methods.h"
++
++extern struct list_head dumper_list_head;
++extern struct dump_memdev *dump_memdev;
++extern struct dumper dumper_stage2;
++struct dump_config_block *dump_saved_config = NULL;
++extern struct dump_blockdev *dump_blockdev;
++static struct dump_memdev *saved_dump_memdev = NULL;
++static struct dumper *saved_dumper = NULL;
++
++#ifdef CONFIG_KEXEC
++extern int panic_timeout;
++#endif
++
++/* For testing
++extern void dump_display_map(struct dump_memdev *);
++*/
++
++struct dumper *dumper_by_name(char *name)
++{
++#ifdef LATER
++ struct dumper *dumper;
++ list_for_each_entry(dumper, &dumper_list_head, dumper_list)
++ if (!strncmp(dumper->name, name, 32))
++ return dumper;
++
++ /* not found */
++ return NULL;
++#endif
++ /* Temporary proof of concept */
++ if (!strncmp(dumper_stage2.name, name, 32))
++ return &dumper_stage2;
++ else
++ return NULL;
++}
++
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++extern void dump_early_reserve_map(struct dump_memdev *);
++
++void crashdump_reserve(void)
++{
++ extern unsigned long crashdump_addr;
++
++ if (crashdump_addr == 0xdeadbeef)
++ return;
++
++ /* reserve dump config and saved dump pages */
++ dump_saved_config = (struct dump_config_block *)crashdump_addr;
++ /* magic verification */
++ if (dump_saved_config->magic != DUMP_MAGIC_LIVE) {
++ printk("Invalid dump magic. Ignoring dump\n");
++ dump_saved_config = NULL;
++ return;
++ }
++
++ printk("Dump may be available from previous boot\n");
++
++#ifdef CONFIG_X86_64
++ reserve_bootmem_node(NODE_DATA(0),
++ virt_to_phys((void *)crashdump_addr),
++ PAGE_ALIGN(sizeof(struct dump_config_block)));
++#else
++ reserve_bootmem(virt_to_phys((void *)crashdump_addr),
++ PAGE_ALIGN(sizeof(struct dump_config_block)));
++#endif
++ dump_early_reserve_map(&dump_saved_config->memdev);
++
++}
++#endif
++
++/*
++ * Loads the dump configuration from a memory block saved across soft-boot
++ * The ops vectors need fixing up as the corresp. routines may have
++ * relocated in the new soft-booted kernel.
++ */
++int dump_load_config(struct dump_config_block *config)
++{
++ struct dumper *dumper;
++ struct dump_data_filter *filter_table, *filter;
++ struct dump_dev *dev;
++ int i;
++
++ if (config->magic != DUMP_MAGIC_LIVE)
++ return -ENOENT; /* not a valid config */
++
++ /* initialize generic config data */
++ memcpy(&dump_config, &config->config, sizeof(dump_config));
++
++ /* initialize dumper state */
++ if (!(dumper = dumper_by_name(config->dumper.name))) {
++ printk("dumper name mismatch\n");
++ return -ENOENT; /* dumper mismatch */
++ }
++
++ /* verify and fixup schema */
++ if (strncmp(dumper->scheme->name, config->scheme.name, 32)) {
++ printk("dumper scheme mismatch\n");
++ return -ENOENT; /* mismatch */
++ }
++ config->scheme.ops = dumper->scheme->ops;
++ config->dumper.scheme = &config->scheme;
++
++ /* verify and fixup filter operations */
++ filter_table = dumper->filter;
++ for (i = 0, filter = config->filter_table;
++ ((i < MAX_PASSES) && filter_table[i].selector);
++ i++, filter++) {
++ if (strncmp(filter_table[i].name, filter->name, 32)) {
++ printk("dump filter mismatch\n");
++ return -ENOENT; /* filter name mismatch */
++ }
++ filter->selector = filter_table[i].selector;
++ }
++ config->dumper.filter = config->filter_table;
++
++ /* fixup format */
++ if (strncmp(dumper->fmt->name, config->fmt.name, 32)) {
++ printk("dump format mismatch\n");
++ return -ENOENT; /* mismatch */
++ }
++ config->fmt.ops = dumper->fmt->ops;
++ config->dumper.fmt = &config->fmt;
++
++ /* fixup target device */
++ dev = (struct dump_dev *)(&config->dev[0]);
++ if (dumper->dev == NULL) {
++ pr_debug("Vanilla dumper - assume default\n");
++ if (dump_dev == NULL)
++ return -ENODEV;
++ dumper->dev = dump_dev;
++ }
++
++ if (strncmp(dumper->dev->type_name, dev->type_name, 32)) {
++ printk("dump dev type mismatch %s instead of %s\n",
++ dev->type_name, dumper->dev->type_name);
++ return -ENOENT; /* mismatch */
++ }
++ dev->ops = dumper->dev->ops;
++ config->dumper.dev = dev;
++
++ /* fixup memory device containing saved dump pages */
++ /* assume statically init'ed dump_memdev */
++ config->memdev.ddev.ops = dump_memdev->ddev.ops;
++ /* switch to memdev from prev boot */
++ saved_dump_memdev = dump_memdev; /* remember current */
++ dump_memdev = &config->memdev;
++
++ /* Make this the current primary dumper */
++ dump_config.dumper = &config->dumper;
++
++ return 0;
++}
++
++/* Saves the dump configuration in a memory block for use across a soft-boot */
++int dump_save_config(struct dump_config_block *config)
++{
++ printk("saving dump config settings\n");
++
++ /* dump config settings */
++ memcpy(&config->config, &dump_config, sizeof(dump_config));
++
++ /* dumper state */
++ memcpy(&config->dumper, dump_config.dumper, sizeof(struct dumper));
++ memcpy(&config->scheme, dump_config.dumper->scheme,
++ sizeof(struct dump_scheme));
++ memcpy(&config->fmt, dump_config.dumper->fmt, sizeof(struct dump_fmt));
++ memcpy(&config->dev[0], dump_config.dumper->dev,
++ sizeof(struct dump_anydev));
++ memcpy(&config->filter_table, dump_config.dumper->filter,
++ sizeof(struct dump_data_filter)*MAX_PASSES);
++
++ /* handle to saved mem pages */
++ memcpy(&config->memdev, dump_memdev, sizeof(struct dump_memdev));
++
++ config->magic = DUMP_MAGIC_LIVE;
++
++ return 0;
++}
++
++int dump_init_stage2(struct dump_config_block *saved_config)
++{
++ int err = 0;
++
++ pr_debug("dump_init_stage2\n");
++ /* Check if dump from previous boot exists */
++ if (saved_config) {
++ printk("loading dumper from previous boot \n");
++ /* load and configure dumper from previous boot */
++ if ((err = dump_load_config(saved_config)))
++ return err;
++
++ if (!dump_oncpu) {
++ if ((err = dump_configure(dump_config.dump_device))) {
++ printk("Stage 2 dump configure failed\n");
++ return err;
++ }
++ }
++
++ dumper_reset();
++ dump_dev = dump_config.dumper->dev;
++ /* write out the dump */
++ err = dump_generic_execute(NULL, NULL);
++
++ dump_saved_config = NULL;
++
++ if (!dump_oncpu) {
++ dump_unconfigure();
++ }
++
++ return err;
++
++ } else {
++ /* no dump to write out */
++ printk("no dumper from previous boot \n");
++ return 0;
++ }
++}
++
++extern void dump_mem_markpages(struct dump_memdev *);
++
++int dump_switchover_stage(void)
++{
++ int ret = 0;
++
++ /* trigger stage 2 right away - in real life this would be after soft-boot */
++ /* dump_saved_config would be a boot param */
++ saved_dump_memdev = dump_memdev;
++ saved_dumper = dump_config.dumper;
++ ret = dump_init_stage2(dump_saved_config);
++ dump_memdev = saved_dump_memdev;
++ dump_config.dumper = saved_dumper;
++ return ret;
++}
++
++int dump_activate_softboot(void)
++{
++ int err = 0;
++#ifdef CONFIG_KEXEC
++ int num_cpus_online = 0;
++ struct kimage *image;
++#endif
++
++ /* temporary - switchover to writeout previously saved dump */
++#ifndef CONFIG_KEXEC
++ err = dump_switchover_stage(); /* non-disruptive case */
++ if (dump_oncpu)
++ dump_config.dumper = &dumper_stage1; /* set things back */
++
++ return err;
++#else
++
++ dump_silence_level = DUMP_HALT_CPUS;
++ /* wait till we become the only online cpu */
++ while ((num_cpus_online = num_online_cpus()) > 1)
++ cpu_relax();
++
++ /* now call into kexec */
++
++ image = xchg(&kexec_image, 0);
++ if (image) {
++ mdelay(panic_timeout*1000);
++ machine_kexec(image);
++ }
++
++ /*
++ * TBD/Fixme:
++ * - should we call reboot notifiers ? inappropriate for panic ?
++ * - what about device_shutdown() ?
++ * - is explicit bus master disabling needed, or can we do
++ * that through driverfs ?
++ */
++ return 0;
++#endif
++}
++
++/* --- DUMP SCHEME ROUTINES --- */
++
++static inline int dump_buf_pending(struct dumper *dumper)
++{
++ return (dumper->curr_buf - dumper->dump_buf);
++}
++
++/* Invoked during stage 1 of soft-reboot based dumping */
++int dump_overlay_sequencer(void)
++{
++ struct dump_data_filter *filter = dump_config.dumper->filter;
++ struct dump_data_filter *filter2 = dumper_stage2.filter;
++ int pass = 0, err = 0, save = 0;
++ int (*action)(unsigned long, unsigned long);
++
++ /* Make sure gzip compression is being used */
++ if (dump_config.dumper->compress->compress_type != DUMP_COMPRESS_GZIP) {
++ printk(" Please set GZIP compression \n");
++ return -EINVAL;
++ }
++
++ /* start filling in dump data right after the header */
++ dump_config.dumper->curr_offset =
++ PAGE_ALIGN(dump_config.dumper->header_len);
++
++ /* Locate the last pass */
++ for (; filter->selector; filter++, pass++);
++
++ /*
++ * Start from the end backwards: overlay involves a reverse
++ * ordering of passes, since less critical pages are more
++ * likely to be reusable as scratch space once we are through
++ * with them.
++ */
++ for (--pass, --filter; pass >= 0; pass--, filter--) {
++ /* Assumes passes are exclusive (even across dumpers) */
++ /* Requires care when coding the selection functions */
++ if ((save = filter->level_mask & dump_config.level))
++ action = dump_save_data;
++ else
++ action = dump_skip_data;
++
++ /* Remember the offset where this pass started */
++ /* The second stage dumper would use this */
++ if (dump_buf_pending(dump_config.dumper) & (PAGE_SIZE - 1)) {
++ pr_debug("Starting pass %d with pending data\n", pass);
++ pr_debug("filling dummy data to page-align it\n");
++ dump_config.dumper->curr_buf = (void *)PAGE_ALIGN(
++ (unsigned long)dump_config.dumper->curr_buf);
++ }
++
++ filter2[pass].start[0] = dump_config.dumper->curr_offset
++ + dump_buf_pending(dump_config.dumper);
++
++ err = dump_iterator(pass, action, filter);
++
++ filter2[pass].end[0] = dump_config.dumper->curr_offset
++ + dump_buf_pending(dump_config.dumper);
++ filter2[pass].num_mbanks = 1;
++
++ if (err < 0) {
++ printk("dump_overlay_seq: failure %d in pass %d\n",
++ err, pass);
++ break;
++ }
++ printk("\n %d overlay pages %s of %d each in pass %d\n",
++ err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass);
++ }
++
++ return err;
++}
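++
++/*
++ * Bookkeeping example: if pass 2 starts at curr_offset 0x2000 with
++ * 0x200 bytes pending in the dump buffer, filter2[2].start[0] becomes
++ * 0x2200; the same sum taken after the pass yields end[0], so the
++ * stage 2 iterator can replay exactly that logical byte range.
++ */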
++
++/* from dump_memdev.c */
++extern struct page *dump_mem_lookup(struct dump_memdev *dev, unsigned long loc);
++extern struct page *dump_mem_next_page(struct dump_memdev *dev);
++
++static inline struct page *dump_get_saved_page(loff_t loc)
++{
++ return (dump_mem_lookup(dump_memdev, loc >> PAGE_SHIFT));
++}
++
++static inline struct page *dump_next_saved_page(void)
++{
++ return (dump_mem_next_page(dump_memdev));
++}
++
++/*
++ * Iterates over list of saved dump pages. Invoked during second stage of
++ * soft boot dumping
++ *
++ * Observation: If additional selection is desired at this stage then
++ * a different iterator could be written which would advance
++ * to the next page header every time instead of blindly picking up
++ * the data. In such a case loc would be interpreted differently.
++ * At this moment however a blind pass seems sufficient, cleaner and
++ * faster.
++ */
++int dump_saved_data_iterator(int pass, int (*action)(unsigned long,
++ unsigned long), struct dump_data_filter *filter)
++{
++ loff_t loc, end;
++ struct page *page;
++ unsigned long count = 0;
++ int i, err = 0;
++ unsigned long sz;
++
++ for (i = 0; i < filter->num_mbanks; i++) {
++ loc = filter->start[i];
++ end = filter->end[i];
++ printk("pass %d, start off 0x%llx end offset 0x%llx\n", pass,
++ loc, end);
++
++ /* loc will get treated as logical offset into stage 1 */
++ page = dump_get_saved_page(loc);
++
++ for (; loc < end; loc += PAGE_SIZE) {
++ dump_config.dumper->curr_loc = loc;
++ if (!page) {
++ printk("no more saved data for pass %d\n",
++ pass);
++ break;
++ }
++ sz = (loc + PAGE_SIZE > end) ? end - loc : PAGE_SIZE;
++
++ if (page && filter->selector(pass, (unsigned long)page,
++ PAGE_SIZE)) {
++ pr_debug("mem offset 0x%llx\n", loc);
++ if ((err = action((unsigned long)page, sz)))
++ break;
++ else
++ count++;
++ /* clear the contents of page */
++ /* fixme: consider using KM_DUMP instead */
++ clear_highpage(page);
++
++ }
++ page = dump_next_saved_page();
++ }
++ }
++
++ return err ? err : count;
++}
++
++static inline int dump_overlay_pages_done(struct page *page, int nr)
++{
++ int ret=0;
++
++ for (; nr ; page++, nr--) {
++ if (dump_check_and_free_page(dump_memdev, page))
++ ret++;
++ }
++ return ret;
++}
++
++int dump_overlay_save_data(unsigned long loc, unsigned long len)
++{
++ int err = 0;
++ struct page *page = (struct page *)loc;
++ static unsigned long cnt = 0;
++
++ if ((err = dump_generic_save_data(loc, len)))
++ return err;
++
++ if (dump_overlay_pages_done(page, len >> PAGE_SHIFT)) {
++ cnt++;
++ if (!(cnt & 0x7f))
++ pr_debug("released page 0x%lx\n", page_to_pfn(page));
++ }
++
++ return err;
++}
++
++
++int dump_overlay_skip_data(unsigned long loc, unsigned long len)
++{
++ struct page *page = (struct page *)loc;
++
++ dump_overlay_pages_done(page, len >> PAGE_SHIFT);
++ return 0;
++}
++
++int dump_overlay_resume(void)
++{
++ int err = 0;
++
++ /*
++ * switch to stage 2 dumper, save dump_config_block
++ * and then trigger a soft-boot
++ */
++ dumper_stage2.header_len = dump_config.dumper->header_len;
++ dump_config.dumper = &dumper_stage2;
++ if ((err = dump_save_config(dump_saved_config)))
++ return err;
++
++ dump_dev = dump_config.dumper->dev;
++
++#ifdef CONFIG_KEXEC
++ /* If we are doing a disruptive dump, activate softboot now */
++ if ((panic_timeout > 0) && (!(dump_config.flags & DUMP_FLAGS_NONDISRUPT)))
++ err = dump_activate_softboot();
++#endif
++
++ return err;
++}
++
++int dump_overlay_configure(unsigned long devid)
++{
++ struct dump_dev *dev;
++ struct dump_config_block *saved_config = dump_saved_config;
++ int err = 0;
++
++ /* If there is a previously saved dump, write it out first */
++ if (saved_config) {
++ printk("Processing old dump pending writeout\n");
++ err = dump_switchover_stage();
++ if (err) {
++ printk("failed to writeout saved dump\n");
++ return err;
++ }
++ dump_free_mem(saved_config); /* testing only: not after boot */
++ }
++
++ dev = dumper_stage2.dev = dump_config.dumper->dev;
++ /* From here on the intermediate dump target is memory-only */
++ dump_dev = dump_config.dumper->dev = &dump_memdev->ddev;
++ if ((err = dump_generic_configure(0))) {
++ printk("dump generic configure failed: err %d\n", err);
++ return err;
++ }
++ /* temporary */
++ dumper_stage2.dump_buf = dump_config.dumper->dump_buf;
++
++ /* Sanity check on the actual target dump device */
++ if (!dev || (err = dev->ops->open(dev, devid))) {
++ return err;
++ }
++ /* TBD: should we release the target if this is soft-boot only ? */
++
++ /* alloc a dump config block area to save across reboot */
++ if (!(dump_saved_config = dump_alloc_mem(sizeof(struct
++ dump_config_block)))) {
++ printk("dump config block alloc failed\n");
++ /* undo configure */
++ dump_generic_unconfigure();
++ return -ENOMEM;
++ }
++ dump_config.dump_addr = (unsigned long)dump_saved_config;
++ printk("Dump config block of size %d set up at 0x%lx\n",
++ sizeof(*dump_saved_config), (unsigned long)dump_saved_config);
++ return 0;
++}
++
++int dump_overlay_unconfigure(void)
++{
++ struct dump_dev *dev = dumper_stage2.dev;
++ int err = 0;
++
++ pr_debug("dump_overlay_unconfigure\n");
++ /* Close the secondary device */
++ dev->ops->release(dev);
++ pr_debug("released secondary device\n");
++
++ err = dump_generic_unconfigure();
++ pr_debug("Unconfigured generic portions\n");
++ dump_free_mem(dump_saved_config);
++ dump_saved_config = NULL;
++ pr_debug("Freed saved config block\n");
++ dump_dev = dump_config.dumper->dev = dumper_stage2.dev;
++
++ printk("Unconfigured overlay dumper\n");
++ return err;
++}
++
++int dump_staged_unconfigure(void)
++{
++ int err = 0;
++ struct dump_config_block *saved_config = dump_saved_config;
++ struct dump_dev *dev;
++
++ pr_debug("dump_staged_unconfigure\n");
++ err = dump_generic_unconfigure();
++
++ /* now check if there is a saved dump waiting to be written out */
++ if (saved_config) {
++ printk("Processing saved dump pending writeout\n");
++ if ((err = dump_switchover_stage())) {
++ printk("Error in commiting saved dump at 0x%lx\n",
++ (unsigned long)saved_config);
++ printk("Old dump may hog memory\n");
++ } else {
++ dump_free_mem(saved_config);
++ pr_debug("Freed saved config block\n");
++ }
++ dump_saved_config = NULL;
++ } else {
++ dev = &dump_memdev->ddev;
++ dev->ops->release(dev);
++ }
++ printk("Unconfigured second stage dumper\n");
++
++ return err;
++}
++
++/* ----- PASSTHRU FILTER ROUTINE --------- */
++
++/* transparent - passes everything through */
++int dump_passthru_filter(int pass, unsigned long loc, unsigned long sz)
++{
++ return 1;
++}
++
++/* ----- PASSTHRU FORMAT ROUTINES ---- */
++
++
++int dump_passthru_configure_header(const char *panic_str, const struct pt_regs *regs)
++{
++ dump_config.dumper->header_dirty++;
++ return 0;
++}
++
++/* Copies bytes of data from page(s) to the specified buffer */
++int dump_copy_pages(void *buf, struct page *page, unsigned long sz)
++{
++ unsigned long len = 0, bytes;
++ void *addr;
++
++ while (len < sz) {
++ addr = kmap_atomic(page, KM_DUMP);
++ bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len;
++ memcpy(buf, addr, bytes);
++ kunmap_atomic(addr, KM_DUMP);
++ buf += bytes;
++ len += bytes;
++ page++;
++ }
++ /* memset(dump_config.dumper->curr_buf, 0x57, len); temporary */
++
++ return sz - len;
++}
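++
++/*
++ * Note: the page++ walk above assumes 'page' begins a run of adjacent
++ * struct pages (contiguous pfns); e.g. copying a 2-page header means
++ * sz == 2 * PAGE_SIZE taken from two neighbouring frames.
++ */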
++
++int dump_passthru_update_header(void)
++{
++ long len = dump_config.dumper->header_len;
++ struct page *page;
++ void *buf = dump_config.dumper->dump_buf;
++ int err = 0;
++
++ if (!dump_config.dumper->header_dirty)
++ return 0;
++
++ pr_debug("Copying header of size %ld bytes from memory\n", len);
++ if (len > DUMP_BUFFER_SIZE)
++ return -E2BIG;
++
++ page = dump_mem_lookup(dump_memdev, 0);
++ for (; (len > 0) && page; buf += PAGE_SIZE, len -= PAGE_SIZE) {
++ if ((err = dump_copy_pages(buf, page, PAGE_SIZE)))
++ return err;
++ page = dump_mem_next_page(dump_memdev);
++ }
++ if (len > 0) {
++ printk("Incomplete header saved in mem\n");
++ return -ENOENT;
++ }
++
++ if ((err = dump_dev_seek(0))) {
++ printk("Unable to seek to dump header offset\n");
++ return err;
++ }
++ err = dump_ll_write(dump_config.dumper->dump_buf,
++ buf - dump_config.dumper->dump_buf);
++ if (err < dump_config.dumper->header_len)
++ return (err < 0) ? err : -ENOSPC;
++
++ dump_config.dumper->header_dirty = 0;
++ return 0;
++}
++
++static loff_t next_dph_offset = 0;
++
++static int dph_valid(struct __dump_page *dph)
++{
++ if ((dph->dp_address & (PAGE_SIZE - 1)) || (dph->dp_flags
++ > DUMP_DH_COMPRESSED) || (!dph->dp_flags) ||
++ (dph->dp_size > PAGE_SIZE)) {
++ printk("dp->address = 0x%llx, dp->size = 0x%x, dp->flag = 0x%x\n",
++ dph->dp_address, dph->dp_size, dph->dp_flags);
++ return 0;
++ }
++ return 1;
++}
++
++int dump_verify_lcrash_data(void *buf, unsigned long sz)
++{
++ struct __dump_page *dph;
++
++ /* sanity check for page headers */
++ while (next_dph_offset + sizeof(*dph) < sz) {
++ dph = (struct __dump_page *)(buf + next_dph_offset);
++ if (!dph_valid(dph)) {
++ printk("Invalid page hdr at offset 0x%llx\n",
++ next_dph_offset);
++ return -EINVAL;
++ }
++ next_dph_offset += dph->dp_size + sizeof(*dph);
++ }
++
++ next_dph_offset -= sz;
++ return 0;
++}
++
++/*
++ * TBD/Later: Consider avoiding the copy by using a scatter/gather
++ * vector representation for the dump buffer
++ */
++int dump_passthru_add_data(unsigned long loc, unsigned long sz)
++{
++ struct page *page = (struct page *)loc;
++ void *buf = dump_config.dumper->curr_buf;
++ int err = 0;
++
++ if ((err = dump_copy_pages(buf, page, sz))) {
++ printk("dump_copy_pages failed");
++ return err;
++ }
++
++ if ((err = dump_verify_lcrash_data(buf, sz))) {
++ printk("dump_verify_lcrash_data failed\n");
++ printk("Invalid data for pfn 0x%lx\n", page_to_pfn(page));
++ printk("Page flags 0x%lx\n", page->flags);
++ printk("Page count 0x%x\n", page_count(page));
++ return err;
++ }
++
++ dump_config.dumper->curr_buf = buf + sz;
++
++ return 0;
++}
++
++
++/* Stage 1 dumper: Saves compressed dump in memory and soft-boots system */
++
++/* Scheme to overlay saved data in memory for writeout after a soft-boot */
++struct dump_scheme_ops dump_scheme_overlay_ops = {
++ .configure = dump_overlay_configure,
++ .unconfigure = dump_overlay_unconfigure,
++ .sequencer = dump_overlay_sequencer,
++ .iterator = dump_page_iterator,
++ .save_data = dump_overlay_save_data,
++ .skip_data = dump_overlay_skip_data,
++ .write_buffer = dump_generic_write_buffer
++};
++
++struct dump_scheme dump_scheme_overlay = {
++ .name = "overlay",
++ .ops = &dump_scheme_overlay_ops
++};
++
++
++/* Stage 1 must use a good compression scheme - default to gzip */
++extern struct __dump_compress dump_gzip_compression;
++
++struct dumper dumper_stage1 = {
++ .name = "stage1",
++ .scheme = &dump_scheme_overlay,
++ .fmt = &dump_fmt_lcrash,
++ .compress = &dump_none_compression, /* needs to be gzip */
++ .filter = dump_filter_table,
++ .dev = NULL,
++};
++
++/* Stage 2 dumper: Activated after softboot to write out saved dump to device */
++
++/* Formatter that transfers data as is (transparent) w/o further conversion */
++struct dump_fmt_ops dump_fmt_passthru_ops = {
++ .configure_header = dump_passthru_configure_header,
++ .update_header = dump_passthru_update_header,
++ .save_context = NULL, /* unused */
++ .add_data = dump_passthru_add_data,
++ .update_end_marker = dump_lcrash_update_end_marker
++};
++
++struct dump_fmt dump_fmt_passthru = {
++ .name = "passthru",
++ .ops = &dump_fmt_passthru_ops
++};
++
++/* Filter that simply passes along any data within the range (transparent)*/
++/* Note: The start and end ranges in the table are filled in at run-time */
++
++extern int dump_filter_none(int pass, unsigned long loc, unsigned long sz);
++
++struct dump_data_filter dump_passthru_filtertable[MAX_PASSES] = {
++{.name = "passkern", .selector = dump_passthru_filter,
++ .level_mask = DUMP_MASK_KERN },
++{.name = "passuser", .selector = dump_passthru_filter,
++ .level_mask = DUMP_MASK_USED },
++{.name = "passunused", .selector = dump_passthru_filter,
++ .level_mask = DUMP_MASK_UNUSED },
++{.name = "none", .selector = dump_filter_none,
++ .level_mask = DUMP_MASK_REST }
++};
++
++
++/* Scheme to handle data staged / preserved across a soft-boot */
++struct dump_scheme_ops dump_scheme_staged_ops = {
++ .configure = dump_generic_configure,
++ .unconfigure = dump_staged_unconfigure,
++ .sequencer = dump_generic_sequencer,
++ .iterator = dump_saved_data_iterator,
++ .save_data = dump_generic_save_data,
++ .skip_data = dump_generic_skip_data,
++ .write_buffer = dump_generic_write_buffer
++};
++
++struct dump_scheme dump_scheme_staged = {
++ .name = "staged",
++ .ops = &dump_scheme_staged_ops
++};
++
++/* The stage 2 dumper comprising all these */
++struct dumper dumper_stage2 = {
++ .name = "stage2",
++ .scheme = &dump_scheme_staged,
++ .fmt = &dump_fmt_passthru,
++ .compress = &dump_none_compression,
++ .filter = dump_passthru_filtertable,
++ .dev = NULL,
++};
++
+Index: linux-2.6.10/drivers/dump/dump_memdev.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_memdev.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_memdev.c 2005-04-05 16:47:53.947204496 +0800
+@@ -0,0 +1,655 @@
++/*
++ * Implements the dump driver interface for saving a dump in available
++ * memory areas. The saved pages may be written out to persistent storage
++ * after a soft reboot.
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ *
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ *
++ * The approach of tracking pages containing saved dump using map pages
++ * allocated as needed has been derived from the Mission Critical Linux
++ * mcore dump implementation.
++ *
++ * Credits and a big thanks for letting the lkcd project make use of
++ * the excellent piece of work and also helping with clarifications
++ * and tips along the way are due to:
++ * Dave Winchell <winchell@mclx.com> (primary author of mcore)
++ * Jeff Moyer <moyer@mclx.com>
++ * Josh Huber <huber@mclx.com>
++ *
++ * For those familiar with the mcore code, the main differences worth
++ * noting here (besides the dump device abstraction) result from enabling
++ * "high" memory pages (pages not permanently mapped in the kernel
++ * address space) to be used for saving dump data (because of which a
++ * simple virtual address based linked list cannot be used anymore for
++ * managing free pages), an added level of indirection for faster
++ * lookups during the post-boot stage, and the idea of pages being
++ * made available as they get freed up while dump to memory progresses
++ * rather than one time before starting the dump. The last point enables
++ * a full memory snapshot to be saved starting with an initial set of
++ * bootstrap pages given a good compression ratio. (See dump_overlay.c)
++ *
++ */
++
++/*
++ * -----------------MEMORY LAYOUT ------------------
++ * The memory space consists of a set of discontiguous pages, and
++ * discontiguous map pages as well, rooted in a chain of indirect
++ * map pages (also discontiguous). Except for the indirect maps
++ * (which must be preallocated in advance), the rest of the pages
++ * could be in high memory.
++ *
++ * root
++ * | --------- -------- --------
++ * --> | . . +|--->| . +|------->| . . | indirect
++ * --|--|--- ---|---- --|-|--- maps
++ * | | | | |
++ * ------ ------ ------- ------ -------
++ * | . | | . | | . . | | . | | . . | maps
++ * --|--- --|--- --|--|-- --|--- ---|-|--
++ * page page page page page page page data
++ * pages
++ *
++ * Writes to the dump device happen sequentially in append mode.
++ * The main reason for the existence of the indirect map is
++ * to enable a quick way to lookup a specific logical offset in
++ * the saved data post-soft-boot, e.g. to writeout pages
++ * with more critical data first, even though such pages
++ * would have been compressed and copied last, being the lowest
++ * ranked candidates for reuse due to their criticality.
++ * (See dump_overlay.c)
++ */
++#include <linux/mm.h>
++#include <linux/highmem.h>
++#include <linux/bootmem.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++
++#define DUMP_MAP_SZ (PAGE_SIZE / sizeof(unsigned long)) /* direct map size */
++#define DUMP_IND_MAP_SZ (DUMP_MAP_SZ - 1) /* indirect map size */
++#define DUMP_NR_BOOTSTRAP 64 /* no of bootstrap pages */
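++
++/*
++ * With 4K pages and 8-byte longs, DUMP_MAP_SZ is 512: one direct map
++ * page addresses 2MB of saved data, and one indirect map chains 511
++ * direct maps (its last slot links to the next indirect map).
++ */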
++
++extern int dump_low_page(struct page *);
++
++/* check if the next entry crosses a page boundary */
++static inline int is_last_map_entry(unsigned long *map)
++{
++ unsigned long addr = (unsigned long)(map + 1);
++
++ return (!(addr & (PAGE_SIZE - 1)));
++}
++
++/* Todo: should have some validation checks */
++/* The last entry in the indirect map points to the next indirect map */
++/* Indirect maps are referred to directly by virtual address */
++static inline unsigned long *next_indirect_map(unsigned long *map)
++{
++ return (unsigned long *)map[DUMP_IND_MAP_SZ];
++}
++
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++/* Called during early bootup - fixme: make this __init */
++void dump_early_reserve_map(struct dump_memdev *dev)
++{
++ unsigned long *map1, *map2;
++ loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++ int i, j;
++
++ printk("Reserve bootmap space holding previous dump of %lld pages\n",
++ last);
++ map1 = (unsigned long *)dev->indirect_map_root;
++
++ while (map1 && (off < last)) {
++#ifdef CONFIG_X86_64
++ reserve_bootmem_node(NODE_DATA(0), virt_to_phys((void *)map1),
++ PAGE_SIZE);
++#else
++ reserve_bootmem(virt_to_phys((void *)map1), PAGE_SIZE);
++#endif
++ for (i=0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++ i++, off += DUMP_MAP_SZ) {
++ pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++ if (map1[i] >= max_low_pfn)
++ continue;
++#ifdef CONFIG_X86_64
++ reserve_bootmem_node(NODE_DATA(0),
++ map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#else
++ reserve_bootmem(map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#endif
++ map2 = pfn_to_kaddr(map1[i]);
++ for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] &&
++ (off + j < last); j++) {
++ pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++ map2[j]);
++ if (map2[j] < max_low_pfn) {
++#ifdef CONFIG_X86_64
++ reserve_bootmem_node(NODE_DATA(0),
++ map2[j] << PAGE_SHIFT,
++ PAGE_SIZE);
++#else
++ reserve_bootmem(map2[j] << PAGE_SHIFT,
++ PAGE_SIZE);
++#endif
++ }
++ }
++ }
++ map1 = next_indirect_map(map1);
++ }
++ dev->nr_free = 0; /* these pages don't belong to this boot */
++}
++#endif
++
++/* mark dump pages so that they aren't used by this kernel */
++void dump_mark_map(struct dump_memdev *dev)
++{
++ unsigned long *map1, *map2;
++ loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++ struct page *page;
++ int i, j;
++
++ printk("Dump: marking pages in use by previous dump\n");
++ map1 = (unsigned long *)dev->indirect_map_root;
++
++ while (map1 && (off < last)) {
++ page = virt_to_page(map1);
++ set_page_count(page, 1);
++ for (i=0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++ i++, off += DUMP_MAP_SZ) {
++ pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++ page = pfn_to_page(map1[i]);
++ set_page_count(page, 1);
++ map2 = kmap_atomic(page, KM_DUMP);
++ for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] &&
++ (off + j < last); j++) {
++ pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++ map2[j]);
++ page = pfn_to_page(map2[j]);
++ set_page_count(page, 1);
++ }
++ }
++ map1 = next_indirect_map(map1);
++ }
++}
++
++
++/*
++ * Given a logical offset into the mem device lookup the
++ * corresponding page
++ * loc is specified in units of pages
++ * Note: affects curr_map (even in the case where lookup fails)
++ */
++struct page *dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc)
++{
++ unsigned long *map;
++ unsigned long i, index = loc / DUMP_MAP_SZ;
++ struct page *page = NULL;
++ unsigned long curr_pfn, curr_map, *curr_map_ptr = NULL;
++
++ map = (unsigned long *)dump_mdev->indirect_map_root;
++ if (!map)
++ return NULL;
++ if (loc > dump_mdev->last_offset >> PAGE_SHIFT)
++ return NULL;
++
++ /*
++ * first locate the right indirect map
++ * in the chain of indirect maps
++ */
++ for (i = 0; i + DUMP_IND_MAP_SZ <= index; i += DUMP_IND_MAP_SZ) {
++ if (!(map = next_indirect_map(map)))
++ return NULL;
++ }
++ /* then the right direct map */
++ /* map entries are referred to by page index */
++ if ((curr_map = map[index - i])) {
++ page = pfn_to_page(curr_map);
++ /* update the current traversal index */
++ /* dump_mdev->curr_map = &map[index - i];*/
++ curr_map_ptr = &map[index - i];
++ }
++
++ if (page)
++ map = kmap_atomic(page, KM_DUMP);
++ else
++ return NULL;
++
++ /* and finally the right entry therein */
++ /* data pages are referred to by page index */
++ i = index * DUMP_MAP_SZ;
++ if ((curr_pfn = map[loc - i])) {
++ page = pfn_to_page(curr_pfn);
++ dump_mdev->curr_map = curr_map_ptr;
++ dump_mdev->curr_map_offset = loc - i;
++ dump_mdev->ddev.curr_offset = loc << PAGE_SHIFT;
++ } else {
++ page = NULL;
++ }
++ kunmap_atomic(map, KM_DUMP);
++
++ return page;
++}
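++
++/*
++ * Worked lookup (64-bit, 4K pages, so DUMP_MAP_SZ == 512): loc == 150000
++ * gives index == 292, which the first indirect map covers (indices
++ * 0..510); slot 292 there names the direct map page, and entry
++ * 150000 - 292 * 512 == 496 within it holds the data pfn.
++ */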
++
++/*
++ * Retrieves a pointer to the next page in the dump device
++ * Used during the lookup pass post-soft-reboot
++ */
++struct page *dump_mem_next_page(struct dump_memdev *dev)
++{
++ unsigned long i;
++ unsigned long *map;
++ struct page *page = NULL;
++
++ if (dev->ddev.curr_offset + PAGE_SIZE >= dev->last_offset) {
++ return NULL;
++ }
++
++ if ((i = (unsigned long)(++dev->curr_map_offset)) >= DUMP_MAP_SZ) {
++ /* move to next map */
++ if (is_last_map_entry(++dev->curr_map)) {
++ /* move to the next indirect map page */
++ printk("dump_mem_next_page: go to next indirect map\n");
++ dev->curr_map = (unsigned long *)*dev->curr_map;
++ if (!dev->curr_map)
++ return NULL;
++ }
++ i = dev->curr_map_offset = 0;
++ pr_debug("dump_mem_next_page: next map 0x%lx, entry 0x%lx\n",
++ dev->curr_map, *dev->curr_map);
++
++ }
++
++ if (*dev->curr_map) {
++ map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP);
++ if (map[i])
++ page = pfn_to_page(map[i]);
++ kunmap_atomic(map, KM_DUMP);
++ dev->ddev.curr_offset += PAGE_SIZE;
++ }
++
++ return page;
++}
++
++/* Copied from dump_filters.c */
++static inline int kernel_page(struct page *p)
++{
++ /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */
++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p));
++}
++
++static inline int user_page(struct page *p)
++{
++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p));
++}
++
++int dump_reused_by_boot(struct page *page)
++{
++ /* Todo
++ * Checks:
++ * if PageReserved
++ * if < __end + bootmem_bootmap_pages for this boot + allowance
++ * if overwritten by initrd (how to check ?)
++ * Also, add more checks in early boot code
++ * e.g. bootmem bootmap alloc verify not overwriting dump, and if
++ * so then realloc or move the dump pages out accordingly.
++ */
++
++ /* Temporary proof of concept hack, avoid overwriting kern pages */
++
++ return (kernel_page(page) || dump_low_page(page) || user_page(page));
++}
++
++
++/* Uses the free page passed in to expand available space */
++int dump_mem_add_space(struct dump_memdev *dev, struct page *page)
++{
++ struct page *map_page;
++ unsigned long *map;
++ unsigned long i;
++
++ if (!dev->curr_map)
++ return -ENOMEM; /* must've exhausted indirect map */
++
++ if (!*dev->curr_map || dev->curr_map_offset >= DUMP_MAP_SZ) {
++ /* add map space */
++ *dev->curr_map = page_to_pfn(page);
++ dev->curr_map_offset = 0;
++ return 0;
++ }
++
++ /* add data space */
++ i = dev->curr_map_offset;
++ map_page = pfn_to_page(*dev->curr_map);
++ map = (unsigned long *)kmap_atomic(map_page, KM_DUMP);
++ map[i] = page_to_pfn(page);
++ kunmap_atomic(map, KM_DUMP);
++ dev->curr_map_offset = ++i;
++ dev->last_offset += PAGE_SIZE;
++ if (i >= DUMP_MAP_SZ) {
++ /* move to next map */
++ if (is_last_map_entry(++dev->curr_map)) {
++ /* move to the next indirect map page */
++ pr_debug("dump_mem_add_space: using next"
++ "indirect map\n");
++ dev->curr_map = (unsigned long *)*dev->curr_map;
++ }
++ }
++ return 0;
++}
++
++
++/* Caution: making a dest page invalidates existing contents of the page */
++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page)
++{
++ int err = 0;
++
++ /*
++ * the page can be used as a destination only if we are sure
++ * it won't get overwritten by the soft-boot, and is not
++ * critical for us right now.
++ */
++ if (dump_reused_by_boot(page))
++ return 0;
++
++ if ((err = dump_mem_add_space(dev, page))) {
++ printk("Warning: Unable to extend memdev space. Err %d\n",
++ err);
++ return 0;
++ }
++
++ dev->nr_free++;
++ return 1;
++}
++
++
++/* Set up the initial maps and bootstrap space */
++/* Must be called only after any previous dump is written out */
++int dump_mem_open(struct dump_dev *dev, unsigned long devid)
++{
++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
++ unsigned long nr_maps, *map, *prev_map = &dump_mdev->indirect_map_root;
++ void *addr;
++ struct page *page;
++ unsigned long i = 0;
++ int err = 0;
++
++ /* Todo: sanity check for unwritten previous dump */
++
++ /* allocate pages for indirect map (non highmem area) */
++ nr_maps = num_physpages / DUMP_MAP_SZ; /* maps to cover entire mem */
++ for (i = 0; i < nr_maps; i += DUMP_IND_MAP_SZ) {
++ if (!(map = (unsigned long *)dump_alloc_mem(PAGE_SIZE))) {
++ printk("Unable to alloc indirect map %ld\n",
++ i / DUMP_IND_MAP_SZ);
++ return -ENOMEM;
++ }
++ clear_page(map);
++ *prev_map = (unsigned long)map;
++ prev_map = &map[DUMP_IND_MAP_SZ];
++ }
++
++ dump_mdev->curr_map = (unsigned long *)dump_mdev->indirect_map_root;
++ dump_mdev->curr_map_offset = 0;
++
++ /*
++ * allocate a few bootstrap pages: at least 1 map and 1 data page
++ * plus enough to save the dump header
++ */
++ i = 0;
++ do {
++ if (!(addr = dump_alloc_mem(PAGE_SIZE))) {
++ printk("Unable to alloc bootstrap page %ld\n", i);
++ return -ENOMEM;
++ }
++
++ page = virt_to_page(addr);
++ if (dump_low_page(page)) {
++ dump_free_mem(addr);
++ continue;
++ }
++
++ if ((err = dump_mem_add_space(dump_mdev, page))) {
++ printk("Warning: Unable to extend memdev "
++ "space. Err %d\n", err);
++ dump_free_mem(addr);
++ continue;
++ }
++ i++;
++ } while (i < DUMP_NR_BOOTSTRAP);
++
++ printk("dump memdev init: %ld maps, %ld bootstrap pgs, %ld free pgs\n",
++ nr_maps, i, dump_mdev->last_offset >> PAGE_SHIFT);
++
++ dump_mdev->last_bs_offset = dump_mdev->last_offset;
++
++ return 0;
++}
++
++/* Releases all pre-alloc'd pages */
++int dump_mem_release(struct dump_dev *dev)
++{
++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
++ struct page *page, *map_page;
++ unsigned long *map, *prev_map;
++ void *addr;
++ int i;
++
++ if (!dump_mdev->nr_free)
++ return 0;
++
++ pr_debug("dump_mem_release\n");
++ page = dump_mem_lookup(dump_mdev, 0);
++ for (i = 0; page && (i < DUMP_NR_BOOTSTRAP - 1); i++) {
++ if (PageHighMem(page))
++ break;
++ addr = page_address(page);
++ if (!addr) {
++ printk("page_address(%p) = NULL\n", page);
++ break;
++ }
++ pr_debug("Freeing page at 0x%lx\n", addr);
++ dump_free_mem(addr);
++ if (dump_mdev->curr_map_offset >= DUMP_MAP_SZ - 1) {
++ map_page = pfn_to_page(*dump_mdev->curr_map);
++ if (PageHighMem(map_page))
++ break;
++ page = dump_mem_next_page(dump_mdev);
++ addr = page_address(map_page);
++ if (!addr) {
++ printk("page_address(%p) = NULL\n",
++ map_page);
++ break;
++ }
++ pr_debug("Freeing map page at 0x%lx\n", addr);
++ dump_free_mem(addr);
++ i++;
++ } else {
++ page = dump_mem_next_page(dump_mdev);
++ }
++ }
++
++ /* now for the last used bootstrap page used as a map page */
++ if ((i < DUMP_NR_BOOTSTRAP) && (*dump_mdev->curr_map)) {
++ map_page = pfn_to_page(*dump_mdev->curr_map);
++ if ((map_page) && !PageHighMem(map_page)) {
++ addr = page_address(map_page);
++ if (!addr) {
++ printk("page_address(%p) = NULL\n", map_page);
++ } else {
++ pr_debug("Freeing map page at 0x%lx\n", addr);
++ dump_free_mem(addr);
++ i++;
++ }
++ }
++ }
++
++ printk("Freed %d bootstrap pages\n", i);
++
++ /* free the indirect maps */
++ map = (unsigned long *)dump_mdev->indirect_map_root;
++
++ i = 0;
++ while (map) {
++ prev_map = map;
++ map = next_indirect_map(map);
++ dump_free_mem(prev_map);
++ i++;
++ }
++
++ printk("Freed %d indirect map(s)\n", i);
++
++ /* Reset the indirect map */
++ dump_mdev->indirect_map_root = 0;
++ dump_mdev->curr_map = 0;
++
++ /* Reset the free list */
++ dump_mdev->nr_free = 0;
++
++ dump_mdev->last_offset = dump_mdev->ddev.curr_offset = 0;
++ dump_mdev->last_used_offset = 0;
++ dump_mdev->curr_map = NULL;
++ dump_mdev->curr_map_offset = 0;
++ return 0;
++}
++
++/*
++ * Long term:
++ * It is critical for this to be very strict. Cannot afford
++ * to have anything running and accessing memory while we overwrite
++ * memory (potential risk of data corruption).
++ * If in doubt (e.g if a cpu is hung and not responding) just give
++ * up and refuse to proceed with this scheme.
++ *
++ * Note: I/O will only happen after soft-boot/switchover, so we can
++ * safely disable interrupts and force stop other CPUs if this is
++ * going to be a disruptive dump, no matter what they
++ * are in the middle of.
++ */
++/*
++ * ATM Most of this is already taken care of in the nmi handler
++ * We may halt the cpus right away if we know this is going to be disruptive
++ * For now, since we've limited ourselves to overwriting free pages we
++ * aren't doing much here. Eventually, we'd have to wait to make sure other
++ * cpus aren't using memory we could be overwriting
++ */
++int dump_mem_silence(struct dump_dev *dev)
++{
++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
++
++ if (dump_mdev->last_offset > dump_mdev->last_bs_offset) {
++ /* prefer to run lkcd config & start with a clean slate */
++ return -EEXIST;
++ }
++ return 0;
++}
++
++extern int dump_overlay_resume(void);
++
++/* Trigger the next stage of dumping */
++int dump_mem_resume(struct dump_dev *dev)
++{
++ dump_overlay_resume();
++ return 0;
++}
++
++/*
++ * Allocate mem dev pages as required and copy buffer contents into it.
++ * Fails if no free pages are available
++ * Keeping it simple and limited for starters (can modify this over time)
++ * Does not handle holes or a sparse layout
++ * Data must be in multiples of PAGE_SIZE
++ */
++int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len)
++{
++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
++ struct page *page;
++ unsigned long n = 0;
++ void *addr;
++ unsigned long *saved_curr_map, saved_map_offset;
++ int ret = 0;
++
++ pr_debug("dump_mem_write: offset 0x%llx, size %ld\n",
++ dev->curr_offset, len);
++
++ if (dev->curr_offset + len > dump_mdev->last_offset) {
++ printk("Out of space to write\n");
++ return -ENOSPC;
++ }
++
++ if ((len & (PAGE_SIZE - 1)) || (dev->curr_offset & (PAGE_SIZE - 1)))
++ return -EINVAL; /* not aligned in units of page size */
++
++ saved_curr_map = dump_mdev->curr_map;
++ saved_map_offset = dump_mdev->curr_map_offset;
++ page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT);
++
++ for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) {
++ addr = kmap_atomic(page, KM_DUMP);
++ /* memset(addr, 'x', PAGE_SIZE); */
++ memcpy(addr, buf, PAGE_SIZE);
++ kunmap_atomic(addr, KM_DUMP);
++ /* dev->curr_offset += PAGE_SIZE; */
++ page = dump_mem_next_page(dump_mdev);
++ }
++
++ dump_mdev->curr_map = saved_curr_map;
++ dump_mdev->curr_map_offset = saved_map_offset;
++
++ if (dump_mdev->last_used_offset < dev->curr_offset)
++ dump_mdev->last_used_offset = dev->curr_offset;
++
++	return (len - n) ? (len - n) : ret;
++}
++
++/* dummy - always ready */
++int dump_mem_ready(struct dump_dev *dev, void *buf)
++{
++ return 0;
++}
++
++/*
++ * Should check for availability of space to write up to the offset
++ * affects only the curr_offset; last_offset untouched
++ * Keep it simple: Only allow multiples of PAGE_SIZE for now
++ */
++int dump_mem_seek(struct dump_dev *dev, loff_t offset)
++{
++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
++
++ if (offset & (PAGE_SIZE - 1))
++ return -EINVAL; /* allow page size units only for now */
++
++ /* Are we exceeding available space ? */
++ if (offset > dump_mdev->last_offset) {
++ printk("dump_mem_seek failed for offset 0x%llx\n",
++ offset);
++ return -ENOSPC;
++ }
++
++ dump_mdev->ddev.curr_offset = offset;
++ return 0;
++}
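++
++#if 0	/* Editor's sketch, not part of the original patch: the caller
++	 * contract implied by dump_mem_seek()/dump_mem_write() above --
++	 * offsets and lengths must be PAGE_SIZE multiples and must fit
++	 * below last_offset. The function name is hypothetical. */
++static int memdev_usage_sketch(struct dump_dev *dev, void *buf)
++{
++	int err;
++
++	/* non-page-aligned offsets return -EINVAL, out-of-range -ENOSPC */
++	if ((err = dump_mem_seek(dev, 4 * PAGE_SIZE)))
++		return err;
++
++	/* returns the number of bytes actually copied into memdev pages */
++	return dump_mem_write(dev, buf, 2 * PAGE_SIZE);
++}
++#endif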
++
++struct dump_dev_ops dump_memdev_ops = {
++ .open = dump_mem_open,
++ .release = dump_mem_release,
++ .silence = dump_mem_silence,
++ .resume = dump_mem_resume,
++ .seek = dump_mem_seek,
++ .write = dump_mem_write,
++ .read = NULL, /* not implemented at the moment */
++ .ready = dump_mem_ready
++};
++
++static struct dump_memdev default_dump_memdev = {
++ .ddev = {.type_name = "memdev", .ops = &dump_memdev_ops,
++ .device_id = 0x14}
++ /* assume the rest of the fields are zeroed by default */
++};
++
++/* may be overwritten if a previous dump exists */
++struct dump_memdev *dump_memdev = &default_dump_memdev;
++
+Index: linux-2.6.10/drivers/dump/dump_blockdev.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_blockdev.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_blockdev.c 2005-04-05 16:47:53.945204800 +0800
+@@ -0,0 +1,469 @@
++/*
++ * Implements the dump driver interface for saving a dump to
++ * a block device through the kernel's generic low level block i/o
++ * routines.
++ *
++ * Started: June 2002 - Mohamed Abbas <mohamed.abbas@intel.com>
++ * Moved original lkcd kiobuf dump i/o code from dump_base.c
++ * to use generic dump device interfaces
++ *
++ * Sept 2002 - Bharata B. Rao <bharata@in.ibm.com>
++ * Convert dump i/o to directly use bio instead of kiobuf for 2.5
++ *
++ * Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ * Rework to new dumpdev.h structures, implement open/close/
++ * silence, misc fixes (blocknr removal, bio_add_page usage)
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#include <linux/types.h>
++#include <linux/proc_fs.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/blkdev.h>
++#include <linux/bio.h>
++#include <asm/hardirq.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++
++extern void *dump_page_buf;
++
++/* The end_io callback for dump i/o completion */
++static int
++dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
++{
++ struct dump_blockdev *dump_bdev;
++
++ if (bio->bi_size) {
++ /* some bytes still left to transfer */
++ return 1; /* not complete */
++ }
++
++ dump_bdev = (struct dump_blockdev *)bio->bi_private;
++ if (error) {
++ printk("IO error while writing the dump, aborting\n");
++ }
++
++ dump_bdev->err = error;
++
++ /* no wakeup needed, since caller polls for completion */
++ return 0;
++}
++
++/* Check if the dump bio is already mapped to the specified buffer */
++static int
++dump_block_map_valid(struct dump_blockdev *dev, struct page *page,
++ int len)
++{
++ struct bio *bio = dev->bio;
++ unsigned long bsize = 0;
++
++ if (!bio->bi_vcnt)
++ return 0; /* first time, not mapped */
++
++
++ if ((bio_page(bio) != page) || (len > bio->bi_vcnt << PAGE_SHIFT))
++ return 0; /* buffer not mapped */
++
++ bsize = bdev_hardsect_size(bio->bi_bdev);
++	if ((len & (PAGE_SIZE - 1)) || (len & (bsize - 1)))
++		return 0; /* not page- and sector-aligned, remap needed */
++
++ /* quick check to decide if we need to redo bio_add_page */
++ if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn)
++ return 0; /* device may have other restrictions */
++
++ return 1; /* already mapped */
++}
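++
++#if 0	/* Editor's note, not part of the original patch: the masks above
++	 * assume PAGE_SIZE and the hardsect size are powers of two, so an
++	 * alignment test reduces to a single bit test: */
++static inline int aligned_to(unsigned long x, unsigned long size_pow2)
++{
++	return (x & (size_pow2 - 1)) == 0;	/* e.g. aligned_to(len, bsize) */
++}
++#endif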
++
++/*
++ * Set up the dump bio for i/o from the specified buffer
++ * Return value indicates whether the full buffer could be mapped or not
++ */
++static int
++dump_block_map(struct dump_blockdev *dev, void *buf, int len)
++{
++ struct page *page = virt_to_page(buf);
++ struct bio *bio = dev->bio;
++ unsigned long bsize = 0;
++
++ bio->bi_bdev = dev->bdev;
++ bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9;
++ bio->bi_idx = 0; /* reset index to the beginning */
++
++ if (dump_block_map_valid(dev, page, len)) {
++		/* already mapped and usable right away */
++ bio->bi_size = len; /* reset size to the whole bio */
++ bio->bi_vcnt = (len + PAGE_SIZE - 1) / PAGE_SIZE; /* Set the proper vector cnt */
++ } else {
++ /* need to map the bio */
++ bio->bi_size = 0;
++ bio->bi_vcnt = 0;
++ bsize = bdev_hardsect_size(bio->bi_bdev);
++
++ /* first a few sanity checks */
++ if (len < bsize) {
++			printk("map: len less than hardsect size\n");
++			return -EINVAL;
++		}
++
++		if ((unsigned long)buf & (bsize - 1)) {
++			printk("map: buffer not sector aligned\n");
++ return -EINVAL;
++ }
++
++		/* assume a contiguous, page-aligned low-mem buffer (no vmalloc) */
++ if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) {
++ printk("map: invalid buffer alignment!\n");
++ return -EINVAL;
++ }
++ /* finally we can go ahead and map it */
++		while (bio->bi_size < len)
++			if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0)
++				break;
++
++ bio->bi_end_io = dump_bio_end_io;
++ bio->bi_private = dev;
++ }
++
++ if (bio->bi_size != len) {
++ printk("map: bio size = %d not enough for len = %d!\n",
++ bio->bi_size, len);
++ return -E2BIG;
++ }
++ return 0;
++}
++
++static void
++dump_free_bio(struct bio *bio)
++{
++ if (bio)
++ kfree(bio->bi_io_vec);
++ kfree(bio);
++}
++
++/*
++ * Prepares the dump device so we can take a dump later.
++ * The caller is expected to have filled up the dev_id field in the
++ * block dump dev structure.
++ *
++ * At dump time when dump_block_write() is invoked it will be too
++ * late to recover, so as far as possible make sure obvious errors
++ * get caught right here and reported back to the caller.
++ */
++static int
++dump_block_open(struct dump_dev *dev, unsigned long arg)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++ struct block_device *bdev;
++ int retval = 0;
++ struct bio_vec *bvec;
++
++ /* make sure this is a valid block device */
++ if (!arg) {
++ retval = -EINVAL;
++ goto err;
++ }
++
++ /* Convert it to the new dev_t format */
++ arg = MKDEV((arg >> OLDMINORBITS), (arg & OLDMINORMASK));
++
++ /* get a corresponding block_dev struct for this */
++ bdev = bdget((dev_t)arg);
++ if (!bdev) {
++ retval = -ENODEV;
++ goto err;
++ }
++
++ /* get the block device opened */
++ if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0))) {
++ goto err1;
++ }
++
++ if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL))
++ == NULL) {
++ printk("Cannot allocate bio\n");
++ retval = -ENOMEM;
++ goto err2;
++ }
++
++ bio_init(dump_bdev->bio);
++
++ if ((bvec = kmalloc(sizeof(struct bio_vec) *
++ (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) {
++ retval = -ENOMEM;
++ goto err3;
++ }
++
++ /* assign the new dump dev structure */
++ dump_bdev->dev_id = (dev_t)arg;
++ dump_bdev->bdev = bdev;
++
++ /* make a note of the limit */
++ dump_bdev->limit = bdev->bd_inode->i_size;
++
++ /* now make sure we can map the dump buffer */
++ dump_bdev->bio->bi_io_vec = bvec;
++ dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT;
++
++ retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf,
++ DUMP_BUFFER_SIZE);
++
++ if (retval) {
++ printk("open: dump_block_map failed, ret %d\n", retval);
++ goto err3;
++ }
++
++ printk("Block device (%d,%d) successfully configured for dumping\n",
++ MAJOR(dump_bdev->dev_id),
++ MINOR(dump_bdev->dev_id));
++
++
++ /* after opening the block device, return */
++ return retval;
++
++err3: dump_free_bio(dump_bdev->bio);
++ dump_bdev->bio = NULL;
++err2: if (bdev) blkdev_put(bdev);
++ goto err;
++err1: if (bdev) bdput(bdev);
++ dump_bdev->bdev = NULL;
++err: return retval;
++}
++
++/*
++ * Close the dump device and release associated resources
++ * Invoked when unconfiguring the dump device.
++ */
++static int
++dump_block_release(struct dump_dev *dev)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++
++ /* release earlier bdev if present */
++ if (dump_bdev->bdev) {
++ blkdev_put(dump_bdev->bdev);
++ dump_bdev->bdev = NULL;
++ }
++
++ dump_free_bio(dump_bdev->bio);
++ dump_bdev->bio = NULL;
++
++ return 0;
++}
++
++
++/*
++ * Prepare the dump device for use (silence any ongoing activity
++ * and quiesce state) when the system crashes.
++ */
++static int
++dump_block_silence(struct dump_dev *dev)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++ struct request_queue *q = bdev_get_queue(dump_bdev->bdev);
++ int ret;
++
++ /* If we can't get request queue lock, refuse to take the dump */
++ if (!spin_trylock(q->queue_lock))
++ return -EBUSY;
++
++ ret = elv_queue_empty(q);
++ spin_unlock(q->queue_lock);
++
++ /* For now we assume we have the device to ourselves */
++ /* Just a quick sanity check */
++ if (!ret) {
++ /* Warn the user and move on */
++ printk(KERN_ALERT "Warning: Non-empty request queue\n");
++ printk(KERN_ALERT "I/O requests in flight at dump time\n");
++ }
++
++ /*
++ * Move to a softer level of silencing where no spin_lock_irqs
++ * are held on other cpus
++ */
++ dump_silence_level = DUMP_SOFT_SPIN_CPUS;
++
++ ret = __dump_irq_enable();
++ if (ret) {
++ return ret;
++ }
++
++ printk("Dumping to block device (%d,%d) on CPU %d ...\n",
++ MAJOR(dump_bdev->dev_id), MINOR(dump_bdev->dev_id),
++ smp_processor_id());
++
++ return 0;
++}
++
++/*
++ * Invoked when dumping is done. This is the time to put things back
++ * (i.e. undo the effects of dump_block_silence) so the device is
++ * available for normal use.
++ */
++static int
++dump_block_resume(struct dump_dev *dev)
++{
++ __dump_irq_restore();
++ return 0;
++}
++
++
++/*
++ * Seek to the specified offset in the dump device.
++ * Makes sure this is a valid offset, otherwise returns an error.
++ */
++static int
++dump_block_seek(struct dump_dev *dev, loff_t off)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++ loff_t offset = off + dump_bdev->start_offset;
++
++	if (offset & (PAGE_SIZE - 1)) {
++		printk("seek: non-page aligned\n");
++		return -EINVAL;
++	}
++
++	if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) {
++		printk("seek: not sector aligned\n");
++ return -EINVAL;
++ }
++
++ if (offset > dump_bdev->limit) {
++ printk("seek: not enough space left on device!\n");
++ return -ENOSPC;
++ }
++ dev->curr_offset = off;
++ return 0;
++}
++
++/*
++ * Write out a buffer after checking the device limitations,
++ * sector sizes, etc. Assumes the buffer is in directly mapped
++ * kernel address space (not vmalloc'ed).
++ *
++ * Returns: number of bytes written or -ERRNO.
++ */
++static int
++dump_block_write(struct dump_dev *dev, void *buf,
++ unsigned long len)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++ loff_t offset = dev->curr_offset + dump_bdev->start_offset;
++ int retval = -ENOSPC;
++
++ if (offset >= dump_bdev->limit) {
++ printk("write: not enough space left on device!\n");
++ goto out;
++ }
++
++ /* don't write more blocks than our max limit */
++ if (offset + len > dump_bdev->limit)
++ len = dump_bdev->limit - offset;
++
++
++ retval = dump_block_map(dump_bdev, buf, len);
++ if (retval){
++ printk("write: dump_block_map failed! err %d\n", retval);
++ goto out;
++ }
++
++ /*
++ * Write out the data to disk.
++	 * Assumes the entire buffer is mapped to a single bio, which we can
++	 * submit and wait for io completion. In the future, may consider
++	 * increasing the dump buffer size and submitting multiple bios
++ * for better throughput.
++ */
++ dump_bdev->err = -EAGAIN;
++ submit_bio(WRITE, dump_bdev->bio);
++
++ dump_bdev->ddev.curr_offset += len;
++ retval = len;
++ out:
++ return retval;
++}
++
++/*
++ * Name: dump_block_ready()
++ * Func: check if the last dump i/o is over and ready for next request
++ */
++static int
++dump_block_ready(struct dump_dev *dev, void *buf)
++{
++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev);
++ request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev);
++
++ /* check for io completion */
++ if (dump_bdev->err == -EAGAIN) {
++ q->unplug_fn(q);
++ return -EAGAIN;
++ }
++
++ if (dump_bdev->err) {
++ printk("dump i/o err\n");
++ return dump_bdev->err;
++ }
++
++ return 0;
++}
++
++
++struct dump_dev_ops dump_blockdev_ops = {
++ .open = dump_block_open,
++ .release = dump_block_release,
++ .silence = dump_block_silence,
++ .resume = dump_block_resume,
++ .seek = dump_block_seek,
++ .write = dump_block_write,
++ /* .read not implemented */
++ .ready = dump_block_ready
++};
++
++static struct dump_blockdev default_dump_blockdev = {
++ .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops,
++ .curr_offset = 0},
++ /*
++	 * leave enough room for the longest swap header possibly
++	 * written by mkswap (likely the largest page size supported
++	 * by the arch)
++ */
++ .start_offset = DUMP_HEADER_OFFSET,
++ .err = 0
++ /* assume the rest of the fields are zeroed by default */
++};
++
++struct dump_blockdev *dump_blockdev = &default_dump_blockdev;
++
++static int __init
++dump_blockdev_init(void)
++{
++ if (dump_register_device(&dump_blockdev->ddev) < 0) {
++ printk("block device driver registration failed\n");
++ return -1;
++ }
++
++ printk("block device driver for LKCD registered\n");
++ return 0;
++}
++
++static void __exit
++dump_blockdev_cleanup(void)
++{
++ dump_unregister_device(&dump_blockdev->ddev);
++ printk("block device driver for LKCD unregistered\n");
++}
++
++MODULE_AUTHOR("LKCD Development Team <lkcd-devel@lists.sourceforge.net>");
++MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)");
++MODULE_LICENSE("GPL");
++
++module_init(dump_blockdev_init);
++module_exit(dump_blockdev_cleanup);
+Index: linux-2.6.10/drivers/dump/dump_fmt.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_fmt.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_fmt.c 2005-04-05 16:47:53.941205408 +0800
+@@ -0,0 +1,407 @@
++/*
++ * Implements the routines which handle the format specific
++ * aspects of dump for the default dump format.
++ *
++ * Used in single stage dumping and stage 1 of soft-boot based dumping
++ * Saves data in LKCD (lcrash) format
++ *
++ * Previously a part of dump_base.c
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ * Split off and reshuffled LKCD dump format code around generic
++ * dump method interfaces.
++ *
++ * Derived from original code created by
++ * Matt Robinson <yakker@sourceforge.net>)
++ *
++ * Contributions from SGI, IBM, HP, MCL, and others.
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/time.h>
++#include <linux/sched.h>
++#include <linux/ptrace.h>
++#include <linux/utsname.h>
++#include <linux/dump.h>
++#include <asm/dump.h>
++#include "dump_methods.h"
++
++/*
++ * SYSTEM DUMP LAYOUT
++ *
++ * System dumps are currently the combination of a dump header and a set
++ * of data pages which contain the system memory. The layout of the dump
++ * (for full dumps) is as follows:
++ *
++ * +-----------------------------+
++ * | generic dump header |
++ * +-----------------------------+
++ * | architecture dump header |
++ * +-----------------------------+
++ * | page header |
++ * +-----------------------------+
++ * | page data |
++ * +-----------------------------+
++ * | page header |
++ * +-----------------------------+
++ * | page data |
++ * +-----------------------------+
++ * | | |
++ * | | |
++ * | | |
++ * | | |
++ * | V |
++ * +-----------------------------+
++ * | PAGE_END header |
++ * +-----------------------------+
++ *
++ * There are two dump headers, the first which is architecture
++ * independent, and the other which is architecture dependent. This
++ * allows different architectures to dump different data structures
++ * which are specific to their chipset, CPU, etc.
++ *
++ * After the dump headers come a succession of dump page headers along
++ * with dump pages. The page header contains information about the page
++ * size, any flags associated with the page (whether it's compressed or
++ * not), and the address of the page. After the page header is the page
++ * data, which is either compressed (or not). Each page of data is
++ * dumped in succession, until the final dump header (PAGE_END) is
++ * placed at the end of the dump, assuming the dump device isn't out
++ * of space.
++ *
++ * This mechanism allows for multiple compression types, different
++ * types of data structures, different page ordering, etc., etc., etc.
++ * It's a very straightforward mechanism for dumping system memory.
++ */
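++
++#if 0	/* Editor's sketch, not part of the original patch: how a reader
++	 * such as lcrash walks the layout drawn above. Simplified -- in the
++	 * real image the two headers are padded to a page boundary (see
++	 * dump_write_header below) and shifted by DUMP_HEADER_OFFSET. */
++static void walk_dump_image(char *p)
++{
++	struct __dump_page *dp;
++
++	p += sizeof(struct __dump_header);	/* generic dump header */
++	p += sizeof(struct __dump_header_asm);	/* architecture dump header */
++	for (;;) {
++		dp = (struct __dump_page *)p;
++		if (dp->dp_flags & DUMP_DH_END)
++			break;			/* PAGE_END: dump complete */
++		/* dp->dp_address: physical address of the data;
++		 * dp->dp_size: stored (possibly compressed) length */
++		p += sizeof(*dp) + dp->dp_size;
++	}
++}
++#endif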
++
++struct __dump_header dump_header; /* the primary dump header */
++struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */
++
++/* Replace a runtime sanity check on the DUMP_BUFFER_SIZE with a
++ * compile-time check. The compile_time_assertions routine will not
++ * compile if the assertion is false.
++ *
++ * If you fail this assert you are most likely on a large machine and
++ * should use a special 6.0.0 version of LKCD or a version > 7.0.0. See
++ * the LKCD website for more information.
++ */
++
++#define COMPILE_TIME_ASSERT(const_expr) \
++ switch(0){case 0: case (const_expr):;}
++
++static inline void compile_time_assertions(void)
++{
++ COMPILE_TIME_ASSERT((sizeof(struct __dump_header) +
++ sizeof(struct __dump_header_asm)) <= DUMP_BUFFER_SIZE);
++}
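++
++#if 0	/* Editor's note, not part of the original patch: how the macro
++	 * fails. A false assertion expands to a switch with two 'case 0:'
++	 * labels, which is a hard compile error; a true one is dead code
++	 * the compiler discards: */
++static inline void compile_time_assert_demo(void)
++{
++	COMPILE_TIME_ASSERT(1);		/* expands to case 0: case 1: -- ok */
++	/* COMPILE_TIME_ASSERT(0); */	/* error: duplicate case value 0 */
++}
++#endif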
++
++/*
++ * Set up common header fields (mainly the arch indep section)
++ * Per-cpu state is handled by lcrash_save_context
++ * Returns the size of the header in bytes.
++ */
++static int lcrash_init_dump_header(const char *panic_str)
++{
++ struct timeval dh_time;
++ u64 temp_memsz = dump_header.dh_memory_size;
++
++ /* initialize the dump headers to zero */
++	/* preserve dha_stack: it may contain pointers to saved stacks */
++ memset(&dump_header, 0, sizeof(dump_header));
++ memset(&dump_header_asm, 0,
++ offsetof(struct __dump_header_asm, dha_stack));
++ memset(&dump_header_asm.dha_stack+1, 0,
++ sizeof(dump_header_asm) -
++ offsetof(struct __dump_header_asm, dha_stack) -
++ sizeof(dump_header_asm.dha_stack));
++ dump_header.dh_memory_size = temp_memsz;
++
++ /* configure dump header values */
++ dump_header.dh_magic_number = DUMP_MAGIC_NUMBER;
++ dump_header.dh_version = DUMP_VERSION_NUMBER;
++ dump_header.dh_memory_start = PAGE_OFFSET;
++ dump_header.dh_memory_end = DUMP_MAGIC_NUMBER;
++ dump_header.dh_header_size = sizeof(struct __dump_header);
++ dump_header.dh_page_size = PAGE_SIZE;
++ dump_header.dh_dump_level = dump_config.level;
++ dump_header.dh_current_task = (unsigned long) current;
++ dump_header.dh_dump_compress = dump_config.dumper->compress->
++ compress_type;
++ dump_header.dh_dump_flags = dump_config.flags;
++ dump_header.dh_dump_device = dump_config.dumper->dev->device_id;
++
++#if DUMP_DEBUG >= 6
++ dump_header.dh_num_bytes = 0;
++#endif
++ dump_header.dh_num_dump_pages = 0;
++ do_gettimeofday(&dh_time);
++ dump_header.dh_time.tv_sec = dh_time.tv_sec;
++ dump_header.dh_time.tv_usec = dh_time.tv_usec;
++
++ memcpy((void *)&(dump_header.dh_utsname_sysname),
++ (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1);
++ memcpy((void *)&(dump_header.dh_utsname_nodename),
++ (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1);
++ memcpy((void *)&(dump_header.dh_utsname_release),
++ (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1);
++ memcpy((void *)&(dump_header.dh_utsname_version),
++ (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1);
++ memcpy((void *)&(dump_header.dh_utsname_machine),
++ (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1);
++ memcpy((void *)&(dump_header.dh_utsname_domainname),
++ (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1);
++
++ if (panic_str) {
++ memcpy((void *)&(dump_header.dh_panic_string),
++ (const void *)panic_str, DUMP_PANIC_LEN);
++ }
++
++ dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER;
++ dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER;
++ dump_header_asm.dha_header_size = sizeof(dump_header_asm);
++#ifdef CONFIG_ARM
++ dump_header_asm.dha_physaddr_start = PHYS_OFFSET;
++#endif
++
++ dump_header_asm.dha_smp_num_cpus = num_online_cpus();
++ pr_debug("smp_num_cpus in header %d\n",
++ dump_header_asm.dha_smp_num_cpus);
++
++ dump_header_asm.dha_dumping_cpu = smp_processor_id();
++
++ return sizeof(dump_header) + sizeof(dump_header_asm);
++}
++
++
++int dump_lcrash_configure_header(const char *panic_str,
++ const struct pt_regs *regs)
++{
++ int retval = 0;
++
++ dump_config.dumper->header_len = lcrash_init_dump_header(panic_str);
++
++ /* capture register states for all processors */
++ dump_save_this_cpu(regs);
++ __dump_save_other_cpus(); /* side effect:silence cpus */
++
++ /* configure architecture-specific dump header values */
++ if ((retval = __dump_configure_header(regs)))
++ return retval;
++
++ dump_config.dumper->header_dirty++;
++ return 0;
++}
++/* save register and task context */
++void dump_lcrash_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ /* This level of abstraction might be redundantly redundant */
++ __dump_save_context(cpu, regs, tsk);
++}
++
++/* write out the header */
++int dump_write_header(void)
++{
++ int retval = 0, size;
++ void *buf = dump_config.dumper->dump_buf;
++
++ /* accounts for DUMP_HEADER_OFFSET if applicable */
++ if ((retval = dump_dev_seek(0))) {
++ printk("Unable to seek to dump header offset: %d\n",
++ retval);
++ return retval;
++ }
++
++ memcpy(buf, (void *)&dump_header, sizeof(dump_header));
++ size = sizeof(dump_header);
++ memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
++ size += sizeof(dump_header_asm);
++ size = PAGE_ALIGN(size);
++	retval = dump_ll_write(buf, size);
++
++ if (retval < size)
++		return (retval >= 0) ? -ENOSPC : retval;
++ return 0;
++}
++
++int dump_generic_update_header(void)
++{
++ int err = 0;
++
++ if (dump_config.dumper->header_dirty) {
++ if ((err = dump_write_header())) {
++ printk("dump write header failed !err %d\n", err);
++ } else {
++ dump_config.dumper->header_dirty = 0;
++ }
++ }
++
++ return err;
++}
++
++static inline int is_curr_stack_page(struct page *page, unsigned long size)
++{
++ unsigned long thread_addr = (unsigned long)current_thread_info();
++ unsigned long addr = (unsigned long)page_address(page);
++
++ return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE)
++ && (addr + size > thread_addr);
++}
++
++static inline int is_dump_page(struct page *page, unsigned long size)
++{
++ unsigned long addr = (unsigned long)page_address(page);
++ unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf;
++
++ return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE)
++ && (addr + size > dump_buf);
++}
++
++int dump_allow_compress(struct page *page, unsigned long size)
++{
++ /*
++ * Don't compress the page if any part of it overlaps
++ * with the current stack or dump buffer (since the contents
++ * in these could be changing while compression is going on)
++ */
++ return !is_curr_stack_page(page, size) && !is_dump_page(page, size);
++}
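++
++#if 0	/* Editor's note, not part of the original patch: both helpers
++	 * above are the standard interval-overlap test, specialized to the
++	 * current stack and the dump buffer: */
++static inline int ranges_overlap(unsigned long a, unsigned long a_len,
++				 unsigned long b, unsigned long b_len)
++{
++	return (a < b + b_len) && (a + a_len > b);
++}
++#endif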
++
++void lcrash_init_pageheader(struct __dump_page *dp, struct page *page,
++ unsigned long sz)
++{
++	memset(dp, 0, sizeof(struct __dump_page));
++ dp->dp_flags = 0;
++ dp->dp_size = 0;
++ if (sz > 0)
++ dp->dp_address = (loff_t)page_to_pfn(page) << PAGE_SHIFT;
++
++#if DUMP_DEBUG > 6
++ dp->dp_page_index = dump_header.dh_num_dump_pages;
++ dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE
++ + DUMP_HEADER_OFFSET; /* ?? */
++#endif /* DUMP_DEBUG */
++}
++
++int dump_lcrash_add_data(unsigned long loc, unsigned long len)
++{
++ struct page *page = (struct page *)loc;
++ void *addr, *buf = dump_config.dumper->curr_buf;
++ struct __dump_page *dp = (struct __dump_page *)buf;
++ int bytes, size;
++
++ if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE)
++ return -ENOMEM;
++
++ lcrash_init_pageheader(dp, page, len);
++ buf += sizeof(struct __dump_page);
++
++ while (len) {
++ addr = kmap_atomic(page, KM_DUMP);
++ size = bytes = (len > PAGE_SIZE) ? PAGE_SIZE : len;
++ /* check for compression */
++ if (dump_allow_compress(page, bytes)) {
++ size = dump_compress_data((char *)addr, bytes,
++ (char *)buf, loc);
++ }
++ /* set the compressed flag if the page did compress */
++ if (size && (size < bytes)) {
++ dp->dp_flags |= DUMP_DH_COMPRESSED;
++ } else {
++ /* compression failed -- default to raw mode */
++ dp->dp_flags |= DUMP_DH_RAW;
++ memcpy(buf, addr, bytes);
++ size = bytes;
++ }
++ /* memset(buf, 'A', size); temporary: testing only !! */
++ kunmap_atomic(addr, KM_DUMP);
++ dp->dp_size += size;
++ buf += size;
++ len -= bytes;
++ page++;
++ }
++
++ /* now update the header */
++#if DUMP_DEBUG > 6
++ dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp);
++#endif
++ dump_header.dh_num_dump_pages++;
++ dump_config.dumper->header_dirty++;
++
++ dump_config.dumper->curr_buf = buf;
++
++ return len;
++}
++
++int dump_lcrash_update_end_marker(void)
++{
++ struct __dump_page *dp =
++ (struct __dump_page *)dump_config.dumper->curr_buf;
++ unsigned long left;
++ int ret = 0;
++
++ lcrash_init_pageheader(dp, NULL, 0);
++ dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */
++
++ /* now update the header */
++#if DUMP_DEBUG > 6
++ dump_header.dh_num_bytes += sizeof(*dp);
++#endif
++ dump_config.dumper->curr_buf += sizeof(*dp);
++ left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf;
++
++ printk("\n");
++
++ while (left) {
++ if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) {
++ printk("Seek failed at offset 0x%llx\n",
++ dump_config.dumper->curr_offset);
++ return ret;
++ }
++
++ if (DUMP_BUFFER_SIZE > left)
++ memset(dump_config.dumper->curr_buf, 'm',
++ DUMP_BUFFER_SIZE - left);
++
++ if ((ret = dump_ll_write(dump_config.dumper->dump_buf,
++ DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) {
++ return (ret < 0) ? ret : -ENOSPC;
++ }
++
++ dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE;
++
++ if (left > DUMP_BUFFER_SIZE) {
++ left -= DUMP_BUFFER_SIZE;
++ memcpy(dump_config.dumper->dump_buf,
++ dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left);
++ dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE;
++ } else {
++ left = 0;
++ }
++ }
++ return 0;
++}
++
++
++/* Default Formatter (lcrash) */
++struct dump_fmt_ops dump_fmt_lcrash_ops = {
++ .configure_header = dump_lcrash_configure_header,
++ .update_header = dump_generic_update_header,
++ .save_context = dump_lcrash_save_context,
++ .add_data = dump_lcrash_add_data,
++ .update_end_marker = dump_lcrash_update_end_marker
++};
++
++struct dump_fmt dump_fmt_lcrash = {
++ .name = "lcrash",
++ .ops = &dump_fmt_lcrash_ops
++};
++
+Index: linux-2.6.10/drivers/dump/dump_setup.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_setup.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_setup.c 2005-04-05 16:47:53.939205712 +0800
+@@ -0,0 +1,923 @@
++/*
++ * Standard kernel function entry points for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sourceforge.net)
++ * Contributions from SGI, IBM, HP, MCL, and others.
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * -----------------------------------------------------------------------
++ *
++ * DUMP HISTORY
++ *
++ * This dump code goes back to SGI's first attempts at dumping system
++ * memory on SGI systems running IRIX. A few developers at SGI needed
++ * a way to take this system dump and analyze it, and created 'icrash',
++ * or IRIX Crash. The mechanism (the dumps and 'icrash') were used
++ * by support people to generate crash reports when a system failure
++ * occurred. This was vital for large system configurations that
++ * couldn't apply patch after patch after fix just to hope that the
++ * problems would go away. So the system memory, along with the crash
++ * dump analyzer, allowed support people to quickly figure out what the
++ * problem was on the system with the crash dump.
++ *
++ * In comes Linux. SGI started moving towards the open source community,
++ * and upon doing so, SGI wanted to take its support utilities into Linux
++ * with the hopes that they would end up the in kernel and user space to
++ * be used by SGI's customers buying SGI Linux systems. One of the first
++ * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash
++ * Dumps. LKCD comprises of a patch to the kernel to enable system
++ * dumping, along with 'lcrash', or Linux Crash, to analyze the system
++ * memory dump. A few additional system scripts and kernel modifications
++ * are also included to make the dump mechanism and dump data easier to
++ * process and use.
++ *
++ * As soon as LKCD was released into the open source community, a number
++ * of larger companies started to take advantage of it. Today, there are
++ * many community members that contribute to LKCD, and it continues to
++ * flourish and grow as an open source project.
++ */
++
++/*
++ * DUMP TUNABLES (read/write with ioctl, readonly with /proc)
++ *
++ * This is the list of system tunables (via /proc) that are available
++ * for Linux systems. All the read, write, etc., functions are listed
++ * here. Currently, there are a few different tunables for dumps:
++ *
++ * dump_device (used to be dumpdev):
++ * The device for dumping the memory pages out to. This
++ * may be set to the primary swap partition for disruptive dumps,
++ * and must be an unused partition for non-disruptive dumps.
++ * Todo: In the case of network dumps, this may be interpreted
++ * as the IP address of the netdump server to connect to.
++ *
++ * dump_compress (used to be dump_compress_pages):
++ * This is the flag which indicates which compression mechanism
++ * to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.).
++ * This is the current set of values:
++ *
++ * 0: DUMP_COMPRESS_NONE -- Don't compress any pages.
++ * 1: DUMP_COMPRESS_RLE -- This uses RLE compression.
++ * 2: DUMP_COMPRESS_GZIP -- This uses GZIP compression.
++ *
++ * dump_level:
++ * The amount of effort the dump module should make to save
++ * information for post crash analysis. This value is now
++ * a BITMASK value, not an index:
++ *
++ * 0: Do nothing, no dumping. (DUMP_LEVEL_NONE)
++ *
++ * 1: Print out the dump information to the dump header, and
++ * write it out to the dump_device. (DUMP_LEVEL_HEADER)
++ *
++ * 2: Write out the dump header and all kernel memory pages.
++ * (DUMP_LEVEL_KERN)
++ *
++ * 4: Write out the dump header and all kernel and user
++ * memory pages. (DUMP_LEVEL_USED)
++ *
++ * 8: Write out the dump header and all conventional/cached
++ * memory (RAM) pages in the system (kernel, user, free).
++ * (DUMP_LEVEL_ALL_RAM)
++ *
++ * 16: Write out everything, including non-conventional memory
++ * like firmware, proms, I/O registers, uncached memory.
++ * (DUMP_LEVEL_ALL)
++ *
++ * The dump_level will default to 1.
++ *
++ * dump_flags:
++ * These are the flags to use when talking about dumps. There
++ * are lots of possibilities. This is a BITMASK value, not an index.
++ *
++ * -----------------------------------------------------------------------
++ */
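++
++#if 0	/* Editor's sketch, not part of the original patch: the dump_level
++	 * values listed above are cumulative. For the defined DUMP_LEVEL_*
++	 * constants this helper computes the same mask that the
++	 * DIOSDUMPLEVEL ioctl below builds via case fall-through; the
++	 * helper name is hypothetical. */
++static unsigned long dump_level_to_mask(int level)
++{
++	unsigned long mask = 0;
++
++	if (level >= DUMP_LEVEL_ALL_RAM)	/* 8 and 16 */
++		mask |= DUMP_MASK_UNUSED;
++	if (level >= DUMP_LEVEL_USED)		/* 4 and up */
++		mask |= DUMP_MASK_USED;
++	if (level >= DUMP_LEVEL_KERN)		/* 2 and up */
++		mask |= DUMP_MASK_KERN;
++	if (level >= DUMP_LEVEL_HEADER)		/* 1 and up */
++		mask |= DUMP_MASK_HEADER;
++	return mask;
++}
++#endif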
++
++#include <linux/kernel.h>
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include <linux/fs.h>
++#include <linux/dump.h>
++#include <linux/ioctl32.h>
++#include <linux/syscalls.h>
++#include "dump_methods.h"
++#include <linux/proc_fs.h>
++#include <linux/module.h>
++#include <linux/utsname.h>
++#include <linux/highmem.h>
++#include <linux/miscdevice.h>
++#include <linux/sysrq.h>
++#include <linux/sysctl.h>
++#include <linux/nmi.h>
++#include <linux/init.h>
++#include <asm/hardirq.h>
++#include <asm/uaccess.h>
++
++
++/*
++ * -----------------------------------------------------------------------
++ * V A R I A B L E S
++ * -----------------------------------------------------------------------
++ */
++
++/* Dump tunables */
++struct dump_config dump_config = {
++ .level = 0,
++ .flags = 0,
++ .dump_device = 0,
++ .dump_addr = 0,
++ .dumper = NULL
++};
++#ifdef CONFIG_ARM
++static _dump_regs_t all_regs;
++#endif
++
++/* Global variables used in dump.h */
++/* degree of system freeze when dumping */
++enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS;
++
++/* Other global fields */
++extern struct __dump_header dump_header;
++struct dump_dev *dump_dev = NULL; /* Active dump device */
++static int dump_compress = 0;
++
++static u32 dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize,
++ unsigned long loc);
++struct __dump_compress dump_none_compression = {
++ .compress_type = DUMP_COMPRESS_NONE,
++ .compress_func = dump_compress_none,
++ .compress_name = "none",
++};
++
++/* our device operations and functions */
++static int dump_ioctl(struct inode *i, struct file *f,
++ unsigned int cmd, unsigned long arg);
++
++#ifdef CONFIG_COMPAT
++static int dw_long(unsigned int, unsigned int, unsigned long, struct file*);
++#endif
++
++static struct file_operations dump_fops = {
++ .owner = THIS_MODULE,
++ .ioctl = dump_ioctl,
++};
++
++static struct miscdevice dump_miscdev = {
++ .minor = CRASH_DUMP_MINOR,
++ .name = "dump",
++ .fops = &dump_fops,
++};
++MODULE_ALIAS_MISCDEV(CRASH_DUMP_MINOR);
++
++/* static variables */
++static int dump_okay = 0; /* can we dump out to disk? */
++static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED;
++
++/* used for dump compressors */
++static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list);
++
++/* list of registered dump targets */
++static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list);
++
++/* lkcd info structure -- this is used by lcrash for basic system data */
++struct __lkcdinfo lkcdinfo = {
++ .ptrsz = (sizeof(void *) * 8),
++#if defined(__LITTLE_ENDIAN)
++ .byte_order = __LITTLE_ENDIAN,
++#else
++ .byte_order = __BIG_ENDIAN,
++#endif
++ .page_shift = PAGE_SHIFT,
++ .page_size = PAGE_SIZE,
++ .page_mask = PAGE_MASK,
++ .page_offset = PAGE_OFFSET,
++};
++
++/*
++ * -----------------------------------------------------------------------
++ * / P R O C T U N A B L E F U N C T I O N S
++ * -----------------------------------------------------------------------
++ */
++
++static int proc_dump_device(ctl_table *ctl, int write, struct file *f,
++ void __user *buffer, size_t *lenp, loff_t *ppos);
++
++static int proc_doulonghex(ctl_table *ctl, int write, struct file *f,
++ void __user *buffer, size_t *lenp, loff_t *ppos);
++/*
++ * sysctl-tuning infrastructure.
++ */
++static ctl_table dump_table[] = {
++ { .ctl_name = CTL_DUMP_LEVEL,
++ .procname = DUMP_LEVEL_NAME,
++ .data = &dump_config.level,
++ .maxlen = sizeof(int),
++ .mode = 0444,
++ .proc_handler = proc_doulonghex, },
++
++ { .ctl_name = CTL_DUMP_FLAGS,
++ .procname = DUMP_FLAGS_NAME,
++ .data = &dump_config.flags,
++ .maxlen = sizeof(int),
++ .mode = 0444,
++ .proc_handler = proc_doulonghex, },
++
++ { .ctl_name = CTL_DUMP_COMPRESS,
++ .procname = DUMP_COMPRESS_NAME,
++ .data = &dump_compress, /* FIXME */
++ .maxlen = sizeof(int),
++ .mode = 0444,
++ .proc_handler = proc_dointvec, },
++
++ { .ctl_name = CTL_DUMP_DEVICE,
++ .procname = DUMP_DEVICE_NAME,
++ .mode = 0444,
++ .data = &dump_config.dump_device, /* FIXME */
++ .maxlen = sizeof(int),
++ .proc_handler = proc_dump_device },
++
++#ifdef CONFIG_CRASH_DUMP_MEMDEV
++ { .ctl_name = CTL_DUMP_ADDR,
++ .procname = DUMP_ADDR_NAME,
++ .mode = 0444,
++ .data = &dump_config.dump_addr,
++ .maxlen = sizeof(unsigned long),
++ .proc_handler = proc_doulonghex },
++#endif
++
++ { 0, }
++};
++
++static ctl_table dump_root[] = {
++ { .ctl_name = KERN_DUMP,
++ .procname = "dump",
++ .mode = 0555,
++ .child = dump_table },
++ { 0, }
++};
++
++static ctl_table kernel_root[] = {
++ { .ctl_name = CTL_KERN,
++ .procname = "kernel",
++ .mode = 0555,
++ .child = dump_root, },
++ { 0, }
++};
++
++static struct ctl_table_header *sysctl_header;
++
++/*
++ * -----------------------------------------------------------------------
++ * C O M P R E S S I O N F U N C T I O N S
++ * -----------------------------------------------------------------------
++ */
++
++/*
++ * Name: dump_compress_none()
++ * Func: Don't do any compression, period.
++ */
++static u32
++dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize,
++ unsigned long loc)
++{
++ /* just return the old size */
++ return oldsize;
++}
++
++
++/*
++ * Name: dump_execute()
++ * Func: Execute the dumping process. This makes sure all the appropriate
++ * fields are updated correctly, and calls dump_execute_memdump(),
++ * which does the real work.
++ */
++void
++dump_execute(const char *panic_str, const struct pt_regs *regs)
++{
++ int state = -1;
++ unsigned long flags;
++
++ /* make sure we can dump */
++ if (!dump_okay) {
++ pr_info("LKCD not yet configured, can't take dump now\n");
++ return;
++ }
++
++ /* Exclude multiple dumps at the same time,
++	 * and disable interrupts; some drivers may re-enable
++	 * interrupts in silence().
++ *
++ * Try and acquire spin lock. If successful, leave preempt
++ * and interrupts disabled. See spin_lock_irqsave in spinlock.h
++ */
++ local_irq_save(flags);
++ if (!spin_trylock(&dump_lock)) {
++ local_irq_restore(flags);
++ pr_info("LKCD dump already in progress\n");
++ return;
++ }
++
++ /* What state are interrupts really in? */
++ if (in_interrupt()){
++ if(in_irq())
++ printk(KERN_ALERT "Dumping from interrupt handler!\n");
++ else
++ printk(KERN_ALERT "Dumping from bottom half!\n");
++
++ __dump_clean_irq_state();
++ }
++
++
++ /* Bring system into the strictest level of quiescing for min drift
++ * dump drivers can soften this as required in dev->ops->silence()
++ */
++ dump_oncpu = smp_processor_id() + 1;
++ dump_silence_level = DUMP_HARD_SPIN_CPUS;
++
++ state = dump_generic_execute(panic_str, regs);
++
++ dump_oncpu = 0;
++ spin_unlock_irqrestore(&dump_lock, flags);
++
++ if (state < 0) {
++ printk("Dump Incomplete or failed!\n");
++ } else {
++ printk("Dump Complete; %d dump pages saved.\n",
++ dump_header.dh_num_dump_pages);
++ }
++}
++
++/*
++ * Name: dump_register_compression()
++ * Func: Register a dump compression mechanism.
++ */
++void
++dump_register_compression(struct __dump_compress *item)
++{
++ if (item)
++ list_add(&(item->list), &dump_compress_list);
++}
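++
++#if 0	/* Editor's sketch, not part of the original patch: the shape of a
++	 * compressor a module would register here. Returning 0, or a size
++	 * that is not smaller than oldsize, makes the lcrash format code
++	 * fall back to storing the page raw (see dump_lcrash_add_data in
++	 * dump_fmt.c). Names other than the struct fields are made up. */
++static u32 example_compress(const u8 *old, u32 oldsize, u8 *new, u32 newsize,
++			    unsigned long loc)
++{
++	return oldsize;		/* "could not shrink it": caller stores raw */
++}
++
++static struct __dump_compress example_compression = {
++	.compress_type	= DUMP_COMPRESS_RLE,	/* type being provided */
++	.compress_func	= example_compress,
++	.compress_name	= "example",
++};
++
++/* a module init routine would then call:
++ *	dump_register_compression(&example_compression);
++ */
++#endif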
++
++/*
++ * Name: dump_unregister_compression()
++ * Func: Remove a dump compression mechanism, and re-assign the dump
++ * compression pointer if necessary.
++ */
++void
++dump_unregister_compression(int compression_type)
++{
++ struct list_head *tmp;
++ struct __dump_compress *dc;
++
++ /* let's make sure our list is valid */
++ if (compression_type != DUMP_COMPRESS_NONE) {
++ list_for_each(tmp, &dump_compress_list) {
++ dc = list_entry(tmp, struct __dump_compress, list);
++ if (dc->compress_type == compression_type) {
++ list_del(&(dc->list));
++ break;
++ }
++ }
++ }
++}
++
++/*
++ * Name: dump_compress_init()
++ * Func: Initialize (or re-initialize) compression scheme.
++ */
++static int
++dump_compress_init(int compression_type)
++{
++ struct list_head *tmp;
++ struct __dump_compress *dc;
++
++ /* try to remove the compression item */
++ list_for_each(tmp, &dump_compress_list) {
++ dc = list_entry(tmp, struct __dump_compress, list);
++ if (dc->compress_type == compression_type) {
++ dump_config.dumper->compress = dc;
++ dump_compress = compression_type;
++ pr_debug("Dump Compress %s\n", dc->compress_name);
++ return 0;
++ }
++ }
++
++ /*
++ * nothing on the list -- return ENODATA to indicate an error
++ *
++ * NB:
++ * EAGAIN: reports "Resource temporarily unavailable" which
++ * isn't very enlightening.
++ */
++ printk("compression_type:%d not found\n", compression_type);
++
++ return -ENODATA;
++}
++
++static int
++dumper_setup(unsigned long flags, unsigned long devid)
++{
++ int ret = 0;
++
++ /* unconfigure old dumper if it exists */
++ dump_okay = 0;
++ if (dump_config.dumper) {
++ pr_debug("Unconfiguring current dumper\n");
++ dump_unconfigure();
++ }
++ /* set up new dumper */
++ if (dump_config.flags & DUMP_FLAGS_SOFTBOOT) {
++		printk("Configuring softboot based dump\n");
++#ifdef CONFIG_CRASH_DUMP_MEMDEV
++ dump_config.dumper = &dumper_stage1;
++#else
++		printk("Requires CONFIG_CRASH_DUMP_MEMDEV. Can't proceed.\n");
++ return -1;
++#endif
++ } else {
++ dump_config.dumper = &dumper_singlestage;
++ }
++ dump_config.dumper->dev = dump_dev;
++
++ ret = dump_configure(devid);
++ if (!ret) {
++ dump_okay = 1;
++ pr_debug("%s dumper set up for dev 0x%lx\n",
++ dump_config.dumper->name, devid);
++ dump_config.dump_device = devid;
++ } else {
++ printk("%s dumper set up failed for dev 0x%lx\n",
++ dump_config.dumper->name, devid);
++ dump_config.dumper = NULL;
++ }
++ return ret;
++}
++
++static int
++dump_target_init(int target)
++{
++ char type[20];
++ struct list_head *tmp;
++ struct dump_dev *dev;
++
++ switch (target) {
++ case DUMP_FLAGS_DISKDUMP:
++ strcpy(type, "blockdev"); break;
++ case DUMP_FLAGS_NETDUMP:
++ strcpy(type, "networkdev"); break;
++ default:
++ return -1;
++ }
++
++ /*
++	 * This is a bit stupid, generating strings from the flag
++	 * and doing strcmp. This is done because 'struct dump_dev'
++	 * has a string 'type_name' and not an integer 'type'.
++ */
++ list_for_each(tmp, &dump_target_list) {
++ dev = list_entry(tmp, struct dump_dev, list);
++ if (strcmp(type, dev->type_name) == 0) {
++ dump_dev = dev;
++ return 0;
++ }
++ }
++ return -1;
++}
++
++/*
++ * Name: dump_ioctl()
++ * Func: Allow all dump tunables through a standard ioctl() mechanism.
++ * This is far better than before, where we'd go through /proc,
++ * because now this will work for multiple OS and architectures.
++ */
++static int
++dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
++{
++ /* check capabilities */
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++
++ if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS)
++ /* dump device must be configured first */
++ return -ENODEV;
++
++ /*
++ * This is the main mechanism for controlling get/set data
++ * for various dump device parameters. The real trick here
++ * is setting the dump device (DIOSDUMPDEV). That's what
++ * triggers everything else.
++ */
++ switch (cmd) {
++ case DIOSDUMPDEV: /* set dump_device */
++ pr_debug("Configuring dump device\n");
++ if (!(f->f_flags & O_RDWR))
++ return -EPERM;
++
++ __dump_open();
++ return dumper_setup(dump_config.flags, arg);
++
++
++ case DIOGDUMPDEV: /* get dump_device */
++ return put_user((long)dump_config.dump_device, (long *)arg);
++
++ case DIOSDUMPLEVEL: /* set dump_level */
++ if (!(f->f_flags & O_RDWR))
++ return -EPERM;
++
++		/* reject negative values (arg arrives as unsigned long) */
++		if ((long)arg < 0)
++			return -EINVAL;
++
++ /* Fixme: clean this up */
++ dump_config.level = 0;
++ switch ((int)arg) {
++ case DUMP_LEVEL_ALL:
++ case DUMP_LEVEL_ALL_RAM:
++ dump_config.level |= DUMP_MASK_UNUSED;
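++			/* fall through: each level also sets the masks below */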
++ case DUMP_LEVEL_USED:
++ dump_config.level |= DUMP_MASK_USED;
++ case DUMP_LEVEL_KERN:
++ dump_config.level |= DUMP_MASK_KERN;
++ case DUMP_LEVEL_HEADER:
++ dump_config.level |= DUMP_MASK_HEADER;
++ case DUMP_LEVEL_NONE:
++ break;
++ default:
++ return (-EINVAL);
++ }
++ pr_debug("Dump Level 0x%lx\n", dump_config.level);
++ break;
++
++ case DIOGDUMPLEVEL: /* get dump_level */
++ /* fixme: handle conversion */
++ return put_user((long)dump_config.level, (long *)arg);
++
++
++ case DIOSDUMPFLAGS: /* set dump_flags */
++ /* check flags */
++ if (!(f->f_flags & O_RDWR))
++ return -EPERM;
++
++		/* reject negative values (arg arrives as unsigned long) */
++		if ((long)arg < 0)
++			return -EINVAL;
++
++ if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0)
++ return -EINVAL; /* return proper error */
++
++ dump_config.flags = arg;
++
++ pr_debug("Dump Flags 0x%lx\n", dump_config.flags);
++ break;
++
++ case DIOGDUMPFLAGS: /* get dump_flags */
++ return put_user((long)dump_config.flags, (long *)arg);
++
++ case DIOSDUMPCOMPRESS: /* set the dump_compress status */
++ if (!(f->f_flags & O_RDWR))
++ return -EPERM;
++
++ return dump_compress_init((int)arg);
++
++ case DIOGDUMPCOMPRESS: /* get the dump_compress status */
++ return put_user((long)(dump_config.dumper ?
++ dump_config.dumper->compress->compress_type : 0),
++ (long *)arg);
++ case DIOGDUMPOKAY: /* check if dump is configured */
++ return put_user((long)dump_okay, (long *)arg);
++
++ case DIOSDUMPTAKE: /* Trigger a manual dump */
++ /* Do not proceed if lkcd not yet configured */
++ if(!dump_okay) {
++ printk("LKCD not yet configured. Cannot take manual dump\n");
++ return -ENODEV;
++ }
++
++ /* Take the dump */
++ return manual_handle_crashdump();
++
++ default:
++ /*
++ * these are network dump specific ioctls, let the
++ * module handle them.
++ */
++ return dump_dev_ioctl(cmd, arg);
++ }
++ return 0;
++}
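++
++#if 0	/* Editor's sketch, not part of the original patch: roughly how a
++	 * userspace configurator (e.g. the lkcd utility) would drive these
++	 * ioctls. The device node path, the call order and 'olddev' are
++	 * assumptions, not taken from this patch. */
++static void lkcd_config_usage_sketch(unsigned long olddev)
++{
++	int fd = open("/dev/dump", O_RDWR);
++
++	ioctl(fd, DIOSDUMPFLAGS, DUMP_FLAGS_DISKDUMP);	/* pick target type */
++	ioctl(fd, DIOSDUMPDEV, olddev);	/* old-format dev_t of the target */
++	ioctl(fd, DIOSDUMPLEVEL, DUMP_LEVEL_USED);
++	ioctl(fd, DIOSDUMPCOMPRESS, DUMP_COMPRESS_GZIP);
++}
++#endif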
++
++/*
++ * Handle special cases for dump_device
++ * changing dump device requires doing an opening the device
++ */
++static int
++proc_dump_device(ctl_table *ctl, int write, struct file *f,
++ void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++ int *valp = ctl->data;
++ int oval = *valp;
++ int ret = -EPERM;
++
++ /* same permission checks as ioctl */
++ if (capable(CAP_SYS_ADMIN)) {
++ ret = proc_doulonghex(ctl, write, f, buffer, lenp, ppos);
++ if (ret == 0 && write && *valp != oval) {
++ /* need to restore old value to close properly */
++ dump_config.dump_device = (dev_t) oval;
++ __dump_open();
++ ret = dumper_setup(dump_config.flags, (dev_t) *valp);
++ }
++ }
++
++ return ret;
++}
++
++/* All for the want of a proc_do_xxx routine which prints values in hex */
++/* Write is not implemented correctly, so mode is set to 0444 above. */
++static int
++proc_doulonghex(ctl_table *ctl, int write, struct file *f,
++ void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++#define TMPBUFLEN 21
++ unsigned long *i;
++ size_t len, left;
++ char buf[TMPBUFLEN];
++
++ if (!ctl->data || !ctl->maxlen || !*lenp || (*ppos && !write)) {
++ *lenp = 0;
++ return 0;
++ }
++
++ i = (unsigned long *) ctl->data;
++ left = *lenp;
++
++ sprintf(buf, "0x%lx\n", (*i));
++ len = strlen(buf);
++ if (len > left)
++ len = left;
++ if(copy_to_user(buffer, buf, len))
++ return -EFAULT;
++
++ left -= len;
++ *lenp -= left;
++ *ppos += *lenp;
++ return 0;
++}
++
++/*
++ * -----------------------------------------------------------------------
++ * I N I T F U N C T I O N S
++ * -----------------------------------------------------------------------
++ */
++
++#ifdef CONFIG_COMPAT
++static int dw_long(unsigned int fd, unsigned int cmd, unsigned long arg,
++ struct file *f)
++{
++ mm_segment_t old_fs = get_fs();
++ int err;
++ unsigned long val;
++
++ set_fs (KERNEL_DS);
++ err = sys_ioctl(fd, cmd, (u64)&val);
++ set_fs (old_fs);
++ if (!err && put_user((unsigned int) val, (u32 *)arg))
++ return -EFAULT;
++ return err;
++}
++#endif
++
++/*
++ * These register and unregister routines are exported for modules
++ * to register their dump drivers (like block, net etc)
++ */
++int
++dump_register_device(struct dump_dev *ddev)
++{
++ struct list_head *tmp;
++ struct dump_dev *dev;
++
++ list_for_each(tmp, &dump_target_list) {
++ dev = list_entry(tmp, struct dump_dev, list);
++ if (strcmp(ddev->type_name, dev->type_name) == 0) {
++ printk("Target type %s already registered\n",
++ dev->type_name);
++ return -1; /* return proper error */
++ }
++ }
++ list_add(&(ddev->list), &dump_target_list);
++
++ return 0;
++}
++
++void
++dump_unregister_device(struct dump_dev *ddev)
++{
++ list_del(&(ddev->list));
++ if (ddev != dump_dev)
++ return;
++
++ dump_okay = 0;
++
++ if (dump_config.dumper)
++ dump_unconfigure();
++
++ dump_config.flags &= ~DUMP_FLAGS_TARGETMASK;
++ dump_okay = 0;
++ dump_dev = NULL;
++ dump_config.dumper = NULL;
++}
++
++static int panic_event(struct notifier_block *this, unsigned long event,
++ void *ptr)
++{
++#ifdef CONFIG_ARM
++ get_current_general_regs(&all_regs);
++ get_current_cp14_regs(&all_regs);
++ get_current_cp15_regs(&all_regs);
++ dump_execute((const char *)ptr, &all_regs);
++#else
++ struct pt_regs regs;
++
++	get_current_regs(&regs);
++	dump_execute((const char *)ptr, &regs);
++#endif
++ return 0;
++}
++
++extern struct notifier_block *panic_notifier_list;
++static struct notifier_block panic_block = {
++ .notifier_call = panic_event,
++};
++
++#ifdef CONFIG_MAGIC_SYSRQ
++/* Sysrq handler */
++static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs,
++ struct tty_struct *tty) {
++ if(!pt_regs) {
++ struct pt_regs regs;
++		get_current_regs(&regs);
++		dump_execute("sysrq", &regs);
++
++ } else {
++ dump_execute("sysrq", pt_regs);
++ }
++}
++
++static struct sysrq_key_op sysrq_crashdump_op = {
++ .handler = sysrq_handle_crashdump,
++ .help_msg = "Dump",
++ .action_msg = "Starting crash dump",
++};
++#endif
++
++static inline void
++dump_sysrq_register(void)
++{
++#ifdef CONFIG_MAGIC_SYSRQ
++ register_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
++#endif
++}
++
++static inline void
++dump_sysrq_unregister(void)
++{
++#ifdef CONFIG_MAGIC_SYSRQ
++ unregister_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
++#endif
++}
++
++/*
++ * Name: dump_init()
++ * Func: Initialize the dump process. This will set up any architecture
++ * dependent code. The big key is we need the memory offsets before
++ * the page table is initialized, because the base memory offset
++ * is changed after paging_init() is called.
++ */
++static int __init
++dump_init(void)
++{
++ struct sysinfo info;
++ int err;
++
++ /* try to create our dump device */
++ err = misc_register(&dump_miscdev);
++ if (err) {
++ printk("cannot register dump character device!\n");
++ return err;
++ }
++
++ __dump_init((u64)PAGE_OFFSET);
++
++#ifdef CONFIG_COMPAT
++ err = register_ioctl32_conversion(DIOSDUMPDEV, NULL);
++ err |= register_ioctl32_conversion(DIOGDUMPDEV, NULL);
++ err |= register_ioctl32_conversion(DIOSDUMPLEVEL, NULL);
++ err |= register_ioctl32_conversion(DIOGDUMPLEVEL, dw_long);
++ err |= register_ioctl32_conversion(DIOSDUMPFLAGS, NULL);
++ err |= register_ioctl32_conversion(DIOGDUMPFLAGS, dw_long);
++ err |= register_ioctl32_conversion(DIOSDUMPCOMPRESS, NULL);
++ err |= register_ioctl32_conversion(DIOGDUMPCOMPRESS, dw_long);
++ err |= register_ioctl32_conversion(DIOSTARGETIP, NULL);
++ err |= register_ioctl32_conversion(DIOGTARGETIP, NULL);
++ err |= register_ioctl32_conversion(DIOSTARGETPORT, NULL);
++ err |= register_ioctl32_conversion(DIOGTARGETPORT, NULL);
++ err |= register_ioctl32_conversion(DIOSSOURCEPORT, NULL);
++ err |= register_ioctl32_conversion(DIOGSOURCEPORT, NULL);
++ err |= register_ioctl32_conversion(DIOSETHADDR, NULL);
++ err |= register_ioctl32_conversion(DIOGETHADDR, NULL);
++ err |= register_ioctl32_conversion(DIOGDUMPOKAY, dw_long);
++ err |= register_ioctl32_conversion(DIOSDUMPTAKE, NULL);
++ if (err) {
++		printk(KERN_ERR "LKCD: registering ioctl32 translations failed\n");
++ }
++#endif
++ /* set the dump_compression_list structure up */
++ dump_register_compression(&dump_none_compression);
++
++ /* grab the total memory size now (not if/when we crash) */
++ si_meminfo(&info);
++
++ /* set the memory size */
++ dump_header.dh_memory_size = (u64)info.totalram;
++
++ sysctl_header = register_sysctl_table(kernel_root, 0);
++ dump_sysrq_register();
++
++ notifier_chain_register(&panic_notifier_list, &panic_block);
++ dump_function_ptr = dump_execute;
++
++ pr_info("Crash dump driver initialized.\n");
++ return 0;
++}
++
++static void __exit
++dump_cleanup(void)
++{
++ int err;
++ dump_okay = 0;
++
++ if (dump_config.dumper)
++ dump_unconfigure();
++
++ /* arch-specific cleanup routine */
++ __dump_cleanup();
++
++#ifdef CONFIG_COMPAT
++ err = unregister_ioctl32_conversion(DIOSDUMPDEV);
++ err |= unregister_ioctl32_conversion(DIOGDUMPDEV);
++ err |= unregister_ioctl32_conversion(DIOSDUMPLEVEL);
++ err |= unregister_ioctl32_conversion(DIOGDUMPLEVEL);
++ err |= unregister_ioctl32_conversion(DIOSDUMPFLAGS);
++ err |= unregister_ioctl32_conversion(DIOGDUMPFLAGS);
++ err |= unregister_ioctl32_conversion(DIOSDUMPCOMPRESS);
++ err |= unregister_ioctl32_conversion(DIOGDUMPCOMPRESS);
++ err |= unregister_ioctl32_conversion(DIOSTARGETIP);
++ err |= unregister_ioctl32_conversion(DIOGTARGETIP);
++ err |= unregister_ioctl32_conversion(DIOSTARGETPORT);
++ err |= unregister_ioctl32_conversion(DIOGTARGETPORT);
++ err |= unregister_ioctl32_conversion(DIOSSOURCEPORT);
++ err |= unregister_ioctl32_conversion(DIOGSOURCEPORT);
++ err |= unregister_ioctl32_conversion(DIOSETHADDR);
++ err |= unregister_ioctl32_conversion(DIOGETHADDR);
++ err |= unregister_ioctl32_conversion(DIOGDUMPOKAY);
++ err |= unregister_ioctl32_conversion(DIOSDUMPTAKE);
++ if (err) {
++ printk(KERN_ERR "LKCD: Unregistering ioctl32 translations failed\n");
++ }
++#endif
++
++ /* ignore errors while unregistering -- since can't do anything */
++ unregister_sysctl_table(sysctl_header);
++ misc_deregister(&dump_miscdev);
++ dump_sysrq_unregister();
++ notifier_chain_unregister(&panic_notifier_list, &panic_block);
++ dump_function_ptr = NULL;
++}
++
++EXPORT_SYMBOL(dump_register_compression);
++EXPORT_SYMBOL(dump_unregister_compression);
++EXPORT_SYMBOL(dump_register_device);
++EXPORT_SYMBOL(dump_unregister_device);
++EXPORT_SYMBOL(dump_config);
++EXPORT_SYMBOL(dump_silence_level);
++
++EXPORT_SYMBOL(__dump_irq_enable);
++EXPORT_SYMBOL(__dump_irq_restore);
++
++MODULE_AUTHOR("Matt D. Robinson <yakker@sourceforge.net>");
++MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver");
++MODULE_LICENSE("GPL");
++
++module_init(dump_init);
++module_exit(dump_cleanup);
+Index: linux-2.6.10/drivers/dump/dump_scheme.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_scheme.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_scheme.c 2005-04-05 16:47:53.944204952 +0800
+@@ -0,0 +1,430 @@
++/*
++ * Default single stage dump scheme methods
++ *
++ * Previously a part of dump_base.c
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ * Split and rewrote LKCD dump scheme to generic dump method
++ * interfaces
++ * Derived from original code created by
++ *	Matt Robinson <yakker@sourceforge.net>
++ *
++ * Contributions from SGI, IBM, HP, MCL, and others.
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * Implements the default dump scheme, i.e. single-stage gathering and
++ * saving of dump data directly to the target device, which operates in
++ * a push mode, where the dumping system decides what data it saves
++ * taking into account pre-specified dump config options.
++ *
++ * Aside: The 2-stage dump scheme, where there is a soft-reset between
++ * the gathering and saving phases, also reuses some of these
++ * default routines (see dump_overlay.c)
++ */
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include <linux/nmi.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++
++extern int panic_timeout; /* time before reboot */
++
++extern void dump_speedo(int);
++
++/* Default sequencer used during single stage dumping */
++/* Also invoked during stage 2 of soft-boot based dumping */
++int dump_generic_sequencer(void)
++{
++ struct dump_data_filter *filter = dump_config.dumper->filter;
++ int pass = 0, err = 0, save = 0;
++ int (*action)(unsigned long, unsigned long);
++
++ /*
++ * We want to save the more critical data areas first in
++ * case we run out of space, encounter i/o failures, or get
++	 * interrupted otherwise and have to give up midway.
++	 * So, run through the passes in increasing order.
++ */
++ for (;filter->selector; filter++, pass++)
++ {
++ /* Assumes passes are exclusive (even across dumpers) */
++ /* Requires care when coding the selection functions */
++ if ((save = filter->level_mask & dump_config.level))
++ action = dump_save_data;
++ else
++ action = dump_skip_data;
++
++ if ((err = dump_iterator(pass, action, filter)) < 0)
++ break;
++
++ printk("\n %d dump pages %s of %d each in pass %d\n",
++ err, save ? "saved" : "skipped", (int)DUMP_PAGE_SIZE, pass);
++
++ }
++
++ return (err < 0) ? err : 0;
++}
++
++static inline struct page *dump_get_page(loff_t loc)
++{
++
++ unsigned long page_index = loc >> PAGE_SHIFT;
++
++ /* todo: complete this to account for ia64/discontig mem */
++ /* todo: and to check for validity, ram page, no i/o mem etc */
++ /* need to use pfn/physaddr equiv of kern_addr_valid */
++
++ /* Important:
++	 * On ARM/XScale systems, physical addresses start at PHYS_OFFSET,
++	 * which may well be non-zero; on Intel's PXA250, for example,
++	 * PHYS_OFFSET = 0xa0000000, and page indices accordingly start at
++	 * PHYS_PFN_OFFSET. Since dump_generic_configure() initializes
++	 * filter->start to 0, adjust the index here by adding
++	 * PHYS_PFN_OFFSET.
++ */
++#ifdef CONFIG_ARM
++ page_index += PHYS_PFN_OFFSET;
++#endif
++ if (__dump_page_valid(page_index))
++ return pfn_to_page(page_index);
++ else
++ return NULL;
++
++}
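++/*
++ * Worked example for the ARM adjustment above (numbers from the PXA250
++ * case cited in the comment, used here purely as illustration): with
++ * PHYS_OFFSET = 0xa0000000 and 4K pages, PHYS_PFN_OFFSET is 0xa0000,
++ * so a filter location loc = 0 becomes page_index = 0xa0000, i.e. the
++ * first real RAM page instead of a nonexistent pfn 0.
++ */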
++
++/* Default iterator: for singlestage and stage 1 of soft-boot dumping */
++/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */
++int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long),
++ struct dump_data_filter *filter)
++{
++ /* Todo : fix unit, type */
++ loff_t loc, start, end;
++ int i, count = 0, err = 0;
++ struct page *page;
++
++ /* Todo: Add membanks code */
++ /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */
++
++ for (i = 0; i < filter->num_mbanks; i++) {
++ start = filter->start[i];
++ end = filter->end[i];
++ for (loc = start; loc < end; loc += DUMP_PAGE_SIZE) {
++ dump_config.dumper->curr_loc = loc;
++ page = dump_get_page(loc);
++ if (page && filter->selector(pass,
++ (unsigned long) page, DUMP_PAGE_SIZE)) {
++ if ((err = action((unsigned long)page,
++ DUMP_PAGE_SIZE))) {
++ printk("dump_page_iterator: err %d for "
++ "loc 0x%llx, in pass %d\n",
++ err, loc, pass);
++ return err ? err : count;
++ } else
++ count++;
++ }
++ }
++ }
++
++ return err ? err : count;
++}
++
++/*
++ * Base function that saves the selected block of data in the dump
++ * Action taken when iterator decides that data needs to be saved
++ */
++int dump_generic_save_data(unsigned long loc, unsigned long sz)
++{
++ void *buf;
++ void *dump_buf = dump_config.dumper->dump_buf;
++ int left, bytes, ret;
++
++ if ((ret = dump_add_data(loc, sz))) {
++ return ret;
++ }
++ buf = dump_config.dumper->curr_buf;
++
++ /* If we've filled up the buffer write it out */
++ if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) {
++ bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE);
++ if (bytes < DUMP_BUFFER_SIZE) {
++ printk("dump_write_buffer failed %d\n", bytes);
++ return bytes ? -ENOSPC : bytes;
++ }
++
++ left -= bytes;
++
++ /* -- A few chores to do from time to time -- */
++ dump_config.dumper->count++;
++
++ if (!(dump_config.dumper->count & 0x3f)) {
++			/* Update the header every once in a while */
++ memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);
++ if ((ret = dump_update_header()) < 0) {
++ /* issue warning */
++ return ret;
++ }
++ printk(".");
++
++ touch_nmi_watchdog();
++ } else if (!(dump_config.dumper->count & 0x7)) {
++ /* Show progress so the user knows we aren't hung */
++ dump_speedo(dump_config.dumper->count >> 3);
++ }
++ /* Todo: Touch/Refresh watchdog */
++
++ /* --- Done with periodic chores -- */
++
++ /*
++ * extra bit of copying to simplify verification
++ * in the second kernel boot based scheme
++ */
++ memcpy(dump_buf - DUMP_PAGE_SIZE, dump_buf +
++ DUMP_BUFFER_SIZE - DUMP_PAGE_SIZE, DUMP_PAGE_SIZE);
++
++ /* now adjust the leftover bits back to the top of the page */
++ /* this case would not arise during stage 2 (passthru) */
++ memset(dump_buf, 'z', DUMP_BUFFER_SIZE);
++ if (left) {
++ memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left);
++ }
++ buf -= DUMP_BUFFER_SIZE;
++ dump_config.dumper->curr_buf = buf;
++ }
++
++ return 0;
++}
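++/*
++ * Cadence of the periodic chores above: the header is rewritten and the
++ * NMI watchdog touched on every 64th buffer flush (count & 0x3f), and
++ * the speedo progress indicator advances on every 8th flush (count & 0x7).
++ * Assuming a 64K DUMP_BUFFER_SIZE (illustrative; the actual value comes
++ * from dump.h), that is one header refresh per ~4MB of dump data.
++ */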
++
++int dump_generic_skip_data(unsigned long loc, unsigned long sz)
++{
++ /* dummy by default */
++ return 0;
++}
++
++/*
++ * Common low level routine to write a buffer to current dump device
++ * Expects checks for space etc to have been taken care of by the caller
++ * Operates serially at the moment for simplicity.
++ * TBD/Todo: Consider batching for improved throughput
++ */
++int dump_ll_write(void *buf, unsigned long len)
++{
++ long transferred = 0, last_transfer = 0;
++ int ret = 0;
++
++ /* make sure device is ready */
++ while ((ret = dump_dev_ready(NULL)) == -EAGAIN);
++ if (ret < 0) {
++ printk("dump_dev_ready failed !err %d\n", ret);
++ return ret;
++ }
++
++ while (len) {
++ if ((last_transfer = dump_dev_write(buf, len)) <= 0) {
++ ret = last_transfer;
++ printk("dump_dev_write failed !err %d\n",
++ ret);
++ break;
++ }
++ /* wait till complete */
++ while ((ret = dump_dev_ready(buf)) == -EAGAIN)
++ cpu_relax();
++
++ if (ret < 0) {
++ printk("i/o failed !err %d\n", ret);
++ break;
++ }
++
++ len -= last_transfer;
++ buf += last_transfer;
++ transferred += last_transfer;
++ }
++ return (ret < 0) ? ret : transferred;
++}
++
++/* default writeout routine for single dump device */
++/* writes out the dump data ensuring enough space is left for the end marker */
++int dump_generic_write_buffer(void *buf, unsigned long len)
++{
++ long written = 0;
++ int err = 0;
++
++ /* check for space */
++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len +
++ 2*DUMP_BUFFER_SIZE)) < 0) {
++ printk("dump_write_buffer: insuff space after offset 0x%llx\n",
++ dump_config.dumper->curr_offset);
++ return err;
++ }
++ /* alignment check would happen as a side effect of this */
++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0)
++ return err;
++
++ written = dump_ll_write(buf, len);
++
++ /* all or none */
++
++ if (written < len)
++ written = written ? -ENOSPC : written;
++ else
++ dump_config.dumper->curr_offset += len;
++
++ return written;
++}
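++/*
++ * The first seek above probes for curr_offset + len plus two buffers of
++ * headroom, so the end marker written at completion is guaranteed to
++ * still fit; the second seek restores the real write position. Writes
++ * are all-or-none: a short transfer is reported as -ENOSPC rather than
++ * advancing curr_offset by a partial amount.
++ */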
++
++int dump_generic_configure(unsigned long devid)
++{
++ struct dump_dev *dev = dump_config.dumper->dev;
++ struct dump_data_filter *filter;
++ void *buf;
++ int ret = 0;
++
++ /* Allocate the dump buffer and initialize dumper state */
++ /* Assume that we get aligned addresses */
++ if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE)))
++ return -ENOMEM;
++
++ if ((unsigned long)buf & (PAGE_SIZE - 1)) {
++ /* sanity check for page aligned address */
++ dump_free_mem(buf);
++ return -ENOMEM; /* fixme: better error code */
++ }
++
++ /* Initialize the rest of the fields */
++ dump_config.dumper->dump_buf = buf + DUMP_PAGE_SIZE;
++ dumper_reset();
++
++ /* Open the dump device */
++ if (!dev)
++ return -ENODEV;
++
++ if ((ret = dev->ops->open(dev, devid))) {
++ return ret;
++ }
++
++ /* Initialise the memory ranges in the dump filter */
++ for (filter = dump_config.dumper->filter ;filter->selector; filter++) {
++ if (!filter->start[0] && !filter->end[0]) {
++ pg_data_t *pgdat;
++ int i = 0;
++ for_each_pgdat(pgdat) {
++ filter->start[i] =
++ (loff_t)pgdat->node_start_pfn << PAGE_SHIFT;
++ filter->end[i] =
++ (loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT;
++ i++;
++ }
++ filter->num_mbanks = i;
++ }
++ }
++
++ return 0;
++}
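++/*
++ * For illustration: on a flat-memory (non-NUMA) machine with a single
++ * pgdat spanning pfns [0, max_pfn), the loop above leaves each filter
++ * with num_mbanks = 1 and the single range
++ *
++ *	filter->start[0] = 0;
++ *	filter->end[0]   = (loff_t)max_pfn << PAGE_SHIFT;
++ *
++ * Discontiguous configurations get one start/end pair per node, which
++ * dump_reconfigure_mbanks() below can later narrow to valid pages.
++ */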
++
++int dump_generic_unconfigure(void)
++{
++ struct dump_dev *dev = dump_config.dumper->dev;
++ void *buf = dump_config.dumper->dump_buf;
++ int ret = 0;
++
++ pr_debug("Generic unconfigure\n");
++ /* Close the dump device */
++ if (dev && (ret = dev->ops->release(dev)))
++ return ret;
++
++ printk("Closed dump device\n");
++
++ if (buf)
++ dump_free_mem((buf - DUMP_PAGE_SIZE));
++
++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL;
++ pr_debug("Released dump buffer\n");
++
++ return 0;
++}
++
++#ifdef CONFIG_DISCONTIGMEM
++
++void dump_reconfigure_mbanks(void)
++{
++ pg_data_t *pgdat;
++ loff_t start, end, loc, loc_end;
++ int i=0;
++ struct dump_data_filter *filter = dump_config.dumper->filter;
++
++ for_each_pgdat(pgdat) {
++
++ start = (loff_t)(pgdat->node_start_pfn << PAGE_SHIFT);
++ end = ((loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT);
++ for(loc = start; loc < end; loc += (DUMP_PAGE_SIZE)) {
++
++ if(!(__dump_page_valid(loc >> PAGE_SHIFT)))
++ continue;
++
++ /* We found a valid page. This is the start */
++ filter->start[i] = loc;
++
++ /* Now loop here till you find the end */
++ for(loc_end = loc; loc_end < end; loc_end += (DUMP_PAGE_SIZE)) {
++
++ if(__dump_page_valid(loc_end >> PAGE_SHIFT)) {
++ /* This page could very well be the last page */
++ filter->end[i] = loc_end;
++ continue;
++ }
++ break;
++ }
++ i++;
++ loc = loc_end;
++ }
++ }
++ filter->num_mbanks = i;
++
++ /* Propagate memory bank information to other filters */
++ for (filter = dump_config.dumper->filter, filter++ ;filter->selector; filter++) {
++ for(i = 0; i < dump_config.dumper->filter->num_mbanks; i++) {
++ filter->start[i] = dump_config.dumper->filter->start[i];
++ filter->end[i] = dump_config.dumper->filter->end[i];
++ filter->num_mbanks = dump_config.dumper->filter->num_mbanks;
++ }
++ }
++}
++#endif
++
++/* Set up the default dump scheme */
++
++struct dump_scheme_ops dump_scheme_singlestage_ops = {
++ .configure = dump_generic_configure,
++ .unconfigure = dump_generic_unconfigure,
++ .sequencer = dump_generic_sequencer,
++ .iterator = dump_page_iterator,
++ .save_data = dump_generic_save_data,
++ .skip_data = dump_generic_skip_data,
++ .write_buffer = dump_generic_write_buffer,
++};
++
++struct dump_scheme dump_scheme_singlestage = {
++ .name = "single-stage",
++ .ops = &dump_scheme_singlestage_ops
++};
++
++/* The single stage dumper comprising all these */
++struct dumper dumper_singlestage = {
++ .name = "single-stage",
++ .scheme = &dump_scheme_singlestage,
++ .fmt = &dump_fmt_lcrash,
++ .compress = &dump_none_compression,
++ .filter = dump_filter_table,
++ .dev = NULL,
++};
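++/*
++ * Note how the profile above simply composes independently selected
++ * pieces (scheme, format, compression, filter table, device), per the
++ * mix-and-match design described in dump_methods.h; a hypothetical
++ * variant could reuse all of these and only swap in, say, a different
++ * .fmt or .compress method.
++ */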
++
+Index: linux-2.6.10/drivers/dump/dump_gzip.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_gzip.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_gzip.c 2005-04-05 16:47:53.937206016 +0800
+@@ -0,0 +1,174 @@
++/*
++ * GZIP Compression functions for kernel crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sourceforge.net)
++ * Copyright 2001 Matt D. Robinson. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* header files */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/dump.h>
++#include <linux/zlib.h>
++#include <linux/vmalloc.h>
++
++static void *deflate_workspace;
++static unsigned long workspace_paddr[2];
++
++static u8 *safety_buffer;
++
++/*
++ * Name: dump_compress_gzip()
++ * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the
++ *       deflate functions, similar to what's used in PPP).
++ */
++static u32
++dump_compress_gzip(const u8 *old, u32 oldsize, u8 *new, u32 newsize,
++ unsigned long loc)
++{
++ /* error code and dump stream */
++ int err;
++ z_stream dump_stream;
++ struct page *pg = (struct page *)loc;
++ unsigned long paddr = page_to_pfn(pg) << PAGE_SHIFT;
++ static int warning = 0;
++
++ dump_stream.workspace = deflate_workspace;
++ if ((paddr == workspace_paddr[0]) || (paddr == workspace_paddr[1])) {
++ /*
++ * This page belongs to deflate_workspace used as temporary
++ * buffer for compression. Hence, dump them without compression.
++ */
++ return(0);
++ }
++ if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) {
++ /* fall back to RLE compression */
++ printk("dump_compress_gzip(): zlib_deflateInit() "
++ "failed (%d)!\n", err);
++ return 0;
++ }
++
++ /* copy the old page to the safety buffer */
++ if (oldsize <= DUMP_PAGE_SIZE) {
++ memcpy(safety_buffer, old, oldsize);
++ dump_stream.next_in = (u8 *) safety_buffer;
++ } else {
++ if (!warning) {
++ printk("dump_compress_gzip oversize input: %d\n",
++ oldsize);
++ warning++;
++ }
++ dump_stream.next_in = (u8 *) old;
++ }
++
++ /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */
++ dump_stream.avail_in = oldsize;
++
++ /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */
++ dump_stream.next_out = new;
++ dump_stream.avail_out = newsize;
++
++ /* deflate the page -- check for error */
++ err = zlib_deflate(&dump_stream, Z_FINISH);
++ if (err != Z_STREAM_END) {
++ /* zero is return code here */
++ (void)zlib_deflateEnd(&dump_stream);
++ printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n",
++ err);
++ return 0;
++ }
++
++ /* let's end the deflated compression stream */
++ if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) {
++ printk("dump_compress_gzip(): zlib_deflateEnd() "
++ "failed (%d)!\n", err);
++ }
++
++ /* return the compressed byte total (if it's smaller) */
++ if (dump_stream.total_out >= oldsize) {
++ return oldsize;
++ }
++ return dump_stream.total_out;
++}
++
++/* setup the gzip compression functionality */
++static struct __dump_compress dump_gzip_compression = {
++ .compress_type = DUMP_COMPRESS_GZIP,
++ .compress_func = dump_compress_gzip,
++ .compress_name = "GZIP",
++};
++
++/*
++ * Name: dump_compress_gzip_init()
++ * Func: Initialize gzip as a compression mechanism.
++ */
++static int __init
++dump_compress_gzip_init(void)
++{
++ struct page *pg;
++
++ deflate_workspace = vmalloc(zlib_deflate_workspacesize());
++ if (!deflate_workspace) {
++ printk("dump_compress_gzip_init(): Failed to "
++ "alloc %d bytes for deflate workspace\n",
++ zlib_deflate_workspacesize());
++ return -ENOMEM;
++ }
++ /*
++	 * Need to find the workspace pages that are used during compression.
++	 * Even though zlib_deflate_workspacesize() is roughly 64 pages
++	 * (the exact size depends on the arch), only the first 2 pages are
++	 * used. Hence, record the physical addresses of those 2 pages and
++	 * use them to skip compression of those pages.
++ */
++ pg = vmalloc_to_page(deflate_workspace);
++ workspace_paddr[0] = page_to_pfn(pg) << PAGE_SHIFT;
++ pg = vmalloc_to_page(deflate_workspace + DUMP_PAGE_SIZE);
++ workspace_paddr[1] = page_to_pfn(pg) << PAGE_SHIFT;
++
++ /* Eliminate the possibility of real data getting a compression
++ * failure.
++ */
++
++ if (!(safety_buffer = (void *)__get_free_pages(GFP_KERNEL,
++ get_order(DUMP_PAGE_SIZE))))
++ return -ENOMEM;
++
++ printk("dump gzip safety buffer: %p, %d\n", safety_buffer,
++ (int)DUMP_PAGE_SIZE);
++
++ dump_register_compression(&dump_gzip_compression);
++ return 0;
++}
++
++/*
++ * Name: dump_compress_gzip_cleanup()
++ * Func: Remove gzip as a compression mechanism.
++ */
++static void __exit
++dump_compress_gzip_cleanup(void)
++{
++ vfree(deflate_workspace);
++ if (safety_buffer) {
++ free_pages((unsigned long)safety_buffer,
++ get_order(DUMP_PAGE_SIZE));
++ safety_buffer = NULL;
++ }
++
++ dump_unregister_compression(DUMP_COMPRESS_GZIP);
++}
++
++/* module initialization */
++module_init(dump_compress_gzip_init);
++module_exit(dump_compress_gzip_cleanup);
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("LKCD Development Team <lkcd-devel@lists.sourceforge.net>");
++MODULE_DESCRIPTION("Gzip compression module for crash dump driver");
+Index: linux-2.6.10/drivers/dump/dump_filters.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_filters.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_filters.c 2005-04-05 16:47:53.942205256 +0800
+@@ -0,0 +1,143 @@
++/*
++ * Default filters to select data to dump for various passes.
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
++ * Split and rewrote default dump selection logic to generic dump
++ * method interfaces
++ * Derived from a portion of dump_base.c created by
++ *	Matt Robinson <yakker@sourceforge.net>
++ *
++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved.
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * Used during single-stage dumping and during stage 1 of the 2-stage scheme
++ * (Stage 2 of the 2-stage scheme uses the fully transparent filters
++ * i.e. passthru filters in dump_overlay.c)
++ *
++ * Future: Custom selective dump may involve a different set of filters.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#include <linux/kernel.h>
++#include <linux/bootmem.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/dump.h>
++#include "dump_methods.h"
++
++#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */
++static unsigned long bootmap_pages;
++
++/* Copied from mm/bootmem.c - FIXME */
++/* return the number of _pages_ that will be allocated for the boot bitmap */
++void dump_calc_bootmap_pages (void)
++{
++ unsigned long mapsize;
++ unsigned long pages = num_physpages;
++
++ mapsize = (pages+7)/8;
++ mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
++ mapsize >>= PAGE_SHIFT;
++ bootmap_pages = mapsize + DUMP_PFN_SAFETY_MARGIN + 1;
++}
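++/*
++ * Worked example (illustrative numbers): with 1GB of RAM and 4K pages,
++ * num_physpages = 262144, so the boot bitmap needs (262144 + 7)/8 =
++ * 32768 bytes = 8 pages after rounding up. Adding the 1024-page safety
++ * margin plus one gives bootmap_pages = 1033, i.e. roughly the first
++ * 4MB above min_low_pfn is always classified as "low" and dumped.
++ */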
++
++
++/* temporary */
++extern unsigned long min_low_pfn;
++
++
++int dump_low_page(struct page *p)
++{
++ return ((page_to_pfn(p) >= min_low_pfn) &&
++ (page_to_pfn(p) < (min_low_pfn + bootmap_pages)));
++}
++
++static inline int kernel_page(struct page *p)
++{
++ /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */
++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p));
++}
++
++static inline int user_page(struct page *p)
++{
++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p));
++}
++
++static inline int unreferenced_page(struct page *p)
++{
++ return !PageInuse(p) && !PageReserved(p);
++}
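++/*
++ * The three helpers above classify a page by its flags as follows:
++ *
++ *	Reserved && !Inuse	  -> kernel page
++ *	Inuse && !LRU		  -> kernel page
++ *	Inuse && LRU && !Reserved -> user page
++ *	!Inuse && !Reserved	  -> unreferenced page
++ *
++ * The dump_low_page() checks in the filters below additionally force
++ * pages in the low bootmap region into the kernel pass.
++ */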
++
++
++/* loc marks the beginning of a range of pages */
++int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz)
++{
++ struct page *page = (struct page *)loc;
++ /* if any of the pages is a kernel page, select this set */
++ while (sz) {
++ if (dump_low_page(page) || kernel_page(page))
++ return 1;
++ sz -= PAGE_SIZE;
++ page++;
++ }
++ return 0;
++}
++
++
++/* loc marks the beginning of a range of pages */
++int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz)
++{
++ struct page *page = (struct page *)loc;
++ int ret = 0;
++ /* select if the set has any user page, and no kernel pages */
++ while (sz) {
++ if (user_page(page) && !dump_low_page(page)) {
++ ret = 1;
++ } else if (kernel_page(page) || dump_low_page(page)) {
++ return 0;
++ }
++ page++;
++ sz -= PAGE_SIZE;
++ }
++ return ret;
++}
++
++
++
++/* loc marks the beginning of a range of pages */
++int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz)
++{
++ struct page *page = (struct page *)loc;
++
++ /* select if the set does not have any used pages */
++ while (sz) {
++ if (!unreferenced_page(page) || dump_low_page(page)) {
++ return 0;
++ }
++ page++;
++ sz -= PAGE_SIZE;
++ }
++ return 1;
++}
++
++/* dummy: last (non-existent) pass */
++int dump_filter_none(int pass, unsigned long loc, unsigned long sz)
++{
++ return 0;
++}
++
++/* TBD: resolve level bitmask ? */
++struct dump_data_filter dump_filter_table[] = {
++ { .name = "kern", .selector = dump_filter_kernpages,
++ .level_mask = DUMP_MASK_KERN},
++ { .name = "user", .selector = dump_filter_userpages,
++ .level_mask = DUMP_MASK_USED},
++ { .name = "unused", .selector = dump_filter_unusedpages,
++ .level_mask = DUMP_MASK_UNUSED},
++ { .name = "none", .selector = dump_filter_none,
++ .level_mask = DUMP_MASK_REST},
++ { .name = "", .selector = NULL, .level_mask = 0}
++};
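++/*
++ * Example of how dump_generic_sequencer() consumes this table: with
++ * dump_config.level set to, say, DUMP_MASK_KERN | DUMP_MASK_USED, the
++ * "kern" and "user" passes match their level_mask and use the save_data
++ * action, while "unused" and "none" fall through to skip_data. The
++ * empty terminating entry (NULL selector) ends the pass loop.
++ */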
++
+Index: linux-2.6.10/drivers/dump/dump_ppc64.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_ppc64.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_ppc64.c 2005-04-05 16:47:53.931206928 +0800
+@@ -0,0 +1,410 @@
++/*
++ * Architecture specific (ppc64) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com)
++ * Copyright 2000 TurboLinux, Inc. All rights reserved.
++ * Copyright 2003, 2004 IBM Corporation
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file. Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include <linux/types.h>
++#include <linux/fs.h>
++#include <linux/dump.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <linux/delay.h>
++#include <linux/syscalls.h>
++#include <asm/hardirq.h>
++#include "dump_methods.h"
++#include <linux/irq.h>
++#include <asm/machdep.h>
++#include <asm/uaccess.h>
++#include <asm/irq.h>
++#include <asm/page.h>
++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE)
++#include <linux/kdb.h>
++#endif
++
++extern cpumask_t irq_affinity[];
++
++static cpumask_t saved_affinity[NR_IRQS];
++
++static __s32 saved_irq_count; /* saved preempt_count() flags */
++
++static int alloc_dha_stack(void)
++{
++ int i;
++ void *ptr;
++
++ if (dump_header_asm.dha_stack[0])
++ return 0;
++
++ ptr = (void *)vmalloc(THREAD_SIZE * num_possible_cpus());
++ if (!ptr) {
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < num_possible_cpus(); i++) {
++ dump_header_asm.dha_stack[i] =
++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE));
++ }
++ return 0;
++}
++
++static int free_dha_stack(void)
++{
++ if (dump_header_asm.dha_stack[0]) {
++ vfree((void*)dump_header_asm.dha_stack[0]);
++ dump_header_asm.dha_stack[0] = 0;
++ }
++ return 0;
++}
++#ifdef CONFIG_SMP
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++
++extern void stop_this_cpu(void *);
++static int
++dump_ipi_handler(struct pt_regs *regs)
++{
++ int cpu = smp_processor_id();
++
++ if (!dump_expect_ipi[cpu])
++ return 0;
++ dump_save_this_cpu(regs);
++ atomic_dec(&waiting_for_dump_ipi);
++
++ level_changed:
++ switch (dump_silence_level) {
++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */
++ while (dump_oncpu) {
++ barrier(); /* paranoia */
++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++ goto level_changed;
++ cpu_relax(); /* kill time nicely */
++ }
++ break;
++
++ case DUMP_HALT_CPUS: /* Execute halt */
++ stop_this_cpu(NULL);
++ break;
++
++ case DUMP_SOFT_SPIN_CPUS:
++ /* Mark the task so it spins in schedule */
++ set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++ break;
++ }
++
++ return 1;
++}
++
++/* save registers on other processors
++ * If the other cpus don't respond we simply do not get their states.
++ */
++void
++__dump_save_other_cpus(void)
++{
++ int i, cpu = smp_processor_id();
++ int other_cpus = num_online_cpus()-1;
++
++ if (other_cpus > 0) {
++ atomic_set(&waiting_for_dump_ipi, other_cpus);
++ for (i = 0; i < NR_CPUS; i++)
++ dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++
++ printk(KERN_ALERT "sending IPI to other cpus...\n");
++ dump_send_ipi(dump_ipi_handler);
++ /*
++		 * Maybe we don't need to wait for the IPI to be processed
++		 * and could just write out the header at the end of dumping;
++		 * if the IPI still hasn't been processed by then, there
++		 * probably is a problem and we simply fail to capture the
++		 * state of the other cpus.
++		 * However, we do wait up to 10 secs for the other CPUs to
++		 * respond; if they don't, we proceed with the dump anyway,
++		 * having failed to capture their state.
++ */
++ i = 10000; /* wait max of 10 seconds */
++ while ((atomic_read(&waiting_for_dump_ipi) > 0) && (--i > 0)) {
++ barrier();
++ mdelay(1);
++ }
++ printk(KERN_ALERT "done waiting: %d cpus not responding\n",
++ atomic_read(&waiting_for_dump_ipi));
++ dump_send_ipi(NULL); /* clear handler */
++ }
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++__dump_reset_irq_affinity(void)
++{
++ int i;
++ irq_desc_t *irq_d;
++
++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long));
++
++ for_each_irq(i) {
++ irq_d = get_irq_desc(i);
++ if (irq_d->handler == NULL) {
++ continue;
++ }
++ if (irq_d->handler->set_affinity != NULL) {
++ irq_d->handler->set_affinity(i, saved_affinity[i]);
++ }
++ }
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ *
++ * NB: Need to be expanded to multiple nodes.
++ */
++static void
++__dump_set_irq_affinity(void)
++{
++ int i;
++ cpumask_t cpu = CPU_MASK_NONE;
++ irq_desc_t *irq_d;
++
++ cpu_set(smp_processor_id(), cpu);
++
++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++
++ for_each_irq(i) {
++ irq_d = get_irq_desc(i);
++ if (irq_d->handler == NULL) {
++ continue;
++ }
++ irq_affinity[i] = cpu;
++ if (irq_d->handler->set_affinity != NULL) {
++ irq_d->handler->set_affinity(i, irq_affinity[i]);
++ }
++ }
++}
++#else /* !CONFIG_SMP */
++#define __dump_save_other_cpus() do { } while (0)
++#define __dump_set_irq_affinity() do { } while (0)
++#define __dump_reset_irq_affinity() do { } while (0)
++#endif /* !CONFIG_SMP */
++
++void
++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs)
++{
++ if (regs) {
++ memcpy(dest_regs, regs, sizeof(struct pt_regs));
++ }
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
++
++ /* take a snapshot of the stack */
++ /* doing this enables us to tolerate slight drifts on this cpu */
++
++ if (dump_header_asm.dha_stack[cpu]) {
++ memcpy((void *)dump_header_asm.dha_stack[cpu],
++ STACK_START_POSITION(tsk),
++ THREAD_SIZE);
++ }
++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info);
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++ return (0);
++}
++
++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE)
++int
++kdb_sysdump(int argc, const char **argv, const char **envp, struct pt_regs *regs)
++{
++ kdb_printf("Dumping to disk...\n");
++ dump("dump from kdb", regs);
++ kdb_printf("Dump Complete\n");
++ return 0;
++}
++#endif
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping process. Currently a placeholder,
++ *       kept in case arch-specific setup becomes necessary in the future.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++#if defined(FIXME) && defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE)
++ /* This won't currently work because interrupts are off in kdb
++ * and the dump process doesn't understand how to recover.
++ */
++ /* ToDo: add a command to query/set dump configuration */
++ kdb_register_repeat("sysdump", kdb_sysdump, "", "use lkcd to dump the system to disk (if configured)", 0, KDB_REPEAT_NONE);
++#endif
++
++ /* return */
++ return;
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ * case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++ alloc_dha_stack();
++}
++
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ * when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++ free_dha_stack();
++}
++
++/*
++ * Kludge - dump from interrupt context is unreliable (Fixme)
++ *
++ * We do this so that softirqs initiated for dump i/o
++ * get processed and we don't hang while waiting for i/o
++ * to complete or in any irq synchronization attempt.
++ *
++ * This is not quite legal of course, as it has the side
++ * effect of making all interrupts & softirqs triggered
++ * while dump is in progress complete before currently
++ * pending softirqs and the currently executing interrupt
++ * code.
++ */
++static inline void
++irq_bh_save(void)
++{
++ saved_irq_count = irq_count();
++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++ preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ * This is used for dump methods that require interrupts
++ * Eventually, all methods will have interrupts disabled
++ * and this code can be removed.
++ *
++ * Change irq affinities
++ * Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++ __dump_set_irq_affinity();
++ irq_bh_save();
++ local_irq_enable();
++ return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ */
++void
++__dump_irq_restore(void)
++{
++ local_irq_disable();
++ __dump_reset_irq_affinity();
++ irq_bh_restore();
++}
++
++#if 0
++/* Cheap progress hack. It estimates pages to write and
++ * assumes all pages will go -- so it may get way off.
++ * Since progress is not displayed on other architectures, this is not
++ * used at the moment.
++ */
++void
++__dump_progress_add_page(void)
++{
++ unsigned long total_pages = nr_free_pages() + nr_inactive_pages + nr_active_pages;
++ unsigned int percent = (dump_header.dh_num_dump_pages * 100) / total_pages;
++ char buf[30];
++
++ if (percent > last_percent && percent <= 100) {
++ sprintf(buf, "Dump %3d%% ", percent);
++ ppc64_dump_msg(0x2, buf);
++ last_percent = percent;
++ }
++
++}
++#endif
++
++extern int dump_page_is_ram(unsigned long);
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++ if (!pfn_valid(index))
++ return 0;
++
++ return dump_page_is_ram(index);
++}
++
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command. Calls dump_execute()
++ */
++int
++manual_handle_crashdump(void)
++{
++ struct pt_regs regs;
++
++	get_current_regs(&regs);
++	dump_execute("manual", &regs);
++ return 0;
++}
++
++/*
++ * Name: __dump_clean_irq_state()
++ * Func: Clean up from the previous IRQ handling state. Such as oops from
++ * interrupt handler or bottom half.
++ */
++void
++__dump_clean_irq_state(void)
++{
++ return;
++}
+Index: linux-2.6.10/drivers/dump/dump_methods.h
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_methods.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_methods.h 2005-04-05 16:47:53.930207080 +0800
+@@ -0,0 +1,357 @@
++/*
++ * Generic interfaces for flexible system dump
++ *
++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com)
++ *
++ * Copyright (C) 2002 International Business Machines Corp.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++#ifndef _LINUX_DUMP_METHODS_H
++#define _LINUX_DUMP_METHODS_H
++
++/*
++ * Inspired by Matt Robinson's suggestion of introducing dump
++ * methods as a way to enable different crash dump facilities to
++ * coexist where each employs its own scheme or dumping policy.
++ *
++ * The code here creates a framework for flexible dump by defining
++ * a set of methods and providing associated helpers that differentiate
++ * between the underlying mechanism (how to dump), overall scheme
++ * (sequencing of stages and data dumped and associated quiescing),
++ * output format (what the dump output looks like), target type
++ * (where to save the dump; see dumpdev.h), and selection policy
++ * (state/data to dump).
++ *
++ * These sets of interfaces can be mixed and matched to build a
++ * dumper suitable for a given situation, allowing for flexibility
++ * as well as an appropriate degree of code reuse.
++ * For example all features and options of lkcd (including
++ * granular selective dumping in the near future) should be
++ * available even when say, the 2 stage soft-boot based mechanism
++ * is used for taking disruptive dumps.
++ *
++ * Todo: Additionally modules or drivers may supply their own
++ * custom dumpers which extend dump with module specific
++ * information or hardware state, and can even tweak the
++ * mechanism when it comes to saving state relevant to
++ * them.
++ */
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/highmem.h>
++#include <linux/dumpdev.h>
++#include <asm/page.h> /* get_order */
++
++#define MAX_PASSES 6
++#define MAX_DEVS 4
++
++
++/* To customise selection of pages to be dumped in a given pass/group */
++struct dump_data_filter{
++ char name[32];
++ int (*selector)(int, unsigned long, unsigned long);
++ ulong level_mask; /* dump level(s) for which this filter applies */
++ loff_t start[MAX_NUMNODES], end[MAX_NUMNODES]; /* location range applicable */
++ ulong num_mbanks; /* Number of memory banks. Greater than one for discontig memory (NUMA) */
++};
++
++
++/*
++ * Determined by the kind of dump mechanism and appropriate
++ * overall scheme
++ */
++struct dump_scheme_ops {
++ /* sets aside memory, inits data structures etc */
++ int (*configure)(unsigned long devid);
++ /* releases resources */
++ int (*unconfigure)(void);
++
++ /* ordering of passes, invoking iterator */
++ int (*sequencer)(void);
++ /* iterates over system data, selects and acts on data to dump */
++ int (*iterator)(int, int (*)(unsigned long, unsigned long),
++ struct dump_data_filter *);
++ /* action when data is selected for dump */
++ int (*save_data)(unsigned long, unsigned long);
++ /* action when data is to be excluded from dump */
++ int (*skip_data)(unsigned long, unsigned long);
++ /* policies for space, multiple dump devices etc */
++ int (*write_buffer)(void *, unsigned long);
++};
++
++struct dump_scheme {
++ /* the name serves as an anchor to locate the scheme after reboot */
++ char name[32];
++ struct dump_scheme_ops *ops;
++ struct list_head list;
++};
++
++/* Quiescing/Silence levels (controls IPI callback behaviour) */
++extern enum dump_silence_levels {
++ DUMP_SOFT_SPIN_CPUS = 1,
++ DUMP_HARD_SPIN_CPUS = 2,
++ DUMP_HALT_CPUS = 3,
++} dump_silence_level;
++
++/* determined by the dump (file) format */
++struct dump_fmt_ops {
++ /* build header */
++ int (*configure_header)(const char *, const struct pt_regs *);
++ int (*update_header)(void); /* update header and write it out */
++ /* save curr context */
++ void (*save_context)(int, const struct pt_regs *,
++ struct task_struct *);
++ /* typically called by the save_data action */
++ /* add formatted data to the dump buffer */
++ int (*add_data)(unsigned long, unsigned long);
++ int (*update_end_marker)(void);
++};
++
++struct dump_fmt {
++ unsigned long magic;
++ char name[32]; /* lcrash, crash, elf-core etc */
++ struct dump_fmt_ops *ops;
++ struct list_head list;
++};
++
++/*
++ * Modules will be able add their own data capture schemes by
++ * registering their own dumpers. Typically they would use the
++ * primary dumper as a template and tune it with their routines.
++ * Still Todo.
++ */
++
++/* The combined dumper profile (mechanism, scheme, dev, fmt) */
++struct dumper {
++ char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */
++ struct dump_scheme *scheme;
++ struct dump_fmt *fmt;
++ struct __dump_compress *compress;
++ struct dump_data_filter *filter;
++ struct dump_dev *dev;
++ /* state valid only for active dumper(s) - per instance */
++ /* run time state/context */
++ int curr_pass;
++ unsigned long count;
++ loff_t curr_offset; /* current logical offset into dump device */
++ loff_t curr_loc; /* current memory location */
++ void *curr_buf; /* current position in the dump buffer */
++ void *dump_buf; /* starting addr of dump buffer */
++ int header_dirty; /* whether the header needs to be written out */
++ int header_len;
++ struct list_head dumper_list; /* links to other dumpers */
++};
++
++/* Starting point to get to the current configured state */
++struct dump_config {
++ ulong level;
++ ulong flags;
++ struct dumper *dumper;
++ unsigned long dump_device;
++ unsigned long dump_addr; /* relevant only for in-memory dumps */
++ struct list_head dump_dev_list;
++};
++
++extern struct dump_config dump_config;
++
++/* Used to save the dump config across a reboot for 2-stage dumps:
++ *
++ * Note: The scheme, format, compression and device type should be
++ * registered at bootup, for this config to be sharable across soft-boot.
++ * The function addresses could have changed and become invalid, and
++ * need to be set up again.
++ */
++struct dump_config_block {
++ u64 magic; /* for a quick sanity check after reboot */
++ struct dump_memdev memdev; /* handle to dump stored in memory */
++ struct dump_config config;
++ struct dumper dumper;
++ struct dump_scheme scheme;
++ struct dump_fmt fmt;
++ struct __dump_compress compress;
++ struct dump_data_filter filter_table[MAX_PASSES];
++ struct dump_anydev dev[MAX_DEVS]; /* target dump device */
++};
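++/*
++ * Minimal sketch of the revalidation a 2-stage dumper might perform
++ * after soft boot (the helper and magic names here are hypothetical,
++ * not part of this patch):
++ *
++ *	struct dump_config_block *blk = dump_find_config_block();
++ *	if (!blk || blk->magic != DUMP_CONFIG_MAGIC)
++ *		return -EINVAL;		(stale or corrupt block)
++ *	(then rebind blk->scheme.ops, blk->fmt.ops, etc., since the
++ *	saved function addresses are no longer valid across reboot)
++ */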
++
++
++/* Wrappers that invoke the methods for the current (active) dumper */
++
++/* Scheme operations */
++
++static inline int dump_sequencer(void)
++{
++ return dump_config.dumper->scheme->ops->sequencer();
++}
++
++static inline int dump_iterator(int pass, int (*action)(unsigned long,
++ unsigned long), struct dump_data_filter *filter)
++{
++ return dump_config.dumper->scheme->ops->iterator(pass, action, filter);
++}
++
++#define dump_save_data dump_config.dumper->scheme->ops->save_data
++#define dump_skip_data dump_config.dumper->scheme->ops->skip_data
++
++static inline int dump_write_buffer(void *buf, unsigned long len)
++{
++ return dump_config.dumper->scheme->ops->write_buffer(buf, len);
++}
++
++static inline int dump_configure(unsigned long devid)
++{
++ return dump_config.dumper->scheme->ops->configure(devid);
++}
++
++static inline int dump_unconfigure(void)
++{
++ return dump_config.dumper->scheme->ops->unconfigure();
++}
++
++/* Format operations */
++
++static inline int dump_configure_header(const char *panic_str,
++ const struct pt_regs *regs)
++{
++ return dump_config.dumper->fmt->ops->configure_header(panic_str, regs);
++}
++
++static inline void dump_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk)
++{
++ dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk);
++}
++
++static inline int dump_save_this_cpu(const struct pt_regs *regs)
++{
++ int cpu = smp_processor_id();
++
++ dump_save_context(cpu, regs, current);
++ return 1;
++}
++
++static inline int dump_update_header(void)
++{
++ return dump_config.dumper->fmt->ops->update_header();
++}
++
++static inline int dump_update_end_marker(void)
++{
++ return dump_config.dumper->fmt->ops->update_end_marker();
++}
++
++static inline int dump_add_data(unsigned long loc, unsigned long sz)
++{
++ return dump_config.dumper->fmt->ops->add_data(loc, sz);
++}
++
++/* Compression operation */
++static inline int dump_compress_data(char *src, int slen, char *dst,
++ unsigned long loc)
++{
++ return dump_config.dumper->compress->compress_func(src, slen,
++ dst, DUMP_DPC_PAGE_SIZE, loc);
++}
++
++
++/* Prototypes of some default implementations of dump methods */
++
++extern struct __dump_compress dump_none_compression;
++
++/* Default scheme methods (dump_scheme.c) */
++
++extern int dump_generic_sequencer(void);
++extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned
++ long), struct dump_data_filter *filter);
++extern int dump_generic_save_data(unsigned long loc, unsigned long sz);
++extern int dump_generic_skip_data(unsigned long loc, unsigned long sz);
++extern int dump_generic_write_buffer(void *buf, unsigned long len);
++extern int dump_generic_configure(unsigned long);
++extern int dump_generic_unconfigure(void);
++#ifdef CONFIG_DISCONTIGMEM
++extern void dump_reconfigure_mbanks(void);
++#endif
++
++/* Default scheme template */
++extern struct dump_scheme dump_scheme_singlestage;
++
++/* Default dump format methods */
++
++extern int dump_lcrash_configure_header(const char *panic_str,
++ const struct pt_regs *regs);
++extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs,
++ struct task_struct *tsk);
++extern int dump_generic_update_header(void);
++extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz);
++extern int dump_lcrash_update_end_marker(void);
++
++/* Default format (lcrash) template */
++extern struct dump_fmt dump_fmt_lcrash;
++
++/* Default dump selection filter table */
++
++/*
++ * Entries are listed in order of importance and correspond to passes.
++ * The last entry (with a level_mask of zero) typically reflects data that
++ * won't be dumped -- it may, for example, be used to identify data that
++ * will be skipped in any case, so the corresponding memory areas can be
++ * utilized as scratch space.
++ */
++extern struct dump_data_filter dump_filter_table[];
++
++/* Some pre-defined dumpers */
++extern struct dumper dumper_singlestage;
++extern struct dumper dumper_stage1;
++extern struct dumper dumper_stage2;
++
++/* These are temporary */
++#define DUMP_MASK_HEADER DUMP_LEVEL_HEADER
++#define DUMP_MASK_KERN DUMP_LEVEL_KERN
++#define DUMP_MASK_USED DUMP_LEVEL_USED
++#define DUMP_MASK_UNUSED DUMP_LEVEL_ALL_RAM
++#define DUMP_MASK_REST 0 /* dummy for now */
++
++/* Helpers - move these to dump.h later ? */
++
++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs);
++extern int dump_ll_write(void *buf, unsigned long len);
++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page);
++
++static inline void dumper_reset(void)
++{
++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf;
++ dump_config.dumper->curr_loc = 0;
++ dump_config.dumper->curr_offset = 0;
++ dump_config.dumper->count = 0;
++ dump_config.dumper->curr_pass = 0;
++}
++
++/*
++ * May later be moulded to perform boot-time allocations so we can dump
++ * earlier during bootup
++ */
++static inline void *dump_alloc_mem(unsigned long size)
++{
++ return (void *) __get_free_pages(GFP_KERNEL, get_order(size));
++}
++
++static inline void dump_free_mem(void *buf)
++{
++ struct page *page;
++
++ /* ignore reserved pages (e.g. post soft boot stage) */
++ if (buf && (page = virt_to_page(buf))) {
++ if (PageReserved(page))
++ return;
++ }
++ /*
++ * Allocated using __get_free_pages().
++ */
++ free_pages((unsigned long)buf,
++ get_order(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE));
++}
++
++
++#endif /* _LINUX_DUMP_METHODS_H */
+Index: linux-2.6.10/drivers/dump/Makefile
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/Makefile 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/Makefile 2005-04-05 16:47:53.947204496 +0800
+@@ -0,0 +1,22 @@
++#
++# Makefile for the dump device drivers.
++#
++
++dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o
++ifeq ($(CONFIG_X86_64),)
++ifeq ($(CONFIG_X86),y)
++dump-$(CONFIG_X86) += dump_i386.o
++endif
++endif
++dump-$(CONFIG_ARM) += dump_arm.o
++dump-$(CONFIG_PPC64) += dump_ppc64.o
++dump-$(CONFIG_X86_64) += dump_x8664.o
++dump-$(CONFIG_IA64) += dump_ia64.o
++dump-$(CONFIG_CRASH_DUMP_MEMDEV) += dump_memdev.o dump_overlay.o
++dump-objs += $(dump-y)
++
++obj-$(CONFIG_CRASH_DUMP) += dump.o
++obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o
++obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o
++obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o
++obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o
+Index: linux-2.6.10/drivers/Makefile
+===================================================================
+--- linux-2.6.10.orig/drivers/Makefile 2004-12-25 05:36:00.000000000 +0800
++++ linux-2.6.10/drivers/Makefile 2005-04-05 16:47:53.950204040 +0800
+@@ -60,3 +60,4 @@
+ obj-$(CONFIG_CPU_FREQ) += cpufreq/
+ obj-$(CONFIG_MMC) += mmc/
+ obj-y += firmware/
++obj-$(CONFIG_CRASH_DUMP) += dump/
--- /dev/null
+Index: linux-2.6.10/include/asm-um/archparam-i386.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-um/archparam-i386.h 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/include/asm-um/archparam-i386.h 2005-04-05 12:40:36.075903800 +0800
+@@ -10,7 +10,8 @@
+
+ #include "user.h"
+
+-#define ELF_PLATFORM "i586"
++extern char * elf_aux_platform;
++#define ELF_PLATFORM (elf_aux_platform)
+
+ #define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3)
+
+@@ -56,15 +57,13 @@
+ pr_reg[16] = PT_REGS_SS(regs); \
+ } while(0);
+
+-#if 0 /* Turn this back on when UML has VSYSCALL working */
+-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
+-#else
+-#define VSYSCALL_BASE 0
+-#endif
+
+-#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
+-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+-extern void *__kernel_vsyscall;
++extern unsigned long vsyscall_ehdr;
++extern unsigned long vsyscall_end;
++extern unsigned long __kernel_vsyscall;
++
++#define VSYSCALL_BASE vsyscall_ehdr
++#define VSYSCALL_END vsyscall_end
+
+ /*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+@@ -75,8 +74,10 @@
+
+ #define ARCH_DLINFO \
+ do { \
+- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
+- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++ if ( vsyscall_ehdr ) { \
++ NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \
++ NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \
++ } \
+ } while (0)
+
+ /*
+@@ -87,22 +88,18 @@
+ * Dumping its extra ELF program headers includes all the other information
+ * a debugger needs to easily find how the vsyscall DSO was being used.
+ */
+-#if 0
+-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum)
+-#endif
+-
+-#undef ELF_CORE_EXTRA_PHDRS
++#define ELF_CORE_EXTRA_PHDRS \
++ (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 )
+
+-#if 0
+ #define ELF_CORE_WRITE_EXTRA_PHDRS \
+-do { \
+- const struct elf_phdr *const vsyscall_phdrs = \
+- (const struct elf_phdr *) (VSYSCALL_BASE \
+- + VSYSCALL_EHDR->e_phoff); \
++if ( vsyscall_ehdr ) { \
++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \
++ const struct elf_phdr *const phdrp = \
++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \
+ int i; \
+ Elf32_Off ofs = 0; \
+- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
+- struct elf_phdr phdr = vsyscall_phdrs[i]; \
++ for (i = 0; i < ehdrp->e_phnum; ++i) { \
++ struct elf_phdr phdr = phdrp[i]; \
+ if (phdr.p_type == PT_LOAD) { \
+ ofs = phdr.p_offset = offset; \
+ offset += phdr.p_filesz; \
+@@ -112,23 +109,19 @@
+ phdr.p_paddr = 0; /* match other core phdrs */ \
+ DUMP_WRITE(&phdr, sizeof(phdr)); \
+ } \
+-} while (0)
++}
+ #define ELF_CORE_WRITE_EXTRA_DATA \
+-do { \
+- const struct elf_phdr *const vsyscall_phdrs = \
+- (const struct elf_phdr *) (VSYSCALL_BASE \
+- + VSYSCALL_EHDR->e_phoff); \
++if ( vsyscall_ehdr ) { \
++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \
++ const struct elf_phdr *const phdrp = \
++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \
+ int i; \
+- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
+- if (vsyscall_phdrs[i].p_type == PT_LOAD) \
+- DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \
+- vsyscall_phdrs[i].p_filesz); \
++ for (i = 0; i < ehdrp->e_phnum; ++i) { \
++ if (phdrp[i].p_type == PT_LOAD) \
++ DUMP_WRITE((void *) phdrp[i].p_vaddr, \
++ phdrp[i].p_filesz); \
+ } \
+-} while (0)
+-#endif
+-
+-#undef ELF_CORE_WRITE_EXTRA_PHDRS
+-#undef ELF_CORE_WRITE_EXTRA_DATA
++}
+
+ #define R_386_NONE 0
+ #define R_386_32 1
+Index: linux-2.6.10/include/asm-um/elf.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-um/elf.h 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/include/asm-um/elf.h 2005-04-05 12:40:36.074903952 +0800
+@@ -3,7 +3,8 @@
+
+ #include "asm/archparam.h"
+
+-#define ELF_HWCAP (0)
++extern long elf_aux_hwcap;
++#define ELF_HWCAP (elf_aux_hwcap)
+
+ #define SET_PERSONALITY(ex, ibcs2) do ; while(0)
+
+Index: linux-2.6.10/include/asm-um/fixmap.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-um/fixmap.h 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/include/asm-um/fixmap.h 2005-04-05 12:40:36.075903800 +0800
+@@ -3,6 +3,7 @@
+
+ #include <linux/config.h>
+ #include <asm/kmap_types.h>
++#include <asm/archparam.h>
+
+ /*
+ * Here we define all the compile-time 'special' virtual
+@@ -34,7 +35,6 @@
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ #endif
+- FIX_VSYSCALL,
+ __end_of_fixed_addresses
+ };
+
+@@ -68,8 +68,8 @@
+ * This is the range that is readable by user mode, and things
+ * acting like user mode such as get_user_pages.
+ */
+-#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL))
+-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
++#define FIXADDR_USER_START VSYSCALL_BASE
++#define FIXADDR_USER_END VSYSCALL_END
+
+ extern void __this_fixmap_does_not_exist(void);
+
+Index: linux-2.6.10/include/asm-i386/thread_info.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-03-31 16:20:10.000000000 +0800
++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 12:40:36.076903648 +0800
+@@ -139,6 +139,7 @@
+ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
+ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */
+ #define TIF_IRET 5 /* return with iret */
++#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
+ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
+ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
+
+@@ -148,12 +149,14 @@
+ #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
+ #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
+ #define _TIF_IRET (1<<TIF_IRET)
++#define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU)
+ #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
+ #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
+
+ /* work to do on interrupt/exception return */
+ #define _TIF_WORK_MASK \
+- (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
++ (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|\
++ _TIF_SYSCALL_EMU))
+ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */
+
+ /*
+Index: linux-2.6.10/include/asm-i386/mmu_context.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/mmu_context.h 2004-12-25 05:33:48.000000000 +0800
++++ linux-2.6.10/include/asm-i386/mmu_context.h 2005-04-05 12:40:36.077903496 +0800
+@@ -6,13 +6,25 @@
+ #include <asm/atomic.h>
+ #include <asm/pgalloc.h>
+ #include <asm/tlbflush.h>
++#include <asm/semaphore.h>
+
+ /*
+- * Used for LDT copy/destruction.
++ * Used for LDT initialization/destruction. You cannot copy an LDT with
++ * init_new_context, since it thinks you are passing it a new LDT and won't
++ * deallocate its old content.
+ */
+ int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+ void destroy_context(struct mm_struct *mm);
+
++/* LDT initialization for a clean environment - needed for SKAS.*/
++static inline void init_new_empty_context(struct mm_struct *mm)
++{
++ init_MUTEX(&mm->context.sem);
++ mm->context.size = 0;
++}
++
++/* LDT copy for SKAS - for the above problem.*/
++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm);
+
+ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+ {
+@@ -29,6 +41,10 @@
+ {
+ int cpu = smp_processor_id();
+
++#ifdef CONFIG_SMP
++ prev = per_cpu(cpu_tlbstate, cpu).active_mm;
++#endif
++
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+@@ -50,7 +66,6 @@
+ #ifdef CONFIG_SMP
+ else {
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
+- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+Index: linux-2.6.10/include/asm-i386/ptrace.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/ptrace.h 2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/include/asm-i386/ptrace.h 2005-04-05 12:40:36.077903496 +0800
+@@ -64,4 +64,26 @@
+ #endif
+ #endif
+
++/*For SKAS3 support.*/
++#ifndef _LINUX_PTRACE_STRUCT_DEF
++#define _LINUX_PTRACE_STRUCT_DEF
++
++#define PTRACE_FAULTINFO 52
++#define PTRACE_SIGPENDING 53
++#define PTRACE_LDT 54
++#define PTRACE_SWITCH_MM 55
++
++struct ptrace_faultinfo {
++ int is_write;
++ unsigned long addr;
++};
++
++struct ptrace_ldt {
++ int func;
++ void *ptr;
++ unsigned long bytecount;
++};
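++/*
++ * Illustrative host-side usage from a SKAS3-aware UML (a sketch only;
++ * the exact calling convention is defined by the SKAS host patches):
++ *
++ *	struct ptrace_faultinfo fi;
++ *	if (ptrace(PTRACE_FAULTINFO, pid, 0, &fi) == 0)
++ *		handle_fault(fi.addr, fi.is_write);
++ */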
++
++#endif /*ifndef _LINUX_PTRACE_STRUCT_DEF*/
++
+ #endif
+Index: linux-2.6.10/include/asm-i386/desc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-03-31 16:20:09.000000000 +0800
++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 12:40:36.078903344 +0800
+@@ -126,6 +126,9 @@
+ put_cpu();
+ }
+
++extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr,
++ unsigned long bytecount);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif
+Index: linux-2.6.10/include/linux/ptrace.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/ptrace.h 2005-03-31 15:35:23.000000000 +0800
++++ linux-2.6.10/include/linux/ptrace.h 2005-04-05 12:40:36.071904408 +0800
+@@ -20,6 +20,7 @@
+ #define PTRACE_DETACH 0x11
+
+ #define PTRACE_SYSCALL 24
++#define PTRACE_SYSEMU 31
+
+ /* 0x4200-0x4300 are reserved for architecture-independent additions. */
+ #define PTRACE_SETOPTIONS 0x4200
+Index: linux-2.6.10/include/linux/mm.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mm.h 2005-03-31 16:10:15.000000000 +0800
++++ linux-2.6.10/include/linux/mm.h 2005-04-05 12:40:36.072904256 +0800
+@@ -625,6 +625,9 @@
+ extern struct shrinker *set_shrinker(int, shrinker_t);
+ extern void remove_shrinker(struct shrinker *shrinker);
+
++extern long do_mprotect(struct mm_struct *mm, unsigned long start,
++ size_t len, unsigned long prot);
++
+ /*
+ * On a two-level page table, this ends up being trivial. Thus the
+ * inlining and the symmetry break with pte_alloc_map() that does all
+@@ -684,9 +687,15 @@
+
+ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+
+-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
++extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file,
++ unsigned long addr, unsigned long len,
++ unsigned long prot, unsigned long flag,
++ unsigned long pgoff);
++static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+- unsigned long flag, unsigned long pgoff);
++ unsigned long flag, unsigned long pgoff) {
++ return __do_mmap_pgoff(current->mm, file, addr, len, prot, flag, pgoff);
++}
+
+ static inline unsigned long do_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+Index: linux-2.6.10/include/linux/proc_mm.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/proc_mm.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/proc_mm.h 2005-04-05 12:40:36.073904104 +0800
+@@ -0,0 +1,48 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PROC_MM_H
++#define __PROC_MM_H
++
++#include "linux/sched.h"
++
++#define MM_MMAP 54
++#define MM_MUNMAP 55
++#define MM_MPROTECT 56
++#define MM_COPY_SEGMENTS 57
++
++struct mm_mmap {
++ unsigned long addr;
++ unsigned long len;
++ unsigned long prot;
++ unsigned long flags;
++ unsigned long fd;
++ unsigned long offset;
++};
++
++struct mm_munmap {
++ unsigned long addr;
++ unsigned long len;
++};
++
++struct mm_mprotect {
++ unsigned long addr;
++ unsigned long len;
++ unsigned int prot;
++};
++
++struct proc_mm_op {
++ int op;
++ union {
++ struct mm_mmap mmap;
++ struct mm_munmap munmap;
++ struct mm_mprotect mprotect;
++ int copy_segments;
++ } u;
++};
++
++extern struct mm_struct *proc_mm_get_mm(int fd);
++
++#endif
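
This header defines the whole /proc/mm write() ABI: open /proc/mm to create a fresh, empty mm, then write proc_mm_op requests against the returned fd. A hedged userspace sketch (error handling trimmed; the include path assumes the patched headers are installed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/proc_mm.h>	/* assumes the patched headers are installed */

int main(void)
{
	struct proc_mm_op op;
	int fd = open("/proc/mm", O_WRONLY);

	if (fd < 0) {
		perror("/proc/mm");
		return 1;
	}

	/* map one anonymous page at a fixed address in the new, empty
	 * address space behind fd (MAP_FIXED is mandatory - see the
	 * check in write_proc_mm() below) */
	op.op = MM_MMAP;
	op.u.mmap.addr = 0x10000000;
	op.u.mmap.len = 4096;
	op.u.mmap.prot = PROT_READ | PROT_WRITE;
	op.u.mmap.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
	op.u.mmap.fd = -1;
	op.u.mmap.offset = 0;
	if (write(fd, &op, sizeof(op)) != (ssize_t) sizeof(op))
		perror("MM_MMAP");

	/* the fd can later be handed to a traced child with
	 * ptrace(PTRACE_SWITCH_MM, pid, 0, fd) */
	close(fd);
	return 0;
}
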
+Index: linux-2.6.10/lib/Kconfig.debug
+===================================================================
+--- linux-2.6.10.orig/lib/Kconfig.debug 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/lib/Kconfig.debug 2005-04-05 12:40:36.010913680 +0800
+@@ -23,7 +23,6 @@
+ config MAGIC_SYSRQ
+ bool "Magic SysRq key"
+ depends on DEBUG_KERNEL && (H8300 || M68KNOMMU || V850)
+- depends (USERMODE && MCONSOLE)
+ help
+ Enables console device to interpret special characters as
+ commands to dump state information.
+Index: linux-2.6.10/kernel/fork.c
+===================================================================
+--- linux-2.6.10.orig/kernel/fork.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/kernel/fork.c 2005-04-05 12:40:36.070904560 +0800
+@@ -927,6 +927,9 @@
+ * of CLONE_PTRACE.
+ */
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
++#ifdef TIF_SYSCALL_EMU
++ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
++#endif
+
+ /* Our parent execution domain becomes current domain
+ These must match for thread signalling to apply */
+Index: linux-2.6.10/mm/mmap.c
+===================================================================
+--- linux-2.6.10.orig/mm/mmap.c 2005-03-31 16:20:10.000000000 +0800
++++ linux-2.6.10/mm/mmap.c 2005-04-05 12:40:36.013913224 +0800
+@@ -759,11 +759,11 @@
+ * The caller must hold down_write(current->mm->mmap_sem).
+ */
+
+-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+- unsigned long len, unsigned long prot,
+- unsigned long flags, unsigned long pgoff)
++unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file * file,
++ unsigned long addr, unsigned long len,
++ unsigned long prot, unsigned long flags,
++ unsigned long pgoff)
+ {
+- struct mm_struct * mm = current->mm;
+ struct vm_area_struct * vma, * prev;
+ struct inode *inode;
+ unsigned int vm_flags;
+@@ -1037,7 +1037,7 @@
+ return error;
+ }
+
+-EXPORT_SYMBOL(do_mmap_pgoff);
++EXPORT_SYMBOL(__do_mmap_pgoff);
+
+ /* Get an address range which is currently unmapped.
+ * For shmat() with addr=0.
+Index: linux-2.6.10/mm/proc_mm.c
+===================================================================
+--- linux-2.6.10.orig/mm/proc_mm.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/mm/proc_mm.c 2005-04-05 12:40:36.014913072 +0800
+@@ -0,0 +1,181 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/mm.h"
++#include "linux/init.h"
++#include "linux/proc_fs.h"
++#include "linux/proc_mm.h"
++#include "linux/file.h"
++#include "linux/mman.h"
++#include "asm/uaccess.h"
++#include "asm/mmu_context.h"
++
++static struct file_operations proc_mm_fops;
++
++struct mm_struct *proc_mm_get_mm(int fd)
++{
++ struct mm_struct *ret = ERR_PTR(-EBADF);
++ struct file *file;
++
++ file = fget(fd);
++ if (!file)
++ goto out;
++
++ ret = ERR_PTR(-EINVAL);
++ if(file->f_op != &proc_mm_fops)
++ goto out_fput;
++
++ ret = file->private_data;
++ out_fput:
++ fput(file);
++ out:
++ return(ret);
++}
++
++extern long do_mmap2(struct mm_struct *mm, unsigned long addr,
++ unsigned long len, unsigned long prot,
++ unsigned long flags, unsigned long fd,
++ unsigned long pgoff);
++
++static ssize_t write_proc_mm(struct file *file, const char *buffer,
++ size_t count, loff_t *ppos)
++{
++ struct mm_struct *mm = file->private_data;
++ struct proc_mm_op req;
++ int n, ret;
++
++ if(count > sizeof(req))
++ return(-EINVAL);
++
++ n = copy_from_user(&req, buffer, count);
++ if(n != 0)
++ return(-EFAULT);
++
++ ret = count;
++ switch(req.op){
++ case MM_MMAP: {
++ struct mm_mmap *map = &req.u.mmap;
++
++ /* Nobody ever noticed it, but do_mmap_pgoff() calls
++ * get_unmapped_area(), which checks current->mm when MAP_FIXED
++ * is not set. Here that is the wrong mm, so such an mmap()
++ * could silently replace an existing mapping; require MAP_FIXED.
++ */
++ if (! (map->flags & MAP_FIXED))
++ return(-EINVAL);
++
++ ret = do_mmap2(mm, map->addr, map->len, map->prot,
++ map->flags, map->fd, map->offset >> PAGE_SHIFT);
++ if((ret & ~PAGE_MASK) == 0)
++ ret = count;
++
++ break;
++ }
++ case MM_MUNMAP: {
++ struct mm_munmap *unmap = &req.u.munmap;
++
++ down_write(&mm->mmap_sem);
++ ret = do_munmap(mm, unmap->addr, unmap->len);
++ up_write(&mm->mmap_sem);
++
++ if(ret == 0)
++ ret = count;
++ break;
++ }
++ case MM_MPROTECT: {
++ struct mm_mprotect *protect = &req.u.mprotect;
++
++ ret = do_mprotect(mm, protect->addr, protect->len,
++ protect->prot);
++ if(ret == 0)
++ ret = count;
++ break;
++ }
++
++ case MM_COPY_SEGMENTS: {
++ struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments);
++
++ if(IS_ERR(from)){
++ ret = PTR_ERR(from);
++ break;
++ }
++
++ ret = copy_context(mm, from);
++ if(ret == 0)
++ ret = count;
++ break;
++ }
++ default:
++ ret = -EINVAL;
++ break;
++ }
++
++ return(ret);
++}
++
++static int open_proc_mm(struct inode *inode, struct file *file)
++{
++ struct mm_struct *mm = mm_alloc();
++ int ret;
++
++ ret = -ENOMEM;
++ if(mm == NULL)
++ goto out_mem;
++
++ init_new_empty_context(mm);
++ arch_pick_mmap_layout(mm);
++
++ spin_lock(&mmlist_lock);
++ list_add(&mm->mmlist, &current->mm->mmlist);
++ spin_unlock(&mmlist_lock);
++
++ file->private_data = mm;
++
++ return(0);
++
++ out_mem:
++ return(ret);
++}
++
++static int release_proc_mm(struct inode *inode, struct file *file)
++{
++ struct mm_struct *mm = file->private_data;
++
++ mmput(mm);
++ return(0);
++}
++
++static struct file_operations proc_mm_fops = {
++ .open = open_proc_mm,
++ .release = release_proc_mm,
++ .write = write_proc_mm,
++};
++
++static int make_proc_mm(void)
++{
++ struct proc_dir_entry *ent;
++
++ ent = create_proc_entry("mm", 0222, &proc_root);
++ if(ent == NULL){
++ printk("make_proc_mm : Failed to register /proc/mm\n");
++ return(0);
++ }
++ ent->proc_fops = &proc_mm_fops;
++
++ return(0);
++}
++
++__initcall(make_proc_mm);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
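
Putting write_proc_mm() together with the ptrace extensions from this series, the host-side lifecycle of one guest address space looks roughly like this hedged sketch (fd/pid management and error detail elided; PTRACE_SWITCH_MM comes from the asm-i386/ptrace.h hunk above):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <linux/proc_mm.h>	/* assumes the patched headers are installed */

#ifndef PTRACE_SWITCH_MM
#define PTRACE_SWITCH_MM 55
#endif

/* give the stopped child 'pid' a fresh address space, cloning the
 * LDT from the mm behind 'template_fd' (another /proc/mm fd) */
static int new_mm_for(int pid, int template_fd)
{
	struct proc_mm_op copy;
	int fd = open("/proc/mm", O_WRONLY);

	if (fd < 0)
		return -1;

	copy.op = MM_COPY_SEGMENTS;	/* handled by copy_context() */
	copy.u.copy_segments = template_fd;
	if (write(fd, &copy, sizeof(copy)) != (ssize_t) sizeof(copy))
		goto fail;

	/* retarget the child: its mm and active_mm now point at the new
	 * address space (see PTRACE_SWITCH_MM in the ptrace.c hunk below) */
	if (ptrace(PTRACE_SWITCH_MM, pid, 0, fd) < 0)
		goto fail;
	return fd;
fail:
	close(fd);
	return -1;
}
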
+Index: linux-2.6.10/mm/mprotect.c
+===================================================================
+--- linux-2.6.10.orig/mm/mprotect.c 2005-03-31 16:20:10.000000000 +0800
++++ linux-2.6.10/mm/mprotect.c 2005-04-05 12:40:36.011913528 +0800
+@@ -93,19 +93,20 @@
+ {
+ pgd_t *dir;
+ unsigned long beg = start;
++ struct mm_struct * mm = vma->vm_mm;
+
+- dir = pgd_offset(current->mm, start);
++ dir = pgd_offset(mm, start);
+ flush_cache_range(vma, beg, end);
+ if (start >= end)
+ BUG();
+- spin_lock(&current->mm->page_table_lock);
++ spin_lock(&mm->page_table_lock);
+ do {
+ change_pmd_range(dir, start, end - start, newprot);
+ start = (start + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+ } while (start && (start < end));
+ flush_tlb_range(vma, beg, end);
+- spin_unlock(&current->mm->page_table_lock);
++ spin_unlock(&mm->page_table_lock);
+ return;
+ }
+
+@@ -190,8 +191,9 @@
+ return error;
+ }
+
+-asmlinkage long
+-sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++long
++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len,
++ unsigned long prot)
+ {
+ unsigned long vm_flags, nstart, end, tmp;
+ struct vm_area_struct *vma, *prev;
+@@ -220,9 +222,9 @@
+
+ vm_flags = calc_vm_prot_bits(prot);
+
+- down_write(&current->mm->mmap_sem);
++ down_write(&mm->mmap_sem);
+
+- vma = find_vma_prev(current->mm, start, &prev);
++ vma = find_vma_prev(mm, start, &prev);
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+@@ -288,6 +290,11 @@
+ }
+ }
+ out:
+- up_write(&current->mm->mmap_sem);
++ up_write(&mm->mmap_sem);
+ return error;
+ }
++
++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++{
++ return(do_mprotect(current->mm, start, len, prot));
++}
+Index: linux-2.6.10/mm/Makefile
+===================================================================
+--- linux-2.6.10.orig/mm/Makefile 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/mm/Makefile 2005-04-05 12:40:36.014913072 +0800
+@@ -18,3 +18,4 @@
+ obj-$(CONFIG_SHMEM) += shmem.o
+ obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+
++obj-$(CONFIG_PROC_MM) += proc_mm.o
+Index: linux-2.6.10/arch/i386/kernel/entry.S
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-03-31 16:20:08.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:40:36.064905472 +0800
+@@ -222,7 +222,7 @@
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+
+- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+@@ -245,8 +245,8 @@
+ pushl %eax # save orig_eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+- # system call tracing in operation
+- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++ # system call tracing in operation / emulation
++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(nr_syscalls), %eax
+ jae syscall_badsys
+@@ -307,6 +307,9 @@
+ movl %esp, %eax
+ xorl %edx,%edx
+ call do_syscall_trace
++ cmpl $0, %eax
++ jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU,
++ # so must skip actual syscall
+ movl ORIG_EAX(%esp), %eax
+ cmpl $(nr_syscalls), %eax
+ jnae syscall_call
+Index: linux-2.6.10/arch/i386/kernel/ptrace.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/ptrace.c 2004-12-25 05:34:29.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/ptrace.c 2005-04-05 12:40:36.061905928 +0800
+@@ -15,6 +15,7 @@
+ #include <linux/user.h>
+ #include <linux/security.h>
+ #include <linux/audit.h>
++#include <linux/proc_mm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -406,15 +407,27 @@
+ }
+ break;
+
++ case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
+ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+ case PTRACE_CONT: /* restart after signal. */
+ ret = -EIO;
+ if ((unsigned long) data > _NSIG)
+ break;
++ /* If we came here with PTRACE_SYSEMU and now continue with
++ * PTRACE_SYSCALL, entry.S would intercept what looks like a
++ * syscall return, but it shouldn't. So we leave TIF_SYSCALL_EMU
++ * set - it is otherwise unused in this special case - to
++ * remember that we came from SYSEMU. The flag will be cleared
++ * by do_syscall_trace().
++ */
++ if (request == PTRACE_SYSEMU) {
++ set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
++ } else if (request == PTRACE_CONT) {
++ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
++ }
+ if (request == PTRACE_SYSCALL) {
+ set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+- }
+- else {
++ } else {
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ }
+ child->exit_code = data;
+@@ -443,6 +456,8 @@
+ ret = -EIO;
+ if ((unsigned long) data > _NSIG)
+ break;
++ /* See do_syscall_trace() for why we don't clear
++ * TIF_SYSCALL_EMU. */
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ set_singlestep(child);
+ child->exit_code = data;
+@@ -542,6 +557,58 @@
+ (struct user_desc __user *) data);
+ break;
+
++#ifdef CONFIG_PROC_MM
++ case PTRACE_FAULTINFO: {
++ struct ptrace_faultinfo fault;
++
++ fault = ((struct ptrace_faultinfo)
++ { .is_write = child->thread.error_code,
++ .addr = child->thread.cr2 });
++ ret = copy_to_user((unsigned long *) data, &fault,
++ sizeof(fault));
++ if(ret)
++ ret = -EIO;
++ break;
++ }
++
++ case PTRACE_SIGPENDING:
++ ret = copy_to_user((unsigned long *) data,
++ &child->pending.signal,
++ sizeof(child->pending.signal)) ? -EIO : 0;
++ break;
++
++ case PTRACE_LDT: {
++ struct ptrace_ldt ldt;
++
++ if(copy_from_user(&ldt, (unsigned long *) data,
++ sizeof(ldt))){
++ ret = -EIO;
++ break;
++ }
++ ret = __modify_ldt(child->mm, ldt.func, ldt.ptr, ldt.bytecount);
++ break;
++ }
++
++ case PTRACE_SWITCH_MM: {
++ struct mm_struct *old = child->mm;
++ struct mm_struct *new = proc_mm_get_mm(data);
++
++ if(IS_ERR(new)){
++ ret = PTR_ERR(new);
++ break;
++ }
++
++ atomic_inc(&new->mm_users);
++ task_lock(child);
++ child->mm = new;
++ child->active_mm = new;
++ task_unlock(child);
++ mmput(old);
++ ret = 0;
++ break;
++ }
++#endif
++
+ default:
+ ret = ptrace_request(child, request, addr, data);
+ break;
+@@ -557,8 +624,9 @@
+ * - triggered by current->work.syscall_trace
+ */
+ __attribute__((regparm(3)))
+-void do_syscall_trace(struct pt_regs *regs, int entryexit)
++int do_syscall_trace(struct pt_regs *regs, int entryexit)
+ {
++ int is_sysemu, is_systrace, is_singlestep;
+ if (unlikely(current->audit_context)) {
+ if (!entryexit)
+ audit_syscall_entry(current, regs->orig_eax,
+@@ -567,16 +635,27 @@
+ else
+ audit_syscall_exit(current, regs->eax);
+ }
+-
+- if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
+- !test_thread_flag(TIF_SINGLESTEP))
+- return;
++ is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
++ is_systrace = test_thread_flag(TIF_SYSCALL_TRACE);
++ is_singlestep = test_thread_flag(TIF_SINGLESTEP);
++
++ if (!is_systrace && !is_singlestep && !is_sysemu)
++ return 0;
++ /* A task that came from PTRACE_SYSEMU and is now resumed with
++ * PTRACE_SYSCALL or PTRACE_SINGLESTEP can be detected by
++ * TIF_SYSCALL_EMU still being set in addition.
++ * In that case, reset the flag and return without action.
++ */
++ if (is_sysemu && (is_systrace || is_singlestep)) {
++ clear_thread_flag(TIF_SYSCALL_EMU);
++ return 0;
++ }
+ if (!(current->ptrace & PT_PTRACED))
+- return;
++ return 0;
+ /* the 0x80 provides a way for the tracing parent to distinguish
+ between a syscall stop and SIGTRAP delivery */
+ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) &&
+- !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0));
++ !is_singlestep ? 0x80 : 0));
+
+ /*
+ * this isn't the same as continuing with a signal, but it will do
+@@ -587,4 +666,6 @@
+ send_sig(current->exit_code, current, 1);
+ current->exit_code = 0;
+ }
++ /* != 0 if nullifying the syscall, 0 if running it normally */
++ return is_sysemu;
+ }
+Index: linux-2.6.10/arch/i386/kernel/ldt.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/ldt.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/ldt.c 2005-04-05 12:40:36.062905776 +0800
+@@ -18,6 +18,7 @@
+ #include <asm/system.h>
+ #include <asm/ldt.h>
+ #include <asm/desc.h>
++#include <asm/mmu_context.h>
+
+ #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+ static void flush_ldt(void *null)
+@@ -27,11 +28,12 @@
+ }
+ #endif
+
+-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
++static int alloc_ldt(struct mm_struct *mm, int mincount, int reload)
+ {
+ void *oldldt;
+ void *newldt;
+ int oldsize;
++ mm_context_t * pc = &mm->context;
+
+ if (mincount <= pc->size)
+ return 0;
+@@ -58,13 +60,15 @@
+ #ifdef CONFIG_SMP
+ cpumask_t mask;
+ preempt_disable();
+- load_LDT(pc);
++ if (&current->active_mm->context == pc)
++ load_LDT(pc);
+ mask = cpumask_of_cpu(smp_processor_id());
+- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
++ if (!cpus_equal(mm->cpu_vm_mask, mask))
+ smp_call_function(flush_ldt, NULL, 1, 1);
+ preempt_enable();
+ #else
+- load_LDT(pc);
++ if (&current->active_mm->context == pc)
++ load_LDT(pc);
+ #endif
+ }
+ if (oldsize) {
+@@ -76,12 +80,12 @@
+ return 0;
+ }
+
+-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
++static inline int copy_ldt(struct mm_struct *new, struct mm_struct *old)
+ {
+- int err = alloc_ldt(new, old->size, 0);
++ int err = alloc_ldt(new, old->context.size, 0);
+ if (err < 0)
+ return err;
+- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
++ memcpy(new->context.ldt, old->context.ldt, old->context.size*LDT_ENTRY_SIZE);
+ return 0;
+ }
+
+@@ -89,22 +93,24 @@
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm)
+ {
+- struct mm_struct * old_mm;
+ int retval = 0;
+
+- init_MUTEX(&mm->context.sem);
+- mm->context.size = 0;
+- old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ down(&old_mm->context.sem);
+- retval = copy_ldt(&mm->context, &old_mm->context);
++ retval = copy_ldt(mm, old_mm);
+ up(&old_mm->context.sem);
+ }
+ return retval;
+ }
+
++int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
++{
++ init_new_empty_context(mm);
++ return copy_context(mm, current->mm);
++}
++
+ /*
+ * No need to lock the MM as we are the last user
+ */
+@@ -121,11 +127,11 @@
+ }
+ }
+
+-static int read_ldt(void __user * ptr, unsigned long bytecount)
++static int read_ldt(struct mm_struct * mm, void __user * ptr,
++ unsigned long bytecount)
+ {
+ int err;
+ unsigned long size;
+- struct mm_struct * mm = current->mm;
+
+ if (!mm->context.size)
+ return 0;
+@@ -174,9 +180,8 @@
+ return err;
+ }
+
+-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
++static int write_ldt(struct mm_struct * mm, void __user * ptr, unsigned long bytecount, int oldmode)
+ {
+- struct mm_struct * mm = current->mm;
+ __u32 entry_1, entry_2, *lp;
+ int error;
+ struct user_desc ldt_info;
+@@ -200,7 +205,7 @@
+
+ down(&mm->context.sem);
+ if (ldt_info.entry_number >= mm->context.size) {
+- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
++ error = alloc_ldt(mm, ldt_info.entry_number+1, 1);
+ if (error < 0)
+ goto out_unlock;
+ }
+@@ -233,23 +238,29 @@
+ return error;
+ }
+
+-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
++int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr,
++ unsigned long bytecount)
+ {
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+- ret = read_ldt(ptr, bytecount);
++ ret = read_ldt(mm, ptr, bytecount);
+ break;
+ case 1:
+- ret = write_ldt(ptr, bytecount, 1);
++ ret = write_ldt(mm, ptr, bytecount, 1);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11:
+- ret = write_ldt(ptr, bytecount, 0);
++ ret = write_ldt(mm, ptr, bytecount, 0);
+ break;
+ }
+ return ret;
+ }
++
++asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
++{
++ return __modify_ldt(current->mm, func, ptr, bytecount);
++}
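
Since __modify_ldt() now takes an explicit mm, the PTRACE_LDT request defined earlier can read or write a traced child's LDT from outside. A hedged sketch of the tracer side (func 0 selects read_ldt(), as in sys_modify_ldt(); the buffer lives in the tracer's address space, which is where __modify_ldt() copies to, since the tracer is current during sys_ptrace()):

#include <sys/ptrace.h>

#ifndef PTRACE_LDT
#define PTRACE_LDT 54			/* from asm-i386/ptrace.h above */
#endif
#define LDT_ENTRY_SIZE 8

struct ptrace_ldt {
	int func;
	void *ptr;
	unsigned long bytecount;
};

/* read the first 16 LDT entries of the stopped child 'pid';
 * returns the number of bytes read, or a negative error */
static long read_child_ldt(int pid, unsigned char *buf)
{
	struct ptrace_ldt op = {
		.func = 0,		/* read_ldt */
		.ptr = buf,
		.bytecount = 16 * LDT_ENTRY_SIZE,
	};

	return ptrace(PTRACE_LDT, pid, 0, &op);
}
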
+Index: linux-2.6.10/arch/i386/kernel/sys_i386.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/sys_i386.c 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/arch/i386/kernel/sys_i386.c 2005-04-05 12:40:36.063905624 +0800
+@@ -41,7 +41,7 @@
+ }
+
+ /* common code for old and new mmaps */
+-static inline long do_mmap2(
++long do_mmap2(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+@@ -56,9 +56,9 @@
+ goto out;
+ }
+
+- down_write(&current->mm->mmap_sem);
+- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+- up_write(&current->mm->mmap_sem);
++ down_write(&mm->mmap_sem);
++ error = __do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff);
++ up_write(&mm->mmap_sem);
+
+ if (file)
+ fput(file);
+@@ -70,7 +70,7 @@
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+ {
+- return do_mmap2(addr, len, prot, flags, fd, pgoff);
++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff);
+ }
+
+ /*
+@@ -101,7 +101,7 @@
+ if (a.offset & ~PAGE_MASK)
+ goto out;
+
+- err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
++ err = do_mmap2(current->mm, a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
+ out:
+ return err;
+ }
+Index: linux-2.6.10/arch/i386/Kconfig
+===================================================================
+--- linux-2.6.10.orig/arch/i386/Kconfig 2005-03-31 15:35:23.000000000 +0800
++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:40:36.066905168 +0800
+@@ -738,6 +738,10 @@
+ depends on HIGHMEM64G
+ default y
+
++config PROC_MM
++ bool "/proc/mm support"
++ default y
++
+ # Common NUMA Features
+ config NUMA
+ bool "Numa Memory Allocation and Scheduler Support"
+Index: linux-2.6.10/arch/um/include/frame.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/frame.h 2004-12-25 05:34:31.000000000 +0800
++++ linux-2.6.10/arch/um/include/frame.h 2005-04-05 19:01:49.158500672 +0800
+@@ -1,53 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_H_
+-#define __FRAME_H_
+-
+-#include "sysdep/frame.h"
+-
+-struct frame_common {
+- void *data;
+- int len;
+- int sig_index;
+- int sr_index;
+- int sr_relative;
+- int sp_index;
+- struct arch_frame_data arch;
+-};
+-
+-struct sc_frame {
+- struct frame_common common;
+- int sc_index;
+-};
+-
+-extern struct sc_frame signal_frame_sc;
+-
+-extern struct sc_frame signal_frame_sc_sr;
+-
+-struct si_frame {
+- struct frame_common common;
+- int sip_index;
+- int si_index;
+- int ucp_index;
+- int uc_index;
+-};
+-
+-extern struct si_frame signal_frame_si;
+-
+-extern void capture_signal_stack(void);
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/frame_kern.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/frame_kern.h 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/arch/um/include/frame_kern.h 2005-04-05 12:40:36.056906688 +0800
+@@ -6,8 +6,8 @@
+ #ifndef __FRAME_KERN_H_
+ #define __FRAME_KERN_H_
+
+-#include "frame.h"
+-#include "sysdep/frame_kern.h"
++#define _S(nr) (1<<((nr)-1))
++#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
+
+ extern int setup_signal_stack_sc(unsigned long stack_top, int sig,
+ struct k_sigaction *ka,
+Index: linux-2.6.10/arch/um/include/frame_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/frame_user.h 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/arch/um/include/frame_user.h 2005-04-05 19:01:49.158500672 +0800
+@@ -1,23 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_USER_H_
+-#define __FRAME_USER_H_
+-
+-#include "sysdep/frame_user.h"
+-#include "frame.h"
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/ptrace_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/ptrace_user.h 2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/arch/um/include/ptrace_user.h 2005-04-05 12:40:36.057906536 +0800
+@@ -26,4 +26,31 @@
+ int get_using_sysemu(void);
+ extern int sysemu_supported;
+
++
++/* syscall emulation path in ptrace */
++
++#ifndef PTRACE_SYSEMU
++#define PTRACE_SYSEMU 31
++#endif
++
++/* On architectures that started to support PTRACE_O_TRACESYSGOOD
++ * in linux 2.4, there are two different definitions of
++ * PTRACE_SETOPTIONS: linux 2.4 uses 21 while linux 2.6 uses 0x4200.
++ * For binary compatibility, 2.6 also supports the old "21", named
++ * PTRACE_OLDSETOPTIONS. On these architectures, UML must always use
++ * "21", to ensure the kernel runs on both 2.4 and 2.6 hosts without
++ * recompilation. So, we use PTRACE_OLDSETOPTIONS in UML.
++ * We also want to be able to build the kernel on 2.4, which doesn't
++ * have PTRACE_OLDSETOPTIONS. So, if it is missing, we declare
++ * PTRACE_OLDSETOPTIONS to be the same as PTRACE_SETOPTIONS.
++ *
++ * On architectures that started to support PTRACE_O_TRACESYSGOOD only
++ * in linux 2.6, PTRACE_OLDSETOPTIONS is never defined, and also isn't
++ * supported by the host kernel. In that case, our trick lets us use
++ * the new 0x4200 with the name PTRACE_OLDSETOPTIONS.
++ */
++#ifndef PTRACE_OLDSETOPTIONS
++#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
++#endif
++
+ #endif
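
The payoff of the PTRACE_OLDSETOPTIONS trick shows up in check_ptrace() (see the arch/um/kernel/process.c hunk below): UML always issues the same request number, whatever host it was built on. A hedged sketch with a hypothetical helper name:

#include <errno.h>
#include <sys/ptrace.h>
#include "ptrace_user.h"	/* supplies the PTRACE_OLDSETOPTIONS fallback */

#ifndef PTRACE_O_TRACESYSGOOD
#define PTRACE_O_TRACESYSGOOD 0x00000001
#endif

/* ask the host to set the 0x80 bit (SYSCALL_TRAP) in WSTOPSIG() for
 * syscall stops, so they can be told apart from real SIGTRAPs */
static int enable_sysgood(int pid)
{
	if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0,
		   (void *) PTRACE_O_TRACESYSGOOD) < 0)
		return -errno;
	return 0;
}
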
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame.h 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame.h 2005-04-05 19:01:49.158500672 +0800
+@@ -1,29 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_I386_H
+-#define __FRAME_I386_H
+-
+-struct arch_frame_data_raw {
+- unsigned long fp_start;
+- unsigned long sr;
+-};
+-
+-struct arch_frame_data {
+- int fpstate_size;
+-};
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_kern.h 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h 2005-04-05 19:01:49.158500672 +0800
+@@ -1,69 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_KERN_I386_H
+-#define __FRAME_KERN_I386_H
+-
+-/* This is called from sys_sigreturn. It takes the sp at the point of the
+- * sigreturn system call and returns the address of the sigcontext struct
+- * on the stack.
+- */
+-
+-static inline void *sp_to_sc(unsigned long sp)
+-{
+- return((void *) sp);
+-}
+-
+-static inline void *sp_to_uc(unsigned long sp)
+-{
+- unsigned long uc;
+-
+- uc = sp + signal_frame_si.uc_index -
+- signal_frame_si.common.sp_index - 4;
+- return((void *) uc);
+-}
+-
+-static inline void *sp_to_rt_sc(unsigned long sp)
+-{
+- unsigned long sc;
+-
+- sc = sp - signal_frame_si.common.sp_index +
+- signal_frame_si.common.len - 4;
+- return((void *) sc);
+-}
+-
+-static inline void *sp_to_mask(unsigned long sp)
+-{
+- unsigned long mask;
+-
+- mask = sp - signal_frame_sc.common.sp_index +
+- signal_frame_sc.common.len - 8;
+- return((void *) mask);
+-}
+-
+-extern int sc_size(void *data);
+-
+-static inline void *sp_to_rt_mask(unsigned long sp)
+-{
+- unsigned long mask;
+-
+- mask = sp - signal_frame_si.common.sp_index +
+- signal_frame_si.common.len +
+- sc_size(&signal_frame_si.common.arch) - 4;
+- return((void *) mask);
+-}
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_user.h 2004-12-25 05:35:28.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h 2005-04-05 19:01:49.158500672 +0800
+@@ -1,91 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_USER_I386_H
+-#define __FRAME_USER_I386_H
+-
+-#include <asm/page.h>
+-#include "sysdep/frame.h"
+-
+-/* This stuff is to calculate the size of the fp state struct at runtime
+- * because it has changed between 2.2 and 2.4 and it would be good for a
+- * UML compiled on one to work on the other.
+- * So, setup_arch_frame_raw fills in the arch struct with the raw data, which
+- * just contains the address of the end of the sigcontext. This is invoked
+- * from the signal handler.
+- * setup_arch_frame uses that data to figure out what
+- * arch_frame_data.fpstate_size should be. It really has no idea, since it's
+- * not allowed to do sizeof(struct fpstate) but it's safe to consider that it's
+- * everything from the end of the sigcontext up to the top of the stack. So,
+- * it masks off the page number to get the offset within the page and subtracts
+- * that from the page size, and that's how big the fpstate struct will be
+- * considered to be.
+- */
+-
+-static inline void setup_arch_frame_raw(struct arch_frame_data_raw *data,
+- void *end, unsigned long srp)
+-{
+- unsigned long sr = *((unsigned long *) srp);
+-
+- data->fp_start = (unsigned long) end;
+- if((sr & PAGE_MASK) == ((unsigned long) end & PAGE_MASK))
+- data->sr = sr;
+- else data->sr = 0;
+-}
+-
+-static inline void setup_arch_frame(struct arch_frame_data_raw *in,
+- struct arch_frame_data *out)
+-{
+- unsigned long fpstate_start = in->fp_start;
+-
+- if(in->sr == 0){
+- fpstate_start &= ~PAGE_MASK;
+- out->fpstate_size = PAGE_SIZE - fpstate_start;
+- }
+- else {
+- out->fpstate_size = in->sr - fpstate_start;
+- }
+-}
+-
+-/* This figures out where on the stack the SA_RESTORER function address
+- * is stored. For i386, it's the signal handler return address, so it's
+- * located next to the frame pointer.
+- * This is inlined, so __builtin_frame_address(0) is correct. Otherwise,
+- * it would have to be __builtin_frame_address(1).
+- */
+-
+-#define frame_restorer() \
+-({ \
+- unsigned long *fp; \
+-\
+- fp = __builtin_frame_address(0); \
+- ((unsigned long) (fp + 1)); \
+-})
+-
+-/* Similarly, this returns the value of sp when the handler was first
+- * entered. This is used to calculate the proper sp when delivering
+- * signals.
+- */
+-
+-#define frame_sp() \
+-({ \
+- unsigned long *fp; \
+-\
+- fp = __builtin_frame_address(0); \
+- ((unsigned long) (fp + 1)); \
+-})
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/elf_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/elf_user.h 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/um/include/elf_user.h 2005-04-05 12:40:36.054906992 +0800
+@@ -0,0 +1,19 @@
++/*
++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
++ * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
++ * Licensed under the GPL
++ */
++
++#ifndef __ELF_USER_H__
++#define __ELF_USER_H__
++
++/* For compilation on a host that doesn't support AT_SYSINFO (Linux 2.4) */
++
++#ifndef AT_SYSINFO
++#define AT_SYSINFO 32
++#endif
++#ifndef AT_SYSINFO_EHDR
++#define AT_SYSINFO_EHDR 33
++#endif
++
++#endif
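
These AT_SYSINFO fallbacks exist so that scan_elf_aux() - declared in the arch/um/kernel/main.c hunk below, with its implementation elsewhere in the series - compiles against 2.4 host headers. A hedged sketch of what such an auxv walk looks like (the real function records more entries; variable names here are illustrative):

#include <elf.h>
#include "elf_user.h"		/* AT_SYSINFO fallbacks from above */

static unsigned long vsyscall_entry, vsyscall_ehdr;

/* the auxiliary vector sits directly after the environment block
 * that the host kernel handed to main() */
void scan_elf_aux(char **envp)
{
	Elf32_auxv_t *auxv;

	while (*envp++ != NULL)
		;
	for (auxv = (Elf32_auxv_t *) envp; auxv->a_type != AT_NULL; auxv++) {
		switch (auxv->a_type) {
		case AT_SYSINFO:
			vsyscall_entry = auxv->a_un.a_val;	/* vsyscall entry */
			break;
		case AT_SYSINFO_EHDR:
			vsyscall_ehdr = auxv->a_un.a_val;	/* vDSO ELF header */
			break;
		}
	}
}
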
+Index: linux-2.6.10/arch/um/include/skas_ptrace.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/skas_ptrace.h 2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/arch/um/include/skas_ptrace.h 2005-04-05 12:40:36.056906688 +0800
+@@ -6,6 +6,7 @@
+ #ifndef __SKAS_PTRACE_H
+ #define __SKAS_PTRACE_H
+
++#ifndef PTRACE_FAULTINFO
+ struct ptrace_faultinfo {
+ int is_write;
+ unsigned long addr;
+@@ -21,6 +22,7 @@
+ #define PTRACE_SIGPENDING 53
+ #define PTRACE_LDT 54
+ #define PTRACE_SWITCH_MM 55
++#endif
+
+ #endif
+
+Index: linux-2.6.10/arch/um/include/signal_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/signal_user.h 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/um/include/signal_user.h 2005-04-05 12:40:36.055906840 +0800
+@@ -14,6 +14,8 @@
+ extern int set_signals(int enable);
+ extern int get_signals(void);
+
++#define SYSCALL_TRAP 0x80
++
+ #endif
+
+ /*
+Index: linux-2.6.10/arch/um/sys-i386/ptrace_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/sys-i386/ptrace_user.c 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/um/sys-i386/ptrace_user.c 2005-04-05 12:40:36.022911856 +0800
+@@ -17,17 +17,30 @@
+
+ int ptrace_getregs(long pid, unsigned long *regs_out)
+ {
+- return(ptrace(PTRACE_GETREGS, pid, 0, regs_out));
++ if(ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0)
++ return(-errno);
++ return(0);
+ }
+
+ int ptrace_setregs(long pid, unsigned long *regs)
+ {
+- return(ptrace(PTRACE_SETREGS, pid, 0, regs));
++ if(ptrace(PTRACE_SETREGS, pid, 0, regs) < 0)
++ return(-errno);
++ return(0);
+ }
+
+ int ptrace_getfpregs(long pid, unsigned long *regs)
+ {
+- return(ptrace(PTRACE_GETFPREGS, pid, 0, regs));
++ if(ptrace(PTRACE_GETFPREGS, pid, 0, regs) < 0)
++ return(-errno);
++ return(0);
++}
++
++int ptrace_setfpregs(long pid, unsigned long *regs)
++{
++ if(ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0)
++ return(-errno);
++ return(0);
+ }
+
+ static void write_debugregs(int pid, unsigned long *regs)
+Index: linux-2.6.10/arch/um/sys-i386/sigcontext.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/sys-i386/sigcontext.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/um/sys-i386/sigcontext.c 2005-04-05 12:40:36.023911704 +0800
+@@ -9,22 +9,14 @@
+ #include <asm/sigcontext.h>
+ #include "sysdep/ptrace.h"
+ #include "kern_util.h"
+-#include "frame_user.h"
+-
+-int sc_size(void *data)
+-{
+- struct arch_frame_data *arch = data;
+-
+- return(sizeof(struct sigcontext) + arch->fpstate_size);
+-}
+
+ void sc_to_sc(void *to_ptr, void *from_ptr)
+ {
+ struct sigcontext *to = to_ptr, *from = from_ptr;
+- int size = sizeof(*to) + signal_frame_sc.common.arch.fpstate_size;
+
+- memcpy(to, from, size);
+- if(from->fpstate != NULL) to->fpstate = (struct _fpstate *) (to + 1);
++ memcpy(to, from, sizeof(*to) + sizeof(struct _fpstate));
++ if(from->fpstate != NULL)
++ to->fpstate = (struct _fpstate *) (to + 1);
+ }
+
+ unsigned long *sc_sigmask(void *sc_ptr)
+Index: linux-2.6.10/arch/um/sys-i386/sysrq.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/sys-i386/sysrq.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/um/sys-i386/sysrq.c 2005-04-05 12:40:36.022911856 +0800
+@@ -33,3 +33,13 @@
+
+ show_trace((unsigned long *) &regs);
+ }
++
++/* Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+Index: linux-2.6.10/arch/um/sys-i386/signal.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/sys-i386/signal.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/um/sys-i386/signal.c 2005-04-05 12:40:36.021912008 +0800
+@@ -0,0 +1,374 @@
++/*
++ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/signal.h"
++#include "linux/ptrace.h"
++#include "asm/current.h"
++#include "asm/ucontext.h"
++#include "asm/uaccess.h"
++#include "asm/unistd.h"
++#include "frame_kern.h"
++#include "signal_user.h"
++#include "ptrace_user.h"
++#include "sigcontext.h"
++#include "mode.h"
++
++#ifdef CONFIG_MODE_SKAS
++
++#include "skas.h"
++
++static int copy_sc_from_user_skas(struct pt_regs *regs,
++ struct sigcontext *from)
++{
++ struct sigcontext sc;
++ unsigned long fpregs[HOST_FP_SIZE];
++ int err;
++
++ err = copy_from_user(&sc, from, sizeof(sc));
++ err |= copy_from_user(fpregs, sc.fpstate, sizeof(fpregs));
++ if(err)
++ return(err);
++
++ REGS_GS(regs->regs.skas.regs) = sc.gs;
++ REGS_FS(regs->regs.skas.regs) = sc.fs;
++ REGS_ES(regs->regs.skas.regs) = sc.es;
++ REGS_DS(regs->regs.skas.regs) = sc.ds;
++ REGS_EDI(regs->regs.skas.regs) = sc.edi;
++ REGS_ESI(regs->regs.skas.regs) = sc.esi;
++ REGS_EBP(regs->regs.skas.regs) = sc.ebp;
++ REGS_SP(regs->regs.skas.regs) = sc.esp;
++ REGS_EBX(regs->regs.skas.regs) = sc.ebx;
++ REGS_EDX(regs->regs.skas.regs) = sc.edx;
++ REGS_ECX(regs->regs.skas.regs) = sc.ecx;
++ REGS_EAX(regs->regs.skas.regs) = sc.eax;
++ REGS_IP(regs->regs.skas.regs) = sc.eip;
++ REGS_CS(regs->regs.skas.regs) = sc.cs;
++ REGS_EFLAGS(regs->regs.skas.regs) = sc.eflags;
++ REGS_SS(regs->regs.skas.regs) = sc.ss;
++ regs->regs.skas.fault_addr = sc.cr2;
++ regs->regs.skas.fault_type = FAULT_WRITE(sc.err);
++ regs->regs.skas.trap_type = sc.trapno;
++
++ err = ptrace_setfpregs(userspace_pid[0], fpregs);
++ if(err < 0){
++ printk("copy_sc_from_user_skas - PTRACE_SETFPREGS failed, "
++ "errno = %d\n", err);
++ return(1);
++ }
++
++ return(0);
++}
++
++int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp,
++ struct pt_regs *regs, unsigned long fault_addr,
++ int fault_type)
++{
++ struct sigcontext sc;
++ unsigned long fpregs[HOST_FP_SIZE];
++ int err;
++
++ sc.gs = REGS_GS(regs->regs.skas.regs);
++ sc.fs = REGS_FS(regs->regs.skas.regs);
++ sc.es = REGS_ES(regs->regs.skas.regs);
++ sc.ds = REGS_DS(regs->regs.skas.regs);
++ sc.edi = REGS_EDI(regs->regs.skas.regs);
++ sc.esi = REGS_ESI(regs->regs.skas.regs);
++ sc.ebp = REGS_EBP(regs->regs.skas.regs);
++ sc.esp = REGS_SP(regs->regs.skas.regs);
++ sc.ebx = REGS_EBX(regs->regs.skas.regs);
++ sc.edx = REGS_EDX(regs->regs.skas.regs);
++ sc.ecx = REGS_ECX(regs->regs.skas.regs);
++ sc.eax = REGS_EAX(regs->regs.skas.regs);
++ sc.eip = REGS_IP(regs->regs.skas.regs);
++ sc.cs = REGS_CS(regs->regs.skas.regs);
++ sc.eflags = REGS_EFLAGS(regs->regs.skas.regs);
++ sc.esp_at_signal = regs->regs.skas.regs[UESP];
++ sc.ss = regs->regs.skas.regs[SS];
++ sc.cr2 = fault_addr;
++ sc.err = TO_SC_ERR(fault_type);
++ sc.trapno = regs->regs.skas.trap_type;
++
++ err = ptrace_getfpregs(userspace_pid[0], fpregs);
++ if(err < 0){
++ printk("copy_sc_to_user_skas - PTRACE_GETFPREGS failed, "
++ "errno = %d\n", err);
++ return(1);
++ }
++ to_fp = (to_fp ? to_fp : (struct _fpstate *) (to + 1));
++ sc.fpstate = to_fp;
++
++ if(err)
++ return(err);
++
++ return(copy_to_user(to, &sc, sizeof(sc)) ||
++ copy_to_user(to_fp, fpregs, sizeof(fpregs)));
++}
++#endif
++
++#ifdef CONFIG_MODE_TT
++int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from,
++ int fpsize)
++{
++ struct _fpstate *to_fp, *from_fp;
++ unsigned long sigs;
++ int err;
++
++ to_fp = to->fpstate;
++ from_fp = from->fpstate;
++ sigs = to->oldmask;
++ err = copy_from_user(to, from, sizeof(*to));
++ to->oldmask = sigs;
++ if(to_fp != NULL){
++ err |= copy_from_user(&to->fpstate, &to_fp,
++ sizeof(to->fpstate));
++ err |= copy_from_user(to_fp, from_fp, fpsize);
++ }
++ return(err);
++}
++
++int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp,
++ struct sigcontext *from, int fpsize)
++{
++ struct _fpstate *to_fp, *from_fp;
++ int err;
++
++ to_fp = (fp ? fp : (struct _fpstate *) (to + 1));
++ from_fp = from->fpstate;
++ err = copy_to_user(to, from, sizeof(*to));
++ if(from_fp != NULL){
++ err |= copy_to_user(&to->fpstate, &to_fp,
++ sizeof(to->fpstate));
++ err |= copy_to_user(to_fp, from_fp, fpsize);
++ }
++ return(err);
++}
++#endif
++
++static int copy_sc_from_user(struct pt_regs *to, void *from)
++{
++ int ret;
++
++ ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from,
++ sizeof(struct _fpstate)),
++ copy_sc_from_user_skas(to, from));
++ return(ret);
++}
++
++static int copy_sc_to_user(struct sigcontext *to, struct _fpstate *fp,
++ struct pt_regs *from)
++{
++ return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs),
++ sizeof(*fp)),
++ copy_sc_to_user_skas(to, fp, from,
++ current->thread.cr2,
++ current->thread.err)));
++}
++
++static int copy_ucontext_to_user(struct ucontext *uc, struct _fpstate *fp,
++ sigset_t *set, unsigned long sp)
++{
++ int err = 0;
++
++ err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp);
++ err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags);
++ err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size);
++ err |= copy_sc_to_user(&uc->uc_mcontext, fp, &current->thread.regs);
++ err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set));
++ return(err);
++}
++
++struct sigframe
++{
++ char *pretcode;
++ int sig;
++ struct sigcontext sc;
++ struct _fpstate fpstate;
++ unsigned long extramask[_NSIG_WORDS-1];
++ char retcode[8];
++};
++
++struct rt_sigframe
++{
++ char *pretcode;
++ int sig;
++ struct siginfo *pinfo;
++ void *puc;
++ struct siginfo info;
++ struct ucontext uc;
++ struct _fpstate fpstate;
++ char retcode[8];
++};
++
++int setup_signal_stack_sc(unsigned long stack_top, int sig,
++ struct k_sigaction *ka, struct pt_regs *regs,
++ sigset_t *mask)
++{
++ struct sigframe __user *frame;
++ void *restorer;
++ int err = 0;
++
++ stack_top &= -8UL;
++ frame = (struct sigframe *) stack_top - 1;
++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame)))
++ return(1);
++
++ restorer = (void *) frame->retcode;
++ if(ka->sa.sa_flags & SA_RESTORER)
++ restorer = ka->sa.sa_restorer;
++
++ err |= __put_user(restorer, &frame->pretcode);
++ err |= __put_user(sig, &frame->sig);
++ err |= copy_sc_to_user(&frame->sc, NULL, regs);
++ err |= __put_user(mask->sig[0], &frame->sc.oldmask);
++ if (_NSIG_WORDS > 1)
++ err |= __copy_to_user(&frame->extramask, &mask->sig[1],
++ sizeof(frame->extramask));
++
++ /*
++ * This is popl %eax ; movl $,%eax ; int $0x80
++ *
++ * WE DO NOT USE IT ANY MORE! It's only left here for historical
++ * reasons and because gdb uses it as a signature to notice
++ * signal handler stack frames.
++ */
++ err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
++ err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
++
++ if(err)
++ return(err);
++
++ PT_REGS_SP(regs) = (unsigned long) frame;
++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
++ PT_REGS_EAX(regs) = (unsigned long) sig;
++ PT_REGS_EDX(regs) = (unsigned long) 0;
++ PT_REGS_ECX(regs) = (unsigned long) 0;
++
++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
++ ptrace_notify(SIGTRAP);
++ return(0);
++}
++
++int setup_signal_stack_si(unsigned long stack_top, int sig,
++ struct k_sigaction *ka, struct pt_regs *regs,
++ siginfo_t *info, sigset_t *mask)
++{
++ struct rt_sigframe __user *frame;
++ void *restorer;
++ int err = 0;
++
++ stack_top &= -8UL;
++ frame = (struct rt_sigframe *) stack_top - 1;
++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame)))
++ return(1);
++
++ restorer = (void *) frame->retcode;
++ if(ka->sa.sa_flags & SA_RESTORER)
++ restorer = ka->sa.sa_restorer;
++
++ err |= __put_user(restorer, &frame->pretcode);
++ err |= __put_user(sig, &frame->sig);
++ err |= __put_user(&frame->info, &frame->pinfo);
++ err |= __put_user(&frame->uc, &frame->puc);
++ err |= copy_siginfo_to_user(&frame->info, info);
++ err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
++ PT_REGS_SP(regs));
++
++ /*
++ * This is movl $,%eax ; int $0x80
++ *
++ * WE DO NOT USE IT ANY MORE! It's only left here for historical
++ * reasons and because gdb uses it as a signature to notice
++ * signal handler stack frames.
++ */
++ err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
++ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
++
++ if(err)
++ return(err);
++
++ PT_REGS_SP(regs) = (unsigned long) frame;
++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
++ PT_REGS_EAX(regs) = (unsigned long) sig;
++ PT_REGS_EDX(regs) = (unsigned long) &frame->info;
++ PT_REGS_ECX(regs) = (unsigned long) &frame->uc;
++
++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
++ ptrace_notify(SIGTRAP);
++ return(0);
++}
++
++long sys_sigreturn(struct pt_regs regs)
++{
++ unsigned long __user sp = PT_REGS_SP(&current->thread.regs);
++ struct sigframe __user *frame = (struct sigframe *)(sp - 8);
++ sigset_t set;
++ struct sigcontext __user *sc = &frame->sc;
++ unsigned long __user *oldmask = &sc->oldmask;
++ unsigned long __user *extramask = &frame->extramask;
++ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
++
++ if(copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) ||
++ copy_from_user(&set.sig[1], extramask, sig_size))
++ goto segfault;
++
++ sigdelsetmask(&set, ~_BLOCKABLE);
++
++ spin_lock_irq(&current->sighand->siglock);
++ current->blocked = set;
++ recalc_sigpending();
++ spin_unlock_irq(&current->sighand->siglock);
++
++ if(copy_sc_from_user(&current->thread.regs, sc))
++ goto segfault;
++
++ PT_REGS_SYSCALL_NR(&current->thread.regs) = -1; /* Avoid ERESTART handling */
++ return(PT_REGS_SYSCALL_RET(&current->thread.regs));
++
++ segfault:
++ force_sig(SIGSEGV, current);
++ return 0;
++}
++
++long sys_rt_sigreturn(struct pt_regs regs)
++{
++ unsigned long __user sp = PT_REGS_SP(&current->thread.regs);
++ struct rt_sigframe __user *frame = (struct rt_sigframe *) (sp - 4);
++ sigset_t set;
++ struct ucontext __user *uc = &frame->uc;
++ int sig_size = _NSIG_WORDS * sizeof(unsigned long);
++
++ if(copy_from_user(&set, &uc->uc_sigmask, sig_size))
++ goto segfault;
++
++ sigdelsetmask(&set, ~_BLOCKABLE);
++
++ spin_lock_irq(&current->sighand->siglock);
++ current->blocked = set;
++ recalc_sigpending();
++ spin_unlock_irq(&current->sighand->siglock);
++
++ if(copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext))
++ goto segfault;
++
++ PT_REGS_SYSCALL_NR(&current->thread.regs) = -1; /* Avoid ERESTART handling */
++ return(PT_REGS_SYSCALL_RET(&current->thread.regs));
++
++ segfault:
++ force_sig(SIGSEGV, current);
++ return 0;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+Index: linux-2.6.10/arch/um/sys-i386/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/sys-i386/Makefile 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/arch/um/sys-i386/Makefile 2005-04-05 12:40:36.023911704 +0800
+@@ -1,5 +1,5 @@
+ obj-y = bitops.o bugs.o checksum.o fault.o ksyms.o ldt.o ptrace.o \
+- ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o
++ ptrace_user.o semaphore.o signal.o sigcontext.o syscalls.o sysrq.o
+
+ obj-$(CONFIG_HIGHMEM) += highmem.o
+ obj-$(CONFIG_MODULES) += module.o
+Index: linux-2.6.10/arch/um/kernel/mem_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/mem_user.c 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/mem_user.c 2005-04-05 12:40:36.051907448 +0800
+@@ -101,6 +101,8 @@
+ }
+ printf("OK\n");
+ munmap(addr, UM_KERN_PAGE_SIZE);
++
++ os_close_file(fd);
+ }
+
+ static int have_devanon = 0;
+@@ -261,6 +263,39 @@
+ }
+ #endif
+
++#if 0
++/* Debugging facility for dumping stuff out to the host, avoiding the timing
++ * problems that come with printf and breakpoints.
++ * Enable in case of emergency.
++ */
++
++int logging = 1;
++int logging_fd = -1;
++
++int logging_line = 0;
++char logging_buf[512];
++
++void log(char *fmt, ...)
++{
++ va_list ap;
++ struct timeval tv;
++ struct openflags flags;
++
++ if(logging == 0) return;
++ if(logging_fd < 0){
++ flags = of_create(of_trunc(of_rdwr(OPENFLAGS())));
++ logging_fd = os_open_file("log", flags, 0644);
++ }
++ gettimeofday(&tv, NULL);
++ sprintf(logging_buf, "%d\t %u.%u ", logging_line++, tv.tv_sec,
++ tv.tv_usec);
++ va_start(ap, fmt);
++ vsprintf(&logging_buf[strlen(logging_buf)], fmt, ap);
++ va_end(ap);
++ write(logging_fd, logging_buf, strlen(logging_buf));
++}
++#endif
++
+ /*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+Index: linux-2.6.10/arch/um/kernel/time.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/time.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/time.c 2005-04-05 12:40:36.046908208 +0800
+@@ -60,6 +60,9 @@
+ (setitimer(ITIMER_REAL, &disable, NULL) < 0))
+ printk("disnable_timer - setitimer failed, errno = %d\n",
+ errno);
++ /* If there are signals already queued, ignore them after unblocking */
++ set_handler(SIGALRM, SIG_IGN, 0, -1);
++ set_handler(SIGVTALRM, SIG_IGN, 0, -1);
+ }
+
+ void switch_timers(int to_real)
+Index: linux-2.6.10/arch/um/kernel/ksyms.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/ksyms.c 2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/ksyms.c 2005-04-05 12:40:36.049907752 +0800
+@@ -48,6 +48,7 @@
+ EXPORT_SYMBOL(mode_tt);
+ EXPORT_SYMBOL(handle_page_fault);
+ EXPORT_SYMBOL(find_iomem);
++EXPORT_SYMBOL(end_iomem);
+
+ #ifdef CONFIG_MODE_TT
+ EXPORT_SYMBOL(strncpy_from_user_tt);
+Index: linux-2.6.10/arch/um/kernel/um_arch.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/um_arch.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/um_arch.c 2005-04-05 12:40:36.045908360 +0800
+@@ -44,11 +44,6 @@
+ .ipi_pipe = { -1, -1 }
+ };
+
+-/* Placeholder to make UML link until the vsyscall stuff is actually
+- * implemented
+- */
+-void *__kernel_vsyscall;
+-
+ unsigned long thread_saved_pc(struct task_struct *task)
+ {
+ return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas,
+Index: linux-2.6.10/arch/um/kernel/process.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/process.c 2004-12-25 05:35:25.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/process.c 2005-04-05 12:40:36.025911400 +0800
+@@ -13,6 +13,7 @@
+ #include <setjmp.h>
+ #include <sys/time.h>
+ #include <sys/ptrace.h>
++#include <linux/ptrace.h>
+ #include <sys/wait.h>
+ #include <sys/mman.h>
+ #include <asm/ptrace.h>
+@@ -285,6 +286,9 @@
+ printk("Checking that ptrace can change system call numbers...");
+ pid = start_ptraced_child(&stack);
+
++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++ panic("check_ptrace: PTRACE_SETOPTIONS failed, errno = %d", errno);
++
+ while(1){
+ if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
+ panic("check_ptrace : ptrace failed, errno = %d",
+@@ -292,8 +296,8 @@
+ CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
+ if(n < 0)
+ panic("check_ptrace : wait failed, errno = %d", errno);
+- if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
+- panic("check_ptrace : expected SIGTRAP, "
++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP)))
++ panic("check_ptrace : expected (SIGTRAP|SYSCALL_TRAP), "
+ "got status = %d", status);
+
+ syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
+Index: linux-2.6.10/arch/um/kernel/process_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/process_kern.c 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/process_kern.c 2005-04-05 12:40:36.047908056 +0800
+@@ -291,8 +291,6 @@
+
+ EXPORT_SYMBOL(disable_hlt);
+
+-extern int signal_frame_size;
+-
+ void *um_kmalloc(int size)
+ {
+ return(kmalloc(size, GFP_KERNEL));
+Index: linux-2.6.10/arch/um/kernel/initrd_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/initrd_user.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/initrd_user.c 2005-04-05 12:40:36.026911248 +0800
+@@ -29,6 +29,8 @@
+ filename, -n);
+ return(-1);
+ }
++
++ os_close_file(fd);
+ return(0);
+ }
+
+Index: linux-2.6.10/arch/um/kernel/dyn.lds.S
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/dyn.lds.S 2004-12-25 05:34:48.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/dyn.lds.S 2005-04-05 12:40:36.044908512 +0800
+@@ -7,8 +7,11 @@
+
+ SECTIONS
+ {
++ PROVIDE (__executable_start = START);
+ . = START + SIZEOF_HEADERS;
+ .interp : { *(.interp) }
++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start
++ * is remapped.*/
+ __binary_start = .;
+ . = ALIGN(4096); /* Init code and data */
+ _stext = .;
+Index: linux-2.6.10/arch/um/kernel/ptrace.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/ptrace.c 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/ptrace.c 2005-04-05 12:40:36.044908512 +0800
+@@ -16,6 +16,7 @@
+ #include "asm/uaccess.h"
+ #include "kern_util.h"
+ #include "ptrace_user.h"
++#include "signal_user.h"
+
+ /*
+ * Called by kernel/ptrace.c when detaching..
+@@ -328,8 +329,10 @@
+ /* the 0x80 provides a way for the tracing parent to distinguish
+ between a syscall stop and SIGTRAP delivery */
+ tracesysgood = (current->ptrace & PT_TRACESYSGOOD) && !is_singlestep;
+- ptrace_notify(SIGTRAP | (tracesysgood ? 0x80 : 0));
+-
++ ptrace_notify(SIGTRAP | (tracesysgood ? SYSCALL_TRAP : 0));
++ if ( entryexit ) /* force do_signal() --> is_syscall() */
++ set_thread_flag(TIF_SIGPENDING);
++
+ /* force do_signal() --> is_syscall() */
+ set_thread_flag(TIF_SIGPENDING);
+
+Index: linux-2.6.10/arch/um/kernel/uml.lds.S
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/uml.lds.S 2005-04-01 12:25:25.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/uml.lds.S 2005-04-05 12:40:36.049907752 +0800
+@@ -7,8 +7,12 @@
+
+ SECTIONS
+ {
++ /*This must contain the right address - not quite the default ELF one.*/
++ PROVIDE (__executable_start = START);
+ . = START + SIZEOF_HEADERS;
+
++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start
++ * is remapped.*/
+ __binary_start = .;
+ #ifdef MODE_TT
+ .thread_private : {
+@@ -20,9 +24,13 @@
+ }
+ . = ALIGN(4096);
+ .remap : { arch/um/kernel/tt/unmap_fin.o (.text) }
+-#endif
+
++ /*If you put this after #endif, STATIC build without TT mode
++ gives a segfaulting binary. And after all, a hole just after
++ binary_start is not very polite to glibc.*/
+ . = ALIGN(4096); /* Init code and data */
++#endif
++
+ _stext = .;
+ __init_begin = .;
+ .init.text : {
+Index: linux-2.6.10/arch/um/kernel/main.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/main.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/main.c 2005-04-05 12:40:36.024911552 +0800
+@@ -81,6 +81,8 @@
+
+ extern int uml_exitcode;
+
++extern void scan_elf_aux( char **envp);
++
+ int main(int argc, char **argv, char **envp)
+ {
+ char **new_argv;
+@@ -147,6 +149,8 @@
+ set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
+ set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
+
++ scan_elf_aux( envp);
++
+ do_uml_initcalls();
+ ret = linux_main(argc, argv);
+
+@@ -155,18 +159,20 @@
+ int err;
+
+ printf("\n");
+-
+- /* Let any pending signals fire, then disable them. This
+- * ensures that they won't be delivered after the exec, when
+- * they are definitely not expected.
+- */
+- unblock_signals();
++ /* stop timers and set SIG*ALRM to be ignored */
+ disable_timer();
++ /* disable SIGIO for the fds and set SIGIO to be ignored */
+ err = deactivate_all_fds();
+ if(err)
+ printf("deactivate_all_fds failed, errno = %d\n",
+ -err);
+
++ /* Let any pending signals fire now. This ensures
++ * that they won't be delivered after the exec, when
++ * they are definitely not expected.
++ */
++ unblock_signals();
++
+ execvp(new_argv[0], new_argv);
+ perror("Failed to exec kernel");
+ ret = 1;
+Index: linux-2.6.10/arch/um/kernel/irq_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/irq_user.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/irq_user.c 2005-04-05 12:40:36.028910944 +0800
+@@ -374,6 +374,8 @@
+ if(err)
+ return(err);
+ }
++ /* If there is a signal already queued, after unblocking ignore it */
++ set_handler(SIGIO, SIG_IGN, 0, -1);
+
+ return(0);
+ }
+Index: linux-2.6.10/arch/um/kernel/signal_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/signal_kern.c 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/signal_kern.c 2005-04-05 12:40:36.048907904 +0800
+@@ -230,53 +230,6 @@
+ return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
+ }
+
+-extern int userspace_pid[];
+-
+-static int copy_sc_from_user(struct pt_regs *to, void *from,
+- struct arch_frame_data *arch)
+-{
+- int ret;
+-
+- ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, arch),
+- copy_sc_from_user_skas(userspace_pid[0],
+- &to->regs, from));
+- return(ret);
+-}
+-
+-long sys_sigreturn(struct pt_regs regs)
+-{
+- void __user *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
+- void __user *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
+- int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
+-
+- spin_lock_irq(&current->sighand->siglock);
+- copy_from_user(&current->blocked.sig[0], sc_sigmask(sc),
+- sizeof(current->blocked.sig[0]));
+- copy_from_user(&current->blocked.sig[1], mask, sig_size);
+- sigdelsetmask(&current->blocked, ~_BLOCKABLE);
+- recalc_sigpending();
+- spin_unlock_irq(&current->sighand->siglock);
+- copy_sc_from_user(&current->thread.regs, sc,
+- &signal_frame_sc.common.arch);
+- return(PT_REGS_SYSCALL_RET(&current->thread.regs));
+-}
+-
+-long sys_rt_sigreturn(struct pt_regs regs)
+-{
+- unsigned long sp = PT_REGS_SP(&current->thread.regs);
+- struct ucontext __user *uc = sp_to_uc(sp);
+- int sig_size = _NSIG_WORDS * sizeof(unsigned long);
+-
+- spin_lock_irq(&current->sighand->siglock);
+- copy_from_user(&current->blocked, &uc->uc_sigmask, sig_size);
+- sigdelsetmask(&current->blocked, ~_BLOCKABLE);
+- recalc_sigpending();
+- spin_unlock_irq(&current->sighand->siglock);
+- copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext,
+- &signal_frame_si.common.arch);
+- return(PT_REGS_SYSCALL_RET(&current->thread.regs));
+-}
+-
+ /*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+Index: linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/include/uaccess-skas.h 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h 2005-04-05 12:40:36.037909576 +0800
+@@ -7,6 +7,51 @@
+ #define __SKAS_UACCESS_H
+
+ #include "asm/errno.h"
++#include "asm/fixmap.h"
++
++#define access_ok_skas(type, addr, size) \
++ ((segment_eq(get_fs(), KERNEL_DS)) || \
++ (((unsigned long) (addr) < TASK_SIZE) && \
++ ((unsigned long) (addr) + (size) <= TASK_SIZE)) || \
++ ((type == VERIFY_READ ) && \
++ ((unsigned long) (addr) >= FIXADDR_USER_START) && \
++ ((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \
++ ((unsigned long) (addr) + (size) >= (unsigned long)(addr))))
++
++static inline int verify_area_skas(int type, const void * addr,
++ unsigned long size)
++{
++ return(access_ok_skas(type, addr, size) ? 0 : -EFAULT);
++}
++
++extern int copy_from_user_skas(void *to, const void *from, int n);
++extern int copy_to_user_skas(void *to, const void *from, int n);
++extern int strncpy_from_user_skas(char *dst, const char *src, int count);
++extern int __clear_user_skas(void *mem, int len);
++extern int clear_user_skas(void *mem, int len);
++extern int strnlen_user_skas(const void *str, int len);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_UACCESS_H
++#define __SKAS_UACCESS_H
++
++#include "asm/errno.h"
+
+ #define access_ok_skas(type, addr, size) \
+ ((segment_eq(get_fs(), KERNEL_DS)) || \
+Index: linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/include/mmu-skas.h 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h 2005-04-05 12:40:36.035909880 +0800
+@@ -22,3 +22,27 @@
+ * c-file-style: "linux"
+ * End:
+ */
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_MMU_H
++#define __SKAS_MMU_H
++
++struct mmu_context_skas {
++ int mm_fd;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+Index: linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/include/mode-skas.h 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h 2005-04-05 12:40:36.036909728 +0800
+@@ -14,6 +14,40 @@
+ extern int have_fpx_regs;
+
+ extern void user_time_init_skas(void);
++extern void sig_handler_common_skas(int sig, void *sc_ptr);
++extern void halt_skas(void);
++extern void reboot_skas(void);
++extern void kill_off_processes_skas(void);
++extern int is_skas_winch(int pid, int fd, void *data);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_SKAS_H__
++#define __MODE_SKAS_H__
++
++#include <sysdep/ptrace.h>
++
++extern unsigned long exec_regs[];
++extern unsigned long exec_fp_regs[];
++extern unsigned long exec_fpx_regs[];
++extern int have_fpx_regs;
++
++extern void user_time_init_skas(void);
+ extern int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs,
+ void *from_ptr);
+ extern int copy_sc_to_user_skas(int pid, void *to_ptr, void *fp,
+Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/sigcontext.c 2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800
+@@ -1,114 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#include <errno.h>
+-#include <asm/sigcontext.h>
+-#include <sys/ptrace.h>
+-#include <linux/ptrace.h>
+-#include "sysdep/ptrace.h"
+-#include "sysdep/ptrace_user.h"
+-#include "kern_util.h"
+-#include "user.h"
+-#include "sigcontext.h"
+-#include "mode.h"
+-
+-int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs, void *from_ptr)
+-{
+- struct sigcontext sc, *from = from_ptr;
+- unsigned long fpregs[FP_FRAME_SIZE];
+- int err;
+-
+- err = copy_from_user_proc(&sc, from, sizeof(sc));
+- err |= copy_from_user_proc(fpregs, sc.fpstate, sizeof(fpregs));
+- if(err)
+- return(err);
+-
+- regs->skas.regs[GS] = sc.gs;
+- regs->skas.regs[FS] = sc.fs;
+- regs->skas.regs[ES] = sc.es;
+- regs->skas.regs[DS] = sc.ds;
+- regs->skas.regs[EDI] = sc.edi;
+- regs->skas.regs[ESI] = sc.esi;
+- regs->skas.regs[EBP] = sc.ebp;
+- regs->skas.regs[UESP] = sc.esp;
+- regs->skas.regs[EBX] = sc.ebx;
+- regs->skas.regs[EDX] = sc.edx;
+- regs->skas.regs[ECX] = sc.ecx;
+- regs->skas.regs[EAX] = sc.eax;
+- regs->skas.regs[EIP] = sc.eip;
+- regs->skas.regs[CS] = sc.cs;
+- regs->skas.regs[EFL] = sc.eflags;
+- regs->skas.regs[SS] = sc.ss;
+- regs->skas.fault_addr = sc.cr2;
+- regs->skas.fault_type = FAULT_WRITE(sc.err);
+- regs->skas.trap_type = sc.trapno;
+-
+- err = ptrace(PTRACE_SETFPREGS, pid, 0, fpregs);
+- if(err < 0){
+- printk("copy_sc_to_user - PTRACE_SETFPREGS failed, "
+- "errno = %d\n", errno);
+- return(1);
+- }
+-
+- return(0);
+-}
+-
+-int copy_sc_to_user_skas(int pid, void *to_ptr, void *fp,
+- union uml_pt_regs *regs, unsigned long fault_addr,
+- int fault_type)
+-{
+- struct sigcontext sc, *to = to_ptr;
+- struct _fpstate *to_fp;
+- unsigned long fpregs[FP_FRAME_SIZE];
+- int err;
+-
+- sc.gs = regs->skas.regs[GS];
+- sc.fs = regs->skas.regs[FS];
+- sc.es = regs->skas.regs[ES];
+- sc.ds = regs->skas.regs[DS];
+- sc.edi = regs->skas.regs[EDI];
+- sc.esi = regs->skas.regs[ESI];
+- sc.ebp = regs->skas.regs[EBP];
+- sc.esp = regs->skas.regs[UESP];
+- sc.ebx = regs->skas.regs[EBX];
+- sc.edx = regs->skas.regs[EDX];
+- sc.ecx = regs->skas.regs[ECX];
+- sc.eax = regs->skas.regs[EAX];
+- sc.eip = regs->skas.regs[EIP];
+- sc.cs = regs->skas.regs[CS];
+- sc.eflags = regs->skas.regs[EFL];
+- sc.esp_at_signal = regs->skas.regs[UESP];
+- sc.ss = regs->skas.regs[SS];
+- sc.cr2 = fault_addr;
+- sc.err = TO_SC_ERR(fault_type);
+- sc.trapno = regs->skas.trap_type;
+-
+- err = ptrace(PTRACE_GETFPREGS, pid, 0, fpregs);
+- if(err < 0){
+- printk("copy_sc_to_user - PTRACE_GETFPREGS failed, "
+- "errno = %d\n", errno);
+- return(1);
+- }
+- to_fp = (struct _fpstate *)
+- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to)));
+- sc.fpstate = to_fp;
+-
+- if(err)
+- return(err);
+-
+- return(copy_to_user_proc(to, &sc, sizeof(sc)) ||
+- copy_to_user_proc(to_fp, fpregs, sizeof(fpregs)));
+-}
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/Makefile 2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800
+@@ -1,12 +0,0 @@
+-#
+-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+-# Licensed under the GPL
+-#
+-
+-obj-y = sigcontext.o
+-
+-USER_OBJS = sigcontext.o
+-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+-
+-$(USER_OBJS) : %.o: %.c
+- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
+Index: linux-2.6.10/arch/um/kernel/skas/process.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/process.c 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/process.c 2005-04-05 12:40:36.030910640 +0800
+@@ -11,6 +11,7 @@
+ #include <sched.h>
+ #include <sys/wait.h>
+ #include <sys/ptrace.h>
++#include <linux/ptrace.h>
+ #include <sys/mman.h>
+ #include <sys/user.h>
+ #include <asm/unistd.h>
+@@ -60,15 +61,10 @@
+ /*To use the same value of using_sysemu as the caller, ask it that value (in local_using_sysemu)*/
+ static void handle_trap(int pid, union uml_pt_regs *regs, int local_using_sysemu)
+ {
+- int err, syscall_nr, status;
+-
+- syscall_nr = PT_SYSCALL_NR(regs->skas.regs);
+- UPT_SYSCALL_NR(regs) = syscall_nr;
+- if(syscall_nr < 0){
+- relay_signal(SIGTRAP, regs);
+- return;
+- }
+
++ int err, status;
++
++ UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->skas.regs); /* Mark this as a syscall */
+ if (!local_using_sysemu)
+ {
+ err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid);
+@@ -82,7 +78,8 @@
+ "errno = %d\n", errno);
+
+ CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED));
+- if((err < 0) || !WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
++ if((err < 0) || !WIFSTOPPED(status) ||
++ (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP)))
+ panic("handle_trap - failed to wait at end of syscall, "
+ "errno = %d, status = %d\n", errno, status);
+ }
+@@ -131,6 +128,10 @@
+ panic("start_userspace : expected SIGSTOP, got status = %d",
+ status);
+
++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++ panic("start_userspace : PTRACE_SETOPTIONS failed, errno=%d\n",
++ errno);
++
+ if(munmap(stack, PAGE_SIZE) < 0)
+ panic("start_userspace : munmap failed, errno = %d\n", errno);
+
+@@ -160,15 +161,19 @@
+
+ regs->skas.is_user = 1;
+ save_registers(regs);
++ UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+
+ if(WIFSTOPPED(status)){
+ switch(WSTOPSIG(status)){
+ case SIGSEGV:
+ handle_segv(pid);
+ break;
+- case SIGTRAP:
++ case (SIGTRAP|SYSCALL_TRAP):
+ handle_trap(pid, regs, local_using_sysemu);
+ break;
++ case SIGTRAP:
++ relay_signal(SIGTRAP, regs);
++ break;
+ case SIGIO:
+ case SIGVTALRM:
+ case SIGILL:
+@@ -222,9 +227,10 @@
+ block_signals();
+ if(sigsetjmp(fork_buf, 1) == 0)
+ new_thread_proc(stack, handler);
+- set_signals(flags);
+
+ remove_sigstack();
++
++ set_signals(flags);
+ }
+
+ void thread_wait(void *sw, void *fb)
+Index: linux-2.6.10/arch/um/kernel/skas/process_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/process_kern.c 2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/process_kern.c 2005-04-05 12:40:36.032910336 +0800
+@@ -19,7 +19,6 @@
+ #include "os.h"
+ #include "user_util.h"
+ #include "tlb.h"
+-#include "frame.h"
+ #include "kern.h"
+ #include "mode.h"
+ #include "proc_mm.h"
+@@ -183,7 +182,6 @@
+ int start_uml_skas(void)
+ {
+ start_userspace(0);
+- capture_signal_stack();
+
+ init_new_thread_signals(1);
+ uml_idle_timer();
+Index: linux-2.6.10/arch/um/kernel/skas/syscall_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/syscall_kern.c 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/syscall_kern.c 2005-04-05 12:40:36.034910032 +0800
+@@ -6,6 +6,7 @@
+ #include "linux/sys.h"
+ #include "linux/ptrace.h"
+ #include "asm/errno.h"
++#include "linux/ptrace.h"
+ #include "asm/unistd.h"
+ #include "asm/ptrace.h"
+ #include "asm/current.h"
+Index: linux-2.6.10/arch/um/kernel/skas/trap_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/trap_user.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/trap_user.c 2005-04-05 12:40:36.033910184 +0800
+@@ -21,6 +21,14 @@
+ int save_errno = errno;
+ int save_user;
+
++ /* This is done to allow SIGSEGV to be delivered inside a SEGV
++ * handler. This can happen in copy_user, and if SEGV is disabled,
++ * the process will die.
++ * XXX Figure out why this is better than SA_NODEFER
++ */
++ if(sig == SIGSEGV)
++ change_sig(SIGSEGV, 1);
++
+ r = &TASK_REGS(get_current())->skas;
+ save_user = r->is_user;
+ r->is_user = 0;
+Index: linux-2.6.10/arch/um/kernel/skas/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/Makefile 2004-12-25 05:34:30.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/Makefile 2005-04-05 12:40:36.034910032 +0800
+@@ -4,8 +4,7 @@
+ #
+
+ obj-y := exec_kern.o mem.o mem_user.o mmu.o process.o process_kern.o \
+- syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o \
+- sys-$(SUBARCH)/
++ syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o
+
+ subdir-y := util
+
+Index: linux-2.6.10/arch/um/kernel/helper.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/helper.c 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/helper.c 2005-04-05 12:40:36.027911096 +0800
+@@ -49,14 +49,14 @@
+ return(0);
+ }
+
+-/* XXX The alloc_stack here breaks if this is called in the tracing thread */
+-
++/* Returns either the pid of the child process we run or -E* on failure.
++ * XXX The alloc_stack here breaks if this is called in the tracing thread */
+ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv,
+ unsigned long *stack_out)
+ {
+ struct helper_data data;
+ unsigned long stack, sp;
+- int pid, fds[2], err, n;
++ int pid, fds[2], ret, n;
+
+ if((stack_out != NULL) && (*stack_out != 0))
+ stack = *stack_out;
+@@ -64,16 +64,16 @@
+ if(stack == 0)
+ return(-ENOMEM);
+
+- err = os_pipe(fds, 1, 0);
+- if(err < 0){
+- printk("run_helper : pipe failed, err = %d\n", -err);
++ ret = os_pipe(fds, 1, 0);
++ if(ret < 0){
++ printk("run_helper : pipe failed, ret = %d\n", -ret);
+ goto out_free;
+ }
+
+- err = os_set_exec_close(fds[1], 1);
+- if(err < 0){
+- printk("run_helper : setting FD_CLOEXEC failed, err = %d\n",
+- -err);
++ ret = os_set_exec_close(fds[1], 1);
++ if(ret < 0){
++ printk("run_helper : setting FD_CLOEXEC failed, ret = %d\n",
++ -ret);
+ goto out_close;
+ }
+
+@@ -85,34 +85,36 @@
+ pid = clone(helper_child, (void *) sp, CLONE_VM | SIGCHLD, &data);
+ if(pid < 0){
+ printk("run_helper : clone failed, errno = %d\n", errno);
+- err = -errno;
++ ret = -errno;
+ goto out_close;
+ }
+
+ os_close_file(fds[1]);
+- n = os_read_file(fds[0], &err, sizeof(err));
++ fds[1] = -1;
++
++ /*Read the errno value from the child.*/
++ n = os_read_file(fds[0], &ret, sizeof(ret));
+ if(n < 0){
+- printk("run_helper : read on pipe failed, err = %d\n", -n);
+- err = n;
+- goto out_kill;
++ printk("run_helper : read on pipe failed, ret = %d\n", -n);
++ ret = n;
++ os_kill_process(pid, 1);
+ }
+ else if(n != 0){
+ CATCH_EINTR(n = waitpid(pid, NULL, 0));
+- pid = -errno;
++ ret = -errno;
++ } else {
++ ret = pid;
+ }
+
+- if(stack_out == NULL) free_stack(stack, 0);
+- else *stack_out = stack;
+- return(pid);
+-
+- out_kill:
+- os_kill_process(pid, 1);
+ out_close:
++ if (fds[1] != -1)
++ os_close_file(fds[1]);
+ os_close_file(fds[0]);
+- os_close_file(fds[1]);
+ out_free:
+- free_stack(stack, 0);
+- return(err);
++ if(stack_out == NULL)
++ free_stack(stack, 0);
++ else *stack_out = stack;
++ return(ret);
+ }
+
+ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags,
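+
+With this rewrite run_helper() has a uniform contract: it returns the
+child's pid on success and a negative errno on any failure, and both
+pipe ends and the stack are released on every error path. A sketch of a
+caller against that contract (my_pre_exec and start_my_helper are
+hypothetical names; run_helper is the function above):
+
+ static void my_pre_exec(void *arg)
+ {
+ 	/* runs in the child between clone() and execvp() */
+ }
+
+ static int start_my_helper(char **argv)
+ {
+ 	int pid;
+
+ 	pid = run_helper(my_pre_exec, NULL, argv, NULL);
+ 	if (pid < 0) {
+ 		printk("helper failed, err = %d\n", -pid);
+ 		return pid;
+ 	}
+ 	/* success: pid identifies the child to signal or wait for later */
+ 	return pid;
+ }
+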
+Index: linux-2.6.10/arch/um/kernel/time_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/time_kern.c 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/time_kern.c 2005-04-05 12:40:36.027911096 +0800
+@@ -170,7 +170,7 @@
+ void timer_handler(int sig, union uml_pt_regs *regs)
+ {
+ local_irq_disable();
+- update_process_times(user_context(UPT_SP(regs)));
++ update_process_times(CHOOSE_MODE(user_context(UPT_SP(regs)), (regs)->skas.is_user));
+ local_irq_enable();
+ if(current_thread->cpu == 0)
+ timer_irq(regs);
+Index: linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/include/mode-tt.h 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h 2005-04-05 12:40:36.042908816 +0800
+@@ -14,6 +14,41 @@
+
+ extern int tracer(int (*init_proc)(void *), void *sp);
+ extern void user_time_init_tt(void);
++extern void sig_handler_common_tt(int sig, void *sc);
++extern void syscall_handler_tt(int sig, union uml_pt_regs *regs);
++extern void reboot_tt(void);
++extern void halt_tt(void);
++extern int is_tracer_winch(int pid, int fd, void *data);
++extern void kill_off_processes_tt(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_TT_H__
++#define __MODE_TT_H__
++
++#include "sysdep/ptrace.h"
++
++enum { OP_NONE, OP_EXEC, OP_FORK, OP_TRACE_ON, OP_REBOOT, OP_HALT, OP_CB };
++
++extern int tracing_pid;
++
++extern int tracer(int (*init_proc)(void *), void *sp);
++extern void user_time_init_tt(void);
+ extern int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data);
+ extern int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr,
+ void *data);
+Index: linux-2.6.10/arch/um/kernel/tt/include/tt.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/include/tt.h 2004-12-25 05:34:58.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/include/tt.h 2005-04-05 12:40:36.043908664 +0800
+@@ -26,7 +26,8 @@
+ extern int is_tracing(void *task);
+ extern void syscall_handler(int sig, union uml_pt_regs *regs);
+ extern void exit_kernel(int pid, void *task);
+-extern int do_syscall(void *task, int pid, int local_using_sysemu);
++extern void do_syscall(void *task, int pid, int local_using_sysemu);
++extern void do_sigtrap(void *task);
+ extern int is_valid_pid(int pid);
+ extern void remap_data(void *segment_start, void *segment_end, int w);
+
+Index: linux-2.6.10/arch/um/kernel/tt/exec_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/exec_user.c 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/exec_user.c 2005-04-05 12:40:36.039909272 +0800
+@@ -10,6 +10,7 @@
+ #include <errno.h>
+ #include <sys/wait.h>
+ #include <sys/ptrace.h>
++#include <linux/ptrace.h>
+ #include <signal.h>
+ #include "user_util.h"
+ #include "kern_util.h"
+@@ -35,7 +36,10 @@
+ tracer_panic("do_exec failed to get registers - errno = %d",
+ errno);
+
+- kill(old_pid, SIGKILL);
++ os_kill_ptraced_process(old_pid, 0);
++
++ if (ptrace(PTRACE_OLDSETOPTIONS, new_pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++ tracer_panic("do_exec: PTRACE_SETOPTIONS failed, errno = %d", errno);
+
+ if(ptrace_setregs(new_pid, regs) < 0)
+ tracer_panic("do_exec failed to start new proc - errno = %d",
+Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/sigcontext.c 2004-12-25 05:35:39.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800
+@@ -1,60 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#include <stdlib.h>
+-#include <asm/sigcontext.h>
+-#include "kern_util.h"
+-#include "sysdep/frame.h"
+-
+-int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data)
+-{
+- struct arch_frame_data *arch = data;
+- struct sigcontext *to = to_ptr, *from = from_ptr;
+- struct _fpstate *to_fp, *from_fp;
+- unsigned long sigs;
+- int err;
+-
+- to_fp = to->fpstate;
+- from_fp = from->fpstate;
+- sigs = to->oldmask;
+- err = copy_from_user_proc(to, from, sizeof(*to));
+- to->oldmask = sigs;
+- if(to_fp != NULL){
+- err |= copy_from_user_proc(&to->fpstate, &to_fp,
+- sizeof(to->fpstate));
+- err |= copy_from_user_proc(to_fp, from_fp, arch->fpstate_size);
+- }
+- return(err);
+-}
+-
+-int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, void *data)
+-{
+- struct arch_frame_data *arch = data;
+- struct sigcontext *to = to_ptr, *from = from_ptr;
+- struct _fpstate *to_fp, *from_fp;
+- int err;
+-
+- to_fp = (struct _fpstate *)
+- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to)));
+- from_fp = from->fpstate;
+- err = copy_to_user_proc(to, from, sizeof(*to));
+- if(from_fp != NULL){
+- err |= copy_to_user_proc(&to->fpstate, &to_fp,
+- sizeof(to->fpstate));
+- err |= copy_to_user_proc(to_fp, from_fp, arch->fpstate_size);
+- }
+- return(err);
+-}
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/Makefile 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800
+@@ -1,12 +0,0 @@
+-#
+-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+-# Licensed under the GPL
+-#
+-
+-obj-y = sigcontext.o
+-
+-USER_OBJS = sigcontext.o
+-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+-
+-$(USER_OBJS) : %.o: %.c
+- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
+Index: linux-2.6.10/arch/um/kernel/tt/syscall_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/syscall_user.c 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/syscall_user.c 2005-04-05 12:40:36.037909576 +0800
+@@ -42,37 +42,31 @@
+ syscall_trace(regs, 1);
+ record_syscall_end(index, result);
+ }
+-
+-int do_syscall(void *task, int pid, int local_using_sysemu)
+-{
+- unsigned long proc_regs[FRAME_SIZE];
+- union uml_pt_regs *regs;
+- int syscall;
+-
+- if(ptrace_getregs(pid, proc_regs) < 0)
+- tracer_panic("Couldn't read registers");
+- syscall = PT_SYSCALL_NR(proc_regs);
+-
+- regs = TASK_REGS(task);
+- UPT_SYSCALL_NR(regs) = syscall;
+-
+- if(syscall < 0)
+- return(0);
+-
+- if((syscall != __NR_sigreturn) &&
+- ((unsigned long *) PT_IP(proc_regs) >= &_stext) &&
+- ((unsigned long *) PT_IP(proc_regs) <= &_etext))
+- tracer_panic("I'm tracing myself and I can't get out");
+-
+- if(local_using_sysemu)
+- return(1);
+-
+- if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
+- __NR_getpid) < 0)
+- tracer_panic("do_syscall : Nullifying syscall failed, "
+- "errno = %d", errno);
+- return(1);
+-}
++
++ void do_sigtrap(void *task)
++ {
++ UPT_SYSCALL_NR(TASK_REGS(task)) = -1;
++ }
++
++ void do_syscall(void *task, int pid, int local_using_sysemu)
++ {
++ unsigned long proc_regs[FRAME_SIZE];
++
++ if(ptrace_getregs(pid, proc_regs) < 0)
++ tracer_panic("Couldn't read registers");
++
++ UPT_SYSCALL_NR(TASK_REGS(task)) = PT_SYSCALL_NR(proc_regs);
++
++ if(((unsigned long *) PT_IP(proc_regs) >= &_stext) &&
++ ((unsigned long *) PT_IP(proc_regs) <= &_etext))
++ tracer_panic("I'm tracing myself and I can't get out");
++
++ /* syscall number -1 in sysemu skips syscall restarting in host */
++ if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
++ local_using_sysemu ? -1 : __NR_getpid) < 0)
++ tracer_panic("do_syscall : Nullifying syscall failed, "
++ "errno = %d", errno);
++ }
+
+ /*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+Index: linux-2.6.10/arch/um/kernel/tt/tracer.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/tracer.c 2005-04-01 01:16:47.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/tracer.c 2005-04-05 12:40:36.041908968 +0800
+@@ -13,6 +13,7 @@
+ #include <string.h>
+ #include <sys/mman.h>
+ #include <sys/ptrace.h>
++#include <linux/ptrace.h>
+ #include <sys/time.h>
+ #include <sys/wait.h>
+ #include "user.h"
+@@ -25,7 +26,6 @@
+ #include "mem_user.h"
+ #include "process.h"
+ #include "kern_util.h"
+-#include "frame.h"
+ #include "chan_user.h"
+ #include "ptrace_user.h"
+ #include "mode.h"
+@@ -72,6 +72,8 @@
+ (ptrace(PTRACE_CONT, pid, 0, 0) < 0))
+ tracer_panic("OP_FORK failed to attach pid");
+ wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++ tracer_panic("OP_FORK: PTRACE_SETOPTIONS failed, errno = %d", errno);
+ if(ptrace(PTRACE_CONT, pid, 0, 0) < 0)
+ tracer_panic("OP_FORK failed to continue process");
+ }
+@@ -141,7 +143,7 @@
+ * any more, the trace of those will land here. So, we need to just
+ * PTRACE_SYSCALL it.
+ */
+- case SIGTRAP:
++ case (SIGTRAP|SYSCALL_TRAP):
+ if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
+ tracer_panic("sleeping_process_signal : Failed to "
+ "PTRACE_SYSCALL pid %d, errno = %d\n",
+@@ -184,9 +186,8 @@
+ unsigned long eip = 0;
+ int status, pid = 0, sig = 0, cont_type, tracing = 0, op = 0;
+ int last_index, proc_id = 0, n, err, old_tracing = 0, strace = 0;
+- int pt_syscall_parm, local_using_sysemu;
++ int pt_syscall_parm, local_using_sysemu = 0;
+
+- capture_signal_stack();
+ signal(SIGPIPE, SIG_IGN);
+ setup_tracer_winch();
+ tracing_pid = os_getpid();
+@@ -198,6 +199,10 @@
+ printf("waitpid on idle thread failed, errno = %d\n", errno);
+ exit(1);
+ }
++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) {
++ printf("Failed to PTRACE_SETOPTIONS for idle thread, errno = %d\n", errno);
++ exit(1);
++ }
+ if((ptrace(PTRACE_CONT, pid, 0, 0) < 0)){
+ printf("Failed to continue idle thread, errno = %d\n", errno);
+ exit(1);
+@@ -315,7 +320,8 @@
+ task = cpu_tasks[proc_id].task;
+ tracing = is_tracing(task);
+ old_tracing = tracing;
+-
++ if ( tracing ) /* Assume: no syscall, when coming from user */
++ do_sigtrap(task);
+ local_using_sysemu = get_using_sysemu();
+ pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL;
+
+@@ -324,6 +330,15 @@
+ sig = 0;
+ op = do_proc_op(task, proc_id);
+ switch(op){
++ /*
++ * This is called when entering user mode; after
++ * this, we start intercepting syscalls.
++ *
++ * In fact, a process is started in kernel mode,
++ * so with is_tracing() == 0 (and that is reset
++ * when executing syscalls, since UML kernel has
++ * the right to do syscalls);
++ */
+ case OP_TRACE_ON:
+ arch_leave_kernel(task, pid);
+ tracing = 1;
+@@ -332,7 +347,13 @@
+ case OP_HALT:
+ unmap_physmem();
+ kmalloc_ok = 0;
+- ptrace(PTRACE_KILL, pid, 0, 0);
++ os_kill_ptraced_process(pid, 0);
++ /* Now let's reap remaining zombies */
++ errno = 0;
++ do {
++ waitpid(-1, &status,
++ WUNTRACED);
++ } while (errno != ECHILD);
+ return(op == OP_REBOOT);
+ case OP_NONE:
+ printf("Detaching pid %d\n", pid);
+@@ -346,14 +367,26 @@
+ */
+ pid = cpu_tasks[proc_id].pid;
+ break;
++ case (SIGTRAP|SYSCALL_TRAP):
++ if(!tracing && (debugger_pid != -1)){
++ child_signal(pid, W_STOPCODE(SIGTRAP));
++ continue;
++ }
++ tracing = 0;
++ /* local_using_sysemu has been already set
++ * below, since if we are here, is_tracing() on
++ * the traced task was 1, i.e. the process had
++ * already run through one iteration of the
++ * loop which executed a OP_TRACE_ON request.*/
++ do_syscall(task, pid, local_using_sysemu);
++ sig = SIGUSR2;
++ break;
+ case SIGTRAP:
+ if(!tracing && (debugger_pid != -1)){
+ child_signal(pid, status);
+ continue;
+ }
+ tracing = 0;
+- if(do_syscall(task, pid, local_using_sysemu))
+- sig = SIGUSR2;
+ break;
+ case SIGPROF:
+ if(tracing) sig = 0;
+@@ -389,6 +422,9 @@
+ continue;
+ }
+
++ local_using_sysemu = get_using_sysemu();
++ pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL;
++
+ if(tracing){
+ if(singlestepping(task))
+ cont_type = PTRACE_SINGLESTEP;
+Index: linux-2.6.10/arch/um/kernel/tt/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/tt/Makefile 2004-12-25 05:34:57.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/tt/Makefile 2005-04-05 12:40:36.041908968 +0800
+@@ -8,7 +8,7 @@
+
+ obj-y = exec_kern.o exec_user.o gdb.o ksyms.o mem.o mem_user.o process_kern.o \
+ syscall_kern.o syscall_user.o time.o tlb.o tracer.o trap_user.o \
+- uaccess.o uaccess_user.o sys-$(SUBARCH)/
++ uaccess.o uaccess_user.o
+
+ obj-$(CONFIG_PT_PROXY) += gdb_kern.o ptproxy/
+
+Index: linux-2.6.10/arch/um/kernel/trap_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/trap_user.c 2004-12-25 05:34:44.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/trap_user.c 2005-04-05 12:40:36.047908056 +0800
+@@ -18,7 +18,6 @@
+ #include "sigcontext.h"
+ #include "sysdep/sigcontext.h"
+ #include "irq_user.h"
+-#include "frame_user.h"
+ #include "signal_user.h"
+ #include "time_user.h"
+ #include "task.h"
+Index: linux-2.6.10/arch/um/kernel/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/Makefile 2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/Makefile 2005-04-05 12:40:36.051907448 +0800
+@@ -6,7 +6,7 @@
+ extra-y := vmlinux.lds
+ clean-files := vmlinux.lds.S
+
+-obj-y = checksum.o config.o exec_kern.o exitcode.o frame_kern.o frame.o \
++obj-y = checksum.o config.o exec_kern.o exitcode.o \
+ helper.o init_task.o irq.o irq_user.o ksyms.o main.o mem.o mem_user.o \
+ physmem.o process.o process_kern.o ptrace.o reboot.o resource.o \
+ sigio_user.o sigio_kern.o signal_kern.o signal_user.o smp.o \
+Index: linux-2.6.10/arch/um/kernel/mem.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/mem.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/mem.c 2005-04-05 12:40:36.029910792 +0800
+@@ -175,6 +175,30 @@
+ }
+ #endif /* CONFIG_HIGHMEM */
+
++static void __init fixaddr_user_init( void)
++{
++ long size = FIXADDR_USER_END - FIXADDR_USER_START;
++ pgd_t *pgd;
++ pmd_t *pmd;
++ pte_t *pte;
++ unsigned long paddr, vaddr = FIXADDR_USER_START;
++
++ if ( ! size )
++ return;
++
++ fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir);
++ paddr = (unsigned long)alloc_bootmem_low_pages( size);
++ memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size);
++ paddr = __pa(paddr);
++ for ( ; size > 0; size-=PAGE_SIZE, vaddr+=PAGE_SIZE, paddr+=PAGE_SIZE) {
++ pgd = swapper_pg_dir + pgd_index(vaddr);
++ pmd = pmd_offset(pgd, vaddr);
++ pte = pte_offset_kernel(pmd, vaddr);
++ /*pte_set_val( (*pte), paddr, PAGE_READONLY);*/
++ pte_val(*pte) = paddr | pgprot_val(PAGE_READONLY);
++ }
++}
++
+ void paging_init(void)
+ {
+ unsigned long zones_size[MAX_NR_ZONES], vaddr;
+@@ -195,6 +219,8 @@
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir);
+
++ fixaddr_user_init();
++
+ #ifdef CONFIG_HIGHMEM
+ init_highmem();
+ #endif
+Index: linux-2.6.10/arch/um/os-Linux/user_syms.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/os-Linux/user_syms.c 2004-12-25 05:35:23.000000000 +0800
++++ linux-2.6.10/arch/um/os-Linux/user_syms.c 2005-04-05 12:40:36.019912312 +0800
+@@ -26,6 +26,9 @@
+
+ EXPORT_SYMBOL(strstr);
+
++EXPORT_SYMBOL(vsyscall_ehdr);
++EXPORT_SYMBOL(vsyscall_end);
++
+ /* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms.
+ * However, the modules will use the CRC defined *here*, no matter if it is
+ * good; so the versions of these symbols will always match
+Index: linux-2.6.10/arch/um/os-Linux/elf_aux.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/os-Linux/elf_aux.c 2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/arch/um/os-Linux/elf_aux.c 2005-04-05 12:40:36.018912464 +0800
+@@ -0,0 +1,67 @@
++/*
++ * arch/um/kernel/elf_aux.c
++ *
++ * Scan the Elf auxiliary vector provided by the host to extract
++ * information about vsyscall-page, etc.
++ *
++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
++ * Author: Bodo Stroesser (bodo.stroesser@fujitsu-siemens.com)
++ */
++#include <elf.h>
++#include <stddef.h>
++#include "init.h"
++#include "elf_user.h"
++
++#if ELF_CLASS == ELFCLASS32
++typedef Elf32_auxv_t elf_auxv_t;
++#else
++typedef Elf64_auxv_t elf_auxv_t;
++#endif
++
++char * elf_aux_platform;
++long elf_aux_hwcap;
++
++unsigned long vsyscall_ehdr;
++unsigned long vsyscall_end;
++
++unsigned long __kernel_vsyscall;
++
++
++__init void scan_elf_aux( char **envp)
++{
++ long page_size = 0;
++ elf_auxv_t * auxv;
++
++ while ( *envp++ != NULL) ;
++
++ for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) {
++ switch ( auxv->a_type ) {
++ case AT_SYSINFO:
++ __kernel_vsyscall = auxv->a_un.a_val;
++ break;
++ case AT_SYSINFO_EHDR:
++ vsyscall_ehdr = auxv->a_un.a_val;
++ break;
++ case AT_HWCAP:
++ elf_aux_hwcap = auxv->a_un.a_val;
++ break;
++ case AT_PLATFORM:
++ elf_aux_platform = auxv->a_un.a_ptr;
++ break;
++ case AT_PAGESZ:
++ page_size = auxv->a_un.a_val;
++ break;
++ }
++ }
++ if ( ! __kernel_vsyscall || ! vsyscall_ehdr ||
++ ! elf_aux_hwcap || ! elf_aux_platform ||
++ ! page_size || (vsyscall_ehdr % page_size) ) {
++ __kernel_vsyscall = 0;
++ vsyscall_ehdr = 0;
++ elf_aux_hwcap = 0;
++ elf_aux_platform = "i586";
++ }
++ else {
++ vsyscall_end = vsyscall_ehdr + page_size;
++ }
++}
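+
+The walk above can be reproduced from any ordinary process, since the
+auxiliary vector sits directly after the environment block's NULL
+terminator -- which is why scan_elf_aux() first skips to the end of
+envp. A small host-side demo (glibc assumed; ElfW() comes from
+<link.h> and selects the native ELF class):
+
+ #include <elf.h>
+ #include <link.h>
+ #include <stdio.h>
+
+ int main(int argc, char **argv, char **envp)
+ {
+ 	ElfW(auxv_t) *auxv;
+
+ 	while (*envp++ != NULL)
+ 		;	/* skip past the environment strings */
+
+ 	for (auxv = (ElfW(auxv_t) *) envp; auxv->a_type != AT_NULL; auxv++) {
+ 		if (auxv->a_type == AT_SYSINFO_EHDR)
+ 			printf("vsyscall/vDSO ELF header at 0x%lx\n",
+ 			       (unsigned long) auxv->a_un.a_val);
+ 		else if (auxv->a_type == AT_PAGESZ)
+ 			printf("page size %ld\n", (long) auxv->a_un.a_val);
+ 	}
+ 	return 0;
+ }
+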
+Index: linux-2.6.10/arch/um/os-Linux/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/os-Linux/Makefile 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/os-Linux/Makefile 2005-04-05 12:40:36.019912312 +0800
+@@ -3,9 +3,9 @@
+ # Licensed under the GPL
+ #
+
+-obj-y = file.o process.o time.o tty.o user_syms.o drivers/
++obj-y = elf_aux.o file.o process.o time.o tty.o user_syms.o drivers/
+
+-USER_OBJS := $(foreach file,file.o process.o time.o tty.o,$(obj)/$(file))
++USER_OBJS := $(foreach file,elf_aux.o file.o process.o time.o tty.o,$(obj)/$(file))
+
+ $(USER_OBJS) : %.o: %.c
+ $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
+Index: linux-2.6.10/arch/um/drivers/net_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/drivers/net_kern.c 2004-12-25 05:34:44.000000000 +0800
++++ linux-2.6.10/arch/um/drivers/net_kern.c 2005-04-05 12:40:36.016912768 +0800
+@@ -126,10 +126,6 @@
+ lp->tl.data = (unsigned long) &lp->user;
+ netif_start_queue(dev);
+
+- spin_lock(&opened_lock);
+- list_add(&lp->list, &opened);
+- spin_unlock(&opened_lock);
+-
+ /* clear buffer - it can happen that the host side of the interface
+ * is full when we get here. In this case, new data is never queued,
+ * SIGIOs never arrive, and the net never works.
+@@ -152,9 +148,6 @@
+ free_irq(dev->irq, dev);
+ if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user);
+ lp->fd = -1;
+- spin_lock(&opened_lock);
+- list_del(&lp->list);
+- spin_unlock(&opened_lock);
+
+ spin_unlock(&lp->lock);
+ return 0;
+@@ -397,6 +390,11 @@
+
+ if (device->have_mac)
+ set_ether_mac(dev, device->mac);
++
++ spin_lock(&opened_lock);
++ list_add(&lp->list, &opened);
++ spin_unlock(&opened_lock);
++
+ return(0);
+ }
+
+@@ -705,7 +703,7 @@
+ static void close_devices(void)
+ {
+ struct list_head *ele;
+- struct uml_net_private *lp;
++ struct uml_net_private *lp;
+
+ list_for_each(ele, &opened){
+ lp = list_entry(ele, struct uml_net_private, list);
+Index: linux-2.6.10/arch/um/drivers/mconsole_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/drivers/mconsole_kern.c 2004-12-25 05:33:49.000000000 +0800
++++ linux-2.6.10/arch/um/drivers/mconsole_kern.c 2005-04-05 12:40:36.015912920 +0800
+@@ -204,6 +204,68 @@
+ }
+ #endif
+
++/* This is a more convoluted version of mconsole_proc, which has some stability
++ * problems; however, we need it fixed, because it is expected that UML users
++ * mount HPPFS instead of procfs on /proc. And we want mconsole_proc to still
++ * show the real procfs content, not the one from hppfs. */
++#if 0
++void mconsole_proc(struct mc_request *req)
++{
++ char path[64];
++ char *buf;
++ int len;
++ int fd;
++ int first_chunk = 1;
++ char *ptr = req->request.data;
++
++ ptr += strlen("proc");
++ while(isspace(*ptr)) ptr++;
++ snprintf(path, sizeof(path), "/proc/%s", ptr);
++
++ fd = sys_open(path, 0, 0);
++ if (fd < 0) {
++ mconsole_reply(req, "Failed to open file", 1, 0);
++ printk("open %s: %d\n",path,fd);
++ goto out;
++ }
++
++ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
++ if(buf == NULL){
++ mconsole_reply(req, "Failed to allocate buffer", 1, 0);
++ goto out_close;
++ }
++
++ for (;;) {
++ len = sys_read(fd, buf, PAGE_SIZE-1);
++ if (len < 0) {
++ mconsole_reply(req, "Read of file failed", 1, 0);
++ goto out_free;
++ }
++ /* Begin the file content on its own line. */
++ if (first_chunk) {
++ mconsole_reply(req, "\n", 0, 1);
++ first_chunk = 0;
++ }
++ if (len == PAGE_SIZE-1) {
++ buf[len] = '\0';
++ mconsole_reply(req, buf, 0, 1);
++ } else {
++ buf[len] = '\0';
++ mconsole_reply(req, buf, 0, 0);
++ break;
++ }
++ }
++ /*END*/
++
++ out_free:
++ kfree(buf);
++ out_close:
++ sys_close(fd);
++ out:
++ /* nothing */;
++}
++#endif
++
+ void mconsole_proc(struct mc_request *req)
+ {
+ char path[64];
+Index: linux-2.6.10/arch/um/drivers/net_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/drivers/net_user.c 2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/drivers/net_user.c 2005-04-05 12:40:36.017912616 +0800
+@@ -173,10 +173,12 @@
+ pe_data.stdout = fds[1];
+ pid = run_helper(change_pre_exec, &pe_data, argv, NULL);
+
+- os_close_file(fds[1]);
+ read_output(fds[0], output, output_len);
++ os_close_file(fds[0]);
++ os_close_file(fds[1]);
+
+- CATCH_EINTR(err = waitpid(pid, NULL, 0));
++ if (pid > 0)
++ CATCH_EINTR(err = waitpid(pid, NULL, 0));
+ return(pid);
+ }
+
+Index: linux-2.6.10/arch/um/Kconfig
+===================================================================
+--- linux-2.6.10.orig/arch/um/Kconfig 2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/arch/um/Kconfig 2005-04-05 12:40:36.053907144 +0800
+@@ -139,6 +139,25 @@
+
+ It is safe to say 'Y' here.
+
++config MAGIC_SYSRQ
++ bool "Magic SysRq key"
++ depends on MCONSOLE
++ ---help---
++ If you say Y here, you will have some control over the system even
++ if the system crashes for example during kernel debugging (e.g., you
++ will be able to flush the buffer cache to disk, reboot the system
++ immediately or dump some status information). A key for each of the
++ possible requests is provided.
++
++ This is the feature normally accomplished by pressing a key
++ while holding SysRq (Alt+PrintScreen).
++
++ On UML, this is accomplished by sending a "sysrq" command with
++ mconsole, followed by the letter for the requested command.
++
++ The keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
++ unless you really know what this hack does.
++
+ config HOST_2G_2G
+ bool "2G/2G host address space split"
+ default n
+@@ -153,28 +172,28 @@
+ So, if you do not know what to do here, say 'N'.
+
+ config SMP
+- bool "Symmetric multi-processing support (EXPERIMENTAL)"
+- default n
+- depends on MODE_TT && EXPERIMENTAL
+- help
+- This option enables UML SMP support.
+- It is NOT related to having a real SMP box. Not directly, at least.
++ bool "Symmetric multi-processing support (EXPERIMENTAL)"
++ default n
++ depends on MODE_TT && EXPERIMENTAL
++ help
++ This option enables UML SMP support.
++ It is NOT related to having a real SMP box. Not directly, at least.
++
++ UML implements virtual SMP by allowing as many processes to run
++ simultaneously on the host as there are virtual processors configured.
++
++ Obviously, if the host is a uniprocessor, those processes will
++ timeshare, but, inside UML, will appear to be running simultaneously.
++ If the host is a multiprocessor, then UML processes may run
++ simultaneously, depending on the host scheduler.
++
++ This, however, is supported only in TT mode. So, if you use the SKAS
++ patch on your host, switching to TT mode and enabling SMP usually gives
++ you worse performances.
++ Also, since the support for SMP has been under-developed, there could
++ be some bugs being exposed by enabling SMP.
+
+- UML implements virtual SMP by allowing as many processes to run
+- simultaneously on the host as there are virtual processors configured.
+-
+- Obviously, if the host is a uniprocessor, those processes will
+- timeshare, but, inside UML, will appear to be running simultaneously.
+- If the host is a multiprocessor, then UML processes may run
+- simultaneously, depending on the host scheduler.
+-
+- This, however, is supported only in TT mode. So, if you use the SKAS
+- patch on your host, switching to TT mode and enabling SMP usually gives
+- you worse performances.
+- Also, since the support for SMP has been under-developed, there could
+- be some bugs being exposed by enabling SMP.
+-
+- If you don't know what to do, say N.
++ If you don't know what to do, say N.
+
+ config NR_CPUS
+ int "Maximum number of CPUs (2-32)"
+@@ -282,4 +301,8 @@
+ bool
+ default n
+
++config INPUT
++ bool
++ default n
++
+ source "arch/um/Kconfig.debug"
+Index: linux-2.6.10/arch/um/Makefile
+===================================================================
+--- linux-2.6.10.orig/arch/um/Makefile 2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/Makefile 2005-04-05 12:40:53.158306880 +0800
+@@ -77,6 +77,8 @@
+ echo ' find in the kernel root.'
+ endef
+
++.PHONY: linux
++
+ prepare: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) \
+ $(ARCH_DIR)/kernel/vmlinux.lds.S
+
+Index: linux-2.6.10/fs/hostfs/hostfs.h
+===================================================================
+--- linux-2.6.10.orig/fs/hostfs/hostfs.h 2004-12-25 05:35:24.000000000 +0800
++++ linux-2.6.10/fs/hostfs/hostfs.h 2005-04-05 12:40:36.068904864 +0800
+@@ -16,9 +16,30 @@
+ #define HOSTFS_ATTR_CTIME 64
+ #define HOSTFS_ATTR_ATIME_SET 128
+ #define HOSTFS_ATTR_MTIME_SET 256
++
++/* These two are unused by hostfs. */
+ #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */
+ #define HOSTFS_ATTR_ATTR_FLAG 1024
+
++/* If you are very careful, you'll notice that these two are missing:
++ *
++ * #define ATTR_KILL_SUID 2048
++ * #define ATTR_KILL_SGID 4096
++ *
++ * and this is because they were added in 2.5 development in this patch:
++ *
++ * http://linux.bkbits.net:8080/linux-2.5/
++ * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
++ * |src/.|src/include|src/include/linux|related/include/linux/fs.h
++ *
++ * Actually, they are not needed by most ->setattr() methods - they are set by
++ * callers of notify_change() to notify that the setuid/setgid bits must be
++ * dropped.
++ * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE
++ * is on, and remove the appropriate bits from attr->ia_mode (attr is a
++ * "struct iattr *"). -BlaisorBlade
++ */
++
+ struct hostfs_iattr {
+ unsigned int ia_valid;
+ mode_t ia_mode;
+Index: linux-2.6.10/fs/hostfs/hostfs_kern.c
+===================================================================
+--- linux-2.6.10.orig/fs/hostfs/hostfs_kern.c 2004-12-25 05:34:01.000000000 +0800
++++ linux-2.6.10/fs/hostfs/hostfs_kern.c 2005-04-05 12:40:36.069904712 +0800
+@@ -393,6 +393,7 @@
+ static struct file_operations hostfs_file_fops = {
+ .llseek = generic_file_llseek,
+ .read = generic_file_read,
++ .sendfile = generic_file_sendfile,
+ .write = generic_file_write,
+ .mmap = generic_file_mmap,
+ .open = hostfs_file_open,
+@@ -818,6 +819,10 @@
+ char *name;
+ int err;
+
++ err = inode_change_ok(dentry->d_inode, attr);
++ if (err)
++ return err;
++
+ if(append)
+ attr->ia_valid &= ~ATTR_SIZE;
+
--- /dev/null
+Introduce lock-free versions of d_rehash and d_move.
+
+ fs/dcache.c | 22 ++++++++++++++++++----
+ include/linux/dcache.h | 2 ++
+ 2 files changed, 20 insertions(+), 4 deletions(-)
+
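+Note that "lock-free" here means "the caller already holds dcache_lock":
+__d_rehash() and __d_move() skip only that acquisition, so several
+dentries can be rehashed or moved under a single lock hold. A hedged
+sketch of the intended calling pattern (my_fs_move_pair is a
+hypothetical filesystem helper, not part of the patch):
+
+ #include <linux/dcache.h>
+
+ /* move two cached entries atomically w.r.t. the dcache */
+ static void my_fs_move_pair(struct dentry *d1, struct dentry *t1,
+ 			     struct dentry *d2, struct dentry *t2)
+ {
+ 	spin_lock(&dcache_lock);
+ 	__d_move(d1, t1);	/* still takes rename_lock and d_lock itself */
+ 	__d_move(d2, t2);
+ 	spin_unlock(&dcache_lock);
+ }
+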
+Index: linux-2.6.10/fs/dcache.c
+===================================================================
+--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/dcache.c 2005-03-31 19:16:50.807244880 +0800
+@@ -1116,29 +1116,23 @@
+ spin_unlock(&dcache_lock);
+ }
+
+-static void __d_rehash(struct dentry * entry, struct hlist_head *list)
++void __d_rehash(struct dentry * entry)
+ {
+-
+- entry->d_flags &= ~DCACHE_UNHASHED;
+- hlist_add_head_rcu(&entry->d_hash, list);
++ struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
++
++ spin_lock(&entry->d_lock);
++ entry->d_flags &= ~DCACHE_UNHASHED;
++ hlist_add_head_rcu(&entry->d_hash, list);
++ spin_unlock(&entry->d_lock);
+ }
+-
+-/**
+- * d_rehash - add an entry back to the hash
+- * @entry: dentry to add to the hash
+- *
+- * Adds a dentry to the hash according to its name.
+- */
+
++EXPORT_SYMBOL(__d_rehash);
++
+ void d_rehash(struct dentry * entry)
+ {
+- struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+-
+- spin_lock(&dcache_lock);
+- spin_lock(&entry->d_lock);
+- __d_rehash(entry, list);
+- spin_unlock(&entry->d_lock);
+- spin_unlock(&dcache_lock);
++ spin_lock(&dcache_lock);
++ __d_rehash(entry);
++ spin_unlock(&dcache_lock);
+ }
+
+ #define do_switch(x,y) do { \
+@@ -1213,14 +1207,13 @@
+ * dcache entries should not be moved in this way.
+ */
+
+-void d_move(struct dentry * dentry, struct dentry * target)
++void __d_move(struct dentry * dentry, struct dentry * target)
+ {
+ struct hlist_head *list;
+
+ if (!dentry->d_inode)
+ printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+
+- spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
+ /*
+ * XXXX: do we really need to take target->d_lock?
+@@ -1241,7 +1234,8 @@
+
+ already_unhashed:
+ list = d_hash(target->d_parent, target->d_name.hash);
+- __d_rehash(dentry, list);
++ dentry->d_flags &= ~DCACHE_UNHASHED;
++ hlist_add_head_rcu(&dentry->d_hash, list);
+
+ /* Unhash the target: dput() will then get rid of it */
+ __d_drop(target);
+@@ -1280,6 +1274,14 @@
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
+ write_sequnlock(&rename_lock);
++}
++
++EXPORT_SYMBOL(__d_move);
++
++void d_move(struct dentry *dentry, struct dentry *target)
++{
++ spin_lock(&dcache_lock);
++ __d_move(dentry, target);
+ spin_unlock(&dcache_lock);
+ }
+
+Index: linux-2.6.10/include/linux/dcache.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/include/linux/dcache.h 2005-03-31 19:15:49.684536944 +0800
+@@ -228,6 +228,7 @@
+ * This adds the entry to the hash queues.
+ */
+ extern void d_rehash(struct dentry *);
++extern void __d_rehash(struct dentry *);
+
+ /**
+ * d_add - add dentry to hash queues
+@@ -246,6 +247,7 @@
+
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
++extern void __d_move(struct dentry *, struct dentry *);
+
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry * d_lookup(struct dentry *, struct qstr *);
--- /dev/null
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:03:37.000000000 +0800
++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800
+@@ -365,7 +365,7 @@
+ }
+ }
+
+-static int do_umount(struct vfsmount *mnt, int flags)
++int do_umount(struct vfsmount *mnt, int flags)
+ {
+ struct super_block * sb = mnt->mnt_sb;
+ int retval;
+@@ -458,6 +458,8 @@
+ return retval;
+ }
+
++EXPORT_SYMBOL(do_umount);
++
+ /*
+ * Now umount can handle mount points as well as block devices.
+ * This is important for filesystems which use unnamed block devices.
+Index: linux-2.6.10/include/linux/mount.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mount.h 2005-03-31 17:15:40.000000000 +0800
++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:59:41.914943472 +0800
+@@ -70,6 +70,7 @@
+ extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
+ const char *name, void *data);
+
++extern int do_umount(struct vfsmount *mnt, int flags);
+ struct nameidata;
+
+ extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
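+
+Exporting do_umount() lets an external filesystem module force-unmount
+vfsmounts it manages. A hypothetical module fragment against the
+exported signature (my_fs_force_umount is illustrative only; MNT_FORCE
+is the standard umount2() flag from <linux/fs.h>):
+
+ #include <linux/kernel.h>
+ #include <linux/fs.h>
+ #include <linux/mount.h>
+
+ static int my_fs_force_umount(struct vfsmount *mnt)
+ {
+ 	int err;
+
+ 	err = do_umount(mnt, MNT_FORCE);
+ 	if (err)
+ 		printk(KERN_WARNING "forced umount failed: %d\n", err);
+ 	return err;
+ }
+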
--- /dev/null
+Index: linux-2.6.10/fs/open.c
+===================================================================
+--- linux-2.6.10.orig/fs/open.c 2005-03-31 15:35:27.683586616 +0800
++++ linux-2.6.10/fs/open.c 2005-03-31 17:13:48.440535208 +0800
+@@ -217,11 +217,12 @@
+ struct inode * inode;
+ int error;
+
++ intent_init(&nd.intent.open, IT_GETATTR);
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -476,6 +477,7 @@
+ kernel_cap_t old_cap;
+ int res;
+
++ intent_init(&nd.intent.open, IT_GETATTR);
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+
+@@ -499,7 +501,7 @@
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode, &nd);
+ /* SuS v2 requires we report a read only fs too */
+@@ -521,7 +523,8 @@
+ struct nameidata nd;
+ int error;
+
+- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++ intent_init(&nd.intent.open, IT_GETATTR);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+ if (error)
+ goto out;
+
+@@ -574,7 +577,8 @@
+ struct nameidata nd;
+ int error;
+
+- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++ intent_init(&nd.intent.open, IT_GETATTR);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+ if (error)
+ goto out;
+
+@@ -759,6 +763,7 @@
+ {
+ int namei_flags, error;
+ struct nameidata nd;
++ intent_init(&nd.intent.open, IT_OPEN);
+
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+@@ -768,14 +773,14 @@
+
+ error = open_namei(filename, namei_flags, mode, &nd);
+ if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
++ return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open);
+
+ return ERR_PTR(error);
+ }
+
+ EXPORT_SYMBOL(filp_open);
+
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -787,6 +792,7 @@
+ goto cleanup_dentry;
+ f->f_flags = flags;
+ f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
++ f->f_it = it;
+ inode = dentry->d_inode;
+ if (f->f_mode & FMODE_WRITE) {
+ error = get_write_access(inode);
+@@ -805,6 +811,7 @@
+ error = f->f_op->open(inode,f);
+ if (error)
+ goto cleanup_all;
++ intent_release(it);
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+@@ -830,11 +837,20 @@
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+ }
+
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++ struct open_intent it;
++ intent_init(&it, IT_LOOKUP);
++
++ return dentry_open_it(dentry, mnt, flags, &it);
++}
++
+ EXPORT_SYMBOL(dentry_open);
+
+ /*
+Index: linux-2.6.10/fs/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/xattr.c 2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/fs/xattr.c 2005-03-31 17:03:37.148465728 +0800
+@@ -164,7 +164,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ return error;
+ error = getxattr(nd.dentry, name, value, size);
+@@ -179,7 +180,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk_link(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_link_it(path, &nd);
+ if (error)
+ return error;
+ error = getxattr(nd.dentry, name, value, size);
+@@ -245,7 +247,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ return error;
+ error = listxattr(nd.dentry, list, size);
+@@ -259,7 +262,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk_link(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_link_it(path, &nd);
+ if (error)
+ return error;
+ error = listxattr(nd.dentry, list, size);
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c 2005-03-31 15:35:26.294797744 +0800
++++ linux-2.6.10/fs/namei.c 2005-03-31 17:12:26.403006808 +0800
+@@ -288,8 +288,19 @@
+ return 0;
+ }
+
++void intent_release(struct open_intent *it)
++{
++ if (!it)
++ return;
++ if (it->magic != INTENT_MAGIC)
++ return;
++ if (it->op_release)
++ it->op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++ intent_release(&nd->intent.open);
+ dput(nd->dentry);
+ mntput(nd->mnt);
+ }
+@@ -448,6 +459,7 @@
+ static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+ int res = 0;
++ struct open_intent it = nd->intent.open;
+ char *name;
+ if (IS_ERR(link))
+ goto fail;
+@@ -458,6 +470,10 @@
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
++ intent_release(&nd->intent.open);
++ intent_init(&nd->intent.open, it.op);
++ nd->intent.open.flags = it.flags;
++ nd->intent.open.create_mode = it.create_mode;
+ res = link_path_walk(link, nd);
+ out:
+ if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -876,8 +892,14 @@
+ return err;
+ }
+
++int fastcall path_walk_it(const char * name, struct nameidata *nd)
++{
++ current->total_link_count = 0;
++ return link_path_walk(name, nd);
++}
+ int fastcall path_walk(const char * name, struct nameidata *nd)
+ {
++ intent_init(&nd->intent.open, IT_LOOKUP);
+ current->total_link_count = 0;
+ return link_path_walk(name, nd);
+ }
+@@ -886,7 +908,7 @@
+ /* returns 1 if everything is done */
+ static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
+ {
+- if (path_walk(name, nd))
++ if (path_walk_it(name, nd))
+ return 0; /* something went wrong... */
+
+ if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
+@@ -947,7 +969,18 @@
+ }
+ }
+
+-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++static inline int it_mode_from_lookup_flags(int flags)
++{
++ int mode = IT_LOOKUP;
++
++ if (flags & LOOKUP_OPEN)
++ mode = IT_OPEN;
++ if (flags & LOOKUP_CREATE)
++ mode |= IT_CREAT;
++ return mode;
++}
++
++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd)
+ {
+ int retval;
+
+@@ -982,6 +1015,12 @@
+ return retval;
+ }
+
++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++ return path_lookup_it(name, flags, nd);
++}
++
+ /*
+ * Restricted form of lookup. Doesn't follow links, single-component only,
+ * needs parent already locked. Doesn't follow mounts.
+@@ -1032,7 +1071,7 @@
+ }
+
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+ unsigned long hash;
+ struct qstr this;
+@@ -1052,11 +1091,16 @@
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return __lookup_hash(&this, base, nd);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++ return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+ * namei()
+ *
+@@ -1068,18 +1112,24 @@
+ * that namei follows links, while lnamei does not.
+ * SMP-safe
+ */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+
+ if (!IS_ERR(tmp)) {
+- err = path_lookup(tmp, flags, nd);
++ err = path_lookup_it(tmp, flags, nd);
+ putname(tmp);
+ }
+ return err;
+ }
+
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++ return __user_walk_it(name, flags, nd);
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -1370,7 +1420,7 @@
+ * The simplest case - just a plain lookup.
+ */
+ if (!(flag & O_CREAT)) {
+- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
++ error = path_lookup_it(pathname, lookup_flags(flag), nd);
+ if (error)
+ return error;
+ goto ok;
+@@ -1379,7 +1429,8 @@
+ /*
+ * Create - we need to know the parent.
+ */
+- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
++ nd->intent.open.op |= IT_CREAT;
++ error = path_lookup_it(pathname, LOOKUP_PARENT, nd);
+ if (error)
+ return error;
+
+@@ -2344,6 +2395,7 @@
+ }
+ }
+
++
+ int page_symlink(struct inode *inode, const char *symname, int len)
+ {
+ struct address_space *mapping = inode->i_mapping;
+@@ -2405,8 +2457,10 @@
+ EXPORT_SYMBOL(page_symlink);
+ EXPORT_SYMBOL(page_symlink_inode_operations);
+ EXPORT_SYMBOL(path_lookup);
++EXPORT_SYMBOL(path_lookup_it);
+ EXPORT_SYMBOL(path_release);
+ EXPORT_SYMBOL(path_walk);
++EXPORT_SYMBOL(path_walk_it);
+ EXPORT_SYMBOL(permission);
+ EXPORT_SYMBOL(unlock_rename);
+ EXPORT_SYMBOL(vfs_create);
+Index: linux-2.6.10/fs/stat.c
+===================================================================
+--- linux-2.6.10.orig/fs/stat.c 2004-12-25 05:34:02.000000000 +0800
++++ linux-2.6.10/fs/stat.c 2005-03-31 17:03:37.144466336 +0800
+@@ -60,15 +60,15 @@
+ }
+ return 0;
+ }
+-
+ EXPORT_SYMBOL(vfs_getattr);
+
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+- error = user_path_walk(name, &nd);
++ error = user_path_walk_it(name, &nd);
+ if (!error) {
+ error = vfs_getattr(nd.mnt, nd.dentry, stat);
+ path_release(&nd);
+@@ -82,8 +82,9 @@
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+- error = user_path_walk_link(name, &nd);
++ error = user_path_walk_link_it(name, &nd);
+ if (!error) {
+ error = vfs_getattr(nd.mnt, nd.dentry, stat);
+ path_release(&nd);
+@@ -97,9 +98,12 @@
+ {
+ struct file *f = fget(fd);
+ int error = -EBADF;
++ struct nameidata nd;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+ if (f) {
+ error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++ intent_release(&nd.intent.open);
+ fput(f);
+ }
+ return error;
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 15:35:26.295797592 +0800
++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:03:37.145466184 +0800
+@@ -113,6 +113,7 @@
+
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++ memset(old_nd, 0, sizeof(*old_nd));
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
+ mnt->mnt_parent = mnt;
+Index: linux-2.6.10/fs/exec.c
+===================================================================
+--- linux-2.6.10.orig/fs/exec.c 2005-03-31 16:20:09.692859232 +0800
++++ linux-2.6.10/fs/exec.c 2005-03-31 17:03:37.147465880 +0800
+@@ -125,8 +125,9 @@
+ struct nameidata nd;
+ int error;
+
++ intent_init(&nd.intent.open, IT_OPEN);
+ nd.intent.open.flags = FMODE_READ;
+- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ error = user_path_walk_it(library, &nd);
+ if (error)
+ goto out;
+
+@@ -138,7 +139,7 @@
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -485,8 +486,9 @@
+ int err;
+ struct file *file;
+
++ intent_init(&nd.intent.open, IT_OPEN);
+ nd.intent.open.flags = FMODE_READ;
+- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
+ file = ERR_PTR(err);
+
+ if (!err) {
+@@ -499,7 +501,7 @@
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.317794248 +0800
++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:03:37.135467704 +0800
+@@ -600,6 +600,7 @@
+ spinlock_t f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
++ struct open_intent *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -1245,6 +1246,7 @@
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 15:35:27.689585704 +0800
++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800
+@@ -2,14 +2,41 @@
+ #define _LINUX_NAMEI_H
+
+ #include <linux/linkage.h>
++#include <linux/string.h>
+
+ struct vfsmount;
+
++/* intent opcodes */
++#define IT_OPEN (1)
++#define IT_CREAT (1<<1)
++#define IT_READDIR (1<<2)
++#define IT_GETATTR (1<<3)
++#define IT_LOOKUP (1<<4)
++#define IT_UNLINK (1<<5)
++#define IT_TRUNC (1<<6)
++#define IT_GETXATTR (1<<7)
++
++#define INTENT_MAGIC 0x19620323
++
++
+ struct open_intent {
++ int magic;
++ int op;
++ void (*op_release)(struct open_intent *);
+ int flags;
+ int create_mode;
++ union {
++ void *fs_data; /* FS-specific intent data */
++ } d;
+ };
+
++static inline void intent_init(struct open_intent *it, int op)
++{
++ memset(it, 0, sizeof(*it));
++ it->magic = INTENT_MAGIC;
++ it->op = op;
++}
++
+ enum { MAX_NESTED_LINKS = 8 };
+
+ struct nameidata {
+@@ -55,14 +82,22 @@
+ #define LOOKUP_ACCESS (0x0400)
+
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *));
+ #define user_path_walk(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_it(name,nd) \
++ __user_walk_it(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+ __user_walk(name, 0, nd)
++#define user_path_walk_link_it(name,nd) \
++ __user_walk_it(name, 0, nd)
+ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
++extern int FASTCALL(path_walk_it(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
++extern void intent_release(struct open_intent *);
+ extern void path_release_on_umount(struct nameidata *);
+
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+Index: linux-2.6.10/include/linux/mount.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mount.h 2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:15:40.613482328 +0800
+@@ -36,6 +36,8 @@
+ struct list_head mnt_list;
+ struct list_head mnt_fslink; /* link in fs-specific expiry list */
+ struct namespace *mnt_namespace; /* containing namespace */
++ struct list_head mnt_lustre_list; /* GNS mount list */
++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */
+ };
+
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
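
The intent machinery above is the core of this patch: path_lookup_it() and friends let the caller pre-load an open_intent describing why the path is being resolved, intent_release() gives the filesystem a hook to drop whatever it stashed during the lookup, and f_it carries the intent into ->open(). A sketch of how a filesystem method might consume it; all example_* names are hypothetical, not part of the patch:

#include <linux/dcache.h>
#include <linux/namei.h>

static void example_intent_release(struct open_intent *it)
{
	/* drop whatever the lookup stashed in it->d.fs_data */
	it->d.fs_data = NULL;
}

static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	struct open_intent *it = nd ? &nd->intent.open : NULL;

	if (it && it->magic == INTENT_MAGIC && (it->op & IT_OPEN)) {
		/* a network fs would fold the open into the lookup RPC
		 * here and keep the result until intent_release() */
		it->d.fs_data = NULL;		/* fs-private cookie */
		it->op_release = example_intent_release;
	}
	return 1;	/* dentry still valid */
}
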
--- /dev/null
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:12:26.403006808 +0800
++++ linux-2.6.10/fs/namei.c 2005-03-31 17:20:37.388365688 +0800
+@@ -783,8 +783,11 @@
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
++ int saved_flags = nd->flags;
+ mntget(next.mnt);
++ nd->flags |= LOOKUP_LINK_NOTLAST;
+ err = do_follow_link(next.dentry, nd);
++ nd->flags = saved_flags;
+ dput(next.dentry);
+ mntput(next.mnt);
+ if (err)
+@@ -830,7 +833,9 @@
+ if (err < 0)
+ break;
+ }
++ nd->flags |= LOOKUP_LAST;
+ err = do_lookup(nd, &this, &next, atomic);
++ nd->flags &= ~LOOKUP_LAST;
+ if (err)
+ break;
+ follow_mount(&next.mnt, &next.dentry);
+@@ -876,10 +881,14 @@
+ */
+ if (nd->dentry && nd->dentry->d_sb &&
+ (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+- err = -ESTALE;
++ nd->flags |= LOOKUP_LAST;
++ err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ /* Note: we do not d_invalidate() */
+- if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
++ if (err) {
++ err = -ESTALE;
+ break;
++ }
+ }
+ return_base:
+ return 0;
+@@ -1446,7 +1455,9 @@
+ dir = nd->dentry;
+ nd->flags &= ~LOOKUP_PARENT;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+
+ do_last:
+ error = PTR_ERR(dentry);
+@@ -1559,7 +1570,9 @@
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ putname(nd->last.name);
+ goto do_last;
+ }
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800
++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:21:41.178668088 +0800
+@@ -73,7 +73,9 @@
+ #define LOOKUP_PARENT 16
+ #define LOOKUP_NOALT 32
+ #define LOOKUP_ATOMIC 64
+-
++#define LOOKUP_LAST 128
++#define LOOKUP_LINK_NOTLAST 256
+++
+ /*
+ * Intent data
+ */
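
LOOKUP_LAST and LOOKUP_LINK_NOTLAST bracket the do_lookup() and do_follow_link() calls so that a d_revalidate method can tell the final path component (where the intent applies) from an intermediate one. A sketch, with hypothetical example_* helpers standing in for the fs-specific work:

#include <linux/dcache.h>
#include <linux/namei.h>

static int example_revalidate_it(struct dentry *d, struct open_intent *it);
static int example_revalidate_plain(struct dentry *d);

static int example_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	/* last component of the walk, and not inside a symlink found
	 * on an intermediate component */
	if (nd && (nd->flags & LOOKUP_LAST) &&
	    !(nd->flags & LOOKUP_LINK_NOTLAST))
		return example_revalidate_it(dentry, &nd->intent.open);
	return example_revalidate_plain(dentry);
}
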
--- /dev/null
+ fs/inode.c | 1
+ fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++---------------
+ include/linux/fs.h | 11 ++++----
+ 3 files changed, 54 insertions(+), 24 deletions(-)
+
+Index: linux-2.6.10/fs/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/inode.c 2004-12-25 05:35:40.000000000 +0800
++++ linux-2.6.10/fs/inode.c 2005-03-31 18:03:53.551688872 +0800
+@@ -166,6 +166,7 @@
+ }
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
++ dynlock_init(&inode->i_dcache_lock);
+ }
+ return inode;
+ }
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:57:10.767921312 +0800
++++ linux-2.6.10/fs/namei.c 2005-03-31 18:05:52.839554360 +0800
+@@ -104,6 +104,38 @@
+ * any extra contention...
+ */
+
++void *lock_dir(struct inode *dir, struct qstr *name)
++{
++ unsigned long hash;
++
++ if (!IS_PDIROPS(dir)) {
++ down(&dir->i_sem);
++ return 0;
++ }
++
++ /* OK, the fs understands parallel directory operations,
++ * so we acquire a lock on the hash of the requested
++ * filename to prevent concurrent operations on the
++ * same name -bzzz */
++
++ /* calculate name hash */
++ hash = full_name_hash(name->name, name->len);
++
++ /* lock this hash */
++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC);
++}
++EXPORT_SYMBOL(lock_dir);
++
++void unlock_dir(struct inode *dir, void *lock)
++{
++ if (!IS_PDIROPS(dir)) {
++ up(&dir->i_sem);
++ return;
++ }
++ dynlock_unlock(&dir->i_dcache_lock, lock);
++}
++EXPORT_SYMBOL(unlock_dir);
++
+ /* In order to reduce some races, while at the same time doing additional
+ * checking and hopefully speeding things up, we copy filenames to the
+ * kernel data space before using them..
+@@ -390,8 +422,9 @@
+ {
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
++ void *lock;
+
+- down(&dir->i_sem);
++ lock = lock_dir(dir, name);
+ /*
+ * First re-do the cached lookup just in case it was created
+ * while we waited for the directory semaphore..
+@@ -417,7 +450,7 @@
+ else
+ result = dentry;
+ }
+- up(&dir->i_sem);
++ unlock_dir(dir, lock);
+ return result;
+ }
+
+@@ -425,7 +458,7 @@
+ * Uhhuh! Nasty case: the cache was re-populated while
+ * we waited on the semaphore. Need to revalidate.
+ */
+- up(&dir->i_sem);
++ unlock_dir(dir, lock);
+ if (result->d_op && result->d_op->d_revalidate) {
+ if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
+ dput(result);
+@@ -1461,7 +1494,7 @@
+
+ dir = nd->dentry;
+ nd->flags &= ~LOOKUP_PARENT;
+- down(&dir->d_inode->i_sem);
++ nd->lock = lock_dir(dir->d_inode, &nd->last);
+ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
+ nd->flags &= ~LOOKUP_LAST;
+@@ -1469,7 +1502,7 @@
+ do_last:
+ error = PTR_ERR(dentry);
+ if (IS_ERR(dentry)) {
+- up(&dir->d_inode->i_sem);
++ unlock_dir(dir->d_inode, nd->lock);
+ goto exit;
+ }
+
+@@ -1478,7 +1511,7 @@
+ if (!IS_POSIXACL(dir->d_inode))
+ mode &= ~current->fs->umask;
+ error = vfs_create(dir->d_inode, dentry, mode, nd);
+- up(&dir->d_inode->i_sem);
++ unlock_dir(dir->d_inode, nd->lock);
+ dput(nd->dentry);
+ nd->dentry = dentry;
+ if (error)
+@@ -1492,7 +1525,7 @@
+ /*
+ * It already exists.
+ */
+- up(&dir->d_inode->i_sem);
++ unlock_dir(dir->d_inode, nd->lock);
+
+ error = -EEXIST;
+ if (flag & O_EXCL)
+@@ -1576,7 +1609,7 @@
+ goto exit;
+ }
+ dir = nd->dentry;
+- down(&dir->d_inode->i_sem);
++ nd->lock = lock_dir(dir->d_inode, &nd->last);
+ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
+ nd->flags &= ~LOOKUP_LAST;
+@@ -1596,7 +1629,7 @@
+ {
+ struct dentry *dentry;
+
+- down(&nd->dentry->d_inode->i_sem);
++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last);
+ dentry = ERR_PTR(-EEXIST);
+ if (nd->last_type != LAST_NORM)
+ goto fail;
+@@ -1688,7 +1721,7 @@
+ }
+ dput(dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+ path_release(&nd);
+ out:
+@@ -1747,7 +1780,7 @@
+ error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
+ dput(dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+ path_release(&nd);
+ out:
+@@ -1852,14 +1885,14 @@
+ error = -EBUSY;
+ goto exit1;
+ }
+- down(&nd.dentry->d_inode->i_sem);
++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+ dentry = lookup_hash(&nd.last, nd.dentry);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_rmdir(nd.dentry->d_inode, dentry);
+ dput(dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ exit1:
+ path_release(&nd);
+ exit:
+@@ -1925,7 +1958,7 @@
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
+- down(&nd.dentry->d_inode->i_sem);
++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+ dentry = lookup_hash(&nd.last, nd.dentry);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1939,7 +1972,7 @@
+ exit2:
+ dput(dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ if (inode)
+ iput(inode); /* truncate the inode here */
+ exit1:
+@@ -2005,7 +2038,7 @@
+ error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
+ dput(dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+ path_release(&nd);
+ out:
+@@ -2094,7 +2127,7 @@
+ error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+ dput(new_dentry);
+ }
+- up(&nd.dentry->d_inode->i_sem);
++ unlock_dir(nd.dentry->d_inode, nd.lock);
+ out_release:
+ path_release(&nd);
+ out:
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:57:13.330531736 +0800
++++ linux-2.6.10/include/linux/fs.h 2005-03-31 18:08:59.645155592 +0800
+@@ -19,6 +19,7 @@
+ #include <linux/prio_tree.h>
+ #include <linux/kobject.h>
+ #include <asm/atomic.h>
++#include <linux/dynlocks.h>
+
+ struct iovec;
+ struct nameidata;
+@@ -151,7 +152,7 @@
+ #define S_DIRSYNC 64 /* Directory modifications are synchronous */
+ #define S_NOCMTIME 128 /* Do not update file c/mtime */
+ #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
+-
++#define S_PDIROPS 512 /* Parallel directory operations */
+ /*
+ * Note that nosuid etc flags are inode-specific: setting some file-system
+ * flags just means all the inodes inherit those flags by default. It might be
+@@ -181,6 +182,7 @@
+ #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME)
+ #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
+ #define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND)
++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS)
+
+ #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
+ #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
+@@ -482,6 +484,7 @@
+
+ atomic_t i_writecount;
+ void *i_security;
++ struct dynlock i_dcache_lock; /* for parallel directory ops */
+ union {
+ void *generic_ip;
+ } u;
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800
++++ linux-2.6.10/include/linux/namei.h 2005-03-31 18:10:30.237383480 +0800
+@@ -63,7 +63,8 @@
+ int last_type;
+ unsigned depth;
+ char *saved_names[MAX_NESTED_LINKS + 1];
+-
++
++ void *lock;
+ /* Intent data */
+ union {
+ struct open_intent open;
+@@ -91,7 +92,7 @@
+ #define LOOKUP_ATOMIC 64
+ #define LOOKUP_LAST 128
+ #define LOOKUP_LINK_NOTLAST 256
+-+
++
+ /*
+ * Intent data
+ */
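
With S_PDIROPS set, lock_dir() hashes the name being operated on and takes a dynlock on that hash instead of the whole-directory i_sem, so operations on different names in the same directory can run in parallel. A sketch of the resulting calling convention; the example_* name is hypothetical:

#include <linux/fs.h>
#include <linux/namei.h>

static int example_create(struct inode *dir, struct dentry *dentry, int mode)
{
	/* per-name lock under S_PDIROPS, plain i_sem otherwise */
	void *lock = lock_dir(dir, &dentry->d_name);
	int err = vfs_create(dir, dentry, mode, NULL);

	unlock_dir(dir, lock);
	return err;
}
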
--- /dev/null
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:43:42.417809208 +0800
++++ linux-2.6.10/fs/namei.c 2005-03-31 17:47:14.292599344 +0800
+@@ -474,6 +474,7 @@
+ intent_init(&nd->intent.open, it.op);
+ nd->intent.open.flags = it.flags;
+ nd->intent.open.create_mode = it.create_mode;
++ nd->intent.open.create = it.create;
+ res = link_path_walk(link, nd);
+ out:
+ if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -866,14 +867,20 @@
+ lookup_parent:
+ nd->last = this;
+ nd->last_type = LAST_NORM;
+- if (this.name[0] != '.')
+- goto return_base;
+- if (this.len == 1)
+- nd->last_type = LAST_DOT;
+- else if (this.len == 2 && this.name[1] == '.')
+- nd->last_type = LAST_DOTDOT;
+- else
+- goto return_base;
++ if (this.name[0] == '.') {
++ if (this.len == 1)
++ nd->last_type = LAST_DOT;
++ else if (this.len == 2 && this.name[1] == '.')
++ nd->last_type = LAST_DOTDOT;
++ }
++
++ if ((nd->last_type == LAST_NORM) && inode->i_op &&
++ inode->i_op->endparentlookup) {
++ err = inode->i_op->endparentlookup(nd);
++ if (err)
++ break;
++ }
++ goto return_base;
+ return_reval:
+ /*
+ * We bypassed the ordinary revalidation routines.
+@@ -1646,9 +1653,16 @@
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+- error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_MKNOD);
++ nd.intent.open.create_mode = mode;
++ nd.intent.open.create.dev = dev;
++
++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
++
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+
+@@ -1675,6 +1689,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1717,9 +1732,13 @@
+ struct dentry *dentry;
+ struct nameidata nd;
+
+- error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_MKDIR);
++ nd.intent.open.create_mode = mode;
++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
+ dentry = lookup_create(&nd, 1);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1729,6 +1748,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1814,9 +1834,12 @@
+ if(IS_ERR(name))
+ return PTR_ERR(name);
+
+- error = path_lookup(name, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_RMDIR);
++ error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+ if (error)
+ goto exit;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto exit1;
+
+ switch(nd.last_type) {
+ case LAST_DOTDOT:
+@@ -1892,9 +1915,13 @@
+ if(IS_ERR(name))
+ return PTR_ERR(name);
+
+- error = path_lookup(name, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_UNLINK);
++ error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+ if (error)
+ goto exit;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto exit1;
++
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
+@@ -1965,9 +1992,13 @@
+ struct dentry *dentry;
+ struct nameidata nd;
+
+- error = path_lookup(to, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_SYMLINK);
++ nd.intent.open.create.link = from;
++ error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1975,6 +2006,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(to);
+@@ -2046,9 +2078,13 @@
+ error = __user_walk(oldname, 0, &old_nd);
+ if (error)
+ goto exit;
+- error = path_lookup(to, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_LINK);
++ nd.intent.open.create.source_nd = &old_nd;
++ error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out_release;
+ error = -EXDEV;
+ if (old_nd.mnt != nd.mnt)
+ goto out_release;
+@@ -2229,9 +2265,18 @@
+ if (error)
+ goto exit;
+
+- error = path_lookup(newname, LOOKUP_PARENT, &newnd);
++ error = -EBUSY;
++ if (oldnd.last_type != LAST_NORM)
++ goto exit1;
++
++ intent_init(&newnd.intent.open, IT_RENAME);
++ newnd.intent.open.create.source_nd = &oldnd;
++ error = path_lookup_it(newname, LOOKUP_PARENT, &newnd);
+ if (error)
+ goto exit1;
++ if (newnd.intent.open.flags & IT_STATUS_RAW) {
++ goto exit2;
++ }
+
+ error = -EXDEV;
+ if (oldnd.mnt != newnd.mnt)
+@@ -2239,8 +2284,6 @@
+
+ old_dir = oldnd.dentry;
+ error = -EBUSY;
+- if (oldnd.last_type != LAST_NORM)
+- goto exit2;
+
+ new_dir = newnd.dentry;
+ if (newnd.last_type != LAST_NORM)
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:03:37.000000000 +0800
++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:46:35.715463960 +0800
+@@ -956,6 +956,7 @@
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
++ int (*endparentlookup) (struct nameidata *);
+ };
+
+ struct seq_file;
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:43:42.472800848 +0800
++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800
+@@ -15,8 +15,19 @@
+ #define IT_UNLINK (1<<5)
+ #define IT_TRUNC (1<<6)
+ #define IT_GETXATTR (1<<7)
+-
++#define IT_RMDIR (1<<8)
++#define IT_LINK (1<<9)
++#define IT_RENAME (1<<10)
++#define IT_MKDIR (1<<11)
++#define IT_MKNOD (1<<12)
++#define IT_SYMLINK (1<<13)
++#define IT_CHDIR (1<<14)
++
+ #define INTENT_MAGIC 0x19620323
++#define IT_STATUS_RAW (1<<10) /* Set in the intent's flags on return from
++ lookup to indicate the operation has already
++ been performed and the lookup return value
++ is the status of that operation */
+
+
+ struct open_intent {
+@@ -26,6 +37,11 @@
+ int flags;
+ int create_mode;
+ union {
++ unsigned dev; /* For mknod */
++ char *link; /* For symlink */
++ struct nameidata *source_nd; /* For link/rename */
++ } create;
++ union {
+ void *fs_data; /* FS-specific intent data */
+ } d;
+ };
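
->endparentlookup() gives the filesystem a chance, at the end of a LOOKUP_PARENT walk, to carry out the whole operation named by the intent in one step; it reports this by setting IT_STATUS_RAW in the intent's flags, after which the syscall paths above skip lookup_create() and friends and just return its status. A sketch of a hypothetical implementation (example_remote_op stands in for an fs-specific RPC):

#include <linux/fs.h>
#include <linux/namei.h>

static int example_remote_op(struct nameidata *nd);	/* hypothetical RPC */

static int example_endparentlookup(struct nameidata *nd)
{
	struct open_intent *it = &nd->intent.open;
	int err;

	if (!(it->op & (IT_MKDIR | IT_UNLINK | IT_RMDIR)))
		return 0;	/* fall back to the normal VFS path */

	err = example_remote_op(nd);	/* performs the operation */
	it->flags |= IT_STATUS_RAW;	/* tell the VFS it is done */
	return err;
}
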
--- /dev/null
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800
++++ linux-2.6.10/fs/namespace.c 2005-03-31 18:19:21.976546840 +0800
+@@ -62,6 +62,7 @@
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_fslink);
++ INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+ if (name) {
+ int size = strlen(name)+1;
+ char *newname = kmalloc(size, GFP_KERNEL);
+@@ -177,6 +178,9 @@
+ {
+ struct super_block *sb = mnt->mnt_sb;
+ dput(mnt->mnt_root);
++ spin_lock(&dcache_lock);
++ list_del(&mnt->mnt_lustre_list);
++ spin_unlock(&dcache_lock);
+ free_vfsmnt(mnt);
+ deactivate_super(sb);
+ }
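
The mnt_lustre_list and mnt_last_used fields added to struct vfsmount, together with the dcache_lock-protected unlink here, are the hooks for GNS auto-umount: a scanning thread can walk its list of GNS mounts and pick out the ones idle longer than a timeout. A rough sketch of such a scan; the list head and timeout are hypothetical:

#include <linux/dcache.h>
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/mount.h>

static void example_gns_scan(struct list_head *gns_mounts,
			     unsigned long timeout)
{
	struct vfsmount *mnt, *next;

	spin_lock(&dcache_lock);
	list_for_each_entry_safe(mnt, next, gns_mounts, mnt_lustre_list) {
		if (time_after(jiffies, mnt->mnt_last_used + timeout)) {
			/* candidate: take a reference, then umount it
			 * outside the spinlock with do_umount()/mntput() */
		}
	}
	spin_unlock(&dcache_lock);
}
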