From ed1792ed91ebd96acf00fa99a61b3bb32c64d32f Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 6 Apr 2005 16:05:44 +0000 Subject: [PATCH] Branch: HEAD add FC3 kernel patches --- .../patches/dev_read_only-2.6.10-fc3.patch | 81 + .../patches/dynamic-locks-2.6.10-fc3.patch | 278 + .../patches/export-ext3-2.6.10-fc3.patch | 33 + .../patches/export-fedro-2.6.10.patch | 84 + .../patches/export_symbols-ext3-2.6.10-fc3.patch | 17 + .../patches/ext3-extents-2.6.10-fc3.patch | 2846 +++++ .../patches/ext3-extents-in-ea-2.6.10-fc3.patch | 361 + .../ext3-extents-in-ea-ioctl-2.6.10-fc3.patch | 230 + .../patches/ext3-mds-num-2.6.10-fc3.patch | 281 + .../patches/ext3-pdirops-2.6.10-fc3.patch | 1202 +++ .../patches/ext3-wantedi-2.6.10-fc3.patch | 192 + .../patches/hostfs_readdir_large.patch | 32 + .../kernel_patches/patches/iopen-2.6.10-fc3.patch | 476 + .../kernel_patches/patches/jbd-2.6.10-jcberr.patch | 222 + .../patches/jbd-buffer-release-2.6.10-fc3.patch | 399 + lustre/kernel_patches/patches/kgdb-ga.patch | 6358 +++++++++++ .../patches/linux-2.6.10-CITI_NFS4_ALL-1.patch | 10703 +++++++++++++++++++ .../patches/linux-2.6.10-fc3-left.patch | 1477 +++ .../patches/linux-2.6.10-fc3-lkcd.patch | 10676 ++++++++++++++++++ lustre/kernel_patches/patches/uml-2.6.10-fc3.patch | 3746 +++++++ .../vfs-dcache_locking-vanilla-2.6.10-fc3.patch | 113 + .../vfs-gns_export_doumount-2.6.10-fc3.patch | 34 + .../vfs-intent_api-vanilla-2.6.10-fc3.patch | 557 + .../vfs-lookup_last-vanilla-2.6.10-fc3.patch | 78 + .../patches/vfs-pdirops-2.6.10-fc3.patch | 274 + .../patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch | 235 + .../patches/vfs_gns-2.6.10-fc3.patch | 22 + 27 files changed, 41007 insertions(+) create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/export-fedro-2.6.10.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch create mode 100755 lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/hostfs_readdir_large.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch create mode 100644 lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch create mode 100644 lustre/kernel_patches/patches/uml-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch create mode 100644 
lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch new file mode 100644 index 0000000..1aec6f6 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch @@ -0,0 +1,81 @@ + drivers/block/ll_rw_blk.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/blkdev.h | 1 + 2 files changed, 50 insertions(+) + +Index: linux-2.6.10/drivers/block/ll_rw_blk.c +=================================================================== +--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c 2004-12-25 05:33:59.000000000 +0800 ++++ linux-2.6.10/drivers/block/ll_rw_blk.c 2005-04-05 15:42:58.075467024 +0800 +@@ -2679,6 +2679,13 @@ + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) + goto end_io; + ++ /* this is cfs's dev_rdonly check */ ++ if (bio->bi_rw == WRITE && ++ dev_check_rdonly(bio->bi_bdev->bd_dev)) { ++ bio_endio(bio, bio->bi_size, 0); ++ break; ++ } ++ + block_wait_queue_running(q); + + /* +@@ -3287,6 +3294,58 @@ + return queue_var_show(max_hw_sectors_kb, (page)); + } + ++#define MAX_RDONLY_DEVS 16 ++ ++static dev_t rdonly_devs[MAX_RDONLY_DEVS] = {0, }; ++ ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. ++ */ ++void dev_set_rdonly(struct block_device *bdev, int no_write) ++{ ++ if (no_write >= MAX_RDONLY_DEVS) { ++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n", ++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS); ++ return; ++ } ++ ++ if (bdev) { ++ printk(KERN_WARNING "Turning device %s read-only at %d\n", ++ bdev->bd_disk ? 
bdev->bd_disk->disk_name : "?", ++ no_write); ++ rdonly_devs[no_write] = bdev->bd_dev; ++ } ++} ++ ++void dev_clear_rdonly(int no_write) ++{ ++ if (no_write >= MAX_RDONLY_DEVS) { ++ printk(KERN_ALERT "%s:%d illegal arg %d (max %d)\n", ++ __FILE__, __LINE__, no_write, MAX_RDONLY_DEVS); ++ return; ++ } ++ ++ if (rdonly_devs[no_write] == 0) ++ return; ++ ++ printk(KERN_WARNING "Clearing read-only at %d\n", no_write); ++ rdonly_devs[no_write] = 0; ++} ++ ++int dev_check_rdonly(dev_t dev) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_RDONLY_DEVS; i++) ++ if (rdonly_devs[i] == dev) ++ return 1; ++ return 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); + + static struct queue_sysfs_entry queue_requests_entry = { + .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch b/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch new file mode 100644 index 0000000..166deb6 --- /dev/null +++ b/lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch @@ -0,0 +1,278 @@ + include/linux/dynlocks.h | 33 ++++++++++ + lib/Makefile | 4 - + lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 187 insertions(+), 2 deletions(-) + +Index: linux-2.6.10/lib/dynlocks.c +=================================================================== +--- linux-2.6.10.orig/lib/dynlocks.c 2005-03-31 16:59:29.399768040 +0800 ++++ linux-2.6.10/lib/dynlocks.c 2005-03-31 18:02:41.470646856 +0800 +@@ -0,0 +1,187 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++static kmem_cache_t * dynlock_cachep = NULL; ++ ++void __init dynlock_cache_init(void) ++{ ++ printk(KERN_INFO "init dynlocks cache\n"); ++ dynlock_cachep = kmem_cache_create("dynlock_cache", ++ sizeof(struct dynlock_member), ++ 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL, NULL); ++ if (dynlock_cachep == NULL) ++ panic("Can't create dynlock cache"); ++} ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++ dl->dl_magic = DYNLOCK_LIST_MAGIC; ++} ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. 
this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp) ++{ ++ struct dynlock_member *nhl = NULL; ++ struct dynlock_member *hl; ++ struct list_head *cur; ++ int num = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each(cur, &dl->dl_list) { ++ BUG_ON(cur->next == NULL); ++ BUG_ON(cur->prev == NULL); ++ hl = list_entry(cur, struct dynlock_member, dl_list); ++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC); ++ if (hl->dl_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated the ++ * lock we didn't find, so we drop ++ * the one we created ++ */ ++ kmem_cache_free(dynlock_cachep, nhl); ++ nhl = NULL; ++ } ++ hl->dl_refcount++; ++ goto found; ++ } ++ num++; ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we have already allocated a lock. use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dl_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated a lock yet. allocate it */ ++ nhl = kmem_cache_alloc(dynlock_cachep, gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dl_refcount = 1; ++ nhl->dl_value = value; ++ nhl->dl_readers = 0; ++ nhl->dl_writers = 0; ++ nhl->dl_magic = DYNLOCK_MAGIC; ++ init_waitqueue_head(&nhl->dl_wait); ++ ++ /* while the lock is being allocated, someone else may allocate it ++ * and put it onto the list. check for this situation ++ */ ++ goto repeat; ++ ++found: ++ if (rw) { ++ /* exclusive lock: the user doesn't want to share the lock at all ++ * NOTE: one process may take the same lock several times; ++ * this functionality is useful for rename operations */ ++ while ((hl->dl_writers && hl->dl_pid != current->pid) || ++ hl->dl_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, ++ hl->dl_writers == 0 && hl->dl_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_writers++; ++ } else { ++ /* shared lock: the user does not want to share the lock with a writer */ ++ while (hl->dl_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, hl->dl_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_readers++; ++ } ++ hl->dl_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * the user has to specify the lockspace (dl) and the pointer to the lock ++ * structure returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, void *lock) ++{ ++ struct dynlock_member *hl = lock; ++ int wakeup = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(hl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ BUG_ON(hl->dl_magic != DYNLOCK_MAGIC); ++ BUG_ON(current->pid != hl->dl_pid); ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dl_writers) { ++ BUG_ON(hl->dl_readers > 0 || hl->dl_readers < 0); ++ hl->dl_writers--; ++ if (hl->dl_writers == 0) ++ wakeup = 1; ++ } else if (hl->dl_readers) { ++ hl->dl_readers--; ++ if (hl->dl_readers == 0) ++ wakeup = 1; ++ } else { ++ BUG_ON(1); ++ } ++ if (wakeup) { ++ hl->dl_pid = 0; ++ wake_up(&hl->dl_wait); ++ } ++ if (--(hl->dl_refcount) == 0) { ++ hl->dl_magic = DYNLOCK_MAGIC2; ++ list_del(&hl->dl_list); ++ kmem_cache_free(dynlock_cachep, hl); ++ } ++ spin_unlock(&dl->dl_list_lock); ++} ++ ++EXPORT_SYMBOL(dynlock_init); ++EXPORT_SYMBOL(dynlock_lock); ++EXPORT_SYMBOL(dynlock_unlock); ++
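The dynamic-locks patch above exports a small value-keyed locking API (dynlock_init/dynlock_lock/dynlock_unlock); later patches in this series key it by directory-entry hash through the per-inode i_htree_lock field visible in the ext3 hunks below. A minimal usage sketch follows; it is illustrative only and not part of the patch, and my_object/my_op are hypothetical names:

/* illustrative sketch only -- not part of the patch series */
#include <linux/dynlocks.h>

struct my_object {
	struct dynlock lockspace;	/* one lockspace, many keyed locks */
};

static void my_object_init(struct my_object *obj)
{
	dynlock_init(&obj->lockspace);
}

static int my_op(struct my_object *obj, unsigned long hash)
{
	void *lock;

	/* rw=1 takes the lock exclusive, rw=0 shared; the GFP flags cover
	 * the per-value dynlock_member allocation */
	lock = dynlock_lock(&obj->lockspace, hash, 1, GFP_NOFS);
	if (lock == NULL)
		return -ENOMEM;
	/* ... critical section for this hash value ... */
	dynlock_unlock(&obj->lockspace, lock);
	return 0;
}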
+Index: linux-2.6.10/lib/Makefile +=================================================================== +--- linux-2.6.10.orig/lib/Makefile 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/lib/Makefile 2005-03-31 18:03:16.727287032 +0800 +@@ -5,7 +5,7 @@ + lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ + bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ + kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \ +- bitmap.o extable.o kobject_uevent.o ++ bitmap.o extable.o kobject_uevent.o dynlocks.o + + ifeq ($(CONFIG_DEBUG_KOBJECT),y) + CFLAGS_kobject.o += -DDEBUG +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 17:02:41.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 18:02:41.474646248 +0800 +@@ -1655,6 +1655,7 @@ + + extern void bdev_cache_init(void); + extern void chrdev_init(void); ++extern void dynlock_cache_init(void); + + void __init vfs_caches_init_early(void) + { +@@ -1684,6 +1685,7 @@ + mnt_init(mempages); + bdev_cache_init(); + chrdev_init(); ++ dynlock_cache_init(); + } + + EXPORT_SYMBOL(d_alloc); +Index: linux-2.6.10/include/linux/dynlocks.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dynlocks.h 2005-03-31 16:59:29.399768040 +0800 ++++ linux-2.6.10/include/linux/dynlocks.h 2005-03-31 18:02:41.469647008 +0800 +@@ -0,0 +1,43 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++#define DYNLOCK_MAGIC 0xd19a10c ++#define DYNLOCK_MAGIC2 0xd1956ee ++ ++struct dynlock; ++ ++struct dynlock_member { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ unsigned long dl_value; /* lock value */ ++ int dl_refcount; /* number of users */ ++ int dl_readers; ++ int dl_writers; ++ int dl_pid; /* holder of the lock */ ++ wait_queue_head_t dl_wait; ++}; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++ ++#define DYNLOCK_LIST_MAGIC 0x11ee91e6 ++ ++struct dynlock { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++void dynlock_init(struct dynlock *dl); ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp); ++void dynlock_unlock(struct dynlock *dl, void *lock); ++ ++ ++#endif ++ diff --git a/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch b/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch new file mode 100644 index 0000000..449c4b9 --- /dev/null +++ b/lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch @@ -0,0 +1,33 @@ +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 18:44:38.935933960 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 18:46:03.008153040 +0800 +@@ -123,6 +123,8 @@ + journal_abort_handle(handle); + } + ++EXPORT_SYMBOL(ext3_journal_abort_handle); ++ + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. 
+ * +@@ -2016,6 +2018,8 @@ + return ret; + } + ++EXPORT_SYMBOL(ext3_force_commit); ++ + /* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this +@@ -2447,6 +2451,10 @@ + unsigned long *blocks, int *created, int create); + EXPORT_SYMBOL(ext3_map_inode_page); + ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_set_handle); ++EXPORT_SYMBOL(ext3_bread); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); diff --git a/lustre/kernel_patches/patches/export-fedro-2.6.10.patch b/lustre/kernel_patches/patches/export-fedro-2.6.10.patch new file mode 100644 index 0000000..d724d6f --- /dev/null +++ b/lustre/kernel_patches/patches/export-fedro-2.6.10.patch @@ -0,0 +1,84 @@ +Index: linux-2.6.10/net/core/sock.c +=================================================================== +--- linux-2.6.10.orig/net/core/sock.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/core/sock.c 2005-03-31 20:42:01.084364672 +0800 +@@ -1359,6 +1359,7 @@ + EXPORT_SYMBOL(sk_alloc); + EXPORT_SYMBOL(sk_free); + EXPORT_SYMBOL(sk_send_sigurg); ++EXPORT_SYMBOL(sock_getsockopt); + EXPORT_SYMBOL(sock_alloc_send_pskb); + EXPORT_SYMBOL(sock_alloc_send_skb); + EXPORT_SYMBOL(sock_init_data); +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 19:44:53.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 22:02:08.130582568 +0800 +@@ -1691,6 +1691,7 @@ + + EXPORT_SYMBOL(d_alloc); + EXPORT_SYMBOL(d_alloc_anon); ++EXPORT_SYMBOL(is_subdir); + EXPORT_SYMBOL(d_alloc_root); + EXPORT_SYMBOL(d_delete); + EXPORT_SYMBOL(d_find_alias); +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 19:44:54.000000000 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 22:03:44.906870336 +0800 +@@ -1239,6 +1239,7 @@ + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6.10/fs/file_table.c +=================================================================== +--- linux-2.6.10.orig/fs/file_table.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/fs/file_table.c 2005-03-31 20:44:40.924065344 +0800 +@@ -196,6 +196,7 @@ + file_free(file); + } + } ++EXPORT_SYMBOL(put_filp); + + void file_move(struct file *file, struct list_head *list) + { +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-03-31 22:00:30.616406976 +0800 +@@ -2942,6 +2942,19 @@ + + EXPORT_SYMBOL(sleep_on_timeout); + ++void fastcall __sched sleep_on(wait_queue_head_t *q) ++{ ++ SLEEP_ON_VAR ++ ++ current->state = TASK_UNINTERRUPTIBLE; ++ ++ SLEEP_ON_HEAD ++ schedule(); ++ SLEEP_ON_TAIL ++} ++ ++EXPORT_SYMBOL(sleep_on); ++ + void set_user_nice(task_t *p, long nice) + { + unsigned long flags; +Index: linux-2.6.10/kernel/exit.c +=================================================================== +--- linux-2.6.10.orig/kernel/exit.c 2005-03-31 19:44:52.509587264 +0800 ++++ linux-2.6.10/kernel/exit.c 2005-03-31 20:47:18.034180976 +0800 +@@ -515,6 +515,7 @@ + { + 
__exit_mm(tsk); + } ++EXPORT_SYMBOL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { diff --git a/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch b/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch new file mode 100644 index 0000000..d09fd6a --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch @@ -0,0 +1,17 @@ +Index: linux-2.6.10/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800 +@@ -19,9 +19,12 @@ + #ifdef __KERNEL__ + #include + #include ++#ifndef EXT_INCLUDE ++#define EXT_INCLUDE + #include + #include + #endif ++#endif + #include + + /* diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch new file mode 100644 index 0000000..90064a2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch @@ -0,0 +1,2846 @@ +%patch +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:26:19.494124024 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:26:25.474214912 +0800 +@@ -186,6 +186,7 @@ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +@@ -238,7 +239,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) +- ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long) + /* + * Structure of an inode on the disk + */ +@@ -361,6 +364,8 @@ + #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x100000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -549,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +766,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, 
int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -839,6 +847,14 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.10/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2005-04-05 12:26:19.377141808 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-04-05 12:26:25.473215064 +0800 +@@ -134,6 +134,8 @@ + struct dynlock i_htree_lock; + struct semaphore i_append_sem; + struct semaphore i_rename_sem; ++ ++ __u32 i_cached_extent[3]; + }; + + #endif /* _LINUX_EXT3_FS_I */ +Index: linux-2.6.10/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_extents.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/ext3_extents.h 2005-04-05 12:26:25.476214608 +0800 +@@ -0,0 +1,238 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. 
+ sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet; all extents are in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bits of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level: a leaf or the next index could be here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even the inode-stored one, has a header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of the store in entries */ ++ __u16 eh_depth; /* does the tree have real underlying blocks? */ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * an array of ext3_ext_path contains the path to some extent; ++ * creation/lookup routines use it for traversal/splitting/etc., ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode the tree belongs to */ ++ void *root; /* ptr to the data where the top of the tree resides */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_extent *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++#define EXT_CACHE_MARK 0xffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct
ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ee_len = 0; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ ++ +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:26:19.367143328 +0800 ++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 12:26:25.462216736 +0800 +@@ -796,6 +796,17 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -806,8 +817,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -851,8 +862,8 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 0); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 0); 
+ bh_result->b_size = (1 << inode->i_blkbits); + return ret; + } +@@ -871,7 +882,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1591,7 +1602,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2089,6 +2100,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2817,6 +2831,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-04-05 12:25:13.631136720 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-04-05 12:26:25.471215368 +0800 +@@ -245,6 +245,10 @@ + return err; + } + ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + + default: + return -ENOTTY; +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:26:19.438132536 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:26:25.471215368 +0800 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -463,6 +464,9 @@ + dynlock_init(&ei->i_htree_lock); + sema_init(&ei->i_rename_sem, 1); + sema_init(&ei->i_append_sem, 1); ++ ei->i_cached_extent[0] = 0; ++ ei->i_cached_extent[1] = 0; ++ ei->i_cached_extent[2] = 0; + return &ei->vfs_inode; + } + +@@ -595,6 +599,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -647,6 +652,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -950,6 +957,12 @@ + match_int(&args[0], &option); + *n_blocks_count = option; + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1635,6 +1648,8 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); ++ + return 0; + + cantfind_ext3: +Index: linux-2.6.10/fs/ext3/extents.c 
+=================================================================== +--- linux-2.6.10.orig/fs/ext3/extents.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/extents.c 2005-04-05 12:26:25.468215824 +0800 +@@ -0,0 +1,2306 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = 
EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = 
EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(i == 0 || eh->eh_entries > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) 
++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? 
*/ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = 
border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate e_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh 
= EXT_ROOT_HDR(tree);
++	fidx = EXT_FIRST_INDEX(neh);
++	ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++		  neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++	neh->eh_depth = path->p_depth + 1;
++	err = ext3_ext_dirty(handle, tree, curp);
++out:
++	brelse(bh);
++
++	return err;
++}
++
++/*
++ * routine finds an empty index and adds a new leaf. if no free
++ * index is found, it requests in-depth growing of the tree
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++				    struct ext3_extents_tree *tree,
++				    struct ext3_ext_path *path,
++				    struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp;
++	int depth, i, err = 0;
++
++repeat:
++	i = depth = EXT_DEPTH(tree);
++
++	/* walk up the tree looking for a free index entry */
++	curp = path + depth;
++	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++		i--;
++		curp--;
++	}
++
++	/* we use the already allocated block for the index block,
++	 * so subsequent data blocks should be contiguous */
++	if (EXT_HAS_FREE_INDEX(curp)) {
++		/* if we found an index with a free entry, then use that
++		 * entry: create all needed subtree and add new leaf */
++		err = ext3_ext_split(handle, tree, path, newext, i);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++	} else {
++		/* tree is full, time to grow in depth */
++		err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++
++		/*
++		 * only the first grow (depth 0 -> 1) produces free space;
++		 * in all other cases we have to split the grown tree
++		 */
++		depth = EXT_DEPTH(tree);
++		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++			/* now we need split */
++			goto repeat;
++		}
++	}
++
++	if (err)
++		return err;
++
++	return 0;
++}
++
++/*
++ * returns the allocated block in the subsequent extent, or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from an index entry to be an
++ * allocated block. thus, index entries have to be consistent
++ * with the leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	if (depth == 0 && path->p_ext == NULL)
++		return EXT_MAX_BLOCK;
++
++	/* FIXME: what if index isn't full ?! */
++	while (depth >= 0) {
++		if (depth == path->p_depth) {
++			/* leaf */
++			if (path[depth].p_ext !=
++			    EXT_LAST_EXTENT(path[depth].p_hdr))
++				return path[depth].p_ext[1].ee_block;
++		} else {
++			/* index */
++			if (path[depth].p_idx !=
++			    EXT_LAST_INDEX(path[depth].p_hdr))
++				return path[depth].p_idx[1].ei_block;
++		}
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns the first allocated block of the next leaf, or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++					 struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	/* zero-tree has no leaf blocks at all */
++	if (depth == 0)
++		return EXT_MAX_BLOCK;
++
++	/* go to index block */
++	depth--;
++
++	while (depth >= 0) {
++		if (path[depth].p_idx !=
++		    EXT_LAST_INDEX(path[depth].p_hdr))
++			return path[depth].p_idx[1].ei_block;
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
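++/*
++ * Editorial note, for orientation (this comment is not part of the
++ * original submission): the on-disk records the walkers above
++ * manipulate look roughly as follows, reconstructed from the fields
++ * this code uses -- the authoritative layout is the one in the
++ * include/linux/ext3_extents.h hunk of this patch:
++ *
++ *	struct ext3_extent {		-- leaf entry
++ *		__u32	ee_block;	-- first logical block covered
++ *		__u16	ee_len;		-- number of blocks covered
++ *		__u16	ee_start_hi;	-- high bits, unused ("FIXME: large fs")
++ *		__u32	ee_start;	-- first physical block
++ *	};
++ *
++ *	struct ext3_extent_idx {	-- interior-node entry
++ *		__u32	ei_block;	-- covers logical blocks from here
++ *		__u32	ei_leaf;	-- physical block of the next level
++ *	};
++ *
++ * interior nodes hold ext3_extent_idx entries and leaves hold
++ * ext3_extent entries; the two walkers above rely on exactly that.
++ */
++
++/*
++ * if leaf gets modified and the modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?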
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? 
*/ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) ++ * sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ee_block = start; ++ cbex.ee_len = end - start; ++ cbex.ee_start = 0; ++ } else ++ cbex = *ex; ++ ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex, exists); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. 
we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ee_block + cbex.ee_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex) ++{ ++ if (tree->cex) { ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_len); ++ tree->cex->ee_block = ex->ee_block; ++ tree->cex->ee_start = ex->ee_start; ++ tree->cex->ee_len = ex->ee_len; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex, gex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ gex.ee_block = 0; ++ gex.ee_len = EXT_CACHE_MARK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->ee_block) { ++ gex.ee_block = block; ++ gex.ee_len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ gex.ee_block = ex->ee_block + ex->ee_len; ++ gex.ee_len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(gex.ee_len > gex.ee_block); ++ gex.ee_len = gex.ee_len - gex.ee_block; ++ } else { ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block, ++ (unsigned long) gex.ee_len); ++ gex.ee_start = EXT_CACHE_MARK; ++ ext3_ext_put_in_cache(tree, &gex); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_extent *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return 0; ++ ++ /* has cache valid data? */ ++ if (cex->ee_len == 0) ++ return 0; ++ ++ if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) { ++ ex->ee_block = cex->ee_block; ++ ex->ee_start = cex->ee_start; ++ ex->ee_len = cex->ee_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return 1; ++ } ++ ++ /* not in cache */ ++ return 0; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? ++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! 
*/
++			block = ex->ee_block;
++			num = 0;
++			EXT_ASSERT(a == ex->ee_block &&
++				   b == ex->ee_block + ex->ee_len - 1);
++		}
++
++		if (ex == EXT_FIRST_EXTENT(eh))
++			correct_index = 1;
++
++		credits = 1;
++		if (correct_index)
++			credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++		if (tree->ops->remove_extent_credits)
++			credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++		handle = ext3_ext_journal_restart(handle, credits);
++		if (IS_ERR(handle)) {
++			err = PTR_ERR(handle);
++			goto out;
++		}
++
++		err = ext3_ext_get_access(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		if (tree->ops->remove_extent)
++			err = tree->ops->remove_extent(tree, ex, a, b);
++		if (err)
++			goto out;
++
++		if (num == 0) {
++			/* this extent is removed entirely; mark the slot unused */
++			ex->ee_start = 0;
++			eh->eh_entries--;
++			fu = ex;
++		}
++
++		ex->ee_block = block;
++		ex->ee_len = num;
++
++		err = ext3_ext_dirty(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		ext_debug(tree, "new extent: %u:%u:%u\n",
++			  ex->ee_block, ex->ee_len, ex->ee_start);
++		ex--;
++	}
++
++	if (fu) {
++		/* reuse unused slots */
++		while (lu < le) {
++			if (lu->ee_start) {
++				*fu = *lu;
++				lu->ee_start = 0;
++				fu++;
++			}
++			lu++;
++		}
++	}
++
++	if (correct_index && eh->eh_entries)
++		err = ext3_ext_correct_indexes(handle, tree, path);
++
++	/* if this leaf is free, then we should
++	 * remove it from the index block above */
++	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++		err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++	return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++	struct ext3_extent_idx *ix;
++
++	ix = EXT_LAST_INDEX(hdr);
++	while (ix != EXT_FIRST_INDEX(hdr)) {
++		if (ix->ei_block <= block)
++			break;
++		ix--;
++	}
++	return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * if a truncate on a deeper level happened, it wasn't partial,
++	 * so we have to consider the current index for truncation
++	 */
++	if (path->p_hdr->eh_entries == path->p_block)
++		return 0;
++	return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++			  unsigned long start, unsigned long end)
++{
++	struct inode *inode = tree->inode;
++	struct super_block *sb = inode->i_sb;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_ext_path *path;
++	handle_t *handle;
++	int i = 0, err = 0;
++
++	ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++	/* probably the first extent we're gonna free will be last in block */
++	handle = ext3_journal_start(inode, depth + 1);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	ext3_ext_invalidate_cache(tree);
++
++	/*
++	 * we start scanning from the right side, freeing all the blocks
++	 * after i_size and walking into the deep
++	 */
++	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++	if (path == NULL) {
++		ext3_error(sb, "ext3_ext_remove_space",
++			   "Can't allocate path array");
++		ext3_journal_stop(handle);
++		return -ENOMEM;
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++	while (i >= 0 && err == 0) {
++		if (i == depth) {
++			/* this is a leaf block */
++			err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			continue;
++		}
++
++		/* this is an index block */
++		if (!path[i].p_hdr) {
++			ext_debug(tree, "initialize header\n");
++			path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++		}
++
++		EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++		EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++		if (!path[i].p_idx) {
++			/* this level hasn't been touched yet */
++			path[i].p_idx =
++				ext3_ext_last_covered(path[i].p_hdr, end);
++			path[i].p_block = path[i].p_hdr->eh_entries + 1;
++			ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++				  path[i].p_hdr, path[i].p_hdr->eh_entries);
++		} else {
++			/* we've already been here; look at the next index */
++			path[i].p_idx--;
++		}
++
++		ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++			  i, EXT_FIRST_INDEX(path[i].p_hdr),
++			  path[i].p_idx);
++		if (ext3_ext_more_to_rm(path + i)) {
++			/* go to the next level */
++			ext_debug(tree, "move to level %d (block %d)\n",
++				  i + 1, path[i].p_idx->ei_leaf);
++			memset(path + i + 1, 0, sizeof(*path));
++			path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++			if (!path[i+1].p_bh) {
++				/* should we reset i_size? */
++				err = -EIO;
++				break;
++			}
++			/* record the actual number of indexes so we can tell
++			 * whether this number changed at the next iteration */
++			path[i].p_block = path[i].p_hdr->eh_entries;
++			i++;
++		} else {
++			/* we've finished processing this index; go up */
++			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++				/* index is empty, remove it;
++				 * the handle must already be prepared by
++				 * truncatei_leaf() */
++				err = ext3_ext_rm_idx(handle, tree, path + i);
++			}
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			ext_debug(tree, "return to level %d\n", i);
++		}
++	}
++
++	/* TODO: flexible tree reduction should be here */
++	if (path->p_hdr->eh_entries == 0) {
++		/*
++		 * truncate to zero freed the whole tree,
++		 * so we need to correct eh_depth
++		 */
++		err = ext3_ext_get_access(handle, tree, path);
++		if (err == 0) {
++			EXT_ROOT_HDR(tree)->eh_depth = 0;
++			EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++			err = ext3_ext_dirty(handle, tree, path);
++		}
++	}
++	ext3_ext_tree_changed(tree);
++
++	kfree(path);
++	ext3_journal_stop(handle);
++
++	return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++	int lcap, icap, rcap, leafs, idxs, num;
++
++	rcap = ext3_ext_space_root(tree);
++	if (blocks <= rcap) {
++		/* all extents fit into the root */
++		return 0;
++	}
++
++	rcap = ext3_ext_space_root_idx(tree);
++	lcap = ext3_ext_space_block(tree);
++	icap = ext3_ext_space_block_idx(tree);
++
++	num = leafs = (blocks + lcap - 1) / lcap;
++	if (leafs <= rcap) {
++		/* all pointers to the leaves fit into the root */
++		return leafs;
++	}
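++
++	/*
++	 * Editorial worked example (the numbers are illustrative only;
++	 * the real capacities come from the ext3_ext_space_* helpers
++	 * and depend on the block size): with lcap = 340 extents per
++	 * leaf and blocks = 100000, we need
++	 * leafs = ceil(100000 / 340) = 295 leaf blocks.  If the root
++	 * holds only rcap = 4 index entries, those 295 pointers do not
++	 * fit, so with icap = 340 one extra index block suffices:
++	 * idxs = ceil(295 / 340) = 1, and 1 <= rcap terminates the
++	 * loop below, giving num = 295 + 1 = 296 metadata blocks.
++	 */
++
++	/* ok.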
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */
++	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++	colour = (current->pid % 16) *
++		(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++	return bg_start + colour + block;
++}
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++			     struct ext3_ext_path *path,
++			     struct ext3_extent *ex, int *err)
++{
++	struct inode *inode = tree->inode;
++	int newblock, goal;
++
++	EXT_ASSERT(path);
++	EXT_ASSERT(ex);
++	EXT_ASSERT(ex->ee_start);
++	EXT_ASSERT(ex->ee_len);
++
++	/* reuse block from the extent to order data/metadata */
++	newblock = ex->ee_start++;
++	ex->ee_len--;
++	if (ex->ee_len == 0) {
++		ex->ee_len = 1;
++		/* allocate new block for the extent */
++		goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++		ex->ee_start = ext3_new_block(handle, inode, goal, err);
++		if (ex->ee_start == 0) {
++			/* error occurred: restore old extent */
++			ex->ee_start = newblock;
++			return 0;
++		}
++	}
++	return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++	.get_write_access	= ext3_get_inode_write_access,
++	.mark_buffer_dirty	= ext3_mark_buffer_dirty,
++	.mergable		= ext3_ext_mergable,
++	.new_block		= ext3_new_block_cb,
++	.remove_extent		= ext3_remove_blocks,
++	.remove_extent_credits	= ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++			 struct inode *inode)
++{
++	tree->inode = inode;
++	tree->root = (void *) EXT3_I(inode)->i_data;
++	tree->buffer = (void *) inode;
++	tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++	tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
++	tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++		       long iblock, struct buffer_head *bh_result,
++		       int create, int extend_disksize)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_extent newex;
++	struct ext3_extent *ex;
++	int goal, newblock, err = 0, depth;
++	struct ext3_extents_tree tree;
++
++	clear_buffer_new(bh_result);
++	ext3_init_tree_desc(&tree, inode);
++	ext_debug(&tree, "block %d requested for inode %u\n",
++		  (int) iblock, (unsigned) inode->i_ino);
++	down(&EXT3_I(inode)->truncate_sem);
++
++	/* check in cache */
++	if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++		if (newex.ee_start == EXT_CACHE_MARK) {
++			/* this is a cached gap */
++			if (!create) {
++				/* block isn't allocated yet and
++				 * the user doesn't want to allocate it */
++				goto out2;
++			}
++			/* we should allocate requested block */
++		} else if (newex.ee_start) {
++			/* block is already allocated */
++			newblock = iblock - newex.ee_block + newex.ee_start;
++			goto out;
++		}
++	}
++
++	/* find extent for this block */
++	path = ext3_ext_find_extent(&tree, iblock, NULL);
++	if (IS_ERR(path)) {
++		err = PTR_ERR(path);
++		path = NULL;
++		goto out2;
++	}
++
++	depth = EXT_DEPTH(&tree);
++
++	/*
++	 * a consistent leaf must not be empty;
++	 * this situation is possible, though, _during_ tree modification;
++	 * this is why the assert can't be put in ext3_ext_find_extent()
++	 */
++	EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++	if ((ex = path[depth].p_ext)) {
++		/* if the found extent covers the block, simply return it */
++		if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++			newblock = iblock - ex->ee_block + ex->ee_start;
++			ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++				  (int) iblock, ex->ee_block, ex->ee_len,
++				  newblock);
++			ext3_ext_put_in_cache(&tree, ex);
++			goto out;
++		}
++	}
++
++	/*
++	 * the requested block isn't allocated yet;
++	 * we can't create the block if the create flag is zero
++	 */
++	if (!create) {
++		/* put the just-found gap into the cache to speed up
++		 * subsequent requests */
++		ext3_ext_put_gap_in_cache(&tree, path, iblock);
++		goto out2;
++	}
++
++	/* allocate new block */
++	goal = ext3_ext_find_goal(inode, path, iblock);
++	newblock = ext3_new_block(handle, inode, goal, &err);
++	if (!newblock)
++		goto out2;
++	ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++		  goal, newblock);
++
++	/* try to insert new extent into found leaf and return */
++	newex.ee_block = iblock;
++	newex.ee_start = newblock;
++	newex.ee_len = 1;
++	err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++	if (err)
++		goto out2;
++
++	if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++		EXT3_I(inode)->i_disksize = inode->i_size;
++
++	/* previous routine could have used the block we just allocated */
++	newblock = newex.ee_start;
++	set_buffer_new(bh_result);
++
++	ext3_ext_put_in_cache(&tree, &newex);
++out:
++	ext3_ext_show_leaf(&tree, path);
++	map_bh(bh_result, inode->i_sb, newblock);
++out2:
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++	up(&EXT3_I(inode)->truncate_sem);
++
++	return err;
++}
++
++void ext3_ext_truncate(struct inode *inode, struct page *page)
++{
++	struct address_space *mapping = inode->i_mapping;
++	struct super_block *sb = inode->i_sb;
++	struct ext3_extents_tree tree;
++	unsigned long last_block;
++	handle_t *handle;
++	int err = 0;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	/*
++	 * probably the first extent we're gonna free will be last in block
++	 */
++	err = ext3_writepage_trans_blocks(inode) + 3;
++	handle = ext3_journal_start(inode, err);
++	if (IS_ERR(handle)) {
++		if (page) {
++			clear_highpage(page);
++			flush_dcache_page(page);
++			unlock_page(page);
++			page_cache_release(page);
++		}
++		return;
++	}
++
++	if (page)
++		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++	down(&EXT3_I(inode)->truncate_sem);
++	ext3_ext_invalidate_cache(&tree);
++
++	/*
++	 * TODO: optimization is possible here;
++	 * probably we don't need any scanning at all,
++	 * because page truncation is enough
++	 */
++	if (ext3_orphan_add(handle, inode))
++		goto out_stop;
++
++	/* we have to know where to truncate from in crash case */
++	EXT3_I(inode)->i_disksize = inode->i_size;
++	ext3_mark_inode_dirty(handle, inode);
++
++	last_block = (inode->i_size + sb->s_blocksize - 1)
++			>> EXT3_BLOCK_SIZE_BITS(sb);
++	err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++	/* In a multi-transaction truncate, we only make the final
++	 * transaction synchronous */
++	if (IS_SYNC(inode))
++		handle->h_sync = 1;
++
++out_stop:
++	/*
++	 * If this was a simple ftruncate(), and the file will remain alive
++	 * then we need to clear up the orphan record which we created above.
++	 * However, if this was a real unlink then we were called by
++	 * ext3_delete_inode(), and we allow that function to clean up the
++	 * orphan info for us.
++	 */
++	if (inode->i_nlink)
++		ext3_orphan_del(handle, inode);
++
++	up(&EXT3_I(inode)->truncate_sem);
++	ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++	struct ext3_extents_tree tree;
++	int needed;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++	/* caller wants to allocate num blocks */
++	needed *= num;
++
++#ifdef CONFIG_QUOTA
++	/*
++	 * FIXME: the real calculation should be here;
++	 * it depends on the blockmap format of the quota file
++	 */
++	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++	return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++	struct ext3_extents_tree tree;
++
++	ext3_init_tree_desc(&tree, inode);
++	ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++	struct ext3_extents_tree tree;
++
++	ext3_init_tree_desc(&tree, inode);
++	return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++			 struct ext3_ext_path *path,
++			 struct ext3_extent *newex, int exist)
++{
++	struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++	if (!exist)
++		return EXT_CONTINUE;
++	if (buf->err < 0)
++		return EXT_BREAK;
++	if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++		return EXT_BREAK;
++
++	if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++		buf->err++;
++		buf->cur += sizeof(*newex);
++	} else {
++		buf->err = -EFAULT;
++		return EXT_BREAK;
++	}
++	return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path,
++			  struct ext3_extent *ex, int exist)
++{
++	struct ext3_extent_tree_stats *buf =
++		(struct ext3_extent_tree_stats *) tree->private;
++	int depth;
++
++	if (!exist)
++		return EXT_CONTINUE;
++
++	depth = EXT_DEPTH(tree);
++	buf->extents_num++;
++	if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++		buf->leaf_num++;
++	return EXT_CONTINUE;
++}
++
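++/*
++ * Editorial usage sketch (not part of the original submission): a
++ * userspace tool would dump the extent list of a file roughly as
++ * follows; struct ext3_extent_buf comes from the
++ * include/linux/ext3_extents.h hunk of this patch:
++ *
++ *	struct ext3_extent list[256];
++ *	struct ext3_extent_buf buf;
++ *
++ *	buf.start = 0;			-- first logical block to report
++ *	buf.buffer = (void *) list;	-- destination for the records
++ *	buf.buflen = sizeof(list);
++ *	err = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ *
++ * On success the handler below returns buf.err, i.e. the number of
++ * extents copied out by ext3_ext_store_extent_cb(); buf.cur and
++ * buf.err are (re)initialized by the handler, so userspace only
++ * fills in start/buffer/buflen.
++ */
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++		   unsigned long arg)
++{
++	int err = 0;
++
++	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++		return -EINVAL;
++
++	if (cmd == EXT3_IOC_GET_EXTENTS) {
++		struct ext3_extent_buf buf;
++		struct ext3_extents_tree tree;
++
++		if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++			return -EFAULT;
++
++		ext3_init_tree_desc(&tree, inode);
++		buf.cur = buf.buffer;
++		buf.err = 0;
++		tree.private = &buf;
++		down(&EXT3_I(inode)->truncate_sem);
++		err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++					  ext3_ext_store_extent_cb);
++		up(&EXT3_I(inode)->truncate_sem);
++		if (err == 0)
++			err = buf.err;
++	} else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++		struct ext3_extent_tree_stats buf;
++		struct ext3_extents_tree tree;
++
++		ext3_init_tree_desc(&tree, inode);
++		down(&EXT3_I(inode)->truncate_sem);
++		buf.depth = EXT_DEPTH(&tree);
++		buf.extents_num = 0;
++		buf.leaf_num = 0;
++		tree.private = &buf;
++		err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++					  ext3_ext_collect_stats_cb);
++		up(&EXT3_I(inode)->truncate_sem);
++		if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++			err = -EFAULT;
++	} else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++		struct ext3_extents_tree tree;
++		ext3_init_tree_desc(&tree, inode);
++ 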
down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); ++ +Index: linux-2.6.10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-04-05 12:26:19.368143176 +0800 ++++ linux-2.6.10/fs/ext3/ialloc.c 2005-04-05 12:26:25.464216432 +0800 +@@ -644,6 +644,17 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800 +@@ -5,8 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o +- ++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ ++ extents.o + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o + ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o + +%diffstat + fs/ext3/Makefile | 4 + fs/ext3/extents.c | 2306 +++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 11 + fs/ext3/inode.c | 29 + fs/ext3/ioctl.c | 4 + fs/ext3/super.c | 15 + include/linux/ext3_extents.h | 238 ++++ + include/linux/ext3_fs.h | 20 + include/linux/ext3_fs_i.h | 2 + 9 files changed, 2619 insertions(+), 10 deletions(-) + diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch new file mode 100755 index 0000000..a400fb3 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch @@ -0,0 +1,361 @@ +Index: linux-2.6.10/fs/ext3/xattr.h +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.h 2005-04-05 12:26:19.376141960 +0800 ++++ linux-2.6.10/fs/ext3/xattr.h 2005-04-05 12:27:55.527524728 +0800 +@@ -70,6 +70,7 @@ + extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + ++extern int ext3_xattr_get_ea_loc(struct inode *, int, const char *, struct buffer_head **, int *, int *); + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); + +Index: linux-2.6.10/fs/ext3/extents-in-ea.c +=================================================================== +--- 
linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-04-05 12:27:55.524525184 +0800 +@@ -0,0 +1,224 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ext3_get_ea_write_access(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ return ext3_journal_get_write_access(handle, bh); ++} ++ ++static int ext3_mark_ea_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ ext3_journal_dirty_metadata(handle, bh); ++ return 0; ++} ++ ++static struct ext3_extents_helpers ext3_ea_helpers = { ++ .get_write_access = ext3_get_ea_write_access, ++ .mark_buffer_dirty = ext3_mark_ea_buffer_dirty, ++ .mergable = NULL, ++ .new_block = NULL, ++ .remove_extent = NULL, ++ .remove_extent_credits = NULL, ++}; ++ ++int ext3_init_tree_in_ea_desc(struct ext3_extents_tree *tree, ++ struct inode *inode, int name_index, ++ const char *eaname) ++{ ++ struct buffer_head *bh; ++ int offset, err, size; ++ ++ err = ext3_xattr_get_ea_loc(inode, name_index, eaname, ++ &bh, &offset, &size); ++ if (err) ++ return err; ++ ++ EXT_ASSERT(bh); ++ EXT_ASSERT(size >= sizeof(struct ext3_extent_header) ++ + sizeof(struct ext3_extent)); ++ tree->inode = inode; ++ tree->root = (void *) bh->b_data + offset; ++ tree->buffer_len = size; ++ tree->buffer = (void *) bh; ++ tree->ops = &ext3_ea_helpers; ++ tree->cex = NULL; /* FIXME: add cache store later */ ++ return 0; ++} ++ ++void ext3_release_tree_in_ea_desc(struct ext3_extents_tree *tree) ++{ ++ struct buffer_head *bh; ++ ++ bh = (struct buffer_head *) tree->buffer; ++ EXT_ASSERT(bh); ++ brelse(bh); ++} ++ ++int ext3_init_tree_in_ea(struct inode *inode, int name_index, ++ const char *eaname, int size) ++{ ++ struct ext3_extents_tree tree; ++ handle_t *handle; ++ char *root; ++ int err; ++ ++ root = kmalloc(size, GFP_USER); ++ if (!root) ++ return -ENOMEM; ++ memset(root, 0, size); ++ ++ /* first, create ea to store root of the tree */ ++ handle = ext3_journal_start(inode, EXT3_ALLOC_NEEDED + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if ((err = ext3_xattr_set(inode, name_index, ++ eaname, root, size, 0))) ++ goto out; ++ if ((err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname))) ++ goto out; ++ err = ext3_extent_tree_init(handle, &tree); ++ ext3_release_tree_in_ea_desc(&tree); ++out: ++ ext3_journal_stop(handle, inode); ++ kfree(root); ++ return err; ++} ++ ++static int ++ext3_ext_in_ea_new_extent(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ 
struct inode *inode = tree->inode; ++ handle_t *handle; ++ int needed, err; ++ unsigned long tgen; ++ ++ if (exist) ++ return EXT_CONTINUE; ++ ++ tgen = EXT_GENERATION(tree); ++ needed = ext3_ext_calc_credits_for_insert(tree, path); ++ up(&EXT3_I(inode)->truncate_sem); ++ handle = ext3_journal_start(tree->inode, needed + 10); ++ if (IS_ERR(handle)) { ++ down_write(&EXT3_I(inode)->truncate_sem); ++ return PTR_ERR(handle); ++ } ++ ++ if (tgen != EXT_GENERATION(tree)) { ++ /* the tree has changed. so path can be invalid at moment */ ++ ext3_journal_stop(handle, inode); ++ down_write(&EXT3_I(inode)->truncate_sem); ++ return EXT_REPEAT; ++ } ++ ++ down_write(&EXT3_I(inode)->truncate_sem); ++ ++ /* insert new extent */ ++ newex->ee_start = 0; ++ err = ext3_ext_insert_extent(handle, tree, path, newex); ++ if (!err) ++ ext3_journal_stop(handle, tree->inode); ++ ++ return err; ++} ++ ++int ext3_ext_in_ea_alloc_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ down_write(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, from, num, ++ ext3_ext_in_ea_new_extent); ++ ext3_release_tree_in_ea_desc(&tree); ++ up_write(&EXT3_I(inode)->truncate_sem); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_remove_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ err = ext3_ext_remove_space(&tree, from, num); ++ ext3_release_tree_in_ea_desc(&tree); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_presence(struct inode *inode, int name_index, ++ const char *eaname, unsigned long block) ++{ ++ struct ext3_extents_tree tree; ++ struct ext3_ext_path *path; ++ struct ext3_extent *ex; ++ int err, depth; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err) ++ return err; ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, block, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ex = path[depth].p_ext; ++ if (!ex) { ++ /* there is no extent yet */ ++ goto out; ++ } ++ ++ if (block >= ex->ee_block && block < ex->ee_block + ex->ee_len) ++ err = 1; ++out: ++ ext3_release_tree_in_ea_desc(&tree); ++ return err; ++} ++ +Index: linux-2.6.10/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-04-05 12:26:19.370142872 +0800 ++++ linux-2.6.10/fs/ext3/xattr.c 2005-04-05 12:27:55.527524728 +0800 +@@ -590,7 +590,8 @@ + */ + int + ext3_xattr_ibody_find(struct inode *inode, int name_index, +- const char *name, struct ext3_xattr_entry *rentry, int *free) ++ const char *name, struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **bh, int *offset) + { + struct ext3_xattr_entry *last; + struct ext3_inode *raw_inode; +@@ -637,6 +638,15 @@ + name_len == last->e_name_len && + !memcmp(name, last->e_name, name_len)) { + memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ if (offset) { ++ void *voff; ++ voff = start + le16_to_cpu(last->e_value_offs); ++ *offset = voff - (void *) iloc.bh->b_data; ++ } ++ if (bh) { ++ get_bh(iloc.bh); ++ *bh = iloc.bh; ++ } + ret = 0; + } else { + *free -= EXT3_XATTR_LEN(last->e_name_len); +@@ 
-657,7 +667,8 @@ + */ + int + ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, +- struct ext3_xattr_entry *rentry, int *free) ++ struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **tbh, int *offset) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -700,6 +711,12 @@ + memcmp(name, entry->e_name, name_len) == 0) { + memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); + error = 0; ++ if (offset) ++ *offset = le16_to_cpu(entry->e_value_offs); ++ if (tbh) { ++ get_bh(bh); ++ *tbh = bh; ++ } + } else { + *free -= EXT3_XATTR_LEN(entry->e_name_len); + *free -= le32_to_cpu(entry->e_value_size); +@@ -894,7 +911,8 @@ + down_write(&EXT3_I(inode)->xattr_sem); + + /* try to find attribute in inode body */ +- err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, NULL, NULL); + if (err == 0) { + /* found EA in inode */ + found = 1; +@@ -903,7 +921,7 @@ + /* there is no such attribute in inode body */ + /* try to find attribute in dedicated block */ + err = ext3_xattr_block_find(inode, name_index, name, +- &entry, &free2); ++ &entry, &free2, NULL, NULL); + if (err != 0 && err != -ENOENT) { + /* not found EA in block */ + goto finish; +@@ -960,6 +978,35 @@ + return err; + } + ++int ext3_xattr_get_ea_loc(struct inode *inode, int name_index, ++ const char *name, struct buffer_head **bh, ++ int *offset, int *size) ++{ ++ int free1 = -1, free2 = -1, err, name_len; ++ struct ext3_xattr_entry entry; ++ ++ ea_idebug(inode, "name=%d.%s", name_index, name); ++ ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255) ++ return -ERANGE; ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, bh, offset); ++ if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2, bh, offset); ++ } ++ if (err == 0 && size) ++ *size = le32_to_cpu(entry.e_value_size); ++ return err; ++} ++ + /* + * ext3_xattr_block_set() + * +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2005-04-05 12:27:00.597875304 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:28:26.989741744 +0800 +@@ -7,6 +7,6 @@ + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \ + extents.o +-ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ++ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o + ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch new file mode 100755 index 0000000..b39fb93 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch @@ -0,0 +1,230 @@ +Index: linux-2.6.10/fs/ext3/extents-in-ea.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.471494208 +0800 ++++ linux-2.6.10/fs/ext3/extents-in-ea.c 2005-03-31 19:41:09.580477640 +0800 +@@ -27,7 +27,7 @@ + #include + #include + #include +-#include ++#include 
"xattr.h" + #include + #include + +@@ -111,7 +111,7 @@ + err = ext3_extent_tree_init(handle, &tree); + ext3_release_tree_in_ea_desc(&tree); + out: +- ext3_journal_stop(handle, inode); ++ ext3_journal_stop(handle); + kfree(root); + return err; + } +@@ -134,24 +134,24 @@ + up(&EXT3_I(inode)->truncate_sem); + handle = ext3_journal_start(tree->inode, needed + 10); + if (IS_ERR(handle)) { +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + return PTR_ERR(handle); + } + + if (tgen != EXT_GENERATION(tree)) { + /* the tree has changed. so path can be invalid at moment */ +- ext3_journal_stop(handle, inode); +- down_write(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++ down(&EXT3_I(inode)->truncate_sem); + return EXT_REPEAT; + } + +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + + /* insert new extent */ + newex->ee_start = 0; + err = ext3_ext_insert_extent(handle, tree, path, newex); + if (!err) +- ext3_journal_stop(handle, tree->inode); ++ ext3_journal_stop(handle); + + return err; + } +@@ -165,11 +165,11 @@ + + err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); + if (err == 0) { +- down_write(&EXT3_I(inode)->truncate_sem); ++ down(&EXT3_I(inode)->truncate_sem); + err = ext3_ext_walk_space(&tree, from, num, + ext3_ext_in_ea_new_extent); + ext3_release_tree_in_ea_desc(&tree); +- up_write(&EXT3_I(inode)->truncate_sem); ++ up(&EXT3_I(inode)->truncate_sem); + } + return err; + } +@@ -222,3 +222,112 @@ + return err; + } + ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++struct ea_tree_desc { ++ int name_index; ++ char eaname[256]; ++}; ++ ++int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ ++ if (cmd == EXT3_IOC_EA_TREE_INIT) { ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&desc, (void *) arg, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea(inode, desc.name_index, ++ desc.eaname, 64); ++ } else if (cmd == EXT3_IOC_GET_EA_EXTENTS) { ++ struct ext3_extents_tree tree; ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea_desc(&tree, inode, ++ desc.name_index, desc.eaname); ++ if (err) ++ goto out; ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ 
ext3_ext_store_extent_cb); ++ if (err == 0) ++ err = buf.err; ++ ext3_release_tree_in_ea_desc(&tree); ++ } else if (cmd == EXT3_IOC_EA_TREE_ALLOCATE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_alloc_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } else if (cmd == EXT3_IOC_EA_TREE_REMOVE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_remove_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } ++ ++out: ++ return err; ++} ++ +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2005-03-31 19:41:09.365510320 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 19:41:09.580477640 +0800 +@@ -249,7 +249,13 @@ + case EXT3_IOC_GET_TREE_STATS: + case EXT3_IOC_GET_TREE_DEPTH: + return ext3_ext_ioctl(inode, filp, cmd, arg); +- ++ case EXT3_IOC_GET_EA_EXTENTS: ++ case EXT3_IOC_GET_EA_TREE_DEPTH: ++ case EXT3_IOC_GET_EA_TREE_STATS: ++ case EXT3_IOC_EA_TREE_INIT: ++ case EXT3_IOC_EA_TREE_ALLOCATE: ++ case EXT3_IOC_EA_TREE_REMOVE: ++ return ext3_ext_in_ea_ioctl(inode, filp, cmd, arg); + default: + return -ENOTTY; + } +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 19:41:09.366510168 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:43:30.539048680 +0800 +@@ -242,6 +242,15 @@ + #define EXT3_IOC_GET_EXTENTS _IOR('f', 10, long) + #define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 11, long) + #define EXT3_IOC_GET_TREE_STATS _IOR('f', 12, long) ++ ++#define EXT3_IOC_GET_EA_EXTENTS _IOR('f', 13, long) ++#define EXT3_IOC_GET_EA_TREE_DEPTH _IOR('f', 14, long) ++#define EXT3_IOC_GET_EA_TREE_STATS _IOR('f', 15, long) ++#define EXT3_IOC_EA_TREE_INIT _IOW('f', 16, long) ++#define EXT3_IOC_EA_TREE_ALLOCATE _IOW('f', 17, long) ++#define EXT3_IOC_EA_TREE_REMOVE _IOW('f', 18, long) ++ ++ + /* + * Structure of an inode on the disk + */ +@@ -788,7 +797,10 @@ + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); +- ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg); ++extern int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); diff --git a/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch new file mode 100755 index 0000000..973d02f --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch @@ -0,0 +1,281 @@ +Index: linux-2.6.10/fs/ext3/dir.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/dir.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/fs/ext3/dir.c 2005-03-31 18:56:02.961946200 +0800 +@@ -53,6 +53,9 @@ + + static unsigned char get_dtype(struct super_block *sb, int filetype) + { ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM)) ++ 
return DT_UNKNOWN; ++ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; +@@ -79,7 +82,8 @@ + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) +- error_msg = "inode out of bounds"; ++ if (de->file_type != 128) ++ error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext3_error (dir->i_sb, function, +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:41:15.880803032 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:56:02.960946352 +0800 +@@ -24,6 +24,7 @@ + * Theodore Ts'o, 2002 + */ + ++#include + #include + #include + #include +@@ -1148,6 +1149,23 @@ + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ unsigned type = de->file_type; ++ __u32 *mds; ++ mds = (__u32 *)((char *) de + EXT3_DIR_REC_LEN(de->name_len)); ++ if ((type & 128) && EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb, ++ EXT3_FEATURE_INCOMPAT_MDSNUM) && ++ mds[0] != EXT3_SB(dir->i_sb)->s_mdsnum) { ++ struct ext3_super_block *es; ++ es = EXT3_SB(dir->i_sb)->s_es; ++ brelse (bh); ++ dentry->d_flags |= DCACHE_CROSS_REF; ++ dentry->d_generation = mds[1]; ++ dentry->d_mdsnum = mds[0]; ++ dentry->d_inum = ino; ++ ext3_unlock_htree(dir, lock); ++ d_add(dentry, NULL); ++ return NULL; ++ } + ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); +@@ -1221,7 +1239,7 @@ + while (count--) { + struct ext3_dir_entry_2 *de = + (struct ext3_dir_entry_2 *) (from + map->offs); +- rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ rec_len = EXT3_DIR_REC_LEN_DE(de); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + cpu_to_le16(rec_len); +@@ -1243,7 +1261,7 @@ + next = (struct ext3_dir_entry_2 *) ((char *) de + + le16_to_cpu(de->rec_len)); + if (de->inode && de->name_len) { +- rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ rec_len = EXT3_DIR_REC_LEN_DE(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = cpu_to_le16(rec_len); +@@ -1359,6 +1377,7 @@ + struct buffer_head * bh) + { + struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block *sb = dir->i_sb; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; +@@ -1367,6 +1386,10 @@ + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM) ++ && (dentry->d_flags & DCACHE_CROSS_REF) ++ && (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum)) ++ reclen += 8; /* we need space to store mds num */ + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; +@@ -1380,7 +1403,7 @@ + brelse (bh); + return -EEXIST; + } +- nlen = EXT3_DIR_REC_LEN(de->name_len); ++ nlen = EXT3_DIR_REC_LEN_DE(de); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? 
rlen - nlen: rlen) >= reclen) + break; +@@ -1399,7 +1422,7 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT3_DIR_REC_LEN(de->name_len); ++ nlen = EXT3_DIR_REC_LEN_DE(de); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = +@@ -1411,8 +1434,20 @@ + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else ++ ext3_set_de_type(sb, de, inode->i_mode); ++ } else if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MDSNUM) ++ && (dentry->d_flags & DCACHE_CROSS_REF)) { ++ if (dentry->d_mdsnum != EXT3_SB(sb)->s_mdsnum) { ++ __u32 *mds; ++ mds = (__u32 *)((char *)de + EXT3_DIR_REC_LEN(namelen)); ++ mds[0] = cpu_to_le32(dentry->d_mdsnum); ++ mds[1] = cpu_to_le32(dentry->d_generation); ++ de->inode = cpu_to_le32(dentry->d_inum); ++ de->file_type = 128; ++ } else { ++ de->inode = cpu_to_le32(dentry->d_inum); ++ } ++ } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); +@@ -2737,6 +2772,81 @@ + } + + /* ++ * caller has to make sure directory is protected ++ */ ++int ext3_add_dir_entry(struct dentry *dentry) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ handle_t *handle; ++ int err; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { ++ return PTR_ERR(handle); ++ } ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ err = ext3_add_entry(handle, dentry, NULL); ++ ext3_journal_stop(handle); ++ return err; ++} ++EXPORT_SYMBOL(ext3_add_dir_entry); ++/* ++ * caller has to make sure directory is protected ++ */ ++int ext3_del_dir_entry(struct dentry *dentry) ++{ ++ struct inode * inode; ++ struct inode * dir = dentry->d_parent->d_inode; ++ struct buffer_head * bh; ++ struct ext3_dir_entry_2 * de; ++ handle_t *handle; ++ int retval; ++ void *lock = NULL; ++ ++ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { ++ return PTR_ERR(handle); ++ } ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ retval = -ENOENT; ++ bh = ext3_find_entry (dentry, &de, 1, &lock); ++ ext3_unlock_htree(dir, lock); ++ if (!bh) ++ goto end_unlink; ++ ++ inode = dentry->d_inode; ++ if (inode) ++ DQUOT_INIT(inode); ++ ++ retval = ext3_delete_entry(handle, dir, de, bh); ++ if (retval) ++ goto end_unlink; ++ dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ if (inode) { ++ inode->i_ctime = dir->i_ctime; ++ ext3_mark_inode_dirty(handle, inode); ++ if (S_ISDIR(inode->i_mode)) ++ dir->i_nlink--; ++ } ++ ext3_mark_inode_dirty(handle, dir); ++ retval = 0; ++ ++end_unlink: ++ ext3_journal_stop(handle); ++ brelse (bh); ++ return retval; ++} ++ ++EXPORT_SYMBOL(ext3_del_dir_entry); ++/* + * directories can handle most operations... 
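++ * Note: ext3_add_dir_entry()/ext3_del_dir_entry() above are exported
++ * so that an external user of ext3 (e.g. a metadata server stacked on
++ * top of it) can insert and remove raw directory entries; as their
++ * comments say, the caller itself must keep the directory protected.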
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:54:32.497698856 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:56:41.955018352 +0800 +@@ -483,7 +483,8 @@ + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ +- __u32 s_reserved[190]; /* Padding to the end of the block */ ++ __u32 s_mdsnum; ++ __u32 s_reserved[189]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -563,12 +564,14 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_MDSNUM 0x0020 /* direntry has mdsnum */ + #define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_MDSNUM| \ + EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -643,6 +646,9 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++#define EXT3_DIR_REC_LEN_DE(de) (EXT3_DIR_REC_LEN((de)->name_len) + \ ++ (((de)->file_type & 128) ? 8 : 0)) ++ + /* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 +@@ -868,6 +874,9 @@ + extern void ext3_ext_release(struct super_block *); + extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); + ++extern int ext3_add_dir_entry(struct dentry *dentry); ++ ++extern int ext3_del_dir_entry(struct dentry *dentry); + #endif /* __KERNEL__ */ + + #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) +Index: linux-2.6.10/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_sb.h 2005-03-31 18:44:21.076648984 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_sb.h 2005-03-31 18:56:02.964945744 +0800 +@@ -81,6 +81,7 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ u32 s_mdsnum; + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch new file mode 100644 index 0000000..022b8d0 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch @@ -0,0 +1,1202 @@ + fs/ext3/ialloc.c | 3 + fs/ext3/inode.c | 3 + fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 14 + + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 500 insertions(+), 109 deletions(-) + +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-03-31 19:44:54.251322480 +0800 +@@ -458,6 +458,9 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ dynlock_init(&ei->i_htree_lock); ++ 
sema_init(&ei->i_rename_sem, 1); ++ sema_init(&ei->i_append_sem, 1); + return &ei->vfs_inode; + } + +@@ -588,7 +591,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -637,6 +640,7 @@ + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, ++ {Opt_pdirops, "pdirops"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 19:48:53.958881392 +0800 +@@ -53,6 +53,9 @@ + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -60,6 +63,8 @@ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -133,6 +138,8 @@ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -141,6 +148,30 @@ + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -152,7 +183,7 @@ + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -164,15 +195,18 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); ++static void *ext3_lock_htree(struct inode *, unsigned long, int); ++static void 
ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -316,6 +350,94 @@ + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ ++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? */ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -325,19 +447,20 @@ + * back to userspace. 
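++ * With pdirops each frame additionally records frame->leaf and
++ * frame->curidx; after taking the per-block htree lock a caller
++ * re-validates the path with dx_check_path()/dx_check_full_path()
++ * and repeats the probe if an index split moved the hash it wants.
++ * The path must always be released again with dx_release().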
+ */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -353,8 +476,8 @@ + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -366,7 +489,19 @@ + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -374,56 +509,46 @@ + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 
0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from begining if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ bh = frame->bh; ++ indirect++; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -437,8 +562,7 @@ + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ if (frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -479,8 +603,10 @@ + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -506,13 +632,17 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, &err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -673,7 +803,8 @@ + count++; + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -706,7 +837,8 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -718,6 +850,7 @@ + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -798,7 +931,8 @@ + * to brelse() it when appropriate. + */ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -814,6 +948,7 @@ + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -822,9 +957,10 @@ + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -833,8 +969,14 @@ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -907,12 +1049,17 @@ + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -927,11 +1074,21 @@ + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get splitted ++ * so, we need another leaf. check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -966,6 +1123,8 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -976,14 +1135,16 @@ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -1005,17 +1166,19 @@ + struct dentry dotdot; + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; ++ void *lock = NULL; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_parent = child; /* confusing, isn't it! 
*/ + +- bh = ext3_find_entry(&dotdot, &de); ++ bh = ext3_find_entry(&dotdot, &de, 0, &lock); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); ++ ext3_unlock_htree(child->d_inode, lock); + inode = iget(child->d_inode->i_sb, ino); + + if (!inode) +@@ -1054,7 +1217,8 @@ + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = +@@ -1068,7 +1232,8 @@ + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1090,7 +1255,8 @@ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void **target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1137,23 +1303,30 @@ + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
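++	 * It is the block covering hinfo->hash: when that is the newly
++	 * allocated block it is locked here, and the lock is handed back
++	 * to the caller through *target so that add_dirent_to_buf() can
++	 * run under it.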
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1227,7 +1400,8 @@ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1286,6 +1460,7 @@ + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; ++ void *lock, *new_lock; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); +@@ -1305,6 +1480,8 @@ + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); +@@ -1334,13 +1511,25 @@ + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1369,11 +1558,13 @@ + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1384,30 +1575,52 @@ + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) +- return retval; ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + if (retval != -ENOSPC) + return retval; + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } 
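++	/* Recap of the retry protocol used at the top of this function
++	 * (it only has teeth when pdirops is enabled):
++	 *
++	 *	lock = ext3_lock_htree(dir, 0, 1);
++	 *	if (is_dx(dir)) {
++	 *		ext3_unlock_htree(dir, lock);
++	 *		goto repeat;	(someone else built the htree)
++	 *	}
++	 */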
+ #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) +- return retval; +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = cpu_to_le16(rlen = blocksize); +- nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +@@ -1425,15 +1638,27 @@ + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; +- +- frame = dx_probe(dentry, NULL, &hinfo, frames, &err); ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; ++ ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; +- +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) +- goto cleanup; ++ /* we're going to chage leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get splitted ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); ++ goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1446,6 +1671,35 @@ + goto cleanup; + } + ++ /* our leaf has no enough space. hence, we have to ++ * split it. so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? */ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ /* FIXME: error handling here */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. 
we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1457,7 +1711,8 @@ + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1468,6 +1723,7 @@ + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1479,27 +1735,73 @@ + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); +- dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ void *ri_lock; + +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's importand that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1508,38 +1810,60 @@ + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root. it's already initialized */ ++ /* we my unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index splitted */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = NULL; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1989,6 +2313,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ +@@ -1998,7 +2323,7 @@ + return PTR_ERR(handle); + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -2008,14 +2333,19 @@ + inode = dentry->d_inode; + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -2048,6 +2378,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ +@@ -2060,15 +2391,17 @@ + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -2077,6 +2410,7 @@ + inode->i_nlink = 1; + } + 
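++	/* the leaf lock taken by ext3_find_entry() above is still held
++	 * here, so the entry cannot move under us; it is dropped right
++	 * after ext3_delete_entry() below */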
retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2196,6 +2530,7 @@ + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2211,7 +2546,10 @@ + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process +@@ -2224,7 +2562,7 @@ + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2288,7 +2626,7 @@ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2331,6 +2669,14 @@ + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2339,6 +2685,29 @@ + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
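++ * When EXT3_MOUNT_PDIROPS (mount -o pdirops) is set, the operations
++ * above protect individual htree blocks through the primitives just
++ * defined, rather than relying on a single per-directory lock; the
++ * typical pattern, sketched:
++ *
++ *	void *lock = ext3_lock_htree(dir, block_nr, rwlock);
++ *	... read or modify that leaf/index block ...
++ *	ext3_unlock_htree(dir, lock);
++ *
++ * On a mount without pdirops ext3_lock_htree() returns NULL and both
++ * calls are no-ops.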
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.6.10/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs_i.h 2005-03-31 19:44:54.254322024 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct ext3_reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -125,6 +126,11 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 19:44:54.254322024 +0800 +@@ -355,6 +355,7 @@ + #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch b/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch new file mode 100644 index 0000000..d5de424 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch @@ -0,0 +1,192 @@ + fs/ext3/ialloc.c | 35 ++++++++++++++++++++++++++++++++++- + fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ + fs/ext3/namei.c | 21 +++++++++++++++++---- + include/linux/dcache.h | 5 +++++ + include/linux/ext3_fs.h | 5 ++++- + 5 files changed, 85 insertions(+), 6 deletions(-) + +Index: linux-2.6.10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ialloc.c 2005-03-31 18:19:50.911148112 +0800 ++++ linux-2.6.10/fs/ext3/ialloc.c 2005-03-31 18:39:48.578075064 +0800 +@@ -419,7 +419,8 @@ + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) ++struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode, ++ unsigned long goal) + { + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; +@@ -447,6 +448,38 @@ + + sbi = EXT3_SB(sb); + es = sbi->s_es; ++ if (goal) { ++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); ++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); ++ gdp = ext3_get_group_desc(sb, group, &bh2); ++ ++ err = -EIO; ++ bitmap_bh = read_inode_bitmap (sb, group); ++ if (!bitmap_bh) ++ goto fail; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), ++ ino, bitmap_bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto continue_allocation; ++ } ++ ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. 
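++	 * The wanted inode number arrives through dentry->d_fsdata (see
++	 * EXT3_IOC_CREATE_INUM in ioctl.c and ext3_new_inode_wantedi()
++	 * in namei.c); a goal of 0 simply takes the regular allocation
++	 * path below.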
++ */ ++ goto got; ++ } ++ ++continue_allocation: + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/fs/ext3/ioctl.c 2005-03-31 18:39:48.579074912 +0800 +@@ -9,6 +9,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -25,6 +26,31 @@ + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { ++ case EXT3_IOC_CREATE_INUM: { ++ char name[32]; ++ struct dentry *dchild, *dparent; ++ int rc = 0; ++ ++ dparent = list_entry(inode->i_dentry.next, struct dentry, ++ d_alias); ++ snprintf(name, sizeof name, "%lu", arg); ++ dchild = lookup_one_len(name, dparent, strlen(name)); ++ if (dchild->d_inode) { ++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", ++ dparent->d_name.len, dparent->d_name.name, arg, ++ dchild->d_inode->i_ino); ++ rc = -EEXIST; ++ } else { ++ dchild->d_fsdata = (void *)arg; ++ rc = vfs_create(inode, dchild, 0644, NULL); ++ if (rc) ++ printk(KERN_ERR "vfs_create: %d\n", rc); ++ else if (dchild->d_inode->i_ino != arg) ++ rc = -EEXIST; ++ } ++ dput(dchild); ++ return rc; ++ } + case EXT3_IOC_GETFLAGS: + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-03-31 18:36:12.177972880 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-03-31 18:39:48.582074456 +0800 +@@ -1940,6 +1940,19 @@ + return err; + } + ++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, ++ int mode, struct dentry *dentry) ++{ ++ unsigned long inum = 0; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct dentry_params *param = ++ (struct dentry_params *) dentry->d_fsdata; ++ inum = param->p_inum; ++ } ++ return ext3_new_inode(handle, dir, mode, inum); ++} ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1965,7 +1978,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; +@@ -1999,7 +2012,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +@@ -2035,7 +2048,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2450,7 +2463,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-03-31 18:38:11.720799608 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-03-31 18:40:36.630769944 +0800 +@@ -230,6 +230,7 @@ + #define 
EXT3_IOC_SETVERSION _IOW('f', 4, long) + #define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) + #define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) ++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ + #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG +@@ -742,7 +743,8 @@ + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); ++extern struct inode * ext3_new_inode (handle_t *, struct inode *, int, ++ unsigned long); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext3_count_free_inodes (struct super_block *); +@@ -834,4 +836,5 @@ + + #endif /* __KERNEL__ */ + ++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) + #endif /* _LINUX_EXT3_FS_H */ diff --git a/lustre/kernel_patches/patches/hostfs_readdir_large.patch b/lustre/kernel_patches/patches/hostfs_readdir_large.patch new file mode 100644 index 0000000..6ca6afd --- /dev/null +++ b/lustre/kernel_patches/patches/hostfs_readdir_large.patch @@ -0,0 +1,32 @@ +Index: linux-2.6.10/fs/hostfs/hostfs_user.c +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs_user.c 2004-12-25 05:35:15.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs_user.c 2005-03-31 19:26:03.810175656 +0800 +@@ -121,13 +121,26 @@ + { + DIR *dir = stream; + struct dirent *ent; ++ off_t off = 0; ++ off_t after_seek = 0; ++ off_t after_readdir = 0; ++ off_t after_readdir2 = 0; + + seekdir(dir, *pos); ++ after_seek = telldir(dir); + ent = readdir(dir); ++ after_readdir = telldir(dir); ++ if ( after_seek != after_readdir ) { ++ off = after_readdir; ++ } else { ++ readdir(dir); ++ after_readdir2 = telldir(dir); ++ off = after_readdir2; ++ } + if(ent == NULL) return(NULL); + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; +- *pos = telldir(dir); ++ *pos = off; + return(ent->d_name); + } + diff --git a/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch b/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch new file mode 100644 index 0000000..afbd4d9 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch @@ -0,0 +1,476 @@ + fs/ext3/inode.c | 3 + fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 15 ++ + fs/ext3/namei.c | 13 ++ + fs/ext3/super.c | 17 ++ + include/linux/ext3_fs.h | 2 + 7 files changed, 304 insertions(+), 1 deletion(-) + +Index: linux-2.6.10/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_fs.h 2005-04-05 12:25:13.635136112 +0800 ++++ linux-2.6.10/include/linux/ext3_fs.h 2005-04-05 12:25:13.801110880 +0800 +@@ -357,6 +357,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ ++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-05 12:25:13.726122280 +0800 ++++ linux-2.6.10/fs/ext3/inode.c 2005-04-05 
12:25:13.794111944 +0800 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -2411,6 +2412,9 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if (ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; + bh = iloc.bh; +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-05 12:25:13.728121976 +0800 ++++ linux-2.6.10/fs/ext3/super.c 2005-04-05 12:25:13.797111488 +0800 +@@ -592,6 +592,7 @@ + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -641,6 +642,9 @@ + {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_pdirops, "pdirops"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, + }; +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.10/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/iopen.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/iopen.c 2005-04-05 12:25:13.791112400 +0800 +@@ -0,0 +1,274 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
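++ * The name is parsed with simple_strtoul(), so -- with the filesystem
++ * mounted -o iopen, and /mnt standing in for the mount point -- a call
++ * like
++ *
++ *	fd = open("/mnt/__iopen__/1234", O_RDONLY);
++ *
++ * resolves straight to inode 1234.  "." names the __iopen__ directory
++ * itself and ".." maps to EXT3_ROOT_INO.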
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++				   struct nameidata *nd)
++{
++	struct inode *inode;
++	unsigned long ino;
++	struct list_head *lp;
++	struct dentry *alternate;
++	char buf[IOPEN_NAME_LEN];
++
++	if (dentry->d_name.len >= IOPEN_NAME_LEN)
++		return ERR_PTR(-ENAMETOOLONG);
++
++	memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++	buf[dentry->d_name.len] = 0;
++
++	if (strcmp(buf, ".") == 0)
++		ino = dir->i_ino;
++	else if (strcmp(buf, "..") == 0)
++		ino = EXT3_ROOT_INO;
++	else
++		ino = simple_strtoul(buf, 0, 0);
++
++	if ((ino != EXT3_ROOT_INO &&
++	     //ino != EXT3_ACL_IDX_INO &&
++	     //ino != EXT3_ACL_DATA_INO &&
++	     ino < EXT3_FIRST_INO(dir->i_sb)) ||
++	    ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
++		return ERR_PTR(-ENOENT);
++
++	inode = iget(dir->i_sb, ino);
++	if (!inode)
++		return ERR_PTR(-EACCES);
++	if (is_bad_inode(inode)) {
++		iput(inode);
++		return ERR_PTR(-ENOENT);
++	}
++
++	assert(list_empty(&dentry->d_alias));		/* d_instantiate */
++	assert(d_unhashed(dentry));			/* d_rehash */
++
++	/* preferably return a connected dentry */
++	spin_lock(&dcache_lock);
++	list_for_each(lp, &inode->i_dentry) {
++		alternate = list_entry(lp, struct dentry, d_alias);
++		assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++	}
++
++	if (!list_empty(&inode->i_dentry)) {
++		alternate = list_entry(inode->i_dentry.next,
++				       struct dentry, d_alias);
++		dget_locked(alternate);
++		spin_lock(&alternate->d_lock);
++		alternate->d_flags |= DCACHE_REFERENCED;
++		spin_unlock(&alternate->d_lock);
++		iput(inode);
++		spin_unlock(&dcache_lock);
++		return alternate;
++	}
++	dentry->d_flags |= DCACHE_DISCONNECTED;
++
++	/* d_add(), but don't drop dcache_lock before adding dentry to inode */
++	list_add(&dentry->d_alias, &inode->i_dentry);	/* d_instantiate */
++	dentry->d_inode = inode;
++
++	__d_rehash(dentry);				/* d_rehash */
++	spin_unlock(&dcache_lock);
++
++	return NULL;
++}
++
++#define do_switch(x,y) do { \
++	__typeof__ (x) __tmp = x; \
++	x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++	const unsigned char *old_name, *new_name;
++
++	memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN);
++	old_name = target->d_name.name;
++	new_name = dentry->d_name.name;
++	if (old_name == target->d_iname)
++		old_name = dentry->d_iname;
++	if (new_name == dentry->d_iname)
++		new_name = target->d_iname;
++	target->d_name.name = new_name;
++	dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
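++ *
++ * This is called with rehash=1 from ext3_lookup() and with rehash=0
++ * from ext3_add_link() in namei.c; it returns the reused (formerly
++ * disconnected) alias, or NULL once the plain dentry has been
++ * instantiated.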
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++				    int rehash)
++{
++	struct dentry *tmp, *goal = NULL;
++	struct list_head *lp;
++
++	/* verify this dentry is really new */
++	assert(dentry->d_inode == NULL);
++	assert(list_empty(&dentry->d_alias));		/* d_instantiate */
++	if (rehash)
++		assert(d_unhashed(dentry));		/* d_rehash */
++	assert(list_empty(&dentry->d_subdirs));
++
++	spin_lock(&dcache_lock);
++	if (!inode)
++		goto do_rehash;
++
++	/* preferably return a connected dentry */
++	list_for_each(lp, &inode->i_dentry) {
++		tmp = list_entry(lp, struct dentry, d_alias);
++		if (tmp->d_flags & DCACHE_DISCONNECTED) {
++			assert(tmp->d_alias.next == &inode->i_dentry);
++			assert(tmp->d_alias.prev == &inode->i_dentry);
++			goal = tmp;
++			dget_locked(goal);
++			break;
++		}
++	}
++
++	if (!goal)
++		goto do_instantiate;
++
++	/* Move the goal to the dentry hash queue */
++	goal->d_flags &= ~ DCACHE_DISCONNECTED;
++	security_d_instantiate(goal, inode);
++	__d_rehash(dentry);
++	__d_move(goal, dentry);
++	spin_unlock(&dcache_lock);
++	iput(inode);
++
++	return goal;
++
++	/* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++	list_add(&dentry->d_alias, &inode->i_dentry);	/* d_instantiate */
++	dentry->d_inode = inode;
++do_rehash:
++	if (rehash)
++		__d_rehash(dentry);			/* d_rehash */
++	spin_unlock(&dcache_lock);
++
++	return NULL;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++	lookup:		iopen_lookup,		/* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++	read:		generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++	int	len;
++
++	len = strlen(name);
++	if (dentry->d_name.len != len)
++		return 0;
++	if (strncmp(dentry->d_name.name, name, len))
++		return 0;
++	return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and the dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++	struct inode *inode;
++
++	if (dir->i_ino != EXT3_ROOT_INO ||
++	    !test_opt(dir->i_sb, IOPEN) ||
++	    !match_dentry(dentry, "__iopen__"))
++		return 0;
++
++	inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++	if (!inode)
++		return 0;
++	d_add(dentry, inode);
++	return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
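++ *
++ * The pseudo directory borrows EXT3_BAD_INO as its inode number, so
++ * it can never collide with a real on-disk inode.  It is root-owned
++ * and mode 0500 by default; mounting with iopen_nopriv widens that
++ * to 0777.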
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.10/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.10.orig/fs/ext3/iopen.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/ext3/iopen.h 2005-04-05 12:25:13.792112248 +0800 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.10/fs/ext3/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/ext3/Makefile 2004-12-25 05:33:52.000000000 +0800 ++++ linux-2.6.10/fs/ext3/Makefile 2005-04-05 12:26:06.897039072 +0800 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2005-04-05 12:25:13.633136416 +0800 ++++ linux-2.6.10/fs/ext3/namei.c 2005-04-05 12:25:13.799111184 +0800 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -1140,6 +1141,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { +@@ -1151,10 +1155,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2367,10 +2369,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/
+-	inode->i_size = 0;
+ 	ext3_orphan_add(handle, inode);
+ 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);
+@@ -2497,6 +2495,23 @@
+ 	return err;
+ }
+ 
++/* Like ext3_add_nondir() except for the call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++			 struct inode *inode)
++{
++	int err = ext3_add_entry(handle, dentry, inode);
++	if (!err) {
++		err = ext3_mark_inode_dirty(handle, inode);
++		if (err == 0) {
++			dput(iopen_connect_dentry(dentry, inode, 0));
++			return 0;
++		}
++	}
++	ext3_dec_count(handle, inode);
++	iput(inode);
++	return err;
++}
++
+ static int ext3_link (struct dentry * old_dentry,
+ 		struct inode * dir, struct dentry *dentry)
+ {
+@@ -2520,7 +2535,8 @@
+ 	ext3_inc_count(handle, inode);
+ 	atomic_inc(&inode->i_count);
+ 
+-	err = ext3_add_nondir(handle, dentry, inode);
++	err = ext3_add_link(handle, dentry, inode);
++	ext3_orphan_del(handle,inode);
+ 	ext3_journal_stop(handle);
+ 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ 		goto retry;
diff --git a/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch
new file mode 100644
index 0000000..64085b9
--- /dev/null
+++ b/lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch
@@ -0,0 +1,222 @@
+--- 1.46/include/linux/jbd.h	2004-10-19 03:40:17 -06:00
++++ 1.47/include/linux/jbd.h	2004-11-07 19:13:24 -07:00
+@@ -352,6 +352,27 @@
+ 	bit_spin_unlock(BH_JournalHead, &bh->b_state);
+ }
+ 
++#define HAVE_JOURNAL_CALLBACK_STATUS
++/**
++ * struct journal_callback - Base structure for callback information.
++ * @jcb_list: list information for other callbacks attached to the same handle.
++ * @jcb_func: Function to call with this callback structure.
++ *
++ * This struct is a 'seed' structure for use with your own callback
++ * structs.  If you are using callbacks, you must allocate one of these
++ * or another struct of your own definition which has this struct
++ * as its first element and pass it to journal_callback_set().
++ *
++ * This is used internally by jbd to maintain callback information.
++ *
++ * See journal_callback_set for more information.
++ **/
++struct journal_callback {
++	struct list_head jcb_list;	/* t_jcb_lock */
++	void (*jcb_func)(struct journal_callback *jcb, int error);
++	/* user data goes here */
++};
++
+ struct jbd_revoke_table_s;
+ 
+ /**
+@@ -360,6 +381,7 @@
+  * @h_transaction: Which compound transaction is this update a part of?
+  * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+  * @h_ref: Reference count on this handle
++ * @h_jcb: List of application registered callbacks for this handle.
+  * @h_err: Field for caller's use to track errors through large fs operations
+  * @h_sync: flag for sync-on-close
+  * @h_jdata: flag to force data journaling
+@@ -385,6 +407,13 @@
+ 	/* operations */
+ 	int			h_err;
+ 
++	/*
++	 * List of application registered callbacks for this handle. The
++	 * function(s) will be called after the transaction that this handle is
++	 * part of has been committed to disk.
[t_jcb_lock] ++ */ ++ struct list_head h_jcb; ++ + /* Flags [no locking] */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -426,6 +455,8 @@ + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * ++ * t_handle_lock ++ * ->t_jcb_lock + */ + + struct transaction_s +@@ -549,6 +580,15 @@ + */ + int t_handle_count; + ++ /* ++ * Protects the callback list ++ */ ++ spinlock_t t_jcb_lock; ++ /* ++ * List of registered callback functions for this transaction. ++ * Called when the transaction is committed. [t_jcb_lock] ++ */ ++ struct list_head t_jcb; + }; + + /** +@@ -881,6 +921,10 @@ + extern int journal_try_to_free_buffers(journal_t *, struct page *, int); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); ++extern void journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); ++ + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); + +--- 1.23/fs/jbd/checkpoint.c 2003-07-10 23:23:54 -06:00 ++++ 1.24/fs/jbd/checkpoint.c 2004-11-07 19:13:24 -07:00 +@@ -616,6 +616,7 @@ + J_ASSERT(transaction->t_log_list == NULL); + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_updates == 0); ++ J_ASSERT(list_empty(&transaction->t_jcb)); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + + +--- 1.53/fs/jbd/commit.c 2004-10-19 03:40:17 -06:00 ++++ 1.54/fs/jbd/commit.c 2004-11-07 19:13:24 -07:00 +@@ -686,6 +686,30 @@ + if (err) + __journal_abort_hard(journal); + ++ /* ++ * Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. ++ * ++ * The spinlocking (t_jcb_lock) here is surely unnecessary... 
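++	 * by the time we get here t_updates has dropped to zero, so no
++	 * handle can still be attached and splicing new entries onto t_jcb
++	 * in journal_stop().  We keep it anyway, since each jcb_func() is
++	 * invoked with the lock dropped and the list is walked with
++	 * list_for_each_safe().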
++ */ ++ spin_lock(&commit_transaction->t_jcb_lock); ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ jcb->jcb_func(jcb, error); ++ spin_lock(&commit_transaction->t_jcb_lock); ++ } ++ } ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + +--- 1.77/fs/jbd/journal.c 2004-09-21 20:58:08 -06:00 ++++ 1.78/fs/jbd/journal.c 2004-11-07 19:13:24 -07:00 +@@ -55,6 +55,7 @@ + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); ++EXPORT_SYMBOL(journal_callback_set); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); +@@ -78,6 +79,7 @@ + EXPORT_SYMBOL(journal_blocks_per_page); + EXPORT_SYMBOL(journal_invalidatepage); + EXPORT_SYMBOL(journal_try_to_free_buffers); ++EXPORT_SYMBOL(journal_bmap); + EXPORT_SYMBOL(journal_force_commit); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); + +--- 1.89/fs/jbd/transaction.c 2004-10-19 03:40:17 -06:00 ++++ 1.90/fs/jbd/transaction.c 2004-11-07 19:13:24 -07:00 +@@ -50,7 +50,9 @@ + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + spin_lock_init(&transaction->t_handle_lock); ++ spin_lock_init(&transaction->t_jcb_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer->expires = transaction->t_expires; +@@ -241,6 +243,7 @@ + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); + + return handle; + } +@@ -1274,6 +1277,36 @@ + } + + /** ++ * void journal_callback_set() - Register a callback function for this handle. ++ * @handle: handle to attach the callback to. ++ * @func: function to callback. ++ * @jcb: structure with additional information required by func() , and ++ * some space for jbd internal information. ++ * ++ * The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. ++ */ ++void journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ spin_lock(&handle->h_transaction->t_jcb_lock); ++ list_add_tail(&jcb->jcb_list, &handle->h_jcb); ++ spin_unlock(&handle->h_transaction->t_jcb_lock); ++ jcb->jcb_func = func; ++} ++ ++/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. 
+ *
+@@ -1338,6 +1371,11 @@
+ 		if (journal->j_barrier_count)
+ 			wake_up(&journal->j_wait_transaction_locked);
+ 	}
++
++	/* Move callbacks from the handle to the transaction. */
++	spin_lock(&transaction->t_jcb_lock);
++	list_splice(&handle->h_jcb, &transaction->t_jcb);
++	spin_unlock(&transaction->t_jcb_lock);
+ 
+ 	/*
+ 	 * If the handle is marked SYNC, we need to set another commit
+
diff --git a/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch b/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch
new file mode 100644
index 0000000..1ac66bc
--- /dev/null
+++ b/lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch
@@ -0,0 +1,399 @@
+fix against credits leak in journal_release_buffer()
+
+The idea is to charge a buffer at the time of modification
+(journal_dirty_metadata()), not at the time of access
+(journal_get_*_access()).  Each buffer has a flag that the first call to
+journal_dirty_metadata() sets on the buffer.
+
+Signed-off-by: Alex Tomas
+
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c	2004-12-25 05:34:45.000000000 +0800
++++ linux-2.6.10/fs/ext3/ialloc.c	2005-03-31 18:11:10.672236448 +0800
+@@ -474,11 +474,9 @@
+ 	ino = ext3_find_next_zero_bit((unsigned long *)
+ 			bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
+ 	if (ino < EXT3_INODES_PER_GROUP(sb)) {
+-		int credits = 0;
+ 
+ 		BUFFER_TRACE(bitmap_bh, "get_write_access");
+-		err = ext3_journal_get_write_access_credits(handle,
+-						bitmap_bh, &credits);
++		err = ext3_journal_get_write_access(handle, bitmap_bh);
+ 		if (err)
+ 			goto fail;
+ 
+@@ -494,7 +492,7 @@
+ 			goto got;
+ 		}
+ 		/* we lost it */
+-		journal_release_buffer(handle, bitmap_bh, credits);
++		journal_release_buffer(handle, bitmap_bh);
+ 
+ 		if (++ino < EXT3_INODES_PER_GROUP(sb))
+ 			goto repeat_in_this_group;
+Index: linux-2.6.10/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.c	2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/ext3/xattr.c	2005-03-31 18:11:10.675235992 +0800
+@@ -507,8 +507,7 @@
+ 			goto skip_get_write_access;
+ 		/* ext3_journal_get_write_access() requires an unlocked bh,
+ 		   which complicates things here. */
+-		error = ext3_journal_get_write_access_credits(handle, bh,
+-							      &credits);
++		error = ext3_journal_get_write_access(handle, bh);
+ 		if (error)
+ 			goto cleanup;
+ 		ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
+@@ -525,7 +524,7 @@
+ 			if (ce)
+ 				mb_cache_entry_release(ce);
+ 			unlock_buffer(bh);
+-			journal_release_buffer(handle, bh, credits);
++			journal_release_buffer(handle, bh);
+ 		skip_get_write_access:
+ 			ea_bdebug(bh, "cloning");
+ 			header = kmalloc(bh->b_size, GFP_KERNEL);
+@@ -669,8 +668,7 @@
+ 			error = -EDQUOT;
+ 			if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+ 				unlock_buffer(new_bh);
+-				journal_release_buffer(handle, new_bh,
+-						       credits);
++				journal_release_buffer(handle, new_bh);
+ 				goto cleanup;
+ 			}
+ 			HDR(new_bh)->h_refcount = cpu_to_le32(1 +
+@@ -986,8 +984,7 @@
+ 		ext3_error(inode->i_sb, "ext3_xattr_cache_find",
+ 			   "inode %ld: block %ld read error",
+ 			   inode->i_ino, (unsigned long) ce->e_block);
+-	} else if (ext3_journal_get_write_access_credits(
+-						handle, bh, credits) == 0) {
++	} else if (ext3_journal_get_write_access(handle, bh) == 0) {
+ 		/* ext3_journal_get_write_access() requires an unlocked
+ 		 * bh, which complicates things here. */
+ 		lock_buffer(bh);
+@@ -1003,7 +1000,7 @@
+ 			return bh;
+ 		}
+ 		unlock_buffer(bh);
+-		journal_release_buffer(handle, bh, *credits);
++		journal_release_buffer(handle, bh);
+ 		*credits = 0;
+ 		brelse(bh);
+ 	}
+Index: linux-2.6.10/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/balloc.c	2004-12-25 05:33:50.000000000 +0800
++++ linux-2.6.10/fs/ext3/balloc.c	2005-03-31 18:14:05.705627328 +0800
+@@ -342,7 +342,7 @@
+ 	 */
+ 	/* @@@ check errors */
+ 	BUFFER_TRACE(bitmap_bh, "getting undo access");
+-	err = ext3_journal_get_undo_access(handle, bitmap_bh, NULL);
++	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ 	if (err)
+ 		goto error_return;
+ 
+@@ -986,7 +986,6 @@
+ 	unsigned long group_first_block;
+ 	int ret = 0;
+ 	int fatal;
+-	int credits = 0;
+ 
+ 	*errp = 0;
+ 
+@@ -996,7 +995,7 @@
+ 	 * if the buffer is in BJ_Forget state in the committing transaction.
+ 	 */
+ 	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+-	fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
++	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+ 	if (fatal) {
+ 		*errp = fatal;
+ 		return -1;
+@@ -1087,7 +1086,7 @@
+ 	}
+ 
+ 	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+-	ext3_journal_release_buffer(handle, bitmap_bh, credits);
++	ext3_journal_release_buffer(handle, bitmap_bh);
+ 	return ret;
+ }
+ 
+Index: linux-2.6.10/fs/jbd/commit.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/commit.c	2004-12-25 05:35:27.000000000 +0800
++++ linux-2.6.10/fs/jbd/commit.c	2005-03-31 18:11:10.668237056 +0800
+@@ -204,6 +204,19 @@
+ 	}
+ 
+ 	/*
++	 * First, drop the modified flag: all accesses to the buffers
++	 * will be tracked for a new transaction only -bzzz
++	 */
++	if (commit_transaction->t_buffers) {
++		new_jh = jh = commit_transaction->t_buffers->b_tnext;
++		do {
++			J_ASSERT_JH(new_jh, new_jh->b_modified == 1);
++			new_jh->b_modified = 0;
++			new_jh = new_jh->b_tnext;
++		} while (new_jh != jh);
++	}
++
++	/*
+ 	 * Now try to drop any written-back buffers from the journal's
+ 	 * checkpoint lists.  We do this *before* commit because it potentially
+ 	 * frees some memory
+Index: linux-2.6.10/fs/jbd/transaction.c
+===================================================================
+--- linux-2.6.10.orig/fs/jbd/transaction.c	2005-03-31 15:35:26.000000000 +0800
++++ linux-2.6.10/fs/jbd/transaction.c	2005-03-31 18:11:10.666237360 +0800
+@@ -522,7 +522,7 @@
+  */
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh,
+-			int force_copy, int *credits)
++			int force_copy)
+ {
+ 	struct buffer_head *bh;
+ 	transaction_t *transaction;
+@@ -604,11 +604,6 @@
+ 			JBUFFER_TRACE(jh, "has frozen data");
+ 			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 			jh->b_next_transaction = transaction;
+-
+-			J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+-			handle->h_buffer_credits--;
+-			if (credits)
+-				(*credits)++;
+ 			goto done;
+ 		}
+ 
+@@ -688,10 +683,6 @@
+ 		jh->b_next_transaction = transaction;
+ 	}
+ 
+-	J_ASSERT(handle->h_buffer_credits > 0);
+-	handle->h_buffer_credits--;
+-	if (credits)
+-		(*credits)++;
+ 
+ 	/*
+ 	 * Finally, if the buffer is not journaled right now, we need to make
+@@ -749,8 +740,7 @@
+  * because we're write()ing a buffer which is also part of a shared mapping.
+ */ + +-int journal_get_write_access(handle_t *handle, +- struct buffer_head *bh, int *credits) ++int journal_get_write_access(handle_t *handle, struct buffer_head *bh) + { + struct journal_head *jh = journal_add_journal_head(bh); + int rc; +@@ -758,7 +748,7 @@ + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ +- rc = do_get_write_access(handle, jh, 0, credits); ++ rc = do_get_write_access(handle, jh, 0); + journal_put_journal_head(jh); + return rc; + } +@@ -814,9 +804,6 @@ + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + +- J_ASSERT_JH(jh, handle->h_buffer_credits > 0); +- handle->h_buffer_credits--; +- + if (jh->b_transaction == NULL) { + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); +@@ -869,8 +856,7 @@ + * + * Returns error number or 0 on success. + */ +-int journal_get_undo_access(handle_t *handle, struct buffer_head *bh, +- int *credits) ++int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) + { + int err; + struct journal_head *jh = journal_add_journal_head(bh); +@@ -883,7 +869,7 @@ + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ +- err = do_get_write_access(handle, jh, 1, credits); ++ err = do_get_write_access(handle, jh, 1); + if (err) + goto out; + +@@ -1111,6 +1097,17 @@ + + jbd_lock_bh_state(bh); + ++ if (jh->b_modified == 0) { ++ /* ++ * This buffer's got modified and becoming part ++ * of the transaction. This needs to be done ++ * once a transaction -bzzz ++ */ ++ jh->b_modified = 1; ++ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); ++ handle->h_buffer_credits--; ++ } ++ + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. +@@ -1161,24 +1158,11 @@ + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * +- * The caller passes in the number of credits which should be put back for +- * this buffer (zero or one). +- * +- * We leave the buffer attached to t_reserved_list because even though this +- * handle doesn't want it, some other concurrent handle may want to journal +- * this buffer. If that handle is curently in between get_write_access() and +- * journal_dirty_metadata() then it expects the buffer to be reserved. If +- * we were to rip it off t_reserved_list here, the other handle will explode +- * when journal_dirty_metadata is presented with a non-reserved buffer. +- * +- * If nobody really wants to journal this buffer then it will be thrown +- * away at the start of commit. 
+ */ + void +-journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits) ++journal_release_buffer(handle_t *handle, struct buffer_head *bh) + { + BUFFER_TRACE(bh, "entry"); +- handle->h_buffer_credits += credits; + } + + /** +@@ -1222,6 +1206,12 @@ + goto not_jbd; + } + ++ /* ++ * The buffer's going from the transaction, we must drop ++ * all references -bzzz ++ */ ++ jh->b_modified = 0; ++ + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + +@@ -2015,7 +2005,10 @@ + __journal_unfile_buffer(jh); + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; +- __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); ++ if (jh->b_modified == 1) ++ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); ++ else ++ __journal_file_buffer(jh, jh->b_transaction, BJ_Reserved); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) +Index: linux-2.6.10/include/linux/journal-head.h +=================================================================== +--- linux-2.6.10.orig/include/linux/journal-head.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/journal-head.h 2005-03-31 18:11:10.658238576 +0800 +@@ -32,6 +32,13 @@ + unsigned b_jlist; + + /* ++ * This flag signals the buffer has been modified by ++ * the currently running transaction ++ * [jbd_lock_bh_state()] ++ */ ++ unsigned b_modified; ++ ++ /* + * Copy of the buffer data frozen for writing to the log. + * [jbd_lock_bh_state()] + */ +Index: linux-2.6.10/include/linux/jbd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/jbd.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/jbd.h 2005-03-31 18:12:52.504755552 +0800 +@@ -867,15 +867,12 @@ + extern handle_t *journal_start(journal_t *, int nblocks); + extern int journal_restart (handle_t *, int nblocks); + extern int journal_extend (handle_t *, int nblocks); +-extern int journal_get_write_access(handle_t *, struct buffer_head *, +- int *credits); ++extern int journal_get_write_access(handle_t *, struct buffer_head *); + extern int journal_get_create_access (handle_t *, struct buffer_head *); +-extern int journal_get_undo_access(handle_t *, struct buffer_head *, +- int *credits); ++extern int journal_get_undo_access(handle_t *, struct buffer_head *); + extern int journal_dirty_data (handle_t *, struct buffer_head *); + extern int journal_dirty_metadata (handle_t *, struct buffer_head *); +-extern void journal_release_buffer (handle_t *, struct buffer_head *, +- int credits); ++extern void journal_release_buffer (handle_t *, struct buffer_head *); + extern int journal_forget (handle_t *, struct buffer_head *); + extern void journal_sync_buffer (struct buffer_head *); + extern int journal_invalidatepage(journal_t *, +Index: linux-2.6.10/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ext3_jbd.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/ext3_jbd.h 2005-03-31 18:11:10.660238272 +0800 +@@ -113,9 +113,9 @@ + + static inline int + __ext3_journal_get_undo_access(const char *where, handle_t *handle, +- struct buffer_head *bh, int *credits) ++ struct buffer_head *bh) + { +- int err = journal_get_undo_access(handle, bh, credits); ++ int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +@@ -123,19 +123,18 @@ + + static 
inline int
+ __ext3_journal_get_write_access(const char *where, handle_t *handle,
+-				struct buffer_head *bh, int *credits)
++				struct buffer_head *bh)
+ {
+-	int err = journal_get_write_access(handle, bh, credits);
++	int err = journal_get_write_access(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline void
+-ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh,
+-			    int credits)
++ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+ {
+-	journal_release_buffer(handle, bh, credits);
++	journal_release_buffer(handle, bh);
+ }
+ 
+ static inline int
+@@ -178,12 +177,10 @@
+ }
+ 
+ 
+-#define ext3_journal_get_undo_access(handle, bh, credits) \
+-	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh), (credits))
++#define ext3_journal_get_undo_access(handle, bh) \
++	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+-	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), NULL)
+-#define ext3_journal_get_write_access_credits(handle, bh, credits) \
+-	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh), (credits))
++	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+ 	__ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
diff --git a/lustre/kernel_patches/patches/kgdb-ga.patch b/lustre/kernel_patches/patches/kgdb-ga.patch
new file mode 100644
index 0000000..679853f
--- /dev/null
+++ b/lustre/kernel_patches/patches/kgdb-ga.patch
@@ -0,0 +1,6358 @@
+
+
+This kgdb will get called and will trap almost any kernel
+fault WITHOUT BEING ARMED.
+
+It is entered at boot time via "kgdb" in the boot string,
+not "gdb".  This entry occurs when the first setup on the
+boot string is called, not sometime later.  You will not
+find a "waiting for gdb" on your console, as the console has
+not yet been enabled at this time.  (Note, this early stuff
+is a bit fragile as the full trap table has yet to be
+loaded, something I might address, sometime...  So don't try
+to look at memory that can not be reached, for example.
+Once the full trap table is loaded this restriction goes
+away.)
+
+If you hard code it, you can put a breakpoint() as the FIRST
+LINE OF C CODE.
+
+It does NOT use the serial driver, but if the serial driver
+is loaded, it tells it to release the port to avoid
+conflict.
+
+The threads stuff is not configurable, does not require
+redirection of schedule() calls and does backtrack to the
+first non-schedule() caller on the info threads command.  If
+you switch to the thread, however, it will show it in the
+switch code (as it should).
+
+It is MUCH more aggressive and paranoid about grabbing the
+other cpus on entry.  It issues a "send_nmi_all_but_self()"
+rather than depending on them to interrupt or hit an NMI
+sometime in the distant future.  If a cpu does not come to
+the party, it will continue without it so all is not lost.
+
+It does not have anything to do with IOCTL calls, but does
+do the control-C thing.
+
+There is a LOT of info in the patch which ends up in
+.../Documentation/i386/kgdb/*
+
+There is a nifty little thing called kgdb_ts() (kgdb time
+stamp) which is a function you can code calls to and which
+puts some useful stuff in a circular buffer which can be
+examined with the supplied gdb macros.
+
+It also allows you to do "p foobar(...)", i.e.
+to call a function from gdb, just like gdb allows in program
+debugging.
+
+In an SMP system, you can choose to "hold" any given set of
+cpus.  It also defaults to holding other cpus on single step
+(this can be overridden).
+
+This said, you can imagine my consternation when I found it
+"lost it" on continues on 2.5.  I found and fixed this early
+this pm; it was a hold-cpu-on-exit goof on my part.
+
+Oh, and a final point, the configure options are more
+extensive (the serial port is set up here, for example, since
+we can not wait for a command line to do this).  There is one
+to do system call exit tests.  This is VERY new and causes the
+kernel to hit a hard "int 3" if a system call attempts to
+exit with preempt count other than zero.  This is a fault,
+of course, but the current 2.5 is full of them so I don't
+recommend turning this on.
+
+
+DESC
+kgdb: warning fix
+EDESC
+From: Ingo Molnar
+
+this patch fixes a deprecated use of asm input operands.  (and shuts up a
+gcc 3.3 warning.)
+
+DESC
+kgdb buffer overflow fix
+EDESC
+From: George Anzinger
+
+
+DESC
+kgdb: CONFIG_DEBUG_INFO fix
+EDESC
+From: Thomas Schlichter
+
+that patch sets DEBUG_INFO to y by default, even if neither DEBUG_KERNEL nor
+KGDB is enabled.  The attached patch changes this to enable DEBUG_INFO by
+default only if KGDB is enabled.
+
+DESC
+x86_64 fixes
+EDESC
+From: Andi Kleen
+
+Fix x86_64 for kgdb.  We forget why.
+DESC
+correct kgdb.txt Documentation link (against 2.6.1-rc1-mm2)
+EDESC
+From: Jesper Juhl
+
+The help text for "config KGDB" in arch/i386/Kconfig refers to
+Documentation/i386/kgdb.txt - the actual location is
+Documentation/i386/kgdb/kgdb.txt - patch below to fix that.
+
+DESC
+kgdb: fix for recent gcc
+EDESC
+
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:97: error: conflicting types for 'int3'
+arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+arch/i386/kernel/traps.c:99: error: conflicting types for 'debug'
+arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here
+
+DESC
+kgdb warning fixes
+EDESC
+
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'time' might be used uninitialized in this function
+arch/i386/kernel/kgdb_stub.c:1306: warning: 'dum' might be used uninitialized in this function
+DESC
+THREAD_SIZE fixes for kgdb
+EDESC
+From: Matt Mackall
+
+Noticed the THREAD_SIZE clean-ups are in -mm now.  Here are the missing
+bits for kgdb, tested in -tiny with 4k stacks.
+DESC
+Fix stack overflow test for non-8k stacks
+EDESC
+From: Matt Mackall
+
+This is needed to work properly with 4k and 16k stacks.
+DESC
+kgdb-ga.patch fix for i386 single-step into sysenter
+EDESC
+From: Roland McGrath
+
+Using kgdb-ga.patch from -mm, if userland single-steps (PTRACE_SINGLESTEP)
+into the `sysenter' instruction, kgdb reports a bogus trap:
+
+	Program received signal SIGTRAP, Trace/breakpoint trap.
+	sysenter_past_esp () at arch/i386/kernel/entry.S:249
+	1: x/i $pc  0xc0106023 : sti
+	(gdb)
+
+The hackery in the "FIX_STACK" macro in entry.S changes the saved PC for
+the spurious kernel-mode debug trap when TF was set on user-mode execution
+of `sysenter', so sysenter_past_esp is where it actually lies in this case.
+The following patch removes the kgdb hiccup when userland
+PTRACE_SINGLESTEP's into sysenter.
+DESC
+fix TRAP_BAD_SYSCALL_EXITS on i386
+EDESC
+From: Andy Whitcroft
+
+We are not using the right offset name, nor the right address when checking
+for a non-zero preempt count.  Move to TI_preempt_count(%ebp).
+
+Signed-off-by: Andy Whitcroft
+DESC
+add TRAP_BAD_SYSCALL_EXITS config for i386
+EDESC
+From: Andy Whitcroft
+
+There seems to be code recently added to -bk and thereby -mm which supports
+extra debug for preempt on system call exit.  Oddly there don't seem to
+be configuration options to enable them.  Below is a possible patch to
+allow enabling this on i386.  Sadly the most obvious menu to add this to is
+the Kernel Hacking menu, but that is defined in architecture-specific
+configuration.  If this makes sense I could patch the other arches?
+
+Add a configuration option to the Kernel Hacking menu to allow enabling
+TRAP_BAD_SYSCALL_EXITS.
+
+Signed-off-by: Andy Whitcroft
+DESC
+kgdb-is-incompatible-with-kprobes
+EDESC
+DESC
+kgdb-ga-build-fix
+EDESC
+DESC
+kgdb-ga-fixes
+EDESC
+Signed-off-by: Andrew Morton
+Index: linux-2.6.10/include/asm-i386/kgdb_local.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/kgdb_local.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-i386/kgdb_local.h	2005-04-05 12:48:05.371600472 +0800
+@@ -0,0 +1,102 @@
++#ifndef __KGDB_LOCAL
++#define __KGDB_LOCAL
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#define PORT 0x3f8
++#ifdef CONFIG_KGDB_PORT
++#undef PORT
++#define PORT CONFIG_KGDB_PORT
++#endif
++#define IRQ 4
++#ifdef CONFIG_KGDB_IRQ
++#undef IRQ
++#define IRQ CONFIG_KGDB_IRQ
++#endif
++#define SB_CLOCK 1843200
++#define SB_BASE (SB_CLOCK/16)
++#define SB_BAUD9600 SB_BASE/9600
++#define SB_BAUD192 SB_BASE/19200
++#define SB_BAUD384 SB_BASE/38400
++#define SB_BAUD576 SB_BASE/57600
++#define SB_BAUD1152 SB_BASE/115200
++#ifdef CONFIG_KGDB_9600BAUD
++#define SB_BAUD SB_BAUD9600
++#endif
++#ifdef CONFIG_KGDB_19200BAUD
++#define SB_BAUD SB_BAUD192
++#endif
++#ifdef CONFIG_KGDB_38400BAUD
++#define SB_BAUD SB_BAUD384
++#endif
++#ifdef CONFIG_KGDB_57600BAUD
++#define SB_BAUD SB_BAUD576
++#endif
++#ifdef CONFIG_KGDB_115200BAUD
++#define SB_BAUD SB_BAUD1152
++#endif
++#ifndef SB_BAUD
++#define SB_BAUD SB_BAUD1152	/* Start with this if not given */
++#endif
++
++#ifndef CONFIG_X86_TSC
++#undef rdtsc
++#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;}
++#undef rdtscll
++#define rdtscll(s) s++
++#endif
++
++#ifdef _raw_read_unlock	/* must use a name that is "define"ed, not an inline */
++#undef spin_lock
++#undef spin_trylock
++#undef spin_unlock
++#define spin_lock	_raw_spin_lock
++#define spin_trylock	_raw_spin_trylock
++#define spin_unlock	_raw_spin_unlock
++#else
++#endif
++#undef spin_unlock_wait
++#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \
++				while(spin_is_locked(x))
++
++#define SB_IER 1
++#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS
++
++#define FLAGS 0
++#define SB_STATE { \
++     magic: SSTATE_MAGIC, \
++     baud_base: SB_BASE, \
++     port: PORT, \
++     irq: IRQ, \
++     flags: FLAGS, \
++     
custom_divisor:SB_BAUD} ++#define SB_INFO { \ ++ magic: SERIAL_MAGIC, \ ++ port: PORT,0,FLAGS, \ ++ state: &state, \ ++ tty: (struct tty_struct *)&state, \ ++ IER: SB_IER, \ ++ MCR: SB_MCR} ++extern void putDebugChar(int); ++/* RTAI support needs us to really stop/start interrupts */ ++ ++#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") ++#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") ++#define kgdb_local_save_flags(x) __asm__ __volatile__(\ ++ "pushfl ; popl %0":"=g" (x): /* no input */) ++#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ ++ "pushl %0 ; popfl": \ ++ /* no output */ :"g" (x):"memory", "cc") ++#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() ++ ++#ifdef CONFIG_SERIAL ++extern void shutdown_for_kgdb(struct async_struct *info); ++#endif ++#define INIT_KDEBUG putDebugChar("+"); ++#endif /* __KGDB_LOCAL */ +Index: linux-2.6.10/include/asm-i386/kgdb.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kgdb.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/kgdb.h 2005-04-05 12:48:05.399596216 +0800 +@@ -0,0 +1,59 @@ ++#ifndef __KGDB ++#define __KGDB ++ ++/* ++ * This file should not include ANY others. This makes it usable ++ * most anywhere without the fear of include order or inclusion. ++ * Make it so! ++ * ++ * This file may be included all the time. It is only active if ++ * CONFIG_KGDB is defined, otherwise it stubs out all the macros ++ * and entry points. ++ */ ++#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) ++ ++extern void breakpoint(void); ++#define INIT_KGDB_INTS kgdb_enable_ints() ++ ++#ifndef BREAKPOINT ++#define BREAKPOINT asm(" int $3") ++#endif ++/* ++ * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' ++ * pointer to its routine and it will be entered as the first thing ++ * when a trap occurs. ++ * ++ * Return values are, at present, undefined. ++ * ++ * The debug hook routine does not necessarily return to its caller. ++ * It has the register image and thus may choose to resume execution ++ * anywhere it pleases. ++ */ ++struct pt_regs; ++ ++extern int kgdb_handle_exception(int trapno, ++ int signo, int err_code, struct pt_regs *regs); ++extern int in_kgdb(struct pt_regs *regs); ++ ++#ifdef CONFIG_KGDB_TS ++void kgdb_tstamp(int line, char *source, int data0, int data1); ++/* ++ * This is the time stamp function. The macro adds the source info and ++ * does a cast on the data to allow most any 32-bit value. ++ */ ++ ++#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) ++#else ++#define kgdb_ts(data0,data1) ++#endif ++#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ ++#ifndef BREAKPOINT ++#define BREAKPOINT ++#endif ++#define kgdb_ts(data0,data1) ++#define in_kgdb ++#define kgdb_handle_exception ++#define breakpoint ++#define INIT_KGDB_INTS ++#endif ++#endif /* __KGDB */ +Index: linux-2.6.10/include/asm-i386/bugs.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/bugs.h 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/bugs.h 2005-04-05 12:48:05.398596368 +0800 +@@ -1,11 +1,11 @@ + /* + * include/asm-i386/bugs.h + * +- * Copyright (C) 1994 Linus Torvalds ++ * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), +- * ++ * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). 
+ *
+@@ -25,7 +25,20 @@
+ #include
+ #include
+ #include
+-
++#ifdef CONFIG_KGDB
++/*
++ * Provide the command line "gdb" initial break
++ */
++int __init kgdb_initial_break(char * str)
++{
++	if (*str == '\0'){
++		breakpoint();
++		return 1;
++	}
++	return 0;
++}
++__setup("gdb",kgdb_initial_break);
++#endif
+ static int __init no_halt(char *s)
+ {
+ 	boot_cpu_data.hlt_works_ok = 0;
+@@ -140,7 +153,7 @@
+ 		: "ecx", "edi" );
+ 	/* If this fails, it means that any user program may lock the CPU hard. Too bad. */
+ 	if (res != 12345678) printk( "Buggy.\n" );
+-	else printk( "OK.\n" );
++	else printk( "OK.\n" );
+ #endif
+ }
+ 
+Index: linux-2.6.10/include/linux/serial_core.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/serial_core.h	2004-12-25 05:34:00.000000000 +0800
++++ linux-2.6.10/include/linux/serial_core.h	2005-04-05 12:48:05.367601080 +0800
+@@ -184,7 +184,6 @@
+ 	unsigned char	x_char;			/* xon/xoff char */
+ 	unsigned char	regshift;		/* reg offset shift */
+ 	unsigned char	iotype;			/* io access style */
+-
+ #define UPIO_PORT	(0)
+ #define UPIO_HUB6	(1)
+ #define UPIO_MEM	(2)
+Index: linux-2.6.10/include/linux/dwarf2.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dwarf2.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/linux/dwarf2.h	2005-04-05 12:48:05.369600776 +0800
+@@ -0,0 +1,738 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++   debugging information format.
++   Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002
++   Free Software Foundation, Inc.
++
++   Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
++   Office (AJPO), Florida State Unviversity and Silicon Graphics Inc.
++   provided support for this effort -- June 21, 1995.
++
++   Derived from the DWARF 1 implementation written by Ron Guilmette
++   (rfg@netcom.com), November 1990.
++
++   This file is part of GCC.
++
++   GCC is free software; you can redistribute it and/or modify it under
++   the terms of the GNU General Public License as published by the Free
++   Software Foundation; either version 2, or (at your option) any later
++   version.
++
++   GCC is distributed in the hope that it will be useful, but WITHOUT
++   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
++   License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with GCC; see the file COPYING.  If not, write to the Free
++   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++   02111-1307, USA.  */
++
++/* This file is derived from the DWARF specification (a public document)
++   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++   Programming Languages Special Interest Group (UI/PLSIG) and distributed
++   by UNIX International.  Copies of this specification are available from
++   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++   This file also now contains definitions from the DWARF 3 specification.  */
++
++/* This file is shared between GCC and GDB, and should not contain
++   prototypes.  */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.
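++   Each DWARF2_External_* struct below mirrors the raw on-disk byte
++   encoding as arrays of unsigned char; the matching DWARF2_Internal_*
++   struct carries the same fields decoded into host types.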
*/ ++#ifndef __ASSEMBLY__ ++typedef struct ++{ ++ unsigned char li_length [4]; ++ unsigned char li_version [2]; ++ unsigned char li_prologue_length [4]; ++ unsigned char li_min_insn_length [1]; ++ unsigned char li_default_is_stmt [1]; ++ unsigned char li_line_base [1]; ++ unsigned char li_line_range [1]; ++ unsigned char li_opcode_base [1]; ++} ++DWARF2_External_LineInfo; ++ ++typedef struct ++{ ++ unsigned long li_length; ++ unsigned short li_version; ++ unsigned int li_prologue_length; ++ unsigned char li_min_insn_length; ++ unsigned char li_default_is_stmt; ++ int li_line_base; ++ unsigned char li_line_range; ++ unsigned char li_opcode_base; ++} ++DWARF2_Internal_LineInfo; ++ ++/* Structure found in .debug_pubnames section. */ ++typedef struct ++{ ++ unsigned char pn_length [4]; ++ unsigned char pn_version [2]; ++ unsigned char pn_offset [4]; ++ unsigned char pn_size [4]; ++} ++DWARF2_External_PubNames; ++ ++typedef struct ++{ ++ unsigned long pn_length; ++ unsigned short pn_version; ++ unsigned long pn_offset; ++ unsigned long pn_size; ++} ++DWARF2_Internal_PubNames; ++ ++/* Structure found in .debug_info section. */ ++typedef struct ++{ ++ unsigned char cu_length [4]; ++ unsigned char cu_version [2]; ++ unsigned char cu_abbrev_offset [4]; ++ unsigned char cu_pointer_size [1]; ++} ++DWARF2_External_CompUnit; ++ ++typedef struct ++{ ++ unsigned long cu_length; ++ unsigned short cu_version; ++ unsigned long cu_abbrev_offset; ++ unsigned char cu_pointer_size; ++} ++DWARF2_Internal_CompUnit; ++ ++typedef struct ++{ ++ unsigned char ar_length [4]; ++ unsigned char ar_version [2]; ++ unsigned char ar_info_offset [4]; ++ unsigned char ar_pointer_size [1]; ++ unsigned char ar_segment_size [1]; ++} ++DWARF2_External_ARange; ++ ++typedef struct ++{ ++ unsigned long ar_length; ++ unsigned short ar_version; ++ unsigned long ar_info_offset; ++ unsigned char ar_pointer_size; ++ unsigned char ar_segment_size; ++} ++DWARF2_Internal_ARange; ++ ++#define ENUM(name) enum name { ++#define IF_NOT_ASM(a) a ++#define COMMA , ++#else ++#define ENUM(name) ++#define IF_NOT_ASM(a) ++#define COMMA ++ ++#endif ++ ++/* Tag names and codes. 
*/ ++ENUM(dwarf_tag) ++ ++ DW_TAG_padding = 0x00 COMMA ++ DW_TAG_array_type = 0x01 COMMA ++ DW_TAG_class_type = 0x02 COMMA ++ DW_TAG_entry_point = 0x03 COMMA ++ DW_TAG_enumeration_type = 0x04 COMMA ++ DW_TAG_formal_parameter = 0x05 COMMA ++ DW_TAG_imported_declaration = 0x08 COMMA ++ DW_TAG_label = 0x0a COMMA ++ DW_TAG_lexical_block = 0x0b COMMA ++ DW_TAG_member = 0x0d COMMA ++ DW_TAG_pointer_type = 0x0f COMMA ++ DW_TAG_reference_type = 0x10 COMMA ++ DW_TAG_compile_unit = 0x11 COMMA ++ DW_TAG_string_type = 0x12 COMMA ++ DW_TAG_structure_type = 0x13 COMMA ++ DW_TAG_subroutine_type = 0x15 COMMA ++ DW_TAG_typedef = 0x16 COMMA ++ DW_TAG_union_type = 0x17 COMMA ++ DW_TAG_unspecified_parameters = 0x18 COMMA ++ DW_TAG_variant = 0x19 COMMA ++ DW_TAG_common_block = 0x1a COMMA ++ DW_TAG_common_inclusion = 0x1b COMMA ++ DW_TAG_inheritance = 0x1c COMMA ++ DW_TAG_inlined_subroutine = 0x1d COMMA ++ DW_TAG_module = 0x1e COMMA ++ DW_TAG_ptr_to_member_type = 0x1f COMMA ++ DW_TAG_set_type = 0x20 COMMA ++ DW_TAG_subrange_type = 0x21 COMMA ++ DW_TAG_with_stmt = 0x22 COMMA ++ DW_TAG_access_declaration = 0x23 COMMA ++ DW_TAG_base_type = 0x24 COMMA ++ DW_TAG_catch_block = 0x25 COMMA ++ DW_TAG_const_type = 0x26 COMMA ++ DW_TAG_constant = 0x27 COMMA ++ DW_TAG_enumerator = 0x28 COMMA ++ DW_TAG_file_type = 0x29 COMMA ++ DW_TAG_friend = 0x2a COMMA ++ DW_TAG_namelist = 0x2b COMMA ++ DW_TAG_namelist_item = 0x2c COMMA ++ DW_TAG_packed_type = 0x2d COMMA ++ DW_TAG_subprogram = 0x2e COMMA ++ DW_TAG_template_type_param = 0x2f COMMA ++ DW_TAG_template_value_param = 0x30 COMMA ++ DW_TAG_thrown_type = 0x31 COMMA ++ DW_TAG_try_block = 0x32 COMMA ++ DW_TAG_variant_part = 0x33 COMMA ++ DW_TAG_variable = 0x34 COMMA ++ DW_TAG_volatile_type = 0x35 COMMA ++ /* DWARF 3. */ ++ DW_TAG_dwarf_procedure = 0x36 COMMA ++ DW_TAG_restrict_type = 0x37 COMMA ++ DW_TAG_interface_type = 0x38 COMMA ++ DW_TAG_namespace = 0x39 COMMA ++ DW_TAG_imported_module = 0x3a COMMA ++ DW_TAG_unspecified_type = 0x3b COMMA ++ DW_TAG_partial_unit = 0x3c COMMA ++ DW_TAG_imported_unit = 0x3d COMMA ++ /* SGI/MIPS Extensions. */ ++ DW_TAG_MIPS_loop = 0x4081 COMMA ++ /* GNU extensions. */ ++ DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ ++ DW_TAG_function_template = 0x4102 COMMA /* For C++. */ ++ DW_TAG_class_template = 0x4103 COMMA /* For C++. */ ++ DW_TAG_GNU_BINCL = 0x4104 COMMA ++ DW_TAG_GNU_EINCL = 0x4105 COMMA ++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ ++ DW_TAG_upc_shared_type = 0x8765 COMMA ++ DW_TAG_upc_strict_type = 0x8766 COMMA ++ DW_TAG_upc_relaxed_type = 0x8767 ++IF_NOT_ASM(};) ++ ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++ ++/* Flag that tells whether entry has a child or not. */ ++#define DW_children_no 0 ++#define DW_children_yes 1 ++ ++/* Form names and codes. */ ++ENUM(dwarf_form) ++ ++ DW_FORM_addr = 0x01 COMMA ++ DW_FORM_block2 = 0x03 COMMA ++ DW_FORM_block4 = 0x04 COMMA ++ DW_FORM_data2 = 0x05 COMMA ++ DW_FORM_data4 = 0x06 COMMA ++ DW_FORM_data8 = 0x07 COMMA ++ DW_FORM_string = 0x08 COMMA ++ DW_FORM_block = 0x09 COMMA ++ DW_FORM_block1 = 0x0a COMMA ++ DW_FORM_data1 = 0x0b COMMA ++ DW_FORM_flag = 0x0c COMMA ++ DW_FORM_sdata = 0x0d COMMA ++ DW_FORM_strp = 0x0e COMMA ++ DW_FORM_udata = 0x0f COMMA ++ DW_FORM_ref_addr = 0x10 COMMA ++ DW_FORM_ref1 = 0x11 COMMA ++ DW_FORM_ref2 = 0x12 COMMA ++ DW_FORM_ref4 = 0x13 COMMA ++ DW_FORM_ref8 = 0x14 COMMA ++ DW_FORM_ref_udata = 0x15 COMMA ++ DW_FORM_indirect = 0x16 ++IF_NOT_ASM(};) ++ ++/* Attribute names and codes. 
*/ ++ ++ENUM(dwarf_attribute) ++ ++ DW_AT_sibling = 0x01 COMMA ++ DW_AT_location = 0x02 COMMA ++ DW_AT_name = 0x03 COMMA ++ DW_AT_ordering = 0x09 COMMA ++ DW_AT_subscr_data = 0x0a COMMA ++ DW_AT_byte_size = 0x0b COMMA ++ DW_AT_bit_offset = 0x0c COMMA ++ DW_AT_bit_size = 0x0d COMMA ++ DW_AT_element_list = 0x0f COMMA ++ DW_AT_stmt_list = 0x10 COMMA ++ DW_AT_low_pc = 0x11 COMMA ++ DW_AT_high_pc = 0x12 COMMA ++ DW_AT_language = 0x13 COMMA ++ DW_AT_member = 0x14 COMMA ++ DW_AT_discr = 0x15 COMMA ++ DW_AT_discr_value = 0x16 COMMA ++ DW_AT_visibility = 0x17 COMMA ++ DW_AT_import = 0x18 COMMA ++ DW_AT_string_length = 0x19 COMMA ++ DW_AT_common_reference = 0x1a COMMA ++ DW_AT_comp_dir = 0x1b COMMA ++ DW_AT_const_value = 0x1c COMMA ++ DW_AT_containing_type = 0x1d COMMA ++ DW_AT_default_value = 0x1e COMMA ++ DW_AT_inline = 0x20 COMMA ++ DW_AT_is_optional = 0x21 COMMA ++ DW_AT_lower_bound = 0x22 COMMA ++ DW_AT_producer = 0x25 COMMA ++ DW_AT_prototyped = 0x27 COMMA ++ DW_AT_return_addr = 0x2a COMMA ++ DW_AT_start_scope = 0x2c COMMA ++ DW_AT_stride_size = 0x2e COMMA ++ DW_AT_upper_bound = 0x2f COMMA ++ DW_AT_abstract_origin = 0x31 COMMA ++ DW_AT_accessibility = 0x32 COMMA ++ DW_AT_address_class = 0x33 COMMA ++ DW_AT_artificial = 0x34 COMMA ++ DW_AT_base_types = 0x35 COMMA ++ DW_AT_calling_convention = 0x36 COMMA ++ DW_AT_count = 0x37 COMMA ++ DW_AT_data_member_location = 0x38 COMMA ++ DW_AT_decl_column = 0x39 COMMA ++ DW_AT_decl_file = 0x3a COMMA ++ DW_AT_decl_line = 0x3b COMMA ++ DW_AT_declaration = 0x3c COMMA ++ DW_AT_discr_list = 0x3d COMMA ++ DW_AT_encoding = 0x3e COMMA ++ DW_AT_external = 0x3f COMMA ++ DW_AT_frame_base = 0x40 COMMA ++ DW_AT_friend = 0x41 COMMA ++ DW_AT_identifier_case = 0x42 COMMA ++ DW_AT_macro_info = 0x43 COMMA ++ DW_AT_namelist_items = 0x44 COMMA ++ DW_AT_priority = 0x45 COMMA ++ DW_AT_segment = 0x46 COMMA ++ DW_AT_specification = 0x47 COMMA ++ DW_AT_static_link = 0x48 COMMA ++ DW_AT_type = 0x49 COMMA ++ DW_AT_use_location = 0x4a COMMA ++ DW_AT_variable_parameter = 0x4b COMMA ++ DW_AT_virtuality = 0x4c COMMA ++ DW_AT_vtable_elem_location = 0x4d COMMA ++ /* DWARF 3 values. */ ++ DW_AT_allocated = 0x4e COMMA ++ DW_AT_associated = 0x4f COMMA ++ DW_AT_data_location = 0x50 COMMA ++ DW_AT_stride = 0x51 COMMA ++ DW_AT_entry_pc = 0x52 COMMA ++ DW_AT_use_UTF8 = 0x53 COMMA ++ DW_AT_extension = 0x54 COMMA ++ DW_AT_ranges = 0x55 COMMA ++ DW_AT_trampoline = 0x56 COMMA ++ DW_AT_call_column = 0x57 COMMA ++ DW_AT_call_file = 0x58 COMMA ++ DW_AT_call_line = 0x59 COMMA ++ /* SGI/MIPS extensions. */ ++ DW_AT_MIPS_fde = 0x2001 COMMA ++ DW_AT_MIPS_loop_begin = 0x2002 COMMA ++ DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA ++ DW_AT_MIPS_epilog_begin = 0x2004 COMMA ++ DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA ++ DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA ++ DW_AT_MIPS_linkage_name = 0x2007 COMMA ++ DW_AT_MIPS_stride = 0x2008 COMMA ++ DW_AT_MIPS_abstract_name = 0x2009 COMMA ++ DW_AT_MIPS_clone_origin = 0x200a COMMA ++ DW_AT_MIPS_has_inlines = 0x200b COMMA ++ /* GNU extensions. */ ++ DW_AT_sf_names = 0x2101 COMMA ++ DW_AT_src_info = 0x2102 COMMA ++ DW_AT_mac_info = 0x2103 COMMA ++ DW_AT_src_coords = 0x2104 COMMA ++ DW_AT_body_begin = 0x2105 COMMA ++ DW_AT_body_end = 0x2106 COMMA ++ DW_AT_GNU_vector = 0x2107 COMMA ++ /* VMS extensions. */ ++ DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA ++ /* UPC extension. */ ++ DW_AT_upc_threads_scaled = 0x3210 ++IF_NOT_ASM(};) ++ ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. 
*/ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++ ++/* Location atom names and codes. */ ++ENUM(dwarf_location_atom) ++ ++ DW_OP_addr = 0x03 COMMA ++ DW_OP_deref = 0x06 COMMA ++ DW_OP_const1u = 0x08 COMMA ++ DW_OP_const1s = 0x09 COMMA ++ DW_OP_const2u = 0x0a COMMA ++ DW_OP_const2s = 0x0b COMMA ++ DW_OP_const4u = 0x0c COMMA ++ DW_OP_const4s = 0x0d COMMA ++ DW_OP_const8u = 0x0e COMMA ++ DW_OP_const8s = 0x0f COMMA ++ DW_OP_constu = 0x10 COMMA ++ DW_OP_consts = 0x11 COMMA ++ DW_OP_dup = 0x12 COMMA ++ DW_OP_drop = 0x13 COMMA ++ DW_OP_over = 0x14 COMMA ++ DW_OP_pick = 0x15 COMMA ++ DW_OP_swap = 0x16 COMMA ++ DW_OP_rot = 0x17 COMMA ++ DW_OP_xderef = 0x18 COMMA ++ DW_OP_abs = 0x19 COMMA ++ DW_OP_and = 0x1a COMMA ++ DW_OP_div = 0x1b COMMA ++ DW_OP_minus = 0x1c COMMA ++ DW_OP_mod = 0x1d COMMA ++ DW_OP_mul = 0x1e COMMA ++ DW_OP_neg = 0x1f COMMA ++ DW_OP_not = 0x20 COMMA ++ DW_OP_or = 0x21 COMMA ++ DW_OP_plus = 0x22 COMMA ++ DW_OP_plus_uconst = 0x23 COMMA ++ DW_OP_shl = 0x24 COMMA ++ DW_OP_shr = 0x25 COMMA ++ DW_OP_shra = 0x26 COMMA ++ DW_OP_xor = 0x27 COMMA ++ DW_OP_bra = 0x28 COMMA ++ DW_OP_eq = 0x29 COMMA ++ DW_OP_ge = 0x2a COMMA ++ DW_OP_gt = 0x2b COMMA ++ DW_OP_le = 0x2c COMMA ++ DW_OP_lt = 0x2d COMMA ++ DW_OP_ne = 0x2e COMMA ++ DW_OP_skip = 0x2f COMMA ++ DW_OP_lit0 = 0x30 COMMA ++ DW_OP_lit1 = 0x31 COMMA ++ DW_OP_lit2 = 0x32 COMMA ++ DW_OP_lit3 = 0x33 COMMA ++ DW_OP_lit4 = 0x34 COMMA ++ DW_OP_lit5 = 0x35 COMMA ++ DW_OP_lit6 = 0x36 COMMA ++ DW_OP_lit7 = 0x37 COMMA ++ DW_OP_lit8 = 0x38 COMMA ++ DW_OP_lit9 = 0x39 COMMA ++ DW_OP_lit10 = 0x3a COMMA ++ DW_OP_lit11 = 0x3b COMMA ++ DW_OP_lit12 = 0x3c COMMA ++ DW_OP_lit13 = 0x3d COMMA ++ DW_OP_lit14 = 0x3e COMMA ++ DW_OP_lit15 = 0x3f COMMA ++ DW_OP_lit16 = 0x40 COMMA ++ DW_OP_lit17 = 0x41 COMMA ++ DW_OP_lit18 = 0x42 COMMA ++ DW_OP_lit19 = 0x43 COMMA ++ DW_OP_lit20 = 0x44 COMMA ++ DW_OP_lit21 = 0x45 COMMA ++ DW_OP_lit22 = 0x46 COMMA ++ DW_OP_lit23 = 0x47 COMMA ++ DW_OP_lit24 = 0x48 COMMA ++ DW_OP_lit25 = 0x49 COMMA ++ DW_OP_lit26 = 0x4a COMMA ++ DW_OP_lit27 = 0x4b COMMA ++ DW_OP_lit28 = 0x4c COMMA ++ DW_OP_lit29 = 0x4d COMMA ++ DW_OP_lit30 = 0x4e COMMA ++ DW_OP_lit31 = 0x4f COMMA ++ DW_OP_reg0 = 0x50 COMMA ++ DW_OP_reg1 = 0x51 COMMA ++ DW_OP_reg2 = 0x52 COMMA ++ DW_OP_reg3 = 0x53 COMMA ++ DW_OP_reg4 = 0x54 COMMA ++ DW_OP_reg5 = 0x55 COMMA ++ DW_OP_reg6 = 0x56 COMMA ++ DW_OP_reg7 = 0x57 COMMA ++ DW_OP_reg8 = 0x58 COMMA ++ DW_OP_reg9 = 0x59 COMMA ++ DW_OP_reg10 = 0x5a COMMA ++ DW_OP_reg11 = 0x5b COMMA ++ DW_OP_reg12 = 0x5c COMMA ++ DW_OP_reg13 = 0x5d COMMA ++ DW_OP_reg14 = 0x5e COMMA ++ DW_OP_reg15 = 0x5f COMMA ++ DW_OP_reg16 = 0x60 COMMA ++ DW_OP_reg17 = 0x61 COMMA ++ DW_OP_reg18 = 0x62 COMMA ++ DW_OP_reg19 = 0x63 COMMA ++ DW_OP_reg20 = 0x64 COMMA ++ DW_OP_reg21 = 0x65 COMMA ++ DW_OP_reg22 = 0x66 COMMA ++ DW_OP_reg23 = 0x67 COMMA ++ DW_OP_reg24 = 0x68 COMMA ++ DW_OP_reg25 = 0x69 COMMA ++ DW_OP_reg26 = 0x6a COMMA ++ DW_OP_reg27 = 0x6b COMMA ++ DW_OP_reg28 = 0x6c COMMA ++ DW_OP_reg29 = 0x6d COMMA ++ DW_OP_reg30 = 0x6e COMMA ++ DW_OP_reg31 = 0x6f COMMA ++ DW_OP_breg0 = 0x70 COMMA ++ DW_OP_breg1 = 0x71 COMMA ++ DW_OP_breg2 = 0x72 COMMA ++ DW_OP_breg3 = 0x73 COMMA ++ DW_OP_breg4 = 0x74 COMMA ++ DW_OP_breg5 = 0x75 COMMA ++ DW_OP_breg6 = 0x76 COMMA ++ DW_OP_breg7 = 0x77 COMMA ++ DW_OP_breg8 = 0x78 COMMA ++ DW_OP_breg9 = 0x79 COMMA ++ DW_OP_breg10 = 0x7a COMMA ++ DW_OP_breg11 = 0x7b COMMA ++ DW_OP_breg12 = 0x7c COMMA ++ DW_OP_breg13 = 0x7d COMMA ++ DW_OP_breg14 = 0x7e COMMA ++ DW_OP_breg15 = 0x7f COMMA ++ DW_OP_breg16 = 0x80 COMMA ++ 
DW_OP_breg17 = 0x81 COMMA ++ DW_OP_breg18 = 0x82 COMMA ++ DW_OP_breg19 = 0x83 COMMA ++ DW_OP_breg20 = 0x84 COMMA ++ DW_OP_breg21 = 0x85 COMMA ++ DW_OP_breg22 = 0x86 COMMA ++ DW_OP_breg23 = 0x87 COMMA ++ DW_OP_breg24 = 0x88 COMMA ++ DW_OP_breg25 = 0x89 COMMA ++ DW_OP_breg26 = 0x8a COMMA ++ DW_OP_breg27 = 0x8b COMMA ++ DW_OP_breg28 = 0x8c COMMA ++ DW_OP_breg29 = 0x8d COMMA ++ DW_OP_breg30 = 0x8e COMMA ++ DW_OP_breg31 = 0x8f COMMA ++ DW_OP_regx = 0x90 COMMA ++ DW_OP_fbreg = 0x91 COMMA ++ DW_OP_bregx = 0x92 COMMA ++ DW_OP_piece = 0x93 COMMA ++ DW_OP_deref_size = 0x94 COMMA ++ DW_OP_xderef_size = 0x95 COMMA ++ DW_OP_nop = 0x96 COMMA ++ /* DWARF 3 extensions. */ ++ DW_OP_push_object_address = 0x97 COMMA ++ DW_OP_call2 = 0x98 COMMA ++ DW_OP_call4 = 0x99 COMMA ++ DW_OP_call_ref = 0x9a COMMA ++ /* GNU extensions. */ ++ DW_OP_GNU_push_tls_address = 0xe0 ++IF_NOT_ASM(};) ++ ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ ++ ++/* Type encodings. */ ++ENUM(dwarf_type) ++ ++ DW_ATE_void = 0x0 COMMA ++ DW_ATE_address = 0x1 COMMA ++ DW_ATE_boolean = 0x2 COMMA ++ DW_ATE_complex_float = 0x3 COMMA ++ DW_ATE_float = 0x4 COMMA ++ DW_ATE_signed = 0x5 COMMA ++ DW_ATE_signed_char = 0x6 COMMA ++ DW_ATE_unsigned = 0x7 COMMA ++ DW_ATE_unsigned_char = 0x8 COMMA ++ /* DWARF 3. */ ++ DW_ATE_imaginary_float = 0x9 ++IF_NOT_ASM(};) ++ ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++ ++/* Array ordering names and codes. */ ++ENUM(dwarf_array_dim_ordering) ++ ++ DW_ORD_row_major = 0 COMMA ++ DW_ORD_col_major = 1 ++IF_NOT_ASM(};) ++ ++/* Access attribute. */ ++ENUM(dwarf_access_attribute) ++ ++ DW_ACCESS_public = 1 COMMA ++ DW_ACCESS_protected = 2 COMMA ++ DW_ACCESS_private = 3 ++IF_NOT_ASM(};) ++ ++/* Visibility. */ ++ENUM(dwarf_visibility_attribute) ++ ++ DW_VIS_local = 1 COMMA ++ DW_VIS_exported = 2 COMMA ++ DW_VIS_qualified = 3 ++IF_NOT_ASM(};) ++ ++/* Virtuality. */ ++ENUM(dwarf_virtuality_attribute) ++ ++ DW_VIRTUALITY_none = 0 COMMA ++ DW_VIRTUALITY_virtual = 1 COMMA ++ DW_VIRTUALITY_pure_virtual = 2 ++IF_NOT_ASM(};) ++ ++/* Case sensitivity. */ ++ENUM(dwarf_id_case) ++ ++ DW_ID_case_sensitive = 0 COMMA ++ DW_ID_up_case = 1 COMMA ++ DW_ID_down_case = 2 COMMA ++ DW_ID_case_insensitive = 3 ++IF_NOT_ASM(};) ++ ++/* Calling convention. */ ++ENUM(dwarf_calling_convention) ++ ++ DW_CC_normal = 0x1 COMMA ++ DW_CC_program = 0x2 COMMA ++ DW_CC_nocall = 0x3 ++IF_NOT_ASM(};) ++ ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++ ++/* Inline attribute. */ ++ENUM(dwarf_inline_attribute) ++ ++ DW_INL_not_inlined = 0 COMMA ++ DW_INL_inlined = 1 COMMA ++ DW_INL_declared_not_inlined = 2 COMMA ++ DW_INL_declared_inlined = 3 ++IF_NOT_ASM(};) ++ ++/* Discriminant lists. */ ++ENUM(dwarf_discrim_list) ++ ++ DW_DSC_label = 0 COMMA ++ DW_DSC_range = 1 ++IF_NOT_ASM(};) ++ ++/* Line number opcodes. */ ++ENUM(dwarf_line_number_ops) ++ ++ DW_LNS_extended_op = 0 COMMA ++ DW_LNS_copy = 1 COMMA ++ DW_LNS_advance_pc = 2 COMMA ++ DW_LNS_advance_line = 3 COMMA ++ DW_LNS_set_file = 4 COMMA ++ DW_LNS_set_column = 5 COMMA ++ DW_LNS_negate_stmt = 6 COMMA ++ DW_LNS_set_basic_block = 7 COMMA ++ DW_LNS_const_add_pc = 8 COMMA ++ DW_LNS_fixed_advance_pc = 9 COMMA ++ /* DWARF 3. */ ++ DW_LNS_set_prologue_end = 10 COMMA ++ DW_LNS_set_epilogue_begin = 11 COMMA ++ DW_LNS_set_isa = 12 ++IF_NOT_ASM(};) ++ ++/* Line number extended opcodes. 
*/ ++ENUM(dwarf_line_number_x_ops) ++ ++ DW_LNE_end_sequence = 1 COMMA ++ DW_LNE_set_address = 2 COMMA ++ DW_LNE_define_file = 3 ++IF_NOT_ASM(};) ++ ++/* Call frame information. */ ++ENUM(dwarf_call_frame_info) ++ ++ DW_CFA_advance_loc = 0x40 COMMA ++ DW_CFA_offset = 0x80 COMMA ++ DW_CFA_restore = 0xc0 COMMA ++ DW_CFA_nop = 0x00 COMMA ++ DW_CFA_set_loc = 0x01 COMMA ++ DW_CFA_advance_loc1 = 0x02 COMMA ++ DW_CFA_advance_loc2 = 0x03 COMMA ++ DW_CFA_advance_loc4 = 0x04 COMMA ++ DW_CFA_offset_extended = 0x05 COMMA ++ DW_CFA_restore_extended = 0x06 COMMA ++ DW_CFA_undefined = 0x07 COMMA ++ DW_CFA_same_value = 0x08 COMMA ++ DW_CFA_register = 0x09 COMMA ++ DW_CFA_remember_state = 0x0a COMMA ++ DW_CFA_restore_state = 0x0b COMMA ++ DW_CFA_def_cfa = 0x0c COMMA ++ DW_CFA_def_cfa_register = 0x0d COMMA ++ DW_CFA_def_cfa_offset = 0x0e COMMA ++ ++ /* DWARF 3. */ ++ DW_CFA_def_cfa_expression = 0x0f COMMA ++ DW_CFA_expression = 0x10 COMMA ++ DW_CFA_offset_extended_sf = 0x11 COMMA ++ DW_CFA_def_cfa_sf = 0x12 COMMA ++ DW_CFA_def_cfa_offset_sf = 0x13 COMMA ++ ++ /* SGI/MIPS specific. */ ++ DW_CFA_MIPS_advance_loc8 = 0x1d COMMA ++ ++ /* GNU extensions. */ ++ DW_CFA_GNU_window_save = 0x2d COMMA ++ DW_CFA_GNU_args_size = 0x2e COMMA ++ DW_CFA_GNU_negative_offset_extended = 0x2f ++IF_NOT_ASM(};) ++ ++#define DW_CIE_ID 0xffffffff ++#define DW_CIE_VERSION 1 ++ ++#define DW_CFA_extended 0 ++#define DW_CFA_lo_user 0x1c ++#define DW_CFA_hi_user 0x3f ++ ++#define DW_CHILDREN_no 0x00 ++#define DW_CHILDREN_yes 0x01 ++ ++#define DW_ADDR_none 0 ++ ++/* Source language names and codes. */ ++ENUM(dwarf_source_language) ++ ++ DW_LANG_C89 = 0x0001 COMMA ++ DW_LANG_C = 0x0002 COMMA ++ DW_LANG_Ada83 = 0x0003 COMMA ++ DW_LANG_C_plus_plus = 0x0004 COMMA ++ DW_LANG_Cobol74 = 0x0005 COMMA ++ DW_LANG_Cobol85 = 0x0006 COMMA ++ DW_LANG_Fortran77 = 0x0007 COMMA ++ DW_LANG_Fortran90 = 0x0008 COMMA ++ DW_LANG_Pascal83 = 0x0009 COMMA ++ DW_LANG_Modula2 = 0x000a COMMA ++ DW_LANG_Java = 0x000b COMMA ++ /* DWARF 3. */ ++ DW_LANG_C99 = 0x000c COMMA ++ DW_LANG_Ada95 = 0x000d COMMA ++ DW_LANG_Fortran95 = 0x000e COMMA ++ /* MIPS. */ ++ DW_LANG_Mips_Assembler = 0x8001 COMMA ++ /* UPC. */ ++ DW_LANG_Upc = 0x8765 ++IF_NOT_ASM(};) ++ ++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ ++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */ ++ ++/* Names and codes for macro information. */ ++ENUM(dwarf_macinfo_record_type) ++ ++ DW_MACINFO_define = 1 COMMA ++ DW_MACINFO_undef = 2 COMMA ++ DW_MACINFO_start_file = 3 COMMA ++ DW_MACINFO_end_file = 4 COMMA ++ DW_MACINFO_vendor_ext = 255 ++IF_NOT_ASM(};) ++ ++/* @@@ For use with GNU frame unwind information.
*/ ++ ++#define DW_EH_PE_absptr 0x00 ++#define DW_EH_PE_omit 0xff ++ ++#define DW_EH_PE_uleb128 0x01 ++#define DW_EH_PE_udata2 0x02 ++#define DW_EH_PE_udata4 0x03 ++#define DW_EH_PE_udata8 0x04 ++#define DW_EH_PE_sleb128 0x09 ++#define DW_EH_PE_sdata2 0x0A ++#define DW_EH_PE_sdata4 0x0B ++#define DW_EH_PE_sdata8 0x0C ++#define DW_EH_PE_signed 0x08 ++ ++#define DW_EH_PE_pcrel 0x10 ++#define DW_EH_PE_textrel 0x20 ++#define DW_EH_PE_datarel 0x30 ++#define DW_EH_PE_funcrel 0x40 ++#define DW_EH_PE_aligned 0x50 ++ ++#define DW_EH_PE_indirect 0x80 ++ ++#endif /* _ELF_DWARF2_H */ +Index: linux-2.6.10/include/linux/spinlock.h +=================================================================== +--- linux-2.6.10.orig/include/linux/spinlock.h 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/include/linux/spinlock.h 2005-04-05 12:48:05.365601384 +0800 +@@ -15,6 +15,12 @@ + + #include <asm/processor.h> /* for cpu relax */ + #include <asm/system.h> ++#ifdef CONFIG_KGDB ++#include ++#define SET_WHO(x, him) (x)->who = him; ++#else ++#define SET_WHO(x, him) ++#endif + + /* + * Must define these before including other files, inline functions need them +@@ -94,6 +100,9 @@ + const char *module; + char *owner; + int oline; ++#ifdef CONFIG_KGDB ++ struct task_struct *who; ++#endif + } spinlock_t; + #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} + +@@ -105,6 +114,7 @@ + (x)->module = __FILE__; \ + (x)->owner = NULL; \ + (x)->oline = 0; \ ++ SET_WHO(x, NULL) \ + } while (0) + + #define CHECK_LOCK(x) \ +@@ -129,6 +139,7 @@ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + } while (0) + + /* without debugging, spin_is_locked on UP always says +@@ -159,6 +170,7 @@ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + 1; \ + }) + +Index: linux-2.6.10/include/linux/config.h +=================================================================== +--- linux-2.6.10.orig/include/linux/config.h 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/include/linux/config.h 2005-04-05 12:48:42.303985896 +0800 +@@ -2,6 +2,10 @@ + #define _LINUX_CONFIG_H + + #include <linux/autoconf.h> ++#if defined(__i386__) && !defined(IN_BOOTLOADER) && defined(CONFIG_KGDB) ++#include <asm/kgdb.h> ++#endif ++ + #if !defined (__KERNEL__) && !defined(__KERNGLUE__) + #error including kernel header in userspace; use the glibc headers instead! + #endif +Index: linux-2.6.10/include/linux/dwarf2-lang.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dwarf2-lang.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dwarf2-lang.h 2005-04-05 12:48:05.370600624 +0800 +@@ -0,0 +1,132 @@ ++#ifndef DWARF2_LANG ++#define DWARF2_LANG ++#include <linux/dwarf2.h> ++ ++/* ++ * This is free software; you can redistribute it and/or modify it under ++ * the terms of the GNU General Public License as published by the Free ++ * Software Foundation; either version 2, or (at your option) any later ++ * version. ++ */ ++/* ++ * This file defines macros that allow generation of DWARF debug records ++ * for asm files. This file is platform independent. Register numbers ++ * (which are about the only thing that is platform dependent) are to be ++ * supplied by a platform defined file. ++ */ ++#define DWARF_preamble() .section .debug_frame,"",@progbits ++/* ++ * This macro starts a debug frame section. The debug_frame describes ++ * where to find the registers that the enclosing function saved on ++ * entry.
++ * ++ * ORD is used by the label generator and should be the same as what is ++ * passed to CFI_postamble. ++ * ++ * pc, pc register gdb ordinal. ++ * ++ * code_align this is the factor used to define locations or regions ++ * where the given definitions apply. If you use labels to define these ++ * this should be 1. ++ * ++ * data_align this is the factor used to define register offsets. If ++ * you use struct offset, this should be the size of the register in ++ * bytes or the negative of that. This is how it is used: you will ++ * define a register as the reference register, say the stack pointer, ++ * then you will say where a register is located relative to this ++ * reference register's value, say 40 for register 3 (the gdb register ++ * number). The <40> will be multiplied by <data_align> to define the ++ * byte offset of the given register (3, in this example). So if your ++ * <40> is the byte offset and the reference register points at the ++ * beginning, you would want 1 for the data_align. If <40> was the 40th ++ * 4-byte element in that structure you would want 4. And if your ++ * reference register points at the end of the structure you would want ++ * a negative data_align value (and you would have to do other math as ++ * well). ++ */ ++ ++#define CFI_preamble(ORD, pc, code_align, data_align) \ ++.section .debug_frame,"",@progbits ; \ ++frame/**/_/**/ORD: \ ++ .long end/**/_/**/ORD-start/**/_/**/ORD; \ ++start/**/_/**/ORD: \ ++ .long DW_CIE_ID; \ ++ .byte DW_CIE_VERSION; \ ++ .byte 0 ; \ ++ .uleb128 code_align; \ ++ .sleb128 data_align; \ ++ .byte pc; ++ ++/* ++ * After the above macro and prior to the CFI_postamble, you need to ++ * define the initial state. This starts with defining the reference ++ * register and, usually, the pc. Here are some helper macros: ++ */ ++ ++#define CFA_define_reference(reg, offset) \ ++ .byte DW_CFA_def_cfa; \ ++ .uleb128 reg; \ ++ .uleb128 (offset); ++ ++#define CFA_define_offset(reg, offset) \ ++ .byte (DW_CFA_offset + reg); \ ++ .uleb128 (offset); ++ ++#define CFI_postamble(ORD) \ ++ .align 4; \ ++end/**/_/**/ORD: ++/* ++ * So now your code pushes stuff on the stack, you need a new location ++ * and the rules for what to do. This starts a running description of ++ * the call frame. You need to describe what changes with respect to ++ * the call registers as the location of the pc moves through the code. ++ * The following builds an FDE (frame descriptor entry?). Like the ++ * above, it has a preamble and a postamble. It also is tied to the CFI ++ * above. ++ * The first entry after the preamble must be the location in the code ++ * that the call frame is being described for. ++ */ ++#define FDE_preamble(ORD, fde_no, initial_address, length) \ ++ .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ ++FDE_start/**/_/**/fde_no: \ ++ .long frame/**/_/**/ORD; \ ++ .long initial_address; \ ++ .long length; ++ ++#define FDE_postamble(fde_no) \ ++ .align 4; \ ++FDE_end/**/_/**/fde_no: ++/* ++ * That done, you can now add registers, subtract registers, move the ++ * reference and even change the reference. You can also define a new ++ * area of code the info applies to. For discontinuous bits you should ++ * start a new FDE. You may have as many as you like. ++ */ ++ ++/* ++ * To advance the address by <bytes> ++ */ ++ ++#define FDE_advance(bytes) \ ++ .byte DW_CFA_advance_loc4; \ ++ .long bytes ++ ++ ++ ++/* ++ * With the above you can define all the register locations. But ++ * suppose the reference register moves... Takes the new offset NOT an ++ * increment.
This is how esp is tracked if it is not saved. ++ */ ++ ++#define CFA_define_cfa_offset(offset) \ ++ .byte DW_CFA_def_cfa_offset; \ ++ .uleb128 (offset); ++/* ++ * Or suppose you want to use a different reference register... ++ */ ++#define CFA_define_cfa_register(reg) \ ++ .byte DW_CFA_def_cfa_register; \ ++ .uleb128 reg; ++ ++#endif +Index: linux-2.6.10/kernel/pid.c +=================================================================== +--- linux-2.6.10.orig/kernel/pid.c 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/kernel/pid.c 2005-04-05 12:48:05.363601688 +0800 +@@ -252,6 +252,9 @@ + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or + * more. + */ ++#ifdef CONFIG_KGDB ++int kgdb_pid_init_done; /* so we don't call prior to... */ ++#endif + void __init pidhash_init(void) + { + int i, j, pidhash_size; +@@ -273,6 +276,9 @@ + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } ++#ifdef CONFIG_KGDB ++ kgdb_pid_init_done++; ++#endif + } + + void __init pidmap_init(void) +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-03-31 15:57:21.000000000 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-04-05 12:48:05.362601840 +0800 +@@ -2991,6 +2991,13 @@ + + EXPORT_SYMBOL(set_user_nice); + ++#ifdef CONFIG_KGDB ++struct task_struct *kgdb_get_idle(int this_cpu) ++{ ++ return cpu_rq(this_cpu)->idle; ++} ++#endif ++ + #ifdef __ARCH_WANT_SYS_NICE + + /* +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit 2005-04-05 12:48:05.263616888 +0800 +@@ -0,0 +1,14 @@ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 ++target remote /dev/ttyS0 ++define si ++stepi ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip ++end ++define ni ++nexti ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip +Index: linux-2.6.10/Documentation/i386/kgdb/kgdb.txt +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/kgdb.txt 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/kgdb.txt 2005-04-05 12:48:05.271615672 +0800 +@@ -0,0 +1,775 @@ ++Last edit: <20030806.1637.12> ++This file has information specific to the i386 kgdb option. Other ++platforms with the kgdb option may behave in a similar fashion. ++ ++New features: ++============ ++20030806.1557.37 ++This version was made against the 2.6.0-test2 kernel. We have made the ++following changes: ++ ++- The getthread() code in the stub calls find_task_by_pid(). It fails ++ if we are early in the bring up such that the pid arrays have yet to ++ be allocated. We have added a line to kernel/pid.c to make ++ "kgdb_pid_init_done" true once the arrays are allocated. This way the ++ getthread() code knows not to call it. This is only used by the thread ++ debugging stuff and threads will not yet exist at this point in the ++ boot. ++ ++- For some reason, gdb was not asking for a new thread list when the ++ "info thread" command was given. We changed to the newer version of ++ the thread info command and gdb now seems to ask when needed.
Result, ++ we now get all threads in the thread list. ++ ++- We now respond to the ThreadExtraInfo request from gdb with the thread ++ name from task_struct.comm. This then appears in the thread list. ++ Thoughts on additional options for this are welcome. Things such as ++ "has BKL" and "Preempted" come to mind. I think we could have a flag ++ word that could enable different bits of info here. ++ ++- We now honor, sort of, the C and S commands. These are continue and ++ single step after delivering a signal. We ignore the signal and do the ++ requested action. This only happens when we told gdb that a signal ++ was the reason for entry, which is only done on memory faults. The ++ result is that you can now continue into the Oops. ++ ++- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, ++ but it is more exact on what language to use. ++ ++- We added two dwarf2 include files and a bit of code at the end of ++ entry.S. This does not yet work, so it is disabled. Still we want to ++ keep track of the code and "maybe" someone out there can fix it. ++ ++- Randy Dunlap sent some fix ups for this file which are now merged. ++ ++- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a ++ compiler warning if CONFIG_KGDB is off (now who would do that :). ++ ++- Andrew Morton sent a fix for the serial driver which is now merged. ++ ++- Andrew also sent a change to the stub around the cpu management code ++ which is also merged. ++ ++- Andrew also sent a patch to make "f" as well as "g" work as SysRq ++ commands to enter kgdb, merged. ++ ++- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a ++ "who" field to the spinlock data struct. This is filled with ++ "current" whenever the spinlock succeeds. Useful if you want to know ++ who has the lock. ++ ++- And last, but not least, we fixed the "get_cu" macro to properly get ++ the current value of "current". ++ ++New features: ++============ ++20030505.1827.27 ++We are starting to align with the sourceforge version, at least in ++commands. To this end, the boot command string to start kgdb at ++boot time has been changed from "kgdb" to "gdb". ++ ++Andrew Morton sent a couple of patches which are now included as follows: ++1.) We now return a flag to the interrupt handler. ++2.) We no longer use smp_num_cpus (a conflict with the lock meter). ++3.) And, from William Lee Irwin III, code to make ++ sure high-mem is set up before we attempt to register our interrupt ++ handler. ++We now include asm/kgdb.h from config.h so you will most likely never ++have to include it. It also 'NULLS' the kgdb macros you might have in ++your code when CONFIG_KGDB is not defined. This allows you to just ++turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. ++This include is conditioned on the machine being an x86 so as to not ++mess with other archs. ++ ++20020801.1129.03 ++This is currently the version for the 2.4.18 (and beyond?) kernel. ++ ++We have several new "features" beginning with this version: ++ ++1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more ++ waiting and it will pull that guy out of an IRQ off spin lock :) ++ ++2.) We doctored up the code that tells where a task is waiting and ++ included it so that the "info thread" command will show a bit more ++ than "schedule()". Try it... ++ ++3.) Added the ability to call a function from gdb. All the standard gdb ++ issues apply, i.e. if you hit a breakpoint in the function, you are ++ not allowed to call another (gdb limitation, not kgdb).
To help ++ this capability we added a memory allocation function. Gdb does not ++ return this memory (it is used for strings that you pass to that function ++ you are calling from gdb) so we fixed up a way to allow you to ++ manually return the memory (see below). ++ ++4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the ++ interrupt flag to now also include the preemption count and the ++ "in_interrupt" info. The flag is now called "with_pif" to indicate ++ the order, preempt_count, in_interrupt, flag. The preempt_count is ++ shifted left by 4 bits so you can read the count in hex by dropping ++ the low order digit. In_interrupt is in bit 1, and the flag is in ++ bit 0. ++ ++5.) The command: "p kgdb_info" is now expanded and prints something ++ like: ++(gdb) p kgdb_info ++$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, ++ errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, ++ cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, ++ regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} ++ ++ Things to note here: a.) used_malloc is the amount of memory that ++ has been malloc'ed to do calls from gdb. You can reclaim this ++ memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) ++ cpus_waiting is now "sized" by the number of CPUs you enter at ++ configure time in the kgdb configure section. This is NOT used ++ anywhere else in the system, but it is "nice" here. c.) The task's ++ "pid" is now in the structure. This is the pid you will need to use ++ to decode to the thread id to get gdb to look at that thread. ++ Remember that the "info thread" command prints a list of threads ++ wherein it numbers each thread with its reference number followed ++ by the thread's pid. Note that the per-CPU idle threads actually ++ have pids of 0 (yes, there is more than one pid 0 in an SMP system). ++ To avoid confusion, kgdb numbers these threads with numbers beyond ++ the MAX_PID. That is why you see 32768 and above. ++ ++6.) A subtle change, we now provide the complete register set for tasks ++ that are active on the other CPUs. This allows better trace back on ++ those tasks. ++ ++ And, let's mention what we could not fix. Back-trace from all but the ++ thread that we trapped will, most likely, have a bogus entry in it. ++ The problem is that gdb does not recognize the entry code for ++ functions that use "current" near (at all?) the entry. The compiler ++ is putting the "current" decode as the first two instructions of the ++ function where gdb expects to find %ebp changing code. Back trace ++ also has trouble with interrupt frames. I am talking with Daniel ++ Jacobowitz about some way to fix this, but don't hold your breath. ++ ++20011220.0050.35 ++Major enhancement with this version is the ability to hold one or more ++CPUs in an SMP system while allowing the others to continue. Also, by ++default only the current CPU is enabled on single-step commands (please ++note that gdb issues single-step commands at times other than when you ++use the si command). ++ ++Another change is to collect some useful information in ++a global structure called "kgdb_info". You should be able to just: ++ ++p kgdb_info ++ ++although I have seen cases where the first time this is done gdb just ++prints the first member but prints the whole structure if you then enter ++CR (carriage return or enter). 
This also works: ++ ++p *&kgdb_info ++ ++Here is a sample: ++(gdb) p kgdb_info ++$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, ++ vector = 3, print_debug_info = 0} ++ ++"Called_from" is the return address from the current entry into kgdb. ++Sometimes it is useful to know why you are in kgdb, for example, was ++it an NMI or a real breakpoint? The simple way to interrogate this ++return address is: ++ ++l *0xc010732c ++ ++which will print the surrounding few lines of source code. ++ ++"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the ++kgdb_ts entries). ++ ++"errcode" and "vector" are other entry parameters which may be helpful on ++some traps. ++ ++"print_debug_info" is the internal debugging kgdb print enable flag. Yes, ++you can modify it. ++ ++In SMP systems kgdb_info also includes the "cpus_waiting" structure and ++"hold_on_sstep": ++ ++(gdb) p kgdb_info ++$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, ++ vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ ++ task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, ++ regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}}} ++ ++"Cpus_waiting" has an entry for each CPU other than the current one that ++has been stopped. Each entry contains the task_struct address for that ++CPU, the address of the regs for that task and a hold flag. All these ++have the proper typing so that, for example: ++ ++p *kgdb_info.cpus_waiting[1].regs ++ ++will print the registers for CPU 1. ++ ++"Hold_on_sstep" is a new feature with this version and comes up set or ++true. What this means is that whenever kgdb is asked to single-step all ++other CPUs are held (i.e. not allowed to execute). The flag applies to ++all but the current CPU and, again, can be changed: ++ ++p kgdb_info.hold_on_sstep=0 ++ ++restores the old behavior of letting all CPUs run during single-stepping. ++ ++Likewise, each CPU has a "hold" flag, which if set, locks that CPU out ++of execution. Note that this has some risk in cases where the CPUs need ++to communicate with each other. If kgdb finds no CPU available on exit, ++it will push a message thru gdb and stay in kgdb. Note that it is legal ++to hold the current CPU as long as at least one CPU can execute. ++ ++20010621.1117.09 ++This version implements an event queue. Events are signaled by calling ++a function in the kgdb stub and may be examined from gdb. See EVENTS ++below for details. This version also tightens up the interrupt and SMP ++handling to not allow interrupts on the way to kgdb from a breakpoint ++trap. It is fine to allow these interrupts for user code, but not ++system debugging. ++ ++Version ++======= ++ ++This version of the kgdb package was developed and tested on ++kernel version 2.4.16. It will not install on any earlier kernels. ++It is possible that it will continue to work on later versions ++of 2.4 and then versions of 2.5 (I hope). ++ ++ ++Debugging Setup ++=============== ++ ++Designate one machine as the "development" machine. This is the ++machine on which you run your compiles and which has your source ++code for the kernel. Designate a second machine as the "target" ++machine. This is the machine that will run your experimental ++kernel.
++ ++The two machines will be connected together via a serial line out ++one or the other of the COM ports of the PC. You will need the ++appropriate modem eliminator (null modem) cable(s) for this. ++ ++Decide on which tty port you want the machines to communicate, then ++connect them up back-to-back using the null modem cable. COM1 is ++/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection ++with the two machines prior to trying to debug a kernel. Once you ++have it working, on the TARGET machine, enter: ++ ++setserial /dev/ttyS0 (or whatever tty you are using) ++ ++and record the port address and the IRQ number. ++ ++On the DEVELOPMENT machine you need to apply the patch for the kgdb ++hooks. You have probably already done that if you are reading this ++file. ++ ++On your DEVELOPMENT machine, go to your kernel source directory and do ++"make Xconfig" where X is one of "x", "menu", or "". If you are ++configuring in the standard serial driver, it must not be a module. ++Either yes or no is ok, but making the serial driver a module means it ++will initialize after kgdb has set up the UART interrupt code and may ++cause a failure of the control-C option discussed below. The configure ++question for the serial driver is under the "Character devices" heading ++and is: ++ ++"Standard/generic (8250/16550 and compatible UARTs) serial support" ++ ++Go down to the kernel debugging menu item and open it up. Enable the ++kernel kgdb stub code by selecting that item. You can also choose to ++turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler ++to put more debug info (like local symbols) in the object file. On the ++i386 -g and -ggdb are the same so this option just reduces to "-O1". The ++-O1 reduces the optimization level. This may be helpful in some cases, ++be aware, however, that this may also mask the problem you are looking ++for. ++ ++The baud rate. Default is 115200. Whatever you choose, be sure that ++the host machine is set to the same speed. I recommend the default. ++ ++The port. This is the I/O address of the serial UART that you should ++have gotten using setserial as described above. The standard COM1 port ++(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ ++3. ++ ++The port IRQ (see above). ++ ++Stack overflow test. This option makes a minor change in the trap, ++system call and interrupt code to detect stack overflow and transfer ++control to kgdb if it happens. (Some platforms have this in the ++baseline code, but the i386 does not.) ++ ++You can also configure the system to recognize the boot option ++"console=kgdb" which if given will cause all console output during ++booting to be put thru gdb as well as other consoles. This option ++requires that gdb and kgdb be connected prior to sending console output ++so, if they are not, a breakpoint is executed to force the connection. ++This will happen before any kernel output (it is going thru gdb, right), ++and will stall the boot until the connection is made. ++ ++You can also configure in a patch to SysRq to enable the kGdb SysRq. ++This request generates a breakpoint. Since the serial port IRQ line is ++set up after any serial drivers, it is possible that this command will ++work when the control-C will not. ++ ++Save and exit the Xconfig program. Then do "make clean", "make dep" ++and "make bzImage" (or whatever target you want to make). This gets the ++kernel compiled with the "-g" option set -- necessary for debugging.
++ ++You have just built the kernel on your DEVELOPMENT machine that you ++intend to run on your TARGET machine. ++ ++To install this new kernel, use the following installation procedure. ++Remember, you are on the DEVELOPMENT machine patching the kernel source ++for the kernel that you intend to run on the TARGET machine. ++ ++Copy this kernel to your target machine using your usual procedures. I ++usually arrange to copy development: ++/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine ++via a LAN based NFS access. That is, I run the cp command on the target ++and copy from the development machine via the LAN. Run Lilo (see "man ++lilo" for details on how to set this up) on the new kernel on the target ++machine so that it will boot! Then boot the kernel on the target ++machine. ++ ++On the DEVELOPMENT machine, create a file called .gdbinit in the ++directory /usr/src/linux. An example .gdbinit file looks like this: ++ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 (or whatever speed you have chosen) ++target remote /dev/ttyS0 ++ ++ ++Change the "echo" and "target" definition so that it specifies the tty ++port that you intend to use. Change the "remotebaud" definition to ++match the data rate that you are going to use for the com line. ++ ++You are now ready to try it out. ++ ++Boot your target machine with "kgdb" in the boot command i.e. something ++like: ++ ++lilo> test kgdb ++ ++or if you also want console output thru gdb: ++ ++lilo> test kgdb console=kgdb ++ ++You should see the lilo message saying it has loaded the kernel and then ++all output stops. The kgdb stub is trying to connect with gdb. Start ++gdb something like this: ++ ++ ++On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". ++When gdb gets the symbols loaded it will read your .gdbinit file and, if ++everything is working correctly, you should see gdb print out a few ++lines indicating that a breakpoint has been taken. It will actually ++show a line of code in the target kernel inside the kgdb activation ++code. ++ ++The gdb interaction should look something like this: ++ ++ linux-dev:/usr/src/linux# gdb vmlinux ++ GDB is free software and you are welcome to distribute copies of it ++ under certain conditions; type "show copying" to see the conditions. ++ There is absolutely no warranty for GDB; type "show warranty" for details. ++ GDB 4.15.1 (i486-slackware-linux), ++ Copyright 1995 Free Software Foundation, Inc... ++ breakpoint () at i386-stub.c:750 ++ 750 } ++ (gdb) ++ ++You can now use whatever gdb commands you like to set breakpoints. ++Enter "continue" to start your target machine executing again. At this ++point the target system will run at full speed until it encounters ++your breakpoint or gets a segment violation in the kernel, or whatever. ++ ++If you have the kgdb console enabled when you continue, gdb will print ++out all the console messages. ++ ++The above example caused a breakpoint relatively early in the boot ++process. For the i386 kgdb it is possible to code a break instruction ++as the first C-language point in init/main.c, i.e. as the first instruction ++in start_kernel(). This could be done as follows: ++ ++#include <asm/kgdb.h> ++ breakpoint(); ++ ++This breakpoint() is really a function that sets up the breakpoint and ++single-step hardware trap cells and then executes a breakpoint. Any ++early hard coded breakpoint will need to use this function.
Once the ++trap cells are set up they need not be set again, but doing it again ++does not hurt anything, so you don't need to be concerned about which ++breakpoint is hit first. Once the trap cells are set up (and the kernel ++sets them up in due course even if breakpoint() is never called) the ++macro: ++ ++BREAKPOINT; ++ ++will generate an inline breakpoint. This may be more useful as it stops ++the processor at the instruction instead of in a function a step removed ++from the location of interest. In either case must be ++included to define both breakpoint() and BREAKPOINT. ++ ++Triggering kgdbstub at other times ++================================== ++ ++Often you don't need to enter the debugger until much later in the boot ++or even after the machine has been running for some time. Once the ++kernel is booted and interrupts are on, you can force the system to ++enter the debugger by sending a control-C to the debug port. This is ++what the first line of the recommended .gdbinit file does. This allows ++you to start gdb any time after the system is up as well as when the ++system is already at a breakpoint. (In the case where the system is ++already at a breakpoint the control-C is not needed, however, it will ++be ignored by the target so no harm is done. Also note the the echo ++command assumes that the port speed is already set. This will be true ++once gdb has connected, but it is best to set the port speed before you ++run gdb.) ++ ++Another simple way to do this is to put the following file in you ~/bin ++directory: ++ ++#!/bin/bash ++echo -e "\003" > /dev/ttyS0 ++ ++Here, the ttyS0 should be replaced with what ever port you are using. ++The "\003" is control-C. Once you are connected with gdb, you can enter ++control-C at the command prompt. ++ ++An alternative way to get control to the debugger is to enable the kGdb ++SysRq command. Then you would enter Alt-SysRq-g (all three keys at the ++same time, but push them down in the order given). To refresh your ++memory of the available SysRq commands try Alt-SysRq-=. Actually any ++undefined command could replace the "=", but I like to KNOW that what I ++am pushing will never be defined. ++ ++Debugging hints ++=============== ++ ++You can break into the target machine at any time from the development ++machine by typing ^C (see above paragraph). If the target machine has ++interrupts enabled this will stop it in the kernel and enter the ++debugger. ++ ++There is unfortunately no way of breaking into the kernel if it is ++in a loop with interrupts disabled, so if this happens to you then ++you need to place exploratory breakpoints or printk's into the kernel ++to find out where it is looping. The exploratory breakpoints can be ++entered either thru gdb or hard coded into the source. This is very ++handy if you do something like: ++ ++if () BREAKPOINT; ++ ++ ++There is a copy of an e-mail in the Documentation/i386/kgdb/ directory ++(debug-nmi.txt) which describes how to create an NMI on an ISA bus ++machine using a paper clip. I have a sophisticated version of this made ++by wiring a push button switch into a PC104/ISA bus adapter card. The ++adapter card nicely furnishes wire wrap pins for all the ISA bus ++signals. ++ ++When you are done debugging the kernel on the target machine it is a ++good idea to leave it in a running state. This makes reboots faster, ++bypassing the fsck. So do a gdb "continue" as the last gdb command if ++this is possible. 
To terminate gdb itself on the development machine ++and leave the target machine running, first clear all breakpoints and ++continue, then type ^Z to suspend gdb and then kill it with "kill %1" or ++something similar. ++ ++If gdbstub Does Not Work ++======================== ++ ++If it doesn't work, you will have to troubleshoot it. Do the easy ++things first like double checking your cabling and data rates. You ++might try some non-kernel based programs to see if the back-to-back ++connection works properly. Just something simple like cat /etc/hosts ++>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you ++if you can send data from one machine to the other. Make sure it works ++in both directions. There is no point in tearing out your hair in the ++kernel if the line doesn't work. ++ ++All of the real action takes place in the file ++/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target ++machine that interacts with gdb on the development machine. In gdb you can ++turn on a debug switch with the following command: ++ ++ set remotedebug ++ ++This will print out the protocol messages that gdb is exchanging with ++the target machine. ++ ++Another place to look is /usr/src/linux/arch/i386/lib/kgdb_serial.c. This is ++the code that talks to the serial port on the target side. There might ++be a problem there. In particular there is a section of this code that ++tests the UART which will tell you what UART you have if you define ++"PRNT" (just remove "_off" from the #define PRNT_off). To view this ++report you will need to boot the system without any breakpoints. This ++allows the kernel to run to the point where it calls kgdb to set up ++interrupts. At this time kgdb will test the UART and print out the type ++it finds. (You need to wait so that the printks are actually being ++printed. Early in the boot they are cached, waiting for the console to ++be enabled. Also, if kgdb is entered thru a breakpoint it is possible ++to cause a deadlock by calling printk when the console is locked. The ++stub thus avoids doing printks from breakpoints, especially in the ++serial code.) At this time, if the UART fails to do the expected thing, ++kgdb will print out (using printk) information on what failed. (These ++messages will be buried in all the other boot up messages. Look for ++lines that start with "gdb_hook_interrupt:". You may want to use dmesg ++once the system is up to view the log.) If this fails or if you still ++don't connect, review your answers for the port address. Use: ++ ++setserial /dev/ttyS0 ++ ++to get the current port and IRQ information. This command will also ++tell you what the system found for the UART type. The stub recognizes ++the following UART types: ++ ++16450, 16550, and 16550A ++ ++If you are really desperate you can use printk debugging in the ++kgdbstub code in the target kernel until you get it working. In particular, ++there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c ++named "remote_debug". Compile your kernel with this set to 1, rather ++than 0, and the debug stub will print out lots of stuff as it does ++what it does. Likewise there are debug printks in the kgdb_serial.c ++code that can be turned on with simple changes in the macro defines.
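++
++The pattern behind these switches is just a global flag gating printk
++calls.  A minimal sketch of the idea (the macro name dbprintk is made
++up for this illustration; kgdb_stub.c may spell it differently):
++
++int remote_debug = 0;	/* set to 1 for verbose stub tracing */
++
++#define dbprintk(args...)	do {		\
++	if (remote_debug)			\
++		printk(args);			\
++} while (0)
++
++/* typical use: dbprintk("kgdb: sent packet '%s'\n", buf); */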
++ ++ ++Debugging Loadable Modules ++========================== ++ ++This technique comes courtesy of Edouard Parmelan ++ ++ ++When you run gdb, enter the command ++ ++source gdbinit-modules ++ ++This will read in a file of gdb macros that was installed in your ++kernel source directory when kgdb was installed. This file implements ++the following commands: ++ ++mod-list ++ Lists the loaded modules in the form <module-address> <module-name> ++ ++mod-print-symbols <module-address> ++ Prints all the symbols in the indicated module. ++ ++mod-add-symbols <module-address> <object-file> ++ Loads the symbols from the object file and associates them ++ with the indicated module. ++ ++After you have loaded the module that you want to debug, use the command ++mod-list to find the <module-address> of your module. Then use that ++address in the mod-add-symbols command to load your module's symbols. ++From that point onward you can debug your module as if it were a part ++of the kernel. ++ ++The file gdbinit-modules also contains a command named mod-add-lis as ++an example of how to construct a command of your own to load your ++favorite module. The idea is to "can" the pathname of the module ++in the command so you don't have to type so much. ++ ++Threads ++======= ++ ++Each process in a target machine is seen as a gdb thread. gdb thread ++related commands (info threads, thread n) can be used. ++ ++ia-32 hardware breakpoints ++========================== ++ ++kgdb stub contains support for hardware breakpoints using debugging features ++of ia-32(x86) processors. These breakpoints do not need code modification. ++They use debugging registers. 4 hardware breakpoints are available in ia-32 ++processors. ++ ++Each hardware breakpoint can be of one of the following three types. ++ ++1. Execution breakpoint - An Execution breakpoint is triggered when code ++ at the breakpoint address is executed. ++ ++ As a limited number of hardware breakpoints is available, it is ++ advisable to use software breakpoints ( break command ) instead ++ of execution hardware breakpoints, unless modification of code ++ is to be avoided. ++ ++2. Write breakpoint - A write breakpoint is triggered when memory ++ location at the breakpoint address is written. ++ ++ A write breakpoint can be placed for data of variable length. Length of ++ a write breakpoint indicates length of the datatype to be ++ watched. Length is 1 for 1 byte data, 2 for 2 byte data, 3 for ++ 4 byte data. ++ ++3. Access breakpoint - An access breakpoint is triggered when memory ++ location at the breakpoint address is either read or written. ++ ++ Access breakpoints also have lengths similar to write breakpoints. ++ ++IO breakpoints in ia-32 are not supported. ++ ++Since gdb stub at present does not use the protocol used by gdb for hardware ++breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros ++for hardware breakpoints are described below. ++ ++hwebrk - Places an execution breakpoint ++ hwebrk breakpointno address ++hwwbrk - Places a write breakpoint ++ hwwbrk breakpointno length address ++hwabrk - Places an access breakpoint ++ hwabrk breakpointno length address ++hwrmbrk - Removes a breakpoint ++ hwrmbrk breakpointno ++exinfo - Tells whether a software or hardware breakpoint has occurred. ++ Prints the number of the hardware breakpoint if a hardware breakpoint has ++ occurred.
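++
++Under the hood, these macros program the ia-32 debug registers: DR0-DR3
++hold the breakpoint addresses and DR7 holds the enable, type and length
++bits.  The fragment below is only an illustration of that register
++layout, not the actual kgdb stub code -- set_hw_break() is a made-up
++name, and a real stub must also map the length codes 1-3 used above
++onto the raw DR7 LEN encodings shown here:
++
++/* n: slot 0-3; type: 0 = execute, 1 = write, 3 = read/write;
++   len: 0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes (raw DR7 encodings;
++   execute breakpoints must use len 0) */
++static void set_hw_break(int n, int type, int len, unsigned long addr)
++{
++	unsigned long dr7;
++
++	switch (n) {	/* DR0-DR3 hold the linear addresses */
++	case 0: asm volatile("movl %0,%%db0" : : "r" (addr)); break;
++	case 1: asm volatile("movl %0,%%db1" : : "r" (addr)); break;
++	case 2: asm volatile("movl %0,%%db2" : : "r" (addr)); break;
++	case 3: asm volatile("movl %0,%%db3" : : "r" (addr)); break;
++	}
++	asm volatile("movl %%db7,%0" : "=r" (dr7));
++	dr7 &= ~(0xfUL << (16 + 4 * n));	/* clear old type/len bits */
++	dr7 |= (unsigned long)((len << 2) | type) << (16 + 4 * n);
++	dr7 |= 2UL << (2 * n);			/* Gn bit: enable slot n */
++	asm volatile("movl %0,%%db7" : : "r" (dr7));
++}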
++ ++Arguments required by these commands are as follows: ++breakpointno - 0 to 3 ++length - 1 to 3 ++address - Memory location in hex digits ( without 0x ) e.g c015e9bc ++ ++SMP support ++=========== ++ ++When a breakpoint occurs or the user issues a break ( Ctrl + C ) to gdb ++client, all the processors are forced to enter the debugger. The current ++thread corresponds to the thread running on the processor where the ++breakpoint occurred. Threads running on other processor(s) appear ++similar to other non-running threads in the 'info threads' output. ++Within the kgdb stub there is a structure "cpus_waiting" in which kgdb ++records the values of "current" and "regs" for each CPU other than the ++one that hit the breakpoint. "current" is a pointer to the task ++structure for the task that CPU is running, while "regs" points to the ++saved registers for the task. This structure can be examined with the ++gdb "p" command. ++ ++ia-32 hardware debugging registers on all processors are set to the same ++values. Hence any hardware breakpoint may occur on any processor. ++ ++gdb troubleshooting ++=================== ++ ++1. gdb hangs ++Kill it. Restart gdb. Connect to the target machine. ++ ++2. gdb cannot connect to target machine (after killing a gdb and ++restarting another) If the target machine was not inside the debugger when ++you killed gdb, gdb cannot connect because the target machine won't ++respond. In this case echo "Ctrl+C" (ASCII 3) to the serial line. ++e.g. echo -e "\003" > /dev/ttyS1 ++This forces the target machine into the debugger, after which you ++can connect. ++ ++3. gdb cannot connect even after echoing Ctrl+C into serial line ++Try changing serial line settings min to 1 and time to 0 ++e.g. stty min 1 time 0 < /dev/ttyS1 ++Try echoing again. ++ ++Check serial line speed and set it to the correct value if required ++e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 ++ ++EVENTS ++====== ++ ++Ever want to know the order of things happening? Which CPU did what and ++when? How did the spinlock get the way it is? Then events are for ++you. Events are defined by calls to an event collection interface and ++saved for later examination. In this case, kgdb events are saved by a ++very fast bit of code in kgdb which is fully SMP and interrupt protected ++and they are examined by using gdb to display them. Kgdb keeps only ++the last N events, where N must be a power of two and is defined at ++configure time. ++ ++ ++Events are signaled to kgdb by calling: ++ ++kgdb_ts(data0,data1) ++ ++kgdb records each call in an array along with other info. ++Here is the array definition: ++ ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ int data0; ++ int data1; ++}; ++ ++For SMP machines the CPU is recorded, for all machines the TSC is ++recorded (gets a time stamp) as well as the line number and source file ++the call was made from. The address of the caller (from), the "if" (interrupt ++flag) and the two data items are also recorded. The macro kgdb_ts casts ++the types to int, so you can put any 32-bit values here. There is a ++configure option to select the number of events you want to keep. A ++nice number might be 128, but you can keep up to 1024 if you want. The ++number must be a power of two. An "andthen" macro library is provided ++for gdb to help you look at these events. It is also possible to define ++a different structure for the event storage and cast the data to this ++structure.
For example, the following structure is defined in kgdb: ++ ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++ ++If you use this for display, the data elements will be displayed as ++pointers to task_struct entries. You may want to define your own ++structure to use in casting. You should only change the last two items ++and you must keep the structure size the same. Kgdb will handle these ++as 32-bit ints, but within that constraint you can define a structure to ++cast to any 32-bit quantity. This need only be available to gdb and is ++only used for casting in the display code. ++ ++Final Items ++=========== ++ ++I picked up this code from Amit S. Kale and enhanced it. ++ ++If you make some really cool modification to this stuff, or if you ++fix a bug, please let me know. ++ ++George Anzinger ++ ++ ++Amit S. Kale ++ ++ ++(First kgdb by David Grothe ) ++ ++(modified by Tigran Aivazian ) ++ Putting gdbstub into the kernel config menu. ++ ++(modified by Scott Foehner ) ++ Hooks for entering gdbstub at boot time. ++ ++(modified by Amit S. Kale ) ++ Threads, ia-32 hw debugging, mp support, console support, ++ nmi watchdog handling. ++ ++(modified by George Anzinger ) ++ Extended threads to include the idle threads. ++ Enhancements to allow breakpoint() at first C code. ++ Use of module_init() and __setup() to automate the configure. ++ Enhanced the cpu "collection" code to work in early bring-up. ++ Added ability to call functions from gdb. ++ Print info thread stuff without going back to schedule(). ++ Now collect the "other" cpus with an IPI/NMI. +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit.hw 2005-04-05 12:48:05.273615368 +0800 +@@ -0,0 +1,117 @@ ++ ++#Using ia-32 hardware breakpoints. ++# ++#4 hardware breakpoints are available in ia-32 processors. These breakpoints ++#do not need code modification. They are set using debug registers. ++# ++#Each hardware breakpoint can be of one of the ++#three types: execution, write, access. ++#1. An Execution breakpoint is triggered when code at the breakpoint address is ++#executed. ++#2. A write breakpoint ( aka watchpoints ) is triggered when memory location ++#at the breakpoint address is written. ++#3. An access breakpoint is triggered when memory location at the breakpoint ++#address is either read or written. ++# ++#As hardware breakpoints are available in limited number, use software ++#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. ++# ++#Length of an access or a write breakpoint defines length of the datatype to ++#be watched. Length is 1 for char, 2 for short, 3 for int. ++# ++#For placing execution, write and access breakpoints, use commands ++#hwebrk, hwwbrk, hwabrk ++#To remove a breakpoint use hwrmbrk command. ++# ++#These commands take the following types of arguments. For arguments associated ++#with each command, use help command. ++#1. breakpointno: 0 to 3 ++#2. length: 1 to 3 ++#3. address: Memory location in hex ( without 0x ) e.g c015e9bc ++# ++#Use the command exinfo to find which hardware breakpoint occurred.
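++#
++#Example (hypothetical session; c015e9bc is the sample address from above):
++#	hwwbrk 0 3 c015e9bc	- watch 4-byte writes at 0xc015e9bc
++#	exinfo			- after the trap, shows which breakpoint fired
++#	hwrmbrk 0		- free breakpoint slot 0 again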
++ ++#hwebrk breakpointno address ++define hwebrk ++ maintenance packet Y$arg0,0,0,$arg1 ++end ++document hwebrk ++ hwebrk <breakpointno> <address>
++ Places a hardware execution breakpoint ++ <breakpointno> = 0 - 3 ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwwbrk breakpointno length address ++define hwwbrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwwbrk ++ hwwbrk <breakpointno> <length> <address>
++ Places a hardware write breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwabrk breakpointno length address ++define hwabrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwabrk ++ hwabrk <breakpointno> <length> <address>
++ Places a hardware access breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
= Hex digits without leading "0x". ++end ++ ++#hwrmbrk breakpointno ++define hwrmbrk ++ maintenance packet y$arg0 ++end ++document hwrmbrk ++ hwrmbrk <breakpointno> ++ <breakpointno> = 0 - 3 ++ Removes a hardware breakpoint ++end ++ ++define reboot ++ maintenance packet r ++end ++#exinfo ++define exinfo ++ maintenance packet qE ++end ++document exinfo ++ exinfo ++ Gives information about a breakpoint. ++end ++define get_th ++ p $th=(struct thread_info *)((int)$esp & ~8191) ++end ++document get_th ++ get_th ++ Gets and prints the current thread_info pointer. Defines $th to be it. ++end ++define get_cu ++ p $cu=((struct thread_info *)((int)$esp & ~8191))->task ++end ++document get_cu ++ get_cu ++ Gets and prints the "current" value. Defines $cu to be it. ++end ++define int_off ++ set var $flags=$eflags ++ set $eflags=$eflags&~0x200 ++ end ++define int_on ++ set var $eflags|=$flags&0x200 ++ end ++document int_off ++ Saves the current interrupt state and clears the processor interrupt ++ flag. Use int_on to restore the saved flag. ++end ++document int_on ++ Restores the interrupt flag saved by int_off. ++end +Index: linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdb-globals.txt 2005-04-05 12:48:05.260617344 +0800 +@@ -0,0 +1,71 @@ ++Sender: akale@veritas.com ++Date: Fri, 23 Jun 2000 19:26:35 +0530 ++From: "Amit S. Kale" ++Organization: Veritas Software (India) ++To: Dave Grothe , linux-kernel@vger.rutgers.edu ++CC: David Milburn , ++ "Edouard G. Parmelan" , ++ ezannoni@cygnus.com, Keith Owens ++Subject: Re: Module debugging using kgdb ++ ++Dave Grothe wrote: ++> ++> Amit: ++> ++> There is a 2.4.0 version of kgdb on our ftp site: ++> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb ++> and loadmodule.sh there. ++> ++> Have a look at the README file and see if I got it right. If not, send ++> me some corrections and I will update it. ++> ++> Does your version of gdb solve the global variable problem? ++ ++Yes. ++Thanks to Elena Zanoni, gdb (development version) can now correctly ++calculate addresses of dynamically loaded object files. I have not been ++following gdb development for some time and am not sure when the symbol ++address calculation fix is going to appear in a gdb stable version. ++ ++Elena, any idea when the fix will make it to a prebuilt gdb from a ++redhat release? ++ ++For the time being I have built a gdb development version. It can be ++used for module debugging with the loadmodule.sh script. ++ ++The problem with calculating module addresses with previous versions ++of gdb was as follows: ++gdb did not use the base address of a section while calculating the address of ++a symbol in the section in an object file loaded via 'add-symbol-file'. ++It used the address of the .text segment instead. Due to this, addresses of ++symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. ++ ++The above-mentioned fix allows gdb to use the base address of a segment while ++calculating the address of a symbol in it. It adds a parameter '-s' to the ++'add-symbol-file' command for specifying the base address of a segment. ++ ++The loadmodule.sh script works as follows. ++ ++1. Copy a module file to the target machine. ++2. Load the module on the target machine using insmod with the -m parameter.
++insmod produces a module load map which contains base addresses of all ++sections in the module and addresses of symbols in the module file. ++3. Find all sections and their base addresses in the module from ++the module map. ++4. Generate a script that loads the module file. The script uses ++'add-symbol-file' and specifies the address of the text segment followed by ++the addresses of all other segments in the module. ++ ++Here is an example gdb script produced by the loadmodule.sh script. ++ ++add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 ++-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 ++-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 ++ ++With this command gdb can calculate addresses of symbols in ANY segment ++in a module file. ++ ++Regards. ++-- ++Amit Kale ++Veritas Software ( http://www.veritas.com ) +Index: linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/gdbinit-modules 2005-04-05 12:48:05.262617040 +0800 +@@ -0,0 +1,146 @@ ++# ++# Useful GDB user-commands to debug Linux Kernel Modules with gdbstub. ++# ++# These don't work for Linux-2.0 or older. ++# ++# Author Edouard G. Parmelan ++# ++# ++# Fri Apr 30 20:33:29 CEST 1999 ++# First public release. ++# ++# Major cleanup after experimenting with the Linux-2.0 kernel without success. ++# Symbols of a module are not in the correct order, I can't explain ++# why :( ++# ++# Fri Mar 19 15:41:40 CET 1999 ++# Initial version. ++# ++# Thu Jan 6 16:29:03 CST 2000 ++# A little fixing by Dave Grothe ++# ++# Mon Jun 19 09:33:13 CDT 2000 ++# Alignment changes from Edouard Parmelan ++# ++# The basic idea is to find where insmod loads the module and inform ++# GDB to load the symbol table of the module with the GDB command ++# ``add-symbol-file <object-file> <address>
''.
++#
++# The Linux kernel holds the list of all loaded modules in module_list,
++# this list ends with &kernel_module (exactly with module->next == NULL,
++# but the last module is not a real module).
++#
++# Insmod allocates the struct module before the object file.  Since
++# Linux-2.1, this structure contains its size.  The real address of
++# the object file is then (char*)module + module->size_of_struct.
++#
++# You can use three user functions ``mod-list'', ``mod-print-symbols''
++# and ``mod-add-symbols''.
++#
++# mod-list list all loaded modules with the format:
++#     <module-address> <module-name>
++#
++# As soon as you have found the address of your module, you can
++# print its exported symbols (mod-print-symbols) or inform GDB to add
++# symbols from your module file (mod-add-symbols).
++#
++# The argument that you give to mod-print-symbols or mod-add-symbols
++# is the <module-address> from the mod-list command.
++#
++# When using the mod-add-symbols command you must also give the full
++# pathname of the module's object code file.
++#
++# The command mod-add-lis is an example of how to make this easier.
++# You can edit this macro to contain the path name of your own
++# favorite module and then use it as a shorthand to load it.  You
++# still need the module-address, however.
++#
++# The internal function ``mod-validate'' sets the GDB variable $mod
++# to a ``struct module*'' if the kernel knows the module, otherwise
++# $mod is set to NULL.  This ensures that symbols are not added for a
++# wrong address.
++#
++# Have a nice hacking day !
++#
++#
++define mod-list
++    set $mod = (struct module*)module_list
++    # the last module is the kernel, ignore it
++    while $mod != &kernel_module
++        printf "%p\t%s\n", (long)$mod, ($mod)->name
++        set $mod = $mod->next
++    end
++end
++document mod-list
++List all modules in the form: <module-address> <module-name>
++Use the <module-address> as the argument for the other
++mod-commands: mod-print-symbols, mod-add-symbols.
++end
++
++define mod-validate
++    set $mod = (struct module*)module_list
++    while ($mod != $arg0) && ($mod != &kernel_module)
++        set $mod = $mod->next
++    end
++    if $mod == &kernel_module
++        set $mod = 0
++        printf "%p is not a module\n", $arg0
++    end
++end
++document mod-validate
++mod-validate <module-address>
++Internal user-command used to validate the module parameter.
++If <module-address> is a real loaded module, set $mod to it, otherwise set $mod to 0.
++end
++
++
++define mod-print-symbols
++    mod-validate $arg0
++    if $mod != 0
++        set $i = 0
++        while $i < $mod->nsyms
++            set $sym = $mod->syms[$i]
++            printf "%p\t%s\n", $sym->value, $sym->name
++            set $i = $i + 1
++        end
++    end
++end
++document mod-print-symbols
++mod-print-symbols <module-address>
++Print all exported symbols of the module.  See mod-list.
++end
++
++
++define mod-add-symbols-align
++    mod-validate $arg0
++    if $mod != 0
++        set $mod_base = ($mod->size_of_struct + (long)$mod)
++        if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0)
++            set $mod_base = ($mod_base | ($arg2 - 1)) + 1
++        end
++        add-symbol-file $arg1 $mod_base
++    end
++end
++document mod-add-symbols-align
++mod-add-symbols-align <module-address> <object-file> <alignment>
++Load the symbols table of the module <module-address> from the object file
++<object-file>, where the first section alignment is <alignment>.
++To retrieve the alignment, use `objdump -h <object-file>'.
++end
++
++define mod-add-symbols
++    mod-add-symbols-align $arg0 $arg1 sizeof(long)
++end
++document mod-add-symbols
++mod-add-symbols <module-address> <object-file>
++Load the symbols table of the module <module-address> from the object file.
++Default alignment is 4.  See mod-add-symbols-align.
++end
++
++define mod-add-lis
++    mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16
++end
++document mod-add-lis
++mod-add-lis <module-address>
++Does mod-add-symbols <module-address> /usr/src/LiS/streams.o
++end
+Index: linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/debug-nmi.txt	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/debug-nmi.txt	2005-04-05 12:48:05.261617192 +0800
+@@ -0,0 +1,37 @@
++Subject: Debugging with NMI
++Date: Mon, 12 Jul 1999 11:28:31 -0500
++From: David Grothe
++Organization: Gcom, Inc
++To: David Grothe
++
++Kernel hackers:
++
++Maybe this is old hat, but it is new to me --
++
++On an ISA bus machine, if you short out the A1 and B1 pins of an ISA
++slot you will generate an NMI to the CPU.  This interrupts even a
++machine that is hung in a loop with interrupts disabled.  Used in
++conjunction with kgdb <
++ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can
++gain debugger control of a machine that is hung in the kernel!  Even
++without kgdb the kernel will print a stack trace so you can find out
++where it was hung.
++
++The A1/B1 pins are directly opposite one another and the farthest pins
++towards the bracket end of the ISA bus socket.  You can stick a paper
++clip or multi-meter probe between them to short them out.
++
++I had a spare ISA bus to PC104 bus adapter around.  The PC104 end of the
++board consists of two rows of wire wrap pins.  So I wired a push button
++between the A1/B1 pins and now have an ISA board that I can stick into
++any ISA bus slot for debugger entry.
++
++Microsoft has a circuit diagram of a PCI card at
++http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM.  If you want to
++build one you will have to mail them and ask for the PAL equations.
++Nobody makes one commercially.
++
++[THIS TIP COMES WITH NO WARRANTY WHATSOEVER.  It works for me, but if
++your machine catches fire, it is your problem, not mine.]
++
++-- Dave (the kgdb guy)
+Index: linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh
+===================================================================
+--- linux-2.6.10.orig/Documentation/i386/kgdb/loadmodule.sh	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/Documentation/i386/kgdb/loadmodule.sh	2005-04-05 12:48:05.274615216 +0800
+@@ -0,0 +1,78 @@
++#!/bin/sh
++# This script loads a module on a target machine and generates a gdb script.
++# Source the generated gdb script to load the module file at the appropriate
++# addresses in gdb.
++#
++# Usage:
++# Loading the module on the target machine and generating the gdb script:
++#	[foo]$ loadmodule.sh <modulefile>
++#
++# Loading the module file into gdb:
++#	(gdb) source <gdbscript>
++#
++# Modify the following variables according to your setup.
++#	TESTMACHINE - Name of the target machine
++#	GDBSCRIPTS - The directory where a gdb script will be generated
++#
++# Author: Amit S. Kale (akale@veritas.com).
++#
++# If you run into problems, please check the files pointed to by the
++# following variables.
++#	ERRFILE - /tmp/<modulefile>.errs contains stderr output of insmod
++#	MAPFILE - /tmp/<modulefile>.map contains stdout output of insmod
++#	GDBSCRIPT - $GDBSCRIPTS/load<modulefile> gdb script.
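[As a cross-check of the arithmetic that the mod-add-symbols-align macro in
gdbinit-modules above performs, the same base-address computation can be
written out in C.  This is an illustration only; the function name and the
example values are hypothetical and not part of the patch.

/* Where insmod placed the module's object file: the struct module sits
 * first, then the object, rounded up to the first section alignment. */
unsigned long module_object_base(unsigned long mod_addr,
				 unsigned long size_of_struct,
				 unsigned long align)
{
	unsigned long base = mod_addr + size_of_struct;

	/* same rounding as: set $mod_base = ($mod_base | ($arg2 - 1)) + 1 */
	if (align != 0 && (base & (align - 1)) != 0)
		base = (base | (align - 1)) + 1;
	return base;
}

For instance, a hypothetical module at 0xd082c000 with a 0x54-byte struct
module and 16-byte alignment yields 0xd082c060, the kind of .text base seen
in the example add-symbol-file script above.]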
++ ++TESTMACHINE=foo ++GDBSCRIPTS=/home/bar ++ ++if [ $# -lt 1 ] ; then { ++ echo Usage: $0 modulefile ++ exit ++} ; fi ++ ++MODULEFILE=$1 ++MODULEFILEBASENAME=`basename $1` ++ ++if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { ++ MODULEFILE=`pwd`/$MODULEFILE ++} fi ++ ++ERRFILE=/tmp/$MODULEFILEBASENAME.errs ++MAPFILE=/tmp/$MODULEFILEBASENAME.map ++GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME ++ ++function findaddr() { ++ local ADDR=0x$(echo "$SEGMENTS" | \ ++ grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ ++ sed 's/[ ]*[^ ]*$//') ++ echo $ADDR ++} ++ ++function checkerrs() { ++ if [ "`cat $ERRFILE`" != "" ] ; then { ++ cat $ERRFILE ++ exit ++ } fi ++} ++ ++#load the module ++echo Copying $MODULEFILE to $TESTMACHINE ++rcp $MODULEFILE root@${TESTMACHINE}: ++ ++echo Loading module $MODULEFILE ++rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ ++ > $MAPFILE 2> $ERRFILE ++checkerrs ++ ++SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` ++TEXTADDR=$(findaddr "\\.text[^.]") ++LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" ++SEGADDRS=`echo "$SEGMENTS" | awk '//{ ++ if ($1 != ".text" && $1 != ".this" && ++ $1 != ".kstrtab" && $1 != ".kmodtab") { ++ print " -s " $1 " 0x" $3 " " ++ } ++}'` ++LOADSTRING="$LOADSTRING $SEGADDRS" ++echo Generating script $GDBSCRIPT ++echo $LOADSTRING > $GDBSCRIPT +Index: linux-2.6.10/Documentation/i386/kgdb/andthen +=================================================================== +--- linux-2.6.10.orig/Documentation/i386/kgdb/andthen 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/Documentation/i386/kgdb/andthen 2005-04-05 12:48:05.272615520 +0800 +@@ -0,0 +1,100 @@ ++ ++define set_andthen ++ set var $thp=0 ++ set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] ++ set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) ++ set var $at_oc=kgdb_and_then_count ++ set var $at_cc=$at_oc ++end ++ ++define andthen_next ++ set var $at_cc=$arg0 ++end ++ ++define andthen ++ andthen_set_edge ++ if ($at_cc >= $at_oc) ++ printf "Outside window. Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc ++ output *($thp+($at_cc++ % $at_size )) ++ printf "\n" ++ end ++end ++define andthen_set_edge ++ set var $at_oc=kgdb_and_then_count ++ set var $at_low = $at_oc - $at_size ++ if ($at_low < 0 ) ++ set var $at_low = 0 ++ end ++ if (( $at_cc > $at_oc) || ($at_cc < $at_low)) ++ printf "Count outside of window, setting count to " ++ if ($at_cc >= $at_oc) ++ set var $at_cc = $at_oc ++ else ++ set var $at_cc = $at_low ++ end ++ printf "%d\n",$at_cc ++ end ++end ++ ++define beforethat ++ andthen_set_edge ++ if ($at_cc <= $at_low) ++ printf "Outside window. Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc-1 ++ output *($thp+(--$at_cc % $at_size )) ++ printf "\n" ++ end ++end ++ ++document andthen_next ++ andthen_next ++ . sets the number of the event to display next. If this event ++ . is not in the event pool, either andthen or beforethat will ++ . correct it to the nearest event pool edge. The event pool ++ . ends at the last event recorded and begins ++ . prior to that. If beforethat is used next, it will display ++ . event -1. ++. ++ andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++ ++document andthen ++ andthen ++. displays the next event in the list. sets up to display ++. the oldest saved event first. ++. (optional) count of the event to display. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to andthen the index will change ++. 
but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document set_andthen ++ set_andthen ++. sets up to use the and commands. ++. if you have defined your own struct, use the above and ++. then enter the following: ++. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] ++. where is the name of your structure. ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document beforethat ++ beforethat ++. displays the next prior event in the list. sets up to ++. display the last occuring event first. ++. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to beforethat the index will change ++. but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end +Index: linux-2.6.10/arch/i386/lib/kgdb_serial.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/lib/kgdb_serial.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/lib/kgdb_serial.c 2005-04-05 12:48:05.193627528 +0800 +@@ -0,0 +1,485 @@ ++/* ++ * Serial interface GDB stub ++ * ++ * Written (hacked together) by David Grothe (dave@gcom.com) ++ * Modified to allow invokation early in boot see also ++ * kgdb.h for instructions by George Anzinger(george@mvista.com) ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KGDB_USER_CONSOLE ++extern void kgdb_console_finit(void); ++#endif ++#define PRNT_off ++#define TEST_EXISTANCE ++#ifdef PRNT ++#define dbprintk(s) printk s ++#else ++#define dbprintk(s) ++#endif ++#define TEST_INTERRUPT_off ++#ifdef TEST_INTERRUPT ++#define intprintk(s) printk s ++#else ++#define intprintk(s) ++#endif ++ ++#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++static char gdb_buf[GDB_BUF_SIZE]; ++static int gdb_buf_in_inx; ++static atomic_t gdb_buf_in_cnt; ++static int gdb_buf_out_inx; ++ ++struct async_struct *gdb_async_info; ++static int gdb_async_irq; ++ ++#define outb_px(a,b) outb_p(b,a) ++ ++static void program_uart(struct async_struct *info); ++static void write_char(struct async_struct *info, int chr); ++/* ++ * Get a byte from the hardware data buffer and return it ++ */ ++static int ++read_data_bfr(struct async_struct *info) ++{ ++ char it = inb_p(info->port + UART_LSR); ++ ++ if (it & UART_LSR_DR) ++ return (inb_p(info->port + UART_RX)); ++ /* ++ * If we have a framing error assume somebody messed with ++ * our uart. Reprogram it and send '-' both ways... ++ */ ++ if (it & 0xc) { ++ program_uart(info); ++ write_char(info, '-'); ++ return ('-'); ++ } ++ return (-1); ++ ++} /* read_data_bfr */ ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ ++ * Locking here is a bit of a problem. We MUST not lock out communication ++ * if we are trying to talk to gdb about a kgdb entry. ON the other hand ++ * we can loose chars in the console pass thru if we don't lock. 
It is also ++ * possible that we could hold the lock or be waiting for it when kgdb ++ * NEEDS to talk. Since kgdb locks down the world, it does not need locks. ++ * We do, of course have possible issues with interrupting a uart operation, ++ * but we will just depend on the uart status to help keep that straight. ++ ++ */ ++static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; ++#ifdef CONFIG_SMP ++extern spinlock_t kgdb_spinlock; ++#endif ++ ++static int ++read_char(struct async_struct *info) ++{ ++ int chr; ++ unsigned long flags; ++ local_irq_save(flags); ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_lock(&uart_interrupt_lock); ++ } ++#endif ++ if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ ++ chr = gdb_buf[gdb_buf_out_inx++]; ++ gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); ++ atomic_dec(&gdb_buf_in_cnt); ++ } else { ++ chr = read_data_bfr(info); ++ } ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_unlock(&uart_interrupt_lock); ++ } ++#endif ++ local_irq_restore(flags); ++ return (chr); ++} ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void ++write_char(struct async_struct *info, int chr) ++{ ++ while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; ++ ++ outb_p(chr, info->port + UART_TX); ++ ++} /* write_char */ ++ ++/* ++ * Mostly we don't need a spinlock, but since the console goes ++ * thru here with interrutps on, well, we need to catch those ++ * chars. ++ */ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * It will receive a limited number of characters of input ++ * from the gdb host machine and save them up in a buffer. ++ * ++ * When the gdb stub routine getDebugChar() is called it ++ * draws characters out of the buffer until it is empty and ++ * then reads directly from the serial port. ++ * ++ * We do not attempt to write chars from the interrupt routine ++ * since the stubs do all of that via putDebugChar() which ++ * writes one byte after waiting for the interface to become ++ * ready. ++ * ++ * The debug stubs like to run with interrupts disabled since, ++ * after all, they run as a consequence of a breakpoint in ++ * the kernel. ++ * ++ * Perhaps someone who knows more about the tty driver than I ++ * care to learn can make this work for any low level serial ++ * driver. ++ */ ++static irqreturn_t ++gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct async_struct *info; ++ unsigned long flags; ++ ++ info = gdb_async_info; ++ if (!info || !info->tty || irq != gdb_async_irq) ++ return IRQ_NONE; ++ ++ local_irq_save(flags); ++ spin_lock(&uart_interrupt_lock); ++ do { ++ int chr = read_data_bfr(info); ++ intprintk(("Debug char on int: %x hex\n", chr)); ++ if (chr < 0) ++ continue; ++ ++ if (chr == 3) { /* Ctrl-C means remote interrupt */ ++ BREAKPOINT; ++ continue; ++ } ++ ++ if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { ++ /* buffer overflow tosses early char */ ++ read_char(info); ++ } ++ gdb_buf[gdb_buf_in_inx++] = chr; ++ gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); ++ } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); ++ spin_unlock(&uart_interrupt_lock); ++ local_irq_restore(flags); ++ return IRQ_HANDLED; ++} /* gdb_interrupt */ ++ ++/* ++ * Just a NULL routine for testing. 
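[A note on the buffer arithmetic above: GDB_BUF_SIZE must be a power of two
("power of 2, please") precisely so that masking with GDB_BUF_SIZE - 1 wraps
the ring index without a divide.  The scheme in isolation, as a sketch rather
than patch code:

#define BUF_SIZE 512			/* must be a power of two */

static char buf[BUF_SIZE];
static int in_inx, out_inx, count;	/* count stands in for the atomic_t */

/* producer side, as in gdb_interrupt() */
static void put_byte(char c)
{
	if (count >= BUF_SIZE)
		return;			/* full; the driver drops the oldest char instead */
	buf[in_inx++] = c;
	in_inx &= (BUF_SIZE - 1);	/* wrap: only valid for power-of-two sizes */
	count++;
}

/* consumer side, as in read_char() */
static int get_byte(void)
{
	int c;

	if (count == 0)
		return -1;		/* empty: fall back to reading the UART */
	c = buf[out_inx++];
	out_inx &= (BUF_SIZE - 1);
	count--;
	return c;
}]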
++ */ ++void ++gdb_null(void) ++{ ++} /* gdb_null */ ++ ++/* These structure are filled in with values defined in asm/kgdb_local.h ++ */ ++static struct serial_state state = SB_STATE; ++static struct async_struct local_info = SB_INFO; ++static int ok_to_enable_ints = 0; ++static void kgdb_enable_ints_now(void); ++ ++extern char *kgdb_version; ++/* ++ * Hook an IRQ for KGDB. ++ * ++ * This routine is called from putDebugChar, below. ++ */ ++static int ints_disabled = 1; ++int ++gdb_hook_interrupt(struct async_struct *info, int verb) ++{ ++ struct serial_state *state = info->state; ++ unsigned long flags; ++ int port; ++#ifdef TEST_EXISTANCE ++ int scratch, scratch2; ++#endif ++ ++ /* The above fails if memory managment is not set up yet. ++ * Rather than fail the set up, just keep track of the fact ++ * and pick up the interrupt thing later. ++ */ ++ gdb_async_info = info; ++ port = gdb_async_info->port; ++ gdb_async_irq = state->irq; ++ if (verb) { ++ printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", ++ kgdb_version, ++ port, ++ gdb_async_irq, gdb_async_info->state->custom_divisor); ++ } ++ local_irq_save(flags); ++#ifdef TEST_EXISTANCE ++ /* Existance test */ ++ /* Should not need all this, but just in case.... */ ++ ++ scratch = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, 0); ++ outb_px(0xff, 0x080); ++ scratch2 = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, scratch); ++ if (scratch2) { ++ printk ++ ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); ++ local_irq_restore(flags); ++ return 1; /* We failed; there's nothing here */ ++ } ++ scratch2 = inb_p(port + UART_LCR); ++ outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ ++ outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ ++ outb_px(port + UART_LCR, 0); ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); ++ scratch = inb_p(port + UART_IIR) >> 6; ++ if (scratch == 1) { ++ printk("gdb_hook_interrupt: Undefined UART type!" ++ " Not a UART! \n"); ++ local_irq_restore(flags); ++ return 1; ++ } else { ++ dbprintk(("gdb_hook_interrupt: UART type " ++ "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); ++ } ++ scratch = inb_p(port + UART_MCR); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); ++ scratch2 = inb_p(port + UART_MSR) & 0xF0; ++ outb_px(port + UART_MCR, scratch); ++ if (scratch2 != 0x90) { ++ printk("gdb_hook_interrupt: " ++ "Loop back test failed! 
Not a UART!\n"); ++ local_irq_restore(flags); ++ return scratch2 + 1000; /* force 0 to fail */ ++ } ++#endif /* test existance */ ++ program_uart(info); ++ local_irq_restore(flags); ++ ++ return (0); ++ ++} /* gdb_hook_interrupt */ ++ ++static void ++program_uart(struct async_struct *info) ++{ ++ int port = info->port; ++ ++ (void) inb_p(port + UART_RX); ++ outb_px(port + UART_IER, 0); ++ ++ (void) inb_p(port + UART_RX); /* serial driver comments say */ ++ (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ ++ (void) inb_p(port + UART_MSR); ++ outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); ++ outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ ++ outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ ++ outb_px(port + UART_MCR, info->MCR); ++ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ ++ outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ return; ++} ++ ++/* ++ * getDebugChar ++ * ++ * This is a GDB stub routine. It waits for a character from the ++ * serial interface and then returns it. If there is no serial ++ * interface connection then it returns a bogus value which will ++ * almost certainly cause the system to hang. In the ++ */ ++int kgdb_in_isr = 0; ++int kgdb_in_lsr = 0; ++extern spinlock_t kgdb_spinlock; ++ ++/* Caller takes needed protections */ ++ ++int ++getDebugChar(void) ++{ ++ volatile int chr, dum, time, end_time; ++ ++ dbprintk(("getDebugChar(port %x): ", gdb_async_info->port)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ /* ++ * This trick says if we wait a very long time and get ++ * no char, return the -1 and let the upper level deal ++ * with it. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; ++ while (((chr = read_char(gdb_async_info)) == -1) && ++ (end_time - time) > 0) { ++ rdtsc(dum, time); ++ }; ++ /* ++ * This covers our butts if some other code messes with ++ * our uart, hay, it happens :o) ++ */ ++ if (chr == -1) ++ program_uart(gdb_async_info); ++ ++ dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); ++ return (chr); ++ ++} /* getDebugChar */ ++ ++static int count = 3; ++static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; ++ ++static int __init ++kgdb_enable_ints(void) ++{ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 1); ++ } ++ ok_to_enable_ints = 1; ++ kgdb_enable_ints_now(); ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ kgdb_console_finit(); ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_SERIAL_8250 ++void shutdown_for_kgdb(struct async_struct *gdb_async_info); ++#endif ++ ++#ifdef CONFIG_DISCONTIGMEM ++static inline int kgdb_mem_init_done(void) ++{ ++ return highmem_start_page != NULL; ++} ++#else ++static inline int kgdb_mem_init_done(void) ++{ ++ return max_mapnr != 0; ++} ++#endif ++ ++static void ++kgdb_enable_ints_now(void) ++{ ++ if (!spin_trylock(&one_at_atime)) ++ return; ++ if (!ints_disabled) ++ goto exit; ++ if (kgdb_mem_init_done() && ++ ints_disabled) { /* don't try till mem init */ ++#ifdef CONFIG_SERIAL_8250 ++ /* ++ * The ifdef here allows the system to be configured ++ * without the serial driver. 
++ * Don't make it a module, however, it will steal the port ++ */ ++ shutdown_for_kgdb(gdb_async_info); ++#endif ++ ints_disabled = request_irq(gdb_async_info->state->irq, ++ gdb_interrupt, ++ IRQ_T(gdb_async_info), ++ "KGDB-stub", NULL); ++ intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); ++ } ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ exit: ++ spin_unlock(&one_at_atime); ++} ++ ++/* ++ * putDebugChar ++ * ++ * This is a GDB stub routine. It waits until the interface is ready ++ * to transmit a char and then sends it. If there is no serial ++ * interface connection then it simply returns to its caller, having ++ * pretended to send the char. Caller takes needed protections. ++ */ ++void ++putDebugChar(int chr) ++{ ++ dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", ++ gdb_async_info->port, ++ chr, ++ chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ ++ write_char(gdb_async_info, chr); /* this routine will wait */ ++ count = (chr == '#') ? 0 : count + 1; ++ if ((count == 2)) { /* try to enable after */ ++ if (ints_disabled & ok_to_enable_ints) ++ kgdb_enable_ints_now(); /* try to enable after */ ++ ++ /* We do this a lot because, well we really want to get these ++ * interrupts. The serial driver will clear these bits when it ++ * initializes the chip. Every thing else it does is ok, ++ * but this. ++ */ ++ if (!ints_disabled) { ++ outb_px(gdb_async_info->port + UART_IER, ++ gdb_async_info->IER); ++ } ++ } ++ ++} /* putDebugChar */ ++ ++module_init(kgdb_enable_ints); +Index: linux-2.6.10/arch/i386/lib/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/lib/Makefile 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/i386/lib/Makefile 2005-04-05 12:48:05.194627376 +0800 +@@ -8,3 +8,4 @@ + + lib-$(CONFIG_X86_USE_3DNOW) += mmx.o + lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o ++lib-$(CONFIG_KGDB) += kgdb_serial.o +Index: linux-2.6.10/arch/i386/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 12:48:05.204625856 +0800 +@@ -65,4 +65,6 @@ + depends on X86_LOCAL_APIC && !X86_VISWS + default y + ++source "arch/i386/Kconfig.kgdb" ++ + endmenu +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 12:48:03.413898088 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:48:05.244619776 +0800 +@@ -48,6 +48,18 @@ + #include + #include + #include "irq_vectors.h" ++ /* We do not recover from a stack overflow, but at least ++ * we know it happened and should be able to track it down. 
++ */ ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#define STACK_OVERFLOW_TEST \ ++ testl $(THREAD_SIZE - 512),%esp; \ ++ jnz 10f; \ ++ call stack_overflow; \ ++10: ++#else ++#define STACK_OVERFLOW_TEST ++#endif + + #define nr_syscalls ((syscall_table_size)/4) + +@@ -94,7 +106,8 @@ + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ +- movl %edx, %es; ++ movl %edx, %es; \ ++ STACK_OVERFLOW_TEST + + #define RESTORE_INT_REGS \ + popl %ebx; \ +@@ -198,6 +211,7 @@ + # sysenter call handler stub + ENTRY(sysenter_entry) + movl TSS_sysenter_esp0(%esp),%esp ++ .globl sysenter_past_esp + sysenter_past_esp: + sti + pushl $(__USER_DS) +@@ -261,6 +275,19 @@ + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + restore_all: ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++ movl EFLAGS(%esp), %eax # mix EFLAGS and CS ++ movb CS(%esp), %al ++ testl $(VM_MASK | 3), %eax ++ jz resume_kernelX # returning to kernel or vm86-space ++ ++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? ++ jz resume_kernelX ++ ++ int $3 ++ ++resume_kernelX: ++#endif + RESTORE_ALL + + # perform work that needs to be done immediately before resumption +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-03-31 16:20:09.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 12:48:05.221623272 +0800 +@@ -105,6 +105,39 @@ + return err; + } + ++#ifdef CONFIG_KGDB ++extern void sysenter_past_esp(void); ++#include ++#include ++void set_intr_gate(unsigned int n, void *addr); ++static void set_intr_usr_gate(unsigned int n, void *addr); ++/* ++ * Should be able to call this breakpoint() very early in ++ * bring up. Just hard code the call where needed. ++ * The breakpoint() code is here because set_?_gate() functions ++ * are local (static) to trap.c. They need be done only once, ++ * but it does not hurt to do them over. ++ */ ++void breakpoint(void) ++{ ++ set_intr_usr_gate(3,&int3); /* disable ints on trap */ ++ set_intr_gate(1,&debug); ++ set_intr_gate(14,&page_fault); ++ ++ BREAKPOINT; ++} ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ ++ { \ ++ if (!user_mode(regs) ) \ ++ { \ ++ kgdb_handle_exception(trapnr, signr, error_code, regs); \ ++ after; \ ++ } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ ++ } ++#else ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) ++#endif ++ + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) + { + return p > (void *)tinfo && +@@ -332,6 +365,15 @@ + #endif + if (nl) + printk("\n"); ++#ifdef CONFIG_KGDB ++ /* This is about the only place we want to go to kgdb even if in ++ * user mode. But we must go in via a trap so within kgdb we will ++ * always be in kernel mode. 
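[The STACK_OVERFLOW_TEST macro added to entry.S above has a compact C
reading: with THREAD_SIZE = 8192, the mask THREAD_SIZE - 512 covers %esp bits
that are nonzero whenever the stack pointer is more than 512 bytes above the
bottom of the thread stack; when they all read zero, execution is inside the
last 512 bytes and stack_overflow() is called.  A sketch of the equivalent
check, assuming 8KB i386 kernel stacks:

#define THREAD_SIZE 8192		/* 8KB stacks assumed, as on i386 here */

extern void stack_overflow(void);	/* the reporting hook the macro calls */

/* C equivalent of: testl $(THREAD_SIZE - 512),%esp ; jnz 10f ; call stack_overflow */
static inline void check_stack_overflow(unsigned long esp)
{
	if ((esp & (THREAD_SIZE - 512)) == 0)
		stack_overflow();
}]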
++ */ ++ if (user_mode(regs)) ++ BREAKPOINT; ++#endif ++ CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + } else +@@ -397,6 +439,7 @@ + #define DO_ERROR(trapnr, signr, str, name) \ + fastcall void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -420,6 +463,7 @@ + #define DO_VM86_ERROR(trapnr, signr, str, name) \ + fastcall void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -503,6 +547,7 @@ + + gp_in_kernel: + if (!fixup_exception(regs)) { ++ CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; +@@ -716,12 +761,35 @@ + * allowing programs to debug themselves without the ptrace() + * interface. + */ +- if ((regs->xcs & 3) == 0) +- goto clear_TF_reenable; ++#ifdef CONFIG_KGDB ++ /* ++ * I think this is the only "real" case of a TF in the kernel ++ * that really belongs to user space. Others are ++ * "Ours all ours!" ++ */ ++ if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_past_esp)) ++ goto clear_TF_reenable; ++#else ++ if ((regs->xcs & 3) == 0) ++ goto clear_TF_reenable; ++#endif + if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) + goto clear_TF; + } + ++#ifdef CONFIG_KGDB ++ /* ++ * If this is a kernel mode trap, we need to reset db7 to allow us ++ * to continue sanely ALSO skip the signal delivery ++ */ ++ if ((regs->xcs & 3) == 0) ++ goto clear_dr7; ++ ++ /* if not kernel, allow ints but only if they were on */ ++ if (regs->eflags & 0x200) ++ local_irq_enable(); ++#endif ++ + /* Ok, finally something we can handle */ + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; +@@ -743,6 +811,7 @@ + __asm__("movl %0,%%db7" + : /* no output */ + : "r" (0)); ++ CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) + return; + + debug_vm86: +@@ -999,6 +1068,12 @@ + { + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + } ++#ifdef CONFIG_KGDB ++void set_intr_usr_gate(unsigned int n, void *addr) ++{ ++ _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); ++} ++#endif + + + void __init trap_init(void) +@@ -1016,7 +1091,11 @@ + set_trap_gate(0,÷_error); + set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); ++#ifndef CONFIG_KGDB + set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ ++#else ++ set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ ++#endif + set_system_gate(4,&overflow); + set_system_gate(5,&bounds); + set_trap_gate(6,&invalid_op); +Index: linux-2.6.10/arch/i386/kernel/nmi.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/nmi.c 2005-03-31 15:57:19.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/nmi.c 2005-04-05 12:48:05.222623120 +0800 +@@ -34,7 +34,17 @@ + + #include "mach_traps.h" + ++#ifdef CONFIG_KGDB ++#include ++#ifdef CONFIG_SMP ++unsigned int nmi_watchdog = NMI_IO_APIC; ++#else ++unsigned int nmi_watchdog = NMI_LOCAL_APIC; ++#endif ++#else + unsigned int nmi_watchdog = NMI_NONE; ++#endif ++ + extern int unknown_nmi_panic; + static unsigned int nmi_hz = HZ; + static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +@@ -466,6 +476,9 @@ + for (i = 0; 
i < NR_CPUS; i++) + alert_counter[i] = 0; + } ++#ifdef CONFIG_KGDB ++int tune_watchdog = 5*HZ; ++#endif + + extern void die_nmi(struct pt_regs *, const char *msg); + +@@ -480,14 +493,25 @@ + int sum, cpu = smp_processor_id(); + + sum = irq_stat[cpu].apic_timer_irqs; +- +- if (last_irq_sums[cpu] == sum) { ++#ifdef CONFIG_KGDB ++ if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) { ++ ++#else ++ if (last_irq_sums[cpu] == sum) { ++#endif + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; +- if (alert_counter[cpu] == 30*nmi_hz) ++#ifdef CONFIG_KGDB ++ if (alert_counter[cpu] == tune_watchdog) { ++ kgdb_handle_exception(2, SIGPWR, 0, regs); ++ last_irq_sums[cpu] = sum; ++ alert_counter[cpu] = 0; ++ } ++#endif ++ if (alert_counter[cpu] == 5*nmi_hz) + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } else { + last_irq_sums[cpu] = sum; +Index: linux-2.6.10/arch/i386/kernel/kgdb_stub.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/kgdb_stub.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/kernel/kgdb_stub.c 2005-04-05 12:48:05.242620080 +0800 +@@ -0,0 +1,2330 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (c) 2000 VERITAS Software Corporation. ++ * ++ */ ++/**************************************************************************** ++ * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ ++ * ++ * Module name: remcom.c $ ++ * Revision: 1.34 $ ++ * Date: 91/03/09 12:29:49 $ ++ * Contributor: Lake Stevens Instrument Division$ ++ * ++ * Description: low level support for gdb debugger. $ ++ * ++ * Considerations: only works on target hardware $ ++ * ++ * Written by: Glenn Engel $ ++ * Updated by: David Grothe ++ * ModuleState: Experimental $ ++ * ++ * NOTES: See Below $ ++ * ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Compatibility with 2.1.xx kernel by David Grothe ++ * ++ * Changes to allow auto initilization. All that is needed is that it ++ * be linked with the kernel and a break point (int 3) be executed. ++ * The header file defines BREAKPOINT to allow one to do ++ * this. It should also be possible, once the interrupt system is up, to ++ * call putDebugChar("+"). Once this is done, the remote debugger should ++ * get our attention by sending a ^C in a packet. George Anzinger ++ * ++ * Integrated into 2.2.5 kernel by Tigran Aivazian ++ * Added thread support, support for multiple processors, ++ * support for ia-32(x86) hardware debugging. ++ * Amit S. Kale ( akale@veritas.com ) ++ * ++ * ++ * To enable debugger support, two things need to happen. One, a ++ * call to set_debug_traps() is necessary in order to allow any breakpoints ++ * or error conditions to be properly intercepted and reported to gdb. ++ * Two, a breakpoint needs to be generated to begin communication. This ++ * is most easily accomplished by a call to breakpoint(). Breakpoint() ++ * simulates a breakpoint by executing an int 3. 
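[The two-step enablement the header above describes looks like this from the
caller's side.  A minimal sketch using only the entry points the comment
names; the wrapper function itself is hypothetical:

extern void set_debug_traps(void);	/* route traps/breakpoints to the stub */
extern void breakpoint(void);		/* executes int 3 to open the session */

/* hypothetical call site, early enough that the serial port is usable */
static void kgdb_hello(void)
{
	set_debug_traps();
	breakpoint();	/* gdb on the other end of the serial line takes over */
}]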
++ * ++ ************* ++ * ++ * The following gdb commands are supported: ++ * ++ * command function Return value ++ * ++ * g return the value of the CPU registers hex data or ENN ++ * G set the value of the CPU registers OK or ENN ++ * ++ * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN ++ * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN ++ * ++ * c Resume at current address SNN ( signal NN) ++ * cAA..AA Continue at address AA..AA SNN ++ * ++ * s Step one instruction SNN ++ * sAA..AA Step one instruction from AA..AA SNN ++ * ++ * k kill ++ * ++ * ? What was the last sigval ? SNN (signal NN) ++ * ++ * All commands and responses are sent with a packet which includes a ++ * checksum. A packet consists of ++ * ++ * $#. ++ * ++ * where ++ * :: ++ * :: < two hex digits computed as modulo 256 sum of > ++ * ++ * When a packet is received, it is first acknowledged with either '+' or '-'. ++ * '+' indicates a successful transfer. '-' indicates a failed transfer. ++ * ++ * Example: ++ * ++ * Host: Reply: ++ * $m0,10#2a +$00010203040506070809101112131415#42 ++ * ++ ****************************************************************************/ ++#define KGDB_VERSION "<20030915.1651.33>" ++#include ++#include ++#include /* for strcpy */ ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/************************************************************************ ++ * ++ * external low-level support routines ++ */ ++typedef void (*Function) (void); /* pointer to a function */ ++ ++/* Thread reference */ ++typedef unsigned char threadref[8]; ++ ++extern void putDebugChar(int); /* write a single character */ ++extern int getDebugChar(void); /* read and return a single char */ ++ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 400 ++ ++char *kgdb_version = KGDB_VERSION; ++ ++/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ ++int debug_regs = 0; /* set to non-zero to print registers */ ++ ++/* filled in by an external module */ ++char *gdb_module_offsets; ++ ++static const char hexchars[] = "0123456789abcdef"; ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES 64 ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++enum regnames { _EAX, /* 0 */ ++ _ECX, /* 1 */ ++ _EDX, /* 2 */ ++ _EBX, /* 3 */ ++ _ESP, /* 4 */ ++ _EBP, /* 5 */ ++ _ESI, /* 6 */ ++ _EDI, /* 7 */ ++ _PC /* 8 also known as eip */ , ++ _PS /* 9 also known as eflags */ , ++ _CS, /* 10 */ ++ _SS, /* 11 */ ++ _DS, /* 12 */ ++ _ES, /* 13 */ ++ _FS, /* 14 */ ++ _GS /* 15 */ ++}; ++ ++/*************************** ASSEMBLY CODE MACROS *************************/ ++/* ++ * Put the error code here just in case the user cares. ++ * Likewise, the vector number here (since GDB only gets the signal ++ * number through the usual means, and that's not very specific). ++ * The called_from is the return address so he can tell how we entered kgdb. ++ * This will allow him to seperate out the various possible entries. 
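[The checksum in the packet format described above is a plain modulo-256 sum
of the payload bytes, sent as two hex digits.  A standalone sketch that
reproduces the documented example exchange ($m0,10#2a):

#include <stdio.h>

/* modulo-256 sum of the packet payload, per the description above */
static unsigned char gdb_checksum(const char *s)
{
	unsigned char sum = 0;

	while (*s)
		sum += (unsigned char)*s++;
	return sum;
}

int main(void)
{
	printf("$%s#%02x\n", "m0,10", gdb_checksum("m0,10")); /* prints $m0,10#2a */
	return 0;
}]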
++ */ ++#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ ++ ++#define PID_MAX PID_MAX_DEFAULT ++ ++#ifdef CONFIG_SMP ++void smp_send_nmi_allbutself(void); ++#define IF_SMP(x) x ++#undef MAX_NO_CPUS ++#ifndef CONFIG_NO_KGDB_CPUS ++#define CONFIG_NO_KGDB_CPUS 2 ++#endif ++#if CONFIG_NO_KGDB_CPUS > NR_CPUS ++#define MAX_NO_CPUS NR_CPUS ++#else ++#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS ++#endif ++#define hold_init hold_on_sstep: 1, ++#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) ++#define NUM_CPUS num_online_cpus() ++#else ++#define IF_SMP(x) ++#define hold_init ++#undef MAX_NO_CPUS ++#define MAX_NO_CPUS 1 ++#define NUM_CPUS 1 ++#endif ++#define NOCPU (struct task_struct *)0xbad1fbad ++/* *INDENT-OFF* */ ++struct kgdb_info { ++ int used_malloc; ++ void *called_from; ++ long long entry_tsc; ++ int errcode; ++ int vector; ++ int print_debug_info; ++#ifdef CONFIG_SMP ++ int hold_on_sstep; ++ struct { ++ volatile struct task_struct *task; ++ int pid; ++ int hold; ++ struct pt_regs *regs; ++ } cpus_waiting[MAX_NO_CPUS]; ++#endif ++} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; ++ ++/* *INDENT-ON* */ ++ ++#define used_m kgdb_info.used_malloc ++/* ++ * This is little area we set aside to contain the stack we ++ * need to build to allow gdb to call functions. We use one ++ * per cpu to avoid locking issues. We will do all this work ++ * with interrupts off so that should take care of the protection ++ * issues. ++ */ ++#define LOOKASIDE_SIZE 200 /* should be more than enough */ ++#define MALLOC_MAX 200 /* Max malloc size */ ++struct { ++ unsigned int esp; ++ int array[LOOKASIDE_SIZE]; ++} fn_call_lookaside[MAX_NO_CPUS]; ++ ++static int trap_cpu; ++static unsigned int OLD_esp; ++ ++#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] ++#define IF_BIT 0x200 ++#define TF_BIT 0x100 ++ ++#define MALLOC_ROUND 8-1 ++ ++static char malloc_array[MALLOC_MAX]; ++IF_SMP(static void to_gdb(const char *mess)); ++void * ++malloc(int size) ++{ ++ ++ if (size <= (MALLOC_MAX - used_m)) { ++ int old_used = used_m; ++ used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); ++ return &malloc_array[old_used]; ++ } else { ++ return NULL; ++ } ++} ++ ++/* ++ * Gdb calls functions by pushing agruments, including a return address ++ * on the stack and the adjusting EIP to point to the function. The ++ * whole assumption in GDB is that we are on a different stack than the ++ * one the "user" i.e. code that hit the break point, is on. This, of ++ * course is not true in the kernel. Thus various dodges are needed to ++ * do the call without directly messing with EIP (which we can not change ++ * as it is just a location and not a register. To adjust it would then ++ * require that we move every thing below EIP up or down as needed. This ++ * will not work as we may well have stack relative pointer on the stack ++ * (such as the pointer to regs, for example). ++ ++ * So here is what we do: ++ * We detect gdb attempting to store into the stack area and instead, store ++ * into the fn_call_lookaside.array at the same relative location as if it ++ * were the area ESP pointed at. We also trap ESP modifications ++ * and uses these to adjust fn_call_lookaside.esp. On entry ++ * fn_call_lookaside.esp will be set to point at the last entry in ++ * fn_call_lookaside.array. This allows us to check if it has changed, and ++ * if so, on exit, we add the registers we will use to do the move and a ++ * trap/ interrupt return exit sequence. 
We then adjust the eflags in the ++ * regs array (remember we now have a copy in the fn_call_lookaside.array) to ++ * kill the interrupt bit, AND we change EIP to point at our set up stub. ++ * As part of the register set up we preset the registers to point at the ++ * begining and end of the fn_call_lookaside.array, so all the stub needs to ++ * do is move words from the array to the stack until ESP= the desired value ++ * then do the rti. This will then transfer to the desired function with ++ * all the correct registers. Nifty huh? ++ */ ++extern asmlinkage void fn_call_stub(void); ++extern asmlinkage void fn_rtn_stub(void); ++/* *INDENT-OFF* */ ++__asm__("fn_rtn_stub:\n\t" ++ "movl %eax,%esp\n\t" ++ "fn_call_stub:\n\t" ++ "1:\n\t" ++ "addl $-4,%ebx\n\t" ++ "movl (%ebx), %eax\n\t" ++ "pushl %eax\n\t" ++ "cmpl %esp,%ecx\n\t" ++ "jne 1b\n\t" ++ "popl %eax\n\t" ++ "popl %ebx\n\t" ++ "popl %ecx\n\t" ++ "iret \n\t"); ++/* *INDENT-ON* */ ++#define gdb_i386vector kgdb_info.vector ++#define gdb_i386errcode kgdb_info.errcode ++#define waiting_cpus kgdb_info.cpus_waiting ++#define remote_debug kgdb_info.print_debug_info ++#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold ++/* gdb locks */ ++ ++#ifdef CONFIG_SMP ++static int in_kgdb_called; ++static spinlock_t waitlocks[MAX_NO_CPUS] = ++ {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; ++/* ++ * The following array has the thread pointer of each of the "other" ++ * cpus. We make it global so it can be seen by gdb. ++ */ ++volatile int in_kgdb_entry_log[MAX_NO_CPUS]; ++volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; ++/* ++static spinlock_t continuelocks[MAX_NO_CPUS]; ++*/ ++spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; ++/* waiters on our spinlock plus us */ ++static atomic_t spinlock_waiters = ATOMIC_INIT(1); ++static int spinlock_count = 0; ++static int spinlock_cpu = 0; ++/* ++ * Note we use nested spin locks to account for the case where a break ++ * point is encountered when calling a function by user direction from ++ * kgdb. Also there is the memory exception recursion to account for. ++ * Well, yes, but this lets other cpus thru too. Lets add a ++ * cpu id to the lock. ++ */ ++#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ ++ spinlock_cpu != smp_processor_id()){\ ++ atomic_inc(&spinlock_waiters); \ ++ while (! 
spin_trylock(x)) {\ ++ in_kgdb(®s);\ ++ }\ ++ atomic_dec(&spinlock_waiters); \ ++ spinlock_count = 1; \ ++ spinlock_cpu = smp_processor_id(); \ ++ }else{ \ ++ spinlock_count++; \ ++ } ++#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) ++#else ++unsigned kgdb_spinlock = 0; ++#define KGDB_SPIN_LOCK(x) --*x ++#define KGDB_SPIN_UNLOCK(x) ++*x ++#endif ++ ++int ++hex(char ch) ++{ ++ if ((ch >= 'a') && (ch <= 'f')) ++ return (ch - 'a' + 10); ++ if ((ch >= '0') && (ch <= '9')) ++ return (ch - '0'); ++ if ((ch >= 'A') && (ch <= 'F')) ++ return (ch - 'A' + 10); ++ return (-1); ++} ++ ++/* scan for the sequence $# */ ++void ++getpacket(char *buffer) ++{ ++ unsigned char checksum; ++ unsigned char xmitcsum; ++ int i; ++ int count; ++ char ch; ++ ++ do { ++ /* wait around for the start character, ignore all other characters */ ++ while ((ch = (getDebugChar() & 0x7f)) != '$') ; ++ checksum = 0; ++ xmitcsum = -1; ++ ++ count = 0; ++ ++ /* now, read until a # or end of buffer is found */ ++ while (count < BUFMAX) { ++ ch = getDebugChar() & 0x7f; ++ if (ch == '#') ++ break; ++ checksum = checksum + ch; ++ buffer[count] = ch; ++ count = count + 1; ++ } ++ buffer[count] = 0; ++ ++ if (ch == '#') { ++ xmitcsum = hex(getDebugChar() & 0x7f) << 4; ++ xmitcsum += hex(getDebugChar() & 0x7f); ++ if ((remote_debug) && (checksum != xmitcsum)) { ++ printk ++ ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", ++ checksum, xmitcsum, buffer); ++ } ++ ++ if (checksum != xmitcsum) ++ putDebugChar('-'); /* failed checksum */ ++ else { ++ putDebugChar('+'); /* successful transfer */ ++ /* if a sequence char is present, reply the sequence ID */ ++ if (buffer[2] == ':') { ++ putDebugChar(buffer[0]); ++ putDebugChar(buffer[1]); ++ /* remove sequence chars from buffer */ ++ count = strlen(buffer); ++ for (i = 3; i <= count; i++) ++ buffer[i - 3] = buffer[i]; ++ } ++ } ++ } ++ } while (checksum != xmitcsum); ++ ++ if (remote_debug) ++ printk("R:%s\n", buffer); ++} ++ ++/* send the packet in buffer. */ ++ ++void ++putpacket(char *buffer) ++{ ++ unsigned char checksum; ++ int count; ++ char ch; ++ ++ /* $#. 
*/ ++ do { ++ if (remote_debug) ++ printk("T:%s\n", buffer); ++ putDebugChar('$'); ++ checksum = 0; ++ count = 0; ++ ++ while ((ch = buffer[count])) { ++ putDebugChar(ch); ++ checksum += ch; ++ count += 1; ++ } ++ ++ putDebugChar('#'); ++ putDebugChar(hexchars[checksum >> 4]); ++ putDebugChar(hexchars[checksum % 16]); ++ ++ } while ((getDebugChar() & 0x7f) != '+'); ++ ++} ++ ++static char remcomInBuffer[BUFMAX]; ++static char remcomOutBuffer[BUFMAX]; ++static short error; ++ ++void ++debug_error(char *format, char *parm) ++{ ++ if (remote_debug) ++ printk(format, parm); ++} ++ ++static void ++print_regs(struct pt_regs *regs) ++{ ++ printk("EAX=%08lx ", regs->eax); ++ printk("EBX=%08lx ", regs->ebx); ++ printk("ECX=%08lx ", regs->ecx); ++ printk("EDX=%08lx ", regs->edx); ++ printk("\n"); ++ printk("ESI=%08lx ", regs->esi); ++ printk("EDI=%08lx ", regs->edi); ++ printk("EBP=%08lx ", regs->ebp); ++ printk("ESP=%08lx ", (long) ®s->esp); ++ printk("\n"); ++ printk(" DS=%08x ", regs->xds); ++ printk(" ES=%08x ", regs->xes); ++ printk(" SS=%08x ", __KERNEL_DS); ++ printk(" FL=%08lx ", regs->eflags); ++ printk("\n"); ++ printk(" CS=%08x ", regs->xcs); ++ printk(" IP=%08lx ", regs->eip); ++#if 0 ++ printk(" FS=%08x ", regs->fs); ++ printk(" GS=%08x ", regs->gs); ++#endif ++ printk("\n"); ++ ++} /* print_regs */ ++ ++#define NEW_esp fn_call_lookaside[trap_cpu].esp ++ ++static void ++regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) ++{ ++ gdb_regs[_EAX] = regs->eax; ++ gdb_regs[_EBX] = regs->ebx; ++ gdb_regs[_ECX] = regs->ecx; ++ gdb_regs[_EDX] = regs->edx; ++ gdb_regs[_ESI] = regs->esi; ++ gdb_regs[_EDI] = regs->edi; ++ gdb_regs[_EBP] = regs->ebp; ++ gdb_regs[_DS] = regs->xds; ++ gdb_regs[_ES] = regs->xes; ++ gdb_regs[_PS] = regs->eflags; ++ gdb_regs[_CS] = regs->xcs; ++ gdb_regs[_PC] = regs->eip; ++ /* Note, as we are a debugging the kernel, we will always ++ * trap in kernel code, this means no priviledge change, ++ * and so the pt_regs structure is not completely valid. In a non ++ * privilege change trap, only EFLAGS, CS and EIP are put on the stack, ++ * SS and ESP are not stacked, this means that the last 2 elements of ++ * pt_regs is not valid (they would normally refer to the user stack) ++ * also, using regs+1 is no good because you end up will a value that is ++ * 2 longs (8) too high. This used to cause stepping over functions ++ * to fail, so my fix is to use the address of regs->esp, which ++ * should point at the end of the stack frame. Note I have ignored ++ * completely exceptions that cause an error code to be stacked, such ++ * as double fault. Stuart Hughes, Zentropix. ++ * original code: gdb_regs[_ESP] = (int) (regs + 1) ; ++ ++ * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
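[The pt_regs caveat above is worth restating in code: on a trap with no
privilege change the CPU pushes only eflags, cs and eip, so the esp/xss
members of struct pt_regs were never written, and the pre-trap stack pointer
is the address of the esp member itself.  A sketch of the idiom the fix
relies on; the function name is illustrative:

#include <asm/ptrace.h>		/* struct pt_regs, 2.6.10 i386 layout */

/* For a kernel-mode trap the frame ends where 'esp' would have been
 * stored, so the member's address equals the value %esp had at trap time. */
static inline unsigned long kernel_trap_esp(struct pt_regs *regs)
{
	return (unsigned long)&regs->esp;	/* not regs->esp, which is junk here */
}]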
++ */ ++ gdb_regs[_ESP] = NEW_esp; ++ gdb_regs[_SS] = __KERNEL_DS; ++ gdb_regs[_FS] = 0xFFFF; ++ gdb_regs[_GS] = 0xFFFF; ++} /* regs_to_gdb_regs */ ++ ++static void ++gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) ++{ ++ regs->eax = gdb_regs[_EAX]; ++ regs->ebx = gdb_regs[_EBX]; ++ regs->ecx = gdb_regs[_ECX]; ++ regs->edx = gdb_regs[_EDX]; ++ regs->esi = gdb_regs[_ESI]; ++ regs->edi = gdb_regs[_EDI]; ++ regs->ebp = gdb_regs[_EBP]; ++ regs->xds = gdb_regs[_DS]; ++ regs->xes = gdb_regs[_ES]; ++ regs->eflags = gdb_regs[_PS]; ++ regs->xcs = gdb_regs[_CS]; ++ regs->eip = gdb_regs[_PC]; ++ NEW_esp = gdb_regs[_ESP]; /* keep the value */ ++#if 0 /* can't change these */ ++ regs->esp = gdb_regs[_ESP]; ++ regs->xss = gdb_regs[_SS]; ++ regs->fs = gdb_regs[_FS]; ++ regs->gs = gdb_regs[_GS]; ++#endif ++ ++} /* gdb_regs_to_regs */ ++ ++int thread_list = 0; ++ ++void ++get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) ++{ ++ unsigned long stack_page; ++ int count = 0; ++ IF_SMP(int i); ++ if (!p || p == current) { ++ regs_to_gdb_regs(gdb_regs, regs); ++ return; ++ } ++#ifdef CONFIG_SMP ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (p == kgdb_info.cpus_waiting[i].task) { ++ regs_to_gdb_regs(gdb_regs, ++ kgdb_info.cpus_waiting[i].regs); ++ gdb_regs[_ESP] = ++ (int) &kgdb_info.cpus_waiting[i].regs->esp; ++ ++ return; ++ } ++ } ++#endif ++ memset(gdb_regs, 0, NUMREGBYTES); ++ gdb_regs[_ESP] = p->thread.esp; ++ gdb_regs[_PC] = p->thread.eip; ++ gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; ++ gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); ++ gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); ++ ++/* ++ * This code is to give a more informative notion of where a process ++ * is waiting. It is used only when the user asks for a thread info ++ * list. If he then switches to the thread, s/he will find the task ++ * is in schedule, but a back trace should show the same info we come ++ * up with. This code was shamelessly purloined from process.c. It was ++ * then enhanced to provide more registers than simply the program ++ * counter. ++ */ ++ ++ if (!thread_list) { ++ return; ++ } ++ ++ if (p->state == TASK_RUNNING) ++ return; ++ stack_page = (unsigned long) p->thread_info; ++ if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > ++ THREAD_SIZE - sizeof(long) + stack_page) ++ return; ++ /* include/asm-i386/system.h:switch_to() pushes ebp last. */ ++ do { ++ if (gdb_regs[_EBP] < stack_page || ++ gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) ++ return; ++ gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); ++ gdb_regs[_ESP] = gdb_regs[_EBP] + 8; ++ gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; ++ if (!in_sched_functions(gdb_regs[_PC])) ++ return; ++ } while (count++ < 16); ++ return; ++} ++ ++/* Indicate to caller of mem2hex or hex2mem that there has been an ++ error. 
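[The loop above is standard frame-pointer unwinding: switch_to() pushes ebp
last, so each frame stores the caller's ebp at *ebp with the return address
just above it at ebp + 4.  One step of the walk, as a sketch with the
stack-page bounds checks stripped out (the real code validates each ebp
against the thread's stack page):

/* one iteration of the ebp-chain walk in get_gdb_regs() */
static void unwind_one_frame(unsigned long *pc, unsigned long *esp,
			     unsigned long *ebp)
{
	*pc  = *(unsigned long *)(*ebp + 4);	/* saved return address */
	*esp = *ebp + 8;			/* stack just above saved ebp + ret */
	*ebp = *(unsigned long *)*ebp;		/* follow to the caller's frame */
}]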
*/ ++static volatile int mem_err = 0; ++static volatile int mem_err_expected = 0; ++static volatile int mem_err_cnt = 0; ++static int garbage_loc = -1; ++ ++int ++get_char(char *addr) ++{ ++ return *addr; ++} ++ ++void ++set_char(char *addr, int val, int may_fault) ++{ ++ /* ++ * This code traps references to the area mapped to the kernel ++ * stack as given by the regs and, instead, stores to the ++ * fn_call_lookaside[cpu].array ++ */ ++ if (may_fault && ++ (unsigned int) addr < OLD_esp && ++ ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { ++ addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); ++ } ++ *addr = val; ++} ++ ++/* convert the memory pointed to by mem into hex, placing result in buf */ ++/* return a pointer to the last char put in buf (null) */ ++/* If MAY_FAULT is non-zero, then we should set mem_err in response to ++ a fault; if zero treat a fault like any other fault in the stub. */ ++char * ++mem2hex(char *mem, char *buf, int count, int may_fault) ++{ ++ int i; ++ unsigned char ch; ++ ++ if (may_fault) { ++ mem_err_expected = 1; ++ mem_err = 0; ++ } ++ for (i = 0; i < count; i++) { ++ /* printk("%lx = ", mem) ; */ ++ ++ ch = get_char(mem++); ++ ++ /* printk("%02x\n", ch & 0xFF) ; */ ++ if (may_fault && mem_err) { ++ if (remote_debug) ++ printk("Mem fault fetching from addr %lx\n", ++ (long) (mem - 1)); ++ *buf = 0; /* truncate buffer */ ++ return (buf); ++ } ++ *buf++ = hexchars[ch >> 4]; ++ *buf++ = hexchars[ch % 16]; ++ } ++ *buf = 0; ++ if (may_fault) ++ mem_err_expected = 0; ++ return (buf); ++} ++ ++/* convert the hex array pointed to by buf into binary to be placed in mem */ ++/* return a pointer to the character AFTER the last byte written */ ++/* NOTE: We use the may fault flag to also indicate if the write is to ++ * the registers (0) or "other" memory (!=0) ++ */ ++char * ++hex2mem(char *buf, char *mem, int count, int may_fault) ++{ ++ int i; ++ unsigned char ch; ++ ++ if (may_fault) { ++ mem_err_expected = 1; ++ mem_err = 0; ++ } ++ for (i = 0; i < count; i++) { ++ ch = hex(*buf++) << 4; ++ ch = ch + hex(*buf++); ++ set_char(mem++, ch, may_fault); ++ ++ if (may_fault && mem_err) { ++ if (remote_debug) ++ printk("Mem fault storing to addr %lx\n", ++ (long) (mem - 1)); ++ return (mem); ++ } ++ } ++ if (may_fault) ++ mem_err_expected = 0; ++ return (mem); ++} ++ ++/**********************************************/ ++/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ ++/* RETURN NUMBER OF CHARS PROCESSED */ ++/**********************************************/ ++int ++hexToInt(char **ptr, int *intValue) ++{ ++ int numChars = 0; ++ int hexValue; ++ ++ *intValue = 0; ++ ++ while (**ptr) { ++ hexValue = hex(**ptr); ++ if (hexValue >= 0) { ++ *intValue = (*intValue << 4) | hexValue; ++ numChars++; ++ } else ++ break; ++ ++ (*ptr)++; ++ } ++ ++ return (numChars); ++} ++ ++#define stubhex(h) hex(h) ++#ifdef old_thread_list ++ ++static int ++stub_unpack_int(char *buff, int fieldlength) ++{ ++ int nibble; ++ int retval = 0; ++ ++ while (fieldlength) { ++ nibble = stubhex(*buff++); ++ retval |= nibble; ++ fieldlength--; ++ if (fieldlength) ++ retval = retval << 4; ++ } ++ return retval; ++} ++#endif ++static char * ++pack_hex_byte(char *pkt, int byte) ++{ ++ *pkt++ = hexchars[(byte >> 4) & 0xf]; ++ *pkt++ = hexchars[(byte & 0xf)]; ++ return pkt; ++} ++ ++#define BUF_THREAD_ID_SIZE 16 ++ ++static char * ++pack_threadid(char *pkt, threadref * id) ++{ ++ char *limit; ++ unsigned char *altid; ++ ++ altid = (unsigned char *) id; ++ limit = pkt + 
BUF_THREAD_ID_SIZE; ++ while (pkt < limit) ++ pkt = pack_hex_byte(pkt, *altid++); ++ return pkt; ++} ++ ++#ifdef old_thread_list ++static char * ++unpack_byte(char *buf, int *value) ++{ ++ *value = stub_unpack_int(buf, 2); ++ return buf + 2; ++} ++ ++static char * ++unpack_threadid(char *inbuf, threadref * id) ++{ ++ char *altref; ++ char *limit = inbuf + BUF_THREAD_ID_SIZE; ++ int x, y; ++ ++ altref = (char *) id; ++ ++ while (inbuf < limit) { ++ x = stubhex(*inbuf++); ++ y = stubhex(*inbuf++); ++ *altref++ = (x << 4) | y; ++ } ++ return inbuf; ++} ++#endif ++void ++int_to_threadref(threadref * id, int value) ++{ ++ unsigned char *scan; ++ ++ scan = (unsigned char *) id; ++ { ++ int i = 4; ++ while (i--) ++ *scan++ = 0; ++ } ++ *scan++ = (value >> 24) & 0xff; ++ *scan++ = (value >> 16) & 0xff; ++ *scan++ = (value >> 8) & 0xff; ++ *scan++ = (value & 0xff); ++} ++int ++int_to_hex_v(unsigned char * id, int value) ++{ ++ unsigned char *start = id; ++ int shift; ++ int ch; ++ ++ for (shift = 28; shift >= 0; shift -= 4) { ++ if ((ch = (value >> shift) & 0xf) || (id != start)) { ++ *id = hexchars[ch]; ++ id++; ++ } ++ } ++ if (id == start) ++ *id++ = '0'; ++ return id - start; ++} ++#ifdef old_thread_list ++ ++static int ++threadref_to_int(threadref * ref) ++{ ++ int i, value = 0; ++ unsigned char *scan; ++ ++ scan = (char *) ref; ++ scan += 4; ++ i = 4; ++ while (i-- > 0) ++ value = (value << 8) | ((*scan++) & 0xff); ++ return value; ++} ++#endif ++static int ++cmp_str(char *s1, char *s2, int count) ++{ ++ while (count--) { ++ if (*s1++ != *s2++) ++ return 0; ++ } ++ return 1; ++} ++ ++#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ ++extern struct task_struct *kgdb_get_idle(int cpu); ++#define idle_task(cpu) kgdb_get_idle(cpu) ++#else ++#define idle_task(cpu) init_tasks[cpu] ++#endif ++ ++extern int kgdb_pid_init_done; ++ ++struct task_struct * ++getthread(int pid) ++{ ++ struct task_struct *thread; ++ if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { ++ ++ return idle_task(pid - PID_MAX); ++ } else { ++ /* ++ * find_task_by_pid is relatively safe all the time ++ * Other pid functions require lock downs which imply ++ * that we may be interrupting them (as we get here ++ * in the middle of most any lock down). ++ * Still we don't want to call until the table exists! 
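[int_to_threadref() and pack_threadid() above together turn a pid into the
fixed-width thread id gdb wants on the wire: an 8-byte value, high four bytes
zero, low four bytes the pid in big-endian order, hex-encoded to 16
characters.  A standalone sketch of that encoding with an example pid:

#include <stdio.h>

int main(void)
{
	unsigned char ref[8] = { 0 };	/* threadref, as above */
	int pid = 42;			/* example pid */
	int i;

	ref[4] = (pid >> 24) & 0xff;	/* int_to_threadref() layout */
	ref[5] = (pid >> 16) & 0xff;
	ref[6] = (pid >> 8) & 0xff;
	ref[7] = pid & 0xff;

	for (i = 0; i < 8; i++)		/* pack_threadid(): two hex chars per byte */
		printf("%02x", ref[i]);
	printf("\n");			/* pid 42 -> 000000000000002a */
	return 0;
}]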
++	 */
++	if (kgdb_pid_init_done){
++		thread = find_task_by_pid(pid);
++		if (thread) {
++			return thread;
++		}
++	}
++	}
++	return NULL;
++}
++/* *INDENT-OFF* */
++struct hw_breakpoint {
++	unsigned enabled;
++	unsigned type;
++	unsigned len;
++	unsigned addr;
++} breakinfo[4] = { {enabled:0},
++		   {enabled:0},
++		   {enabled:0},
++		   {enabled:0}};
++/* *INDENT-ON* */
++unsigned hw_breakpoint_status;
++void
++correct_hw_break(void)
++{
++	int breakno;
++	int correctit;
++	int breakbit;
++	unsigned dr7;
++
++	asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++		      :);
++	/* *INDENT-OFF* */
++	do {
++		unsigned addr0, addr1, addr2, addr3;
++		asm volatile ("movl %%db0, %0\n"
++			      "movl %%db1, %1\n"
++			      "movl %%db2, %2\n"
++			      "movl %%db3, %3\n"
++			      :"=r" (addr0), "=r"(addr1),
++			      "=r"(addr2), "=r"(addr3)
++			      :);
++	} while (0);
++	/* *INDENT-ON* */
++	correctit = 0;
++	/* all four debug registers */
++	for (breakno = 0; breakno < 4; breakno++) {
++		breakbit = 2 << (breakno << 1);
++		if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 |= breakbit;
++			dr7 &= ~(0xf0000 << (breakno << 2));
++			dr7 |= (((breakinfo[breakno].len << 2) |
++				 breakinfo[breakno].type) << 16) <<
++				(breakno << 2);
++			switch (breakno) {
++			case 0:
++				asm volatile ("movl %0, %%dr0\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 1:
++				asm volatile ("movl %0, %%dr1\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 2:
++				asm volatile ("movl %0, %%dr2\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 3:
++				asm volatile ("movl %0, %%dr3\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++			}
++		} else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 &= ~breakbit;
++			dr7 &= ~(0xf0000 << (breakno << 2));
++		}
++	}
++	if (correctit) {
++		asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++	}
++}
++
++int
++remove_hw_break(unsigned breakno)
++{
++	if (!breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 0;
++	return 0;
++}
++
++int
++set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr)
++{
++	if (breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 1;
++	breakinfo[breakno].type = type;
++	breakinfo[breakno].len = len;
++	breakinfo[breakno].addr = addr;
++	return 0;
++}
++
++#ifdef CONFIG_SMP
++static int in_kgdb_console = 0;
++
++int
++in_kgdb(struct pt_regs *regs)
++{
++	unsigned flags;
++	int cpu = smp_processor_id();
++	in_kgdb_called = 1;
++	if (!spin_is_locked(&kgdb_spinlock)) {
++		if (in_kgdb_here_log[cpu] ||	/* we are holding this cpu */
++		    in_kgdb_console) {	/* or we are doing slow i/o */
++			return 1;
++		}
++		return 0;
++	}
++
++	/* As I see it the only reason not to let all cpus spin on
++	 * the same spin_lock is to allow selected ones to proceed.
++	 * This would be a good thing, so we leave it this way.
++	 * Maybe someday....  Done!
++
++	 * in_kgdb() is called from an NMI so we don't pretend
++	 * to have any resources, like printk() for example.
++	 */
++
++	kgdb_local_irq_save(flags);	/* only local here, to avoid hanging */
++	/*
++	 * log arrival of this cpu
++	 * The NMI keeps on ticking.  Protect against recurring more
++	 * than once, and ignore the cpu that has the kgdb lock
++	 */
++	in_kgdb_entry_log[cpu]++;
++	in_kgdb_here_log[cpu] = regs;
++	if (cpu == spinlock_cpu || waiting_cpus[cpu].task) {
++		goto exit_in_kgdb;
++	}
++	/*
++	 * For protection of the initialization of the spin locks by kgdb
++	 * it locks the kgdb spinlock before it gets the wait locks set
++	 * up.  We wait here for the wait lock to be taken.
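++	 * (The lock owner trylocks every waitlock right after taking
++	 * kgdb_spinlock, so a held waitlock means entry setup is done.)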
If the ++ * kgdb lock goes away first?? Well, it could be a slow exit ++ * sequence where the wait lock is removed prior to the kgdb lock ++ * so if kgdb gets unlocked, we just exit. ++ */ ++ while (spin_is_locked(&kgdb_spinlock) && ++ !spin_is_locked(waitlocks + cpu)) ; ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ goto exit_in_kgdb; ++ } ++ waiting_cpus[cpu].task = current; ++ waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); ++ waiting_cpus[cpu].regs = regs; ++ ++ spin_unlock_wait(waitlocks + cpu); ++ /* ++ * log departure of this cpu ++ */ ++ waiting_cpus[cpu].task = 0; ++ waiting_cpus[cpu].pid = 0; ++ waiting_cpus[cpu].regs = 0; ++ correct_hw_break(); ++ exit_in_kgdb: ++ in_kgdb_here_log[cpu] = 0; ++ kgdb_local_irq_restore(flags); ++ return 1; ++ /* ++ spin_unlock(continuelocks + smp_processor_id()); ++ */ ++} ++ ++void ++smp__in_kgdb(struct pt_regs regs) ++{ ++ ack_APIC_irq(); ++ in_kgdb(®s); ++} ++#else ++int ++in_kgdb(struct pt_regs *regs) ++{ ++ return (kgdb_spinlock); ++} ++#endif ++ ++void ++printexceptioninfo(int exceptionNo, int errorcode, char *buffer) ++{ ++ unsigned dr6; ++ int i; ++ switch (exceptionNo) { ++ case 1: /* debug exception */ ++ break; ++ case 3: /* breakpoint */ ++ sprintf(buffer, "Software breakpoint"); ++ return; ++ default: ++ sprintf(buffer, "Details not available"); ++ return; ++ } ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6) ++ :); ++ if (dr6 & 0x4000) { ++ sprintf(buffer, "Single step"); ++ return; ++ } ++ for (i = 0; i < 4; ++i) { ++ if (dr6 & (1 << i)) { ++ sprintf(buffer, "Hardware breakpoint %d", i); ++ return; ++ } ++ } ++ sprintf(buffer, "Unknown trap"); ++ return; ++} ++ ++/* ++ * This function does all command procesing for interfacing to gdb. ++ * ++ * NOTE: The INT nn instruction leaves the state of the interrupt ++ * enable flag UNCHANGED. That means that when this routine ++ * is entered via a breakpoint (INT 3) instruction from code ++ * that has interrupts enabled, then interrupts will STILL BE ++ * enabled when this routine is entered. The first thing that ++ * we do here is disable interrupts so as to prevent recursive ++ * entries and bothersome serial interrupts while we are ++ * trying to run the serial port in polled mode. ++ * ++ * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so ++ * it is always necessary to do a restore_flags before returning ++ * so as to let go of that lock. ++ */ ++int ++kgdb_handle_exception(int exceptionVector, ++ int signo, int err_code, struct pt_regs *linux_regs) ++{ ++ struct task_struct *usethread = NULL; ++ struct task_struct *thread_list_start = 0, *thread = NULL; ++ int addr, length; ++ int breakno, breaktype; ++ char *ptr; ++ int newPC; ++ threadref thref; ++ int threadid; ++ int thread_min = PID_MAX + MAX_NO_CPUS; ++#ifdef old_thread_list ++ int maxthreads; ++#endif ++ int nothreads; ++ unsigned long flags; ++ int gdb_regs[NUMREGBYTES / 4]; ++ int dr6; ++ IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ ++#define NO_NMI 1 ++#define NO_SYNC 2 ++#define regs (*linux_regs) ++#define NUMREGS NUMREGBYTES/4 ++ /* ++ * If the entry is not from the kernel then return to the Linux ++ * trap handler and let it process the interrupt normally. 
++ */ ++ if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { ++ printk("ignoring non-kernel exception\n"); ++ print_regs(®s); ++ return (0); ++ } ++ ++ kgdb_local_irq_save(flags); ++ ++ /* Get kgdb spinlock */ ++ ++ KGDB_SPIN_LOCK(&kgdb_spinlock); ++ rdtscll(kgdb_info.entry_tsc); ++ /* ++ * We depend on this spinlock and the NMI watch dog to control the ++ * other cpus. They will arrive at "in_kgdb()" as a result of the ++ * NMI and will wait there for the following spin locks to be ++ * released. ++ */ ++#ifdef CONFIG_SMP ++ ++#if 0 ++ if (cpu_callout_map & ~MAX_CPU_MASK) { ++ printk("kgdb : too many cpus, possibly not mapped" ++ " in contiguous space, change MAX_NO_CPUS" ++ " in kgdb_stub and make new kernel.\n" ++ " cpu_callout_map is %lx\n", cpu_callout_map); ++ goto exit_just_unlock; ++ } ++#endif ++ if (spinlock_count == 1) { ++ int time = 0, end_time, dum = 0; ++ int i; ++ int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) ++ }; ++ if (remote_debug) { ++ printk("kgdb : cpu %d entry, syncing others\n", ++ smp_processor_id()); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ /* ++ * Use trylock as we may already hold the lock if ++ * we are holding the cpu. Net result is all ++ * locked. ++ */ ++ spin_trylock(&waitlocks[i]); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) ++ cpu_logged_in[i] = 0; ++ /* ++ * Wait for their arrival. We know the watch dog is active if ++ * in_kgdb() has ever been called, as it is always called on a ++ * watchdog tick. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; /* Note: we use the High order bits! */ ++ i = 1; ++ if (num_online_cpus() > 1) { ++ int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; ++ smp_send_nmi_allbutself(); ++ while (i < num_online_cpus() && time != end_time) { ++ int j; ++ for (j = 0; j < MAX_NO_CPUS; j++) { ++ if (waiting_cpus[j].task && ++ !cpu_logged_in[j]) { ++ i++; ++ cpu_logged_in[j] = 1; ++ if (remote_debug) { ++ printk ++ ("kgdb : cpu %d arrived at kgdb\n", ++ j); ++ } ++ break; ++ } else if (!waiting_cpus[j].task && ++ !cpu_online(j)) { ++ waiting_cpus[j].task = NOCPU; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].hold = 1; ++ break; ++ } ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ ++ int wait = 100000; ++ while (wait--) ; ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ printk ++ ("kgdb : cpu %d stall" ++ " in in_kgdb\n", ++ j); ++ i++; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].task = ++ (struct task_struct ++ *) 1; ++ } ++ } ++ } ++ ++ if (in_kgdb_entry_log[smp_processor_id()] > ++ (me_in_kgdb + 10)) { ++ break; ++ } ++ ++ rdtsc(dum, time); ++ } ++ if (i < num_online_cpus()) { ++ printk ++ ("kgdb : time out, proceeding without sync\n"); ++#if 0 ++ printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", ++ waiting_cpus[0].task != 0, ++ waiting_cpus[1].task != 0); ++ printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", ++ cpu_logged_in[0], cpu_logged_in[1]); ++ printk ++ ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", ++ in_kgdb_here_log[0] != 0, ++ in_kgdb_here_log[1] != 0); ++#endif ++ entry_state = NO_SYNC; ++ } else { ++#if 0 ++ int ent = ++ in_kgdb_entry_log[smp_processor_id()] - ++ me_in_kgdb; ++ printk("kgdb : sync after %d entries\n", ent); ++#endif ++ } ++ } else { ++ if (remote_debug) { ++ printk ++ ("kgdb : %d cpus, but watchdog not active\n" ++ "proceeding without locking down other cpus\n", ++ num_online_cpus()); ++ entry_state = NO_NMI; ++ } ++ } ++ } ++#endif ++ ++ if (remote_debug) { ++ unsigned long *lp = (unsigned long *) &linux_regs; ++ ++ 
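++		/* lp points at the stack slot holding the linux_regs
++		 * argument, so the Stk: lines below dump the raw kernel
++		 * stack at the trap, not the pt_regs contents. */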
printk("handle_exception(exceptionVector=%d, " ++ "signo=%d, err_code=%d, linux_regs=%p)\n", ++ exceptionVector, signo, err_code, linux_regs); ++ if (debug_regs) { ++ print_regs(®s); ++ printk("Stk: %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[0], lp[1], lp[2], lp[3], ++ lp[4], lp[5], lp[6], lp[7]); ++ printk(" %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[8], lp[9], lp[10], lp[11], ++ lp[12], lp[13], lp[14], lp[15]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[16], lp[17], lp[18], lp[19], ++ lp[20], lp[21], lp[22], lp[23]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[24], lp[25], lp[26], lp[27], ++ lp[28], lp[29], lp[30], lp[31]); ++ } ++ } ++ ++ /* Disable hardware debugging while we are in kgdb */ ++ /* Get the debug register status register */ ++/* *INDENT-OFF* */ ++ __asm__("movl %0,%%db7" ++ : /* no output */ ++ :"r"(0)); ++ ++ asm volatile ("movl %%db6, %0\n" ++ :"=r" (hw_breakpoint_status) ++ :); ++ ++/* *INDENT-ON* */ ++ switch (exceptionVector) { ++ case 0: /* divide error */ ++ case 1: /* debug exception */ ++ case 2: /* NMI */ ++ case 3: /* breakpoint */ ++ case 4: /* overflow */ ++ case 5: /* bounds check */ ++ case 6: /* invalid opcode */ ++ case 7: /* device not available */ ++ case 8: /* double fault (errcode) */ ++ case 10: /* invalid TSS (errcode) */ ++ case 12: /* stack fault (errcode) */ ++ case 16: /* floating point error */ ++ case 17: /* alignment check (errcode) */ ++ default: /* any undocumented */ ++ break; ++ case 11: /* segment not present (errcode) */ ++ case 13: /* general protection (errcode) */ ++ case 14: /* page fault (special errcode) */ ++ case 19: /* cache flush denied */ ++ if (mem_err_expected) { ++ /* ++ * This fault occured because of the ++ * get_char or set_char routines. These ++ * two routines use either eax of edx to ++ * indirectly reference the location in ++ * memory that they are working with. ++ * For a page fault, when we return the ++ * instruction will be retried, so we ++ * have to make sure that these ++ * registers point to valid memory. ++ */ ++ mem_err = 1; /* set mem error flag */ ++ mem_err_expected = 0; ++ mem_err_cnt++; /* helps in debugging */ ++ /* make valid address */ ++ regs.eax = (long) &garbage_loc; ++ /* make valid address */ ++ regs.edx = (long) &garbage_loc; ++ if (remote_debug) ++ printk("Return after memory error: " ++ "mem_err_cnt=%d\n", mem_err_cnt); ++ if (debug_regs) ++ print_regs(®s); ++ goto exit_kgdb; ++ } ++ break; ++ } ++ if (remote_debug) ++ printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); ++ ++ gdb_i386vector = exceptionVector; ++ gdb_i386errcode = err_code; ++ kgdb_info.called_from = __builtin_return_address(0); ++#ifdef CONFIG_SMP ++ /* ++ * OK, we can now communicate, lets tell gdb about the sync. ++ * but only if we had a problem. ++ */ ++ switch (entry_state) { ++ case NO_NMI: ++ to_gdb("NMI not active, other cpus not stopped\n"); ++ break; ++ case NO_SYNC: ++ to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); ++ default:; ++ } ++ ++#endif ++/* ++ * Set up the gdb function call area. 
++ */ ++ trap_cpu = smp_processor_id(); ++ OLD_esp = NEW_esp = (int) (&linux_regs->esp); ++ ++ IF_SMP(once_again:) ++ /* reply to host that an exception has occurred */ ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ ++ putpacket(remcomOutBuffer); ++ ++ while (1 == 1) { ++ error = 0; ++ remcomOutBuffer[0] = 0; ++ getpacket(remcomInBuffer); ++ switch (remcomInBuffer[0]) { ++ case '?': ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ break; ++ case 'd': ++ remote_debug = !(remote_debug); /* toggle debug flag */ ++ printk("Remote debug %s\n", ++ remote_debug ? "on" : "off"); ++ break; ++ case 'g': /* return the value of the CPU registers */ ++ get_gdb_regs(usethread, ®s, gdb_regs); ++ mem2hex((char *) gdb_regs, ++ remcomOutBuffer, NUMREGBYTES, 0); ++ break; ++ case 'G': /* set the value of the CPU registers - return OK */ ++ hex2mem(&remcomInBuffer[1], ++ (char *) gdb_regs, NUMREGBYTES, 0); ++ if (!usethread || usethread == current) { ++ gdb_regs_to_regs(gdb_regs, ®s); ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "E00"); ++ } ++ break; ++ ++ case 'P':{ /* set the value of a single CPU register - ++ return OK */ ++ /* ++ * For some reason, gdb wants to talk about psudo ++ * registers (greater than 15). These may have ++ * meaning for ptrace, but for us it is safe to ++ * ignor them. We do this by dumping them into ++ * _GS which we also ignor, but do have memory for. ++ */ ++ int regno; ++ ++ ptr = &remcomInBuffer[1]; ++ regs_to_gdb_regs(gdb_regs, ®s); ++ if ((!usethread || usethread == current) && ++ hexToInt(&ptr, ®no) && ++ *ptr++ == '=' && (regno >= 0)) { ++ regno = ++ (regno >= NUMREGS ? _GS : regno); ++ hex2mem(ptr, (char *) &gdb_regs[regno], ++ 4, 0); ++ gdb_regs_to_regs(gdb_regs, ®s); ++ strcpy(remcomOutBuffer, "OK"); ++ break; ++ } ++ strcpy(remcomOutBuffer, "E01"); ++ break; ++ } ++ ++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ ++ case 'm': ++ /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { ++ ptr = 0; ++ /* ++ * hex doubles the byte count ++ */ ++ if (length > (BUFMAX / 2)) ++ length = BUFMAX / 2; ++ mem2hex((char *) addr, ++ remcomOutBuffer, length, 1); ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } ++ } ++ ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E01"); ++ debug_error ++ ("malformed read memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ ++ /* MAA..AA,LLLL: ++ Write LLLL bytes at address AA.AA return OK */ ++ case 'M': ++ /* TRY TO READ '%x,%x:'. 
IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && ++ (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { ++ hex2mem(ptr, (char *) addr, length, 1); ++ ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } else { ++ strcpy(remcomOutBuffer, "OK"); ++ } ++ ++ ptr = 0; ++ } ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E02"); ++ debug_error ++ ("malformed write memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ case 'S': ++ remcomInBuffer[0] = 's'; ++ case 'C': ++ /* Csig;AA..AA where ;AA..AA is optional ++ * continue with signal ++ * Since signals are meaning less to us, delete that ++ * part and then fall into the 'c' code. ++ */ ++ ptr = &remcomInBuffer[1]; ++ length = 2; ++ while (*ptr && *ptr != ';') { ++ length++; ++ ptr++; ++ } ++ if (*ptr) { ++ do { ++ ptr++; ++ *(ptr - length++) = *ptr; ++ } while (*ptr); ++ } else { ++ remcomInBuffer[1] = 0; ++ } ++ ++ /* cAA..AA Continue at address AA..AA(optional) */ ++ /* sAA..AA Step one instruction from AA..AA(optional) */ ++ /* D detach, reply OK and then continue */ ++ case 'c': ++ case 's': ++ case 'D': ++ ++ /* try to read optional parameter, ++ pc unchanged if no parm */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr)) { ++ if (remote_debug) ++ printk("Changing EIP to 0x%x\n", addr); ++ ++ regs.eip = addr; ++ } ++ ++ newPC = regs.eip; ++ ++ /* clear the trace bit */ ++ regs.eflags &= 0xfffffeff; ++ ++ /* set the trace bit if we're stepping */ ++ if (remcomInBuffer[0] == 's') ++ regs.eflags |= 0x100; ++ ++ /* detach is a friendly version of continue. Note that ++ debugging is still enabled (e.g hit control C) ++ */ ++ if (remcomInBuffer[0] == 'D') { ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ } ++ ++ if (remote_debug) { ++ printk("Resuming execution\n"); ++ print_regs(®s); ++ } ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6) ++ :); ++ if (!(dr6 & 0x4000)) { ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno) && ++ (breakinfo[breakno].type == 0)) { ++ /* Set restore flag */ ++ regs.eflags |= 0x10000; ++ break; ++ } ++ } ++ } ++ correct_hw_break(); ++ asm volatile ("movl %0, %%db6\n"::"r" (0)); ++ goto exit_kgdb; ++ ++ /* kill the program */ ++ case 'k': /* do nothing */ ++ break; ++ ++ /* query */ ++ case 'q': ++ nothreads = 0; ++ switch (remcomInBuffer[1]) { ++ case 'f': ++ threadid = 1; ++ thread_list = 2; ++ thread_list_start = (usethread ? : current); ++ case 's': ++ if (!cmp_str(&remcomInBuffer[2], ++ "ThreadInfo", 10)) ++ break; ++ ++ remcomOutBuffer[nothreads++] = 'm'; ++ for (; threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ nothreads += int_to_hex_v( ++ &remcomOutBuffer[ ++ nothreads], ++ threadid); ++ if (thread_min > threadid) ++ thread_min = threadid; ++ remcomOutBuffer[ ++ nothreads] = ','; ++ nothreads++; ++ if (nothreads > BUFMAX - 10) ++ break; ++ } ++ } ++ if (remcomOutBuffer[nothreads - 1] == 'm') { ++ remcomOutBuffer[nothreads - 1] = 'l'; ++ } else { ++ nothreads--; ++ } ++ remcomOutBuffer[nothreads] = 0; ++ break; ++ ++#ifdef old_thread_list /* Old thread info request */ ++ case 'L': ++ /* List threads */ ++ thread_list = 2; ++ thread_list_start = (usethread ? 
: current); ++ unpack_byte(remcomInBuffer + 3, &maxthreads); ++ unpack_threadid(remcomInBuffer + 5, &thref); ++ do { ++ int buf_thread_limit = ++ (BUFMAX - 22) / BUF_THREAD_ID_SIZE; ++ if (maxthreads > buf_thread_limit) { ++ maxthreads = buf_thread_limit; ++ } ++ } while (0); ++ remcomOutBuffer[0] = 'q'; ++ remcomOutBuffer[1] = 'M'; ++ remcomOutBuffer[4] = '0'; ++ pack_threadid(remcomOutBuffer + 5, &thref); ++ ++ threadid = threadref_to_int(&thref); ++ for (nothreads = 0; ++ nothreads < maxthreads && ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ int_to_threadref(&thref, ++ threadid); ++ pack_threadid(remcomOutBuffer + ++ 21 + ++ nothreads * 16, ++ &thref); ++ nothreads++; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } ++ } ++ ++ if (threadid == PID_MAX + MAX_NO_CPUS) { ++ remcomOutBuffer[4] = '1'; ++ } ++ pack_hex_byte(remcomOutBuffer + 2, nothreads); ++ remcomOutBuffer[21 + nothreads * 16] = '\0'; ++ break; ++#endif ++ case 'C': ++ /* Current thread id */ ++ remcomOutBuffer[0] = 'Q'; ++ remcomOutBuffer[1] = 'C'; ++ threadid = current->pid; ++ if (!threadid) { ++ /* ++ * idle thread ++ */ ++ for (threadid = PID_MAX; ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ if (current == ++ idle_task(threadid - ++ PID_MAX)) ++ break; ++ } ++ } ++ int_to_threadref(&thref, threadid); ++ pack_threadid(remcomOutBuffer + 2, &thref); ++ remcomOutBuffer[18] = '\0'; ++ break; ++ ++ case 'E': ++ /* Print exception info */ ++ printexceptioninfo(exceptionVector, ++ err_code, remcomOutBuffer); ++ break; ++ case 'T':{ ++ char * nptr; ++ /* Thread extra info */ ++ if (!cmp_str(&remcomInBuffer[2], ++ "hreadExtraInfo,", 15)) { ++ break; ++ } ++ ptr = &remcomInBuffer[17]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ nptr = &thread->comm[0]; ++ length = 0; ++ ptr = &remcomOutBuffer[0]; ++ do { ++ length++; ++ ptr = pack_hex_byte(ptr, *nptr++); ++ } while (*nptr && length < 16); ++ /* ++ * would like that 16 to be the size of ++ * task_struct.comm but don't know the ++ * syntax.. ++ */ ++ *ptr = 0; ++ } ++ } ++ break; ++ ++ /* task related */ ++ case 'H': ++ switch (remcomInBuffer[1]) { ++ case 'g': ++ ptr = &remcomInBuffer[2]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (!thread) { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ break; ++ } ++ /* ++ * Just in case I forget what this is all about, ++ * the "thread info" command to gdb causes it ++ * to ask for a thread list. It then switches ++ * to each thread and asks for the registers. ++ * For this (and only this) usage, we want to ++ * fudge the registers of tasks not on the run ++ * list (i.e. waiting) to show the routine that ++ * called schedule. Also, gdb, is a minimalist ++ * in that if the current thread is the last ++ * it will not re-read the info when done. ++ * This means that in this case we must show ++ * the real registers. So here is how we do it: ++ * Each entry we keep track of the min ++ * thread in the list (the last that gdb will) ++ * get info for. We also keep track of the ++ * starting thread. ++ * "thread_list" is cleared when switching back ++ * to the min thread if it is was current, or ++ * if it was not current, thread_list is set ++ * to 1. When the switch to current comes, ++ * if thread_list is 1, clear it, else do ++ * nothing. 
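++			 * Net effect: every thread gdb reads except the
++			 * last shows its scheduled-out context, and the
++			 * last one (normally "current") reports the live
++			 * trap registers.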
++ */ ++ usethread = thread; ++ if ((thread_list == 1) && ++ (thread == thread_list_start)) { ++ thread_list = 0; ++ } ++ if (thread_list && (threadid == thread_min)) { ++ if (thread == thread_list_start) { ++ thread_list = 0; ++ } else { ++ thread_list = 1; ++ } ++ } ++ /* follow through */ ++ case 'c': ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ break; ++ } ++ break; ++ ++ /* Query thread status */ ++ case 'T': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (thread) { ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } else { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ } ++ break; ++ ++ case 'Y': /* set up a hardware breakpoint */ ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ ptr++; ++ hexToInt(&ptr, &breaktype); ++ ptr++; ++ hexToInt(&ptr, &length); ++ ptr++; ++ hexToInt(&ptr, &addr); ++ if (set_hw_break(breakno & 0x3, ++ breaktype & 0x3, ++ length & 0x3, addr) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ /* Remove hardware breakpoint */ ++ case 'y': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ if (remove_hw_break(breakno & 0x3) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ case 'r': /* reboot */ ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ /*to_gdb("Rebooting\n"); */ ++ /* triplefault no return from here */ ++ { ++ static long no_idt[2]; ++ __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); ++ BREAKPOINT; ++ } ++ ++ } /* switch */ ++ ++ /* reply to the request */ ++ putpacket(remcomOutBuffer); ++ } /* while(1==1) */ ++ /* ++ * reached by goto only. ++ */ ++ exit_kgdb: ++ /* ++ * Here is where we set up to trap a gdb function call. NEW_esp ++ * will be changed if we are trying to do this. We handle both ++ * adding and subtracting, thus allowing gdb to put grung on ++ * the stack which it removes later. ++ */ ++ if (NEW_esp != OLD_esp) { ++ int *ptr = END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) ++ ptr -= (OLD_esp - NEW_esp) / sizeof (int); ++ *--ptr = linux_regs->eflags; ++ *--ptr = linux_regs->xcs; ++ *--ptr = linux_regs->eip; ++ *--ptr = linux_regs->ecx; ++ *--ptr = linux_regs->ebx; ++ *--ptr = linux_regs->eax; ++ linux_regs->ecx = NEW_esp - (sizeof (int) * 6); ++ linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) { ++ linux_regs->eip = (unsigned int) fn_call_stub; ++ } else { ++ linux_regs->eip = (unsigned int) fn_rtn_stub; ++ linux_regs->eax = NEW_esp; ++ } ++ linux_regs->eflags &= ~(IF_BIT | TF_BIT); ++ } ++#ifdef CONFIG_SMP ++ /* ++ * Release gdb wait locks ++ * Sanity check time. Must have at least one cpu to run. Also single ++ * step must not be done if the current cpu is on hold. ++ */ ++ if (spinlock_count == 1) { ++ int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; ++ int cpu_avail = 0; ++ int i; ++ ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (!cpu_online(i)) ++ break; ++ if (!hold_cpu(i)) { ++ cpu_avail = 1; ++ } ++ } ++ /* ++ * Early in the bring up there will be NO cpus on line... 
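++		 * (hence the cpus_empty() test below; with nothing online
++		 * yet there is nobody to unblock and no reason to complain)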
++		 */
++		if (!cpu_avail && !cpus_empty(cpu_online_map)) {
++			to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n");
++			goto once_again;
++		}
++		if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) {
++			to_gdb
++			    ("Current cpu must be unblocked to single step\n");
++			goto once_again;
++		}
++		if (!(ss_hold)) {
++			int i;
++			for (i = 0; i < MAX_NO_CPUS; i++) {
++				if (!hold_cpu(i)) {
++					spin_unlock(&waitlocks[i]);
++				}
++			}
++		} else {
++			spin_unlock(&waitlocks[smp_processor_id()]);
++		}
++		/* Release kgdb spinlock */
++		KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++		/*
++		 * If this cpu is on hold, this is where we
++		 * do it.  Note, the NMI will pull us out of here,
++		 * but will return as the above lock is not held.
++		 * We will stay here till another cpu releases the lock for us.
++		 */
++		spin_unlock_wait(waitlocks + smp_processor_id());
++		kgdb_local_irq_restore(flags);
++		return (0);
++	}
++#if 0
++exit_just_unlock:
++#endif
++#endif
++	/* Release kgdb spinlock */
++	KGDB_SPIN_UNLOCK(&kgdb_spinlock);
++	kgdb_local_irq_restore(flags);
++	return (0);
++}
++
++/* This function is used to set up exception handlers for tracing and
++ * breakpoints.
++ * It is not strictly needed: the static initializer of linux_debug_hook
++ * at the end of this file already does everything required.  We keep it
++ * for backward compatibility...
++ */
++void
++set_debug_traps(void)
++{
++	/*
++	 * linux_debug_hook is defined in traps.c.  We store a pointer
++	 * to our own exception handler into it.
++
++	 * But really folks, ever hear of labeled common, an old Fortran
++	 * concept?  Lots of folks can reference it and it is defined if
++	 * anyone does.  Only one can initialize it at link time.  We do
++	 * this with the hook.  See the statement above.  No need for any
++	 * executable code and it is ready as soon as the kernel is
++	 * loaded.  Very desirable in kernel debugging.
++
++	 linux_debug_hook = handle_exception ;
++	 */
++
++	/* In case GDB is started before us, ack any packets (presumably
++	   "$?#xx") sitting there.
++	 putDebugChar ('+');
++
++	 initialized = 1;
++	 */
++}
++
++/* This function will generate a breakpoint exception.  It is used at the
++   beginning of a program to sync up with a debugger and can be used
++   otherwise as a quick means to stop program execution and "break" into
++   the debugger. */
++/* But really, just use the BREAKPOINT macro.  We will handle the int stuff
++ */
++
++#ifdef later
++/*
++ * possibly we should not go thru the traps.c code at all?  Someday.
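++ * (that would mean hooking the IDT entries directly instead of
++ * riding the traps.c handlers.)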
++ */ ++void ++do_kgdb_int3(struct pt_regs *regs, long error_code) ++{ ++ kgdb_handle_exception(3, 5, error_code, regs); ++ return; ++} ++#endif ++#undef regs ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++asmlinkage void ++bad_sys_call_exit(int stuff) ++{ ++ struct pt_regs *regs = (struct pt_regs *) &stuff; ++ printk("Sys call %d return with %x preempt_count\n", ++ (int) regs->orig_eax, preempt_count()); ++} ++#endif ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#include ++asmlinkage void ++stack_overflow(void) ++{ ++#ifdef BREAKPOINT ++ BREAKPOINT; ++#else ++ printk("Kernel stack overflow, looping forever\n"); ++#endif ++ while (1) { ++ } ++} ++#endif ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) ++char gdbconbuf[BUFMAX]; ++ ++static void ++kgdb_gdb_message(const char *s, unsigned count) ++{ ++ int i; ++ int wcount; ++ char *bufptr; ++ /* ++ * This takes care of NMI while spining out chars to gdb ++ */ ++ IF_SMP(in_kgdb_console = 1); ++ gdbconbuf[0] = 'O'; ++ bufptr = gdbconbuf + 1; ++ while (count > 0) { ++ if ((count << 1) > (BUFMAX - 2)) { ++ wcount = (BUFMAX - 2) >> 1; ++ } else { ++ wcount = count; ++ } ++ count -= wcount; ++ for (i = 0; i < wcount; i++) { ++ bufptr = pack_hex_byte(bufptr, s[i]); ++ } ++ *bufptr = '\0'; ++ s += wcount; ++ ++ putpacket(gdbconbuf); ++ ++ } ++ IF_SMP(in_kgdb_console = 0); ++} ++#endif ++#ifdef CONFIG_SMP ++static void ++to_gdb(const char *s) ++{ ++ int count = 0; ++ while (s[count] && (count++ < BUFMAX)) ; ++ kgdb_gdb_message(s, count); ++} ++#endif ++#ifdef CONFIG_KGDB_CONSOLE ++#include ++#include ++#include ++#include ++#include ++ ++void ++kgdb_console_write(struct console *co, const char *s, unsigned count) ++{ ++ ++ if (gdb_i386vector == -1) { ++ /* ++ * We have not yet talked to gdb. What to do... ++ * lets break, on continue we can do the write. ++ * But first tell him whats up. Uh, well no can do, ++ * as this IS the console. Oh well... ++ * We do need to wait or the messages will be lost. ++ * Other option would be to tell the above code to ++ * ignore this breakpoint and do an auto return, ++ * but that might confuse gdb. Also this happens ++ * early enough in boot up that we don't have the traps ++ * set up yet, so... ++ */ ++ breakpoint(); ++ } ++ kgdb_gdb_message(s, count); ++} ++ ++/* ++ * ------------------------------------------------------------ ++ * Serial KGDB driver ++ * ------------------------------------------------------------ ++ */ ++ ++static struct console kgdbcons = { ++ name:"kgdb", ++ write:kgdb_console_write, ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ device:kgdb_console_device, ++#endif ++ flags:CON_PRINTBUFFER | CON_ENABLED, ++ index:-1, ++}; ++ ++/* ++ * The trick here is that this file gets linked before printk.o ++ * That means we get to peer at the console info in the command ++ * line before it does. If we are up, we register, otherwise, ++ * do nothing. By returning 0, we allow printk to look also. ++ */ ++static int kgdb_console_enabled; ++ ++int __init ++kgdb_console_init(char *str) ++{ ++ if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { ++ register_console(&kgdbcons); ++ kgdb_console_enabled = 1; ++ } ++ return 0; /* let others look at the string */ ++} ++ ++__setup("console=", kgdb_console_init); ++ ++#ifdef CONFIG_KGDB_USER_CONSOLE ++static kdev_t kgdb_console_device(struct console *c); ++/* This stuff sort of works, but it knocks out telnet devices ++ * we are leaving it here in case we (or you) find time to figure it out ++ * better.. 
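++ * (kgdb_console_finit() below unregisters whatever chrdev owns
++ * TTYAUX_MAJOR and takes it over for kgdb, which is what knocks out
++ * the other users of that major.)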
++ */ ++ ++/* ++ * We need a real char device as well for when the console is opened for user ++ * space activities. ++ */ ++ ++static int ++kgdb_consdev_open(struct inode *inode, struct file *file) ++{ ++ return 0; ++} ++ ++static ssize_t ++kgdb_consdev_write(struct file *file, const char *buf, ++ size_t count, loff_t * ppos) ++{ ++ int size, ret = 0; ++ static char kbuf[128]; ++ static DECLARE_MUTEX(sem); ++ ++ /* We are not reentrant... */ ++ if (down_interruptible(&sem)) ++ return -ERESTARTSYS; ++ ++ while (count > 0) { ++ /* need to copy the data from user space */ ++ size = count; ++ if (size > sizeof (kbuf)) ++ size = sizeof (kbuf); ++ if (copy_from_user(kbuf, buf, size)) { ++ ret = -EFAULT; ++ break;; ++ } ++ kgdb_console_write(&kgdbcons, kbuf, size); ++ count -= size; ++ ret += size; ++ buf += size; ++ } ++ ++ up(&sem); ++ ++ return ret; ++} ++ ++struct file_operations kgdb_consdev_fops = { ++ open:kgdb_consdev_open, ++ write:kgdb_consdev_write ++}; ++static kdev_t ++kgdb_console_device(struct console *c) ++{ ++ return MKDEV(TTYAUX_MAJOR, 1); ++} ++ ++/* ++ * This routine gets called from the serial stub in the i386/lib ++ * This is so it is done late in bring up (just before the console open). ++ */ ++void ++kgdb_console_finit(void) ++{ ++ if (kgdb_console_enabled) { ++ char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); ++ char *cp = cptr; ++ while (*cptr && *cptr != '(') ++ cptr++; ++ *cptr = 0; ++ unregister_chrdev(TTYAUX_MAJOR, cp); ++ register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); ++ } ++} ++#endif ++#endif ++#ifdef CONFIG_KGDB_TS ++#include /* time stamp code */ ++#include /* in_interrupt */ ++#ifdef CONFIG_KGDB_TS_64 ++#define DATA_POINTS 64 ++#endif ++#ifdef CONFIG_KGDB_TS_128 ++#define DATA_POINTS 128 ++#endif ++#ifdef CONFIG_KGDB_TS_256 ++#define DATA_POINTS 256 ++#endif ++#ifdef CONFIG_KGDB_TS_512 ++#define DATA_POINTS 512 ++#endif ++#ifdef CONFIG_KGDB_TS_1024 ++#define DATA_POINTS 1024 ++#endif ++#ifndef DATA_POINTS ++#define DATA_POINTS 128 /* must be a power of two */ ++#endif ++#define INDEX_MASK (DATA_POINTS - 1) ++#if (INDEX_MASK & DATA_POINTS) ++#error "CONFIG_KGDB_TS_COUNT must be a power of 2" ++#endif ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ int data0; ++ int data1; ++}; ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; ++ ++struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; ++int kgdb_and_then_count; ++ ++void ++kgdb_tstamp(int line, char *source, int data0, int data1) ++{ ++ static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; ++ int flags; ++ kgdb_local_irq_save(flags); ++ spin_lock(&ts_spin); ++ rdtscll(kgdb_and_then->at_time); ++#ifdef CONFIG_SMP ++ kgdb_and_then->on_cpu = smp_processor_id(); ++#endif ++ kgdb_and_then->task = current; ++ kgdb_and_then->from_ln = line; ++ kgdb_and_then->in_src = source; ++ kgdb_and_then->from = __builtin_return_address(0); ++ kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | ++ (preempt_count() << 8)); ++ kgdb_and_then->data0 = data0; ++ kgdb_and_then->data1 = data1; ++ kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; ++ spin_unlock(&ts_spin); ++ kgdb_local_irq_restore(flags); 
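++	/*
++	 * Sketch of a typical instrumentation site (the caller, not this
++	 * file, chooses what to log):
++	 *
++	 *	kgdb_tstamp(__LINE__, __FILE__, smp_processor_id(), 0);
++	 *
++	 * The ring keeps the last DATA_POINTS events; walk kgdb_data[]
++	 * and kgdb_and_then_count from gdb at any breakpoint to read it.
++	 */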
++#ifdef CONFIG_PREEMPT ++ ++#endif ++ return; ++} ++#endif ++typedef int gdb_debug_hook(int exceptionVector, ++ int signo, int err_code, struct pt_regs *linux_regs); ++gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ +Index: linux-2.6.10/arch/i386/kernel/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/Makefile 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/Makefile 2005-04-05 12:48:05.254618256 +0800 +@@ -14,6 +14,7 @@ + obj-$(CONFIG_ACPI_BOOT) += acpi/ + obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o + obj-$(CONFIG_MCA) += mca.o ++obj-$(CONFIG_KGDB) += kgdb_stub.o + obj-$(CONFIG_X86_MSR) += msr.o + obj-$(CONFIG_X86_CPUID) += cpuid.o + obj-$(CONFIG_MICROCODE) += microcode.o +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-03-31 16:20:11.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 12:48:05.218623728 +0800 +@@ -466,7 +466,17 @@ + { + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + } +- ++#ifdef CONFIG_KGDB ++/* ++ * By using the NMI code instead of a vector we just sneak thru the ++ * word generator coming out with just what we want. AND it does ++ * not matter if clustered_apic_mode is set or not. ++ */ ++void smp_send_nmi_allbutself(void) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++#endif + /* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing +Index: linux-2.6.10/arch/i386/Kconfig.kgdb +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.kgdb 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.kgdb 2005-04-05 12:48:05.205625704 +0800 +@@ -0,0 +1,175 @@ ++config KGDB ++ bool "Include kgdb kernel debugger" ++ depends on DEBUG_KERNEL && !KPROBES ++ help ++ If you say Y here, the system will be compiled with the debug ++ option (-g) and a debugging stub will be included in the ++ kernel. This stub communicates with gdb on another (host) ++ computer via a serial port. The host computer should have ++ access to the kernel binary file (vmlinux) and a serial port ++ that is connected to the target machine. Gdb can be made to ++ configure the serial port or you can use stty and setserial to ++ do this. See the 'target' command in gdb. This option also ++ configures in the ability to request a breakpoint early in the ++ boot process. To request the breakpoint just include 'kgdb' ++ as a boot option when booting the target machine. The system ++ will then break as soon as it looks at the boot options. This ++ option also installs a breakpoint in panic and sends any ++ kernel faults to the debugger. For more information see the ++ Documentation/i386/kgdb/kgdb.txt file. ++ ++choice ++ depends on KGDB ++ prompt "Debug serial port BAUD" ++ default KGDB_115200BAUD ++ help ++ Gdb and the kernel stub need to agree on the baud rate to be ++ used. Some systems (x86 family at this writing) allow this to ++ be configured. 
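++	  The rate chosen here must match the one gdb uses on the host,
++	  e.g. "set remotebaud 115200" given before the "target remote"
++	  command.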
++
++config KGDB_9600BAUD
++	bool "9600"
++
++config KGDB_19200BAUD
++	bool "19200"
++
++config KGDB_38400BAUD
++	bool "38400"
++
++config KGDB_57600BAUD
++	bool "57600"
++
++config KGDB_115200BAUD
++	bool "115200"
++endchoice
++
++config KGDB_PORT
++	hex "hex I/O port address of the debug serial port"
++	depends on KGDB
++	default 3f8
++	help
++	  Some systems (x86 family at this writing) allow the port
++	  address to be configured.  The number entered is assumed to be
++	  hex, don't put 0x in front of it.  The standard addresses are:
++	  COM1 at 3f8, irq 4, and COM2 at 2f8, irq 3.  setserial /dev/ttySx
++	  will tell you what you have.  It is good to test the serial
++	  connection with a live system before trying to debug.
++
++config KGDB_IRQ
++	int "IRQ of the debug serial port"
++	depends on KGDB
++	default 4
++	help
++	  This is the irq for the debug port.  If everything is working
++	  correctly and the kernel has interrupts on, a control-C sent to
++	  the port should cause a break into the kernel debug stub.
++
++config DEBUG_INFO
++	bool
++	depends on KGDB
++	default y
++
++config KGDB_MORE
++	bool "Add any additional compile options"
++	depends on KGDB
++	default n
++	help
++	  Saying yes here turns on the ability to enter additional
++	  compile options.
++
++config KGDB_OPTIONS
++	depends on KGDB_MORE
++	string "Additional compile arguments"
++	default "-O1"
++	help
++	  This option allows you to enter additional compile options for
++	  the whole kernel compile.  Each platform will have a default
++	  that seems right for it.  For example on PPC "-ggdb -O1", and
++	  for i386 "-O1".  Note that by configuring KGDB "-g" is already
++	  turned on.  In addition, on i386 platforms
++	  "-fomit-frame-pointer" is deleted from the standard compile
++	  options.
++
++config NO_KGDB_CPUS
++	int "Number of CPUs"
++	depends on KGDB && SMP
++	default NR_CPUS
++	help
++	  This option sets the number of cpus for kgdb ONLY.  It is used
++	  to prune some internal structures so they look "nice" when
++	  displayed with gdb.  This is to overcome possibly larger
++	  numbers that may have been entered above.  Enter the real
++	  number to get nice clean kgdb_info displays.
++
++config KGDB_TS
++	bool "Enable kgdb time stamp macros?"
++	depends on KGDB
++	default n
++	help
++	  Kgdb event macros allow you to instrument your code with calls
++	  to the kgdb event recording function.  The event log may be
++	  examined with gdb at a break point.  Turning on this
++	  capability also allows you to choose how many events to
++	  keep.  Kgdb always keeps the latest events.
++
++choice
++	depends on KGDB_TS
++	prompt "Max number of time stamps to save?"
++	default KGDB_TS_128
++
++config KGDB_TS_64
++	bool "64"
++
++config KGDB_TS_128
++	bool "128"
++
++config KGDB_TS_256
++	bool "256"
++
++config KGDB_TS_512
++	bool "512"
++
++config KGDB_TS_1024
++	bool "1024"
++
++endchoice
++
++config STACK_OVERFLOW_TEST
++	bool "Turn on kernel stack overflow testing?"
++	depends on KGDB
++	default n
++	help
++	  This option enables code in the front line interrupt handlers
++	  to check for kernel stack overflow on interrupts and system
++	  calls.  This is part of the kgdb code on x86 systems.
++
++config KGDB_CONSOLE
++	bool "Enable serial console thru kgdb port"
++	depends on KGDB
++	default n
++	help
++	  This option enables the command line "console=kgdb" option.
++	  When the system is booted with this option in the command line
++	  all kernel printk output is sent to gdb (as well as to other
++	  consoles).  For this to work gdb must be connected.
For this ++ reason, this command line option will generate a breakpoint if ++ gdb has not yet connected. After the gdb continue command is ++ given all pent up console output will be printed by gdb on the ++ host machine. Neither this option, nor KGDB require the ++ serial driver to be configured. ++ ++config KGDB_SYSRQ ++ bool "Turn on SysRq 'G' command to do a break?" ++ depends on KGDB ++ default y ++ help ++ This option includes an option in the SysRq code that allows ++ you to enter SysRq G which generates a breakpoint to the KGDB ++ stub. This will work if the keyboard is alive and can ++ interrupt the system. Because of constraints on when the ++ serial port interrupt can be enabled, this code may allow you ++ to interrupt the system before the serial port control C is ++ available. Just say yes here. ++ +Index: linux-2.6.10/arch/i386/mm/fault.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/fault.c 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/arch/i386/mm/fault.c 2005-04-05 12:48:05.196627072 +0800 +@@ -430,6 +430,12 @@ + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ ++#ifdef CONFIG_KGDB ++ if (!user_mode(regs)){ ++ kgdb_handle_exception(14,SIGBUS, error_code, regs); ++ return; ++ } ++#endif + + bust_spinlocks(1); + +Index: linux-2.6.10/arch/i386/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig 2005-04-05 12:48:03.417897480 +0800 ++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:48:05.257617800 +0800 +@@ -1196,6 +1196,14 @@ + + source "fs/Kconfig.binfmt" + ++config TRAP_BAD_SYSCALL_EXITS ++ bool "Debug bad system call exits" ++ depends on KGDB ++ help ++ If you say Y here the kernel will check for system calls which ++ return without clearing preempt. ++ default n ++ + endmenu + + source "drivers/Kconfig" +Index: linux-2.6.10/arch/i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/i386/Makefile 2005-03-31 15:35:27.000000000 +0800 ++++ linux-2.6.10/arch/i386/Makefile 2005-04-05 12:48:05.255618104 +0800 +@@ -99,6 +99,9 @@ + # default subarch .h files + mflags-y += -Iinclude/asm-i386/mach-default + ++mflags-$(CONFIG_KGDB) += -gdwarf-2 ++mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') ++ + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o + + libs-y += arch/i386/lib/ +Index: linux-2.6.10/arch/x86_64/boot/compressed/head.S +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/boot/compressed/head.S 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/boot/compressed/head.S 2005-04-05 12:48:05.258617648 +0800 +@@ -26,6 +26,7 @@ + .code32 + .text + ++#define IN_BOOTLOADER + #include + #include + +Index: linux-2.6.10/arch/x86_64/boot/compressed/misc.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/boot/compressed/misc.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/boot/compressed/misc.c 2005-04-05 12:48:05.259617496 +0800 +@@ -9,6 +9,7 @@ + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 
1996 + */ + ++#define IN_BOOTLOADER + #include "miscsetup.h" + #include + +Index: linux-2.6.10/MAINTAINERS +=================================================================== +--- linux-2.6.10.orig/MAINTAINERS 2005-03-31 15:35:24.000000000 +0800 ++++ linux-2.6.10/MAINTAINERS 2005-04-05 12:48:05.181629352 +0800 +@@ -1245,6 +1245,12 @@ + W: http://developer.osdl.org/rddunlap/kj-patches/ + S: Maintained + ++KGDB FOR I386 PLATFORM ++P: George Anzinger ++M: george@mvista.com ++L: linux-net@vger.kernel.org ++S: Supported ++ + KERNEL NFSD + P: Neil Brown + M: neilb@cse.unsw.edu.au +Index: linux-2.6.10/drivers/char/sysrq.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/sysrq.c 2005-03-31 15:57:20.000000000 +0800 ++++ linux-2.6.10/drivers/char/sysrq.c 2005-04-05 12:48:05.191627832 +0800 +@@ -35,6 +35,25 @@ + #include + + #include ++#ifdef CONFIG_KGDB_SYSRQ ++ ++#define GDB_OP &kgdb_op ++static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) ++{ ++ printk("kgdb sysrq\n"); ++ breakpoint(); ++} ++ ++static struct sysrq_key_op kgdb_op = { ++ .handler = kgdb_sysrq, ++ .help_msg = "kGdb|Fgdb", ++ .action_msg = "Debug breakpoint\n", ++}; ++ ++#else ++#define GDB_OP NULL ++#endif ++ + + extern void reset_vc(unsigned int); + +@@ -249,7 +268,7 @@ + /* d */ NULL, + /* e */ &sysrq_term_op, + /* f */ NULL, +-/* g */ NULL, ++/* g */ GDB_OP, + /* h */ NULL, + /* i */ &sysrq_kill_op, + /* j */ NULL, +Index: linux-2.6.10/drivers/char/keyboard.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/keyboard.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/drivers/char/keyboard.c 2005-04-05 12:48:05.190627984 +0800 +@@ -1078,6 +1078,9 @@ + } + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); ++#ifdef CONFIG_KGDB_SYSRQ ++ sysrq_down = 0; /* in case we miss the "up" event */ ++#endif + return; + } + #endif +Index: linux-2.6.10/drivers/serial/serial_core.c +=================================================================== +--- linux-2.6.10.orig/drivers/serial/serial_core.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/drivers/serial/serial_core.c 2005-04-05 12:48:05.188628288 +0800 +@@ -1924,6 +1924,15 @@ + { + unsigned int flags; + ++#ifdef CONFIG_KGDB ++ { ++ extern int kgdb_irq; ++ ++ if (port->irq == kgdb_irq) ++ return; ++ } ++#endif ++ + /* + * If there isn't a port here, don't do anything further. + */ +Index: linux-2.6.10/drivers/serial/8250.c +=================================================================== +--- linux-2.6.10.orig/drivers/serial/8250.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/drivers/serial/8250.c 2005-04-05 12:48:05.185628744 +0800 +@@ -1350,12 +1350,21 @@ + spin_unlock_irqrestore(&up->port.lock, flags); + } + ++#ifdef CONFIG_KGDB ++int kgdb_irq = -1; ++#endif ++ + static int serial8250_startup(struct uart_port *port) + { + struct uart_8250_port *up = (struct uart_8250_port *)port; + unsigned long flags; + int retval; + ++#ifdef CONFIG_KGDB ++ if (up->port.irq == kgdb_irq) ++ return -EBUSY; ++#endif ++ + up->capabilities = uart_config[up->port.type].flags; + up->mcr = 0; + +@@ -2438,6 +2447,33 @@ + } + EXPORT_SYMBOL(serial8250_unregister_port); + ++#ifdef CONFIG_KGDB ++#include ++ ++/* ++ * Find all the ports using the given irq and shut them down. ++ * Result should be that the irq will be released. 
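++ * The kgdb_irq test added to serial8250_startup() above then keeps
++ * the driver from re-claiming the port while the polled stub owns it.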
++ */ ++void shutdown_for_kgdb(struct async_struct * info) ++{ ++ int irq = info->state->irq; ++ struct uart_8250_port *up; ++ int ttyS; ++ ++ kgdb_irq = irq; /* save for later init */ ++ for (ttyS = 0; ttyS < UART_NR; ttyS++){ ++ up = &serial8250_ports[ttyS]; ++ if (up->port.irq == irq && (irq_lists + irq)->head) { ++#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */ ++ if(up->port.lock.magic != SPINLOCK_MAGIC) ++ spin_lock_init(&up->port.lock); ++#endif ++ serial8250_shutdown(&up->port); ++ } ++ } ++} ++#endif /* CONFIG_KGDB */ ++ + static int __init serial8250_init(void) + { + int ret, i; diff --git a/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch b/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch new file mode 100644 index 0000000..cf91437 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch @@ -0,0 +1,10703 @@ + + +The complete set of citi nfsv4 patches combined into one patch. + +Changes since 2.6.10-rc3-CITI_NFS4_ALL-3 + * minor adjustments to xdr buffer length calculations in fs/nfs4xdr.c + * client acl revisions: pass acls in page array of xdr bufs, removing + arbitrary length restrictions. Temporarily disable acl caching. + +Index: linux-2.6.10/include/linux/nfsd/state.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/state.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/state.h 2005-04-05 14:49:13.465682224 +0800 +@@ -67,6 +67,45 @@ + #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + ++/* Delegation recall states */ ++#define NFS4_NO_RECALL 0x000 ++#define NFS4_RECALL_IN_PROGRESS 0x001 ++#define NFS4_RECALL_COMPLETE 0x002 ++ ++ ++/* Delegation flags */ ++#define NFS4_DELAY_CLOSE 0x001 ++ ++struct nfs4_cb_recall { ++ u32 cbr_ident; ++ int cbr_trunc; ++ stateid_t cbr_stateid; ++ u32 cbr_fhlen; ++ u32 cbr_fhval[NFS4_FHSIZE]; ++ struct nfs4_delegation *cbr_dp; ++}; ++ ++struct nfs4_delegation { ++ struct list_head dl_del_perfile; /* nfs4_file->fi_del_perfile */ ++ struct list_head dl_del_perclnt; /* nfs4_client->cl_del_perclnt*/ ++ struct list_head dl_recall_lru; /* delegation recalled */ ++ atomic_t dl_recall_cnt; /* resend cb_recall only once */ ++ atomic_t dl_count; /* ref count */ ++ atomic_t dl_state; /* recall state */ ++ struct nfs4_client *dl_client; ++ struct nfs4_file *dl_file; ++ struct file_lock *dl_flock; ++ struct nfs4_stateid *dl_stp; ++ u32 dl_flags; ++ u32 dl_type; ++ time_t dl_time; ++ struct nfs4_cb_recall dl_recall; ++}; ++ ++#define dl_stateid dl_recall.cbr_stateid ++#define dl_fhlen dl_recall.cbr_fhlen ++#define dl_fhval dl_recall.cbr_fhval ++ + /* client delegation callback info */ + struct nfs4_callback { + /* SETCLIENTID info */ +@@ -75,9 +114,8 @@ + unsigned short cb_port; + u32 cb_prog; + u32 cb_ident; +- struct xdr_netobj cb_netid; + /* RPC client info */ +- u32 cb_set; /* successful CB_NULL call */ ++ atomic_t cb_set; /* successful CB_NULL call */ + struct rpc_program cb_program; + struct rpc_stat cb_stat; + struct rpc_clnt * cb_client; +@@ -97,6 +135,7 @@ + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_perclient; /* list: stateowners */ ++ struct list_head cl_del_perclnt; /* list: delegations */ + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + 
nfs4_verifier cl_verifier; /* generated by client */ +@@ -106,7 +145,8 @@ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + struct nfs4_callback cl_callback; /* callback info */ +- time_t cl_first_state; /* first state aquisition*/ ++ atomic_t cl_count; /* ref count */ ++ u32 cl_firststate; /* recovery file creation */ + }; + + /* struct nfs4_client_reset +@@ -117,8 +157,6 @@ + struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + struct xdr_netobj cr_name; /* id generated by client */ +- time_t cr_first_state; /* first state aquisition */ +- u32 cr_expired; /* boolean: lease expired? */ + }; + + static inline void +@@ -194,6 +232,7 @@ + struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_perfile; /* list: nfs4_stateid */ ++ struct list_head fi_del_perfile; /* list: nfs4_delegation */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ +@@ -231,8 +270,10 @@ + #define CONFIRM 0x00000002 + #define OPEN_STATE 0x00000004 + #define LOCK_STATE 0x00000008 +-#define RDWR_STATE 0x00000010 +-#define CLOSE_STATE 0x00000020 ++#define RD_STATE 0x00000010 ++#define WR_STATE 0x00000020 ++#define CLOSE_STATE 0x00000040 ++#define DELEG_RET 0x00000080 + + #define seqid_mutating_err(err) \ + (((err) != nfserr_stale_clientid) && \ +@@ -243,14 +284,24 @@ + extern time_t nfs4_laundromat(void); + extern int nfsd4_renew(clientid_t *clid); + extern int nfs4_preprocess_stateid_op(struct svc_fh *current_fh, +- stateid_t *stateid, int flags, struct nfs4_stateid **stpp); ++ stateid_t *stateid, int flags, struct file **filp); + extern int nfs4_share_conflict(struct svc_fh *current_fh, + unsigned int deny_type); + extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); + extern int nfs4_check_open_reclaim(clientid_t *clid); ++extern void put_nfs4_client(struct nfs4_client *clp); + extern void nfs4_free_stateowner(struct kref *kref); ++extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern int nfsd4_cb_recall(struct nfs4_delegation *dp); ++extern int nfsd4_create_clid_file(struct nfs4_client *clp); ++extern void nfsd4_remove_clid_file(struct nfs4_client *clp); ++extern int nfsd4_list_rec_dir(int clear); ++extern void nfsd4_init_rec_dir(char *rec_dirname); ++extern void nfsd4_shutdown_rec_dir(void); ++extern int nfs4_client_to_reclaim(char *name, int namlen); ++ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +Index: linux-2.6.10/include/linux/nfsd/nfsd.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/nfsd.h 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/nfsd.h 2005-04-05 14:49:13.464682376 +0800 +@@ -98,8 +98,12 @@ + void nfsd_close(struct file *); + int nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *,int, unsigned long *); ++int nfsd_vfs_read(struct svc_rqst *, struct svc_fh *, struct file *, ++ loff_t, struct kvec *, int, unsigned long *); + int nfsd_write(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *,int, unsigned long, int *); ++int nfsd_vfs_write(struct svc_rqst *, struct svc_fh *,struct file *, ++ loff_t, struct kvec *,int, unsigned long, int *); + int nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); + int nfsd_symlink(struct svc_rqst *, struct svc_fh *, +Index: linux-2.6.10/include/linux/nfsd/xdr4.h 
+=================================================================== +--- linux-2.6.10.orig/include/linux/nfsd/xdr4.h 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/include/linux/nfsd/xdr4.h 2005-04-05 14:49:13.466682072 +0800 +@@ -44,16 +44,6 @@ + #define NFSD4_MAX_TAGLEN 128 + #define XDR_LEN(n) (((n) + 3) & ~3) + +-typedef u32 delegation_zero_t; +-typedef u32 delegation_boot_t; +-typedef u64 delegation_id_t; +- +-typedef struct { +- delegation_zero_t ds_zero; +- delegation_boot_t ds_boot; +- delegation_id_t ds_id; +-} delegation_stateid_t; +- + struct nfsd4_change_info { + u32 atomic; + u32 before_ctime_sec; +@@ -104,6 +94,10 @@ + #define cr_specdata1 u.dev.specdata1 + #define cr_specdata2 u.dev.specdata2 + ++struct nfsd4_delegreturn { ++ stateid_t dr_stateid; ++}; ++ + struct nfsd4_getattr { + u32 ga_bmval[2]; /* request */ + struct svc_fh *ga_fhp; /* response */ +@@ -202,13 +196,13 @@ + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ +- delegation_stateid_t op_delegate_stateid; /* request - CLAIM_DELEGATE_CUR only */ ++ stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[2]; /* request */ + union { /* request */ +- struct iattr iattr; /* UNCHECKED4,GUARDED4 */ +- nfs4_verifier verf; /* EXCLUSIVE4 */ ++ struct iattr iattr; /* UNCHECKED4,GUARDED4 */ ++ nfs4_verifier verf; /* EXCLUSIVE4 */ + } u; + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ +@@ -247,6 +241,7 @@ + u32 rd_length; /* request */ + struct kvec rd_iov[RPCSVC_MAXPAGES]; + int rd_vlen; ++ struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +@@ -345,6 +340,7 @@ + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; ++ struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; +@@ -456,6 +452,8 @@ + nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_release_lockowner *rlockowner); + extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); ++extern int nfsd4_delegreturn(struct svc_rqst *rqstp, ++ struct svc_fh *current_fh, struct nfsd4_delegreturn *dr); + #endif + + /* +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-04-05 14:49:13.461682832 +0800 +@@ -1185,11 +1185,6 @@ + + extern int vfs_statfs(struct super_block *, struct kstatfs *); + +-/* Return value for VFS lock functions - tells locks.c to lock conventionally +- * REALLY kosha for root NFS and nfs_lock +- */ +-#define LOCK_USE_CLNT 1 +- + #define FLOCK_VERIFY_READ 1 + #define FLOCK_VERIFY_WRITE 2 + +Index: linux-2.6.10/include/linux/dcache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/dcache.h 2005-04-05 14:49:13.460682984 +0800 +@@ -200,6 +200,7 @@ + * These are the low-level FS interfaces to the dcache.. 
+ */ + extern void d_instantiate(struct dentry *, struct inode *); ++extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); + extern void d_delete(struct dentry *); + + /* allocate/de-allocate */ +@@ -244,6 +245,23 @@ + d_rehash(entry); + } + ++/** ++ * d_add_unique - add dentry to hash queues without aliasing ++ * @entry: dentry to add ++ * @inode: The inode to attach to this dentry ++ * ++ * This adds the entry to the hash queues and initializes @inode. ++ * The entry was actually filled in earlier during d_alloc(). ++ */ ++static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *inode) ++{ ++ struct dentry *res; ++ ++ res = d_instantiate_unique(entry, inode); ++ d_rehash(res != NULL ? res : entry); ++ return res; ++} ++ + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); + +Index: linux-2.6.10/include/linux/nfs_fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs_fs.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs_fs.h 2005-04-05 14:49:13.463682528 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + /* + * Enable debugging support for nfs client. +@@ -201,6 +202,7 @@ + #define NFS_INO_INVALID_ATTR 0x0008 /* cached attrs are invalid */ + #define NFS_INO_INVALID_DATA 0x0010 /* cached data is invalid */ + #define NFS_INO_INVALID_ATIME 0x0020 /* cached atime is invalid */ ++#define NFS_INO_INVALID_ACCESS 0x0040 /* cached access cred invalid */ + + static inline struct nfs_inode *NFS_I(struct inode *inode) + { +@@ -239,7 +241,7 @@ + static inline void NFS_CACHEINV(struct inode *inode) + { + if (!nfs_caches_unstable(inode)) +- NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR; ++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + } + + static inline int nfs_server_capable(struct inode *inode, int cap) +@@ -424,6 +426,44 @@ + return nfs_wb_page_priority(inode, page, 0); + } + ++/* ++ * Allocate and free nfs_write_data structures ++ */ ++extern mempool_t *nfs_wdata_mempool; ++extern mempool_t *nfs_commit_mempool; ++ ++static inline struct nfs_write_data *nfs_writedata_alloc(void) ++{ ++ struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); ++ if (p) { ++ memset(p, 0, sizeof(*p)); ++ INIT_LIST_HEAD(&p->pages); ++ } ++ return p; ++} ++ ++static inline void nfs_writedata_free(struct nfs_write_data *p) ++{ ++ mempool_free(p, nfs_wdata_mempool); ++} ++ ++extern void nfs_writedata_release(struct rpc_task *task); ++ ++static inline struct nfs_write_data *nfs_commit_alloc(void) ++{ ++ struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); ++ if (p) { ++ memset(p, 0, sizeof(*p)); ++ INIT_LIST_HEAD(&p->pages); ++ } ++ return p; ++} ++ ++static inline void nfs_commit_free(struct nfs_write_data *p) ++{ ++ mempool_free(p, nfs_commit_mempool); ++} ++ + /* Hack for future NFS swap support */ + #ifndef IS_SWAPFILE + # define IS_SWAPFILE(inode) (0) +@@ -439,6 +479,26 @@ + extern void nfs_readpage_result(struct rpc_task *); + + /* ++ * Allocate and free nfs_read_data structures ++ */ ++extern mempool_t *nfs_rdata_mempool; ++ ++static inline struct nfs_read_data *nfs_readdata_alloc(void) ++{ ++ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); ++ if (p) ++ memset(p, 0, sizeof(*p)); ++ return p; ++} ++ ++static inline void nfs_readdata_free(struct nfs_read_data *p) ++{ ++ mempool_free(p, nfs_rdata_mempool); ++} ++ ++extern void nfs_readdata_release(struct 
rpc_task *task); ++ ++/* + * linux/fs/mount_clnt.c + * (Used only by nfsroot module) + */ +@@ -644,6 +704,12 @@ + + extern struct dentry_operations nfs4_dentry_operations; + extern struct inode_operations nfs4_dir_inode_operations; ++extern struct inode_operations nfs4_file_inode_operations; ++ ++/* inode.c */ ++extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); ++extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); + + /* nfs4proc.c */ + extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); +@@ -651,13 +717,14 @@ + extern int nfs4_open_reclaim(struct nfs4_state_owner *, struct nfs4_state *); + extern int nfs4_proc_async_renew(struct nfs4_client *); + extern int nfs4_proc_renew(struct nfs4_client *); +-extern int nfs4_do_close(struct inode *, struct nfs4_state *); +-extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); ++extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); + extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); + extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); + extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); + extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request); ++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen); ++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen); + + /* nfs4renewd.c */ + extern void nfs4_schedule_state_renewal(struct nfs4_client *); +Index: linux-2.6.10/include/linux/nfs4.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs4.h 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs4.h 2005-04-05 14:49:13.474680856 +0800 +@@ -28,7 +28,7 @@ + #define NFS4_ACCESS_DELETE 0x0010 + #define NFS4_ACCESS_EXECUTE 0x0020 + +-#define NFS4_FH_PERISTENT 0x0000 ++#define NFS4_FH_PERSISTENT 0x0000 + #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001 + #define NFS4_FH_VOLATILE_ANY 0x0002 + #define NFS4_FH_VOL_MIGRATION 0x0004 +@@ -382,6 +382,8 @@ + NFSPROC4_CLNT_READDIR, + NFSPROC4_CLNT_SERVER_CAPS, + NFSPROC4_CLNT_DELEGRETURN, ++ NFSPROC4_CLNT_GETACL, ++ NFSPROC4_CLNT_SETACL, + }; + + #endif +Index: linux-2.6.10/include/linux/sunrpc/auth.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/auth.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/auth.h 2005-04-05 14:49:13.468681768 +0800 +@@ -51,7 +51,6 @@ + }; + #define RPCAUTH_CRED_LOCKED 0x0001 + #define RPCAUTH_CRED_UPTODATE 0x0002 +-#define RPCAUTH_CRED_DEAD 0x0004 + + #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 + +@@ -133,7 +132,6 @@ + int rpcauth_refreshcred(struct rpc_task *); + void rpcauth_invalcred(struct rpc_task *); + int rpcauth_uptodatecred(struct rpc_task *); +-int rpcauth_deadcred(struct rpc_task *); + void rpcauth_init_credcache(struct rpc_auth *); + void rpcauth_free_credcache(struct rpc_auth *); + +Index: linux-2.6.10/include/linux/sunrpc/svc.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/svc.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/svc.h 2005-04-05 14:49:13.467681920 +0800 +@@ -251,8 
+251,7 @@ + char * pg_name; /* service name */ + char * pg_class; /* class name: services sharing authentication */ + struct svc_stat * pg_stats; /* rpc statistics */ +- /* Override authentication. NULL means use default */ +- int (*pg_authenticate)(struct svc_rqst *, u32 *); ++ int (*pg_authenticate)(struct svc_rqst *); + }; + + /* +Index: linux-2.6.10/include/linux/sunrpc/cache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/cache.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/cache.h 2005-04-05 14:49:13.470681464 +0800 +@@ -128,20 +128,17 @@ + * just like a template in C++, this macro does cache lookup + * for us. + * The function is passed some sort of HANDLE from which a cache_detail +- * structure can be determined (via SETUP, DETAIL), a template ++ * structure can be determined (via DETAIL), a template + * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the + * TEST, the function will try to find a matching cache entry in the cache. + * If "set" == 0 : + * If an entry is found, it is returned + * If no entry is found, a new non-VALID entry is created. +- * If "set" == 1 and INPLACE == 0 : ++ * If "set" == 1: + * If no entry is found a new one is inserted with data from "template" + * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE + * If a CACHE_VALID entry is found, a new entry is swapped in with data + * from "template" +- * If set == 1, and INPLACE == 1 : +- * As above, except that if a CACHE_VALID entry is found, we UPDATE in place +- * instead of swapping in a new entry. + * + * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not + * run but insteead CACHE_NEGATIVE is set in any new item. +@@ -153,25 +150,22 @@ + * MEMBER is the member of the cache which is cache_head, which must be first + * FNAME is the name for the function + * ARGS are arguments to function and must contain RTN *item, int set. May +- * also contain something to be usedby SETUP or DETAIL to find cache_detail. +- * SETUP locates the cache detail and makes it available as... +- * DETAIL identifies the cache detail, possibly set up by SETUP ++ * also contain something to be used by DETAIL to find cache_detail. ++ * DETAIL identifies the cache detail + * HASHFN returns a hash value of the cache entry "item" + * TEST tests if "tmp" matches "item" + * INIT copies key information from "item" to "new" + * UPDATE copies content information from "item" to "tmp" +- * INPLACE is true if updates can happen inplace rather than allocating a new structure + * + * WARNING: any substantial changes to this must be reflected in + * net/sunrpc/svcauth.c(auth_domain_lookup) + * which is a similar routine that is open-coded. 
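The next hunks strip the SETUP and INPLACE parameters from the cache-lookup template: every cache now locates its cache_detail directly, and a CACHE_VALID entry is always updated by swapping in a fresh one. A sketch of the simplified template in use, mirroring the svcauth_unix_set_client() hunk later in this patch (lookup_peer() is an illustrative name only):

/* set==0 asks ip_map_lookup() to find or create a not-yet-valid
 * entry; set==1 would insert/update from the key, always by swap-in
 * now that the INPLACE variant is gone. */
static struct ip_map *lookup_peer(struct svc_rqst *rqstp)
{
	struct ip_map key;

	strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class);
	key.m_addr = rqstp->rq_addr.sin_addr;
	return ip_map_lookup(&key, 0);
}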
+ */ +-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \ ++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE) \ + RTN *FNAME ARGS \ + { \ + RTN *tmp, *new=NULL; \ + struct cache_head **hp, **head; \ +- SETUP; \ + head = &(DETAIL)->hash_table[HASHFN]; \ + retry: \ + if (set||new) write_lock(&(DETAIL)->hash_lock); \ +@@ -180,14 +174,14 @@ + tmp = container_of(*hp, RTN, MEMBER); \ + if (TEST) { /* found a match */ \ + \ +- if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ ++ if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ + break; \ + \ + if (new) \ + {INIT;} \ + cache_get(&tmp->MEMBER); \ + if (set) { \ +- if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ ++ if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ + { /* need to swap in new */ \ + RTN *t2; \ + \ +@@ -209,7 +203,7 @@ + else read_unlock(&(DETAIL)->hash_lock); \ + if (set) \ + cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ +- if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ ++ if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ + if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ + return tmp; \ + } \ +@@ -242,10 +236,10 @@ + return NULL; \ + } + +-#define DefineSimpleCacheLookup(STRUCT,INPLACE) \ +- DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \ ++#define DefineSimpleCacheLookup(STRUCT) \ ++ DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), \ + & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + #define cache_for_each(pos, detail, index, member) \ + for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ +Index: linux-2.6.10/include/linux/sunrpc/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/sched.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/sched.h 2005-04-05 14:49:13.472681160 +0800 +@@ -11,7 +11,9 @@ + + #include + #include ++#include + #include ++#include + #include + + /* +@@ -25,11 +27,18 @@ + struct rpc_cred * rpc_cred; /* Credentials */ + }; + ++struct rpc_wait_queue; ++struct rpc_wait { ++ struct list_head list; /* wait queue links */ ++ struct list_head links; /* Links to related tasks */ ++ wait_queue_head_t waitq; /* sync: sleep on this q */ ++ struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */ ++}; ++ + /* + * This is the RPC task struct + */ + struct rpc_task { +- struct list_head tk_list; /* wait queue links */ + #ifdef RPC_DEBUG + unsigned long tk_magic; /* 0xf00baa */ + #endif +@@ -37,7 +46,6 @@ + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ + int tk_status; /* result of last operation */ +- struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */ + + /* + * RPC call state +@@ -70,13 +78,18 @@ + * you have a pathological interest in kernel oopses. 
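The union introduced just below is safe because a task is never both sitting on an rpc_wait_queue and queued as work: the tk_runstate bits added further down arbitrate which arm is live. An illustrative fragment (sketch_requeue() is hypothetical):

/* RPC_TASK_QUEUED  -> task->u.tk_wait is valid (on a wait queue)
 * RPC_TASK_RUNNING -> task->u.tk_work is valid (queued to a workqueue) */
static void sketch_requeue(struct rpc_task *task)
{
	if (RPC_IS_QUEUED(task))
		list_del(&task->u.tk_wait.list);	/* leave the wait queue */
	else
		queue_work(task->tk_workqueue, &task->u.tk_work);
}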
+ */ + struct timer_list tk_timer; /* kernel timer */ +- wait_queue_head_t tk_wait; /* sync: sleep on this q */ + unsigned long tk_timeout; /* timeout for rpc_sleep() */ + unsigned short tk_flags; /* misc flags */ + unsigned char tk_active : 1;/* Task has been activated */ + unsigned char tk_priority : 2;/* Task priority */ + unsigned long tk_runstate; /* Task run status */ +- struct list_head tk_links; /* links to related tasks */ ++ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could ++ * be any workqueue ++ */ ++ union { ++ struct work_struct tk_work; /* Async task work queue */ ++ struct rpc_wait tk_wait; /* RPC wait */ ++ } u; + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif +@@ -87,11 +100,11 @@ + /* support walking a list of tasks on a wait queue */ + #define task_for_each(task, pos, head) \ + list_for_each(pos, head) \ +- if ((task=list_entry(pos, struct rpc_task, tk_list)),1) ++ if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1) + + #define task_for_first(task, head) \ + if (!list_empty(head) && \ +- ((task=list_entry((head)->next, struct rpc_task, tk_list)),1)) ++ ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1)) + + /* .. and walking list of all tasks */ + #define alltask_for_each(task, pos, head) \ +@@ -126,22 +139,39 @@ + #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) + #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR) + +-#define RPC_TASK_SLEEPING 0 +-#define RPC_TASK_RUNNING 1 +-#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) +-#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define RPC_TASK_RUNNING 0 ++#define RPC_TASK_QUEUED 1 ++#define RPC_TASK_WAKEUP 2 ++#define RPC_TASK_HAS_TIMER 3 + ++#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) + #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +-#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define rpc_test_and_set_running(t) \ ++ (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define rpc_clear_running(t) \ ++ do { \ ++ smp_mb__before_clear_bit(); \ ++ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ ++ smp_mb__after_clear_bit(); \ ++ } while (0) + +-#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) ++#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_clear_queued(t) \ ++ do { \ ++ smp_mb__before_clear_bit(); \ ++ clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ ++ smp_mb__after_clear_bit(); \ ++ } while (0) + +-#define rpc_clear_sleeping(t) \ ++#define rpc_start_wakeup(t) \ ++ (test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0) ++#define rpc_finish_wakeup(t) \ + do { \ + smp_mb__before_clear_bit(); \ +- clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \ ++ clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ +- } while(0) ++ } while (0) + + /* + * Task priorities. 
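Taken together, the QUEUED/WAKEUP/RUNNING bits above allow an atomic claim on waking a task: only the winner of rpc_start_wakeup() may move it back to the run queue. A sketch of the handshake -- the unlink step is an assumption based on the rpc_wait_queue spinlock added in the next hunk, and sketch_wake_up_task() is not from the patch:

static void sketch_wake_up_task(struct rpc_task *task)
{
	if (!RPC_IS_QUEUED(task))
		return;				/* already running */
	if (!rpc_start_wakeup(task))
		return;				/* another CPU is waking it */
	/* assumed: unlink task from task->u.tk_wait.rpc_waitq under
	 * the queue's new spinlock, then: */
	rpc_clear_queued(task);
	if (!rpc_test_and_set_running(task))
		queue_work(task->tk_workqueue, &task->u.tk_work);
	rpc_finish_wakeup(task);
}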
+@@ -157,6 +187,7 @@ + * RPC synchronization objects + */ + struct rpc_wait_queue { ++ spinlock_t lock; + struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ + unsigned long cookie; /* cookie of last task serviced */ + unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ +@@ -177,6 +208,7 @@ + + #ifndef RPC_DEBUG + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -185,6 +217,7 @@ + } + #else + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -209,13 +242,10 @@ + int rpc_execute(struct rpc_task *); + void rpc_run_child(struct rpc_task *parent, struct rpc_task *child, + rpc_action action); +-int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *); +-void rpc_remove_wait_queue(struct rpc_task *); + void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, + rpc_action action, rpc_action timer); +-void rpc_add_timer(struct rpc_task *, rpc_action); + void rpc_wake_up_task(struct rpc_task *); + void rpc_wake_up(struct rpc_wait_queue *); + struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +Index: linux-2.6.10/include/linux/sunrpc/gss_krb5.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/gss_krb5.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/gss_krb5.h 2005-04-05 14:49:13.473681008 +0800 +@@ -53,6 +53,8 @@ + struct xdr_netobj mech_used; + }; + ++extern spinlock_t krb5_seq_lock; ++ + #define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + +@@ -116,18 +118,25 @@ + + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum); ++ int body_offset, struct xdr_netobj *cksum); + + u32 + krb5_make_token(struct krb5_ctx *context_handle, int qop_req, + struct xdr_buf *input_message_buffer, +- struct xdr_netobj *output_message_buffer, int toktype); ++ struct xdr_netobj *output_message_buffer); + + u32 + krb5_read_token(struct krb5_ctx *context_handle, + struct xdr_netobj *input_token_buffer, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype); ++ struct xdr_buf *message_buffer, int *qop_state); ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset, ++ struct xdr_buf *outbuf, struct page **pages); ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset); + + u32 + krb5_encrypt(struct crypto_tfm * key, +@@ -137,6 +146,13 @@ + krb5_decrypt(struct crypto_tfm * key, + void *iv, void *in, void *out, int length); + ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset, ++ struct page **pages); ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset); ++ + s32 + krb5_make_seq_num(struct crypto_tfm * key, + int direction, +Index: linux-2.6.10/include/linux/sunrpc/xdr.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/xdr.h 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/xdr.h 2005-04-05 14:49:13.467681920 +0800 +@@ -192,6 +192,7 @@ + extern void 
xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p); + extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); + extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); ++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.10/include/linux/sunrpc/gss_api.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/gss_api.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/gss_api.h 2005-04-05 14:49:13.471681312 +0800 +@@ -47,6 +47,18 @@ + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++u32 gss_wrap( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page **inpages); ++u32 gss_unwrap( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *inbuf, ++ int *out_offset); + u32 gss_delete_sec_context( + struct gss_ctx **ctx_id); + +@@ -93,6 +105,18 @@ + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++ u32 (*gss_wrap)( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page **inpages); ++ u32 (*gss_unwrap)( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset); + void (*gss_delete_sec_context)( + void *internal_ctx_id); + }; +Index: linux-2.6.10/include/linux/sunrpc/svcauth.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/svcauth.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/svcauth.h 2005-04-05 14:49:13.469681616 +0800 +@@ -26,21 +26,23 @@ + struct svc_rqst; /* forward decl */ + + /* Authentication is done in the context of a domain. +- * For a server, a domain represents a group of clients using ++ * ++ * Currently, the nfs server uses the auth_domain to stand ++ * for the "client" listed in /etc/exports. ++ * ++ * More generally, a domain might represent a group of clients using + * a common mechanism for authentication and having a common mapping + * between local identity (uid) and network identity. All clients + * in a domain have similar general access rights. Each domain can + * contain multiple principals which will have different specific right + * based on normal Discretionary Access Control. + * +- * For a client, a domain represents a number of servers which all +- * use a common authentication mechanism and network identity name space. +- * + * A domain is created by an authentication flavour module based on name + * only. Userspace then fills in detail on demand. + * +- * The creation of a domain typically implies creation of one or +- * more caches for storing domain specific information. ++ * In the case of auth_unix and auth_null, the auth_domain is also ++ * associated with entries in another cache representing the mapping ++ * of ip addresses to the given client. 
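The hunk that follows adds a per-flavour set_client hook and a svc_set_client() entry point, splitting "parse the credential" from "decide which client this is" so each flavour can map peers to an auth_domain its own way. A sketch of the intended dispatch order; the matching svc_process() changes are not part of this header excerpt and sketch_authorize() is a hypothetical wrapper:

static int sketch_authorize(struct svc_rqst *rqstp, u32 *authp)
{
	int rv = svc_authenticate(rqstp, authp);

	if (rv != SVC_OK)
		return rv;
	switch (svc_set_client(rqstp)) {
	case SVC_OK:
		return SVC_OK;
	case SVC_DROP:
		return SVC_DROP;	/* cache upcall pending, retry */
	default:
		*authp = rpc_autherr_badcred;
		return SVC_DENIED;
	}
}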
+ */ + struct auth_domain { + struct cache_head h; +@@ -92,6 +94,7 @@ + int (*accept)(struct svc_rqst *rq, u32 *authp); + int (*release)(struct svc_rqst *rq); + void (*domain_release)(struct auth_domain *); ++ int (*set_client)(struct svc_rqst *rq); + }; + + #define SVC_GARBAGE 1 +@@ -107,6 +110,7 @@ + + extern int svc_authenticate(struct svc_rqst *rqstp, u32 *authp); + extern int svc_authorise(struct svc_rqst *rqstp); ++extern int svc_set_client(struct svc_rqst *rqstp); + extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); + extern void svc_auth_unregister(rpc_authflavor_t flavor); + +Index: linux-2.6.10/include/linux/sunrpc/xprt.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sunrpc/xprt.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/linux/sunrpc/xprt.h 2005-04-05 14:49:13.471681312 +0800 +@@ -95,7 +95,10 @@ + int rq_cong; /* has incremented xprt->cong */ + int rq_received; /* receive completed */ + u32 rq_seqno; /* gss seq no. used on req. */ +- ++ int rq_enc_pages_num; ++ struct page **rq_enc_pages; /* scratch pages for use by ++ gss privacy code */ ++ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ + struct list_head rq_list; + + struct xdr_buf rq_private_buf; /* The receive buffer +Index: linux-2.6.10/include/linux/nfs_xdr.h +=================================================================== +--- linux-2.6.10.orig/include/linux/nfs_xdr.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/include/linux/nfs_xdr.h 2005-04-05 14:49:13.459683136 +0800 +@@ -326,6 +326,20 @@ + const u32 * bitmask; + }; + ++struct nfs_setaclargs { ++ struct nfs_fh * fh; ++ ssize_t acl_len; ++ unsigned int acl_pgbase; ++ struct page ** acl_pages; ++}; ++ ++struct nfs_getaclargs { ++ struct nfs_fh * fh; ++ ssize_t acl_len; ++ unsigned int acl_pgbase; ++ struct page ** acl_pages; ++}; ++ + struct nfs_setattrres { + struct nfs_fattr * fattr; + const struct nfs_server * server; +@@ -666,6 +680,7 @@ + int version; /* Protocol version */ + struct dentry_operations *dentry_ops; + struct inode_operations *dir_inode_ops; ++ struct inode_operations *file_inode_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -681,7 +696,7 @@ + int (*read) (struct nfs_read_data *); + int (*write) (struct nfs_write_data *); + int (*commit) (struct nfs_write_data *); +- struct inode * (*create) (struct inode *, struct qstr *, ++ struct inode * (*create) (struct inode *, struct dentry *, + struct iattr *, int); + int (*remove) (struct inode *, struct qstr *); + int (*unlink_setup) (struct rpc_message *, +Index: linux-2.6.10/net/sunrpc/xprt.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/xprt.c 2004-12-25 05:35:14.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/xprt.c 2005-04-05 14:49:13.393693168 +0800 +@@ -891,7 +891,8 @@ + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; +- dprintk("RPC: reading reply for XID %08x\n", xprt->tcp_xid); ++ dprintk("RPC: reading reply for XID %08x\n", ++ ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); + } + +@@ -911,7 +912,7 @@ + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", +- xprt->tcp_xid); ++ ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } +@@ -1101,7 +1102,7 @@ + goto out; + + spin_lock_bh(&xprt->sock_lock); +- if (xprt->snd_task && xprt->snd_task->tk_rpcwait 
== &xprt->pending) ++ if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); + out: +@@ -1359,8 +1360,9 @@ + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_xid = xprt_alloc_xid(xprt); ++ req->rq_release_snd_buf = NULL; + dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, +- req, req->rq_xid); ++ req, ntohl(req->rq_xid)); + } + + /* +@@ -1384,6 +1386,8 @@ + mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + spin_unlock_bh(&xprt->sock_lock); + task->tk_rqstp = NULL; ++ if (req->rq_release_snd_buf) ++ req->rq_release_snd_buf(req); + memset(req, 0, sizeof(*req)); /* mark unused */ + + dprintk("RPC: %4d release request %p\n", task->tk_pid, req); +Index: linux-2.6.10/net/sunrpc/auth.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth.c 2005-04-05 14:49:13.394693016 +0800 +@@ -214,8 +214,6 @@ + list_for_each_safe(pos, next, &auth->au_credcache[nr]) { + struct rpc_cred *entry; + entry = list_entry(pos, struct rpc_cred, cr_hash); +- if (entry->cr_flags & RPCAUTH_CRED_DEAD) +- continue; + if (rpcauth_prune_expired(entry, &free)) + continue; + if (entry->cr_ops->crmatch(acred, entry, taskflags)) { +@@ -307,9 +305,6 @@ + if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) + return; + +- if ((cred->cr_flags & RPCAUTH_CRED_DEAD) && !list_empty(&cred->cr_hash)) +- list_del_init(&cred->cr_hash); +- + if (list_empty(&cred->cr_hash)) { + spin_unlock(&rpc_credcache_lock); + rpcauth_crdestroy(cred); +@@ -413,10 +408,3 @@ + return !(task->tk_msg.rpc_cred) || + (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE); + } +- +-int +-rpcauth_deadcred(struct rpc_task *task) +-{ +- return !(task->tk_msg.rpc_cred) || +- (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_DEAD); +-} +Index: linux-2.6.10/net/sunrpc/svcauth_unix.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svcauth_unix.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svcauth_unix.c 2005-04-05 14:49:13.395692864 +0800 +@@ -97,7 +97,7 @@ + }; + static struct cache_head *ip_table[IP_HASHMAX]; + +-void ip_map_put(struct cache_head *item, struct cache_detail *cd) ++static void ip_map_put(struct cache_head *item, struct cache_detail *cd) + { + struct ip_map *im = container_of(item, struct ip_map,h); + if (cache_put(item, cd)) { +@@ -258,7 +258,7 @@ + .cache_show = ip_map_show, + }; + +-static DefineSimpleCacheLookup(ip_map, 0) ++static DefineSimpleCacheLookup(ip_map) + + + int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) +@@ -329,14 +329,49 @@ + cache_purge(&auth_domain_cache); + } + ++int ++svcauth_unix_set_client(struct svc_rqst *rqstp) ++{ ++ struct ip_map key, *ipm; ++ ++ rqstp->rq_client = NULL; ++ if (rqstp->rq_proc == 0) ++ return SVC_OK; ++ ++ strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); ++ key.m_addr = rqstp->rq_addr.sin_addr; ++ ++ ipm = ip_map_lookup(&key, 0); ++ ++ if (ipm == NULL) ++ return SVC_DENIED; ++ ++ switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { ++ case -EAGAIN: ++ return SVC_DROP; ++ case -ENOENT: ++ return SVC_DENIED; ++ case 0: ++ rqstp->rq_client = &ipm->m_client->h; ++ cache_get(&rqstp->rq_client->h); ++ ip_map_put(&ipm->h, &ip_map_cache); ++ return SVC_OK; ++ default: ++ BUG(); ++ } ++ /* shut up gcc: */ ++ return -1; ++} + + static int + svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp) + 
{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; +- int rv=0; +- struct ip_map key, *ipm; ++ struct svc_cred *cred = &rqstp->rq_cred; ++ ++ cred->cr_group_info = NULL; ++ rqstp->rq_client = NULL; + + if (argv->iov_len < 3*4) + return SVC_GARBAGE; +@@ -353,45 +388,17 @@ + } + + /* Signal that mapping to nobody uid/gid is required */ +- rqstp->rq_cred.cr_uid = (uid_t) -1; +- rqstp->rq_cred.cr_gid = (gid_t) -1; +- rqstp->rq_cred.cr_group_info = groups_alloc(0); +- if (rqstp->rq_cred.cr_group_info == NULL) ++ cred->cr_uid = (uid_t) -1; ++ cred->cr_gid = (gid_t) -1; ++ cred->cr_group_info = groups_alloc(0); ++ if (cred->cr_group_info == NULL) + return SVC_DROP; /* kmalloc failure - client must retry */ + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + +- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); +- key.m_addr = rqstp->rq_addr.sin_addr; +- +- ipm = ip_map_lookup(&key, 0); +- +- rqstp->rq_client = NULL; +- +- if (ipm) +- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { +- case -EAGAIN: +- rv = SVC_DROP; +- break; +- case -ENOENT: +- rv = SVC_OK; /* rq_client is NULL */ +- break; +- case 0: +- rqstp->rq_client = &ipm->m_client->h; +- cache_get(&rqstp->rq_client->h); +- ip_map_put(&ipm->h, &ip_map_cache); +- rv = SVC_OK; +- break; +- default: BUG(); +- } +- else rv = SVC_DROP; +- +- if (rqstp->rq_client == NULL && rqstp->rq_proc != 0) +- *authp = rpc_autherr_badcred; +- +- return rv; ++ return SVC_OK; + } + + static int +@@ -414,6 +421,7 @@ + .flavour = RPC_AUTH_NULL, + .accept = svcauth_null_accept, + .release = svcauth_null_release, ++ .set_client = svcauth_unix_set_client, + }; + + +@@ -425,8 +433,6 @@ + struct svc_cred *cred = &rqstp->rq_cred; + u32 slen, i; + int len = argv->iov_len; +- int rv=0; +- struct ip_map key, *ipm; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; +@@ -458,39 +464,11 @@ + return SVC_DENIED; + } + +- +- strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); +- key.m_addr = rqstp->rq_addr.sin_addr; +- +- +- ipm = ip_map_lookup(&key, 0); +- +- if (ipm) +- switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { +- case -EAGAIN: +- rv = SVC_DROP; +- break; +- case -ENOENT: +- rv = SVC_OK; /* rq_client is NULL */ +- break; +- case 0: +- rqstp->rq_client = &ipm->m_client->h; +- cache_get(&rqstp->rq_client->h); +- ip_map_put(&ipm->h, &ip_map_cache); +- rv = SVC_OK; +- break; +- default: BUG(); +- } +- else rv = SVC_DROP; +- +- if (rv == SVC_OK && rqstp->rq_client == NULL && rqstp->rq_proc != 0) +- goto badcred; +- + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + +- return rv; ++ return SVC_OK; + + badcred: + *authp = rpc_autherr_badcred; +@@ -520,5 +498,6 @@ + .accept = svcauth_unix_accept, + .release = svcauth_unix_release, + .domain_release = svcauth_unix_domain_release, ++ .set_client = svcauth_unix_set_client, + }; + +Index: linux-2.6.10/net/sunrpc/clnt.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/clnt.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/clnt.c 2005-04-05 14:49:13.410690584 +0800 +@@ -636,8 +636,14 @@ + rpc_exit(task, -EIO); + return; + } +- if (encode && (status = rpcauth_wrap_req(task, encode, req, p, +- task->tk_msg.rpc_argp)) < 0) { ++ if (encode == NULL) ++ return; ++ ++ status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); ++ if (status == -EAGAIN) { ++ printk("XXXJBF: out of 
memory? Should retry here!!!\n"); ++ } ++ if (status < 0) { + printk(KERN_WARNING "%s: can't encode arguments: %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); +@@ -935,7 +941,7 @@ + task->tk_action = call_reserve; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; +- if (rpcauth_deadcred(task)) { ++ if (status == -EACCES) { + rpc_exit(task, -EACCES); + return; + } +@@ -993,7 +999,7 @@ + goto garbage; + if ((n = ntohl(*p++)) != RPC_AUTH_ERROR) { + printk(KERN_WARNING "call_verify: RPC call rejected: %x\n", n); +- } else if (--len < 0) ++ } else if (--len == 0) + switch ((n = ntohl(*p++))) { + case RPC_AUTH_REJECTEDCRED: + case RPC_AUTH_REJECTEDVERF: +Index: linux-2.6.10/net/sunrpc/svcauth.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svcauth.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svcauth.c 2005-04-05 14:49:13.392693320 +0800 +@@ -59,6 +59,11 @@ + return aops->accept(rqstp, authp); + } + ++int svc_set_client(struct svc_rqst *rqstp) ++{ ++ return rqstp->rq_authop->set_client(rqstp); ++} ++ + /* A request, which was authenticated, has now executed. + * Time to finalise the the credentials and verifier + * and release and resources +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_unseal.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_unseal.c 2005-04-05 14:49:13.401691952 +0800 +@@ -68,20 +68,13 @@ + #endif + + +-/* message_buffer is an input if toktype is MIC and an output if it is WRAP: +- * If toktype is MIC: read_token is a mic token, and message_buffer is the +- * data that the mic was supposedly taken over. +- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used +- * to return the decrypted data. +- */ ++/* read_token is a mic token, and message_buffer is the data that the mic was ++ * supposedly taken over. */ + +-/* XXX will need to change prototype and/or just split into a separate function +- * when we add privacy (because read_token will be in pages too). */ + u32 + krb5_read_token(struct krb5_ctx *ctx, + struct xdr_netobj *read_token, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype) ++ struct xdr_buf *message_buffer, int *qop_state) + { + int signalg; + int sealalg; +@@ -100,16 +93,12 @@ + read_token->len)) + goto out; + +- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) ++ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || ++ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? 
*/ + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX gone */ +- goto out; +- } +- + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); +@@ -120,14 +109,7 @@ + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + +- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || +- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) +- goto out; +- +- /* in the current spec, there is only one valid seal algorithm per +- key type, so a simple comparison is ok */ +- +- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) ++ if (sealalg != 0xffff) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, +@@ -154,7 +136,7 @@ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, +- message_buffer, &md5cksum); ++ message_buffer, 0, &md5cksum); + if (ret) + goto out; + +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_mech_switch.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_mech_switch.c 2005-04-05 14:49:13.408690888 +0800 +@@ -279,6 +279,29 @@ + qstate); + } + ++u32 ++gss_wrap(struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *buf, ++ struct page **inpages) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_wrap(ctx_id, qop, offset, buf, inpages); ++} ++ ++u32 ++gss_unwrap(struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_unwrap(ctx_id, qop, offset, buf, out_offset); ++} ++ ++ + /* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. 
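With gss_wrap()/gss_unwrap() dispatching through the mechanism ops above, the new gss_krb5_wrap.c below supplies the kerberos implementation. A sketch of a privacy-service caller using the rq_enc_pages scratch pages added to struct rpc_rqst earlier in this patch; both helper names and the error values are illustrative, not from the patch:

static int sketch_priv_wrap(struct gss_ctx *ctx, int offset,
			    struct rpc_rqst *req)
{
	/* everything past "offset" is encrypted in place; scratch
	 * pages keep page-cache pages untouched */
	u32 maj = gss_wrap(ctx, GSS_C_QOP_DEFAULT, offset,
			   &req->rq_snd_buf, req->rq_enc_pages);
	return maj == GSS_S_COMPLETE ? 0 : -EIO;
}

static int sketch_priv_unwrap(struct gss_ctx *ctx, int offset,
			      struct rpc_rqst *req, int *data_offset)
{
	/* on success *data_offset says where the plaintext now starts
	 * inside the decrypted receive buffer */
	u32 maj = gss_unwrap(ctx, NULL, offset,
			     &req->rq_rcv_buf, data_offset);
	return maj == GSS_S_COMPLETE ? 0 : -EACCES;
}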
*/ +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-04-05 14:49:13.397692560 +0800 +@@ -0,0 +1,337 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++static inline int ++gss_krb5_padding(int blocksize, int length) ++{ ++ /* Most of the code is block-size independent but currently we ++ * use only 8: */ ++ BUG_ON(blocksize != 8); ++ return 8 - (length & 7); ++} ++ ++static inline void ++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) ++{ ++ int padding = gss_krb5_padding(blocksize, buf->len - offset); ++ char *p; ++ struct kvec *iov; ++ ++ if (buf->page_len || buf->tail[0].iov_len) ++ iov = &buf->tail[0]; ++ else ++ iov = &buf->head[0]; ++ p = iov->iov_base + iov->iov_len; ++ iov->iov_len += padding; ++ buf->len += padding; ++ memset(p, padding, padding); ++} ++ ++static inline int ++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) ++{ ++ u8 *ptr; ++ u8 pad; ++ int len = buf->len; ++ ++ if (len <= buf->head[0].iov_len) { ++ pad = *(u8 *)(buf->head[0].iov_base + len - 1); ++ goto out; ++ } else ++ len -= buf->head[0].iov_len; ++ if (len <= buf->page_len) { ++ int last = (buf->page_base + len - 1) ++ >>PAGE_CACHE_SHIFT; ++ int offset = (buf->page_base + len - 1) ++ & (PAGE_CACHE_SIZE - 1); ++ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); ++ pad = *(ptr + offset); ++ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); ++ goto out; ++ } else ++ len -= buf->page_len; ++ BUG_ON(len > buf->tail[0].iov_len); ++ pad = *(u8 *)(buf->tail[0].iov_base + len - 1); ++out: ++ if (pad > blocksize) ++ return -EINVAL; ++ buf->len -= pad; ++ return 0; ++} ++ ++static inline void ++make_confounder(char *p, int blocksize) ++{ ++ /* XXX? Is this OK to do on every packet? */ ++ get_random_bytes(p, blocksize); ++} ++ ++/* Assumptions: the head and tail of inbuf are ours to play with. ++ * The pages, however, may be real pages in the page cache and we replace ++ * them with scratch pages from **pages before writing to them. */ ++/* XXX: obviously the above should be documentation of wrap interface, ++ * and shouldn't be in this kerberos-specific file. */ ++ ++/* XXX factor out common code with seal/unseal. 
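For orientation while reading gss_wrap_kerberos() below, this is the token header it assembles, reconstructed from the stores in the function (offsets relative to krb5_hdr):

/*  +0  TOK_ID     KG_TOK_WRAP_MSG (0x0201)
 *  +2  SGN_ALG    e.g. SGN_ALG_DES_MAC_MD5
 *  +4  SEAL_ALG   kctx->sealalg
 *  +6  filler     0xffff
 *  +8  SND_SEQ    encrypted sequence number (krb5_make_seq_num)
 *  +16 SGN_CKSUM  8-byte encrypted MD5 checksum over header+plaintext
 *  +24 confounder (one cipher block), then the padded payload
 */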
*/ ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ int blocksize = 0, plainlen; ++ unsigned char *ptr, *krb5_hdr, *msg_start; ++ s32 now; ++ int headlen; ++ struct page **tmp_pages; ++ u32 seq_send; ++ ++ dprintk("RPC: gss_wrap_kerberos\n"); ++ ++ now = get_seconds(); ++ ++ if (qop != 0) ++ goto out_err; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" ++ " supported\n", kctx->signalg); ++ goto out_err; ++ } ++ if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { ++ dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", ++ kctx->sealalg); ++ goto out_err; ++ } ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ gss_krb5_add_padding(buf, offset, blocksize); ++ BUG_ON((buf->len - offset) % blocksize); ++ plainlen = blocksize + buf->len - offset; ++ ++ headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - ++ (buf->len - offset); ++ ++ ptr = buf->head[0].iov_base + offset; ++ /* shift data to make room for header. */ ++ /* XXX Would be cleverer to encrypt while copying. */ ++ /* XXX bounds checking, slack, etc. */ ++ memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); ++ buf->head[0].iov_len += headlen; ++ buf->len += headlen; ++ BUG_ON((buf->len - offset - headlen) % blocksize); ++ ++ g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); ++ ++ ++ *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); ++ ++ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr - 2; ++ msg_start = krb5_hdr + 24; ++ /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); ++ ++ *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); ++ memset(krb5_hdr + 4, 0xff, 4); ++ *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); ++ ++ make_confounder(msg_start, blocksize); ++ ++ /* XXXJBF: UGH!: */ ++ tmp_pages = buf->pages; ++ buf->pages = pages; ++ if (make_checksum(checksum_type, krb5_hdr, 8, buf, ++ offset + headlen - blocksize, &md5cksum)) ++ goto out_err; ++ buf->pages = tmp_pages; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len)) ++ goto out_err; ++ memcpy(krb5_hdr + 16, ++ md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, ++ KRB5_CKSUM_LENGTH); ++ ++ dprintk("RPC: make_seal_token: cksum data: \n"); ++ print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); ++ break; ++ default: ++ BUG(); ++ } ++ ++ kfree(md5cksum.data); ++ ++ spin_lock(&krb5_seq_lock); ++ seq_send = kctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ ++ /* XXX would probably be more efficient to compute checksum ++ * and encrypt at the same time: */ ++ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ goto out_err; ++ ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, ++ pages)) ++ goto out_err; ++ ++ return ((kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); ++out_err: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return GSS_S_FAILURE; ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ int signalg; ++ int sealalg; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ s32 now; ++ int direction; ++ s32 seqnum; ++ unsigned char *ptr; ++ int bodysize; ++ u32 ret = GSS_S_DEFECTIVE_TOKEN; ++ u8 *data_start; ++ int blocksize; ++ ++ dprintk("RPC: gss_unwrap_kerberos\n"); ++ ++ ptr = (u8 *)buf->head[0].iov_base + offset; ++ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, ++ buf->len - offset)) ++ goto out; ++ ++ if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || ++ (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) ++ goto out; ++ ++ /* XXX sanity-check bodysize?? */ ++ ++ /* get the sign and seal algorithms */ ++ ++ signalg = ptr[0] + (ptr[1] << 8); ++ sealalg = ptr[2] + (ptr[3] << 8); ++ ++ /* Sanity checks */ ++ ++ if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) ++ goto out; ++ ++ if (sealalg == 0xffff) ++ goto out; ++ ++ /* in the current spec, there is only one valid seal algorithm per ++ key type, so a simple comparison is ok */ ++ ++ if (sealalg != kctx->sealalg) ++ goto out; ++ ++ /* there are several mappings of seal algorithms to sign algorithms, ++ but few enough that we can try them all. */ ++ ++ if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || ++ (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || ++ (kctx->sealalg == SEAL_ALG_DES3KD && ++ signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) ++ goto out; ++ ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base)) ++ goto out; ++ ++ /* compute the checksum of the message */ ++ ++ /* initialize the the cksum */ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ ret = make_checksum(checksum_type, ptr - 2, 8, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); ++ if (ret) ++ goto out; ++ ++ ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len); ++ if (ret) ++ goto out; ++ ++ if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { ++ ret = GSS_S_BAD_SIG; ++ goto out; ++ } ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ /* it got through unscathed. Make sure the context is unexpired */ ++ ++ if (qop) ++ *qop = GSS_C_QOP_DEFAULT; ++ ++ now = get_seconds(); ++ ++ ret = GSS_S_CONTEXT_EXPIRED; ++ if (now > kctx->endtime) ++ goto out; ++ ++ /* do sequencing checks */ ++ ++ ret = GSS_S_BAD_SIG; ++ if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, ++ &seqnum))) ++ goto out; ++ ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ goto out; ++ ++ /* Copy the data back to the right position. XXX: Would probably be ++ * better to copy and encrypt at the same time. 
*/ ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ data_start = ptr + 22 + blocksize; ++ *out_offset = data_start - (u8 *)buf->head[0].iov_base; ++ ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ if (gss_krb5_remove_padding(buf, blocksize)) ++ goto out; ++ ++ ret = GSS_S_COMPLETE; ++out: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return ret; ++} +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_crypto.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_crypto.c 2005-04-05 14:49:13.398692408 +0800 +@@ -139,17 +139,91 @@ + sg->length = len; + } + ++static int ++process_xdr_buf(struct xdr_buf *buf, int offset, int len, ++ int (*actor)(struct scatterlist *, void *), void *data) ++{ ++ int i, page_len, thislen, page_offset, ret = 0; ++ struct scatterlist sg[1]; ++ ++ if (offset >= buf->head[0].iov_len) { ++ offset -= buf->head[0].iov_len; ++ } else { ++ thislen = buf->head[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ offset = 0; ++ len -= thislen; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset >= buf->page_len) { ++ offset -= buf->page_len; ++ } else { ++ page_len = buf->page_len - offset; ++ if (page_len > len) ++ page_len = len; ++ len -= page_len; ++ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); ++ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; ++ thislen = PAGE_CACHE_SIZE - page_offset; ++ do { ++ if (thislen > page_len) ++ thislen = page_len; ++ sg->page = buf->pages[i]; ++ sg->offset = page_offset; ++ sg->length = thislen; ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ page_len -= thislen; ++ i++; ++ page_offset = 0; ++ thislen = PAGE_CACHE_SIZE; ++ } while (page_len != 0); ++ offset = 0; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset < buf->tail[0].iov_len) { ++ thislen = buf->tail[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ len -= thislen; ++ } ++ if (len != 0) ++ ret = -EINVAL; ++out: ++ return ret; ++} ++ ++static int ++checksummer(struct scatterlist *sg, void *data) ++{ ++ struct crypto_tfm *tfm = (struct crypto_tfm *)data; ++ ++ crypto_digest_update(tfm, sg, 1); ++ ++ return 0; ++} ++ + /* checksum the plaintext data and hdrlen bytes of the token header */ + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum) ++ int body_offset, struct xdr_netobj *cksum) + { + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + u32 code = GSS_S_FAILURE; +- int len, thislen, offset; +- int i; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: +@@ -169,35 +243,8 @@ + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); +- if (body->head[0].iov_len) { +- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } +- +- len = body->page_len; +- if (len != 0) { +- offset = body->page_base & (PAGE_CACHE_SIZE - 1); +- i = body->page_base >> PAGE_CACHE_SHIFT; +- thislen = PAGE_CACHE_SIZE - offset; +- do { +- if (thislen > len) +- thislen = len; +- sg->page = body->pages[i]; +- sg->offset = offset; +- sg->length = thislen; +- kmap(sg->page); /* XXX kmap_atomic? 
*/ +- crypto_digest_update(tfm, sg, 1); +- kunmap(sg->page); +- len -= thislen; +- i++; +- offset = 0; +- thislen = PAGE_CACHE_SIZE; +- } while(len != 0); +- } +- if (body->tail[0].iov_len) { +- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } ++ process_xdr_buf(body, body_offset, body->len - body_offset, ++ checksummer, tfm); + crypto_digest_final(tfm, cksum->data); + code = 0; + out: +@@ -207,3 +254,154 @@ + } + + EXPORT_SYMBOL(make_checksum); ++ ++struct encryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ int pos; ++ struct xdr_buf *outbuf; ++ struct page **pages; ++ struct scatterlist infrags[4]; ++ struct scatterlist outfrags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++encryptor(struct scatterlist *sg, void *data) ++{ ++ struct encryptor_desc *desc = data; ++ struct xdr_buf *outbuf = desc->outbuf; ++ struct page *in_page; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, ret; ++ int page_pos; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. */ ++ BUG_ON(desc->fragno > 3); ++ desc->infrags[desc->fragno] = *sg; ++ desc->outfrags[desc->fragno] = *sg; ++ ++ page_pos = desc->pos - outbuf->head[0].iov_len; ++ if (page_pos >= 0 && page_pos < outbuf->page_len) { ++ /* pages are not in place: */ ++ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; ++ in_page = desc->pages[i]; ++ } else { ++ in_page = sg->page; ++ } ++ desc->infrags[desc->fragno].page = in_page; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ desc->pos += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->outfrags[0].page = sg->page; ++ desc->outfrags[0].offset = sg->offset + sg->length - fraglen; ++ desc->outfrags[0].length = fraglen; ++ desc->infrags[0] = desc->outfrags[0]; ++ desc->infrags[0].page = in_page; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, ++ struct page **pages) ++{ ++ int ret; ++ struct encryptor_desc desc; ++ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.pos = offset; ++ desc.outbuf = buf; ++ desc.pages = pages; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ ++ ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); ++ return ret; ++} ++ ++EXPORT_SYMBOL(gss_encrypt_xdr_buf); ++ ++struct decryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ struct scatterlist frags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++decryptor(struct scatterlist *sg, void *data) ++{ ++ struct decryptor_desc *desc = data; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, ret; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. 
*/ ++ BUG_ON(desc->fragno > 3); ++ desc->frags[desc->fragno] = *sg; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->frags[0].page = sg->page; ++ desc->frags[0].offset = sg->offset + sg->length - fraglen; ++ desc->frags[0].length = fraglen; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) ++{ ++ struct decryptor_desc desc; ++ ++ /* XXXJBF: */ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); ++} ++ ++EXPORT_SYMBOL(gss_decrypt_xdr_buf); +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_seal.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_seal.c 2005-04-05 14:49:13.402691800 +0800 +@@ -70,24 +70,17 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-static inline int +-gss_krb5_padding(int blocksize, int length) { +- /* Most of the code is block-size independent but in practice we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); +-} ++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; + + u32 + krb5_make_token(struct krb5_ctx *ctx, int qop_req, +- struct xdr_buf *text, struct xdr_netobj *token, +- int toktype) ++ struct xdr_buf *text, struct xdr_netobj *token) + { + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; +- int blocksize = 0, tmsglen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; ++ u32 seq_send; + + dprintk("RPC: gss_krb5_seal\n"); + +@@ -111,21 +104,13 @@ + goto out_err; + } + +- if (toktype == KG_TOK_WRAP_MSG) { +- blocksize = crypto_tfm_alg_blocksize(ctx->enc); +- tmsglen = blocksize + text->len +- + gss_krb5_padding(blocksize, blocksize + text->len); +- } else { +- tmsglen = 0; +- } +- +- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); ++ token->len = g_token_size(&ctx->mech_used, 22); + + ptr = token->data; +- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); ++ g_make_token_header(&ctx->mech_used, 22, &ptr); + +- *ptr++ = (unsigned char) ((toktype>>8)&0xff); +- *ptr++ = (unsigned char) (toktype&0xff); ++ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; +@@ -133,17 +118,9 @@ + + *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); +- if (toktype == KG_TOK_WRAP_MSG) +- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX removing support for now */ +- goto out_err; +- } else { /* Sign only. 
*/ +- if (make_checksum(checksum_type, krb5_hdr, 8, text, +- &md5cksum)) ++ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) + goto out_err; +- } + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: +@@ -163,12 +140,14 @@ + + kfree(md5cksum.data); + ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, +- ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + +- ctx->seq_send++; +- + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); + out_err: + if (md5cksum.data) kfree(md5cksum.data); +Index: linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_pseudoflavors.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_pseudoflavors.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,237 +0,0 @@ +-/* +- * linux/net/sunrpc/gss_union.c +- * +- * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic code +- * +- * Copyright (c) 2001 The Regents of the University of Michigan. +- * All rights reserved. +- * +- * Andy Adamson +- * +- */ +- +-/* +- * Copyright 1993 by OpenVision Technologies, Inc. +- * +- * Permission to use, copy, modify, distribute, and sell this software +- * and its documentation for any purpose is hereby granted without fee, +- * provided that the above copyright notice appears in all copies and +- * that both that copyright notice and this permission notice appear in +- * supporting documentation, and that the name of OpenVision not be used +- * in advertising or publicity pertaining to distribution of the software +- * without specific, written prior permission. OpenVision makes no +- * representations about the suitability of this software for any +- * purpose. It is provided "as is" without express or implied warranty. +- * +- * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, +- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO +- * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR +- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF +- * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +- * PERFORMANCE OF THIS SOFTWARE. +- */ +- +-#include +-#include +-#include +-#include +-#include +- +-#ifdef RPC_DEBUG +-# define RPCDBG_FACILITY RPCDBG_AUTH +-#endif +- +-static LIST_HEAD(registered_triples); +-static spinlock_t registered_triples_lock = SPIN_LOCK_UNLOCKED; +- +-/* The following must be called with spinlock held: */ +-static struct sup_sec_triple * +-do_lookup_triple_by_pseudoflavor(u32 pseudoflavor) +-{ +- struct sup_sec_triple *pos, *triple = NULL; +- +- list_for_each_entry(pos, ®istered_triples, triples) { +- if (pos->pseudoflavor == pseudoflavor) { +- triple = pos; +- break; +- } +- } +- return triple; +-} +- +-/* XXX Need to think about reference counting of triples and of mechs. +- * Currently we do no reference counting of triples, and I think that's +- * probably OK given the reference counting on mechs, but there's probably +- * a better way to do all this. 
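The file deleted here kept a global list of (pseudoflavor, mech, qop, service) triples behind a single spinlock, with lookups open-coded under the lock and duplicate registrations refused. The shape of that registry pattern as a self-contained userspace sketch, with a pthread mutex standing in for the spinlock and struct triple pared down to the keys:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct triple {
	unsigned int pseudoflavor, qop, service;
	struct triple *next;
};

static struct triple *registered;
static pthread_mutex_t registered_lock = PTHREAD_MUTEX_INITIALIZER;

/* must be called with registered_lock held, like
 * do_lookup_triple_by_pseudoflavor() above */
static struct triple *do_lookup(unsigned int pseudoflavor)
{
	struct triple *t;

	for (t = registered; t; t = t->next)
		if (t->pseudoflavor == pseudoflavor)
			return t;
	return NULL;
}

static int register_triple(unsigned int pf, unsigned int qop, unsigned int svc)
{
	struct triple *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->pseudoflavor = pf;
	t->qop = qop;
	t->service = svc;
	pthread_mutex_lock(&registered_lock);
	if (do_lookup(pf)) {		/* refuse duplicates, as above */
		pthread_mutex_unlock(&registered_lock);
		free(t);
		return -1;
	}
	t->next = registered;
	registered = t;
	pthread_mutex_unlock(&registered_lock);
	return 0;
}

int main(void)
{
	printf("%d\n", register_triple(390003, 0, 1));	/* 0: registered */
	printf("%d\n", register_triple(390003, 0, 1));	/* -1: duplicate */
	return 0;
}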
*/ +- +-int +-gss_register_triple(u32 pseudoflavor, struct gss_api_mech *mech, +- u32 qop, u32 service) +-{ +- struct sup_sec_triple *triple; +- +- if (!(triple = kmalloc(sizeof(*triple), GFP_KERNEL))) { +- printk("Alloc failed in gss_register_triple"); +- goto err; +- } +- triple->pseudoflavor = pseudoflavor; +- triple->mech = gss_mech_get_by_OID(&mech->gm_oid); +- triple->qop = qop; +- triple->service = service; +- +- spin_lock(®istered_triples_lock); +- if (do_lookup_triple_by_pseudoflavor(pseudoflavor)) { +- printk(KERN_WARNING "RPC: Registered pseudoflavor %d again\n", +- pseudoflavor); +- goto err_unlock; +- } +- list_add(&triple->triples, ®istered_triples); +- spin_unlock(®istered_triples_lock); +- dprintk("RPC: registered pseudoflavor %d\n", pseudoflavor); +- +- return 0; +- +-err_unlock: +- kfree(triple); +- spin_unlock(®istered_triples_lock); +-err: +- return -1; +-} +- +-int +-gss_unregister_triple(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- if (!(triple = do_lookup_triple_by_pseudoflavor(pseudoflavor))) { +- spin_unlock(®istered_triples_lock); +- printk("Can't unregister unregistered pseudoflavor %d\n", +- pseudoflavor); +- return -1; +- } +- list_del(&triple->triples); +- spin_unlock(®istered_triples_lock); +- gss_mech_put(triple->mech); +- kfree(triple); +- return 0; +- +-} +- +-void +-print_sec_triple(struct xdr_netobj *oid,u32 qop,u32 service) +-{ +- dprintk("RPC: print_sec_triple:\n"); +- dprintk(" oid_len %d\n oid :\n",oid->len); +- print_hexl((u32 *)oid->data,oid->len,0); +- dprintk(" qop %d\n",qop); +- dprintk(" service %d\n",service); +-} +- +-/* Function: gss_get_cmp_triples +- * +- * Description: search sec_triples for a matching security triple +- * return pseudoflavor if match, else 0 +- * (Note that 0 is a valid pseudoflavor, but not for any gss pseudoflavor +- * (0 means auth_null), so this shouldn't cause confusion.) +- */ +-u32 +-gss_cmp_triples(u32 oid_len, char *oid_data, u32 qop, u32 service) +-{ +- struct sup_sec_triple *triple; +- u32 pseudoflavor = 0; +- struct xdr_netobj oid; +- +- oid.len = oid_len; +- oid.data = oid_data; +- +- dprintk("RPC: gss_cmp_triples\n"); +- print_sec_triple(&oid,qop,service); +- +- spin_lock(®istered_triples_lock); +- list_for_each_entry(triple, ®istered_triples, triples) { +- if((g_OID_equal(&oid, &triple->mech->gm_oid)) +- && (qop == triple->qop) +- && (service == triple->service)) { +- pseudoflavor = triple->pseudoflavor; +- break; +- } +- } +- spin_unlock(®istered_triples_lock); +- dprintk("RPC: gss_cmp_triples return %d\n", pseudoflavor); +- return pseudoflavor; +-} +- +-u32 +-gss_get_pseudoflavor(struct gss_ctx *ctx, u32 qop, u32 service) +-{ +- return gss_cmp_triples(ctx->mech_type->gm_oid.len, +- ctx->mech_type->gm_oid.data, +- qop, service); +-} +- +-/* Returns nonzero iff the given pseudoflavor is in the supported list. +- * (Note that without incrementing a reference count or anything, this +- * doesn't give any guarantees.) */ +-int +-gss_pseudoflavor_supported(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- return (triple ? 
1 : 0); +-} +- +-u32 +-gss_pseudoflavor_to_service(u32 pseudoflavor) +-{ +- struct sup_sec_triple *triple; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- if (!triple) { +- dprintk("RPC: gss_pseudoflavor_to_service called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return 0; +- } +- return triple->service; +-} +- +-struct gss_api_mech * +-gss_pseudoflavor_to_mech(u32 pseudoflavor) { +- struct sup_sec_triple *triple; +- struct gss_api_mech *mech = NULL; +- +- spin_lock(®istered_triples_lock); +- triple = do_lookup_triple_by_pseudoflavor(pseudoflavor); +- spin_unlock(®istered_triples_lock); +- if (triple) +- mech = gss_mech_get(triple->mech); +- else +- dprintk("RPC: gss_pseudoflavor_to_mech called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return mech; +-} +- +-int +-gss_pseudoflavor_to_mechOID(u32 pseudoflavor, struct xdr_netobj * oid) +-{ +- struct gss_api_mech *mech; +- +- mech = gss_pseudoflavor_to_mech(pseudoflavor); +- if (!mech) { +- dprintk("RPC: gss_pseudoflavor_to_mechOID called with unsupported pseudoflavor %d\n", +- pseudoflavor); +- return -1; +- } +- oid->len = mech->gm_oid.len; +- if (!(oid->data = kmalloc(oid->len, GFP_KERNEL))) +- return -1; +- memcpy(oid->data, mech->gm_oid.data, oid->len); +- gss_mech_put(mech); +- return 0; +-} +Index: linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/svcauth_gss.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/svcauth_gss.c 2005-04-05 14:49:13.407691040 +0800 +@@ -37,6 +37,7 @@ + * + */ + ++#include + #include + #include + #include +@@ -78,7 +79,6 @@ + + static struct cache_head *rsi_table[RSI_HASHMAX]; + static struct cache_detail rsi_cache; +-static struct rsi *rsi_lookup(struct rsi *item, int set); + + static void rsi_free(struct rsi *rsii) + { +@@ -125,38 +125,6 @@ + return dup_to_netobj(dst, src->data, src->len); + } + +-static inline void rsi_init(struct rsi *new, struct rsi *item) +-{ +- new->out_handle.data = NULL; +- new->out_handle.len = 0; +- new->out_token.data = NULL; +- new->out_token.len = 0; +- new->in_handle.len = item->in_handle.len; +- item->in_handle.len = 0; +- new->in_token.len = item->in_token.len; +- item->in_token.len = 0; +- new->in_handle.data = item->in_handle.data; +- item->in_handle.data = NULL; +- new->in_token.data = item->in_token.data; +- item->in_token.data = NULL; +-} +- +-static inline void rsi_update(struct rsi *new, struct rsi *item) +-{ +- BUG_ON(new->out_handle.data || new->out_token.data); +- new->out_handle.len = item->out_handle.len; +- item->out_handle.len = 0; +- new->out_token.len = item->out_token.len; +- item->out_token.len = 0; +- new->out_handle.data = item->out_handle.data; +- item->out_handle.data = NULL; +- new->out_token.data = item->out_token.data; +- item->out_token.data = NULL; +- +- new->major_status = item->major_status; +- new->minor_status = item->minor_status; +-} +- + static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +@@ -168,6 +136,75 @@ + (*bpp)[-1] = '\n'; + } + ++static inline int ++gssd_reply(struct rsi *item) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ write_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if 
(rsi_match(tmp, item)) { ++ cache_get(&tmp->h); ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ if (test_bit(CACHE_VALID, &tmp->h.flags)) { ++ write_unlock(&rsi_cache.hash_lock); ++ rsi_put(&tmp->h, &rsi_cache); ++ return -EINVAL; ++ } ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *hp; ++ *hp = &item->h; ++ rsi_cache.entries++; ++ set_bit(CACHE_VALID, &item->h.flags); ++ item->h.last_refresh = get_seconds(); ++ write_unlock(&rsi_cache.hash_lock); ++ cache_fresh(&rsi_cache, &tmp->h, 0); ++ rsi_put(&tmp->h, &rsi_cache); ++ return 0; ++ } ++ } ++ write_unlock(&rsi_cache.hash_lock); ++ return -EINVAL; ++} ++ ++static inline struct rsi * ++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ read_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if (rsi_match(tmp, item)) { ++ if (!test_bit(CACHE_VALID, &tmp->h.flags)) { ++ read_unlock(&rsi_cache.hash_lock); ++ return NULL; ++ } ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ read_unlock(&rsi_cache.hash_lock); ++ return tmp; ++ } ++ } ++ cache_get(&item->h); ++ item->h.next = *head; ++ *head = &item->h; ++ rsi_cache.entries++; ++ read_unlock(&rsi_cache.hash_lock); ++ cache_get(&item->h); ++ if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle)) ++ return NULL; ++ return item; ++} + + static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +@@ -176,17 +213,22 @@ + char *buf = mesg; + char *ep; + int len; +- struct rsi rsii, *rsip = NULL; ++ struct rsi *rsii; + time_t expiry; + int status = -EINVAL; + +- memset(&rsii, 0, sizeof(rsii)); ++ rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); ++ if (!rsii) ++ return -ENOMEM; ++ memset(rsii, 0, sizeof(*rsii)); ++ cache_init(&rsii->h); ++ + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_handle, buf, len)) ++ if (dup_to_netobj(&rsii->in_handle, buf, len)) + goto out; + + /* token */ +@@ -195,10 +237,9 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_token, buf, len)) ++ if (dup_to_netobj(&rsii->in_token, buf, len)) + goto out; + +- rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; +@@ -212,13 +253,13 @@ + if (len == 0) { + goto out; + } else { +- rsii.major_status = simple_strtoul(buf, &ep, 10); ++ rsii->major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; +- rsii.minor_status = simple_strtoul(buf, &ep, 10); ++ rsii->minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + +@@ -227,7 +268,7 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_handle, buf, len)) ++ if (dup_to_netobj(&rsii->out_handle, buf, len)) + goto out; + + /* out_token */ +@@ -236,16 +277,14 @@ + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_token, buf, len)) ++ if (dup_to_netobj(&rsii->out_token, buf, len)) + goto out; + } +- rsii.h.expiry_time = expiry; +- rsip = rsi_lookup(&rsii, 1); +- status = 0; ++ rsii->h.expiry_time = expiry; ++ status = gssd_reply(rsii); + out: +- rsi_free(&rsii); +- if (rsip) +- rsi_put(&rsip->h, &rsi_cache); ++ if (rsii) ++ rsi_put(&rsii->h, &rsi_cache); + return status; + } + +@@ -258,8 +297,6 @@ + .cache_parse = 
rsi_parse, + }; + +-static DefineSimpleCacheLookup(rsi, 0) +- + /* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. +@@ -292,7 +329,6 @@ + + static struct cache_head *rsc_table[RSC_HASHMAX]; + static struct cache_detail rsc_cache; +-static struct rsc *rsc_lookup(struct rsc *item, int set); + + static void rsc_free(struct rsc *rsci) + { +@@ -325,26 +361,46 @@ + return netobj_equal(&new->handle, &tmp->handle); + } + +-static inline void +-rsc_init(struct rsc *new, struct rsc *tmp) ++static struct rsc *rsc_lookup(struct rsc *item, int set) + { +- new->handle.len = tmp->handle.len; +- tmp->handle.len = 0; +- new->handle.data = tmp->handle.data; +- tmp->handle.data = NULL; +- new->mechctx = NULL; +- new->cred.cr_group_info = NULL; +-} +- +-static inline void +-rsc_update(struct rsc *new, struct rsc *tmp) +-{ +- new->mechctx = tmp->mechctx; +- tmp->mechctx = NULL; +- memset(&new->seqdata, 0, sizeof(new->seqdata)); +- spin_lock_init(&new->seqdata.sd_lock); +- new->cred = tmp->cred; +- tmp->cred.cr_group_info = NULL; ++ struct rsc *tmp = NULL; ++ struct cache_head **hp, **head; ++ head = &rsc_cache.hash_table[rsc_hash(item)]; ++ ++ if (set) ++ write_lock(&rsc_cache.hash_lock); ++ else ++ read_lock(&rsc_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsc, h); ++ if (!rsc_match(tmp, item)) ++ continue; ++ cache_get(&tmp->h); ++ if (!set) ++ goto out_noset; ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ rsc_put(&tmp->h, &rsc_cache); ++ goto out_set; ++ } ++ /* Didn't find anything */ ++ if (!set) ++ goto out_nada; ++ rsc_cache.entries++; ++out_set: ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *head; ++ *head = &item->h; ++ write_unlock(&rsc_cache.hash_lock); ++ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time); ++ cache_get(&item->h); ++ return item; ++out_nada: ++ tmp = NULL; ++out_noset: ++ read_unlock(&rsc_cache.hash_lock); ++ return tmp; + } + + static int rsc_parse(struct cache_detail *cd, +@@ -353,19 +409,22 @@ + /* contexthandle expiry [ uid gid N mechname ...mechdata... 
] */ + char *buf = mesg; + int len, rv; +- struct rsc rsci, *rscp = NULL; ++ struct rsc *rsci, *res = NULL; + time_t expiry; + int status = -EINVAL; + +- memset(&rsci, 0, sizeof(rsci)); ++ rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); ++ if (!rsci) ++ return -ENOMEM; ++ memset(rsci, 0, sizeof(*rsci)); ++ cache_init(&rsci->h); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsci.handle, buf, len)) ++ if (dup_to_netobj(&rsci->handle, buf, len)) + goto out; + +- rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; +@@ -373,26 +432,26 @@ + goto out; + + /* uid, or NEGATIVE */ +- rv = get_int(&mesg, &rsci.cred.cr_uid); ++ rv = get_int(&mesg, &rsci->cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) +- set_bit(CACHE_NEGATIVE, &rsci.h.flags); ++ set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else { + int N, i; + struct gss_api_mech *gm; + struct xdr_netobj tmp_buf; + + /* gid */ +- if (get_int(&mesg, &rsci.cred.cr_gid)) ++ if (get_int(&mesg, &rsci->cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; +- rsci.cred.cr_group_info = groups_alloc(N); +- if (rsci.cred.cr_group_info == NULL) ++ rsci->cred.cr_group_info = groups_alloc(N); ++ if (rsci->cred.cr_group_info == NULL) + goto out; + + /* gid's */ +@@ -401,7 +460,7 @@ + gid_t gid; + if (get_int(&mesg, &gid)) + goto out; +- GROUP_AT(rsci.cred.cr_group_info, i) = gid; ++ GROUP_AT(rsci->cred.cr_group_info, i) = gid; + } + + /* mech name */ +@@ -422,19 +481,21 @@ + } + tmp_buf.len = len; + tmp_buf.data = buf; +- if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) { ++ if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) { + gss_mech_put(gm); + goto out; + } + gss_mech_put(gm); + } +- rsci.h.expiry_time = expiry; +- rscp = rsc_lookup(&rsci, 1); ++ rsci->h.expiry_time = expiry; ++ spin_lock_init(&rsci->seqdata.sd_lock); ++ res = rsc_lookup(rsci, 1); ++ rsc_put(&res->h, &rsc_cache); ++ rsci = NULL; + status = 0; + out: +- rsc_free(&rsci); +- if (rscp) +- rsc_put(&rscp->h, &rsc_cache); ++ if (rsci) ++ rsc_put(&rsci->h, &rsc_cache); + return status; + } + +@@ -446,19 +507,14 @@ + .cache_parse = rsc_parse, + }; + +-static DefineSimpleCacheLookup(rsc, 0); +- + struct rsc * + gss_svc_searchbyctx(struct xdr_netobj *handle) + { + struct rsc rsci; + struct rsc *found; + +- memset(&rsci, 0, sizeof(rsci)); +- if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) +- return NULL; ++ rsci.handle = *handle; + found = rsc_lookup(&rsci, 0); +- rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) +@@ -721,6 +777,45 @@ + return stat; + } + ++static int ++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) ++{ ++ int stat = -EINVAL; ++ int out_offset; ++ u32 * lenp; ++ u32 priv_len, maj_stat; ++ int saved_len; ++ ++ lenp = buf->head[0].iov_base; ++ priv_len = ntohl(svc_getu32(&buf->head[0])); ++ if (priv_len > buf->len) /* XXXJBF: wrong check */ ++ goto out; ++ /* XXXJBF: bizarre hack: to handle revisits (and not decrypt ++ * twice), the first time through we write an offset ++ * telling us where to skip to find the already-decrypted data */ ++ if (rqstp->rq_deferred) { ++ buf->head[0].iov_base += priv_len; ++ buf->head[0].iov_len -= priv_len; ++ return 0; ++ } ++ saved_len = buf->len; /* XXX HACK */ ++ buf->len = priv_len; ++ maj_stat = gss_unwrap(ctx, NULL, 0, buf, &out_offset); ++ buf->len 
= saved_len; ++ buf->head[0].iov_base += out_offset; ++ buf->head[0].iov_len -= out_offset; ++ BUG_ON(buf->head[0].iov_len <= 0); ++ if (maj_stat != GSS_S_COMPLETE) ++ goto out; ++ if (ntohl(svc_getu32(&buf->head[0])) != seq) ++ goto out; ++ /* XXXJBF: see "bizarre hack", above. */ ++ *lenp = htonl(out_offset + 4); ++ stat = 0; ++out: ++ return stat; ++} ++ + struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; +@@ -730,6 +825,19 @@ + struct rsc *rsci; + }; + ++static int ++svcauth_gss_set_client(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *svcdata = rqstp->rq_auth_data; ++ struct rsc *rsci = svcdata->rsci; ++ struct rpc_gss_wire_cred *gc = &svcdata->clcred; ++ ++ rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); ++ if (rqstp->rq_client == NULL) ++ return SVC_DENIED; ++ return SVC_OK; ++} ++ + /* + * Accept an rpcsec packet. + * If context establishment, punt to user space +@@ -748,7 +856,7 @@ + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; +- struct rsi *rsip, rsikey; ++ struct rsi *rsip, *rsikey = NULL; + u32 *rpcstart; + u32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; +@@ -841,30 +949,23 @@ + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + goto auth_err; +- memset(&rsikey, 0, sizeof(rsikey)); +- if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) ++ rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL); ++ if (!rsikey) ++ goto drop; ++ memset(rsikey, 0, sizeof(*rsikey)); ++ cache_init(&rsikey->h); ++ if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx)) + goto drop; + *authp = rpc_autherr_badverf; +- if (svc_safe_getnetobj(argv, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (svc_safe_getnetobj(argv, &tmpobj)) + goto auth_err; +- } +- if (dup_netobj(&rsikey.in_token, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (dup_netobj(&rsikey->in_token, &tmpobj)) + goto drop; +- } + +- rsip = rsi_lookup(&rsikey, 0); +- rsi_free(&rsikey); +- if (!rsip) { +- goto drop; +- } +- switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { +- case -EAGAIN: ++ rsip = gssd_upcall(rsikey, rqstp); ++ if (!rsip) + goto drop; +- case -ENOENT: +- goto drop; +- case 0: ++ else { + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + goto drop; +@@ -893,11 +994,6 @@ + svc_putu32(resv, rpc_success); + goto complete; + case RPC_GSS_PROC_DATA: +- *authp = rpc_autherr_badcred; +- rqstp->rq_client = +- find_gss_auth_domain(rsci->mechctx, gc->gc_svc); +- if (rqstp->rq_client == NULL) +- goto auth_err; + *authp = rpcsec_gsserr_ctxproblem; + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; +@@ -911,6 +1007,15 @@ + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; ++ /* placeholders for length and seq. number: */ ++ svcdata->body_start = resv->iov_base + resv->iov_len; ++ svc_putu32(resv, 0); ++ svc_putu32(resv, 0); ++ break; ++ case RPC_GSS_SVC_PRIVACY: ++ if (unwrap_priv_data(rqstp, &rqstp->rq_arg, ++ gc->gc_seq, rsci->mechctx)) ++ goto auth_err; + svcdata->rsci = rsci; + cache_get(&rsci->h); + /* placeholders for length and seq. 
number: */ +@@ -918,11 +1023,11 @@ + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; +- case RPC_GSS_SVC_PRIVACY: +- /* currently unsupported */ + default: + goto auth_err; + } ++ svcdata->rsci = rsci; ++ cache_get(&rsci->h); + ret = SVC_OK; + goto out; + } +@@ -937,13 +1042,15 @@ + drop: + ret = SVC_DROP; + out: ++ if (rsikey) ++ rsi_put(&rsikey->h, &rsi_cache); + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return ret; + } + +-static int +-svcauth_gss_release(struct svc_rqst *rqstp) ++static inline int ++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) + { + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; +@@ -955,10 +1062,160 @@ + int integ_offset, integ_len; + int stat = -EINVAL; + ++ p = gsd->body_start; ++ gsd->body_start = NULL; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! */ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; ++ integ_len = resbuf->len - integ_offset; ++ BUG_ON(integ_len % 4); ++ *p++ = htonl(integ_len); ++ *p++ = htonl(gc->gc_seq); ++ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, ++ integ_len)) ++ BUG(); ++ if (resbuf->page_len == 0 ++ && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE ++ < PAGE_SIZE) { ++ BUG_ON(resbuf->tail[0].iov_len); ++ /* Use head for everything */ ++ resv = &resbuf->head[0]; ++ } else if (resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ svc_take_page(rqstp); ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ resv = &resbuf->tail[0]; ++ } else { ++ resv = &resbuf->tail[0]; ++ } ++ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; ++ if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ goto out_err; ++ svc_putu32(resv, htonl(mic.len)); ++ memset(mic.data + mic.len, 0, ++ round_up_to_quad(mic.len) - mic.len); ++ resv->iov_len += XDR_QUADLEN(mic.len) << 2; ++ /* not strictly required: */ ++ resbuf->len += XDR_QUADLEN(mic.len) << 2; ++ BUG_ON(resv->iov_len > PAGE_SIZE); ++out: ++ stat = 0; ++out_err: ++ return stat; ++} ++ ++/* XXXJBF: Look for chances to share code with client */ ++/* XXXJBF: Do we need to preallocate these pages somehow? E.g. see ++ * buffer size calculations in svcsock.c */ ++/* XXXJBF: how does reference counting on pages work? 
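The MIC appended by svcauth_gss_wrap_resp_integ() above is padded out to a four-byte XDR boundary, with the slack zeroed so the wire bytes are deterministic. A worked check of that arithmetic, assuming XDR_QUADLEN(n) is ((n) + 3) >> 2 as in the SunRPC headers and round_up_to_quad() is the equivalent byte count:

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define XDR_QUADLEN(n)      (((n) + 3) >> 2)	/* assumed, per sunrpc/xdr.h */
#define round_up_to_quad(n) (XDR_QUADLEN(n) << 2)

int main(void)
{
	unsigned char buf[32];
	unsigned int mic_len = 5;	/* a 5-byte token, for illustration */

	memset(buf, 0xAA, mic_len);	/* pretend MIC bytes */
	/* zero the slack, as the memset after gss_get_mic() does above: */
	memset(buf + mic_len, 0, round_up_to_quad(mic_len) - mic_len);

	assert(round_up_to_quad(5) == 8);	/* 5 -> next quad is 8 */
	assert(round_up_to_quad(8) == 8);	/* already aligned */
	printf("padded MIC occupies %u bytes\n", round_up_to_quad(mic_len));
	return 0;
}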
*/ ++static struct page ** ++svc_alloc_enc_pages(struct xdr_buf *buf) ++{ ++ struct page **ret; ++ int last, i; ++ ++ if (buf->page_len == 0) ++ return NULL; ++ BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT); ++ last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL); ++ if (!ret) ++ goto out; ++ for (i = 0; i<= last; i++) { ++ ret[i] = alloc_page(GFP_KERNEL); ++ if (ret[i] == NULL) ++ goto out_free; ++ } ++out: ++ return ret; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(ret[i]); ++ } ++ return NULL; ++} ++ ++static inline int ++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ struct page **inpages; ++ u32 *p; ++ int offset, *len; ++ int pad; ++ int stat = -EINVAL; ++ ++ p = gsd->body_start; ++ gsd->body_start = NULL; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! */ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ len = p++; ++ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; ++ *p++ = htonl(gc->gc_seq); ++ stat = -ENOMEM; ++ inpages = resbuf->pages; ++ /* XXXJBF: huge memory leaks here: allocated pages probably aren't ++ * freed, and neither is memory used to hold page array. */ ++ resbuf->pages = svc_alloc_enc_pages(resbuf); ++ if (resbuf->page_len && !resbuf->pages) ++ goto out_err; /* XXX sleep and retry? Reserve ahead of time ++ and BUG_ON? */ ++ if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ {int i = svc_take_page(rqstp); BUG_ON(i); } ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ } ++ /* XXX: Will svc code attempt to free stuff in xdr_buf->pages? ++ * Or can we leave it in any old state on error?? */ ++ stat = -EINVAL; ++ if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset, ++ resbuf, inpages)) ++ goto out_err; ++ *len = htonl(resbuf->len - offset); ++ pad = 3 - ((resbuf->len - offset - 1)&3); ++ p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len); ++ memset(p, 0, pad); ++ resbuf->tail[0].iov_len += pad; ++out: ++ return 0; ++out_err: ++ return stat; ++} ++ ++static int ++svcauth_gss_release(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ int stat = -EINVAL; ++ + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ +- if (gsd->body_start == 0) ++ if (gsd->body_start == NULL) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len +@@ -967,55 +1224,15 @@ + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: +- p = gsd->body_start; +- gsd->body_start = NULL; +- /* move accept_stat to right place: */ +- memcpy(p, p + 2, 4); +- /* don't wrap in failure case: */ +- /* Note: counting on not getting here if call was not even +- * accepted! 
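svc_alloc_enc_pages() just above follows the usual all-or-nothing rollback idiom: allocate the pointer array, then each page, and on a mid-loop failure walk the index back down to free what was already handed out (note the array itself is not freed on that path, matching the leak XXX nearby). The idiom in plain userspace C, with malloc standing in for alloc_page:

#include <stdlib.h>

/* Allocate n fixed-size buffers, all or nothing; on a mid-loop failure,
 * walk i back down and free what was already allocated. */
static void **alloc_bufs(int n, size_t sz)
{
	void **ret;
	int i;

	ret = malloc(n * sizeof(*ret));
	if (!ret)
		return NULL;
	for (i = 0; i < n; i++) {
		ret[i] = malloc(sz);
		if (!ret[i])
			goto out_free;
	}
	return ret;
out_free:
	for (i--; i >= 0; i--)
		free(ret[i]);
	free(ret);	/* unlike the patch's out_free, release the array too */
	return NULL;
}

int main(void)
{
	void **bufs = alloc_bufs(4, 4096);
	int i;

	if (bufs) {
		for (i = 0; i < 4; i++)
			free(bufs[i]);
		free(bufs);
	}
	return 0;
}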
*/ +- if (*p != rpc_success) { +- resbuf->head[0].iov_len -= 2 * 4; +- goto out; +- } +- p++; +- integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; +- integ_len = resbuf->len - integ_offset; +- BUG_ON(integ_len % 4); +- *p++ = htonl(integ_len); +- *p++ = htonl(gc->gc_seq); +- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, +- integ_len)) +- BUG(); +- if (resbuf->page_len == 0 +- && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE +- < PAGE_SIZE) { +- BUG_ON(resbuf->tail[0].iov_len); +- /* Use head for everything */ +- resv = &resbuf->head[0]; +- } else if (resbuf->tail[0].iov_base == NULL) { +- /* copied from nfsd4_encode_read */ +- svc_take_page(rqstp); +- resbuf->tail[0].iov_base = page_address(rqstp +- ->rq_respages[rqstp->rq_resused-1]); +- rqstp->rq_restailpage = rqstp->rq_resused-1; +- resbuf->tail[0].iov_len = 0; +- resv = &resbuf->tail[0]; +- } else { +- resv = &resbuf->tail[0]; +- } +- mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; +- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ stat = svcauth_gss_wrap_resp_integ(rqstp); ++ if (stat) + goto out_err; +- svc_putu32(resv, htonl(mic.len)); +- memset(mic.data + mic.len, 0, +- round_up_to_quad(mic.len) - mic.len); +- resv->iov_len += XDR_QUADLEN(mic.len) << 2; +- /* not strictly required: */ +- resbuf->len += XDR_QUADLEN(mic.len) << 2; +- BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: ++ stat = svcauth_gss_wrap_resp_priv(rqstp); ++ if (stat) ++ goto out_err; ++ break; + default: + goto out_err; + } +@@ -1052,6 +1269,7 @@ + .accept = svcauth_gss_accept, + .release = svcauth_gss_release, + .domain_release = svcauth_gss_domain_release, ++ .set_client = svcauth_gss_set_client, + }; + + int +Index: linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/sunrpcgss_syms.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/sunrpcgss_syms.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,37 +0,0 @@ +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +- +-/* svcauth_gss.c: */ +-EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor); +- +-/* registering gss mechanisms to the mech switching code: */ +-EXPORT_SYMBOL(gss_mech_register); +-EXPORT_SYMBOL(gss_mech_unregister); +-EXPORT_SYMBOL(gss_mech_get); +-EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor); +-EXPORT_SYMBOL(gss_mech_get_by_name); +-EXPORT_SYMBOL(gss_mech_put); +-EXPORT_SYMBOL(gss_pseudoflavor_to_service); +-EXPORT_SYMBOL(gss_service_to_auth_domain_name); +- +-/* generic functionality in gss code: */ +-EXPORT_SYMBOL(g_make_token_header); +-EXPORT_SYMBOL(g_verify_token_header); +-EXPORT_SYMBOL(g_token_size); +-EXPORT_SYMBOL(make_checksum); +-EXPORT_SYMBOL(krb5_encrypt); +-EXPORT_SYMBOL(krb5_decrypt); +- +-/* debug */ +-EXPORT_SYMBOL(print_hexl); +Index: linux-2.6.10/net/sunrpc/auth_gss/Makefile +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/Makefile 2004-12-25 05:34:33.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/Makefile 2005-04-05 14:49:13.408690888 +0800 +@@ -10,7 +10,7 @@ + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o + + obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +Index: 
linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/gss_krb5_mech.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/gss_krb5_mech.c 2005-04-05 14:49:13.400692104 +0800 +@@ -182,6 +182,7 @@ + kfree(kctx); + } + ++/* XXX the following wrappers have become pointless; kill them. */ + static u32 + gss_verify_mic_kerberos(struct gss_ctx *ctx, + struct xdr_buf *message, +@@ -191,8 +192,7 @@ + int qop_state; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, +- KG_TOK_MIC_MSG); ++ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); + if (!maj_stat && qop_state) + *qstate = qop_state; + +@@ -208,7 +208,7 @@ + u32 err = 0; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); ++ err = krb5_make_token(kctx, qop, message, mic_token); + + dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); + +@@ -219,6 +219,8 @@ + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, ++ .gss_wrap = gss_wrap_kerberos, ++ .gss_unwrap = gss_unwrap_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + }; + +@@ -233,6 +235,11 @@ + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, ++ [2] = { ++ .pseudoflavor = RPC_AUTH_GSS_KRB5P, ++ .service = RPC_GSS_SVC_PRIVACY, ++ .name = "krb5p", ++ }, + }; + + static struct gss_api_mech gss_kerberos_mech = { +Index: linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/auth_gss/auth_gss.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/auth_gss/auth_gss.c 2005-04-05 14:49:13.404691496 +0800 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -480,12 +481,14 @@ + if (!cred) + goto err; + if (gss_err) +- cred->cr_flags |= RPCAUTH_CRED_DEAD; ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else + gss_cred_set_ctx(cred, ctx); + spin_lock(&gss_auth->lock); + gss_msg = __gss_find_upcall(gss_auth, acred.uid); + if (gss_msg) { ++ if (gss_err) ++ gss_msg->msg.errno = -EACCES; + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); +@@ -740,7 +743,9 @@ + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, + &verf_buf, &mic); +- if(maj_stat != 0){ ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) { ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ } else if (maj_stat != 0) { + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; + } +@@ -779,6 +784,7 @@ + struct xdr_netobj mic; + u32 flav,len; + u32 service; ++ u32 maj_stat; + + dprintk("RPC: %4u gss_validate\n", task->tk_pid); + +@@ -794,8 +800,11 @@ + mic.data = (u8 *)p; + mic.len = len; + +- if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state)) +- goto out_bad; ++ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ if (maj_stat) ++ goto out_bad; + service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type, + gss_cred->gc_flavor); + switch (service) { +@@ -807,6 +816,11 @@ + /* verifier data, flavor, length, length, sequence number: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; + break; ++ case 
RPC_GSS_SVC_PRIVACY: ++ /* XXXJBF: Ugh. Going for a wild overestimate. ++ * Need some info from krb5 layer? */ ++ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32; ++ break; + default: + goto out_bad; + } +@@ -821,11 +835,10 @@ + } + + static inline int +-gss_wrap_req_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t encode, void *rqstp, u32 *p, void *obj) ++gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) + { +- struct rpc_rqst *req = (struct rpc_rqst *)rqstp; +- struct xdr_buf *snd_buf = &req->rq_snd_buf; ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + u32 *integ_len = NULL; + struct xdr_netobj mic; +@@ -836,7 +849,7 @@ + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; +- *p++ = htonl(req->rq_seqno); ++ *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) +@@ -848,7 +861,7 @@ + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ +- if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; +@@ -858,7 +871,9 @@ + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, &integ_buf, &mic); + status = -EIO; /* XXX? */ +- if (maj_stat) ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ else if (maj_stat) + return status; + q = xdr_encode_opaque(p, NULL, mic.len); + +@@ -868,6 +883,112 @@ + return 0; + } + ++static void ++priv_release_snd_buf(struct rpc_rqst *rqstp) ++{ ++ int i; ++ ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) ++ __free_page(rqstp->rq_enc_pages[i]); ++ kfree(rqstp->rq_enc_pages); ++} ++ ++static int ++alloc_enc_pages(struct rpc_rqst *rqstp) ++{ ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ int first, last, i; ++ ++ if (snd_buf->page_len == 0) { ++ rqstp->rq_enc_pages_num = 0; ++ return 0; ++ } ++ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ rqstp->rq_enc_pages_num = last - first + 1 + 1; ++ rqstp->rq_enc_pages ++ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), ++ GFP_NOFS); ++ if (!rqstp->rq_enc_pages) ++ goto out; ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) { ++ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); ++ if (rqstp->rq_enc_pages[i] == NULL) ++ goto out_free; ++ } ++ rqstp->rq_release_snd_buf = priv_release_snd_buf; ++ return 0; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(rqstp->rq_enc_pages[i]); ++ } ++out: ++ return -EAGAIN; ++} ++ ++static inline int ++gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) ++{ ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ u32 offset; ++ u32 maj_stat; ++ int status; ++ u32 *opaque_len; ++ struct page **inpages; ++ int first; ++ int pad; ++ struct kvec *iov; ++ char *tmp; ++ ++ opaque_len = p++; ++ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; ++ *p++ = htonl(rqstp->rq_seqno); ++ ++ status = encode(rqstp, p, obj); ++ if (status) ++ return status; ++ ++ status = alloc_enc_pages(rqstp); ++ if (status) ++ return status; ++ /* XXXJBF: Oops! Do we need rq_enc_pages really any more?? 
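alloc_enc_pages() above sizes the replacement page array from byte offsets: first and last are the page indices spanned by [page_base, page_base + page_len), so last - first + 1 pages cover the data, plus one spare page that the privacy path uses to rehome the tail. A worked check of that arithmetic for page_len > 0, assuming 4 KiB pages:

#include <assert.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

static int enc_pages_needed(unsigned int page_base, unsigned int page_len)
{
	int first = page_base >> PAGE_SHIFT;
	int last = (page_base + page_len - 1) >> PAGE_SHIFT;

	return last - first + 1 + 1;	/* +1 spare page for the tail */
}

int main(void)
{
	assert(enc_pages_needed(0, 4096) == 2);		/* one data page + spare */
	assert(enc_pages_needed(100, 4096) == 3);	/* data straddles two pages */
	assert(enc_pages_needed(4000, 200) == 3);	/* offset near a page end */
	return 0;
}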
*/ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ inpages = snd_buf->pages + first; ++ snd_buf->pages = rqstp->rq_enc_pages; ++ snd_buf->page_base -= first << PAGE_CACHE_SHIFT; ++ /* XXX?: tail needs to be separate if we want to be able to expand ++ * the head (since it's often put right after the head). But is ++ * expanding the head safe in any case? */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { ++ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); ++ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); ++ snd_buf->tail[0].iov_base = tmp; ++ } ++ maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, ++ snd_buf, inpages); ++ status = -EIO; /* XXX? */ ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ else if (maj_stat) ++ return status; ++ ++ *opaque_len = htonl(snd_buf->len - offset); ++ /* guess whether we're in the head or the tail: */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ iov = snd_buf->tail; ++ else ++ iov = snd_buf->head; ++ p = iov->iov_base + iov->iov_len; ++ pad = 3 - ((snd_buf->len - offset - 1) & 3); ++ memset(p, 0, pad); ++ iov->iov_len += pad; ++ snd_buf->len += pad; ++ ++ return 0; ++} ++ + static int + gss_wrap_req(struct rpc_task *task, + kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +@@ -894,9 +1015,13 @@ + status = encode(rqstp, p, obj); + goto out; + case RPC_GSS_SVC_INTEGRITY: +- status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj); ++ status = gss_wrap_req_integ(cred, ctx, encode, ++ rqstp, p, obj); + goto out; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_wrap_req_priv(cred, ctx, encode, ++ rqstp, p, obj); ++ goto out; + default: + goto out; + } +@@ -907,11 +1032,10 @@ + } + + static inline int +-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t decode, void *rqstp, u32 **p, void *obj) ++gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ struct rpc_rqst *rqstp, u32 **p) + { +- struct rpc_rqst *req = (struct rpc_rqst *)rqstp; +- struct xdr_buf *rcv_buf = &req->rq_rcv_buf; ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; +@@ -926,7 +1050,7 @@ + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; +- if (ntohl(*(*p)++) != req->rq_seqno) ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, +@@ -938,11 +1062,44 @@ + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, + &mic, NULL); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; ++ if (maj_stat != GSS_S_COMPLETE) ++ return status; ++ return 0; ++} ++ ++static inline int ++gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, ++ struct rpc_rqst *rqstp, u32 **p) ++{ ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; ++ u32 offset, out_offset; ++ u32 opaque_len; ++ u32 maj_stat; ++ int status = -EIO; ++ ++ opaque_len = ntohl(*(*p)++); ++ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; ++ if (offset + opaque_len > rcv_buf->len) ++ return status; ++ /* remove padding: */ ++ rcv_buf->len = offset + opaque_len; ++ ++ maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, ++ offset, rcv_buf, &out_offset); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; ++ *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset); ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) ++ 
return status; ++ + return 0; + } + ++ + static int + gss_unwrap_resp(struct rpc_task *task, + kxdrproc_t decode, void *rqstp, u32 *p, void *obj) +@@ -962,12 +1119,16 @@ + case RPC_GSS_SVC_NONE: + goto out_decode; + case RPC_GSS_SVC_INTEGRITY: +- status = gss_unwrap_resp_integ(ctx, decode, +- rqstp, &p, obj); ++ status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); ++ if (status) ++ goto out; ++ break; ++ + default: + goto out; + } +Index: linux-2.6.10/net/sunrpc/svc.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/svc.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/svc.c 2005-04-05 14:49:13.409690736 +0800 +@@ -264,6 +264,7 @@ + u32 dir, prog, vers, proc, + auth_stat, rpc_stat; + int auth_res; ++ u32 *accept_statp; + + rpc_stat = rpc_success; + +@@ -299,6 +300,9 @@ + if (vers != 2) /* RPC version number */ + goto err_bad_rpc; + ++ /* Save position in case we later decide to reject: */ ++ accept_statp = resv->iov_base + resv->iov_len; ++ + svc_putu32(resv, xdr_zero); /* ACCEPT */ + + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ +@@ -311,10 +315,12 @@ + * We do this before anything else in order to get a decent + * auth verifier. + */ +- if (progp->pg_authenticate != NULL) +- auth_res = progp->pg_authenticate(rqstp, &auth_stat); +- else +- auth_res = svc_authenticate(rqstp, &auth_stat); ++ auth_res = svc_authenticate(rqstp, &auth_stat); ++ /* Also give the program a chance to reject this call: */ ++ if (auth_res == SVC_OK) { ++ auth_stat = rpc_autherr_badcred; ++ auth_res = progp->pg_authenticate(rqstp); ++ } + switch (auth_res) { + case SVC_OK: + break; +@@ -437,7 +443,8 @@ + err_bad_auth: + dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + serv->sv_stats->rpcbadauth++; +- resv->iov_len -= 4; ++ /* Restore write pointer to location of accept status: */ ++ xdr_ressize_check(rqstp, accept_statp); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ +Index: linux-2.6.10/net/sunrpc/sched.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/sched.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/sched.c 2005-04-05 14:49:13.391693472 +0800 +@@ -41,13 +41,7 @@ + + static void __rpc_default_timer(struct rpc_task *task); + static void rpciod_killall(void); +- +-/* +- * When an asynchronous RPC task is activated within a bottom half +- * handler, or while executing another RPC task, it is put on +- * schedq, and rpciod is woken up. +- */ +-static RPC_WAITQ(schedq, "schedq"); ++static void rpc_async_schedule(void *); + + /* + * RPC tasks that create another task (e.g. for contacting the portmapper) +@@ -68,26 +62,18 @@ + /* + * rpciod-related stuff + */ +-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle); +-static DECLARE_COMPLETION(rpciod_killer); + static DECLARE_MUTEX(rpciod_sema); + static unsigned int rpciod_users; +-static pid_t rpciod_pid; +-static int rpc_inhibit; ++static struct workqueue_struct *rpciod_workqueue; + + /* +- * Spinlock for wait queues. Access to the latter also has to be +- * interrupt-safe in order to allow timers to wake up sleeping tasks. +- */ +-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED; +-/* + * Spinlock for other critical sections of code. 
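The sched.c rework that begins here retires the single global rpc_queue_lock: each rpc_wait_queue carries its own lock (see the spin_lock_init(&queue->lock) added to the queue initializer below), so wakeups and sleeps on unrelated queues no longer serialize against one another. The structural change, sketched with pthreads and illustrative names:

#include <pthread.h>

struct wait_queue {
	pthread_mutex_t lock;	/* was: one global rpc_queue_lock */
	int nr_tasks;
};

static void queue_init(struct wait_queue *q)
{
	pthread_mutex_init(&q->lock, NULL);	/* per-queue, as in the patch */
	q->nr_tasks = 0;
}

static void queue_add(struct wait_queue *q)
{
	pthread_mutex_lock(&q->lock);	/* contends only on this queue */
	q->nr_tasks++;
	pthread_mutex_unlock(&q->lock);
}

int main(void)
{
	struct wait_queue a, b;

	queue_init(&a);
	queue_init(&b);
	queue_add(&a);	/* independent of ... */
	queue_add(&b);	/* ... this one: no lock shared between queues */
	return 0;
}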
+ */ + static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED; + + /* + * Disable the timer for a given RPC task. Should be called with +- * rpc_queue_lock and bh_disabled in order to avoid races within ++ * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). + */ + static inline void +@@ -105,19 +91,19 @@ + * without calling del_timer_sync(). The latter could cause a + * deadlock if called while we're holding spinlocks... + */ +-static void +-rpc_run_timer(struct rpc_task *task) ++static void rpc_run_timer(struct rpc_task *task) + { + void (*callback)(struct rpc_task *); + +- spin_lock_bh(&rpc_queue_lock); + callback = task->tk_timeout_fn; + task->tk_timeout_fn = NULL; +- spin_unlock_bh(&rpc_queue_lock); +- if (callback) { ++ if (callback && RPC_IS_QUEUED(task)) { + dprintk("RPC: %4d running timer\n", task->tk_pid); + callback(task); + } ++ smp_mb__before_clear_bit(); ++ clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); ++ smp_mb__after_clear_bit(); + } + + /* +@@ -136,29 +122,21 @@ + task->tk_timeout_fn = timer; + else + task->tk_timeout_fn = __rpc_default_timer; ++ set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + mod_timer(&task->tk_timer, jiffies + task->tk_timeout); + } + + /* +- * Set up a timer for an already sleeping task. +- */ +-void rpc_add_timer(struct rpc_task *task, rpc_action timer) +-{ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) +- __rpc_add_timer(task, timer); +- spin_unlock_bh(&rpc_queue_lock); +-} +- +-/* + * Delete any timer for the current task. Because we use del_timer_sync(), +- * this function should never be called while holding rpc_queue_lock. ++ * this function should never be called while holding queue->lock. + */ + static inline void + rpc_delete_timer(struct rpc_task *task) + { +- if (del_timer_sync(&task->tk_timer)) ++ if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { ++ del_singleshot_timer_sync(&task->tk_timer); + dprintk("RPC: %4d deleting timer\n", task->tk_pid); ++ } + } + + /* +@@ -169,16 +147,17 @@ + struct list_head *q; + struct rpc_task *t; + ++ INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; +- list_for_each_entry(t, q, tk_list) { ++ list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_cookie == task->tk_cookie) { +- list_add_tail(&task->tk_list, &t->tk_links); ++ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } +- list_add_tail(&task->tk_list, q); ++ list_add_tail(&task->u.tk_wait.list, q); + } + + /* +@@ -189,37 +168,21 @@ + * improve overall performance. + * Everyone else gets appended to the queue to ensure proper FIFO behavior. 
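The timer handling above replaces blanket del_timer_sync() with a flag-guarded del_singleshot_timer_sync(): __rpc_add_timer() sets RPC_TASK_HAS_TIMER, and rpc_delete_timer() performs the synchronous (potentially blocking) deletion only if it wins the test_and_clear, so the timer is torn down exactly once. The flag discipline reduced to C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static atomic_bool has_timer;

static void add_timer_guarded(void)
{
	/* arm the (imaginary) timer, then publish the fact: */
	atomic_store(&has_timer, true);		/* set_bit(RPC_TASK_HAS_TIMER) */
}

static void delete_timer_guarded(void)
{
	/* only the caller that flips true -> false pays for the
	 * synchronous deletion; everyone else sees a no-op: */
	if (atomic_exchange(&has_timer, false))
		printf("deleting timer (exactly once)\n");
}

int main(void)
{
	add_timer_guarded();
	delete_timer_guarded();	/* performs the deletion */
	delete_timer_guarded();	/* no-op: flag already clear */
	return 0;
}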
+ */ +-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) ++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) + { +- if (task->tk_rpcwait == queue) +- return 0; ++ BUG_ON (RPC_IS_QUEUED(task)); + +- if (task->tk_rpcwait) { +- printk(KERN_WARNING "RPC: doubly enqueued task!\n"); +- return -EWOULDBLOCK; +- } + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) +- list_add(&task->tk_list, &queue->tasks[0]); ++ list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else +- list_add_tail(&task->tk_list, &queue->tasks[0]); +- task->tk_rpcwait = queue; ++ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); ++ task->u.tk_wait.rpc_waitq = queue; ++ rpc_set_queued(task); + + dprintk("RPC: %4d added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +- +- return 0; +-} +- +-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) +-{ +- int result; +- +- spin_lock_bh(&rpc_queue_lock); +- result = __rpc_add_wait_queue(q, task); +- spin_unlock_bh(&rpc_queue_lock); +- return result; + } + + /* +@@ -229,12 +192,12 @@ + { + struct rpc_task *t; + +- if (!list_empty(&task->tk_links)) { +- t = list_entry(task->tk_links.next, struct rpc_task, tk_list); +- list_move(&t->tk_list, &task->tk_list); +- list_splice_init(&task->tk_links, &t->tk_links); ++ if (!list_empty(&task->u.tk_wait.links)) { ++ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); ++ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); ++ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } +- list_del(&task->tk_list); ++ list_del(&task->u.tk_wait.list); + } + + /* +@@ -243,31 +206,17 @@ + */ + static void __rpc_remove_wait_queue(struct rpc_task *task) + { +- struct rpc_wait_queue *queue = task->tk_rpcwait; +- +- if (!queue) +- return; ++ struct rpc_wait_queue *queue; ++ queue = task->u.tk_wait.rpc_waitq; + + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else +- list_del(&task->tk_list); +- task->tk_rpcwait = NULL; +- ++ list_del(&task->u.tk_wait.list); + dprintk("RPC: %4d removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); + } + +-void +-rpc_remove_wait_queue(struct rpc_task *task) +-{ +- if (!task->tk_rpcwait) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +-} +- + static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) + { + queue->priority = priority; +@@ -290,6 +239,7 @@ + { + int i; + ++ spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; +@@ -316,34 +266,31 @@ + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. 
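__rpc_add_wait_queue_priority() and __rpc_remove_wait_queue_priority() above group tasks that share a tk_cookie behind a single queue entry, and removal promotes the first grouped task into the leader's place so the batch keeps its position. A simplified userspace sketch of that list-of-lists discipline, using singly linked lists and illustrative names:

#include <stdio.h>

struct task {
	unsigned long cookie;
	struct task *next;	/* position in the queue proper */
	struct task *links;	/* same-cookie tasks grouped behind us */
};

static struct task *queue;

static void add_task(struct task *t)
{
	struct task **pp, *q;

	for (pp = &queue; (q = *pp) != NULL; pp = &q->next) {
		if (q->cookie == t->cookie) {	/* group with its peers */
			struct task **lp = &q->links;

			while (*lp)
				lp = &(*lp)->links;
			*lp = t;
			return;
		}
	}
	*pp = t;	/* new cookie: append for FIFO behavior */
}

static struct task *remove_first(void)
{
	struct task *t = queue;

	if (!t)
		return NULL;
	if (t->links) {		/* promote the first peer into our place */
		t->links->next = t->next;
		queue = t->links;
	} else {
		queue = t->next;
	}
	return t;
}

int main(void)
{
	struct task a = { 1, NULL, NULL }, b = { 2, NULL, NULL };
	struct task c = { 1, NULL, NULL };

	add_task(&a);	/* queue: a */
	add_task(&b);	/* queue: a, b */
	add_task(&c);	/* grouped behind a (same cookie) */
	printf("%lu\n", remove_first()->cookie);	/* 1 (a); c promoted */
	printf("%lu\n", remove_first()->cookie);	/* 1 (c) */
	printf("%lu\n", remove_first()->cookie);	/* 2 (b) */
	return 0;
}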
+ */ +-static inline void +-rpc_make_runnable(struct rpc_task *task) ++static void rpc_make_runnable(struct rpc_task *task) + { +- if (task->tk_timeout_fn) { +- printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n"); ++ int do_ret; ++ ++ BUG_ON(task->tk_timeout_fn); ++ do_ret = rpc_test_and_set_running(task); ++ rpc_clear_queued(task); ++ if (do_ret) + return; +- } +- rpc_set_running(task); + if (RPC_IS_ASYNC(task)) { +- if (RPC_IS_SLEEPING(task)) { +- int status; +- status = __rpc_add_wait_queue(&schedq, task); +- if (status < 0) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- return; +- } +- rpc_clear_sleeping(task); +- wake_up(&rpciod_idle); ++ int status; ++ ++ INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); ++ status = queue_work(task->tk_workqueue, &task->u.tk_work); ++ if (status < 0) { ++ printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); ++ task->tk_status = status; ++ return; + } +- } else { +- rpc_clear_sleeping(task); +- wake_up(&task->tk_wait); +- } ++ } else ++ wake_up(&task->u.tk_wait.waitq); + } + + /* +- * Place a newly initialized task on the schedq. ++ * Place a newly initialized task on the workqueue. + */ + static inline void + rpc_schedule_run(struct rpc_task *task) +@@ -352,33 +299,18 @@ + if (RPC_IS_ACTIVATED(task)) + return; + task->tk_active = 1; +- rpc_set_sleeping(task); + rpc_make_runnable(task); + } + + /* +- * For other people who may need to wake the I/O daemon +- * but should (for now) know nothing about its innards +- */ +-void rpciod_wake_up(void) +-{ +- if(rpciod_pid==0) +- printk(KERN_ERR "rpciod: wot no daemon?\n"); +- wake_up(&rpciod_idle); +-} +- +-/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. + */ +-static void +-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { +- int status; +- + dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, + rpc_qname(q), jiffies); + +@@ -388,49 +320,36 @@ + } + + /* Mark the task as being activated if so needed */ +- if (!RPC_IS_ACTIVATED(task)) { ++ if (!RPC_IS_ACTIVATED(task)) + task->tk_active = 1; +- rpc_set_sleeping(task); +- } + +- status = __rpc_add_wait_queue(q, task); +- if (status) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- } else { +- rpc_clear_running(task); +- if (task->tk_callback) { +- dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid); +- BUG(); +- } +- task->tk_callback = action; +- __rpc_add_timer(task, timer); +- } ++ __rpc_add_wait_queue(q, task); ++ ++ BUG_ON(task->tk_callback != NULL); ++ task->tk_callback = action; ++ __rpc_add_timer(task, timer); + } + +-void +-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { + /* + * Protect the queue operations. 
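rpc_make_runnable() above hinges on an atomic test-and-set of the RUNNING bit: when two wakers race, exactly one sees the bit clear and becomes responsible for queueing the work (or waking the synchronous waiter). The guard reduced to C11 atomics; make_runnable here is a sketch, not the kernel function:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag running = ATOMIC_FLAG_INIT;

/* Returns 1 if this caller won the race and must schedule the task. */
static int make_runnable(void)
{
	if (atomic_flag_test_and_set(&running))
		return 0;	/* RUNNING already set: someone else owns it */
	/* ... queue_work() / wake_up() would happen here, exactly once ... */
	return 1;
}

int main(void)
{
	printf("%d\n", make_runnable());	/* 1: first waker schedules */
	printf("%d\n", make_runnable());	/* 0: second waker backs off */
	return 0;
}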
+ */ +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action, timer); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&q->lock); + } + + /** +- * __rpc_wake_up_task - wake up a single rpc_task ++ * __rpc_do_wake_up_task - wake up a single rpc_task + * @task: task to be woken up + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold queue->lock, and have cleared the task queued flag. + */ +-static void +-__rpc_wake_up_task(struct rpc_task *task) ++static void __rpc_do_wake_up_task(struct rpc_task *task) + { +- dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n", +- task->tk_pid, jiffies, rpc_inhibit); ++ dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); + + #ifdef RPC_DEBUG + if (task->tk_magic != 0xf00baa) { +@@ -445,12 +364,9 @@ + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } +- if (RPC_IS_RUNNING(task)) +- return; + + __rpc_disable_timer(task); +- if (task->tk_rpcwait != &schedq) +- __rpc_remove_wait_queue(task); ++ __rpc_remove_wait_queue(task); + + rpc_make_runnable(task); + +@@ -458,6 +374,18 @@ + } + + /* ++ * Wake up the specified task ++ */ ++static void __rpc_wake_up_task(struct rpc_task *task) ++{ ++ if (rpc_start_wakeup(task)) { ++ if (RPC_IS_QUEUED(task)) ++ __rpc_do_wake_up_task(task); ++ rpc_finish_wakeup(task); ++ } ++} ++ ++/* + * Default timeout handler if none specified by user + */ + static void +@@ -471,14 +399,18 @@ + /* + * Wake up the specified task + */ +-void +-rpc_wake_up_task(struct rpc_task *task) ++void rpc_wake_up_task(struct rpc_task *task) + { +- if (RPC_IS_RUNNING(task)) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_wake_up_task(task); +- spin_unlock_bh(&rpc_queue_lock); ++ if (rpc_start_wakeup(task)) { ++ if (RPC_IS_QUEUED(task)) { ++ struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; ++ ++ spin_lock_bh(&queue->lock); ++ __rpc_do_wake_up_task(task); ++ spin_unlock_bh(&queue->lock); ++ } ++ rpc_finish_wakeup(task); ++ } + } + + /* +@@ -494,11 +426,11 @@ + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; +- list_move_tail(&task->tk_list, q); ++ list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. 
+@@ -516,7 +448,7 @@ + else + q = q - 1; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); +@@ -541,14 +473,14 @@ + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + + return task; + } +@@ -557,25 +489,25 @@ + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up(struct rpc_wait_queue *queue) + { + struct rpc_task *task; + + struct list_head *head; +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /** +@@ -583,18 +515,18 @@ + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) + { + struct list_head *head; + struct rpc_task *task; + +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + task->tk_status = status; + __rpc_wake_up_task(task); + } +@@ -602,7 +534,7 @@ + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /* +@@ -626,22 +558,23 @@ + /* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +-static int +-__rpc_execute(struct rpc_task *task) ++static int __rpc_execute(struct rpc_task *task) + { + int status = 0; + + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + +- if (!RPC_IS_RUNNING(task)) { +- printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n"); +- return 0; +- } ++ BUG_ON(RPC_IS_QUEUED(task)); + + restarted: + while (1) { + /* ++ * Garbage collection of pending timers... ++ */ ++ rpc_delete_timer(task); ++ ++ /* + * Execute any pending callback. + */ + if (RPC_DO_CALLBACK(task)) { +@@ -657,7 +590,9 @@ + */ + save_callback=task->tk_callback; + task->tk_callback=NULL; ++ lock_kernel(); + save_callback(task); ++ unlock_kernel(); + } + + /* +@@ -665,43 +600,35 @@ + * tk_action may be NULL when the task has been killed + * by someone else. + */ +- if (RPC_IS_RUNNING(task)) { +- /* +- * Garbage collection of pending timers... +- */ +- rpc_delete_timer(task); ++ if (!RPC_IS_QUEUED(task)) { + if (!task->tk_action) + break; ++ lock_kernel(); + task->tk_action(task); +- /* micro-optimization to avoid spinlock */ +- if (RPC_IS_RUNNING(task)) +- continue; ++ unlock_kernel(); + } + + /* +- * Check whether task is sleeping. ++ * Lockless check for whether task is sleeping or not. 
+ */ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) { +- rpc_set_sleeping(task); +- if (RPC_IS_ASYNC(task)) { +- spin_unlock_bh(&rpc_queue_lock); ++ if (!RPC_IS_QUEUED(task)) ++ continue; ++ rpc_clear_running(task); ++ if (RPC_IS_ASYNC(task)) { ++ /* Careful! we may have raced... */ ++ if (RPC_IS_QUEUED(task)) + return 0; +- } ++ if (rpc_test_and_set_running(task)) ++ return 0; ++ continue; + } +- spin_unlock_bh(&rpc_queue_lock); + +- if (!RPC_IS_SLEEPING(task)) +- continue; + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); +- if (current->pid == rpciod_pid) +- printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); +- + if (RPC_TASK_UNINTERRUPTIBLE(task)) { +- __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); ++ __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); + } else { +- __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status); ++ __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. In order to catch any callbacks that +@@ -715,11 +642,14 @@ + rpc_wake_up_task(task); + } + } ++ rpc_set_running(task); + dprintk("RPC: %4d sync task resuming\n", task->tk_pid); + } + + if (task->tk_exit) { ++ lock_kernel(); + task->tk_exit(task); ++ unlock_kernel(); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { +@@ -738,7 +668,6 @@ + + /* Release all resources associated with the task */ + rpc_release_task(task); +- + return status; + } + +@@ -754,57 +683,16 @@ + int + rpc_execute(struct rpc_task *task) + { +- int status = -EIO; +- if (rpc_inhibit) { +- printk(KERN_INFO "RPC: execution inhibited!\n"); +- goto out_release; +- } +- +- status = -EWOULDBLOCK; +- if (task->tk_active) { +- printk(KERN_ERR "RPC: active task was run twice!\n"); +- goto out_err; +- } ++ BUG_ON(task->tk_active); + + task->tk_active = 1; + rpc_set_running(task); + return __rpc_execute(task); +- out_release: +- rpc_release_task(task); +- out_err: +- return status; + } + +-/* +- * This is our own little scheduler for async RPC tasks. 
+- */ +-static void +-__rpc_schedule(void) ++static void rpc_async_schedule(void *arg) + { +- struct rpc_task *task; +- int count = 0; +- +- dprintk("RPC: rpc_schedule enter\n"); +- while (1) { +- +- task_for_first(task, &schedq.tasks[0]) { +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +- +- __rpc_execute(task); +- spin_lock_bh(&rpc_queue_lock); +- } else { +- break; +- } +- +- if (++count >= 200 || need_resched()) { +- count = 0; +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- spin_lock_bh(&rpc_queue_lock); +- } +- } +- dprintk("RPC: rpc_schedule leave\n"); ++ __rpc_execute((struct rpc_task *)arg); + } + + /* +@@ -862,7 +750,6 @@ + task->tk_client = clnt; + task->tk_flags = flags; + task->tk_exit = callback; +- init_waitqueue_head(&task->tk_wait); + if (current->uid != current->fsuid || current->gid != current->fsgid) + task->tk_flags |= RPC_TASK_SETUID; + +@@ -873,7 +760,11 @@ + + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; +- INIT_LIST_HEAD(&task->tk_links); ++ ++ /* Initialize workqueue for async tasks */ ++ task->tk_workqueue = rpciod_workqueue; ++ if (!RPC_IS_ASYNC(task)) ++ init_waitqueue_head(&task->u.tk_wait.waitq); + + /* Add to global list of all tasks */ + spin_lock(&rpc_sched_lock); +@@ -944,8 +835,7 @@ + goto out; + } + +-void +-rpc_release_task(struct rpc_task *task) ++void rpc_release_task(struct rpc_task *task) + { + dprintk("RPC: %4d release task\n", task->tk_pid); + +@@ -963,19 +853,9 @@ + list_del(&task->tk_task); + spin_unlock(&rpc_sched_lock); + +- /* Protect the execution below. */ +- spin_lock_bh(&rpc_queue_lock); +- +- /* Disable timer to prevent zombie wakeup */ +- __rpc_disable_timer(task); +- +- /* Remove from any wait queue we're still on */ +- __rpc_remove_wait_queue(task); +- ++ BUG_ON (RPC_IS_QUEUED(task)); + task->tk_active = 0; + +- spin_unlock_bh(&rpc_queue_lock); +- + /* Synchronously delete any running timer */ + rpc_delete_timer(task); + +@@ -1005,10 +885,9 @@ + * queue 'childq'. If so returns a pointer to the parent. + * Upon failure returns NULL. + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold childq.lock + */ +-static inline struct rpc_task * +-rpc_find_parent(struct rpc_task *child) ++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) + { + struct rpc_task *task, *parent; + struct list_head *le; +@@ -1021,17 +900,16 @@ + return NULL; + } + +-static void +-rpc_child_exit(struct rpc_task *child) ++static void rpc_child_exit(struct rpc_task *child) + { + struct rpc_task *parent; + +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + if ((parent = rpc_find_parent(child)) != NULL) { + parent->tk_status = child->tk_status; + __rpc_wake_up_task(parent); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* +@@ -1054,22 +932,20 @@ + return NULL; + } + +-void +-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) ++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) + { +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + /* N.B. Is it possible for the child to have already finished? */ + __rpc_sleep_on(&childq, task, func, NULL); + rpc_schedule_run(child); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? 
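For orientation, the whole async path now reduces to a workqueue handoff: rpc_make_runnable() packages the task into its work_struct and rpc_async_schedule() above simply re-enters the state machine from rpciod's worker thread. Condensed, using the 2.6-era three-argument INIT_WORK:

    /* waker side, from rpc_make_runnable() */
    INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
    queue_work(task->tk_workqueue, &task->u.tk_work);	/* rpciod */

    /* worker side, in rpciod context */
    static void rpc_async_schedule(void *arg)
    {
    	__rpc_execute((struct rpc_task *)arg);
    }

Sync tasks keep a classic waitqueue in the same union (u.tk_wait.waitq), which is why rpc_new_task() above initializes it only in the !RPC_IS_ASYNC case.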
+ */ +-void +-rpc_killall_tasks(struct rpc_clnt *clnt) ++void rpc_killall_tasks(struct rpc_clnt *clnt) + { + struct rpc_task *rovr; + struct list_head *le; +@@ -1091,93 +967,14 @@ + + static DECLARE_MUTEX_LOCKED(rpciod_running); + +-static inline int +-rpciod_task_pending(void) +-{ +- return !list_empty(&schedq.tasks[0]); +-} +- +- +-/* +- * This is the rpciod kernel thread +- */ +-static int +-rpciod(void *ptr) +-{ +- int rounds = 0; +- +- lock_kernel(); +- /* +- * Let our maker know we're running ... +- */ +- rpciod_pid = current->pid; +- up(&rpciod_running); +- +- daemonize("rpciod"); +- allow_signal(SIGKILL); +- +- dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); +- spin_lock_bh(&rpc_queue_lock); +- while (rpciod_users) { +- DEFINE_WAIT(wait); +- if (signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- rpciod_killall(); +- flush_signals(current); +- spin_lock_bh(&rpc_queue_lock); +- } +- __rpc_schedule(); +- if (current->flags & PF_FREEZE) { +- spin_unlock_bh(&rpc_queue_lock); +- refrigerator(PF_FREEZE); +- spin_lock_bh(&rpc_queue_lock); +- } +- +- if (++rounds >= 64) { /* safeguard */ +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- +- dprintk("RPC: rpciod back to sleep\n"); +- prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE); +- if (!rpciod_task_pending() && !signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- finish_wait(&rpciod_idle, &wait); +- dprintk("RPC: switch to rpciod\n"); +- } +- spin_unlock_bh(&rpc_queue_lock); +- +- dprintk("RPC: rpciod shutdown commences\n"); +- if (!list_empty(&all_tasks)) { +- printk(KERN_ERR "rpciod: active tasks at shutdown?!\n"); +- rpciod_killall(); +- } +- +- dprintk("RPC: rpciod exiting\n"); +- unlock_kernel(); +- +- rpciod_pid = 0; +- complete_and_exit(&rpciod_killer, 0); +- return 0; +-} +- +-static void +-rpciod_killall(void) ++static void rpciod_killall(void) + { + unsigned long flags; + + while (!list_empty(&all_tasks)) { + clear_thread_flag(TIF_SIGPENDING); + rpc_killall_tasks(NULL); +- spin_lock_bh(&rpc_queue_lock); +- __rpc_schedule(); +- spin_unlock_bh(&rpc_queue_lock); ++ flush_workqueue(rpciod_workqueue); + if (!list_empty(&all_tasks)) { + dprintk("rpciod_killall: waiting for tasks to exit\n"); + yield(); +@@ -1195,28 +992,30 @@ + int + rpciod_up(void) + { ++ struct workqueue_struct *wq; + int error = 0; + + down(&rpciod_sema); +- dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_up: users %d\n", rpciod_users); + rpciod_users++; +- if (rpciod_pid) ++ if (rpciod_workqueue) + goto out; + /* + * If there's no pid, we should be the first user. + */ + if (rpciod_users > 1) +- printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users); ++ printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); + /* + * Create the rpciod thread and wait for it to start. 
+ */ +- error = kernel_thread(rpciod, NULL, 0); +- if (error < 0) { +- printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error); ++ error = -ENOMEM; ++ wq = create_workqueue("rpciod"); ++ if (wq == NULL) { ++ printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); + rpciod_users--; + goto out; + } +- down(&rpciod_running); ++ rpciod_workqueue = wq; + error = 0; + out: + up(&rpciod_sema); +@@ -1227,20 +1026,21 @@ + rpciod_down(void) + { + down(&rpciod_sema); +- dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_down sema %d\n", rpciod_users); + if (rpciod_users) { + if (--rpciod_users) + goto out; + } else +- printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid); ++ printk(KERN_WARNING "rpciod_down: no users??\n"); + +- if (!rpciod_pid) { ++ if (!rpciod_workqueue) { + dprintk("rpciod_down: Nothing to do!\n"); + goto out; + } ++ rpciod_killall(); + +- kill_proc(rpciod_pid, SIGKILL, 1); +- wait_for_completion(&rpciod_killer); ++ destroy_workqueue(rpciod_workqueue); ++ rpciod_workqueue = NULL; + out: + up(&rpciod_sema); + } +@@ -1258,7 +1058,12 @@ + } + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); +- alltask_for_each(t, le, &all_tasks) ++ alltask_for_each(t, le, &all_tasks) { ++ const char *rpc_waitq = "none"; ++ ++ if (RPC_IS_QUEUED(t)) ++ rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); ++ + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, + (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), +@@ -1266,8 +1071,9 @@ + t->tk_client, + (t->tk_client ? t->tk_client->cl_prog : 0), + t->tk_rqstp, t->tk_timeout, +- rpc_qname(t->tk_rpcwait), ++ rpc_waitq, + t->tk_action, t->tk_exit); ++ } + spin_unlock(&rpc_sched_lock); + } + #endif +Index: linux-2.6.10/net/sunrpc/sunrpc_syms.c +=================================================================== +--- linux-2.6.10.orig/net/sunrpc/sunrpc_syms.c 2004-12-25 05:35:25.000000000 +0800 ++++ linux-2.6.10/net/sunrpc/sunrpc_syms.c 2005-04-05 14:49:13.411690432 +0800 +@@ -58,6 +58,9 @@ + EXPORT_SYMBOL(rpc_wake_up); + EXPORT_SYMBOL(rpc_queue_upcall); + EXPORT_SYMBOL(rpc_mkpipe); ++EXPORT_SYMBOL(rpc_mkdir); ++EXPORT_SYMBOL(rpc_rmdir); ++ + + /* Client transport */ + EXPORT_SYMBOL(xprt_create_proto); +@@ -90,6 +93,7 @@ + EXPORT_SYMBOL(svc_auth_register); + EXPORT_SYMBOL(auth_domain_lookup); + EXPORT_SYMBOL(svc_authenticate); ++EXPORT_SYMBOL(svc_set_client); + + /* RPC statistics */ + #ifdef CONFIG_PROC_FS +Index: linux-2.6.10/kernel/exit.c +=================================================================== +--- linux-2.6.10.orig/kernel/exit.c 2005-04-05 14:48:52.534864192 +0800 ++++ linux-2.6.10/kernel/exit.c 2005-04-05 14:50:57.737830448 +0800 +@@ -848,6 +848,8 @@ + for (;;) ; + } + ++EXPORT_SYMBOL(do_exit); ++ + NORET_TYPE void complete_and_exit(struct completion *comp, long code) + { + if (comp) +Index: linux-2.6.10/fs/locks.c +=================================================================== +--- linux-2.6.10.orig/fs/locks.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/fs/locks.c 2005-04-05 14:49:13.434686936 +0800 +@@ -1096,15 +1096,13 @@ + */ + void remove_lease(struct file_lock *fl) + { +- if (!IS_LEASE(fl)) +- return; +- + lock_kernel(); +- ++ if (!fl || !IS_LEASE(fl)) ++ goto out; + fl->fl_type = F_UNLCK | F_INPROGRESS; + fl->fl_break_time = jiffies - 10; + time_out_leases(fl->fl_file->f_dentry->d_inode); +- ++out: + unlock_kernel(); + } + +@@ -1563,9 +1561,6 @@ + error = 
filp->f_op->lock(filp, F_GETLK, &file_lock); + if (error < 0) + goto out; +- else if (error == LOCK_USE_CLNT) +- /* Bypass for NFS with no locking - 2.0.36 compat */ +- fl = posix_test_lock(filp, &file_lock); + else + fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); + } else { +@@ -1708,9 +1703,6 @@ + error = filp->f_op->lock(filp, F_GETLK, &file_lock); + if (error < 0) + goto out; +- else if (error == LOCK_USE_CLNT) +- /* Bypass for NFS with no locking - 2.0.36 compat */ +- fl = posix_test_lock(filp, &file_lock); + else + fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); + } else { +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-04-05 14:49:13.413690128 +0800 +@@ -789,6 +789,54 @@ + } + + /** ++ * d_instantiate_unique - instantiate a non-aliased dentry ++ * @entry: dentry to instantiate ++ * @inode: inode to attach to this dentry ++ * ++ * Fill in inode information in the entry. On success, it returns NULL. ++ * If an unhashed alias of "entry" already exists, then we return the ++ * aliased dentry instead. ++ * ++ * Note that in order to avoid conflicts with rename() etc, the caller ++ * had better be holding the parent directory semaphore. ++ */ ++struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) ++{ ++ struct dentry *alias; ++ int len = entry->d_name.len; ++ const char *name = entry->d_name.name; ++ unsigned int hash = entry->d_name.hash; ++ ++ BUG_ON(!list_empty(&entry->d_alias)); ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_negative; ++ list_for_each_entry(alias, &inode->i_dentry, d_alias) { ++ struct qstr *qstr = &alias->d_name; ++ ++ if (qstr->hash != hash) ++ continue; ++ if (alias->d_parent != entry->d_parent) ++ continue; ++ if (qstr->len != len) ++ continue; ++ if (memcmp(qstr->name, name, len)) ++ continue; ++ dget_locked(alias); ++ spin_unlock(&dcache_lock); ++ BUG_ON(!d_unhashed(alias)); ++ return alias; ++ } ++ list_add(&entry->d_alias, &inode->i_dentry); ++do_negative: ++ entry->d_inode = inode; ++ spin_unlock(&dcache_lock); ++ security_d_instantiate(entry, inode); ++ return NULL; ++} ++EXPORT_SYMBOL(d_instantiate_unique); ++ ++/** + * d_alloc_root - allocate root dentry + * @root_inode: inode to allocate the root for + * +Index: linux-2.6.10/fs/lockd/svc.c +=================================================================== +--- linux-2.6.10.orig/fs/lockd/svc.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/lockd/svc.c 2005-04-05 14:49:13.458683288 +0800 +@@ -418,6 +418,38 @@ + return 0; \ + } + ++static inline int is_callback(u32 proc) ++{ ++ return proc == NLMPROC_GRANTED ++ || proc == NLMPROC_GRANTED_MSG ++ || proc == NLMPROC_TEST_RES ++ || proc == NLMPROC_LOCK_RES ++ || proc == NLMPROC_CANCEL_RES ++ || proc == NLMPROC_UNLOCK_RES ++ || proc == NLMPROC_NSM_NOTIFY; ++} ++ ++ ++static int lockd_authenticate(struct svc_rqst *rqstp) ++{ ++ rqstp->rq_client = NULL; ++ switch (rqstp->rq_authop->flavour) { ++ case RPC_AUTH_NULL: ++ case RPC_AUTH_UNIX: ++ if (rqstp->rq_proc == 0) ++ return SVC_OK; ++ if (is_callback(rqstp->rq_proc)) { ++ /* Leave it to individual procedures to ++ * call nlmsvc_lookup_host(rqstp) ++ */ ++ return SVC_OK; ++ } ++ return svc_set_client(rqstp); ++ } ++ return SVC_DENIED; ++} ++ ++ + param_set_min_max(port, int, simple_strtol, 0, 65535) + param_set_min_max(grace_period, unsigned long, simple_strtoul, + nlm_grace_period_min, 
nlm_grace_period_max) +@@ -498,4 +530,5 @@ + .pg_name = "lockd", /* service name */ + .pg_class = "nfsd", /* share authentication with nfsd */ + .pg_stats = &nlmsvc_stats, /* stats table */ ++ .pg_authenticate = &lockd_authenticate /* export authentication */ + }; +Index: linux-2.6.10/fs/nfsd/nfs4xdr.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4xdr.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4xdr.c 2005-04-05 14:49:13.425688304 +0800 +@@ -60,121 +60,6 @@ + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +-static const char utf8_byte_len[256] = { +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +- 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0 +-}; +- +-static inline int +-is_legal_utf8_sequence(unsigned char *source, int length) +-{ +- unsigned char *ptr; +- unsigned char c; +- +- if (length==1) return 1; +- +- /* Check for overlong sequence, and check second byte */ +- c = *(source + 1); +- switch (*source) { +- case 0xE0: /* 3 bytes */ +- if ( c < 0xA0 ) return 0; +- break; +- case 0xF0: /* 4 bytes */ +- if ( c < 0x90 ) return 0; +- break; +- case 0xF8: /* 5 bytes */ +- if ( c < 0xC8 ) return 0; +- break; +- case 0xFC: /* 6 bytes */ +- if ( c < 0x84 ) return 0; +- break; +- default: +- if ( (c & 0xC0) != 0x80) return 0; +- } +- +- /* Check that trailing bytes look like 10xxxxxx */ +- for (ptr = source++ + length - 1; ptr>source; ptr--) +- if ( ((*ptr) & 0xC0) != 0x80 ) return 0; +- return 1; +-} +- +-/* This does some screening on disallowed unicode characters. It is NOT +- * comprehensive. +- */ +-static int +-is_allowed_utf8_char(unsigned char *source, int length) +-{ +- /* We assume length and source point to a valid utf8 sequence */ +- unsigned char c; +- +- /* Disallow F0000 and up (in utf8, F3B08080) */ +- if (*source > 0xF3 ) return 0; +- c = *(source + 1); +- switch (*source) { +- case 0xF3: +- if (c >= 0xB0) return 0; +- break; +- /* Disallow D800-F8FF (in utf8, EDA080-EFA3BF */ +- case 0xED: +- if (c >= 0xA0) return 0; +- break; +- case 0xEE: +- return 0; +- break; +- case 0xEF: +- if (c <= 0xA3) return 0; +- /* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */ +- if (c==0xBF) +- /* Don't need to check <=0xBF, since valid utf8 */ +- if ( *(source+2) >= 0xB9) return 0; +- break; +- } +- return 1; +-} +- +-/* This routine should really check to see that the proper stringprep +- * mappings have been applied. Instead, we do a simple screen of some +- * of the more obvious illegal values by calling is_allowed_utf8_char. +- * This will allow many illegal strings through, but if a client behaves, +- * it will get full functionality. The other option (apart from full +- * stringprep checking) is to limit everything to an easily handled subset, +- * such as 7-bit ascii. +- * +- * Note - currently calling routines ignore return value except as boolean. 
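With the stringprep screen removed below, NFSv4 name checking reduces to rejecting '/' and zero-length names; byte sequences the dropped table used to catch, such as the overlong two-byte encoding of NUL, now pass through to the filesystem. A small userspace illustration of the dropped first-byte rule (the helper condenses the removed utf8_byte_len[] table; nothing here is a remaining kernel interface):

    #include <stdio.h>

    /* condensed form of the removed utf8_byte_len[] table */
    static int utf8_len(unsigned char c)
    {
    	if (c < 0x80) return 1;	/* ASCII */
    	if (c < 0xC2) return 0;	/* continuation byte or overlong lead */
    	if (c < 0xE0) return 2;
    	if (c < 0xF0) return 3;
    	if (c < 0xF8) return 4;
    	if (c < 0xFC) return 5;
    	if (c < 0xFE) return 6;
    	return 0;		/* 0xFE/0xFF: never valid */
    }

    int main(void)
    {
    	/* 0xC0 0x80: "modified UTF-8" NUL, rejected by the old code */
    	printf("lead 0xC0 -> len %d (0 means reject)\n", utf8_len(0xC0));
    	return 0;
    }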
+- */ +-static int +-check_utf8(char *str, int len) +-{ +- unsigned char *chunk, *sourceend; +- int chunklen; +- +- chunk = str; +- sourceend = str + len; +- +- while (chunk < sourceend) { +- chunklen = utf8_byte_len[*chunk]; +- if (!chunklen) +- return nfserr_inval; +- if (chunk + chunklen > sourceend) +- return nfserr_inval; +- if (!is_legal_utf8_sequence(chunk, chunklen)) +- return nfserr_inval; +- if (!is_allowed_utf8_char(chunk, chunklen)) +- return nfserr_inval; +- if ( (chunklen==1) && (!*chunk) ) +- return nfserr_inval; /* Disallow embedded nulls */ +- chunk += chunklen; +- } +- +- return 0; +-} +- + static int + check_filename(char *str, int len, int err) + { +@@ -187,7 +72,7 @@ + for (i = 0; i < len; i++) + if (str[i] == '/') + return err; +- return check_utf8(str, len); ++ return 0; + } + + /* +@@ -403,8 +288,6 @@ + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + ace.whotype = nfs4_acl_get_whotype(buf, dummy32); + status = 0; + if (ace.whotype != NFS4_ACL_WHO_NAMED) +@@ -439,8 +322,6 @@ + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + goto out_nfserr; + iattr->ia_valid |= ATTR_UID; +@@ -452,8 +333,6 @@ + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); +- if (check_utf8(buf, dummy32)) +- return nfserr_inval; + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + goto out_nfserr; + iattr->ia_valid |= ATTR_GID; +@@ -525,7 +404,7 @@ + } + } + if (len != expected_len) +- goto xdr_error; ++ printk("nfsd: funky nfs4 client sent extra bytes in setattr\n"); + + DECODE_TAIL; + +@@ -585,8 +464,6 @@ + READ32(create->cr_linklen); + READ_BUF(create->cr_linklen); + SAVEMEM(create->cr_linkname, create->cr_linklen); +- if (check_utf8(create->cr_linkname, create->cr_linklen)) +- return nfserr_inval; + break; + case NF4BLK: + case NF4CHR: +@@ -615,6 +492,18 @@ + } + + static inline int ++nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(sizeof(stateid_t)); ++ READ32(dr->dr_stateid.si_generation); ++ COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ ++ DECODE_TAIL; ++} ++ ++static inline int + nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) + { + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +@@ -790,8 +679,8 @@ + READ32(open->op_delegate_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: +- READ_BUF(sizeof(delegation_stateid_t) + 4); +- COPYMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ READ_BUF(sizeof(stateid_t) + 4); ++ COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + READ32(open->op_fname.len); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); +@@ -825,7 +714,7 @@ + DECODE_HEAD; + + open_down->od_stateowner = NULL; +- READ_BUF(4 + sizeof(stateid_t)); ++ READ_BUF(12 + sizeof(stateid_t)); + READ32(open_down->od_stateid.si_generation); + COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + READ32(open_down->od_seqid); +@@ -1170,6 +1059,9 @@ + case OP_CREATE: + op->status = nfsd4_decode_create(argp, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn); ++ break; + case OP_GETATTR: + op->status = 
nfsd4_decode_getattr(argp, &op->u.getattr); + break; +@@ -1425,7 +1317,7 @@ + if (status) + goto out_nfserr; + } +- if ((bmval0 & FATTR4_WORD0_FILEHANDLE) && !fhp) { ++ if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + fh_init(&tempfh, NFS4_FHSIZE); + status = fh_compose(&tempfh, exp, dentry, NULL); + if (status) +@@ -1471,7 +1363,10 @@ + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32( NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME ); ++ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ++ WRITE32(NFS4_FH_PERSISTENT); ++ else ++ WRITE32(NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + /* +@@ -1508,10 +1403,15 @@ + if (bmval0 & FATTR4_WORD0_FSID) { + if ((buflen -= 16) < 0) + goto out_resource; +- WRITE32(0); +- WRITE32(MAJOR(stat.dev)); +- WRITE32(0); +- WRITE32(MINOR(stat.dev)); ++ if (is_fsid(fhp, rqstp->rq_reffh)) { ++ WRITE64((u64)exp->ex_fsid); ++ WRITE64((u64)0); ++ } else { ++ WRITE32(0); ++ WRITE32(MAJOR(stat.dev)); ++ WRITE32(0); ++ WRITE32(MINOR(stat.dev)); ++ } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + if ((buflen -= 4) < 0) +@@ -1765,17 +1665,65 @@ + } + + static int ++nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, ++ const char *name, int namlen, u32 *p, int *buflen) ++{ ++ struct svc_export *exp = cd->rd_fhp->fh_export; ++ struct dentry *dentry; ++ int nfserr; ++ ++ dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); ++ if (IS_ERR(dentry)) ++ return nfserrno(PTR_ERR(dentry)); ++ ++ exp_get(exp); ++ if (d_mountpoint(dentry)) { ++ if (nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp)) { ++ /* ++ * -EAGAIN is the only error returned from ++ * nfsd_cross_mnt() and it indicates that an ++ * up-call has been initiated to fill in the export ++ * options on exp. When the answer comes back, ++ * this call will be retried. ++ */ ++ nfserr = nfserr_dropit; ++ goto out_put; ++ } ++ ++ } ++ nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, ++ cd->rd_rqstp); ++out_put: ++ dput(dentry); ++ exp_put(exp); ++ return nfserr; ++} ++ ++static u32 * ++nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr) ++{ ++ u32 *attrlenp; ++ ++ if (buflen < 6) ++ return NULL; ++ *p++ = htonl(2); ++ *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ ++ *p++ = htonl(0); /* bmval1 */ ++ ++ attrlenp = p++; ++ *p++ = nfserr; /* no htonl */ ++ *attrlenp = htonl((char *)p - (char *)attrlenp - 4); ++ return p; ++} ++ ++static int + nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, ino_t ino, unsigned int d_type) + { + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + int buflen; + u32 *p = cd->buffer; +- u32 *attrlenp; +- struct dentry *dentry; +- struct svc_export *exp = cd->rd_fhp->fh_export; +- u32 bmval0, bmval1; +- int nfserr = 0; ++ int nfserr = nfserr_toosmall; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { +@@ -1788,106 +1736,44 @@ + + buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); + if (buflen < 0) +- goto nospc; ++ goto fail; + + *p++ = xdr_one; /* mark entry present */ + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + +- /* +- * Now we come to the ugly part: writing the fattr for this entry. 
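For reference, nfsd4_encode_rdattr_error() above emits a self-contained fattr4 of exactly five 32-bit words (the buflen < 6 guard keeps one word spare):

    /*
     * word 0: 2                          bitmap length
     * word 1: FATTR4_WORD0_RDATTR_ERROR  bitmap word 0
     * word 2: 0                          bitmap word 1
     * word 3: 4                          attr data length, backfilled last
     * word 4: nfserr                     already big-endian, hence no htonl
     */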
+- */ +- bmval0 = cd->rd_bmval[0]; +- bmval1 = cd->rd_bmval[1]; +- if ((bmval0 & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_FILEID)) || bmval1) { +- /* +- * "Heavyweight" case: we have no choice except to +- * call nfsd4_encode_fattr(). +- */ +- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); +- if (IS_ERR(dentry)) { +- nfserr = nfserrno(PTR_ERR(dentry)); +- goto error; +- } +- +- exp_get(exp); +- if (d_mountpoint(dentry)) { +- if ((nfserr = nfsd_cross_mnt(cd->rd_rqstp, &dentry, +- &exp))) { +- /* +- * -EAGAIN is the only error returned from +- * nfsd_cross_mnt() and it indicates that an +- * up-call has been initiated to fill in the export +- * options on exp. When the answer comes back, +- * this call will be retried. +- */ +- dput(dentry); +- exp_put(exp); +- nfserr = nfserr_dropit; +- goto error; +- } +- +- } +- +- nfserr = nfsd4_encode_fattr(NULL, exp, +- dentry, p, &buflen, cd->rd_bmval, +- cd->rd_rqstp); +- dput(dentry); +- exp_put(exp); +- if (!nfserr) { +- p += buflen; +- goto out; +- } +- if (nfserr == nfserr_resource) +- goto nospc; +- +-error: ++ nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen); ++ switch (nfserr) { ++ case nfs_ok: ++ p += buflen; ++ break; ++ case nfserr_resource: ++ nfserr = nfserr_toosmall; ++ goto fail; ++ case nfserr_dropit: ++ goto fail; ++ default: + /* +- * If we get here, we experienced a miscellaneous +- * failure while writing the attributes. If the +- * client requested the RDATTR_ERROR attribute, ++ * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ +- if (!(bmval0 & FATTR4_WORD0_RDATTR_ERROR)) { +- cd->common.err = nfserr; +- return -EINVAL; +- } +- +- bmval0 = FATTR4_WORD0_RDATTR_ERROR; +- bmval1 = 0; +- /* falling through here will do the right thing... */ ++ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) ++ goto fail; ++ nfserr = nfserr_toosmall; ++ p = nfsd4_encode_rdattr_error(p, buflen, nfserr); ++ if (p == NULL) ++ goto fail; + } +- +- /* +- * In the common "lightweight" case, we avoid +- * the overhead of nfsd4_encode_fattr() by assembling +- * a small fattr by hand. 
+- */ +- if (buflen < 6) +- goto nospc; +- *p++ = htonl(2); +- *p++ = htonl(bmval0); +- *p++ = htonl(bmval1); +- +- attrlenp = p++; +- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) +- *p++ = nfserr; /* no htonl */ +- if (bmval0 & FATTR4_WORD0_FILEID) +- p = xdr_encode_hyper(p, (u64)ino); +- *attrlenp = htonl((char *)p - (char *)attrlenp - 4); +- +-out: + cd->buflen -= (p - cd->buffer); + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +- +-nospc: +- cd->common.err = nfserr_toosmall; ++fail: ++ cd->common.err = nfserr; + return -EINVAL; + } + +@@ -2081,8 +1967,8 @@ + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: +- RESERVE_SPACE(20 + sizeof(delegation_stateid_t)); +- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ RESERVE_SPACE(20 + sizeof(stateid_t)); ++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(0); + + /* +@@ -2095,8 +1981,8 @@ + ADJUST_ARGS(); + break; + case NFS4_OPEN_DELEGATE_WRITE: +- RESERVE_SPACE(32 + sizeof(delegation_stateid_t)); +- WRITEMEM(&open->op_delegate_stateid, sizeof(delegation_stateid_t)); ++ RESERVE_SPACE(32 + sizeof(stateid_t)); ++ WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + WRITE32(0); + + /* +@@ -2185,10 +2071,17 @@ + } + read->rd_vlen = v; + +- nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, +- read->rd_offset, +- read->rd_iov, read->rd_vlen, +- &maxcount); ++ if (read->rd_filp) ++ nfserr = nfsd_vfs_read(read->rd_rqstp, read->rd_fhp, ++ read->rd_filp, read->rd_offset, ++ read->rd_iov, read->rd_vlen, ++ &maxcount); ++ else ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, ++ read->rd_offset, ++ read->rd_iov, read->rd_vlen, ++ &maxcount); ++ + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; + if (nfserr) +@@ -2460,6 +2353,8 @@ + case OP_CREATE: + nfsd4_encode_create(resp, op->status, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ break; + case OP_GETATTR: + op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr); + break; +Index: linux-2.6.10/fs/nfsd/nfs4state.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4state.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4state.c 2005-04-05 14:49:13.421688912 +0800 +@@ -44,6 +44,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -56,9 +57,11 @@ + static u32 nfs4_reclaim_init = 0; + time_t boot_time; + static time_t grace_end = 0; ++static u32 first_run = 1; /* laundromat threads first run */ + static u32 current_clientid = 1; +-static u32 current_ownerid; +-static u32 current_fileid; ++static u32 current_ownerid = 1; ++static u32 current_fileid = 1; ++static u32 current_delegid = 1; + static u32 nfs4_init; + stateid_t zerostateid; /* bits all 0 */ + stateid_t onestateid; /* bits all 1 */ +@@ -70,14 +73,17 @@ + u32 del_perclient = 0; + u32 alloc_file = 0; + u32 free_file = 0; +-u32 alloc_sowner = 0; +-u32 free_sowner = 0; + u32 vfsopen = 0; + u32 vfsclose = 0; +-u32 alloc_lsowner= 0; ++u32 alloc_delegation= 0; ++u32 free_delegation= 0; + + /* forward declarations */ + struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); ++static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); ++static void release_delegation(struct nfs4_delegation *dp); ++static void release_stateid_lockowner(struct nfs4_stateid *open_stp); ++extern char recovery_dirname[]; + + /* Locking: + * +@@ -117,6 +123,112 @@ + static void release_stateid(struct nfs4_stateid *stp, int flags); + static 
void release_file(struct nfs4_file *fp); + ++/* ++ * Delegation state ++ */ ++ ++/* recall_lock protects the del_recall_lru */ ++spinlock_t recall_lock; ++static struct list_head del_recall_lru; ++ ++static struct nfs4_delegation * ++alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_file *fp = stp->st_file; ++ ++ dprintk("NFSD alloc_init_deleg\n"); ++ if ((dp = kmalloc(sizeof(struct nfs4_delegation), ++ GFP_KERNEL)) == NULL) ++ return dp; ++ INIT_LIST_HEAD(&dp->dl_del_perfile); ++ INIT_LIST_HEAD(&dp->dl_del_perclnt); ++ INIT_LIST_HEAD(&dp->dl_recall_lru); ++ dp->dl_client = clp; ++ dp->dl_file = fp; ++ dp->dl_flock = NULL; ++ dp->dl_stp = stp; ++ dp->dl_flags = 0; ++ dp->dl_type = type; ++ dp->dl_recall.cbr_dp = NULL; ++ dp->dl_recall.cbr_ident = 0; ++ dp->dl_recall.cbr_trunc = 0; ++ dp->dl_stateid.si_boot = boot_time; ++ dp->dl_stateid.si_stateownerid = current_delegid++; ++ dp->dl_stateid.si_fileid = 0; ++ dp->dl_stateid.si_generation = 0; ++ dp->dl_fhlen = current_fh->fh_handle.fh_size; ++ memcpy(dp->dl_fhval, ¤t_fh->fh_handle.fh_base, ++ current_fh->fh_handle.fh_size); ++ dp->dl_time = 0; ++ atomic_set(&dp->dl_state, NFS4_NO_RECALL); ++ atomic_set(&dp->dl_count, 1); ++ atomic_set(&dp->dl_recall_cnt, 0); ++ list_add(&dp->dl_del_perfile, &fp->fi_del_perfile); ++ list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt); ++ alloc_delegation++; ++ return dp; ++} ++ ++/* ++ * Free the delegation structure. ++ */ ++static void ++nfs4_free_delegation(struct nfs4_delegation *dp) ++{ ++ dprintk("NFSD: nfs4_free_delegation freeing dp %p\n",dp); ++ list_del(&dp->dl_recall_lru); ++ kfree(dp); ++ free_delegation++; ++} ++ ++/* release_delegation: ++ * ++ * lease_modify() is called to remove the FS_LEASE file_lock from ++ * the i_flock list, eventually calling nfsd's lock_manager ++ * fl_release_callback. ++ * ++ * call either: ++ * nfsd_close : if last close, locks_remove_flock calls lease_modify. ++ * otherwise, recalled state set to NFS4_RECALL_COMPLETE ++ * so that it will be reaped by the laundromat service. ++ * or ++ * remove_lease (calls time_out_lease which calls lease_modify). ++ * and nfs4_free_delegation. ++ * ++ * lock_kernel() protects dp->dl_flock which is set under the kernel lock ++ * by nfsd_copy_lock_deleg_callback and nfsd_release_deleg_callback. ++ * ++ */ ++ ++static void ++release_delegation(struct nfs4_delegation *dp) ++{ ++ /* delayed nfsd_close */ ++ if (dp->dl_flags && NFS4_DELAY_CLOSE) { ++ struct file *filp = dp->dl_stp->st_vfs_file; ++ ++ dprintk("NFSD: release_delegation CLOSE\n"); ++ release_stateid_lockowner(dp->dl_stp); ++ kfree(dp->dl_stp); ++ dp->dl_flags &= ~NFS4_DELAY_CLOSE; ++ dp->dl_stp = NULL; ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ nfsd_close(filp); ++ vfsclose++; ++ } else { ++ dprintk("NFSD: release_delegation remove lease dl_flock %p\n", ++ dp->dl_flock); ++ remove_lease(dp->dl_flock); ++ list_del_init(&dp->dl_del_perfile); ++ list_del_init(&dp->dl_del_perclnt); ++ /* dl_count > 0 => outstanding recall rpc */ ++ dprintk("NFSD: release_delegation free deleg dl_count %d\n", ++ atomic_read(&dp->dl_count)); ++ if (atomic_dec_and_test(&dp->dl_count)) ++ nfs4_free_delegation(dp); ++ } ++} + + /* + * SETCLIENTID state +@@ -148,7 +260,7 @@ + * for last close replay. 
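The delegation lifetime above follows a simple counting rule: alloc_init_deleg() hands the structure out with dl_count == 1, do_recall() (further down in this patch) takes a second reference for the callback RPC, and nfs4_free_delegation() runs only when the last reference is dropped inside release_delegation(). In outline, assuming the NFS4_DELAY_CLOSE test is meant as a bitwise flag check:

    dp = alloc_init_deleg(clp, stp, fh, type);	/* dl_count = 1 */

    atomic_inc(&dp->dl_count);			/* recall thread's reference */
    nfsd4_cb_recall(dp);

    if (dp->dl_flags & NFS4_DELAY_CLOSE) {	/* deferred nfsd_close() case */
    	/* close the file now; delegation reaped later by the laundromat */
    } else if (atomic_dec_and_test(&dp->dl_count)) {
    	nfs4_free_delegation(dp);		/* last reference gone */
    }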
+ */ + static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; +-static int reclaim_str_hashtbl_size; ++static int reclaim_str_hashtbl_size = 0; + static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; +@@ -213,12 +325,38 @@ + kfree(clp); + } + ++void ++put_nfs4_client(struct nfs4_client *clp) ++{ ++ if (atomic_dec_and_test(&clp->cl_count)) ++ free_client(clp); ++} ++ + static void + expire_client(struct nfs4_client *clp) + { + struct nfs4_stateowner *sop; ++ struct nfs4_delegation *dp; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ struct rpc_clnt *clnt = clp->cl_callback.cb_client; ++ ++ dprintk("NFSD: expire_client cl_count %d\n", ++ atomic_read(&clp->cl_count)); + +- dprintk("NFSD: expire_client\n"); ++ /* shutdown rpc client, ending any outstanding recall rpcs */ ++ if (atomic_read(&cb->cb_set) == 1 && clnt) { ++ rpc_shutdown_client(clnt); ++ clnt = clp->cl_callback.cb_client = NULL; ++ } ++ while (!list_empty(&clp->cl_del_perclnt)) { ++ dp = list_entry(clp->cl_del_perclnt.next, struct nfs4_delegation, dl_del_perclnt); ++ dprintk("NFSD: expire client. dp %p, dl_state %d, fp %p\n", ++ dp, atomic_read(&dp->dl_state), dp->dl_flock); ++ ++ /* force release of delegation. */ ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + list_del(&clp->cl_lru); +@@ -226,7 +364,7 @@ + sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient); + release_stateowner(sop); + } +- free_client(clp); ++ put_nfs4_client(clp); + } + + static struct nfs4_client * +@@ -235,9 +373,13 @@ + + if (!(clp = alloc_client(name))) + goto out; ++ atomic_set(&clp->cl_count, 1); ++ atomic_set(&clp->cl_callback.cb_set, 0); ++ clp->cl_callback.cb_parsed = 0; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_perclient); ++ INIT_LIST_HEAD(&clp->cl_del_perclnt); + INIT_LIST_HEAD(&clp->cl_lru); + out: + return clp; +@@ -420,17 +562,24 @@ + { + struct nfs4_callback *cb = &clp->cl_callback; + ++ /* Currently, we only support tcp for the callback channel */ ++ if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) ++ goto out_err; ++ + if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, +- &cb->cb_addr, &cb->cb_port))) { +- printk(KERN_INFO "NFSD: BAD callback address. 
client will not receive delegations\n"); +- cb->cb_parsed = 0; +- return; +- } +- cb->cb_netid.len = se->se_callback_netid_len; +- cb->cb_netid.data = se->se_callback_netid_val; ++ &cb->cb_addr, &cb->cb_port))) ++ goto out_err; + cb->cb_prog = se->se_callback_prog; + cb->cb_ident = se->se_callback_ident; + cb->cb_parsed = 1; ++ return; ++out_err: ++ printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " ++ "will not receive delegations\n", ++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); ++ ++ cb->cb_parsed = 0; ++ return; + } + + /* +@@ -707,6 +856,7 @@ + status = nfserr_clid_inuse; + else { + expire_client(conf); ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + status = nfs_ok; + } +@@ -724,6 +874,7 @@ + if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) { + status = nfserr_clid_inuse; + } else { ++ clp = conf; + status = nfs_ok; + } + goto out; +@@ -738,6 +889,7 @@ + status = nfserr_clid_inuse; + } else { + status = nfs_ok; ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + } + goto out; +@@ -757,7 +909,8 @@ + status = nfserr_inval; + goto out; + out: +- /* XXX if status == nfs_ok, probe callback path */ ++ if (!status) ++ nfsd4_probe_callback(clp); + nfs4_unlock_state(); + return status; + } +@@ -803,6 +956,7 @@ + if ((fp = kmalloc(sizeof(struct nfs4_file),GFP_KERNEL))) { + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_perfile); ++ INIT_LIST_HEAD(&fp->fi_del_perfile); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + fp->fi_inode = igrab(ino); + fp->fi_id = current_fileid++; +@@ -822,7 +976,7 @@ + while (!list_empty(&file_hashtbl[i])) { + fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash); + /* this should never be more than once... */ +- if (!list_empty(&fp->fi_perfile)) { ++ if (!list_empty(&fp->fi_perfile) || !list_empty(&fp->fi_del_perfile)) { + printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp); + } + release_file(fp); +@@ -830,15 +984,36 @@ + } + } + +-/* should use a slab cache */ ++kmem_cache_t *stateowner_slab = NULL; ++ ++int ++nfsd4_init_slabs(void) ++{ ++ stateowner_slab = kmem_cache_create("nfsd4_stateowners", ++ sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL); ++ if (stateowner_slab == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++int ++nfsd4_free_slabs(void) ++{ ++ int status = 0; ++ ++ if (stateowner_slab) ++ status = kmem_cache_destroy(stateowner_slab); ++ stateowner_slab = NULL; ++ return status; ++} ++ + void + nfs4_free_stateowner(struct kref *kref) + { + struct nfs4_stateowner *sop = + container_of(kref, struct nfs4_stateowner, so_ref); + kfree(sop->so_owner.data); +- kfree(sop); +- free_sowner++; ++ kmem_cache_free(stateowner_slab, sop); + } + + static inline struct nfs4_stateowner * +@@ -846,14 +1021,14 @@ + { + struct nfs4_stateowner *sop; + +- if ((sop = kmalloc(sizeof(struct nfs4_stateowner),GFP_KERNEL))) { ++ if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { + if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { + memcpy(sop->so_owner.data, owner->data, owner->len); + sop->so_owner.len = owner->len; + kref_init(&sop->so_ref); + return sop; + } +- kfree(sop); ++ kmem_cache_free(stateowner_slab, sop); + } + return NULL; + } +@@ -887,7 +1062,6 @@ + rp->rp_status = NFSERR_SERVERFAULT; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; +- alloc_sowner++; + return sop; + } + +@@ -957,14 +1131,29 @@ + __set_bit(open->op_share_deny, &stp->st_deny_bmap); + } + ++/* ++* Because nfsd_close() can call locks_remove_flock() which removes leases, ++* delay nfsd_close() 
for delegations from the nfsd_open() clientid ++* until the delegation is reaped. ++*/ + static void +-release_stateid(struct nfs4_stateid *stp, int flags) { ++release_stateid(struct nfs4_stateid *stp, int flags) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_file *fp = stp->st_file; + + list_del(&stp->st_hash); + list_del_perfile++; + list_del(&stp->st_perfile); + list_del(&stp->st_perfilestate); + if ((stp->st_vfs_set) && (flags & OPEN_STATE)) { ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ if(cmp_clid(&dp->dl_client->cl_clientid, ++ &stp->st_stateowner->so_client->cl_clientid)) { ++ dp->dl_flags |= NFS4_DELAY_CLOSE; ++ return; ++ } ++ } + release_stateid_lockowner(stp); + nfsd_close(stp->st_vfs_file); + vfsclose++; +@@ -1013,7 +1202,7 @@ + if (sop->so_confirmed && list_empty(&sop->so_perfilestate)) + move_to_close_lru(sop); + /* unused nfs4_file's are releseed. XXX slab cache? */ +- if (list_empty(&fp->fi_perfile)) { ++ if (list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) { + release_file(fp); + } + } +@@ -1141,6 +1330,100 @@ + } + } + ++/* ++ * Recall a delegation ++ */ ++static int ++do_recall(void *__dp) ++{ ++ struct nfs4_delegation *dp = __dp; ++ ++ atomic_inc(&dp->dl_count); ++ nfsd4_cb_recall(dp); ++ do_exit(0); ++ return 0; ++} ++ ++/* ++ * Spawn a thread to perform a recall on the delegation represented ++ * by the lease (file_lock) ++ * ++ * Called from break_lease() with lock_kernel() held, ++ * ++ */ ++static ++void nfsd_break_deleg_cb(struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp= (struct nfs4_delegation *)fl->fl_owner; ++ struct task_struct *t; ++ ++ dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); ++ if (!dp) ++ return; ++ ++ /* schedule delegation for recall */ ++ spin_lock(&recall_lock); ++ atomic_set(&dp->dl_state, NFS4_RECALL_IN_PROGRESS); ++ list_add_tail(&dp->dl_recall_lru, &del_recall_lru); ++ spin_unlock(&recall_lock); ++ ++ /* only place dl_time is set. protected by lock_kernel*/ ++ dp->dl_time = get_seconds(); ++ ++ /* XXX need to merge NFSD_LEASE_TIME with fs/locks.c:lease_break_time */ ++ fl->fl_break_time = jiffies + NFSD_LEASE_TIME * HZ; ++ ++ t = kthread_run(do_recall, dp, "%s", "nfs4_cb_recall"); ++ if (IS_ERR(t)) { ++ struct nfs4_client *clp = dp->dl_client; ++ ++ printk(KERN_INFO "NFSD: Callback thread failed for " ++ "for client (clientid %08x/%08x)\n", ++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); ++ } ++} ++ ++/* ++ * The file_lock is being reapd. ++ * ++ * Called by locks_free_lock() with lock_kernel() held. ++ */ ++static ++void nfsd_release_deleg_cb(struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; ++ ++ dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d, dl_state %d\n", fl,dp, atomic_read(&dp->dl_count), atomic_read(&dp->dl_state)); ++ ++ if (!(fl->fl_flags & FL_LEASE) || !dp) ++ return; ++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE); ++ dp->dl_flock = NULL; ++} ++ ++/* ++ * Set the delegation file_lock back pointer. ++ * ++ * Called from __setlease() with lock_kernel() held. 
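These three callbacks (break, release, copy, wired into nfsd_lease_mng_ops just below) sit on the same lease machinery that userspace drives through fcntl(F_SETLEASE): a conflicting open() breaks the lease and invokes fl_break, just as it would raise SIGIO for a userspace lease holder. A minimal, runnable userspace counterpart (the file path is arbitrary):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static volatile sig_atomic_t broken;

    static void on_break(int sig)
    {
    	broken = 1;			/* kernel is recalling the lease */
    }

    int main(void)
    {
    	int fd = open("/tmp/leased", O_RDONLY | O_CREAT, 0644);

    	if (fd < 0)
    		return 1;
    	signal(SIGIO, on_break);	/* default lease-break notification */
    	if (fcntl(fd, F_SETLEASE, F_RDLCK) < 0) {
    		perror("F_SETLEASE");
    		return 1;
    	}
    	/* a conflicting open(O_WRONLY) elsewhere fires SIGIO here */
    	while (!broken)
    		pause();
    	fcntl(fd, F_SETLEASE, F_UNLCK);	/* surrender, like a finished recall */
    	close(fd);
    	return 0;
    }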
++ */ ++static ++void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl) ++{ ++ struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner; ++ ++ dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp); ++ if (!dp) ++ return; ++ dp->dl_flock = new; ++} ++ ++struct lock_manager_operations nfsd_lease_mng_ops = { ++ .fl_break = nfsd_break_deleg_cb, ++ .fl_release_private = nfsd_release_deleg_cb, ++ .fl_copy_lock = nfsd_copy_lock_deleg_cb, ++}; ++ ++ + + /* + * nfsd4_process_open1() +@@ -1238,6 +1521,43 @@ + } + + static int ++nfs4_deleg_conflict(u32 share, u32 dtype) ++{ ++ return (((share & NFS4_SHARE_ACCESS_WRITE) && ++ dtype == NFS4_OPEN_DELEGATE_READ) || ++ ((share & NFS4_SHARE_ACCESS_READ) && ++ dtype == NFS4_OPEN_DELEGATE_WRITE)); ++} ++ ++#define DONT_DELEGATE 8 ++ ++/* ++ * nfs4_check_deleg_recall() ++ * ++ * Test any delegation that is currently within an incompleted recalled ++ * state, and return NFSERR_DELAY for conflicting open share. ++ * flag is set to DONT_DELEGATE for shares that match the deleg type. ++ */ ++static int ++nfs4_check_deleg_recall(struct nfs4_file *fp, struct nfsd4_open *op, int *flag) ++{ ++ struct nfs4_delegation *dp; ++ int status = 0; ++ ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ dprintk("NFSD: found delegation %p with dl_state %d\n", ++ dp, atomic_read(&dp->dl_state)); ++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_IN_PROGRESS) { ++ if(nfs4_deleg_conflict(op->op_share_access, dp->dl_type)) ++ status = nfserr_jukebox; ++ else ++ *flag = DONT_DELEGATE; ++ } ++ } ++ return status; ++} ++ ++static int + nfs4_check_open(struct nfs4_file *fp, struct nfs4_stateowner *sop, struct nfsd4_open *open, struct nfs4_stateid **stpp) + { + struct nfs4_stateid *local; +@@ -1339,6 +1659,65 @@ + } + + /* ++ * Attempt to hand out a delegation. ++ */ ++static void ++nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp, int *flag) ++{ ++ struct nfs4_delegation *dp; ++ struct nfs4_stateowner *sop = stp->st_stateowner; ++ struct nfs4_callback *cb = &sop->so_client->cl_callback; ++ struct file_lock fl, *flp = &fl; ++ int status; ++ ++ if (*flag == DONT_DELEGATE) { ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ return; ++ } ++ ++ /* set flag */ ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ if (open->op_claim_type != NFS4_OPEN_CLAIM_NULL ++ || !atomic_read(&cb->cb_set) || !sop->so_confirmed) ++ return; ++ ++ if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) ++ *flag = NFS4_OPEN_DELEGATE_READ; ++ ++ else if (!(open->op_share_access & NFS4_SHARE_ACCESS_READ)) ++ *flag = NFS4_OPEN_DELEGATE_WRITE; ++ ++ if (!(dp = alloc_init_deleg(sop->so_client, stp, fh, *flag))) ++ return; ++ locks_init_lock(&fl); ++ fl.fl_lmops = &nfsd_lease_mng_ops; ++ fl.fl_flags = FL_LEASE; ++ fl.fl_end = OFFSET_MAX; ++ fl.fl_owner = (fl_owner_t)dp; ++ fl.fl_file = stp->st_vfs_file; ++ fl.fl_pid = current->tgid; ++ ++ if ((status = setlease(stp->st_vfs_file, ++ *flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK, &flp))) { ++ dprintk("NFSD: setlease failed [%d], no delegation\n", status); ++ list_del(&dp->dl_del_perfile); ++ list_del(&dp->dl_del_perclnt); ++ kfree(dp); ++ free_delegation++; ++ *flag = NFS4_OPEN_DELEGATE_NONE; ++ return; ++ } ++ ++ memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); ++ ++ dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", ++ dp->dl_stateid.si_boot, ++ dp->dl_stateid.si_stateownerid, ++ dp->dl_stateid.si_fileid, ++ dp->dl_stateid.si_generation); ++} ++ ++/* + * called with nfs4_lock_state() held. + */ + int +@@ -1346,28 +1725,24 @@ + { + struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_file *fp = NULL; +- struct inode *ino; ++ struct inode *ino = current_fh->fh_dentry->d_inode; + unsigned int fi_hashval; + struct nfs4_stateid *stp = NULL; +- int status; +- +- status = nfserr_resource; +- if (!sop) +- return status; +- +- ino = current_fh->fh_dentry->d_inode; ++ int status, delegflag = 0; + + status = nfserr_inval; + if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny)) + goto out; + /* +- * Lookup file; if found, lookup stateid and check open request; +- * not found, create ++ * Lookup file; if found, lookup stateid and check open request, ++ * and check for delegations in the process of being recalled. ++ * If not found, create the nfs4_file struct + */ + fi_hashval = file_hashval(ino); + if (find_file(fi_hashval, ino, &fp)) { +- status = nfs4_check_open(fp, sop, open, &stp); +- if (status) ++ if ((status = nfs4_check_open(fp, sop, open, &stp))) ++ goto out; ++ if ((status = nfs4_check_deleg_recall(fp, open, &delegflag))) + goto out; + } else { + status = nfserr_resource; +@@ -1407,14 +1782,20 @@ + } + } + } +- dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", +- stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, +- stp->st_stateid.si_fileid, stp->st_stateid.si_generation); +- + memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + +- open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; ++ /* ++ * Attempt to hand out a delegation. No error return, because the ++ * OPEN succeeds even if we fail. 
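The share-access test in nfs4_open_delegation() above condenses to a small pure decision, shown here as a hypothetical helper (no such function exists in the patch; read+write opens simply leave the flag at NONE):

    static int deleg_type(u32 share_access)
    {
    	if (!(share_access & NFS4_SHARE_ACCESS_WRITE))
    		return NFS4_OPEN_DELEGATE_READ;		/* read-only open */
    	if (!(share_access & NFS4_SHARE_ACCESS_READ))
    		return NFS4_OPEN_DELEGATE_WRITE;	/* write-only open */
    	return NFS4_OPEN_DELEGATE_NONE;			/* read+write open */
    }

The matching lease type follows directly: F_RDLCK for a read delegation, F_WRLCK otherwise, which is the ?: expression passed to setlease() above.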
++ */ ++ nfs4_open_delegation(current_fh, open, stp, &delegflag); ++ open->op_delegate_type = delegflag; ++ + status = nfs_ok; ++ ++ dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", ++ stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, ++ stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + out: + /* take the opportunity to clean up unused state */ + if (fp && list_empty(&fp->fi_perfile)) +@@ -1480,14 +1861,26 @@ + { + struct nfs4_client *clp; + struct nfs4_stateowner *sop; ++ struct nfs4_delegation *dp; + struct list_head *pos, *next; + time_t cutoff = get_seconds() - NFSD_LEASE_TIME; + time_t t, clientid_val = NFSD_LEASE_TIME; +- time_t u, close_val = NFSD_LEASE_TIME; ++ time_t u, test_val = NFSD_LEASE_TIME; + + nfs4_lock_state(); + +- dprintk("NFSD: laundromat service - starting, examining clients\n"); ++ dprintk("NFSD: laundromat service - starting\n"); ++ /* Remove clientid's from recovery directory */ ++ if (first_run) { ++ int status; ++ ++ dprintk("NFSD: laundromat service - FIRST_RUN\n"); ++ status = nfsd4_list_rec_dir(1); ++ if (status < 0) ++ printk("NFSD: error clearing recovery directory %s\n", ++ recovery_dirname); ++ first_run = 0; ++ } + list_for_each_safe(pos, next, &client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { +@@ -1498,14 +1891,34 @@ + } + dprintk("NFSD: purging unused client (clientid %08x)\n", + clp->cl_clientid.cl_id); ++ if (clp->cl_firststate) ++ nfsd4_remove_clid_file(clp); + expire_client(clp); + } ++ spin_lock(&recall_lock); ++ list_for_each_safe(pos, next, &del_recall_lru) { ++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); ++ if (atomic_read(&dp->dl_state) == NFS4_RECALL_COMPLETE) ++ goto reap; ++ if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { ++ u = dp->dl_time - cutoff; ++ if (test_val > u) ++ test_val = u; ++ break; ++ } ++reap: ++ dprintk("NFSD: purging unused delegation dp %p, fp %p\n", ++ dp, dp->dl_flock); ++ release_delegation(dp); ++ } ++ spin_unlock(&recall_lock); ++ test_val = NFSD_LEASE_TIME; + list_for_each_safe(pos, next, &close_lru) { + sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); + if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { + u = sop->so_time - cutoff; +- if (close_val > u) +- close_val = u; ++ if (test_val > u) ++ test_val = u; + break; + } + dprintk("NFSD: purging unused open stateowner (so_id %d)\n", +@@ -1564,21 +1977,81 @@ + return 1; + } + ++static inline int ++access_permit_read(unsigned long access_bmap) ++{ ++ return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || ++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); ++} ++ ++static inline int ++access_permit_write(unsigned long access_bmap) ++{ ++ return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || ++ test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); ++} ++ ++static ++int nfs4_check_openmode(struct nfs4_stateid *stp, int flags) ++{ ++ int status = nfserr_openmode; ++ ++ if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) ++ goto out; ++ if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) ++ goto out; ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static int ++nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) ++{ ++ int status = nfserr_openmode; ++ ++ if ((flags & WR_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) ++ goto out; ++ if ((flags & RD_STATE) & (dp->dl_type == NFS4_OPEN_DELEGATE_WRITE)) ++ goto out; ++ status = nfs_ok; 
++out: ++ return status; ++} ++ ++static int ++nfs4_rw_grace(int flags) ++{ ++ return (nfs4_in_grace() && ((flags & RD_STATE) || (flags & WR_STATE))); ++} ++ ++/* ++ * Allow READ/WRITE during grace period on recovered state only for files ++ * that are not able to provide mandatory locking. ++ */ ++static int ++nfs4_check_rw_grace(umode_t mode, int flags) ++{ ++ return (nfs4_rw_grace(flags) && ((mode & S_IXGRP) && (mode & S_ISGID))); ++} + + /* + * Checks for stateid operations + */ + int +-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct nfs4_stateid **stpp) ++nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) + { +- struct nfs4_stateid *stp; ++ struct nfs4_stateid *stp = NULL; ++ struct nfs4_delegation *dp = NULL; ++ stateid_t *stidp; ++ struct inode *ino = current_fh->fh_dentry->d_inode; + int status; + + dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", + stateid->si_boot, stateid->si_stateownerid, + stateid->si_fileid, stateid->si_generation); +- +- *stpp = NULL; ++ if (filpp) ++ *filpp = NULL; + + /* STALE STATEID */ + status = nfserr_stale_stateid; +@@ -1587,33 +2060,58 @@ + + /* BAD STATEID */ + status = nfserr_bad_stateid; +- if (!(stp = find_stateid(stateid, flags))) { +- dprintk("NFSD: preprocess_stateid_op: no open stateid!\n"); +- goto out; +- } +- if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { +- dprintk("NFSD: preprocess_stateid_op: fh-stateid mismatch!\n"); +- stp->st_vfs_set = 0; +- goto out; +- } +- if (!stp->st_stateowner->so_confirmed) { +- dprintk("preprocess_stateid_op: lockowner not confirmed yet!\n"); +- goto out; ++ if (!stateid->si_fileid) { /* delegation stateid */ ++ ++ if(!(dp = find_delegation_stateid(ino, stateid))) { ++ dprintk("NFSD: delegation stateid not found\n"); ++ if (nfs4_rw_grace(flags)) ++ status = nfserr_grace; ++ goto out; ++ } ++ stidp = &dp->dl_stateid; ++ } else { /* open or lock stateid */ ++ if (!(stp = find_stateid(stateid, flags))) { ++ dprintk("NFSD: open or lock stateid not found\n"); ++ if (nfs4_rw_grace(flags)) ++ status = nfserr_grace; ++ goto out; ++ } ++ if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) ++ goto out; ++ if (!stp->st_stateowner->so_confirmed) ++ goto out; ++ stidp = &stp->st_stateid; + } +- if (stateid->si_generation > stp->st_stateid.si_generation) { +- dprintk("preprocess_stateid_op: future stateid?!\n"); ++ if (stateid->si_generation > stidp->si_generation) + goto out; +- } + + /* OLD STATEID */ + status = nfserr_old_stateid; +- if (stateid->si_generation < stp->st_stateid.si_generation) { +- dprintk("preprocess_stateid_op: old stateid!\n"); ++ if (stateid->si_generation < stidp->si_generation) + goto out; ++ ++ status = nfserr_grace; ++ if (nfs4_check_rw_grace(ino->i_mode, flags)) ++ goto out; ++ ++ if (stp) { ++ renew_client(stp->st_stateowner->so_client); ++ if ((status = nfs4_check_openmode(stp,flags))) ++ goto out; ++ if (filpp) ++ *filpp = stp->st_vfs_file; ++ } else if (dp) { ++ renew_client(dp->dl_client); ++ if ((status = nfs4_check_delegmode(dp, flags))) ++ goto out; ++ if (flags & DELEG_RET) { ++ atomic_set(&dp->dl_state,NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } ++ if (filpp && dp && dp->dl_stp) ++ *filpp = dp->dl_stp->st_vfs_file; + } +- *stpp = stp; + status = nfs_ok; +- renew_client(stp->st_stateowner->so_client); + out: + return status; + } +@@ -1750,17 +2248,6 @@ + goto out; + } + +-/* +- * eventually, this will perform an upcall to the 'state 
daemon' as well as +- * set the cl_first_state field. +- */ +-void +-first_state(struct nfs4_client *clp) +-{ +- if (!clp->cl_first_state) +- clp->cl_first_state = get_seconds(); +-} +- + int + nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc) + { +@@ -1793,8 +2280,16 @@ + stp->st_stateid.si_stateownerid, + stp->st_stateid.si_fileid, + stp->st_stateid.si_generation); +- status = nfs_ok; +- first_state(sop->so_client); ++ ++ if (!sop->so_client->cl_firststate) { ++ int err = nfsd4_create_clid_file(sop->so_client); ++ if (!err) { ++ sop->so_client->cl_firststate = 1; ++ dprintk("NFSD: OPEN_CONFIRM firststate set [%.*s]\n", ++ sop->so_client->cl_name.len, ++ sop->so_client->cl_name.data); ++ } ++ } + out: + if (oc->oc_stateowner) + nfs4_get_stateowner(oc->oc_stateowner); +@@ -1912,6 +2407,22 @@ + return status; + } + ++int ++nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr) ++{ ++ int status; ++ ++ if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) ++ goto out; ++ ++ nfs4_lock_state(); ++ status = nfs4_preprocess_stateid_op(current_fh, &dr->dr_stateid, DELEG_RET, NULL); ++ nfs4_unlock_state(); ++out: ++ return status; ++} ++ ++ + /* + * Lock owner state (byte-range locks) + */ +@@ -1938,7 +2449,7 @@ + unsigned int hashval; + + dprintk("NFSD: find_stateid flags 0x%x\n",flags); +- if ((flags & LOCK_STATE) || (flags & RDWR_STATE)) { ++ if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && +@@ -1946,7 +2457,7 @@ + return local; + } + } +- if ((flags & OPEN_STATE) || (flags & RDWR_STATE)) { ++ if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { + if ((local->st_stateid.si_stateownerid == st_id) && +@@ -1958,6 +2469,30 @@ + return NULL; + } + ++static struct nfs4_delegation * ++find_delegation_stateid(struct inode *ino, stateid_t *stid) ++{ ++ struct nfs4_delegation *dp = NULL; ++ struct nfs4_file *fp = NULL; ++ u32 st_id; ++ unsigned int fi_hashval; ++ ++ dprintk("NFSD:find_delegation_stateid ino %p, stid %p\n",ino,stid); ++ ++ if(!ino || !stid) ++ return NULL; ++ st_id = stid->si_stateownerid; ++ fi_hashval = file_hashval(ino); ++ if (find_file(fi_hashval, ino, &fp)) { ++ list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { ++ if(dp->dl_stateid.si_stateownerid == st_id) { ++ dprintk("NFSD: find_delegation dp %p\n",dp); ++ return dp; ++ } ++ } ++ } ++ return NULL; ++} + + /* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that +@@ -2085,7 +2620,6 @@ + rp->rp_status = NFSERR_SERVERFAULT; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; +- alloc_lsowner++; + return sop; + } + +@@ -2558,22 +3092,22 @@ + /* + * failure => all reset bets are off, nfserr_no_grace... 
+ */ +-static int +-nfs4_client_to_reclaim(struct nfs4_client *clp) ++int ++nfs4_client_to_reclaim(char *name, int namlen) + { + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + +- crp = alloc_reclaim(clp->cl_name.len); ++ dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", namlen, name); ++ crp = alloc_reclaim(namlen); + if (!crp) + return 0; +- strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len); ++ strhashval = clientstr_hashval(name, namlen); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); +- memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len); +- crp->cr_name.len = clp->cl_name.len; +- crp->cr_first_state = clp->cl_first_state; +- crp->cr_expired = 0; ++ memcpy(crp->cr_name.data, name, namlen); ++ crp->cr_name.len = namlen; ++ reclaim_str_hashtbl_size++; + return 1; + } + +@@ -2618,6 +3152,9 @@ + if (!client) + return NULL; + ++ dprintk("NFSD: nfs4_find_reclaim_client for %.*s\n", ++ clp->cl_name.len, clp->cl_name.data); ++ + /* find clp->cl_name in reclaim_str_hashtbl */ + strhashval = clientstr_hashval(client->cl_name.data, + client->cl_name.len); +@@ -2639,8 +3176,6 @@ + + if ((crp = nfs4_find_reclaim_client(clid)) == NULL) + return nfserr_reclaim_bad; +- if (crp->cr_expired) +- return nfserr_no_grace; + return nfs_ok; + } + +@@ -2657,10 +3192,18 @@ + + if (nfs4_init) + return; ++ if (nfsd4_init_slabs()) ++ BUG(); /* XXXXXX!!! */ + if (!nfs4_reclaim_init) { ++ int status; ++ + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); + reclaim_str_hashtbl_size = 0; ++ nfsd4_init_rec_dir(recovery_dirname); ++ status = nfsd4_list_rec_dir(0); ++ if (status) ++ printk("NFSD: Failure in reading recovery data\n"); + nfs4_reclaim_init = 1; + } + for (i = 0; i < CLIENT_HASH_SIZE; i++) { +@@ -2689,6 +3232,8 @@ + + INIT_LIST_HEAD(&close_lru); + INIT_LIST_HEAD(&client_lru); ++ INIT_LIST_HEAD(&del_recall_lru); ++ spin_lock_init(&recall_lock); + boot_time = get_seconds(); + grace_time = max(old_lease_time, lease_time); + if (reclaim_str_hashtbl_size == 0) +@@ -2725,6 +3270,15 @@ + { + int i; + struct nfs4_client *clp = NULL; ++ struct nfs4_delegation *dp = NULL; ++ struct nfs4_stateowner *sop = NULL; ++ struct list_head *pos, *next; ++ ++ list_for_each_safe(pos, next, &close_lru) { ++ sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); ++ list_del(&sop->so_close_lru); ++ nfs4_put_stateowner(sop); ++ } + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&conf_id_hashtbl[i])) { +@@ -2736,20 +3290,31 @@ + expire_client(clp); + } + } ++ spin_lock(&recall_lock); ++ list_for_each_safe(pos, next, &del_recall_lru) { ++ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ release_delegation(dp); ++ } ++ spin_unlock(&recall_lock); ++ + release_all_files(); + cancel_delayed_work(&laundromat_work); + flush_scheduled_work(); + nfs4_init = 0; ++ nfs4_reclaim_init = 0; + dprintk("NFSD: list_add_perfile %d list_del_perfile %d\n", + list_add_perfile, list_del_perfile); + dprintk("NFSD: add_perclient %d del_perclient %d\n", + add_perclient, del_perclient); + dprintk("NFSD: alloc_file %d free_file %d\n", + alloc_file, free_file); +- dprintk("NFSD: alloc_sowner %d alloc_lsowner %d free_sowner %d\n", +- alloc_sowner, alloc_lsowner, free_sowner); + dprintk("NFSD: vfsopen %d vfsclose %d\n", + vfsopen, vfsclose); ++ dprintk("NFSD: alloc_delegation %d free_delegation %d\n", ++ alloc_delegation, free_delegation); 
++ alloc_delegation, free_delegation);
++ if (nfsd4_free_slabs()) ++ BUG(); /* XXX? */ + } + + void +@@ -2801,11 +3366,10 @@ + /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */ + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) { +- if (!nfs4_client_to_reclaim(clp)) { ++ if (!nfs4_client_to_reclaim(clp->cl_name.data, clp->cl_name.len)) { + nfs4_release_reclaim(); + goto init_state; + } +- reclaim_str_hashtbl_size++; + } + } + init_state: +Index: linux-2.6.10/fs/nfsd/nfsproc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfsproc.c 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfsproc.c 2005-04-05 14:49:13.426688152 +0800 +@@ -586,7 +586,6 @@ + { nfserr_dquot, -EDQUOT }, + #endif + { nfserr_stale, -ESTALE }, +- { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_dropit, -EAGAIN }, + { nfserr_dropit, -ENOMEM }, +Index: linux-2.6.10/fs/nfsd/nfs4acl.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4acl.c 2004-12-25 05:34:29.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4acl.c 2005-04-05 14:49:13.429687696 +0800 +@@ -89,6 +89,8 @@ + return ret; + } + ++/* modify functions to take NFS errors */ ++ + static int + mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) + { +Index: linux-2.6.10/fs/nfsd/nfs4idmap.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4idmap.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4idmap.c 2005-04-05 14:49:13.414689976 +0800 +@@ -78,9 +78,9 @@ + + #define DefineSimpleCacheLookupMap(STRUCT, FUNC) \ + DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ +- (struct STRUCT *item, int set), /*no setup */, \ ++ (struct STRUCT *item, int set), \ + & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + /* Common entry handling */ + +Index: linux-2.6.10/fs/nfsd/vfs.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/vfs.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/vfs.c 2005-04-05 14:49:13.417689520 +0800 +@@ -304,6 +304,8 @@ + * we need to break all leases. + */ + err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); ++ if (err == -EWOULDBLOCK) ++ err = -ETIMEDOUT; + if (err) /* ENOMEM or EWOULDBLOCK */ + goto out_nfserr; + +@@ -678,6 +680,8 @@ + * This may block while leases are broken. + */ + err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? 
FMODE_WRITE : 0)); ++ if (err == -EWOULDBLOCK) ++ err = -ETIMEDOUT; + if (err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + +@@ -822,21 +826,34 @@ + nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *count) + { +- struct raparms *ra; +- mm_segment_t oldfs; + int err; + struct file *file; +- struct inode *inode; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + if (err) + goto out; ++ err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); ++ ++ nfsd_close(file); ++out: ++ return err; ++} ++ ++int ++nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, ++ loff_t offset, struct kvec *vec, int vlen, unsigned long *count) ++{ ++ struct inode *inode; ++ struct raparms *ra; ++ mm_segment_t oldfs; ++ int err; ++ + err = nfserr_perm; + inode = file->f_dentry->d_inode; + #ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_read(inode, offset, *count))) +- goto out_close; ++ goto out; + #endif + + /* Get readahead parameters */ +@@ -872,8 +889,6 @@ + dnotify_parent(file->f_dentry, DN_ACCESS); + } else + err = nfserrno(err); +-out_close: +- nfsd_close(file); + out: + return err; + } +@@ -888,25 +903,40 @@ + struct kvec *vec, int vlen, + unsigned long cnt, int *stablep) + { +- struct svc_export *exp; + struct file *file; +- struct dentry *dentry; +- struct inode *inode; +- mm_segment_t oldfs; + int err = 0; +- int stable = *stablep; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + if (err) + goto out; + if (!cnt) + goto out_close; ++ ++ err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); ++out_close: ++ nfsd_close(file); ++out: ++ return err; ++} ++ ++int ++nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, ++ loff_t offset, struct kvec *vec, int vlen, ++ unsigned long cnt, int *stablep) ++{ ++ struct svc_export *exp; ++ struct dentry *dentry; ++ struct inode *inode; ++ mm_segment_t oldfs; ++ int err = 0; ++ int stable = *stablep; ++ + err = nfserr_perm; + + #ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_write(file->f_dentry->d_inode, offset, cnt))) +- goto out_close; ++ goto out; + #endif + + dentry = file->f_dentry; +@@ -993,13 +1023,10 @@ + err = 0; + else + err = nfserrno(err); +-out_close: +- nfsd_close(file); + out: + return err; + } + +- + #ifdef CONFIG_NFSD_V3 + /* + * Commit all pending writes to stable storage. +Index: linux-2.6.10/fs/nfsd/nfs4callback.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4callback.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4callback.c 2005-04-05 14:49:13.428687848 +0800 +@@ -0,0 +1,589 @@ ++/* ++ * linux/fs/nfsd/nfs4callback.c ++ * ++ * Copyright (c) 2001 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Kendrick Smith ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inet.h>
++#include <linux/errno.h>
++#include <linux/delay.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfsd/state.h>
++#include <linux/sunrpc/sched.h>
++#include <linux/nfs4.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++#define NFSPROC4_CB_NULL 0
++#define NFSPROC4_CB_COMPOUND 1
++
++/* declarations */
++static void nfs4_cb_null(struct rpc_task *task);
++extern spinlock_t recall_lock;
++
++/* Index of predefined Linux callback client operations */
++
++enum {
++ NFSPROC4_CLNT_CB_NULL = 0,
++ NFSPROC4_CLNT_CB_RECALL,
++};
++
++enum nfs_cb_opnum4 {
++ OP_CB_RECALL = 4,
++};
++
++#define NFS4_MAXTAGLEN 20
++
++#define NFS4_enc_cb_null_sz 0
++#define NFS4_dec_cb_null_sz 0
++#define cb_compound_enc_hdr_sz 4
++#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
++#define op_enc_sz 1
++#define op_dec_sz 2
++#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
++#define enc_stateid_sz 16
++#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
++ 1 + enc_stateid_sz + \
++ enc_nfs4_fh_sz)
++
++#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
++ op_dec_sz)
++
++/*
++* Generic encode routines from fs/nfs/nfs4xdr.c
++*/
++static inline u32 *
++xdr_writemem(u32 *p, const void *ptr, int nbytes)
++{
++ int tmp = XDR_QUADLEN(nbytes);
++ if (!tmp)
++ return p;
++ p[tmp-1] = 0;
++ memcpy(p, ptr, nbytes);
++ return p + tmp;
++}
++
++#define WRITE32(n) *p++ = htonl(n)
++#define WRITEMEM(ptr,nbytes) do { \
++ p = xdr_writemem(p, ptr, nbytes); \
++} while (0)
++#define RESERVE_SPACE(nbytes) do { \
++ p = xdr_reserve_space(xdr, nbytes); \
++ if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
++ BUG_ON(!p); \
++} while (0)
++
++/*
++ * Generic decode routines from fs/nfs/nfs4xdr.c
++ */
++#define DECODE_TAIL \
++ status = 0; \
++out: \
++ return status; \
++xdr_error: \
++ dprintk("NFSD: xdr error!
(%s:%d)\n", __FILE__, __LINE__); \ ++ status = -EIO; \ ++ goto out ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define READTIME(x) do { \ ++ p++; \ ++ (x.tv_sec) = ntohl(*p++); \ ++ (x.tv_nsec) = ntohl(*p++); \ ++} while (0) ++#define READ_BUF(nbytes) do { \ ++ p = xdr_inline_decode(xdr, nbytes); \ ++ if (!p) { \ ++ dprintk("NFSD: %s: reply buffer overflowed in line %d.", \ ++ __FUNCTION__, __LINE__); \ ++ return -EIO; \ ++ } \ ++} while (0) ++ ++struct nfs4_cb_compound_hdr { ++ int status; ++ u32 ident; ++ u32 nops; ++ u32 taglen; ++ char * tag; ++}; ++ ++static struct { ++int stat; ++int errno; ++} nfs_cb_errtbl[] = { ++ { NFS4_OK, 0 }, ++ { NFS4ERR_PERM, EPERM }, ++ { NFS4ERR_NOENT, ENOENT }, ++ { NFS4ERR_IO, EIO }, ++ { NFS4ERR_NXIO, ENXIO }, ++ { NFS4ERR_ACCESS, EACCES }, ++ { NFS4ERR_EXIST, EEXIST }, ++ { NFS4ERR_XDEV, EXDEV }, ++ { NFS4ERR_NOTDIR, ENOTDIR }, ++ { NFS4ERR_ISDIR, EISDIR }, ++ { NFS4ERR_INVAL, EINVAL }, ++ { NFS4ERR_FBIG, EFBIG }, ++ { NFS4ERR_NOSPC, ENOSPC }, ++ { NFS4ERR_ROFS, EROFS }, ++ { NFS4ERR_MLINK, EMLINK }, ++ { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, ++ { NFS4ERR_NOTEMPTY, ENOTEMPTY }, ++ { NFS4ERR_DQUOT, EDQUOT }, ++ { NFS4ERR_STALE, ESTALE }, ++ { NFS4ERR_BADHANDLE, EBADHANDLE }, ++ { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, ++ { NFS4ERR_NOTSUPP, ENOTSUPP }, ++ { NFS4ERR_TOOSMALL, ETOOSMALL }, ++ { NFS4ERR_SERVERFAULT, ESERVERFAULT }, ++ { NFS4ERR_BADTYPE, EBADTYPE }, ++ { NFS4ERR_LOCKED, EAGAIN }, ++ { NFS4ERR_RESOURCE, EREMOTEIO }, ++ { NFS4ERR_SYMLINK, ELOOP }, ++ { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, ++ { NFS4ERR_DEADLOCK, EDEADLK }, ++ { -1, EIO } ++}; ++ ++static int ++nfs_cb_stat_to_errno(int stat) ++{ ++ int i; ++ for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { ++ if (nfs_cb_errtbl[i].stat == stat) ++ return nfs_cb_errtbl[i].errno; ++ } ++ /* If we cannot translate the error, the recovery routines should ++ * handle it. ++ * Note: remaining NFSv4 error codes have values > 10000, so should ++ * not conflict with native Linux error codes. 
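++ * For example, nfs_cb_stat_to_errno(NFS4ERR_BADHANDLE) maps through
++ * the table above to EBADHANDLE (decode_cb_op_hdr() then returns it
++ * negated), while an unlisted status such as NFS4ERR_BAD_SEQID falls
++ * off the end of the table and is handed back unchanged.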
++ */ ++ return stat; ++} ++ ++/* ++ * XDR encode ++ */ ++ ++static int ++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 * p; ++ ++ RESERVE_SPACE(16); ++ WRITE32(0); /* tag length is always 0 */ ++ WRITE32(NFS4_MINOR_VERSION); ++ WRITE32(hdr->ident); ++ WRITE32(hdr->nops); ++ return 0; ++} ++ ++static int ++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) ++{ ++ u32 *p; ++ int len = cb_rec->cbr_fhlen; ++ ++ RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); ++ WRITE32(OP_CB_RECALL); ++ WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); ++ WRITE32(cb_rec->cbr_trunc); ++ WRITE32(len); ++ WRITEMEM(cb_rec->cbr_fhval, len); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p) ++{ ++ struct xdr_stream xdrs, *xdr = &xdrs; ++ ++ xdr_init_encode(&xdrs, &req->rq_snd_buf, p); ++ RESERVE_SPACE(0); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr = { ++ .nops = 1, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ return (encode_cb_recall(&xdr, args)); ++} ++ ++ ++static int ++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ ++ u32 *p; ++ ++ READ_BUF(8); ++ READ32(hdr->status); ++ READ32(hdr->taglen); ++ READ_BUF(hdr->taglen + 4); ++ hdr->tag = (char *)p; ++ p += XDR_QUADLEN(hdr->taglen); ++ READ32(hdr->nops); ++ return 0; ++} ++ ++static int ++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) ++{ ++ u32 *p; ++ u32 op; ++ int32_t nfserr; ++ ++ READ_BUF(8); ++ READ32(op); ++ if (op != expected) { ++ dprintk("NFSD: decode_cb_op_hdr: Callback server returned " ++ " operation %d but we issued a request for %d\n", ++ op, expected); ++ return -EIO; ++ } ++ READ32(nfserr); ++ if (nfserr != NFS_OK) ++ return -nfs_cb_stat_to_errno(nfserr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p) ++{ ++ return 0; ++} ++ ++static int ++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); ++out : ++ return status; ++} ++ ++/* ++ * RPC procedure tables ++ */ ++#ifndef MAX ++# define MAX(a, b) (((a) > (b))? 
(a) : (b)) ++#endif ++ ++#define PROC(proc, call, argtype, restype) \ ++[NFSPROC4_CLNT_##proc] = { \ ++ .p_proc = NFSPROC4_CB_##call, \ ++ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ ++ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ ++ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ ++} ++ ++struct rpc_procinfo nfs4_cb_procedures[] = { ++ PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), ++ PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++}; ++ ++struct rpc_version nfs_cb_version4 = { ++ .number = 1, ++ .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]), ++ .procs = nfs4_cb_procedures ++}; ++ ++static struct rpc_version * nfs_cb_version[] = { ++ NULL, ++ &nfs_cb_version4, ++}; ++ ++/* ++ * Use the SETCLIENTID credential ++ */ ++struct rpc_cred * ++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) ++{ ++ struct auth_cred acred; ++ struct rpc_clnt *clnt = clp->cl_callback.cb_client; ++ struct rpc_cred *ret = NULL; ++ ++ if (!clnt) ++ goto out; ++ get_group_info(clp->cl_cred.cr_group_info); ++ acred.uid = clp->cl_cred.cr_uid; ++ acred.gid = clp->cl_cred.cr_gid; ++ acred.group_info = clp->cl_cred.cr_group_info; ++ ++ dprintk("NFSD: looking up %s cred\n", ++ clnt->cl_auth->au_ops->au_name); ++ ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags); ++ put_group_info(clp->cl_cred.cr_group_info); ++out: ++ return ret; ++} ++ ++/* ++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... ++ */ ++void ++nfsd4_probe_callback(struct nfs4_client *clp) ++{ ++ struct sockaddr_in addr; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ struct rpc_timeout timeparms; ++ struct rpc_xprt * xprt; ++ struct rpc_program * program = &cb->cb_program; ++ struct rpc_stat * stat = &cb->cb_stat; ++ struct rpc_clnt * clnt; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], ++ .rpc_argp = clp, ++ }; ++ char hostname[32]; ++ int status; ++ ++ dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n", ++ cb->cb_parsed, atomic_read(&cb->cb_set)); ++ if (!cb->cb_parsed || atomic_read(&cb->cb_set)) ++ return; ++ ++ /* Initialize address */ ++ memset(&addr, 0, sizeof(addr)); ++ addr.sin_family = AF_INET; ++ addr.sin_port = htons(cb->cb_port); ++ addr.sin_addr.s_addr = htonl(cb->cb_addr); ++ ++ /* Initialize timeout */ ++ timeparms.to_initval = (NFSD_LEASE_TIME/4) * HZ; ++ timeparms.to_retries = 5; ++ timeparms.to_maxval = (NFSD_LEASE_TIME/2) * HZ; ++ timeparms.to_exponential = 1; ++ ++ /* Create RPC transport */ ++ if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) { ++ dprintk("NFSD: couldn't create callback transport!\n"); ++ goto out_err; ++ } ++ ++ /* Initialize rpc_program */ ++ program->name = "nfs4_cb"; ++ program->number = cb->cb_prog; ++ program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]); ++ program->version = nfs_cb_version; ++ program->stats = stat; ++ ++ /* Initialize rpc_stat */ ++ memset(stat, 0, sizeof(struct rpc_stat)); ++ stat->program = program; ++ ++ /* Create RPC client ++ * ++ * XXX AUTH_UNIX only - need AUTH_GSS.... ++ */ ++ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr)); ++ if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) { ++ dprintk("NFSD: couldn't create callback client\n"); ++ goto out_xprt; ++ } ++ clnt->cl_intr = 1; ++ clnt->cl_softrtry = 1; ++ clnt->cl_chatty = 1; ++ ++ /* Kick rpciod, put the call on the wire. 
*/ ++ ++ if (rpciod_up() != 0) { ++ dprintk("nfsd: couldn't start rpciod for callbacks!\n"); ++ goto out_clnt; ++ } ++ ++ /* the task holds a reference to the nfs4_client struct */ ++ cb->cb_client = clnt; ++ atomic_inc(&clp->cl_count); ++ ++ msg.rpc_cred = nfsd4_lookupcred(clp,0); ++ status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL); ++ ++ if (status != 0) { ++ dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); ++ goto out_rpciod; ++ } ++ return; ++ ++out_rpciod: ++ atomic_dec(&clp->cl_count); ++ rpciod_down(); ++out_clnt: ++ rpc_shutdown_client(clnt); ++ goto out_err; ++out_xprt: ++ xprt_destroy(xprt); ++out_err: ++ dprintk("NFSD: warning: no callback path to client %.*s\n", ++ clp->cl_name.len, clp->cl_name.data); ++ cb->cb_client = NULL; ++} ++ ++static void ++nfs4_cb_null(struct rpc_task *task) ++{ ++ struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ u32 addr = htonl(cb->cb_addr); ++ ++ dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); ++ ++ if (task->tk_status < 0) { ++ dprintk("NFSD: callback establishment to client %.*s failed\n", ++ clp->cl_name.len, clp->cl_name.data); ++ goto out; ++ } ++ atomic_set(&cb->cb_set, 1); ++ dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr)); ++out: ++ put_nfs4_client(clp); ++} ++ ++/* ++ * Called with dp->dl_count incremented ++ */ ++static void ++nfs4_cb_recall_done(struct rpc_task *task) ++{ ++ struct nfs4_cb_recall *cbr = (struct nfs4_cb_recall *)task->tk_calldata; ++ struct nfs4_delegation *dp = cbr->cbr_dp; ++ int status; ++ ++ spin_lock(&recall_lock); ++ ++ /* all is well... */ ++ if (task->tk_status == 0) ++ goto out; ++ ++ /* network partition, retry nfsd4_cb_recall once. */ ++ if (task->tk_status == -EIO) { ++ if (atomic_read(&dp->dl_recall_cnt) == 0) ++ goto retry; ++ else ++ /* callback channel no longer available */ ++ atomic_set(&dp->dl_client->cl_callback.cb_set, 0); ++ } ++ ++ /* Race: a recall occurred miliseconds after a delegation was granted. ++ * Client may have received recall prior to delegation. retry recall ++ * once. ++ * XXX what about nfserr_bad_stateid? ++ */ ++ if (task->tk_status == -EBADHANDLE) { ++ if (atomic_read(&dp->dl_recall_cnt) == 0) ++ goto retry; ++ } ++ ++ /* nfs4_laundromat will reap delegation */ ++ atomic_set(&dp->dl_state, NFS4_RECALL_COMPLETE); ++ ++out: ++ atomic_dec(&dp->dl_count); ++ BUG_ON(atomic_read(&dp->dl_count) < 0); ++ spin_unlock(&recall_lock); ++ return; ++ ++retry: ++ atomic_inc(&dp->dl_recall_cnt); ++ spin_unlock(&recall_lock); ++ /* sleep 2 seconds before retrying recall */ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(2*HZ); ++ status = nfsd4_cb_recall(dp); ++ dprintk("NFSD: nfs4_cb_recall_done: retry status: %d dp %p dl_flock %p\n",status,dp, dp->dl_flock); ++} ++ ++/* ++ * called with dp->dl_count inc'ed. ++ * nfs4_lock_state() may or may not have been called. 
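++ * A typical sequence, as assumed here: nfsd4_probe_callback() has set
++ * up cb_client, a conflicting open bumps dp->dl_count and calls
++ * nfsd4_cb_recall(dp); on -EIO or -EBADHANDLE the completion handler
++ * above retries the recall once (tracked in dl_recall_cnt) before
++ * marking the delegation NFS4_RECALL_COMPLETE for the laundromat.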
++ */ ++int ++nfsd4_cb_recall(struct nfs4_delegation *dp) ++{ ++ struct nfs4_client *clp; ++ struct rpc_clnt *clnt; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], ++ }; ++ struct nfs4_cb_recall *cbr = &dp->dl_recall; ++ int status; ++ ++ dprintk("NFSD: nfsd4_cb_recall NFS4_enc_cb_recall_sz %d NFS4_dec_cb_recall_sz %d \n",NFS4_enc_cb_recall_sz,NFS4_dec_cb_recall_sz); ++ ++ clp = dp->dl_client; ++ clnt = clp->cl_callback.cb_client; ++ status = EIO; ++ if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) ++ goto out_free; ++ ++ msg.rpc_argp = cbr; ++ msg.rpc_resp = cbr; ++ msg.rpc_cred = nfsd4_lookupcred(clp,0); ++ ++ cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ ++ cbr->cbr_dp = dp; ++ ++ if ((status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ nfs4_cb_recall_done, cbr ))) { ++ dprintk("NFSD: recall_delegation: rpc_call_async failed %d\n", ++ status); ++ goto out_fail; ++ } ++out: ++ return status; ++out_fail: ++ status = nfserrno(status); ++ out_free: ++ kfree(cbr); ++ goto out; ++} +Index: linux-2.6.10/fs/nfsd/nfs4proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4proc.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4proc.c 2005-04-05 14:49:13.432687240 +0800 +@@ -461,28 +461,12 @@ + } + + static inline int +-access_bits_permit_read(unsigned long access_bmap) +-{ +- return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || +- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +-} +- +-static inline int +-access_bits_permit_write(unsigned long access_bmap) +-{ +- return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || +- test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); +-} +- +-static inline int + nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) + { +- struct nfs4_stateid *stp; + int status; ++ struct file *filp; + + /* no need to check permission - this will be done in nfsd_read() */ +- if (nfs4_in_grace()) +- return nfserr_grace; + + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; +@@ -508,21 +492,17 @@ + goto out; + } + /* check stateid */ +- if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, ++ CHECK_FH | RD_STATE, &filp))) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } +- status = nfserr_openmode; +- if (!access_bits_permit_read(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_read: file not opened for read!\n"); +- goto out; +- } + status = nfs_ok; + out: + nfs4_unlock_state(); + read->rd_rqstp = rqstp; + read->rd_fhp = current_fh; ++ read->rd_filp = filp; + return status; + } + +@@ -562,6 +542,8 @@ + { + int status; + ++ if (nfs4_in_grace()) ++ return nfserr_grace; + status = nfsd_unlink(rqstp, current_fh, 0, remove->rm_name, remove->rm_namelen); + if (status == nfserr_symlink) + return nfserr_notdir; +@@ -580,6 +562,9 @@ + + if (!save_fh->fh_dentry) + return status; ++ if (nfs4_in_grace() && !(save_fh->fh_export->ex_flags ++ & NFSEXP_NOSUBTREECHECK)) ++ return nfserr_grace; + status = nfsd_rename(rqstp, save_fh, rename->rn_sname, + rename->rn_snamelen, current_fh, + rename->rn_tname, rename->rn_tnamelen); +@@ -605,12 +590,8 @@ + static inline int + nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr) + { +- struct nfs4_stateid *stp; + int status = nfs_ok; + +- if (nfs4_in_grace()) +- 
return nfserr_grace; +- + if (!current_fh->fh_dentry) + return nfserr_nofilehandle; + +@@ -626,15 +607,10 @@ + nfs4_lock_state(); + if ((status = nfs4_preprocess_stateid_op(current_fh, + &setattr->sa_stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ CHECK_FH | WR_STATE, NULL))) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + goto out_unlock; + } +- status = nfserr_openmode; +- if (!access_bits_permit_write(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_setattr: not opened for write!\n"); +- goto out_unlock; +- } + nfs4_unlock_state(); + } + status = nfs_ok; +@@ -654,14 +630,11 @@ + static inline int + nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write) + { +- struct nfs4_stateid *stp; + stateid_t *stateid = &write->wr_stateid; ++ struct file *filp; + u32 *p; + int status = nfs_ok; + +- if (nfs4_in_grace()) +- return nfserr_grace; +- + /* no need to check permission - this will be done in nfsd_write() */ + + if (write->wr_offset >= OFFSET_MAX) +@@ -677,18 +650,13 @@ + goto zero_stateid; + } + if ((status = nfs4_preprocess_stateid_op(current_fh, stateid, +- CHECK_FH | RDWR_STATE, &stp))) { ++ CHECK_FH | WR_STATE, &filp))) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + goto out; + } + +- status = nfserr_openmode; +- if (!access_bits_permit_write(stp->st_access_bmap)) { +- dprintk("NFSD: nfsd4_write: file not open for write!\n"); +- goto out; +- } +- + zero_stateid: ++ + nfs4_unlock_state(); + write->wr_bytes_written = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +@@ -696,9 +664,16 @@ + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; + +- status = nfsd_write(rqstp, current_fh, write->wr_offset, +- write->wr_vec, write->wr_vlen, write->wr_buflen, +- &write->wr_how_written); ++ if (filp) ++ status = nfsd_vfs_write(rqstp, current_fh, filp, ++ write->wr_offset, write->wr_vec, ++ write->wr_vlen, write->wr_buflen, ++ &write->wr_how_written); ++ else ++ status = nfsd_write(rqstp, current_fh, write->wr_offset, ++ write->wr_vec, write->wr_vlen, write->wr_buflen, ++ &write->wr_how_written); ++ + if (status == nfserr_symlink) + status = nfserr_inval; + return status; +@@ -872,6 +847,9 @@ + case OP_CREATE: + op->status = nfsd4_create(rqstp, current_fh, &op->u.create); + break; ++ case OP_DELEGRETURN: ++ op->status = nfsd4_delegreturn(rqstp, current_fh, &op->u.delegreturn); ++ break; + case OP_GETATTR: + op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr); + break; +Index: linux-2.6.10/fs/nfsd/export.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/export.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/export.c 2005-04-05 14:49:13.415689824 +0800 +@@ -255,7 +255,7 @@ + new->ek_export = item->ek_export; + } + +-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ ++static DefineSimpleCacheLookup(svc_expkey) + + #define EXPORT_HASHBITS 8 + #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) +@@ -492,8 +492,72 @@ + new->ex_fsid = item->ex_fsid; + } + +-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */ ++struct svc_export * ++svc_export_lookup(struct svc_export *item, int set) ++{ ++ struct svc_export *tmp, *new = NULL; ++ struct cache_head **hp, **head; + ++ head = &svc_export_cache.hash_table[svc_export_hash(item)]; ++retry: ++ if (set||new) ++ write_lock(&svc_export_cache.hash_lock); ++ else ++ read_lock(&svc_export_cache.hash_lock); ++ for(hp=head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = 
container_of(*hp, struct svc_export, h); ++ if (svc_export_match(item, tmp)) { /* found a match */ ++ cache_get(&tmp->h); ++ if (set) { ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else { ++ clear_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (set) ++ cache_fresh(&svc_export_cache, &tmp->h, ++ item->h.expiry_time); ++ if (new) ++ svc_export_put(&new->h, &svc_export_cache); ++ return tmp; ++ } ++ } ++ /* Didn't find anything */ ++ if (new) { ++ svc_export_init(new, item); ++ new->h.next = *head; ++ *head = &new->h; ++ set_bit(CACHE_HASHED, &new->h.flags); ++ svc_export_cache.entries++; ++ if (set) { ++ tmp = new; ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (new && set) ++ cache_fresh(&svc_export_cache, &new->h, item->h.expiry_time); ++ if (new) ++ return new; ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new) { ++ cache_init(&new->h); ++ goto retry; ++ } ++ return NULL; ++} + + struct svc_expkey * + exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) +Index: linux-2.6.10/fs/nfsd/nfssvc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfssvc.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/nfssvc.c 2005-04-05 14:49:13.422688760 +0800 +@@ -378,4 +378,6 @@ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ ++ .pg_authenticate = &svc_set_client, /* export authentication */ ++ + }; +Index: linux-2.6.10/fs/nfsd/nfs4recover.c +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/nfs4recover.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/fs/nfsd/nfs4recover.c 2005-04-05 14:49:13.430687544 +0800 +@@ -0,0 +1,411 @@ ++/* ++* linux/fs/nfsd/nfs4recover.c ++* ++* Copyright (c) 2004 The Regents of the University of Michigan. ++* All rights reserved. ++* ++* Andy Adamson ++* ++* Redistribution and use in source and binary forms, with or without ++* modification, are permitted provided that the following conditions ++* are met: ++* ++* 1. Redistributions of source code must retain the above copyright ++* notice, this list of conditions and the following disclaimer. ++* 2. Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. ++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++
++
++#include <linux/sunrpc/svc.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfs4.h>
++#include <linux/nfsd/state.h>
++#include <linux/nfsd/xdr4.h>
++#include <linux/param.h>
++#include <linux/file.h>
++#include <linux/namei.h>
++#include <asm/uaccess.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++/* MAX_FILE_LEN/2 = max client id name length due to changing name
++ * into hex
++ */
++#define MAX_FILE_LEN 256
++
++/* Globals */
++char recovery_dirname[] = "/var/lib/nfs/v4recovery";
++static uid_t saveuid;
++static gid_t savegid;
++static struct nameidata nd_rec_init;
++static int rec_dir_init = 0;
++
++void
++nfs4_save_set_user(void)
++{
++ saveuid = current->fsuid;
++ savegid = current->fsgid;
++ current->fsuid = 0;
++ current->fsgid = 0;
++}
++
++void
++nfs4_reset_user(void)
++{
++ current->fsuid = saveuid;
++ current->fsgid = savegid;
++}
++
++void
++nfs4_make_rec_filename(char **filename, struct nfs4_client *clp)
++{
++ char *fname = *filename;
++ int flen = MAX_FILE_LEN;
++
++ memset(fname, 0, flen);
++ qword_addhex(&fname, &flen, clp->cl_name.data, clp->cl_name.len);
++}
++
++/* XXX need to check dput() mntput ?? */
++int
++nfsd4_create_clid_file(struct nfs4_client *clp)
++{
++ struct file *filp = NULL;
++ struct dentry *dentry;
++ mm_segment_t oldfs;
++ loff_t offset = 0;
++ char fbuf[MAX_FILE_LEN], *fname = fbuf;
++ int status;
++
++
++ if (!rec_dir_init)
++ return -EINVAL;
++ nfs4_save_set_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file IN recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++ atomic_read(&nd_rec_init.mnt->mnt_count));
++
++ /* lock the parent */
++ down(&nd_rec_init.dentry->d_inode->i_sem);
++
++ nfs4_make_rec_filename(&fname, clp);
++ /* dentry->d_count will be 1 */
++ dentry = lookup_one_len(fname, nd_rec_init.dentry, strlen(fname));
++ status = PTR_ERR(dentry);
++ if (IS_ERR(dentry))
++ goto out_unlock;
++
++ status = -EEXIST;
++ if (dentry->d_inode){
++ dprintk("NFSD: nfsd4_create_clid_file: FILE EXISTS\n");
++ goto out_unlock;
++ }
++
++ /* nd_rec_init.dentry->d_count is bumped */
++ status = vfs_create(nd_rec_init.dentry->d_inode, dentry, S_IRWXU, NULL);
++ if (status < 0)
++ goto out_unlock;
++
++ up(&nd_rec_init.dentry->d_inode->i_sem);
++
++ filp = dentry_open(dget(dentry), mntget(nd_rec_init.mnt), O_RDWR);
++ status = PTR_ERR(filp);
++ if (IS_ERR(filp))
++ goto out_mnt;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ status = vfs_write(filp, clp->cl_name.data, clp->cl_name.len, &offset);
++ set_fs(oldfs);
++
++ dprintk("NFSD: nfsd4_create_clid_file vfs_write returns %d\n",status);
++ if (status >= 0)
++ status = nfs_ok;
++
++ if (filp->f_op && filp->f_op->flush) {
++ int err = filp->f_op->flush(filp);
++ dprintk("NFSD: nfsd4_create_clid_file called flush\n");
++ if (!status)
++ status = err;
++ }
++ /* dget and mntget in dentry_open call */
++ fput(filp);
++
++ /* dentry->d_count will be 0 */
++ dput(dentry);
++out_mnt:
++ /* dget in vfs_create call */
++ dput(nd_rec_init.dentry);
++
++out:
++ nfs4_reset_user();
++
++ dprintk("NFSD: nfsd4_create_clid_file OUT recdir [d:mnt] count %d:%d\n",
++ atomic_read(&nd_rec_init.dentry->d_count),
++
atomic_read(&nd_rec_init.mnt->mnt_count)); ++ dprintk("NFSD: nfsd4_create_clid_file returns %d\n",status); ++ ++ return status; ++ ++out_unlock: ++ up(&nd_rec_init.dentry->d_inode->i_sem); ++ goto out; ++} ++ ++/* ++ * called with pdentry->d_inode->i_sem held ? ++ */ ++int ++nfsd4_unlink_rec_file(char *name, int namlen) ++{ ++ struct dentry *dentry; ++ int type, status; ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file. name %.*s\n", namlen, name); ++ ++ dentry = lookup_one_len(name, nd_rec_init.dentry, namlen); ++ dprintk("NFSD: nfsd4_unlink_rec_file POST LOOKUP nd_rec d_count %d\n", ++ atomic_read(&nd_rec_init.dentry->d_count)); ++ status = PTR_ERR(dentry); ++ if (IS_ERR(dentry)) ++ goto out; ++ ++ status = -ENOENT; ++ if (!dentry->d_inode) { ++ dput(dentry); ++ goto out; ++ } ++ ++ /* should only be files here! */ ++ type = dentry->d_inode->i_mode & S_IFMT; ++ status = -EISDIR; ++ if (!(type & S_IFREG)) { ++ dput(dentry); ++ goto out; ++ } ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file PRE VFS UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dentry); ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file POST VFS UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_unlink_rec_file FILE dentry->d_count %d\n", ++ atomic_read(&dentry->d_count)); ++out: ++ dprintk("NFSD: nfsd4_unlink_rec_file returns %d\n",status); ++ return status; ++} ++ ++void ++nfsd4_remove_clid_file(struct nfs4_client *clp) ++{ ++ char fbuf[MAX_FILE_LEN], *fname = fbuf; ++ int status; ++ ++ if (!rec_dir_init) ++ return; ++ ++ dprintk("NFSD: nfsd4_remove_clid_file client %.*s\n", ++ clp->cl_name.len,clp->cl_name.data); ++ ++ nfs4_save_set_user(); ++ ++ dprintk("NFSD: nfsd4_remove_clid_file IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ nfs4_make_rec_filename(&fname, clp); ++ status = nfsd4_unlink_rec_file(fname, strlen(fname)); ++ nfs4_reset_user(); ++ if (status != nfs_ok) ++ printk("NFSD: Failed to remove expired client state file %.*s from %s\n", strlen(fname), fname, recovery_dirname); ++ ++ dprintk("NFSD: nfsd4_remove_clid_file OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ return; ++} ++ ++struct rec_dirent { ++ int clear; ++}; ++ ++/* ++ * on reboot, stuff the reclaim hash with known client id's. ++ * ++ * the filename may not equal the clid. the clid might be the first ++ * (and so far only) line of data in the file. ++ * ++ * i will probably end up writing data such as the setclientid principal ++ * to each clid file. if i do, i will always put the clid as the ++ * first line of data. 
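++ * for example, a client whose id is the 7-byte string "client1" is
++ * tracked via a file whose name qword_addhex() builds from those
++ * bytes (hex 636c69656e7431), while nfsd4_create_clid_file() writes
++ * the raw id itself as the file's contents.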
++ */ ++ ++int ++nfsd4_get_recdir_dirent(struct rec_dirent *rdirent, const char *name, ++ int namlen, loff_t offset, ino_t ino, unsigned int d_type) ++{ ++ struct dentry *dclid; ++ struct file *filp; ++ mm_segment_t oldfs; ++ int status = nfs_ok; ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent name %.*s, clear %d\n", ++ namlen, name, rdirent->clear); ++ ++ if (name && isdotent(name, namlen)) ++ goto out; ++ ++ dclid = lookup_one_len(name, nd_rec_init.dentry, namlen); ++ status = PTR_ERR(dclid); ++ if(IS_ERR(dclid)) ++ goto out; ++ ++ if (rdirent->clear){ ++ dprintk("NFSD: nfsd4_get_recdir_dirent REMOVE\n"); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent PRE VFS_UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ status = vfs_unlink(nd_rec_init.dentry->d_inode, dclid); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent POST VFS_UNLINK [%d:%d]\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ } else { ++ char buf[MAX_FILE_LEN]; ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent READ\n"); ++ ++ filp = dentry_open(dclid, mntget(nd_rec_init.mnt), O_RDWR); ++ if (IS_ERR(filp)) { ++ status = PTR_ERR(filp); ++ goto out; ++ } ++ ++ memset(buf, 0, MAX_FILE_LEN); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ status = vfs_read(filp, buf, MAX_FILE_LEN, &filp->f_pos); ++ set_fs(oldfs); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent vfs_read returns %d\n", ++ status); ++ if (status > 0) ++ status = nfs4_client_to_reclaim(buf, status); ++ fput(filp); ++ } ++out: ++ dprintk("NFSD:nfsd4_get_recdir_dirent OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_get_recdir_dirent returns %d\n",status); ++ return 0; ++} ++ ++int ++nfsd4_list_rec_dir(int clear) ++{ ++ struct file *filp; ++ struct rec_dirent rdirent; ++ int status; ++ ++ if (!rec_dir_init) ++ return -EINVAL; ++ ++ nfs4_save_set_user(); ++ ++ dprintk("NFSD: nfsd4_list_rec_dir IN recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ /* open directory */ ++ filp = dentry_open(dget(nd_rec_init.dentry), mntget(nd_rec_init.mnt), ++ O_RDWR); ++ status = PTR_ERR(filp); ++ if (IS_ERR(filp)) ++ goto out; ++ rdirent.clear = clear; ++ ++ /* read the directory entries into memory */ ++ status = vfs_readdir(filp, (filldir_t) nfsd4_get_recdir_dirent, ++ (void*)&rdirent); ++ ++ fput(filp); ++out: ++ dprintk("NFSD: nfsd4_list_rec_dir OUT recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ dprintk("NFSD: nfsd4_list_rec_dir DONE status: %d\n", status); ++ ++ nfs4_reset_user(); ++ return status; ++} ++ ++ ++/* ++ * Hold reference to the recovery directory. 
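++ * path_lookup() below pins nd_rec_init.dentry and nd_rec_init.mnt for
++ * the life of the server instance; nfsd4_shutdown_rec_dir() drops both
++ * references again via path_release().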
++ */ ++ ++void ++nfsd4_init_rec_dir(char *rec_dirname) ++{ ++ int status; ++ ++ printk("NFSD: Using %s as the NFSv4 state recovery directory\n", ++ rec_dirname); ++ ++ nfs4_save_set_user(); ++ ++ status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &nd_rec_init); ++ ++ printk("NFSD: nfsd4_init_rec_dir INITIAL recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++ ++ if (!status) ++ rec_dir_init = 1; ++ nfs4_reset_user(); ++ printk("NFSD: nfsd4_init_rec_dir rec_dir_init %d\n", rec_dir_init); ++} ++ ++void ++nfsd4_shutdown_rec_dir(void) ++{ ++ rec_dir_init = 0; ++ path_release(&nd_rec_init); ++ ++ printk("NFSD: nfsd4_shutdown_rec_dir FINAL recdir [d:mnt] count %d:%d\n", ++ atomic_read(&nd_rec_init.dentry->d_count), ++ atomic_read(&nd_rec_init.mnt->mnt_count)); ++} +Index: linux-2.6.10/fs/nfsd/Makefile +=================================================================== +--- linux-2.6.10.orig/fs/nfsd/Makefile 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/fs/nfsd/Makefile 2005-04-05 14:49:13.431687392 +0800 +@@ -8,5 +8,5 @@ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ +- nfs4acl.o ++ nfs4acl.o nfs4callback.o nfs4recover.o + nfsd-objs := $(nfsd-y) +Index: linux-2.6.10/fs/nfs/nfs4xdr.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4xdr.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4xdr.c 2005-04-05 14:49:13.452684200 +0800 +@@ -82,12 +82,16 @@ + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define encode_getattr_maxsz (op_encode_hdr_maxsz + 3) ++#define nfs4_fattr_bitmap_maxsz 3 ++#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +-#define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz) +-#define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \ +- nfs4_fattr_bitmap_maxsz) ++/* This is based on getfattr, which uses the most attributes: */ ++#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ ++ 3 + 3 + 3 + 2 * nfs4_name_maxsz)) ++#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ ++ nfs4_fattr_value_maxsz) ++#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) + #define encode_savefh_maxsz (op_encode_hdr_maxsz) + #define decode_savefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +@@ -122,11 +126,11 @@ + #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_name_maxsz + \ + nfs4_path_maxsz + \ +- nfs4_fattr_bitmap_maxsz) ++ nfs4_fattr_maxsz) + #define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) + #define encode_create_maxsz (op_encode_hdr_maxsz + \ + 2 + nfs4_name_maxsz + \ +- nfs4_fattr_bitmap_maxsz) ++ nfs4_fattr_maxsz) + #define decode_create_maxsz (op_decode_hdr_maxsz + 8) + #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) + #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) +@@ -205,7 +209,7 @@ + #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + op_encode_hdr_maxsz + 4 + \ +- nfs4_fattr_bitmap_maxsz + \ ++ nfs4_fattr_maxsz + \ + encode_getattr_maxsz) + #define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + 
\ +@@ -360,6 +364,20 @@ + encode_delegreturn_maxsz) + #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_delegreturn_maxsz) ++#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 1) ++#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ op_encode_hdr_maxsz + 4 + \ ++ nfs4_fattr_bitmap_maxsz + 1) ++#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + + static struct { + unsigned int mode; +@@ -459,7 +477,7 @@ + * In the worst-case, this would be + * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 36 bytes, plus any contribution from variable-length fields +- * such as owner/group/acl's. ++ * such as owner/group. + */ + len = 16; + +@@ -1083,6 +1101,27 @@ + return 0; + } + ++extern nfs4_stateid zero_stateid; ++ ++static int ++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(4+sizeof(zero_stateid.data)); ++ WRITE32(OP_SETATTR); ++ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); ++ RESERVE_SPACE(2*4); ++ WRITE32(1); ++ WRITE32(FATTR4_WORD0_ACL); ++ if (arg->acl_len % 4) ++ return -EINVAL; ++ RESERVE_SPACE(4); ++ WRITE32(arg->acl_len); ++ xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); ++ return 0; ++} ++ + static int + encode_savefh(struct xdr_stream *xdr) + { +@@ -1627,6 +1666,34 @@ + } + + /* ++ * Encode a GETACL request ++ */ ++static int ++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_getaclargs *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_auth; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int replen, status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, args->fh); ++ if (status) ++ goto out; ++ status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); ++ /* set up reply buffer: */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, ++ args->acl_pages, args->acl_pgbase, args->acl_len); ++out: ++ return status; ++} ++ ++/* + * Encode a WRITE request + */ + static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) +@@ -3122,6 +3189,46 @@ + return decode_op_hdr(xdr, OP_RENEW); + } + ++static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, ++ ssize_t *acl_len) ++{ ++ uint32_t *savep; ++ uint32_t attrlen, ++ bitmap[2] = {0}; ++ struct kvec *iov = req->rq_rcv_buf.head; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto out; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto out; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto out; ++ ++ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) ++ return -EIO; ++ if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { ++ int hdrlen, recvd; ++ ++ /* We ignore &savep and don't do consistency checks on ++ * the attr length. Let userspace figure it out.... 
*/ ++ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; ++ recvd = req->rq_rcv_buf.len - hdrlen; ++ if (attrlen > recvd) { ++ printk(KERN_WARNING "NFS: server cheating in getattr" ++ " acl reply: attrlen %u > recvd %u\n", ++ attrlen, recvd); ++ return -EINVAL; ++ } ++ if (attrlen <= *acl_len) ++ xdr_read_pages(xdr, attrlen); ++ *acl_len = attrlen; ++ } ++ ++out: ++ return status; ++} ++ + static int + decode_savefh(struct xdr_stream *xdr) + { +@@ -3413,6 +3520,71 @@ + + } + ++/* ++ * Encode an SETACL request ++ */ ++static int ++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, args->fh); ++ if (status) ++ goto out; ++ status = encode_setacl(&xdr, args); ++out: ++ return status; ++} ++/* ++ * Decode SETACL response ++ */ ++static int ++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_setattr(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETACL response ++ */ ++static int ++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, ssize_t *acl_len) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_getacl(&xdr, rqstp, acl_len); ++ ++out: ++ return status; ++} + + /* + * Decode CLOSE response +@@ -4009,6 +4181,8 @@ + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), ++ PROC(GETACL, enc_getacl, dec_getacl), ++ PROC(SETACL, enc_setacl, dec_setacl), + }; + + struct rpc_version nfs_version4 = { +Index: linux-2.6.10/fs/nfs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/inode.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/nfs/inode.c 2005-04-05 14:49:13.445685264 +0800 +@@ -486,13 +486,27 @@ + if (error < 0) + goto out_err; + +- buf->f_frsize = server->wtmult; ++ /* ++ * Current versions of glibc do not correctly handle the ++ * case where f_frsize != f_bsize. Eventually we want to ++ * report the value of wtmult in this field. ++ */ ++ buf->f_frsize = sb->s_blocksize; ++ ++ /* ++ * On most *nix systems, f_blocks, f_bfree, and f_bavail ++ * are reported in units of f_frsize. Linux hasn't had ++ * an f_frsize field in its statfs struct until recently, ++ * thus historically Linux's sys_statfs reports these ++ * fields in units of f_bsize. 
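++ * As a worked example: with s_blocksize = 4096 and res.tbytes = 1 GiB,
++ * the arithmetic below yields f_blocks = (2^30 + 4095) >> 12 = 262144
++ * units of f_frsize (= f_bsize = 4096) each.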
++ */ + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; ++ + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + +@@ -565,9 +579,9 @@ + + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) +- nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; ++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + else +- nfsi->flags |= NFS_INO_INVALID_ATTR; ++ nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + } + + /* +@@ -605,7 +619,7 @@ + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; +- if (is_bad_inode(inode)) ++ if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; + } +@@ -664,7 +678,7 @@ + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ +- inode->i_op = &nfs_file_inode_operations; ++ inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; +@@ -766,13 +780,8 @@ + vmtruncate(inode, attr->ia_size); + } + } +- if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { +- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; +- if (*cred) { +- put_rpccred(*cred); +- *cred = NULL; +- } +- } ++ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) ++ NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS; + nfs_end_data_update(inode); + unlock_kernel(); + return error; +@@ -949,14 +958,14 @@ + lock_kernel(); + if (!inode || is_bad_inode(inode)) + goto out_nowait; +- if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) ++ if (NFS_STALE(inode)) + goto out_nowait; + + while (NFS_REVALIDATING(inode)) { + status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING); + if (status < 0) + goto out_nowait; +- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOAC) ++ if (NFS_ATTRTIMEO(inode) == 0) + continue; + if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) + continue; +@@ -968,14 +977,14 @@ + /* Protect against RPC races by saving the change attribute */ + verifier = nfs_save_change_attribute(inode); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); +- if (status) { ++ if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { +- NFS_FLAGS(inode) |= NFS_INO_STALE; +- if (inode != inode->i_sb->s_root->d_inode) +- remove_inode_hash(inode); ++ nfs_zap_caches(inode); ++ if (!S_ISDIR(inode->i_mode)) ++ NFS_FLAGS(inode) |= NFS_INO_STALE; + } + goto out; + } +@@ -1014,7 +1023,6 @@ + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + +- NFS_FLAGS(inode) &= ~NFS_INO_STALE; + out: + NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; + wake_up(&nfsi->nfs_i_wait); +@@ -1161,7 +1169,7 @@ + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) + || inode->i_uid != fattr->uid + || inode->i_gid != fattr->gid) +- nfsi->flags |= NFS_INO_INVALID_ATTR; ++ nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + + /* Has the link count changed? 
*/ + if (inode->i_nlink != fattr->nlink) +@@ -1270,7 +1278,7 @@ + #endif + nfsi->change_attr = fattr->change_attr; + if (!data_unstable) +- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; ++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + } + + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); +@@ -1278,14 +1286,8 @@ + + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || + inode->i_uid != fattr->uid || +- inode->i_gid != fattr->gid) { +- struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; +- if (*cred) { +- put_rpccred(*cred); +- *cred = NULL; +- } +- invalid |= NFS_INO_INVALID_ATTR; +- } ++ inode->i_gid != fattr->gid) ++ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + + inode->i_mode = fattr->mode; + inode->i_nlink = fattr->nlink; +@@ -1335,7 +1337,8 @@ + */ + nfs_invalidate_inode(inode); + out_err: +- return -EIO; ++ NFS_FLAGS(inode) |= NFS_INO_STALE; ++ return -ESTALE; + } + + /* +@@ -1449,8 +1452,6 @@ + + kill_anon_super(s); + +- nfs4_renewd_prepare_shutdown(server); +- + if (server->client != NULL && !IS_ERR(server->client)) + rpc_shutdown_client(server->client); + if (server->client_sys != NULL && !IS_ERR(server->client_sys)) +@@ -1461,8 +1462,6 @@ + + rpciod_down(); /* release rpciod */ + +- destroy_nfsv4_state(server); +- + if (server->hostname != NULL) + kfree(server->hostname); + kfree(server); +@@ -1478,8 +1477,53 @@ + + #ifdef CONFIG_NFS_V4 + +-static void nfs4_clear_inode(struct inode *); ++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" ++ ++int ++nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, ++ size_t buflen, int flags) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EINVAL; ++ ++ if (!S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++ ++ return nfs4_proc_set_acl(inode, buf, buflen); ++} ++ ++/* The getxattr man page suggests returning -ENODATA for unknown attributes, ++ * and that's what we'll do for e.g. user attributes that haven't been set. ++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported ++ * attributes in kernel-managed attribute namespaces. 
*/ ++ssize_t ++nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, ++ size_t buflen) ++{ ++ struct inode *inode = dentry->d_inode; + ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EOPNOTSUPP; ++ ++ return nfs4_proc_get_acl(inode, buf, buflen); ++} ++ ++ssize_t ++nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) ++{ ++ ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; ++ ++ if (buf && buflen < len) ++ return -ERANGE; ++ if (buf) ++ memcpy(buf, XATTR_NAME_NFSV4_ACL, len); ++ return len; ++} ++ ++static void nfs4_clear_inode(struct inode *); + + static struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, +@@ -1543,9 +1587,6 @@ + server->wsize = nfs_block_size(data->wsize, NULL); + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + +- /* NFSv4 doesn't use NLM locking */ +- server->flags |= NFS_MOUNT_NONLM; +- + server->acregmin = data->acregmin*HZ; + server->acregmax = data->acregmax*HZ; + server->acdirmin = data->acdirmin*HZ; +@@ -1790,8 +1831,22 @@ + + static void nfs4_kill_super(struct super_block *sb) + { ++ struct nfs_server *server = NFS_SB(sb); ++ + nfs_return_all_delegations(sb); +- nfs_kill_super(sb); ++ kill_anon_super(sb); ++ ++ nfs4_renewd_prepare_shutdown(server); ++ ++ if (server->client != NULL && !IS_ERR(server->client)) ++ rpc_shutdown_client(server->client); ++ rpciod_down(); /* release rpciod */ ++ ++ destroy_nfsv4_state(server); ++ ++ if (server->hostname != NULL) ++ kfree(server->hostname); ++ kfree(server); + } + + static struct file_system_type nfs4_fs_type = { +@@ -1821,9 +1876,13 @@ + extern int nfs_init_nfspagecache(void); + extern void nfs_destroy_nfspagecache(void); + extern int nfs_init_readpagecache(void); +-extern int nfs_destroy_readpagecache(void); ++extern void nfs_destroy_readpagecache(void); + extern int nfs_init_writepagecache(void); +-extern int nfs_destroy_writepagecache(void); ++extern void nfs_destroy_writepagecache(void); ++#ifdef CONFIG_NFS_DIRECTIO ++extern int nfs_init_directcache(void); ++extern void nfs_destroy_directcache(void); ++#endif + + static kmem_cache_t * nfs_inode_cachep; + +@@ -1904,6 +1963,12 @@ + if (err) + goto out1; + ++#ifdef CONFIG_NFS_DIRECTIO ++ err = nfs_init_directcache(); ++ if (err) ++ goto out0; ++#endif ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1914,8 +1979,14 @@ + goto out; + return 0; + out: ++#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); ++#endif + nfs_destroy_writepagecache(); ++#ifdef CONFIG_NFS_DIRECTIO ++out0: ++ nfs_destroy_directcache(); ++#endif + out1: + nfs_destroy_readpagecache(); + out2: +@@ -1928,6 +1999,9 @@ + + static void __exit exit_nfs_fs(void) + { ++#ifdef CONFIG_NFS_DIRECTIO ++ nfs_destroy_directcache(); ++#endif + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); +Index: linux-2.6.10/fs/nfs/nfs4state.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4state.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4state.c 2005-04-05 14:49:13.446685112 +0800 +@@ -445,7 +445,7 @@ + state->owner = owner; + atomic_inc(&owner->so_count); + list_add(&state->inode_states, &nfsi->open_states); +- state->inode = inode; ++ state->inode = igrab(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); +@@ -471,6 +471,7 @@ + list_del(&state->inode_states); + spin_unlock(&inode->i_lock); + list_del(&state->open_states); ++ iput(inode); + BUG_ON (state->state != 0); + nfs4_free_open_state(state); + 
nfs4_put_state_owner(owner); +@@ -486,7 +487,6 @@ + struct nfs4_state_owner *owner = state->owner; + struct nfs4_client *clp = owner->so_client; + int newstate; +- int status = 0; + + atomic_inc(&owner->so_count); + down_read(&clp->cl_sem); +@@ -508,10 +508,8 @@ + newstate |= FMODE_WRITE; + if (state->state == newstate) + goto out; +- if (newstate != 0) +- status = nfs4_do_downgrade(inode, state, newstate); +- else +- status = nfs4_do_close(inode, state); ++ if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS) ++ return; + } + out: + nfs4_put_open_state(state); +Index: linux-2.6.10/fs/nfs/idmap.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/idmap.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/fs/nfs/idmap.c 2005-04-05 14:49:13.454683896 +0800 +@@ -80,6 +80,7 @@ + static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); + void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); ++static void idmap_pipe_release(struct inode *inode); + + static unsigned int fnvhash32(const void *, size_t); + +@@ -87,6 +88,7 @@ + .upcall = idmap_pipe_upcall, + .downcall = idmap_pipe_downcall, + .destroy_msg = idmap_pipe_destroy_msg, ++ .release_pipe = idmap_pipe_release, + }; + + void +@@ -448,6 +450,19 @@ + up(&idmap->idmap_im_lock); + } + ++static void ++idmap_pipe_release(struct inode *inode) ++{ ++ struct rpc_inode *rpci = RPC_I(inode); ++ struct idmap *idmap = (struct idmap *)rpci->private; ++ struct idmap_msg *im = &idmap->idmap_im; ++ ++ down(&idmap->idmap_im_lock); ++ im->im_status = IDMAP_STATUS_LOOKUPFAIL; ++ wake_up(&idmap->idmap_wq); ++ up(&idmap->idmap_im_lock); ++} ++ + /* + * Fowler/Noll/Vo hash + * http://www.isthe.com/chongo/tech/comp/fnv/ +Index: linux-2.6.10/fs/nfs/dir.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/dir.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/nfs/dir.c 2005-04-05 14:49:13.439686176 +0800 +@@ -40,8 +40,6 @@ + static int nfs_opendir(struct inode *, struct file *); + static int nfs_readdir(struct file *, void *, filldir_t); + static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +-static int nfs_cached_lookup(struct inode *, struct dentry *, +- struct nfs_fh *, struct nfs_fattr *); + static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); + static int nfs_mkdir(struct inode *, struct dentry *, int); + static int nfs_rmdir(struct inode *, struct dentry *); +@@ -92,6 +90,9 @@ + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, ++ .getxattr = nfs4_getxattr, ++ .setxattr = nfs4_setxattr, ++ .listxattr = nfs4_listxattr, + }; + + #endif /* CONFIG_NFS_V4 */ +@@ -294,24 +295,13 @@ + return res; + } + +-static unsigned int nfs_type2dtype[] = { +- DT_UNKNOWN, +- DT_REG, +- DT_DIR, +- DT_BLK, +- DT_CHR, +- DT_LNK, +- DT_SOCK, +- DT_UNKNOWN, +- DT_FIFO +-}; +- +-static inline +-unsigned int nfs_type_to_d_type(enum nfs_ftype type) ++static inline unsigned int dt_type(struct inode *inode) + { +- return nfs_type2dtype[type]; ++ return (inode->i_mode >> 12) & 15; + } + ++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); ++ + /* + * Once we've found the start of the dirent within a page: fill 'er up... 
+ */ +@@ -321,6 +311,7 @@ + { + struct file *file = desc->file; + struct nfs_entry *entry = desc->entry; ++ struct dentry *dentry = NULL; + unsigned long fileid; + int loop_count = 0, + res; +@@ -333,9 +324,16 @@ + * retrieving the current dirent on the server */ + fileid = nfs_fileid_to_ino_t(entry->ino); + ++ /* Get a dentry if we have one */ ++ if (dentry != NULL) ++ dput(dentry); ++ dentry = nfs_readdir_lookup(desc); ++ + /* Use readdirplus info */ +- if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) +- d_type = nfs_type_to_d_type(entry->fattr->type); ++ if (dentry != NULL && dentry->d_inode != NULL) { ++ d_type = dt_type(dentry->d_inode); ++ fileid = dentry->d_inode->i_ino; ++ } + + res = filldir(dirent, entry->name, entry->len, + entry->prev_cookie, fileid, d_type); +@@ -352,7 +350,8 @@ + } + } + dir_page_release(desc); +- ++ if (dentry != NULL) ++ dput(dentry); + dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res); + return res; + } +@@ -615,24 +614,10 @@ + goto out_valid; + } + +- /* +- * Note: we're not holding inode->i_sem and so may be racing with +- * operations that change the directory. We therefore save the +- * change attribute *before* we do the RPC call. +- */ +- verifier = nfs_save_change_attribute(dir); +- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); +- if (!error) { +- if (nfs_compare_fh(NFS_FH(inode), &fhandle)) +- goto out_bad; +- if (nfs_lookup_verify_inode(inode, isopen)) +- goto out_zap_parent; +- goto out_valid_renew; +- } +- + if (NFS_STALE(inode)) + goto out_bad; + ++ verifier = nfs_save_change_attribute(dir); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (error) + goto out_bad; +@@ -641,7 +626,6 @@ + if ((error = nfs_refresh_inode(inode, &fattr)) != 0) + goto out_bad; + +- out_valid_renew: + nfs_renew_times(dentry); + nfs_set_verifier(dentry, verifier); + out_valid: +@@ -723,6 +707,7 @@ + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) + { ++ struct dentry *res; + struct inode *inode = NULL; + int error; + struct nfs_fh fhandle; +@@ -731,11 +716,11 @@ + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + +- error = -ENAMETOOLONG; ++ res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + +- error = -ENOMEM; ++ res = ERR_PTR(-ENOMEM); + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + lock_kernel(); +@@ -746,29 +731,27 @@ + if (nfs_is_exclusive_create(dir, nd)) + goto no_entry; + +- error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); +- if (error != 0) { +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, +- &fhandle, &fattr); +- if (error == -ENOENT) +- goto no_entry; +- if (error != 0) +- goto out_unlock; ++ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); ++ if (error == -ENOENT) ++ goto no_entry; ++ if (error < 0) { ++ res = ERR_PTR(error); ++ goto out_unlock; + } +- error = -EACCES; ++ res = ERR_PTR(-EACCES); + inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); + if (!inode) + goto out_unlock; + no_entry: +- error = 0; +- d_add(dentry, inode); ++ res = d_add_unique(dentry, inode); ++ if (res != NULL) ++ dentry = res; + nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_unlock: + unlock_kernel(); + out: +- BUG_ON(error > 0); +- return ERR_PTR(error); ++ return res; + } + + #ifdef CONFIG_NFS_V4 +@@ -798,15 +781,15 @@ + + static struct dentry 
*nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) + { ++ struct dentry *res = NULL; + struct inode *inode = NULL; +- int error = 0; + + /* Check that we are indeed trying to open this file */ + if (!is_atomic_open(dir, nd)) + goto no_open; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { +- error = -ENAMETOOLONG; ++ res = ERR_PTR(-ENAMETOOLONG); + goto out; + } + dentry->d_op = NFS_PROTO(dir)->dentry_ops; +@@ -828,7 +811,7 @@ + inode = nfs4_atomic_open(dir, dentry, nd); + unlock_kernel(); + if (IS_ERR(inode)) { +- error = PTR_ERR(inode); ++ int error = PTR_ERR(inode); + switch (error) { + /* Make a negative dentry */ + case -ENOENT: +@@ -841,16 +824,18 @@ + /* case -EISDIR: */ + /* case -EINVAL: */ + default: ++ res = ERR_PTR(error); + goto out; + } + } + no_entry: +- d_add(dentry, inode); ++ res = d_add_unique(dentry, inode); ++ if (res != NULL) ++ dentry = res; + nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out: +- BUG_ON(error > 0); +- return ERR_PTR(error); ++ return res; + no_open: + return nfs_lookup(dir, dentry, nd); + } +@@ -906,83 +891,51 @@ + } + #endif /* CONFIG_NFSV4 */ + +-static inline +-int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) ++static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) + { ++ struct dentry *parent = desc->file->f_dentry; ++ struct inode *dir = parent->d_inode; + struct nfs_entry *entry = desc->entry; +- int status; +- +- while((status = dir_decode(desc)) == 0) { +- if (entry->len != dentry->d_name.len) +- continue; +- if (memcmp(entry->name, dentry->d_name.name, entry->len)) +- continue; +- if (!(entry->fattr->valid & NFS_ATTR_FATTR)) +- continue; +- break; +- } +- return status; +-} +- +-/* +- * Use the cached Readdirplus results in order to avoid a LOOKUP call +- * whenever we believe that the parent directory has not changed. +- * +- * We assume that any file creation/rename changes the directory mtime. +- * As this results in a page cache invalidation whenever it occurs, +- * we don't require any other tests for cache coherency. 
+- */ +-static +-int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, +- struct nfs_fh *fh, struct nfs_fattr *fattr) +-{ +- nfs_readdir_descriptor_t desc; +- struct nfs_server *server; +- struct nfs_entry entry; +- struct page *page; +- unsigned long timestamp; +- int res; +- +- if (!NFS_USE_READDIRPLUS(dir)) +- return -ENOENT; +- server = NFS_SERVER(dir); +- /* Don't use readdirplus unless the cache is stable */ +- if ((server->flags & NFS_MOUNT_NOAC) != 0 +- || nfs_caches_unstable(dir) +- || nfs_attribute_timeout(dir)) +- return -ENOENT; +- if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0) +- return -ENOENT; +- timestamp = NFS_I(dir)->readdir_timestamp; +- +- entry.fh = fh; +- entry.fattr = fattr; +- +- desc.decode = NFS_PROTO(dir)->decode_dirent; +- desc.entry = &entry; +- desc.page_index = 0; +- desc.plus = 1; +- +- for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) { +- +- res = -EIO; +- if (PageUptodate(page)) { +- void * kaddr = kmap_atomic(page, KM_USER0); +- desc.ptr = kaddr; +- res = find_dirent_name(&desc, page, dentry); +- kunmap_atomic(kaddr, KM_USER0); +- } +- page_cache_release(page); ++ struct dentry *dentry, *alias; ++ struct qstr name = { ++ .name = entry->name, ++ .len = entry->len, ++ }; ++ struct inode *inode; + +- if (res == 0) +- goto out_found; +- if (res != -EAGAIN) ++ switch (name.len) { ++ case 2: ++ if (name.name[0] == '.' && name.name[1] == '.') ++ return dget_parent(parent); + break; ++ case 1: ++ if (name.name[0] == '.') ++ return dget(parent); ++ } ++ name.hash = full_name_hash(name.name, name.len); ++ dentry = d_lookup(parent, &name); ++ if (dentry != NULL) ++ return dentry; ++ if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) ++ return NULL; ++ /* Note: caller is already holding the dir->i_sem! */ ++ dentry = d_alloc(parent, &name); ++ if (dentry == NULL) ++ return NULL; ++ dentry->d_op = NFS_PROTO(dir)->dentry_ops; ++ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); ++ if (!inode) { ++ dput(dentry); ++ return NULL; + } +- return -ENOENT; +- out_found: +- fattr->timestamp = timestamp; +- return 0; ++ alias = d_add_unique(dentry, inode); ++ if (alias != NULL) { ++ dput(dentry); ++ dentry = alias; ++ } ++ nfs_renew_times(dentry); ++ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ++ return dentry; + } + + /* +@@ -1045,15 +998,9 @@ + if (nd && (nd->flags & LOOKUP_CREATE)) + open_flags = nd->intent.open.flags; + +- /* +- * The 0 argument passed into the create function should one day +- * contain the O_EXCL flag if requested. This allows NFSv3 to +- * select the appropriate create strategy. Currently open_namei +- * does not pass the create flags. 
+- */ + lock_kernel(); + nfs_begin_data_update(dir); +- inode = NFS_PROTO(dir)->create(dir, &dentry->d_name, &attr, open_flags); ++ inode = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + nfs_end_data_update(dir); + if (!IS_ERR(inode)) { + d_instantiate(dentry, inode); +@@ -1508,7 +1455,7 @@ + + if (cache->cred != cred + || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) +- || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) ++ || (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS)) + return -ENOENT; + memcpy(res, cache, sizeof(*res)); + return 0; +@@ -1522,6 +1469,7 @@ + if (cache->cred) + put_rpccred(cache->cred); + cache->cred = get_rpccred(set->cred); ++ NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS; + } + cache->jiffies = set->jiffies; + cache->mask = set->mask; +Index: linux-2.6.10/fs/nfs/unlink.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/unlink.c 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/fs/nfs/unlink.c 2005-04-05 14:49:13.435686784 +0800 +@@ -215,7 +215,6 @@ + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); +- if (data->task.tk_rpcwait == &nfs_delete_queue) +- rpc_wake_up_task(&data->task); ++ rpc_wake_up_task(&data->task); + nfs_put_unlinkdata(data); + } +Index: linux-2.6.10/fs/nfs/write.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/write.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/write.c 2005-04-05 14:49:13.443685568 +0800 +@@ -61,7 +61,6 @@ + #include + #include + #include +-#include + + #include "delegation.h" + +@@ -83,49 +82,17 @@ + static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); + + static kmem_cache_t *nfs_wdata_cachep; +-static mempool_t *nfs_wdata_mempool; +-static mempool_t *nfs_commit_mempool; ++mempool_t *nfs_wdata_mempool; ++mempool_t *nfs_commit_mempool; + + static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); + +-static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) +-{ +- struct nfs_write_data *p; +- p = (struct nfs_write_data *)mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); +- if (p) { +- memset(p, 0, sizeof(*p)); +- INIT_LIST_HEAD(&p->pages); +- } +- return p; +-} +- +-static __inline__ void nfs_writedata_free(struct nfs_write_data *p) +-{ +- mempool_free(p, nfs_wdata_mempool); +-} +- +-static void nfs_writedata_release(struct rpc_task *task) ++void nfs_writedata_release(struct rpc_task *task) + { + struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; + nfs_writedata_free(wdata); + } + +-static __inline__ struct nfs_write_data *nfs_commit_alloc(void) +-{ +- struct nfs_write_data *p; +- p = (struct nfs_write_data *)mempool_alloc(nfs_commit_mempool, SLAB_NOFS); +- if (p) { +- memset(p, 0, sizeof(*p)); +- INIT_LIST_HEAD(&p->pages); +- } +- return p; +-} +- +-static __inline__ void nfs_commit_free(struct nfs_write_data *p) +-{ +- mempool_free(p, nfs_commit_mempool); +-} +- + /* Adjust the file length if we're writing beyond the end */ + static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) + { +@@ -184,11 +151,10 @@ + int result, written = 0; + struct nfs_write_data *wdata; + +- wdata = kmalloc(sizeof(*wdata), GFP_NOFS); ++ wdata = nfs_writedata_alloc(); + if (!wdata) + return -ENOMEM; + +- memset(wdata, 0, sizeof(*wdata)); + wdata->flags = how; + wdata->cred = ctx->cred; + wdata->inode = inode; +@@ -238,8 +204,7 @@ + + io_error: + nfs_end_data_update_defer(inode); +- +- 
kfree(wdata); ++ nfs_writedata_free(wdata); + return written ? written : result; + } + +@@ -1199,7 +1164,8 @@ + } + if (time_before(complain, jiffies)) { + printk(KERN_WARNING +- "NFS: Server wrote less than requested.\n"); ++ "NFS: Server wrote zero bytes, expected %u.\n", ++ argp->count); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it except throw an error. */ +Index: linux-2.6.10/fs/nfs/proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/proc.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/fs/nfs/proc.c 2005-04-05 14:49:13.440686024 +0800 +@@ -63,12 +63,12 @@ + dprintk("%s: call getattr\n", __FUNCTION__); + fattr->valid = 0; + status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); +- dprintk("%s: reply getattr %d\n", __FUNCTION__, status); ++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __FUNCTION__); + status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0); +- dprintk("%s: reply statfs %d\n", __FUNCTION__, status); ++ dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; +@@ -96,7 +96,7 @@ + fattr->valid = 0; + status = rpc_call(server->client, NFSPROC_GETATTR, + fhandle, fattr, 0); +- dprintk("NFS reply getattr\n"); ++ dprintk("NFS reply getattr: %d\n", status); + return status; + } + +@@ -114,7 +114,7 @@ + dprintk("NFS call setattr\n"); + fattr->valid = 0; + status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); +- dprintk("NFS reply setattr\n"); ++ dprintk("NFS reply setattr: %d\n", status); + return status; + } + +@@ -213,15 +213,15 @@ + } + + static struct inode * +-nfs_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, ++nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) + { + struct nfs_fh fhandle; + struct nfs_fattr fattr; + struct nfs_createargs arg = { + .fh = NFS_FH(dir), +- .name = name->name, +- .len = name->len, ++ .name = dentry->d_name.name, ++ .len = dentry->d_name.len, + .sattr = sattr + }; + struct nfs_diropok res = { +@@ -231,7 +231,7 @@ + int status; + + fattr.valid = 0; +- dprintk("NFS call create %s\n", name->name); ++ dprintk("NFS call create %s\n", dentry->d_name.name); + status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); + dprintk("NFS reply create: %d\n", status); + if (status == 0) { +@@ -620,6 +620,7 @@ + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +Index: linux-2.6.10/fs/nfs/callback.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/callback.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/fs/nfs/callback.c 2005-04-05 14:49:13.436686632 +0800 +@@ -139,133 +139,10 @@ + return ret; + } + +-/* +- * AUTH_NULL authentication +- */ +-static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp) +-{ +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- struct kvec *resv = &rqstp->rq_res.head[0]; +- +- if (argv->iov_len < 3*4) +- return SVC_GARBAGE; +- +- if (svc_getu32(argv) != 0) { +- dprintk("svc: bad null cred\n"); +- *authp = rpc_autherr_badcred; +- return SVC_DENIED; +- } +- if (svc_getu32(argv) != RPC_AUTH_NULL || 
svc_getu32(argv) != 0) { +- dprintk("svc: bad null verf\n"); +- *authp = rpc_autherr_badverf; +- return SVC_DENIED; +- } +- +- /* Signal that mapping to nobody uid/gid is required */ +- rqstp->rq_cred.cr_uid = (uid_t) -1; +- rqstp->rq_cred.cr_gid = (gid_t) -1; +- rqstp->rq_cred.cr_group_info = groups_alloc(0); +- if (rqstp->rq_cred.cr_group_info == NULL) +- return SVC_DROP; /* kmalloc failure - client must retry */ +- +- /* Put NULL verifier */ +- svc_putu32(resv, RPC_AUTH_NULL); +- svc_putu32(resv, 0); +- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); +- return SVC_OK; +-} +- +-static int nfs_callback_null_release(struct svc_rqst *rqstp) +-{ +- if (rqstp->rq_cred.cr_group_info) +- put_group_info(rqstp->rq_cred.cr_group_info); +- rqstp->rq_cred.cr_group_info = NULL; +- return 0; /* don't drop */ +-} +- +-static struct auth_ops nfs_callback_auth_null = { +- .name = "null", +- .flavour = RPC_AUTH_NULL, +- .accept = nfs_callback_null_accept, +- .release = nfs_callback_null_release, +-}; +- +-/* +- * AUTH_SYS authentication +- */ +-static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp) +-{ +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- struct kvec *resv = &rqstp->rq_res.head[0]; +- struct svc_cred *cred = &rqstp->rq_cred; +- u32 slen, i; +- int len = argv->iov_len; +- +- dprintk("%s: start\n", __FUNCTION__); +- cred->cr_group_info = NULL; +- rqstp->rq_client = NULL; +- if ((len -= 3*4) < 0) +- return SVC_GARBAGE; +- +- /* Get length, time stamp and machine name */ +- svc_getu32(argv); +- svc_getu32(argv); +- slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); +- if (slen > 64 || (len -= (slen + 3)*4) < 0) +- goto badcred; +- argv->iov_base = (void*)((u32*)argv->iov_base + slen); +- argv->iov_len -= slen*4; +- +- cred->cr_uid = ntohl(svc_getu32(argv)); +- cred->cr_gid = ntohl(svc_getu32(argv)); +- slen = ntohl(svc_getu32(argv)); +- if (slen > 16 || (len -= (slen + 2)*4) < 0) +- goto badcred; +- cred->cr_group_info = groups_alloc(slen); +- if (cred->cr_group_info == NULL) +- return SVC_DROP; +- for (i = 0; i < slen; i++) +- GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv)); +- +- if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { +- *authp = rpc_autherr_badverf; +- return SVC_DENIED; +- } +- /* Put NULL verifier */ +- svc_putu32(resv, RPC_AUTH_NULL); +- svc_putu32(resv, 0); +- dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); +- return SVC_OK; +-badcred: +- *authp = rpc_autherr_badcred; +- return SVC_DENIED; +-} +- +-static int nfs_callback_unix_release(struct svc_rqst *rqstp) +-{ +- if (rqstp->rq_cred.cr_group_info) +- put_group_info(rqstp->rq_cred.cr_group_info); +- rqstp->rq_cred.cr_group_info = NULL; +- return 0; +-} +- +-static struct auth_ops nfs_callback_auth_unix = { +- .name = "unix", +- .flavour = RPC_AUTH_UNIX, +- .accept = nfs_callback_unix_accept, +- .release = nfs_callback_unix_release, +-}; +- +-/* +- * Hook the authentication protocol +- */ +-static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp) ++static int nfs_callback_authenticate(struct svc_rqst *rqstp) + { + struct in_addr *addr = &rqstp->rq_addr.sin_addr; + struct nfs4_client *clp; +- struct kvec *argv = &rqstp->rq_arg.head[0]; +- int flavour; +- int retval; + + /* Don't talk to strangers */ + clp = nfs4_find_client(addr); +@@ -273,34 +150,19 @@ + return SVC_DROP; + dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr)); + nfs4_put_client(clp); +- flavour = ntohl(svc_getu32(argv)); +- switch(flavour) { ++ switch 
(rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: +- if (rqstp->rq_proc != CB_NULL) { +- *authp = rpc_autherr_tooweak; +- retval = SVC_DENIED; +- break; +- } +- rqstp->rq_authop = &nfs_callback_auth_null; +- retval = nfs_callback_null_accept(rqstp, authp); ++ if (rqstp->rq_proc != CB_NULL) ++ return SVC_DENIED; + break; + case RPC_AUTH_UNIX: +- /* Eat the authentication flavour */ +- rqstp->rq_authop = &nfs_callback_auth_unix; +- retval = nfs_callback_unix_accept(rqstp, authp); + break; ++ case RPC_AUTH_GSS: ++ /* FIXME: RPCSEC_GSS handling? */ + default: +- /* FIXME: need to add RPCSEC_GSS upcalls */ +-#if 0 +- svc_ungetu32(argv); +- retval = svc_authenticate(rqstp, authp); +-#else +- *authp = rpc_autherr_rejectedcred; +- retval = SVC_DENIED; +-#endif ++ return SVC_DENIED; + } +- dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval); +- return retval; ++ return SVC_OK; + } + + /* +@@ -321,5 +183,5 @@ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, +- .pg_authenticate = nfs_callback_auth, ++ .pg_authenticate = nfs_callback_authenticate, + }; +Index: linux-2.6.10/fs/nfs/file.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/file.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/fs/nfs/file.c 2005-04-05 14:49:13.453684048 +0800 +@@ -67,6 +67,19 @@ + .setattr = nfs_setattr, + }; + ++#ifdef CONFIG_NFS_V4 ++ ++struct inode_operations nfs4_file_inode_operations = { ++ .permission = nfs_permission, ++ .getattr = nfs_getattr, ++ .setattr = nfs_setattr, ++ .getxattr = nfs4_getxattr, ++ .setxattr = nfs4_setxattr, ++ .listxattr = nfs4_listxattr, ++}; ++ ++#endif /* CONFIG_NFS_V4 */ ++ + /* Hack for future NFS swap support */ + #ifndef IS_SWAPFILE + # define IS_SWAPFILE(inode) (0) +@@ -295,10 +308,19 @@ + static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) + { + struct inode *inode = filp->f_mapping->host; +- int status; ++ int status = 0; + + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ else { ++ struct file_lock *cfl = posix_test_lock(filp, fl); ++ if (cfl != NULL) { ++ memcpy(fl, cfl, sizeof(*fl)); ++ fl->fl_type = F_UNLCK; ++ } ++ } + unlock_kernel(); + return status; + } +@@ -325,7 +347,11 @@ + * still need to complete the unlock. + */ + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ else ++ status = posix_lock_file_wait(filp, fl); + rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset); + return status; + } +@@ -351,15 +377,19 @@ + return status; + + lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); +- /* If we were signalled we still need to ensure that +- * we clean up any state on the server. We therefore +- * record the lock call as having succeeded in order to +- * ensure that locks_remove_posix() cleans it out when +- * the process exits. 
+- */ +- if (status == -EINTR || status == -ERESTARTSYS) +- posix_lock_file(filp, fl); ++ /* Use local locking if mounted with "-onolock" */ ++ if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* If we were signalled we still need to ensure that ++ * we clean up any state on the server. We therefore ++ * record the lock call as having succeeded in order to ++ * ensure that locks_remove_posix() cleans it out when ++ * the process exits. ++ */ ++ if (status == -EINTR || status == -ERESTARTSYS) ++ posix_lock_file(filp, fl); ++ } else ++ status = posix_lock_file_wait(filp, fl); + unlock_kernel(); + if (status < 0) + return status; +@@ -396,15 +426,6 @@ + if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + +- if (NFS_PROTO(inode)->version != 4) { +- /* Fake OK code if mounted without NLM support */ +- if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) { +- if (IS_GETLK(cmd)) +- return LOCK_USE_CLNT; +- return 0; +- } +- } +- + /* + * No BSD flocks over NFS allowed. + * Note: we could try to fake a POSIX lock request here by +Index: linux-2.6.10/fs/nfs/nfs3proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs3proc.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs3proc.c 2005-04-05 14:49:13.441685872 +0800 +@@ -80,10 +80,10 @@ + dprintk("%s: call fsinfo\n", __FUNCTION__); + info->fattr->valid = 0; + status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); +- dprintk("%s: reply fsinfo %d\n", __FUNCTION__, status); ++ dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); +- dprintk("%s: reply getattr %d\n", __FUNCTION__, status); ++ dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + } + return status; + } +@@ -101,7 +101,7 @@ + fattr->valid = 0; + status = rpc_call(server->client, NFS3PROC_GETATTR, + fhandle, fattr, 0); +- dprintk("NFS reply getattr\n"); ++ dprintk("NFS reply getattr: %d\n", status); + return status; + } + +@@ -119,7 +119,7 @@ + dprintk("NFS call setattr\n"); + fattr->valid = 0; + status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); +- dprintk("NFS reply setattr\n"); ++ dprintk("NFS reply setattr: %d\n", status); + return status; + } + +@@ -198,7 +198,7 @@ + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } +- dprintk("NFS reply access, status = %d\n", status); ++ dprintk("NFS reply access: %d\n", status); + return status; + } + +@@ -296,7 +296,7 @@ + * For now, we don't implement O_EXCL. 
+ */ + static struct inode * +-nfs3_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, ++nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, + int flags) + { + struct nfs_fh fhandle; +@@ -304,8 +304,8 @@ + struct nfs_fattr dir_attr; + struct nfs3_createargs arg = { + .fh = NFS_FH(dir), +- .name = name->name, +- .len = name->len, ++ .name = dentry->d_name.name, ++ .len = dentry->d_name.len, + .sattr = sattr, + }; + struct nfs3_diropres res = { +@@ -315,7 +315,7 @@ + }; + int status; + +- dprintk("NFS call create %s\n", name->name); ++ dprintk("NFS call create %s\n", dentry->d_name.name); + arg.createmode = NFS3_CREATE_UNCHECKED; + if (flags & O_EXCL) { + arg.createmode = NFS3_CREATE_EXCLUSIVE; +@@ -353,7 +353,7 @@ + if (status != 0) + goto out; + if (fhandle.size == 0 || !(fattr.valid & NFS_ATTR_FATTR)) { +- status = nfs3_proc_lookup(dir, name, &fhandle, &fattr); ++ status = nfs3_proc_lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (status != 0) + goto out; + } +@@ -838,6 +838,7 @@ + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +Index: linux-2.6.10/fs/nfs/nfs4proc.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/nfs4proc.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/nfs4proc.c 2005-04-05 14:49:13.456683592 +0800 +@@ -477,7 +477,7 @@ + /* + * Returns an nfs4_state + an referenced inode + */ +-static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) ++static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) + { + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; +@@ -491,7 +491,7 @@ + struct nfs_openargs o_arg = { + .fh = NFS_FH(dir), + .open_flags = flags, +- .name = name, ++ .name = &dentry->d_name, + .server = server, + .bitmask = server->attr_bitmask, + .claim = NFS4_OPEN_CLAIM_NULL, +@@ -581,14 +581,14 @@ + } + + +-struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred) ++struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred) + { + struct nfs4_exception exception = { }; + struct nfs4_state *res; + int status; + + do { +- status = _nfs4_do_open(dir, name, flags, sattr, cred, &res); ++ status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res); + if (status == 0) + break; + /* NOTE: BAD_SEQID means the server and client disagree about the +@@ -635,6 +635,8 @@ + + fattr->valid = 0; + ++ if (state != NULL) ++ msg.rpc_cred = state->owner->so_cred; + if (sattr->ia_valid & ATTR_SIZE) + nfs4_copy_stateid(&arg.stateid, state, NULL); + else +@@ -658,6 +660,61 @@ + return err; + } + ++struct nfs4_closedata { ++ struct inode *inode; ++ struct nfs4_state *state; ++ struct nfs_closeargs arg; ++ struct nfs_closeres res; ++}; ++ ++static void nfs4_close_done(struct rpc_task *task) ++{ ++ struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; ++ struct nfs4_state *state = calldata->state; ++ struct nfs4_state_owner *sp = state->owner; ++ struct nfs_server *server = NFS_SERVER(calldata->inode); ++ ++ /* hmm. 
we are done with the inode, and in the process of freeing ++ * the state_owner. we keep this around to process errors ++ */ ++ nfs4_increment_seqid(task->tk_status, sp); ++ switch (task->tk_status) { ++ case 0: ++ state->state = calldata->arg.open_flags; ++ memcpy(&state->stateid, &calldata->res.stateid, ++ sizeof(state->stateid)); ++ break; ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ state->state = calldata->arg.open_flags; ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ break; ++ default: ++ if (nfs4_async_handle_error(task, server) == -EAGAIN) { ++ rpc_restart_call(task); ++ return; ++ } ++ } ++ nfs4_put_open_state(state); ++ up(&sp->so_sema); ++ nfs4_put_state_owner(sp); ++ up_read(&server->nfs4_state->cl_sem); ++ kfree(calldata); ++} ++ ++static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], ++ .rpc_argp = &calldata->arg, ++ .rpc_resp = &calldata->res, ++ .rpc_cred = calldata->state->owner->so_cred, ++ }; ++ if (calldata->arg.open_flags != 0) ++ msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; ++ return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata); ++} ++ + /* + * It is possible for data to be read/written from a mem-mapped file + * after the sys_close call (which hits the vfs layer as a flush). +@@ -669,102 +726,34 @@ + * + * NOTE: Caller must be holding the sp->so_owner semaphore! + */ +-static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state) ++int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) + { +- struct nfs4_state_owner *sp = state->owner; +- int status = 0; +- struct nfs_closeargs arg = { +- .fh = NFS_FH(inode), +- }; +- struct nfs_closeres res; +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], +- .rpc_argp = &arg, +- .rpc_resp = &res, +- }; ++ struct nfs4_closedata *calldata; ++ int status; + +- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ /* Tell caller we're done */ ++ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { ++ state->state = mode; + return 0; +- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); ++ } ++ calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL); ++ if (calldata == NULL) ++ return -ENOMEM; ++ calldata->inode = inode; ++ calldata->state = state; ++ calldata->arg.fh = NFS_FH(inode); + /* Serialization for the sequence id */ +- arg.seqid = sp->so_seqid, +- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); +- +- /* hmm. we are done with the inode, and in the process of freeing +- * the state_owner. we keep this around to process errors ++ calldata->arg.seqid = state->owner->so_seqid; ++ calldata->arg.open_flags = mode; ++ memcpy(&calldata->arg.stateid, &state->stateid, ++ sizeof(calldata->arg.stateid)); ++ status = nfs4_close_call(NFS_SERVER(inode)->client, calldata); ++ /* ++ * Return -EINPROGRESS on success in order to indicate to the ++ * caller that an asynchronous RPC call has been launched, and ++ * that it will release the semaphores on completion. 
+ */ +- nfs4_increment_seqid(status, sp); +- if (!status) +- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); +- +- return status; +-} +- +-int nfs4_do_close(struct inode *inode, struct nfs4_state *state) +-{ +- struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; +- int err; +- do { +- err = _nfs4_do_close(inode, state); +- switch (err) { +- case -NFS4ERR_STALE_STATEID: +- case -NFS4ERR_EXPIRED: +- nfs4_schedule_state_recovery(server->nfs4_state); +- err = 0; +- default: +- state->state = 0; +- } +- err = nfs4_handle_exception(server, err, &exception); +- } while (exception.retry); +- return err; +-} +- +-static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) +-{ +- struct nfs4_state_owner *sp = state->owner; +- int status = 0; +- struct nfs_closeargs arg = { +- .fh = NFS_FH(inode), +- .seqid = sp->so_seqid, +- .open_flags = mode, +- }; +- struct nfs_closeres res; +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE], +- .rpc_argp = &arg, +- .rpc_resp = &res, +- }; +- +- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) +- return 0; +- memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); +- status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); +- nfs4_increment_seqid(status, sp); +- if (!status) +- memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); +- +- return status; +-} +- +-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) +-{ +- struct nfs_server *server = NFS_SERVER(state->inode); +- struct nfs4_exception exception = { }; +- int err; +- do { +- err = _nfs4_do_downgrade(inode, state, mode); +- switch (err) { +- case -NFS4ERR_STALE_STATEID: +- case -NFS4ERR_EXPIRED: +- nfs4_schedule_state_recovery(server->nfs4_state); +- err = 0; +- default: +- state->state = mode; +- } +- err = nfs4_handle_exception(server, err, &exception); +- } while (exception.retry); +- return err; ++ return (status == 0) ? 
-EINPROGRESS : status;
+ }
+
+ struct inode *
+@@ -785,7 +774,7 @@
+ }
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, &dentry->d_name, nd->intent.open.flags, &attr, cred);
++ state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
+ put_rpccred(cred);
+ if (IS_ERR(state))
+ return (struct inode *)state;
+@@ -802,7 +791,7 @@
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+ state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
+ if (IS_ERR(state))
+- state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
++ state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
+ put_rpccred(cred);
+ if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
+ return 1;
+@@ -1026,7 +1015,7 @@
+ FMODE_WRITE, cred);
+ if (IS_ERR(state))
+ state = nfs4_do_open(dentry->d_parent->d_inode,
+- &dentry->d_name, FMODE_WRITE,
++ dentry, FMODE_WRITE,
+ NULL, cred);
+ need_iput = 1;
+ }
+@@ -1327,7 +1316,7 @@
+ */
+
+ static struct inode *
+-nfs4_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr,
++nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+ {
+ struct inode *inode;
+@@ -1335,7 +1324,7 @@
+ struct rpc_cred *cred;
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, name, flags, sattr, cred);
++ state = nfs4_do_open(dir, dentry, flags, sattr, cred);
+ put_rpccred(cred);
+ if (!IS_ERR(state)) {
+ inode = state->inode;
+@@ -2049,6 +2038,86 @@
+ }
+
+ static int
++nfs4_server_supports_acls(struct nfs_server *server)
++{
++ return (server->caps & NFS_CAP_ACLS)
++ && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
++ && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
++}
++
++/* XXX: assuming XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE,
++ * and that it's OK to put sizeof(void *) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE)
++ * bytes on the stack. (Currently probably both true.)
++ */ ++#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) ++ ++static void buf_to_pages(const void *buf, ssize_t buflen, ++ struct page **pages, unsigned int *pgbase) ++{ ++ const void *p = buf; ++ ++ *pgbase = offset_in_page(buf); ++ p -= *pgbase; ++ while (p < buf + buflen) { ++ *(pages++) = virt_to_page(p); ++ p += PAGE_CACHE_SIZE; ++ } ++} ++ ++ssize_t ++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct page *pages[NFS4ACL_MAXPAGES]; ++ struct nfs_getaclargs args = { ++ .fh = NFS_FH(inode), ++ .acl_pages = pages, ++ .acl_len = buflen, ++ }; ++ ssize_t acl_len = buflen; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], ++ .rpc_argp = &args, ++ .rpc_resp = &acl_len, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); ++ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); ++ if (buflen && acl_len > buflen) ++ return -ERANGE; ++ if (ret == 0) ++ ret = acl_len; ++ return ret; ++} ++ ++int ++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct page *pages[NFS4ACL_MAXPAGES]; ++ struct nfs_setaclargs arg = { ++ .fh = NFS_FH(inode), ++ .acl_pages = pages, ++ .acl_len = buflen, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], ++ .rpc_argp = &arg, ++ .rpc_resp = NULL, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ++ ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); ++ return ret; ++} ++ ++static int + nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) + { + struct nfs4_client *clp = server->nfs4_state; +@@ -2589,6 +2658,7 @@ + .version = 4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, ++ .file_inode_ops = &nfs4_file_inode_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +Index: linux-2.6.10/fs/nfs/direct.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/direct.c 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/fs/nfs/direct.c 2005-04-05 14:49:13.448684808 +0800 +@@ -33,6 +33,7 @@ + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel ++ * 15 Sep 2004 Parallel async reads --cel + * + */ + +@@ -43,6 +44,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -50,11 +52,27 @@ + + #include + #include ++#include + + #define NFSDBG_FACILITY NFSDBG_VFS +-#define VERF_SIZE (2 * sizeof(__u32)) + #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) + ++static kmem_cache_t *nfs_direct_cachep; ++ ++/* ++ * This represents a set of asynchronous requests that we're waiting on ++ */ ++struct nfs_direct_req { ++ struct kref kref; /* release manager */ ++ struct list_head list; /* nfs_read_data structs */ ++ wait_queue_head_t wait; /* wait for i/o completion */ ++ struct page ** pages; /* pages in our buffer */ ++ unsigned int npages; /* count of pages */ ++ atomic_t complete, /* i/os we're waiting for */ ++ count, /* bytes actually processed */ ++ error; /* any reported error */ ++}; ++ + + /** + * nfs_get_user_pages - find and set up pages 
underlying user's buffer +@@ -71,7 +89,8 @@ + unsigned long page_count; + size_t array_size; + +- /* set an arbitrary limit to prevent arithmetic overflow */ ++ /* set an arbitrary limit to prevent type overflow */ ++ /* XXX: this can probably be as large as INT_MAX */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; +@@ -95,6 +114,8 @@ + /** + * nfs_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer ++ * @npages: number of pages in the array ++ * @do_dirty: dirty the pages as we release them + */ + static void + nfs_free_user_pages(struct page **pages, int npages, int do_dirty) +@@ -109,77 +130,231 @@ + } + + /** +- * nfs_direct_read_seg - Read in one iov segment. Generate separate +- * read RPCs for each "rsize" bytes. ++ * nfs_direct_req_release - release nfs_direct_req structure for direct read ++ * @kref: kref object embedded in an nfs_direct_req structure ++ * ++ */ ++static void nfs_direct_req_release(struct kref *kref) ++{ ++ struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); ++ kmem_cache_free(nfs_direct_cachep, dreq); ++} ++ ++/** ++ * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read ++ * @count: count of bytes for the read request ++ * @rsize: local rsize setting ++ * ++ * Note we also set the number of requests we have in the dreq when we are ++ * done. This prevents races with I/O completion so we will always wait ++ * until all requests have been dispatched and completed. ++ */ ++static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize) ++{ ++ struct list_head *list; ++ struct nfs_direct_req *dreq; ++ unsigned int reads = 0; ++ ++ dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); ++ if (!dreq) ++ return NULL; ++ ++ kref_init(&dreq->kref); ++ init_waitqueue_head(&dreq->wait); ++ INIT_LIST_HEAD(&dreq->list); ++ atomic_set(&dreq->count, 0); ++ atomic_set(&dreq->error, 0); ++ ++ list = &dreq->list; ++ for(;;) { ++ struct nfs_read_data *data = nfs_readdata_alloc(); ++ ++ if (unlikely(!data)) { ++ while (!list_empty(list)) { ++ data = list_entry(list->next, ++ struct nfs_read_data, pages); ++ list_del(&data->pages); ++ nfs_readdata_free(data); ++ } ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ return NULL; ++ } ++ ++ INIT_LIST_HEAD(&data->pages); ++ list_add(&data->pages, list); ++ ++ data->req = (struct nfs_page *) dreq; ++ reads++; ++ if (nbytes <= rsize) ++ break; ++ nbytes -= rsize; ++ } ++ kref_get(&dreq->kref); ++ atomic_set(&dreq->complete, reads); ++ return dreq; ++} ++ ++/** ++ * nfs_direct_read_result - handle a read reply for a direct read request ++ * @data: address of NFS READ operation control block ++ * @status: status of this NFS READ operation ++ * ++ * We must hold a reference to all the pages in this direct read request ++ * until the RPCs complete. This could be long *after* we are woken up in ++ * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server). 
++ */ ++static void nfs_direct_read_result(struct nfs_read_data *data, int status) ++{ ++ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; ++ ++ if (likely(status >= 0)) ++ atomic_add(data->res.count, &dreq->count); ++ else ++ atomic_set(&dreq->error, status); ++ ++ if (unlikely(atomic_dec_and_test(&dreq->complete))) { ++ nfs_free_user_pages(dreq->pages, dreq->npages, 1); ++ wake_up(&dreq->wait); ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ } ++} ++ ++/** ++ * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read ++ * @dreq: address of nfs_direct_req struct for this request + * @inode: target inode + * @ctx: target file open context +- * user_addr: starting address of this segment of user's buffer +- * count: size of this segment +- * file_offset: offset in file to begin the operation +- * @pages: array of addresses of page structs defining user's buffer +- * nr_pages: size of pages array ++ * @user_addr: starting address of this segment of user's buffer ++ * @count: size of this segment ++ * @file_offset: offset in file to begin the operation ++ * ++ * For each nfs_read_data struct that was allocated on the list, dispatch ++ * an NFS READ operation + */ +-static int +-nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx, +- unsigned long user_addr, size_t count, loff_t file_offset, +- struct page **pages, int nr_pages) +-{ +- const unsigned int rsize = NFS_SERVER(inode)->rsize; +- int tot_bytes = 0; +- int curpage = 0; +- struct nfs_read_data rdata = { +- .inode = inode, +- .cred = ctx->cred, +- .args = { +- .fh = NFS_FH(inode), +- .context = ctx, +- }, +- .res = { +- .fattr = &rdata.fattr, +- }, +- }; ++static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, ++ struct inode *inode, struct nfs_open_context *ctx, ++ unsigned long user_addr, size_t count, loff_t file_offset) ++{ ++ struct list_head *list = &dreq->list; ++ struct page **pages = dreq->pages; ++ unsigned int curpage, pgbase; ++ unsigned int rsize = NFS_SERVER(inode)->rsize; + +- rdata.args.pgbase = user_addr & ~PAGE_MASK; +- rdata.args.offset = file_offset; +- do { +- int result; +- +- rdata.args.count = count; +- if (rdata.args.count > rsize) +- rdata.args.count = rsize; +- rdata.args.pages = &pages[curpage]; +- +- dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", +- rdata.args.count, (long long) rdata.args.offset, +- user_addr + tot_bytes, rdata.args.pgbase, curpage); ++ curpage = 0; ++ pgbase = user_addr & ~PAGE_MASK; ++ do { ++ struct nfs_read_data *data; ++ unsigned int bytes; ++ ++ bytes = rsize; ++ if (count < rsize) ++ bytes = count; ++ ++ data = list_entry(list->next, struct nfs_read_data, pages); ++ list_del_init(&data->pages); ++ ++ data->inode = inode; ++ data->cred = ctx->cred; ++ data->args.fh = NFS_FH(inode); ++ data->args.context = ctx; ++ data->args.offset = file_offset; ++ data->args.pgbase = pgbase; ++ data->args.pages = &pages[curpage]; ++ data->args.count = bytes; ++ data->res.fattr = &data->fattr; ++ data->res.eof = 0; ++ data->res.count = bytes; ++ ++ NFS_PROTO(inode)->read_setup(data); ++ ++ data->task.tk_cookie = (unsigned long) inode; ++ data->task.tk_calldata = data; ++ data->task.tk_release = nfs_readdata_release; ++ data->complete = nfs_direct_read_result; + + lock_kernel(); +- result = NFS_PROTO(inode)->read(&rdata); ++ rpc_execute(&data->task); + unlock_kernel(); + +- if (result <= 0) { +- if (tot_bytes > 0) +- break; +- if (result == -EISDIR) +- result = -EINVAL; +- return result; +- } ++ dfprintk(VFS, "NFS: 
%4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ bytes, ++ (unsigned long long)data->args.offset); ++ ++ file_offset += bytes; ++ pgbase += bytes; ++ curpage += pgbase >> PAGE_SHIFT; ++ pgbase &= ~PAGE_MASK; + +- tot_bytes += result; +- if (rdata.res.eof) +- break; +- +- rdata.args.offset += result; +- rdata.args.pgbase += result; +- curpage += rdata.args.pgbase >> PAGE_SHIFT; +- rdata.args.pgbase &= ~PAGE_MASK; +- count -= result; ++ count -= bytes; + } while (count != 0); ++} + +- /* XXX: should we zero the rest of the user's buffer if we +- * hit eof? */ ++/** ++ * nfs_direct_read_wait - wait for I/O completion for direct reads ++ * @dreq: request on which we are to wait ++ * @intr: whether or not this wait can be interrupted ++ * ++ * Collects and returns the final error value/byte-count. ++ */ ++static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr) ++{ ++ int result = 0; + +- return tot_bytes; ++ if (intr) { ++ result = wait_event_interruptible(dreq->wait, ++ (atomic_read(&dreq->complete) == 0)); ++ } else { ++ wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0)); ++ } ++ ++ if (!result) ++ result = atomic_read(&dreq->error); ++ if (!result) ++ result = atomic_read(&dreq->count); ++ ++ kref_put(&dreq->kref, nfs_direct_req_release); ++ return (ssize_t) result; ++} ++ ++/** ++ * nfs_direct_read_seg - Read in one iov segment. Generate separate ++ * read RPCs for each "rsize" bytes. ++ * @inode: target inode ++ * @ctx: target file open context ++ * @user_addr: starting address of this segment of user's buffer ++ * @count: size of this segment ++ * @file_offset: offset in file to begin the operation ++ * @pages: array of addresses of page structs defining user's buffer ++ * @nr_pages: number of pages in the array ++ * ++ */ ++static ssize_t nfs_direct_read_seg(struct inode *inode, ++ struct nfs_open_context *ctx, unsigned long user_addr, ++ size_t count, loff_t file_offset, struct page **pages, ++ unsigned int nr_pages) ++{ ++ ssize_t result; ++ sigset_t oldset; ++ struct rpc_clnt *clnt = NFS_CLIENT(inode); ++ struct nfs_direct_req *dreq; ++ ++ dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); ++ if (!dreq) ++ return -ENOMEM; ++ ++ dreq->pages = pages; ++ dreq->npages = nr_pages; ++ ++ rpc_clnt_sigmask(clnt, &oldset); ++ nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, ++ file_offset); ++ result = nfs_direct_read_wait(dreq, clnt->cl_intr); ++ rpc_clnt_sigunmask(clnt, &oldset); ++ ++ return result; + } + + /** +@@ -191,9 +366,8 @@ + * file_offset: offset in file to begin the operation + * nr_segs: size of iovec array + * +- * generic_file_direct_IO has already pushed out any non-direct +- * writes so that this read will see them when we read from the +- * server. ++ * We've already pushed out any non-direct writes so that this read ++ * will see them when we read from the server. 
+ */
+ static ssize_t
+ nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
+@@ -222,8 +396,6 @@
+ result = nfs_direct_read_seg(inode, ctx, user_addr, size,
+ file_offset, pages, page_count);
+
+- nfs_free_user_pages(pages, page_count, 1);
+-
+ if (result <= 0) {
+ if (tot_bytes > 0)
+ break;
+@@ -249,31 +421,31 @@
+ * @pages: array of addresses of page structs defining user's buffer
+ * nr_pages: size of pages array
+ */
+-static int
+-nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
+- unsigned long user_addr, size_t count, loff_t file_offset,
+- struct page **pages, int nr_pages)
++static ssize_t nfs_direct_write_seg(struct inode *inode,
++ struct nfs_open_context *ctx, unsigned long user_addr,
++ size_t count, loff_t file_offset, struct page **pages,
++ int nr_pages)
+ {
+ const unsigned int wsize = NFS_SERVER(inode)->wsize;
+ size_t request;
+- int curpage, need_commit, result, tot_bytes;
++ int curpage, need_commit;
++ ssize_t result, tot_bytes;
+ struct nfs_writeverf first_verf;
+- struct nfs_write_data wdata = {
+- .inode = inode,
+- .cred = ctx->cred,
+- .args = {
+- .fh = NFS_FH(inode),
+- .context = ctx,
+- },
+- .res = {
+- .fattr = &wdata.fattr,
+- .verf = &wdata.verf,
+- },
+- };
++ struct nfs_write_data *wdata;
+
+- wdata.args.stable = NFS_UNSTABLE;
++ wdata = nfs_writedata_alloc();
++ if (!wdata)
++ return -ENOMEM;
++
++ wdata->inode = inode;
++ wdata->cred = ctx->cred;
++ wdata->args.fh = NFS_FH(inode);
++ wdata->args.context = ctx;
++ wdata->args.stable = NFS_UNSTABLE;
+ if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
+- wdata.args.stable = NFS_FILE_SYNC;
++ wdata->args.stable = NFS_FILE_SYNC;
++ wdata->res.fattr = &wdata->fattr;
++ wdata->res.verf = &wdata->verf;
+
+ nfs_begin_data_update(inode);
+ retry:
+@@ -281,20 +453,20 @@
+ tot_bytes = 0;
+ curpage = 0;
+ request = count;
+- wdata.args.pgbase = user_addr & ~PAGE_MASK;
+- wdata.args.offset = file_offset;
+- do {
+- wdata.args.count = request;
+- if (wdata.args.count > wsize)
+- wdata.args.count = wsize;
+- wdata.args.pages = &pages[curpage];
++ wdata->args.pgbase = user_addr & ~PAGE_MASK;
++ wdata->args.offset = file_offset;
++ do {
++ wdata->args.count = request;
++ if (wdata->args.count > wsize)
++ wdata->args.count = wsize;
++ wdata->args.pages = &pages[curpage];
+
+ dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
+- wdata.args.count, (long long) wdata.args.offset,
+- user_addr + tot_bytes, wdata.args.pgbase, curpage);
++ wdata->args.count, (long long) wdata->args.offset,
++ user_addr + tot_bytes, wdata->args.pgbase, curpage);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->write(&wdata);
++ result = NFS_PROTO(inode)->write(wdata);
+ unlock_kernel();
+
+ if (result <= 0) {
+@@ -304,20 +476,25 @@
+ }
+
+ if (tot_bytes == 0)
+- memcpy(&first_verf.verifier, &wdata.verf.verifier,
+- VERF_SIZE);
+- if (wdata.verf.committed != NFS_FILE_SYNC) {
++ memcpy(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier));
++ if (wdata->verf.committed != NFS_FILE_SYNC) {
+ need_commit = 1;
+- if (memcmp(&first_verf.verifier,
+- &wdata.verf.verifier, VERF_SIZE))
++ if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
++ sizeof(first_verf.verifier)))
+ goto sync_retry;
+ }
+
+- tot_bytes += result;
+- wdata.args.offset += result;
+- wdata.args.pgbase += result;
+- curpage += wdata.args.pgbase >> PAGE_SHIFT;
+- wdata.args.pgbase &= ~PAGE_MASK;
++ tot_bytes += result;
++
++ /* in case of a short write: stop now, let the app recover */
++ if (result < wdata->args.count) ++ break; ++ ++ wdata->args.offset += result; ++ wdata->args.pgbase += result; ++ curpage += wdata->args.pgbase >> PAGE_SHIFT; ++ wdata->args.pgbase &= ~PAGE_MASK; + request -= result; + } while (request != 0); + +@@ -325,27 +502,27 @@ + * Commit data written so far, even in the event of an error + */ + if (need_commit) { +- wdata.args.count = tot_bytes; +- wdata.args.offset = file_offset; ++ wdata->args.count = tot_bytes; ++ wdata->args.offset = file_offset; + + lock_kernel(); +- result = NFS_PROTO(inode)->commit(&wdata); ++ result = NFS_PROTO(inode)->commit(wdata); + unlock_kernel(); + + if (result < 0 || memcmp(&first_verf.verifier, +- &wdata.verf.verifier, +- VERF_SIZE) != 0) ++ &wdata->verf.verifier, ++ sizeof(first_verf.verifier)) != 0) + goto sync_retry; + } + result = tot_bytes; + + out: + nfs_end_data_update_defer(inode); +- ++ nfs_writedata_free(wdata); + return result; + + sync_retry: +- wdata.args.stable = NFS_FILE_SYNC; ++ wdata->args.stable = NFS_FILE_SYNC; + goto retry; + } + +@@ -362,9 +539,9 @@ + * that non-direct readers might access, so they will pick up these + * writes immediately. + */ +-static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx, +- const struct iovec *iov, loff_t file_offset, +- unsigned long nr_segs) ++static ssize_t nfs_direct_write(struct inode *inode, ++ struct nfs_open_context *ctx, const struct iovec *iov, ++ loff_t file_offset, unsigned long nr_segs) + { + ssize_t tot_bytes = 0; + unsigned long seg = 0; +@@ -504,6 +681,8 @@ + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) ++ retval = nfs_wb_all(inode); ++ if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; +@@ -593,6 +772,8 @@ + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) ++ retval = nfs_wb_all(inode); ++ if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; +@@ -607,3 +788,21 @@ + out: + return retval; + } ++ ++int nfs_init_directcache(void) ++{ ++ nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", ++ sizeof(struct nfs_direct_req), ++ 0, SLAB_RECLAIM_ACCOUNT, ++ NULL, NULL); ++ if (nfs_direct_cachep == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void nfs_destroy_directcache(void) ++{ ++ if (kmem_cache_destroy(nfs_direct_cachep)) ++ printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); ++} +Index: linux-2.6.10/fs/nfs/read.c +=================================================================== +--- linux-2.6.10.orig/fs/nfs/read.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/fs/nfs/read.c 2005-04-05 14:49:13.437686480 +0800 +@@ -24,7 +24,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -39,25 +38,11 @@ + static void nfs_readpage_result_full(struct nfs_read_data *, int); + + static kmem_cache_t *nfs_rdata_cachep; +-static mempool_t *nfs_rdata_mempool; ++mempool_t *nfs_rdata_mempool; + + #define MIN_POOL_READ (32) + +-static struct nfs_read_data *nfs_readdata_alloc(void) +-{ +- struct nfs_read_data *p; +- p = (struct nfs_read_data *)mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); +- if (p) +- memset(p, 0, sizeof(*p)); +- return p; +-} +- +-static __inline__ void nfs_readdata_free(struct nfs_read_data *p) +-{ +- mempool_free(p, nfs_rdata_mempool); +-} +- +-static void nfs_readdata_release(struct rpc_task *task) ++void nfs_readdata_release(struct rpc_task *task) + { + struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + 
nfs_readdata_free(data); diff --git a/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch b/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch new file mode 100644 index 0000000..8aa3fd0 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch @@ -0,0 +1,1477 @@ +Index: linux-2.6.10/arch/i386/kernel/asm-offsets.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/asm-offsets.c 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/asm-offsets.c 2005-04-05 16:34:18.173220992 +0800 +@@ -52,6 +52,7 @@ + OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_restart_block, thread_info, restart_block); ++ OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); +Index: linux-2.6.10/arch/i386/kernel/cpu/common.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/cpu/common.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/cpu/common.c 2005-04-05 16:34:18.174220840 +0800 +@@ -384,6 +384,12 @@ + if (disable_pse) + clear_bit(X86_FEATURE_PSE, c->x86_capability); + ++ /* hack: disable SEP for non-NX cpus; SEP breaks Execshield. */ ++ #ifdef CONFIG_HIGHMEM64G ++ if (!test_bit(X86_FEATURE_NX, c->x86_capability)) ++ #endif ++ clear_bit(X86_FEATURE_SEP, c->x86_capability); ++ + /* If the model name is still unset, do table lookup. */ + if ( !c->x86_model_id[0] ) { + char *p; +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-04-05 16:29:30.192000792 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 16:34:18.167221904 +0800 +@@ -218,8 +218,12 @@ + pushl %ebp + pushfl + pushl $(__USER_CS) +- pushl $SYSENTER_RETURN +- ++ /* ++ * Push current_thread_info()->sysenter_return to the stack. ++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words ++ * pushed above, and the word being pushed now: ++ */ ++ pushl (TI_sysenter_return-THREAD_SIZE+4*4)(%esp) + /* + * Load the potential sixth argument from user stack. + * Careful about security. +Index: linux-2.6.10/arch/i386/kernel/process.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/process.c 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/process.c 2005-04-05 16:34:18.173220992 +0800 +@@ -36,6 +36,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -565,6 +567,8 @@ + /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */
+
+ __unlazy_fpu(prev_p);
++ if (next_p->mm)
++ load_user_cs_desc(cpu, next_p->mm);
+
+ /*
+ * Reload esp0, LDT and the page table pointer:
+@@ -812,3 +816,62 @@
+ return 0;
+ }
+
++
++unsigned long arch_align_stack(unsigned long sp)
++{
++ if (current->flags & PF_RELOCEXEC)
++ sp -= ((get_random_int() % 65536) << 4);
++ return sp & ~0xf;
++}
++
++
++void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
++{
++ if (limit > mm->context.exec_limit) {
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
++{
++ struct vm_area_struct *vma;
++ unsigned long limit = 0;
++
++ if (old_end == mm->context.exec_limit) {
++ for (vma = mm->mmap; vma; vma = vma->vm_next)
++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
++ limit = vma->vm_end;
++
++ mm->context.exec_limit = limit;
++ set_user_cs(&mm->context.user_cs, limit);
++ if (mm == current->mm)
++ load_user_cs_desc(smp_processor_id(), mm);
++ }
++}
++
++void arch_flush_exec_range(struct mm_struct *mm)
++{
++ mm->context.exec_limit = 0;
++ set_user_cs(&mm->context.user_cs, 0);
++}
++
++/*
++ * Generate a random brk address between 128MB and 160MB (if the
++ * layout allows it).
++ */
++void randomize_brk(unsigned long old_brk)
++{
++ unsigned long new_brk, range_start, range_end;
++
++ range_start = 0x08000000;
++ if (current->mm->brk >= range_start)
++ range_start = current->mm->brk;
++ range_end = range_start + 0x02000000;
++ new_brk = randomize_range(range_start, range_end, 0);
++ if (new_brk)
++ current->mm->brk = new_brk;
++}
++
+Index: linux-2.6.10/arch/i386/kernel/signal.c
+===================================================================
+--- linux-2.6.10.orig/arch/i386/kernel/signal.c 2005-04-05 16:29:23.290050048 +0800
++++ linux-2.6.10/arch/i386/kernel/signal.c 2005-04-05 16:34:18.170221448 +0800
+@@ -390,7 +390,7 @@
+ if (err)
+ goto give_sigsegv;
+
+- restorer = &__kernel_sigreturn;
++ restorer = current->mm->context.vdso + (long)&__kernel_sigreturn;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+
+@@ -487,9 +487,10 @@
+ goto give_sigsegv;
+
+ /* Set up to return from userspace.
*/ +- restorer = &__kernel_rt_sigreturn; ++ restorer = current->mm->context.vdso + (long)&__kernel_rt_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; ++ + err |= __put_user(restorer, &frame->pretcode); + + /* +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:29:30.198999728 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:34:18.172221144 +0800 +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + + /* +@@ -313,6 +314,8 @@ + unsigned long cpu; + + cpu = get_cpu(); ++ if (current->active_mm) ++ load_user_cs_desc(cpu, current->active_mm); + + if (!cpu_isset(cpu, flush_cpumask)) + goto out; +Index: linux-2.6.10/arch/i386/kernel/sysenter.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/sysenter.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/sysenter.c 2005-04-05 16:34:18.171221296 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -41,11 +42,14 @@ + extern const char vsyscall_int80_start, vsyscall_int80_end; + extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; + ++struct page *sysenter_page; ++ + static int __init sysenter_setup(void) + { + void *page = (void *)get_zeroed_page(GFP_ATOMIC); + +- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); ++ __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_KERNEL_RO); ++ sysenter_page = virt_to_page(page); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + memcpy(page, +@@ -59,7 +63,51 @@ + &vsyscall_sysenter_end - &vsyscall_sysenter_start); + + on_each_cpu(enable_sep_cpu, NULL, 1, 1); ++ + return 0; + } + + __initcall(sysenter_setup); ++ ++extern void SYSENTER_RETURN_OFFSET; ++ ++unsigned int vdso_enabled = 0; ++ ++void map_vsyscall(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ struct vm_area_struct *vma; ++ unsigned long addr; ++ ++ if (unlikely(!vdso_enabled)) { ++ current->mm->context.vdso = NULL; ++ return; ++ } ++ ++ /* ++ * Map the vDSO (it will be randomized): ++ */ ++ down_write(¤t->mm->mmap_sem); ++ addr = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, 0); ++ current->mm->context.vdso = (void *)addr; ++ ti->sysenter_return = (void *)addr + (long)&SYSENTER_RETURN_OFFSET; ++ if (addr != -1) { ++ vma = find_vma(current->mm, addr); ++ if (vma) { ++ pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; ++ get_page(sysenter_page); ++ install_page(current->mm, vma, addr, ++ sysenter_page, vma->vm_page_prot); ++ ++ } ++ } ++ up_write(¤t->mm->mmap_sem); ++} ++ ++static int __init vdso_setup(char *str) ++{ ++ vdso_enabled = simple_strtoul(str, NULL, 0); ++ return 1; ++} ++__setup("vdso=", vdso_setup); ++ +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:29:30.193000640 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:43:17.073295728 +0800 +@@ -497,6 +497,10 @@ + DO_ERROR(12, SIGBUS, "stack segment", stack_segment) + DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + ++/* ++ * the original non-exec stack patch was written by ++ * Solar Designer . Thanks! 
++ */ + fastcall void do_general_protection(struct pt_regs * regs, long error_code) + { + int cpu = get_cpu(); +@@ -535,6 +539,46 @@ + if (!(regs->xcs & 3)) + goto gp_in_kernel; + ++ /* ++ * lazy-check for CS validity on exec-shield binaries: ++ */ ++ if (current->mm) { ++ int cpu = smp_processor_id(); ++ struct desc_struct *desc1, *desc2; ++ struct vm_area_struct *vma; ++ unsigned long limit = 0; ++ ++ spin_lock(¤t->mm->page_table_lock); ++ for (vma = current->mm->mmap; vma; vma = vma->vm_next) ++ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) ++ limit = vma->vm_end; ++ spin_unlock(¤t->mm->page_table_lock); ++ ++ current->mm->context.exec_limit = limit; ++ set_user_cs(¤t->mm->context.user_cs, limit); ++ ++ desc1 = ¤t->mm->context.user_cs; ++ desc2 = per_cpu(cpu_gdt_table, cpu) + GDT_ENTRY_DEFAULT_USER_CS; ++ ++ /* ++ * The CS was not in sync - reload it and retry the ++ * instruction. If the instruction still faults then ++ * we wont hit this branch next time around. ++ */ ++ if (desc1->a != desc2->a || desc1->b != desc2->b) { ++ if (print_fatal_signals >= 2) { ++ printk("#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); ++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx, CPU_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b); ++ } ++ load_user_cs_desc(cpu, current->mm); ++ return; ++ } ++ } ++ if (print_fatal_signals) { ++ printk("#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, error_code/8, regs->eip, smp_processor_id()); ++ printk(" exec_limit: %08lx, user_cs: %08lx/%08lx.\n", current->mm->context.exec_limit, current->mm->context.user_cs.a, current->mm->context.user_cs.b); ++ } ++ + current->thread.error_code = error_code; + current->thread.trap_no = 13; + force_sig(SIGSEGV, current); +Index: linux-2.6.10/arch/i386/kernel/vsyscall.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/vsyscall.lds.S 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/vsyscall.lds.S 2005-04-05 16:34:18.169221600 +0800 +@@ -7,7 +7,7 @@ + + SECTIONS + { +- . = VSYSCALL_BASE + SIZEOF_HEADERS; ++ . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } +@@ -20,7 +20,7 @@ + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ +- . = VSYSCALL_BASE + 0x400; ++ . = 0x400; + + .text : { *(.text) } :text =0x90909090 + +Index: linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/vsyscall-sysenter.S 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/vsyscall-sysenter.S 2005-04-05 16:34:18.170221448 +0800 +@@ -24,11 +24,11 @@ + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + +- /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ ++ /* 14: System call restart point is here! (SYSENTER_RETURN_OFFSET-2) */ + jmp .Lenter_kernel + /* 16: System call normal return point is here! */ +- .globl SYSENTER_RETURN /* Symbol used by entry.S. 
*/ +-SYSENTER_RETURN: ++ .globl SYSENTER_RETURN_OFFSET /* Symbol used by sysenter.c */ ++SYSENTER_RETURN_OFFSET: + pop %ebp + .Lpop_ebp: + pop %edx +Index: linux-2.6.10/arch/i386/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:29:28.016331544 +0800 ++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:34:18.167221904 +0800 +@@ -518,7 +518,10 @@ + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); ++ else + #endif ++ if (exec_shield) ++ printk("Using x86 segment limits to approximate NX protection\n"); + + pagetable_init(); + +Index: linux-2.6.10/arch/i386/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/mmap.c 2004-12-25 05:34:33.000000000 +0800 ++++ linux-2.6.10/arch/i386/mm/mmap.c 2005-04-05 16:43:44.365146736 +0800 +@@ -26,6 +26,7 @@ + + #include + #include ++#include + + /* + * Top of mmap area (just below the process stack). +@@ -38,13 +39,17 @@ + static inline unsigned long mmap_base(struct mm_struct *mm) + { + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; ++ unsigned long random_factor = 0; ++ ++ if (current->flags & PF_RELOCEXEC) ++ random_factor = get_random_int() % (1024*1024); + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + +- return TASK_SIZE - (gap & PAGE_MASK); ++ return PAGE_ALIGN(TASK_SIZE - gap - random_factor); + } + + /* +@@ -57,15 +62,17 @@ + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ +- if (sysctl_legacy_va_layout || ++ if ((exec_shield != 2) && (sysctl_legacy_va_layout || + (current->personality & ADDR_COMPAT_LAYOUT) || +- current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { ++ current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)){ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(mm); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; ++ if (current->flags & PF_RELOCEXEC) ++ mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->unmap_area = arch_unmap_area_topdown; + } + } +Index: linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/ia32/binfmt_elf32.c 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/arch/ia64/ia32/binfmt_elf32.c 2005-04-05 16:34:18.174220840 +0800 +@@ -272,7 +272,7 @@ + } + + static unsigned long +-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) ++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused) + { + unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; + +Index: linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/ia32/ia32_binfmt.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/ia32/ia32_binfmt.c 2005-04-05 16:34:18.175220688 +0800 +@@ -390,7 +390,7 @@ + } + + static unsigned long +-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) ++elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused) + { + unsigned long map_addr; + struct task_struct *me = current; 
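The arch/i386/mm/mmap.c hunk above is the core of the exec-shield layout change: for tasks flagged PF_RELOCEXEC it pulls the top-of-mmap base down by up to 1 MB of random jitter below the clamped stack gap, so shared libraries land at a slightly different address on every exec. A stand-alone sketch of the arithmetic in user-space C; TASK_SIZE, MIN_GAP and MAX_GAP are assumed 2.6-era i386 defaults here, and rand() stands in for the kernel's get_random_int():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE	0xC0000000UL		/* 3 GB user space on i386 */
#define MIN_GAP		(128UL * 1024 * 1024)	/* assumed 2.6 defaults */
#define MAX_GAP		(TASK_SIZE / 6 * 5)

/* Mirrors the patched mmap_base(): clamp the stack gap into
 * [MIN_GAP, MAX_GAP], subtract 0..1MB of jitter for PF_RELOCEXEC
 * tasks, and round the result to a page boundary. */
static unsigned long mmap_base(unsigned long stack_rlimit, int relocexec)
{
	unsigned long gap = stack_rlimit;
	unsigned long random_factor = relocexec ? rand() % (1024 * 1024) : 0;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
}

int main(void)
{
	srand(42);
	/* An 8 MB RLIMIT_STACK is clamped up to MIN_GAP; two calls
	 * show the per-exec jitter. */
	printf("%#lx\n", mmap_base(8UL << 20, 1));
	printf("%#lx\n", mmap_base(8UL << 20, 1));
	return 0;
}

The jitter here is deliberately small; the heavier randomization of the shared-library area itself is done by arch_get_unmapped_exec_area() in the mm/mmap.c hunk further down.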
+Index: linux-2.6.10/drivers/char/random.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/random.c 2005-04-05 16:29:24.214909448 +0800 ++++ linux-2.6.10/drivers/char/random.c 2005-04-05 16:34:18.197217344 +0800 +@@ -2469,3 +2469,37 @@ + } + #endif + #endif /* CONFIG_INET */ ++ ++/* ++ * Get a random word: ++ */ ++unsigned int get_random_int(void) ++{ ++ unsigned int val = 0; ++ ++ if (!exec_shield_randomize) ++ return 0; ++ ++#ifdef CONFIG_X86_HAS_TSC ++ rdtscl(val); ++#endif ++ val += current->pid + jiffies + (int)val; ++ ++ /* ++ * Use IP's RNG. It suits our purpose perfectly: it re-keys itself ++ * every second, from the entropy pool (and thus creates a limited ++ * drain on it), and uses halfMD4Transform within the second. We ++ * also spice it with the TSC (if available), jiffies, PID and the ++ * stack address: ++ */ ++ return secure_ip_id(val); ++} ++ ++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len) ++{ ++ unsigned long range = end - len - start; ++ if (end <= start + len) ++ return 0; ++ return PAGE_ALIGN(get_random_int() % range + start); ++} ++ +Index: linux-2.6.10/fs/binfmt_elf.c +=================================================================== +--- linux-2.6.10.orig/fs/binfmt_elf.c 2005-04-05 16:29:24.353888320 +0800 ++++ linux-2.6.10/fs/binfmt_elf.c 2005-04-05 16:39:25.042569760 +0800 +@@ -494,7 +494,7 @@ + unsigned long reloc_func_desc = 0; + char passed_fileno[6]; + struct files_struct *files; +- int have_pt_gnu_stack, executable_stack = EXSTACK_DEFAULT; ++ int have_pt_gnu_stack, relocexec, executable_stack = EXSTACK_DEFAULT; + unsigned long def_flags = 0; + struct { + struct elfhdr elf_ex; +@@ -660,6 +660,24 @@ + } + have_pt_gnu_stack = (i < loc->elf_ex.e_phnum); + ++ relocexec = 0; ++ ++ if (current->personality == PER_LINUX) ++ switch (exec_shield) { ++ case 1: ++ if (executable_stack == EXSTACK_DISABLE_X) { ++ current->flags |= PF_RELOCEXEC; ++ relocexec = PF_RELOCEXEC; ++ } ++ break; ++ ++ case 2: ++ executable_stack = EXSTACK_DISABLE_X; ++ current->flags |= PF_RELOCEXEC; ++ relocexec = PF_RELOCEXEC; ++ break; ++ } ++ + /* Some simple consistency checks for the interpreter */ + if (elf_interpreter) { + interpreter_type = INTERPRETER_ELF | INTERPRETER_AOUT; +@@ -713,6 +731,15 @@ + if (retval) + goto out_free_dentry; + ++ current->flags |= relocexec; ++#ifdef __i386__ ++ /* ++ * Turn off the CS limit completely if exec-shield disabled or ++ * NX active: ++ */ ++ if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) ++ arch_add_exec_range(current->mm, -1); ++#endif + /* Discard our unneeded old files struct */ + if (files) { + steal_locks(files); +@@ -731,7 +758,8 @@ + /* Do this immediately, since STACK_TOP as used in setup_arg_pages + may depend on the personality. */ + SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); +- if (elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack)) ++ if (exec_shield != 2 && ++ elf_read_implies_exec(loc->elf_ex, have_pt_gnu_stack)) + current->personality |= READ_IMPLIES_EXEC; + + arch_pick_mmap_layout(current->mm); +@@ -894,6 +922,14 @@ + + set_binfmt(&elf_format); + ++ /* ++ * Map the vsyscall trampoline. This address is then passed via ++ * AT_SYSINFO. 
++ */ ++#ifdef __HAVE_ARCH_VSYSCALL ++ map_vsyscall(); ++#endif ++ + compute_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT), +Index: linux-2.6.10/fs/exec.c +=================================================================== +--- linux-2.6.10.orig/fs/exec.c 2005-04-05 16:29:30.270988784 +0800 ++++ linux-2.6.10/fs/exec.c 2005-04-05 16:34:18.177220384 +0800 +@@ -396,7 +396,12 @@ + while (i < MAX_ARG_PAGES) + bprm->page[i++] = NULL; + #else ++#ifdef __HAVE_ARCH_ALIGN_STACK ++ stack_base = arch_align_stack(STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE); ++ stack_base = PAGE_ALIGN(stack_base); ++#else + stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; ++#endif + bprm->p += stack_base; + mm->arg_start = bprm->p; + arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start); +@@ -854,6 +859,7 @@ + tcomm[i] = '\0'; + set_task_comm(current, tcomm); + ++ current->flags &= ~PF_RELOCEXEC; + flush_thread(); + + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || +Index: linux-2.6.10/fs/proc/array.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/array.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/fs/proc/array.c 2005-04-05 16:34:18.180219928 +0800 +@@ -373,8 +373,12 @@ + ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; + read_unlock(&tasklist_lock); + +- if (!whole || num_threads<2) +- wchan = get_wchan(task); ++ if (!whole || num_threads<2) { ++ wchan = 0; ++ if (current->uid == task->uid || current->euid == task->uid || ++ capable(CAP_SYS_NICE)) ++ wchan = get_wchan(task); ++ } + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; +Index: linux-2.6.10/fs/proc/base.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/base.c 2005-04-05 16:29:24.361887104 +0800 ++++ linux-2.6.10/fs/proc/base.c 2005-04-05 16:34:18.179220080 +0800 +@@ -117,7 +117,7 @@ + E(PROC_TGID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), + E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO), + E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), +- E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), ++ E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUSR), + E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), + E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), +@@ -142,7 +142,7 @@ + E(PROC_TID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), + E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO), + E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), +- E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), ++ E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUSR), + E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), + E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), +Index: linux-2.6.10/fs/proc/task_mmu.c +=================================================================== +--- linux-2.6.10.orig/fs/proc/task_mmu.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/fs/proc/task_mmu.c 2005-04-05 16:41:11.796340720 +0800 +@@ -14,19 +14,27 @@ + buffer += sprintf(buffer, + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" +- "VmRSS:\t%8lu kB\n" +- "VmData:\t%8lu kB\n" +- "VmStk:\t%8lu kB\n" +- "VmExe:\t%8lu kB\n" +- "VmLib:\t%8lu kB\n" +- "VmPTE:\t%8lu kB\n", +- (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), +- mm->locked_vm << (PAGE_SHIFT-10), +- mm->rss << (PAGE_SHIFT-10), +- data << (PAGE_SHIFT-10), +- mm->stack_vm << (PAGE_SHIFT-10), text, lib, +- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); +- return 
buffer;
++ "VmRSS:\t%8lu kB\n"
++ "VmData:\t%8lu kB\n"
++ "VmStk:\t%8lu kB\n"
++ "VmExe:\t%8lu kB\n"
++ "VmLib:\t%8lu kB\n"
++ "VmPTE:\t%8lu kB\n"
++ "StaBrk:\t%08lx kB\n"
++ "Brk:\t%08lx kB\n"
++ "StaStk:\t%08lx kB\n" ,
++ (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
++ mm->locked_vm << (PAGE_SHIFT-10),
++ mm->rss << (PAGE_SHIFT-10),
++ data << (PAGE_SHIFT-10),
++ mm->stack_vm << (PAGE_SHIFT-10), text, lib,
++ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
++ mm->start_brk, mm->brk, mm->start_stack);
++#if __i386__
++ if (!nx_enabled)
++ buffer += sprintf(buffer,
++ "ExecLim:\t%08lx\n", mm->context.exec_limit);
++#endif
++ return buffer;
+ }
+
+ unsigned long task_vsize(struct mm_struct *mm)
+@@ -47,6 +55,9 @@
+
+ static int show_map(struct seq_file *m, void *v)
+ {
++#ifdef __i386__
++ struct task_struct *task = m->private;
++#endif
+ struct vm_area_struct *map = v;
+ struct file *file = map->vm_file;
+ int flags = map->vm_flags;
+@@ -65,7 +76,13 @@
+ map->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+- flags & VM_EXEC ? 'x' : '-',
++ (flags & VM_EXEC
++#ifdef __i386__
++ || (!nx_enabled &&
++ (map->vm_start < task->mm->context.exec_limit))
++#endif
++ )
++ ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ map->vm_pgoff << PAGE_SHIFT,
+ MAJOR(dev), MINOR(dev), ino, &len);
+Index: linux-2.6.10/include/asm-i386/desc.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-04-05 16:29:30.129010368 +0800
++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 16:34:18.188218712 +0800
+@@ -129,6 +129,20 @@
+ extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr,
+ unsigned long bytecount);
+
++static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
++{
++ limit = (limit - 1) / PAGE_SIZE;
++ desc->a = limit & 0xffff;
++ desc->b = (limit & 0xf0000) | 0x00c0fb00;
++}
++
++#define load_user_cs_desc(cpu, mm) \
++ per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs
++
++extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
++extern void arch_flush_exec_range(struct mm_struct *mm);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif
+Index: linux-2.6.10/include/asm-i386/elf.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-i386/elf.h 2004-12-25 05:35:15.000000000 +0800
++++ linux-2.6.10/include/asm-i386/elf.h 2005-04-05 16:34:18.188218712 +0800
+@@ -9,6 +9,7 @@
+ #include
+ #include
+ #include /* for savesegment */
++#include
+
+ #include
+
+@@ -133,15 +134,22 @@
+ #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+ #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
+
+-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
+-#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
+-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+ extern void __kernel_vsyscall;
++#define VSYSCALL_BASE ((unsigned long)current->mm->context.vdso)
++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
++#define VSYSCALL_OFFSET ((unsigned long) &__kernel_vsyscall)
++#define VSYSCALL_ENTRY (VSYSCALL_BASE + VSYSCALL_OFFSET)
+
+-#define ARCH_DLINFO \
+-do { \
+- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
+- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++/* kernel-internal fixmap address: */
++#define __VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
++#define __VSYSCALL_EHDR ((const struct elfhdr *) __VSYSCALL_BASE) ++ ++#define ARCH_DLINFO \ ++do { \ ++ if (VSYSCALL_BASE) { \ ++ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++ } \ + } while (0) + + /* +@@ -152,15 +160,15 @@ + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. + */ +-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) ++#define ELF_CORE_EXTRA_PHDRS (__VSYSCALL_EHDR->e_phnum) + #define ELF_CORE_WRITE_EXTRA_PHDRS \ + do { \ + const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++ (const struct elf_phdr *) (__VSYSCALL_BASE \ ++ + __VSYSCALL_EHDR->e_phoff); \ + int i; \ + Elf32_Off ofs = 0; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \ + struct elf_phdr phdr = vsyscall_phdrs[i]; \ + if (phdr.p_type == PT_LOAD) { \ + BUG_ON(ofs != 0); \ +@@ -178,10 +186,10 @@ + #define ELF_CORE_WRITE_EXTRA_DATA \ + do { \ + const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++ (const struct elf_phdr *) (__VSYSCALL_BASE \ ++ + __VSYSCALL_EHDR->e_phoff); \ + int i; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ for (i = 0; i < __VSYSCALL_EHDR->e_phnum; ++i) { \ + if (vsyscall_phdrs[i].p_type == PT_LOAD) \ + DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ + PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ +@@ -190,4 +198,10 @@ + + #endif + ++#define __HAVE_ARCH_RANDOMIZE_BRK ++extern void randomize_brk(unsigned long old_brk); ++ ++#define __HAVE_ARCH_VSYSCALL ++extern void map_vsyscall(void); ++ + #endif +Index: linux-2.6.10/include/asm-i386/mmu.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/mmu.h 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/mmu.h 2005-04-05 16:34:18.189218560 +0800 +@@ -7,11 +7,17 @@ + * we put the segment information here. + * + * cpu_vm_mask is used to optimize ldt flushing. ++ * ++ * exec_limit is used to track the range PROT_EXEC ++ * mappings span. + */ + typedef struct { + int size; + struct semaphore sem; + void *ldt; ++ struct desc_struct user_cs; ++ unsigned long exec_limit; ++ void *vdso; + } mm_context_t; + + #endif +Index: linux-2.6.10/include/asm-i386/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/pgalloc.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/pgalloc.h 2005-04-05 16:34:18.190218408 +0800 +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include /* for struct page */ + +Index: linux-2.6.10/include/asm-i386/processor.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/processor.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/processor.h 2005-04-05 16:34:18.189218560 +0800 +@@ -296,7 +296,10 @@ + /* This decides where the kernel will search for a free chunk of vm + * space during mmap's. 
+ */ +-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) ++#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) ++ ++#define __HAVE_ARCH_ALIGN_STACK ++extern unsigned long arch_align_stack(unsigned long sp); + + #define HAVE_ARCH_PICK_MMAP_LAYOUT + +@@ -478,6 +481,7 @@ + regs->xcs = __USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ ++ load_user_cs_desc(smp_processor_id(), current->mm); \ + } while (0) + + /* Forward declaration, a strange C thing */ +Index: linux-2.6.10/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-04-05 16:29:30.127010672 +0800 ++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 16:34:18.190218408 +0800 +@@ -38,6 +38,7 @@ + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ ++ void *sysenter_return; + struct restart_block restart_block; + + unsigned long previous_esp; /* ESP of the previous stack in case +Index: linux-2.6.10/include/asm-ia64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ia64/pgalloc.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/include/asm-ia64/pgalloc.h 2005-04-05 16:34:18.184219320 +0800 +@@ -23,6 +23,10 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + /* + * Very stupidly, we used to get new pgd's and pmd's, init their contents + * to point to the NULL versions of the next level page table, later on +Index: linux-2.6.10/include/asm-ppc64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/pgalloc.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800 +@@ -11,6 +11,11 @@ + + extern kmem_cache_t *zero_cache; + ++/* Dummy functions since we don't support execshield on ppc */ ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + /* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License +Index: linux-2.6.10/include/asm-ppc/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc/pgalloc.h 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc/pgalloc.h 2005-04-05 16:34:18.183219472 +0800 +@@ -40,5 +40,10 @@ + + #define check_pgt_cache() do { } while (0) + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ ++ + #endif /* _PPC_PGALLOC_H */ + #endif /* __KERNEL__ */ +Index: linux-2.6.10/include/asm-s390/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/pgalloc.h 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/include/asm-s390/pgalloc.h 2005-04-05 16:34:18.186219016 +0800 +@@ -19,6 +19,10 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #define check_pgt_cache() do {} while (0) + + extern void diag10(unsigned long 
addr); +Index: linux-2.6.10/include/asm-sparc64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc64/pgalloc.h 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/include/asm-sparc64/pgalloc.h 2005-04-05 16:34:18.187218864 +0800 +@@ -261,4 +261,8 @@ + #define pgd_free(pgd) free_pgd_fast(pgd) + #define pgd_alloc(mm) get_pgd_fast() + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #endif /* _SPARC64_PGALLOC_H */ +Index: linux-2.6.10/include/asm-sparc/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc/pgalloc.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/asm-sparc/pgalloc.h 2005-04-05 16:34:18.191218256 +0800 +@@ -66,4 +66,8 @@ + #define pte_free(pte) BTFIXUP_CALL(pte_free)(pte) + #define __pte_free_tlb(tlb, pte) pte_free(pte) + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ + #endif /* _SPARC_PGALLOC_H */ +Index: linux-2.6.10/include/asm-x86_64/pgalloc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/pgalloc.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/pgalloc.h 2005-04-05 16:34:18.185219168 +0800 +@@ -7,6 +7,11 @@ + #include + #include + ++#define arch_add_exec_range(mm, limit) do { ; } while (0) ++#define arch_flush_exec_range(mm) do { ; } while (0) ++#define arch_remove_exec_range(mm, limit) do { ; } while (0) ++ ++ + #define pmd_populate_kernel(mm, pmd, pte) \ + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) + #define pgd_populate(mm, pgd, pmd) \ +Index: linux-2.6.10/include/linux/mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mm.h 2005-04-05 16:29:30.250991824 +0800 ++++ linux-2.6.10/include/linux/mm.h 2005-04-05 16:43:44.366146584 +0800 +@@ -685,7 +685,14 @@ + unsigned long addr, unsigned long len, pgoff_t pgoff); + extern void exit_mmap(struct mm_struct *); + +-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); ++extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); ++ ++ ++static inline unsigned long get_unmapped_area(struct file * file, unsigned long addr, ++ unsigned long len, unsigned long pgoff, unsigned long flags) ++{ ++ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); ++} + + extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, +Index: linux-2.6.10/include/linux/random.h +=================================================================== +--- linux-2.6.10.orig/include/linux/random.h 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/include/linux/random.h 2005-04-05 16:34:18.183219472 +0800 +@@ -69,6 +69,9 @@ + extern struct file_operations random_fops, urandom_fops; + #endif + ++unsigned int get_random_int(void); ++unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); ++ + #endif /* __KERNEL___ */ + + #endif /* _LINUX_RANDOM_H */ +Index: linux-2.6.10/include/linux/resource.h +=================================================================== +--- 
linux-2.6.10.orig/include/linux/resource.h 2004-12-25 05:33:52.000000000 +0800 ++++ linux-2.6.10/include/linux/resource.h 2005-04-05 16:34:18.182219624 +0800 +@@ -52,8 +52,11 @@ + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. ++ * ++ * (2MB more to cover randomization effects.) + */ +-#define _STK_LIM (8*1024*1024) ++#define _STK_LIM (10*1024*1024) ++#define EXEC_STACK_BIAS (2*1024*1024) + + /* + * GPG wants 32kB of mlocked memory, to make sure pass phrases +Index: linux-2.6.10/include/linux/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:29:27.971338384 +0800 ++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:43:44.367146432 +0800 +@@ -32,6 +32,9 @@ + #include + + struct exec_domain; ++extern int exec_shield; ++extern int exec_shield_randomize; ++extern int print_fatal_signals; + + /* + * cloning flags: +@@ -193,6 +196,10 @@ + extern unsigned long + arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); ++ ++extern unsigned long ++arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, ++ unsigned long, unsigned long); + extern unsigned long + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, +@@ -208,6 +215,9 @@ + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); ++ unsigned long (*get_unmapped_exec_area) (struct file *filp, ++ unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct vm_area_struct *area); + unsigned long mmap_base; /* base of mmap area */ + unsigned long free_area_cache; /* first hole */ +@@ -720,6 +730,7 @@ + #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ + #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ + #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ ++#define PF_RELOCEXEC 0x00800000 /* relocate shared libraries */ + + #ifdef CONFIG_SMP + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +Index: linux-2.6.10/kernel/signal.c +=================================================================== +--- linux-2.6.10.orig/kernel/signal.c 2005-04-05 16:29:27.951341424 +0800 ++++ linux-2.6.10/kernel/signal.c 2005-04-05 16:43:17.077295120 +0800 +@@ -1608,6 +1608,35 @@ + spin_unlock_irq(¤t->sighand->siglock); + } + ++int print_fatal_signals = 0; ++ ++static void print_fatal_signal(struct pt_regs *regs, int signr) ++{ ++ int i; ++ unsigned char insn; ++ printk("%s/%d: potentially unexpected fatal signal %d.\n", ++ current->comm, current->pid, signr); ++ ++#ifdef __i386__ ++ printk("code at %08lx: ", regs->eip); ++ for (i = 0; i < 16; i++) { ++ __get_user(insn, (unsigned char *)(regs->eip + i)); ++ printk("%02x ", insn); ++ } ++#endif ++ printk("\n"); ++ show_regs(regs); ++} ++ ++static int __init setup_print_fatal_signals(char *str) ++{ ++ get_option (&str, &print_fatal_signals); ++ ++ return 1; ++} ++ ++__setup("print-fatal-signals=", setup_print_fatal_signals); ++ + #ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER + + static void +@@ -1808,6 +1837,12 @@ + if (!signr) + break; /* will return 0 */ + ++ if ((signr == SIGSEGV) && print_fatal_signals) { ++ spin_unlock_irq(¤t->sighand->siglock); ++ print_fatal_signal(regs, signr); ++ spin_lock_irq(¤t->sighand->siglock); ++ } ++ + if 
((current->ptrace & PT_PTRACED) && signr != SIGKILL) { + ptrace_signal_deliver(regs, cookie); + +@@ -1904,6 +1939,8 @@ + * Anything else is fatal, maybe with a core dump. + */ + current->flags |= PF_SIGNALED; ++ if (print_fatal_signals) ++ print_fatal_signal(regs, signr); + if (sig_kernel_coredump(signr)) { + /* + * If it was able to dump core, this kills all +Index: linux-2.6.10/kernel/sysctl.c +=================================================================== +--- linux-2.6.10.orig/kernel/sysctl.c 2005-04-05 16:29:24.394882088 +0800 ++++ linux-2.6.10/kernel/sysctl.c 2005-04-05 16:43:17.078294968 +0800 +@@ -75,6 +75,29 @@ + void __user *, size_t *, loff_t *); + #endif + ++extern unsigned int vdso_enabled; ++ ++int exec_shield = 1; ++int exec_shield_randomize = 1; ++ ++static int __init setup_exec_shield(char *str) ++{ ++ get_option (&str, &exec_shield); ++ ++ return 1; ++} ++ ++__setup("exec-shield=", setup_exec_shield); ++ ++static int __init setup_exec_shield_randomize(char *str) ++{ ++ get_option (&str, &exec_shield_randomize); ++ ++ return 1; ++} ++ ++__setup("exec-shield-randomize=", setup_exec_shield_randomize); ++ + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; + static int minolduid; +@@ -276,6 +299,40 @@ + .proc_handler = &proc_dointvec, + }, + { ++ .ctl_name = KERN_PANIC, ++ .procname = "exec-shield", ++ .data = &exec_shield, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "exec-shield-randomize", ++ .data = &exec_shield_randomize, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "print-fatal-signals", ++ .data = &print_fatal_signals, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#if __i386__ ++ { ++ .ctl_name = KERN_PANIC, ++ .procname = "vdso", ++ .data = &vdso_enabled, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++ { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, +Index: linux-2.6.10/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/mm/mmap.c 2005-04-05 16:29:30.134009608 +0800 ++++ linux-2.6.10/mm/mmap.c 2005-04-05 16:43:44.369146128 +0800 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -245,6 +246,8 @@ + __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) + { ++ if (vma->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, vma->vm_end); + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; +@@ -347,6 +350,8 @@ + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, vma->vm_end); + } + + /* +@@ -642,6 +647,8 @@ + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ if (prev->vm_flags & VM_EXEC) ++ arch_add_exec_range(mm, prev->vm_end); + return prev; + } + +@@ -813,7 +820,7 @@ + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. 
+ */ +- addr = get_unmapped_area(file, addr, len, pgoff, flags); ++ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, prot & PROT_EXEC); + if (addr & ~PAGE_MASK) + return addr; + +@@ -1207,9 +1214,10 @@ + area->vm_mm->free_area_cache = area->vm_end; + } + ++ + unsigned long +-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, +- unsigned long pgoff, unsigned long flags) ++get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, ++ unsigned long pgoff, unsigned long flags, int exec) + { + if (flags & MAP_FIXED) { + unsigned long ret; +@@ -1241,10 +1249,80 @@ + return file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + +- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); ++ if (exec && current->mm->get_unmapped_exec_area) ++ return current->mm->get_unmapped_exec_area(file, addr, len, pgoff, flags); ++ else ++ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + } + +-EXPORT_SYMBOL(get_unmapped_area); ++EXPORT_SYMBOL(get_unmapped_area_prot); ++ ++ ++#define SHLIB_BASE 0x00111000 ++ ++unsigned long arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, ++ unsigned long len0, unsigned long pgoff, unsigned long flags) ++{ ++ unsigned long addr = addr0, len = len0; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long tmp; ++ ++ if (len > TASK_SIZE) ++ return -ENOMEM; ++ ++ if (!addr && !(flags & MAP_FIXED)) ++ addr = randomize_range(SHLIB_BASE, 0x01000000, len); ++ ++ if (addr) { ++ addr = PAGE_ALIGN(addr); ++ vma = find_vma(mm, addr); ++ if (TASK_SIZE - len >= addr && ++ (!vma || addr + len <= vma->vm_start)) { ++ return addr; ++ } ++ } ++ ++ addr = SHLIB_BASE; ++ ++ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { ++ /* At this point: (!vma || addr < vma->vm_end). */ ++ if (TASK_SIZE - len < addr) { ++ return -ENOMEM; ++ } ++ if (!vma || addr + len <= vma->vm_start) { ++ /* ++ * Must not let a PROT_EXEC mapping get into the ++ * brk area: ++ */ ++ if (addr + len > mm->brk) ++ goto failed; ++ ++ /* ++ * Up until the brk area we randomize addresses ++ * as much as possible: ++ */ ++ if (addr >= 0x01000000) { ++ tmp = randomize_range(0x01000000, mm->brk, len); ++ vma = find_vma(mm, tmp); ++ if (TASK_SIZE - len >= tmp && ++ (!vma || tmp + len <= vma->vm_start)) ++ return tmp; ++ } ++ /* ++ * Ok, randomization didnt work out - return ++ * the result of the linear search: ++ */ ++ return addr; ++ } ++ addr = vma->vm_end; ++ } ++ ++failed: ++ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); ++} ++ ++ + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +@@ -1319,6 +1397,14 @@ + return prev ? prev->vm_next : vma; + } + ++ ++static int over_stack_limit(unsigned long sz) ++{ ++ if (sz < EXEC_STACK_BIAS) ++ return 0; ++ return (sz - EXEC_STACK_BIAS) > current->signal->rlim[RLIMIT_STACK].rlim_cur; ++} ++ + #ifdef CONFIG_STACK_GROWSUP + /* + * vma is the first one with address > vma->vm_end. Have to extend vma. 
+@@ -1358,7 +1444,7 @@ + return -ENOMEM; + } + +- if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur || ++ if (over_stack_limit(address - vma->vm_start) || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->signal->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); +@@ -1432,7 +1518,7 @@ + return -ENOMEM; + } + +- if (vma->vm_end - address > current->signal->rlim[RLIMIT_STACK].rlim_cur || ++ if (over_stack_limit(vma->vm_end - address) || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->signal->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); +@@ -1668,10 +1754,14 @@ + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- if (new_below) ++ if (new_below) { ++ unsigned long old_end = vma->vm_end; ++ + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); +- else ++ if (vma->vm_flags & VM_EXEC) ++ arch_remove_exec_range(mm, old_end); ++ } else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +@@ -1890,6 +1980,7 @@ + mm->rss = 0; + mm->total_vm = 0; + mm->locked_vm = 0; ++ arch_flush_exec_range(mm); + + spin_unlock(&mm->page_table_lock); + +Index: linux-2.6.10/mm/mprotect.c +=================================================================== +--- linux-2.6.10.orig/mm/mprotect.c 2005-04-05 16:29:30.135009456 +0800 ++++ linux-2.6.10/mm/mprotect.c 2005-04-05 16:34:18.193217952 +0800 +@@ -22,6 +22,7 @@ + + #include + #include ++#include + #include + #include + +@@ -117,7 +118,7 @@ + struct mm_struct * mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; +- unsigned long charged = 0; ++ unsigned long charged = 0, old_end = vma->vm_end; + pgprot_t newprot; + pgoff_t pgoff; + int error; +@@ -179,8 +180,11 @@ + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. 
+ */ ++ oldflags = vma->vm_flags; + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; ++ if (oldflags & VM_EXEC) ++ arch_remove_exec_range(current->mm, old_end); + change_protection(vma, start, end, newprot); + __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + __vm_stat_account(mm, newflags, vma->vm_file, nrpages); +Index: linux-2.6.10/mm/mremap.c +=================================================================== +--- linux-2.6.10.orig/mm/mremap.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/mm/mremap.c 2005-04-05 16:43:44.370145976 +0800 +@@ -385,8 +385,8 @@ + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + +- new_addr = get_unmapped_area(vma->vm_file, 0, new_len, +- vma->vm_pgoff, map_flags); ++ new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, ++ vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; diff --git a/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch b/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch new file mode 100644 index 0000000..9c0bb12 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch @@ -0,0 +1,10676 @@ +Index: linux-2.6.10/arch/i386/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig.debug 2005-04-05 16:29:30.191000944 +0800 ++++ linux-2.6.10/arch/i386/Kconfig.debug 2005-04-05 16:47:53.904211032 +0800 +@@ -2,6 +2,63 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. ++ + config EARLY_PRINTK + bool "Early printk" if EMBEDDED + default y +@@ -15,8 +72,8 @@ + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. 
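The compression choices declared in the Kconfig entries above are implemented elsewhere in this patch as separate modules. To make the RLE option concrete, here is a toy run-length encoder in plain C; it only illustrates the idea and does not reproduce the actual record format the crash-dump compressor writes:

#include <stdio.h>
#include <string.h>

/* Toy run-length encoder: emits (count, value) byte pairs. Zero-filled
 * pages, which are common in a memory dump, collapse to a few dozen
 * bytes. Returns the compressed length, or 0 when RLE would not shrink
 * the data (the caller would then store the page uncompressed). */
static size_t rle_compress(const unsigned char *in, size_t in_len,
			   unsigned char *out, size_t out_len)
{
	size_t i = 0, o = 0;

	while (i < in_len) {
		unsigned char val = in[i];
		size_t run = 1;

		while (i + run < in_len && in[i + run] == val && run < 255)
			run++;
		if (o + 2 > out_len)
			return 0;
		out[o++] = (unsigned char)run;
		out[o++] = val;
		i += run;
	}
	return o < in_len ? o : 0;
}

int main(void)
{
	unsigned char page[4096], out[8192];

	memset(page, 0, sizeof(page));	/* a typical all-zero page */
	printf("4096 bytes -> %zu bytes\n",
	       rle_compress(page, sizeof(page), out, sizeof(out)));
	return 0;
}

RLE is cheap enough to run from a dying kernel; gzip costs more CPU but compresses non-trivial pages far better, which is presumably why both are offered.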
+ +-config DEBUG_STACKOVERFLOW +- bool "Check for stack overflows" ++config DEBUG_STACKOVERFLOW ++ bool "Check for stack overflows" + depends on DEBUG_KERNEL + + config KPROBES +Index: linux-2.6.10/arch/i386/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/mm/init.c 2005-04-05 16:47:05.157621640 +0800 ++++ linux-2.6.10/arch/i386/mm/init.c 2005-04-05 16:47:53.909210272 +0800 +@@ -244,6 +244,13 @@ + return 0; + } + ++/* To enable modules to check if a page is in RAM */ ++int pfn_is_ram(unsigned long pfn) ++{ ++ return (page_is_ram(pfn)); ++} ++ ++ + #ifdef CONFIG_HIGHMEM + pte_t *kmap_pte; + pgprot_t kmap_prot; +Index: linux-2.6.10/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/traps.c 2005-04-05 16:47:05.156621792 +0800 ++++ linux-2.6.10/arch/i386/kernel/traps.c 2005-04-05 16:47:53.906210728 +0800 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_EISA + #include +@@ -382,6 +383,7 @@ + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irq(&die.lock); ++ dump((char *)str, regs); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + +@@ -654,6 +656,7 @@ + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); ++ dump((char *)msg, regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); +Index: linux-2.6.10/arch/i386/kernel/setup.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/setup.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/setup.c 2005-04-05 16:47:53.905210880 +0800 +@@ -662,6 +662,10 @@ + */ + #define LOWMEMSIZE() (0x9f000) + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++unsigned long crashdump_addr = 0xdeadbeef; ++#endif ++ + static void __init parse_cmdline_early (char ** cmdline_p) + { + char c = ' ', *to = command_line, *from = saved_command_line; +@@ -823,6 +827,11 @@ + if (c == ' ' && !memcmp(from, "vmalloc=", 8)) + __VMALLOC_RESERVE = memparse(from+8, &from); + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ if (c == ' ' && !memcmp(from, "crashdump=", 10)) ++ crashdump_addr = memparse(from+10, &from); ++#endif ++ + c = *(from++); + if (!c) + break; +@@ -1288,6 +1297,10 @@ + + static char * __init machine_specific_memory_setup(void); + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void crashdump_reserve(void); ++#endif ++ + /* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures +@@ -1393,6 +1406,10 @@ + #endif + + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ crashdump_reserve(); /* Preserve crash dump state from prev boot */ ++#endif ++ + dmi_scan_machine(); + + #ifdef CONFIG_X86_GENERICARCH +Index: linux-2.6.10/arch/i386/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/smp.c 2005-04-05 16:47:05.154622096 +0800 ++++ linux-2.6.10/arch/i386/kernel/smp.c 2005-04-05 16:47:53.908210424 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -143,6 +144,13 @@ + */ + cfg = __prepare_ICR(shortcut, vector); + ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } ++ + /* + * Send the IPI. The write to APIC_ICR fires this off. 
+ */ +@@ -220,6 +228,13 @@ + * program the ICR + */ + cfg = __prepare_ICR(0, vector); ++ ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } + + /* + * Send the IPI. The write to APIC_ICR fires this off. +@@ -506,6 +521,11 @@ + + static struct call_data_struct * call_data; + ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(DUMP_VECTOR); ++} ++ + /* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. +@@ -561,7 +581,7 @@ + return 0; + } + +-static void stop_this_cpu (void * dummy) ++void stop_this_cpu (void * dummy) + { + /* + * Remove this CPU: +@@ -622,4 +642,3 @@ + atomic_inc(&call_data->finished); + } + } +- +Index: linux-2.6.10/arch/i386/kernel/i386_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/i386_ksyms.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/i386_ksyms.c 2005-04-05 16:47:53.907210576 +0800 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -31,6 +32,7 @@ + #include + #include + #include ++#include + #include + + extern void dump_thread(struct pt_regs *, struct user *); +@@ -192,3 +194,20 @@ + #endif + + EXPORT_SYMBOL(csum_partial); ++ ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++#endif ++extern int pfn_is_ram(unsigned long); ++EXPORT_SYMBOL(pfn_is_ram); ++#ifdef ARCH_HAS_NMI_WATCHDOG ++EXPORT_SYMBOL(touch_nmi_watchdog); ++#endif ++#endif +Index: linux-2.6.10/arch/s390/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/s390/Kconfig.debug 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/s390/Kconfig.debug 2005-04-05 16:47:53.921208448 +0800 +@@ -2,4 +2,13 @@ + + source "lib/Kconfig.debug" + ++config KERNTYPES ++ bool "Kerntypes debugging information" ++ default y ++ ---help--- ++ Say Y here to save additional kernel debugging information in the ++ file init/kerntypes.o. This information is used by crash analysis ++ tools such as lcrash to assign structures to kernel addresses. 
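Each of the die()/NMI hunks above ends in a call to dump(), which the linux/dump.h hunk later in this patch defines as an indirect call through dump_function_ptr; when no dump module has registered, a crash path pays only a NULL test. A self-contained sketch of the pattern, using userspace stand-ins (printf, an opaque pt_regs) purely for illustration:

    #include <stdio.h>

    struct pt_regs;                 /* opaque here; real layout is per-arch */

    /* set by the dump module on load, presumably cleared on unload */
    static void (*dump_function_ptr)(const char *, const struct pt_regs *);

    static void dump(const char *str, const struct pt_regs *regs)
    {
            if (dump_function_ptr)      /* no module loaded -> no-op */
                    dump_function_ptr(str, regs);
    }

    static void example_dumper(const char *str, const struct pt_regs *regs)
    {
            printf("dumping: %s\n", str);
    }

    int main(void)
    {
            dump("oops before registration", NULL); /* silently ignored */
            dump_function_ptr = example_dumper;     /* "module" registers */
            dump("oops after registration", NULL);  /* now dispatched */
            return 0;
    }

This is why the crash paths can call dump() unconditionally: every call site tests the pointer inside the wrapper, not at the caller.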
++ ++ + endmenu +Index: linux-2.6.10/arch/s390/boot/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/s390/boot/Makefile 2004-12-25 05:35:49.000000000 +0800 ++++ linux-2.6.10/arch/s390/boot/Makefile 2005-04-05 16:47:53.922208296 +0800 +@@ -15,4 +15,4 @@ + + install: $(CONFIGURE) $(obj)/image + sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \ +- System.map Kerntypes "$(INSTALL_PATH)" ++ System.map init/Kerntypes "$(INSTALL_PATH)" +Index: linux-2.6.10/arch/s390/boot/install.sh +=================================================================== +--- linux-2.6.10.orig/arch/s390/boot/install.sh 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/s390/boot/install.sh 2005-04-05 16:47:53.921208448 +0800 +@@ -16,7 +16,8 @@ + # $1 - kernel version + # $2 - kernel image file + # $3 - kernel map file +-# $4 - default install path (blank if root directory) ++# $4 - kernel type file ++# $5 - default install path (blank if root directory) + # + + # User may have a custom install script +@@ -26,13 +27,13 @@ + + # Default install - same as make zlilo + +-if [ -f $4/vmlinuz ]; then +- mv $4/vmlinuz $4/vmlinuz.old ++if [ -f $5/vmlinuz ]; then ++ mv $5/vmlinuz $5/vmlinuz.old + fi + +-if [ -f $4/System.map ]; then +- mv $4/System.map $4/System.old ++if [ -f $5/System.map ]; then ++ mv $5/System.map $5/System.old + fi + +-cat $2 > $4/vmlinuz +-cp $3 $4/System.map ++cat $2 > $5/vmlinuz ++cp $3 $5/System.map +Index: linux-2.6.10/arch/ia64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/ia64/Kconfig.debug 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/ia64/Kconfig.debug 2005-04-05 16:47:53.917209056 +0800 +@@ -2,6 +2,65 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. 
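The i386 smp.c hunks above (and the x86_64 ones below) force DUMP_VECTOR IPIs into NMI delivery mode, so CPUs spinning with interrupts disabled still enter the dump handler. The rewrite is plain bit surgery on the prepared ICR value; a sketch, where the APIC field constants are assumptions matching the usual asm/apicdef.h values rather than anything defined in this patch:

    #include <stdio.h>

    #define APIC_VECTOR_MASK 0x000FFu   /* low byte selects the vector */
    #define APIC_DM_NMI      0x00400u   /* delivery-mode field = NMI */
    #define DUMP_VECTOR      0xFAu      /* illustrative vector number */

    static unsigned int force_nmi_delivery(unsigned int cfg, unsigned int vector)
    {
            if (vector == DUMP_VECTOR)
                    /* drop the vector bits, switch delivery mode to NMI */
                    cfg = (cfg & ~APIC_VECTOR_MASK) | APIC_DM_NMI;
            return cfg;
    }

    int main(void)
    {
            /* 0xFA in the vector field becomes pure NMI delivery: 0x400 */
            printf("icr=%#x\n", force_nmi_delivery(DUMP_VECTOR, DUMP_VECTOR));
            return 0;
    }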
++ ++ ++ + choice + prompt "Physical memory granularity" + default IA64_GRANULE_64MB +Index: linux-2.6.10/arch/ia64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/traps.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/traps.c 2005-04-05 16:47:53.918208904 +0800 +@@ -21,6 +21,8 @@ + #include + #include + #include ++#include ++#include + + extern spinlock_t timerlist_lock; + +@@ -89,6 +91,7 @@ + printk("%s[%d]: %s %ld [%d]\n", + current->comm, current->pid, str, err, ++die_counter); + show_regs(regs); ++ dump((char *)str, regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); + +Index: linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:29:27.954340968 +0800 ++++ linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c 2005-04-05 16:47:53.917209056 +0800 +@@ -7,7 +7,6 @@ + + #include + #include +- + #include + EXPORT_SYMBOL(memset); + EXPORT_SYMBOL(memchr); +@@ -28,6 +27,9 @@ + EXPORT_SYMBOL(strstr); + EXPORT_SYMBOL(strpbrk); + ++#include ++EXPORT_SYMBOL(sys_ioctl); ++ + #include + EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ + +@@ -125,3 +127,21 @@ + # endif + # endif + #endif ++ ++#include ++ ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t _irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++extern int (*dump_ipi_function_ptr)(struct pt_regs *); ++extern void dump_send_ipi(void); ++EXPORT_SYMBOL(_irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++EXPORT_SYMBOL(dump_ipi_function_ptr); ++#endif ++#endif ++ +Index: linux-2.6.10/arch/ia64/kernel/irq.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/irq.c 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/irq.c 2005-04-05 16:47:53.919208752 +0800 +@@ -933,7 +933,11 @@ + + static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; + ++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) ++cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; ++#else + static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; ++#endif + + static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; + +Index: linux-2.6.10/arch/ia64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/ia64/kernel/smp.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/arch/ia64/kernel/smp.c 2005-04-05 16:47:53.920208600 +0800 +@@ -31,6 +31,10 @@ + #include + #include + ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++#include ++#endif ++ + #include + #include + #include +@@ -67,6 +71,11 @@ + #define IPI_CALL_FUNC 0 + #define IPI_CPU_STOP 1 + ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++#define IPI_DUMP_INTERRUPT 4 ++ int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL; ++#endif ++ + /* This needs to be cacheline aligned because it is written to by *other* CPUs. 
*/ + static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; + +@@ -84,7 +93,9 @@ + spin_unlock_irq(&call_lock); + } + +-static void ++ ++/*changed static void stop_this_cpu -> void stop_this_cpu */ ++void + stop_this_cpu (void) + { + /* +@@ -155,6 +166,15 @@ + case IPI_CPU_STOP: + stop_this_cpu(); + break; ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++ case IPI_DUMP_INTERRUPT: ++ if( dump_ipi_function_ptr != NULL ) { ++ if (!dump_ipi_function_ptr(regs)) { ++ printk(KERN_ERR "(*dump_ipi_function_ptr)(): rejected IPI_DUMP_INTERRUPT\n"); ++ } ++ } ++ break; ++#endif + + default: + printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); +@@ -369,9 +389,17 @@ + { + send_IPI_allbutself(IPI_CPU_STOP); + } ++EXPORT_SYMBOL(smp_send_stop); + + int __init + setup_profiling_timer (unsigned int multiplier) + { + return -EINVAL; + } ++ ++#if defined(CONFIG_CRASH_DUMP) || defined(CONFIG_CRASH_DUMP_MODULE) ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(IPI_DUMP_INTERRUPT); ++} ++#endif +Index: linux-2.6.10/arch/ppc64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/Kconfig.debug 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/Kconfig.debug 2005-04-05 16:47:53.922208296 +0800 +@@ -2,6 +2,64 @@ + + source "lib/Kconfig.debug" + ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP ++ tristate "Crash dump support" ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. ++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. Need 'kexec' support for this to work. ++ **** Not supported at present **** ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ Need 'kexec' support to use this option. ++ **** Not supported at present **** ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. 
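ia64 has no NMI shortcut comparable to the x86 ICR trick, so the hunks above add a dedicated IPI opcode (IPI_DUMP_INTERRUPT) and dispatch it through the registered dump_ipi_function_ptr, logging when the handler rejects the interrupt. A compact userspace model of that dispatch, with a toy handler standing in for the dump module's real one:

    #include <stdio.h>

    struct pt_regs;

    #define IPI_CALL_FUNC      0
    #define IPI_CPU_STOP       1
    #define IPI_DUMP_INTERRUPT 4

    static int (*dump_ipi_function_ptr)(struct pt_regs *);

    static void handle_ipi(int which, struct pt_regs *regs)
    {
            switch (which) {
            case IPI_DUMP_INTERRUPT:
                    /* only meaningful once a dump module has registered */
                    if (dump_ipi_function_ptr &&
                        !dump_ipi_function_ptr(regs))
                            fprintf(stderr, "dump handler rejected IPI\n");
                    break;
            default:
                    fprintf(stderr, "unknown IPI %d\n", which);
            }
    }

    static int toy_handler(struct pt_regs *regs)
    {
            (void)regs;
            return 1;                       /* accepted */
    }

    int main(void)
    {
            handle_ipi(IPI_DUMP_INTERRUPT, NULL);  /* no handler: ignored */
            dump_ipi_function_ptr = toy_handler;
            handle_ipi(IPI_DUMP_INTERRUPT, NULL);  /* accepted */
            return 0;
    }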
++ + config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL +Index: linux-2.6.10/arch/ppc64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/traps.c 2004-12-25 05:34:47.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/traps.c 2005-04-05 16:47:53.923208144 +0800 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -116,6 +117,7 @@ + if (nl) + printk("\n"); + show_regs(regs); ++ dump((char *)str, regs); + bust_spinlocks(0); + spin_unlock_irq(&die_lock); + +Index: linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/ppc_ksyms.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/ppc_ksyms.c 2005-04-05 16:47:53.925207840 +0800 +@@ -159,6 +159,17 @@ + EXPORT_SYMBOL(get_wchan); + EXPORT_SYMBOL(console_drivers); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++extern int dump_page_is_ram(unsigned long); ++EXPORT_SYMBOL(dump_page_is_ram); ++#ifdef CONFIG_SMP ++EXPORT_SYMBOL(irq_affinity); ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(stop_this_cpu); ++EXPORT_SYMBOL(dump_send_ipi); ++#endif ++#endif ++ + EXPORT_SYMBOL(tb_ticks_per_usec); + EXPORT_SYMBOL(paca); + EXPORT_SYMBOL(cur_cpu_spec); +Index: linux-2.6.10/arch/ppc64/kernel/lmb.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/lmb.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/lmb.c 2005-04-05 16:47:53.924207992 +0800 +@@ -344,3 +344,31 @@ + + return pa; + } ++ ++ ++/* ++ * This is a copy of page_is_ram() (mm/init.c); the difference is that ++ * it identifies all memory holes.
++ */ ++int dump_page_is_ram(unsigned long pfn) ++{ ++ int i; ++ unsigned long paddr = (pfn << PAGE_SHIFT); ++ ++ for (i=0; i < lmb.memory.cnt ;i++) { ++ unsigned long base; ++ ++#ifdef CONFIG_MSCHUNKS ++ base = lmb.memory.region[i].physbase; ++#else ++ base = lmb.memory.region[i].base; ++#endif ++ if ((paddr >= base) && ++ (paddr < (base + lmb.memory.region[i].size))) { ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ +Index: linux-2.6.10/arch/ppc64/kernel/xics.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/xics.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/xics.c 2005-04-05 16:47:53.925207840 +0800 +@@ -421,7 +421,8 @@ + smp_message_recv(PPC_MSG_MIGRATE_TASK, regs); + } + #endif +-#ifdef CONFIG_DEBUGGER ++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \ ++ || defined(CONFIG_CRASH_DUMP_MODULE) + if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, + &xics_ipi_message[cpu].value)) { + mb(); +Index: linux-2.6.10/arch/ppc64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/ppc64/kernel/smp.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/ppc64/kernel/smp.c 2005-04-05 16:47:53.926207688 +0800 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -71,6 +72,7 @@ + struct smp_ops_t *smp_ops; + + static volatile unsigned int cpu_callin_map[NR_CPUS]; ++static int (*dump_ipi_function_ptr)(struct pt_regs *) = NULL; + + extern unsigned char stab_array[]; + +@@ -177,9 +179,16 @@ + /* spare */ + break; + #endif +-#ifdef CONFIG_DEBUGGER ++#if defined(CONFIG_DEBUGGER) || defined(CONFIG_CRASH_DUMP) \ ++ || defined(CONFIG_CRASH_DUMP_MODULE) + case PPC_MSG_DEBUGGER_BREAK: +- debugger_ipi(regs); ++ if (dump_ipi_function_ptr) { ++ dump_ipi_function_ptr(regs); ++ } ++#ifdef CONFIG_DEBUGGER ++ else ++ debugger_ipi(regs); ++#endif + break; + #endif + default: +@@ -201,7 +210,16 @@ + } + #endif + +-static void stop_this_cpu(void *dummy) ++void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)) ++{ ++ dump_ipi_function_ptr = dump_ipi_callback; ++ if (dump_ipi_callback) { ++ mb(); ++ smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK); ++ } ++} ++ ++void stop_this_cpu(void *dummy) + { + local_irq_disable(); + while (1) +Index: linux-2.6.10/arch/x86_64/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/Kconfig.debug 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/Kconfig.debug 2005-04-05 16:47:53.909210272 +0800 +@@ -2,6 +2,66 @@ + + source "lib/Kconfig.debug" + ++config CRASH_DUMP ++ tristate "Crash dump support (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ Say Y here to enable saving an image of system memory when a panic ++ or other error occurs. Dumps can also be forced with the SysRq+d ++ key if MAGIC_SYSRQ is enabled. ++ ++config KERNTYPES ++ bool ++ depends on CRASH_DUMP ++ default y ++ ++config CRASH_DUMP_BLOCKDEV ++ tristate "Crash dump block device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps directly to a disk device. ++ ++config CRASH_DUMP_NETDEV ++ tristate "Crash dump network device driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving crash dumps over a network device. 
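dump_page_is_ram() above duplicates page_is_ram() because the dumper must also recognize the holes between lmb memory banks on ppc64; the test is a simple interval check over the bank table. A self-contained sketch of that check, with a made-up two-bank layout:

    #include <stdint.h>
    #include <stdio.h>

    struct mem_bank { uint64_t base, size; };

    /* illustrative layout: two 1G banks with a hole between them */
    static const struct mem_bank banks[] = {
            { 0x00000000ULL, 0x40000000ULL },   /* 0..1G  */
            { 0x80000000ULL, 0x40000000ULL },   /* 2G..3G */
    };

    #define PAGE_SHIFT 12

    static int dump_page_is_ram(unsigned long pfn)
    {
            uint64_t paddr = (uint64_t)pfn << PAGE_SHIFT;
            unsigned int i;

            for (i = 0; i < sizeof(banks) / sizeof(banks[0]); i++)
                    if (paddr >= banks[i].base &&
                        paddr <  banks[i].base + banks[i].size)
                            return 1;
            return 0;   /* a hole: the dumper must not touch it */
    }

    int main(void)
    {
            printf("%d %d\n", dump_page_is_ram(0x100),    /* in bank 0: 1 */
                              dump_page_is_ram(0x50000)); /* in the hole: 0 */
            return 0;
    }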
++ ++config CRASH_DUMP_MEMDEV ++ bool "Crash dump staged memory driver" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow intermediate saving crash dumps in spare ++ memory pages which would then be written out to disk ++ later. ++ ++config CRASH_DUMP_SOFTBOOT ++ bool "Save crash dump across a soft reboot" ++ depends on CRASH_DUMP_MEMDEV ++ help ++ Say Y to allow a crash dump to be preserved in memory ++ pages across a soft reboot and written out to disk ++ thereafter. For this to work, CRASH_DUMP must be ++ configured as part of the kernel (not as a module). ++ ++config CRASH_DUMP_COMPRESS_RLE ++ tristate "Crash dump RLE compression" ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Run Length Encoding compression. ++ ++ ++config CRASH_DUMP_COMPRESS_GZIP ++ tristate "Crash dump GZIP compression" ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ depends on CRASH_DUMP ++ help ++ Say Y to allow saving dumps with Gnu Zip compression. ++ ++ ++ + # !SMP for now because the context switch early causes GPF in segment reloading + # and the GS base checking does the wrong thing then, causing a hang. + config CHECKING +Index: linux-2.6.10/arch/x86_64/mm/init.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/mm/init.c 2005-04-05 16:29:30.040023896 +0800 ++++ linux-2.6.10/arch/x86_64/mm/init.c 2005-04-05 16:47:53.916209208 +0800 +@@ -378,7 +378,7 @@ + __flush_tlb_all(); + } + +-static inline int page_is_ram (unsigned long pagenr) ++inline int page_is_ram (unsigned long pagenr) + { + int i; + +Index: linux-2.6.10/arch/x86_64/kernel/traps.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/traps.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/traps.c 2005-04-05 16:47:53.915209360 +0800 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -369,6 +370,7 @@ + printk("\n"); + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); ++ dump((char *)str, regs); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->rip); +Index: linux-2.6.10/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/setup.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/setup.c 2005-04-05 16:47:53.911209968 +0800 +@@ -221,6 +221,8 @@ + } + } + ++unsigned long crashdump_addr = 0xdeadbeef; ++ + static __init void parse_cmdline_early (char ** cmdline_p) + { + char c = ' ', *to = command_line, *from = COMMAND_LINE; +@@ -311,6 +313,9 @@ + + if (!memcmp(from,"oops=panic", 10)) + panic_on_oops = 1; ++ ++ if (c == ' ' && !memcmp(from, "crashdump=", 10)) ++ crashdump_addr = memparse(from+10, &from); + + next_char: + c = *(from++); +@@ -441,6 +446,10 @@ + reserve_bootmem_generic(addr, PAGE_SIZE); + } + ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void crashdump_reserve(void); ++#endif ++ + void __init setup_arch(char **cmdline_p) + { + unsigned long low_mem_size; +@@ -550,6 +559,9 @@ + } + #endif + paging_init(); ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++ crashdump_reserve(); /* Preserve crash dump state from prev boot */ ++#endif + + check_ioapic(); + #ifdef CONFIG_ACPI_BOOT +Index: linux-2.6.10/arch/x86_64/kernel/smp.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/smp.c 2004-12-25
05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/smp.c 2005-04-05 16:47:53.915209360 +0800 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -151,6 +152,13 @@ + if (!mm) + BUG(); + ++ if (vector == DUMP_VECTOR) { ++ /* ++ * Setup DUMP IPI to be delivered as an NMI ++ */ ++ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; ++ } ++ + /* + * I'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. +@@ -253,6 +261,13 @@ + send_IPI_allbutself(KDB_VECTOR); + } + ++ ++/* void dump_send_ipi(int (*dump_ipi_handler)(struct pt_regs *)); */ ++void dump_send_ipi(void) ++{ ++ send_IPI_allbutself(DUMP_VECTOR); ++} ++ + /* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing +@@ -340,6 +355,18 @@ + return 0; + } + ++void stop_this_cpu(void* dummy) ++{ ++ /* ++ * Remove this CPU: ++ */ ++ cpu_clear(smp_processor_id(), cpu_online_map); ++ local_irq_disable(); ++ disable_local_APIC(); ++ for (;;) ++ asm("hlt"); ++} ++ + void smp_stop_cpu(void) + { + /* +Index: linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/x8664_ksyms.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c 2005-04-05 16:47:53.914209512 +0800 +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + + extern spinlock_t rtc_lock; +@@ -216,6 +217,20 @@ + extern unsigned long __supported_pte_mask; + EXPORT_SYMBOL(__supported_pte_mask); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++#ifdef CONFIG_SMP ++extern irq_desc_t irq_desc[NR_IRQS]; ++extern cpumask_t irq_affinity[NR_IRQS]; ++extern void stop_this_cpu(void *); ++EXPORT_SYMBOL(irq_desc); ++EXPORT_SYMBOL(irq_affinity); ++EXPORT_SYMBOL(dump_send_ipi); ++EXPORT_SYMBOL(stop_this_cpu); ++#endif ++extern int page_is_ram(unsigned long); ++EXPORT_SYMBOL(page_is_ram); ++#endif ++ + #ifdef CONFIG_SMP + EXPORT_SYMBOL(flush_tlb_page); + EXPORT_SYMBOL_GPL(flush_tlb_all); +Index: linux-2.6.10/arch/x86_64/kernel/pci-gart.c +=================================================================== +--- linux-2.6.10.orig/arch/x86_64/kernel/pci-gart.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/x86_64/kernel/pci-gart.c 2005-04-05 16:47:53.913209664 +0800 +@@ -34,7 +34,7 @@ + dma_addr_t bad_dma_address; + + unsigned long iommu_bus_base; /* GART remapping area (physical) */ +-static unsigned long iommu_size; /* size of remapping area bytes */ ++unsigned long iommu_size; /* size of remapping area bytes */ + static unsigned long iommu_pages; /* .. 
and in pages */ + + u32 *iommu_gatt_base; /* Remapping table */ +Index: linux-2.6.10/init/version.c +=================================================================== +--- linux-2.6.10.orig/init/version.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/init/version.c 2005-04-05 16:47:53.896212248 +0800 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #define version(a) Version_ ## a + #define version_string(a) version(a) +@@ -31,3 +32,6 @@ + const char *linux_banner = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; ++ ++const char *LINUX_COMPILE_VERSION_ID = __stringify(LINUX_COMPILE_VERSION_ID); ++LINUX_COMPILE_VERSION_ID_TYPE; +Index: linux-2.6.10/init/kerntypes.c +=================================================================== +--- linux-2.6.10.orig/init/kerntypes.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/init/kerntypes.c 2005-04-05 16:47:53.895212400 +0800 +@@ -0,0 +1,40 @@ ++/* ++ * kerntypes.c ++ * ++ * Copyright (C) 2000 Tom Morano (tjm@sgi.com) and ++ * Matt D. Robinson (yakker@alacritech.com) ++ * ++ * Dummy module that includes headers for all kernel types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef LINUX_COMPILE_VERSION_ID_TYPE ++/* Define version type for version validation of dump and kerntypes */ ++LINUX_COMPILE_VERSION_ID_TYPE; ++#endif ++#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) ++extern struct runqueue runqueues; ++struct runqueue rn; ++#endif ++ ++struct new_utsname *p; ++void ++kerntypes_dummy(void) ++{ ++} +Index: linux-2.6.10/init/main.c +=================================================================== +--- linux-2.6.10.orig/init/main.c 2005-04-05 16:29:30.028025720 +0800 ++++ linux-2.6.10/init/main.c 2005-04-05 16:47:53.897212096 +0800 +@@ -109,6 +109,16 @@ + EXPORT_SYMBOL(system_state); + + /* ++ * The kernel_magic value represents the address of _end, which allows ++ * namelist tools to "match" each other respectively. That way a tool ++ * that looks at /dev/mem can verify that it is using the right System.map ++ * file -- if kernel_magic doesn't equal the namelist value of _end, ++ * something's wrong. ++ */ ++extern unsigned long _end; ++unsigned long *kernel_magic = &_end; ++ ++/* + * Boot command-line arguments + */ + #define MAX_INIT_ARGS 32 +Index: linux-2.6.10/init/Makefile +=================================================================== +--- linux-2.6.10.orig/init/Makefile 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/init/Makefile 2005-04-05 16:47:53.897212096 +0800 +@@ -9,12 +9,20 @@ + mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o + mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o + ++extra-$(CONFIG_KERNTYPES) += kerntypes.o ++#For IA64, compile kerntypes in dwarf-2 format. 
++ifeq ($(CONFIG_IA64),y) ++CFLAGS_kerntypes.o := -gdwarf-2 ++else ++CFLAGS_kerntypes.o := -gstabs ++endif ++ + # files to be removed upon make clean + clean-files := ../include/linux/compile.h + + # dependencies on generated files need to be listed explicitly + +-$(obj)/version.o: include/linux/compile.h ++$(obj)/version.o $(obj)/kerntypes.o: include/linux/compile.h + + # compile.h changes depending on hostname, generation number, etc, + # so we regenerate it always. +@@ -24,3 +32,4 @@ + include/linux/compile.h: FORCE + @echo ' CHK $@' + @$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CC) $(CFLAGS)" ++ +Index: linux-2.6.10/include/asm-um/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-um/kerntypes.h 2005-04-05 16:47:53.864217112 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-um/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Usermode-Linux-specific header files */ ++#ifndef _UM_KERNTYPES_H ++#define _UM_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _UM_KERNTYPES_H */ +Index: linux-2.6.10/include/linux/sysctl.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sysctl.h 2005-04-05 16:29:27.969338688 +0800 ++++ linux-2.6.10/include/linux/sysctl.h 2005-04-05 16:47:53.894212552 +0800 +@@ -135,6 +135,7 @@ + KERN_HZ_TIMER=65, /* int: hz timer on or off */ + KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SETUID_DUMPABLE=67, /* int: behaviour of dumps for setuid core */ ++ KERN_DUMP=68, /* directory: dump parameters */ + }; + + +Index: linux-2.6.10/include/linux/sched.h +=================================================================== +--- linux-2.6.10.orig/include/linux/sched.h 2005-04-05 16:47:05.178618448 +0800 ++++ linux-2.6.10/include/linux/sched.h 2005-04-05 16:47:53.891213008 +0800 +@@ -94,6 +94,7 @@ + extern int nr_threads; + extern int last_pid; + DECLARE_PER_CPU(unsigned long, process_counts); ++DECLARE_PER_CPU(struct runqueue, runqueues); + extern int nr_processes(void); + extern unsigned long nr_running(void); + extern unsigned long nr_uninterruptible(void); +@@ -760,6 +761,110 @@ + void yield(void); + + /* ++ * These are the runqueue data structures: ++ */ ++ ++#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) ++ ++typedef struct runqueue runqueue_t; ++ ++struct prio_array { ++ unsigned int nr_active; ++ unsigned long bitmap[BITMAP_SIZE]; ++ struct list_head queue[MAX_PRIO]; ++}; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * ++ * Locking rule: those places that want to lock multiple runqueues ++ * (such as the load balancing or the thread migration code), lock ++ * acquire operations must be ordered by ascending &runqueue. ++ */ ++struct runqueue { ++ spinlock_t lock; ++ ++ /* ++ * nr_running and cpu_load should be in the same cacheline because ++ * remote CPUs use both these fields when doing load calculation. 
++ */ ++ unsigned long nr_running; ++#ifdef CONFIG_SMP ++ unsigned long cpu_load; ++#endif ++ unsigned long long nr_switches; ++ ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ ++ unsigned long expired_timestamp; ++ unsigned long long timestamp_last_tick; ++ task_t *curr, *idle; ++ struct mm_struct *prev_mm; ++ prio_array_t *active, *expired, arrays[2]; ++ int best_expired_prio; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_SMP ++ struct sched_domain *sd; ++ ++ /* For active balancing */ ++ int active_balance; ++ int push_cpu; ++ ++ task_t *migration_thread; ++ struct list_head migration_queue; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ ++ /* sys_sched_yield() stats */ ++ unsigned long yld_exp_empty; ++ unsigned long yld_act_empty; ++ unsigned long yld_both_empty; ++ unsigned long yld_cnt; ++ ++ /* schedule() stats */ ++ unsigned long sched_noswitch; ++ unsigned long sched_switch; ++ unsigned long sched_cnt; ++ unsigned long sched_goidle; ++ ++ /* pull_task() stats */ ++ unsigned long pt_gained[MAX_IDLE_TYPES]; ++ unsigned long pt_lost[MAX_IDLE_TYPES]; ++ ++ /* active_load_balance() stats */ ++ unsigned long alb_cnt; ++ unsigned long alb_lost; ++ unsigned long alb_gained; ++ unsigned long alb_failed; ++ ++ /* try_to_wake_up() stats */ ++ unsigned long ttwu_cnt; ++ unsigned long ttwu_attempts; ++ unsigned long ttwu_moved; ++ ++ /* wake_up_new_task() stats */ ++ unsigned long wunt_cnt; ++ unsigned long wunt_moved; ++ ++ /* sched_migrate_task() stats */ ++ unsigned long smt_cnt; ++ ++ /* sched_balance_exec() stats */ ++ unsigned long sbe_cnt; ++#endif ++}; ++ ++/* + * The default (Linux) execution domain. + */ + extern struct exec_domain default_exec_domain; +Index: linux-2.6.10/include/linux/miscdevice.h +=================================================================== +--- linux-2.6.10.orig/include/linux/miscdevice.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/include/linux/miscdevice.h 2005-04-05 16:47:53.893212704 +0800 +@@ -25,6 +25,7 @@ + #define MICROCODE_MINOR 184 + #define MWAVE_MINOR 219 /* ACP/Mwave Modem */ + #define MPT_MINOR 220 ++#define CRASH_DUMP_MINOR 230 /* LKCD */ + #define MISC_DYNAMIC_MINOR 255 + + #define TUN_MINOR 200 +Index: linux-2.6.10/include/linux/dump.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dump.h 2005-04-05 16:47:53.893212704 +0800 +@@ -0,0 +1,406 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * ++ * vmdump.h to dump.h by: Matt D. Robinson (yakker@sourceforge.net) ++ * Copyright 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * Most of this is the same old stuff from vmdump.h, except now we're ++ * actually a stand-alone driver plugged into the block layer interface, ++ * with the exception that we now allow for compression modes externally ++ * loaded (e.g., someone can come up with their own). ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++/* This header file includes all structure definitions for crash dumps. */ ++#ifndef _DUMP_H ++#define _DUMP_H ++ ++#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Predefine default DUMP_PAGE constants, asm header may override. ++ * ++ * On ia64 discontinuous memory systems it's possible for the memory ++ * banks to stop at 2**12 page alignments, the smallest possible page ++ * size. But the system page size, PAGE_SIZE, is in fact larger. ++ */ ++#define DUMP_PAGE_SHIFT PAGE_SHIFT ++#define DUMP_PAGE_MASK PAGE_MASK ++#define DUMP_PAGE_ALIGN(addr) PAGE_ALIGN(addr) ++ ++/* ++ * Dump offset changed from 4Kb to 64Kb to support multiple PAGE_SIZE ++ * (kernel page size). Assumption goes that 64K is the highest page size ++ * supported ++ */ ++ ++#define DUMP_HEADER_OFFSET (1ULL << 16) ++ ++#define OLDMINORBITS 8 ++#define OLDMINORMASK ((1U << OLDMINORBITS) -1) ++ ++/* Making DUMP_PAGE_SIZE = PAGE_SIZE, to support dumping on architectures ++ * which support page sizes (PAGE_SIZE) greater than 4KB. ++ * Will it affect ia64 discontinuous memory systems ???? ++ */ ++#define DUMP_PAGE_SIZE PAGE_SIZE ++ ++/* thread_info lies at the bottom of stack, (Except IA64). */ ++#define STACK_START_POSITION(tsk) (tsk->thread_info) ++/* ++ * Predefined default memcpy() to use when copying memory to the dump buffer. ++ * ++ * On ia64 there is a heads up function that can be called to let the prom ++ * machine check monitor know that the current activity is risky and it should ++ * ignore the fault (nofault). In this case the ia64 header will redefine this ++ * macro to __dump_memcpy() and use it's arch specific version. ++ */ ++#define DUMP_memcpy memcpy ++#define bzero(a,b) memset(a, 0, b) ++ ++/* necessary header files */ ++#include /* for architecture-specific header */ ++ ++/* ++ * Size of the buffer that's used to hold: ++ * ++ * 1. the dump header (padded to fill the complete buffer) ++ * 2. 
the possibly compressed page headers and data ++ * ++ * = 256k for page size >= 64k ++ * = 64k for page size < 64k ++ */ ++#if (PAGE_SHIFT >= 16) ++#define DUMP_BUFFER_SIZE (256 * 1024) /* size of dump buffer */ ++#else ++#define DUMP_BUFFER_SIZE (64 * 1024) /* size of dump buffer */ ++#endif ++ ++#define DUMP_HEADER_SIZE DUMP_BUFFER_SIZE ++ ++/* standard header definitions */ ++#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ ++#define DUMP_MAGIC_LIVE 0xa8190173618f23cdULL /* live magic number */ ++#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ ++#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ ++ ++/* dump levels - type specific stuff added later -- add as necessary */ ++#define DUMP_LEVEL_NONE 0x0 /* no dumping at all -- just bail */ ++#define DUMP_LEVEL_HEADER 0x1 /* kernel dump header only */ ++#define DUMP_LEVEL_KERN 0x2 /* dump header and kernel pages */ ++#define DUMP_LEVEL_USED 0x4 /* dump header, kernel/user pages */ ++#define DUMP_LEVEL_ALL_RAM 0x8 /* dump header, all RAM pages */ ++#define DUMP_LEVEL_ALL 0x10 /* dump all memory RAM and firmware */ ++ ++ ++/* dump compression options -- add as necessary */ ++#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ ++#define DUMP_COMPRESS_RLE 0x1 /* use RLE compression */ ++#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ ++ ++/* dump flags - any dump-type specific flags -- add as necessary */ ++#define DUMP_FLAGS_NONE 0x0 /* no flags are set for this dump */ ++#define DUMP_FLAGS_SOFTBOOT 0x2 /* 2 stage soft-boot based dump */ ++#define DUMP_FLAGS_NONDISRUPT 0X1 /* non-disruptive dumping */ ++ ++#define DUMP_FLAGS_TARGETMASK 0xf0000000 /* handle special case targets */ ++#define DUMP_FLAGS_DISKDUMP 0x80000000 /* dump to local disk */ ++#define DUMP_FLAGS_NETDUMP 0x40000000 /* dump over the network */ ++ ++/* dump header flags -- add as necessary */ ++#define DUMP_DH_FLAGS_NONE 0x0 /* no flags set (error condition!) 
*/ ++#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ ++#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ ++#define DUMP_DH_END 0x4 /* end marker on a full dump */ ++#define DUMP_DH_TRUNCATED 0x8 /* dump is incomplete */ ++#define DUMP_DH_TEST_PATTERN 0x10 /* dump page is a test pattern */ ++#define DUMP_DH_NOT_USED 0x20 /* 1st bit not used in flags */ ++ ++/* names for various dump parameters in /proc/kernel */ ++#define DUMP_ROOT_NAME "sys/dump" ++#define DUMP_DEVICE_NAME "device" ++#define DUMP_COMPRESS_NAME "compress" ++#define DUMP_LEVEL_NAME "level" ++#define DUMP_FLAGS_NAME "flags" ++#define DUMP_ADDR_NAME "addr" ++ ++#define DUMP_SYSRQ_KEY 'd' /* key to use for MAGIC_SYSRQ key */ ++ ++/* CTL_DUMP names: */ ++enum ++{ ++ CTL_DUMP_DEVICE=1, ++ CTL_DUMP_COMPRESS=3, ++ CTL_DUMP_LEVEL=3, ++ CTL_DUMP_FLAGS=4, ++ CTL_DUMP_ADDR=5, ++ CTL_DUMP_TEST=6, ++}; ++ ++ ++/* page size for gzip compression -- buffered slightly beyond hardware PAGE_SIZE used by DUMP */ ++#define DUMP_DPC_PAGE_SIZE (DUMP_PAGE_SIZE + 512) ++ ++/* dump ioctl() control options */ ++#define DIOSDUMPDEV _IOW('p', 0xA0, unsigned int) /* set the dump device */ ++#define DIOGDUMPDEV _IOR('p', 0xA1, unsigned int) /* get the dump device */ ++#define DIOSDUMPLEVEL _IOW('p', 0xA2, unsigned int) /* set the dump level */ ++#define DIOGDUMPLEVEL _IOR('p', 0xA3, unsigned int) /* get the dump level */ ++#define DIOSDUMPFLAGS _IOW('p', 0xA4, unsigned int) /* set the dump flag parameters */ ++#define DIOGDUMPFLAGS _IOR('p', 0xA5, unsigned int) /* get the dump flag parameters */ ++#define DIOSDUMPCOMPRESS _IOW('p', 0xA6, unsigned int) /* set the dump compress level */ ++#define DIOGDUMPCOMPRESS _IOR('p', 0xA7, unsigned int) /* get the dump compress level */ ++ ++/* these ioctls are used only by netdump module */ ++#define DIOSTARGETIP _IOW('p', 0xA8, unsigned int) /* set the target m/c's ip */ ++#define DIOGTARGETIP _IOR('p', 0xA9, unsigned int) /* get the target m/c's ip */ ++#define DIOSTARGETPORT _IOW('p', 0xAA, unsigned int) /* set the target m/c's port */ ++#define DIOGTARGETPORT _IOR('p', 0xAB, unsigned int) /* get the target m/c's port */ ++#define DIOSSOURCEPORT _IOW('p', 0xAC, unsigned int) /* set the source m/c's port */ ++#define DIOGSOURCEPORT _IOR('p', 0xAD, unsigned int) /* get the source m/c's port */ ++#define DIOSETHADDR _IOW('p', 0xAE, unsigned int) /* set ethernet address */ ++#define DIOGETHADDR _IOR('p', 0xAF, unsigned int) /* get ethernet address */ ++#define DIOGDUMPOKAY _IOR('p', 0xB0, unsigned int) /* check if dump is configured */ ++#define DIOSDUMPTAKE _IOW('p', 0xB1, unsigned int) /* Take a manual dump */ ++ ++/* ++ * Structure: __dump_header ++ * Function: This is the header dumped at the top of every valid crash ++ * dump. ++ */ ++struct __dump_header { ++ /* the dump magic number -- unique to verify dump is valid */ ++ u64 dh_magic_number; ++ ++ /* the version number of this dump */ ++ u32 dh_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ u32 dh_header_size; ++ ++ /* the level of this dump (just a header?) */ ++ u32 dh_dump_level; ++ ++ /* ++ * We assume dump_page_size to be 4K in every case. ++ * Store here the configurable system page size (4K, 8K, 16K, etc.) 
++ */ ++ u32 dh_page_size; ++ ++ /* the size of all physical memory */ ++ u64 dh_memory_size; ++ ++ /* the start of physical memory */ ++ u64 dh_memory_start; ++ ++ /* the end of physical memory */ ++ u64 dh_memory_end; ++ ++ /* the number of hardware/physical pages in this dump specifically */ ++ u32 dh_num_dump_pages; ++ ++ /* the panic string, if available */ ++ char dh_panic_string[DUMP_PANIC_LEN]; ++ ++ /* timeval depends on architecture, two long values */ ++ struct { ++ u64 tv_sec; ++ u64 tv_usec; ++ } dh_time; /* the time of the system crash */ ++ ++ /* the NEW utsname (uname) information -- in character form */ ++ /* we do this so we don't have to include utsname.h */ ++ /* plus it helps us be more architecture independent */ ++ /* now maybe one day soon they'll make the [65] a #define! */ ++ char dh_utsname_sysname[65]; ++ char dh_utsname_nodename[65]; ++ char dh_utsname_release[65]; ++ char dh_utsname_version[65]; ++ char dh_utsname_machine[65]; ++ char dh_utsname_domainname[65]; ++ ++ /* the address of current task (OLD = void *, NEW = u64) */ ++ u64 dh_current_task; ++ ++ /* what type of compression we're using in this dump (if any) */ ++ u32 dh_dump_compress; ++ ++ /* any additional flags */ ++ u32 dh_dump_flags; ++ ++ /* any additional flags */ ++ u32 dh_dump_device; ++} __attribute__((packed)); ++ ++/* ++ * Structure: __dump_page ++ * Function: To act as the header associated to each physical page of ++ * memory saved in the system crash dump. This allows for ++ * easy reassembly of each crash dump page. The address bits ++ * are split to make things easier for 64-bit/32-bit system ++ * conversions. ++ * ++ * dp_byte_offset and dp_page_index are landmarks that are helpful when ++ * looking at a hex dump of /dev/vmdump, ++ */ ++struct __dump_page { ++ /* the address of this dump page */ ++ u64 dp_address; ++ ++ /* the size of this dump page */ ++ u32 dp_size; ++ ++ /* flags (currently DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ ++ u32 dp_flags; ++} __attribute__((packed)); ++ ++/* ++ * Structure: __lkcdinfo ++ * Function: This structure contains information needed for the lkcdutils ++ * package (particularly lcrash) to determine what information is ++ * associated to this kernel, specifically. ++ */ ++struct __lkcdinfo { ++ int arch; ++ int ptrsz; ++ int byte_order; ++ int linux_release; ++ int page_shift; ++ int page_size; ++ u64 page_mask; ++ u64 page_offset; ++ int stack_offset; ++}; ++ ++#ifdef __KERNEL__ ++ ++/* ++ * Structure: __dump_compress ++ * Function: This is what an individual compression mechanism can use ++ * to plug in their own compression techniques. It's always ++ * best to build these as individual modules so that people ++ * can put in whatever they want. ++ */ ++struct __dump_compress { ++ /* the list_head structure for list storage */ ++ struct list_head list; ++ ++ /* the type of compression to use (DUMP_COMPRESS_XXX) */ ++ int compress_type; ++ const char *compress_name; ++ ++ /* the compression function to call */ ++ u32 (*compress_func)(const u8 *, u32, u8 *, u32, unsigned long); ++}; ++ ++/* functions for dump compression registration */ ++extern void dump_register_compression(struct __dump_compress *); ++extern void dump_unregister_compression(int); ++ ++/* ++ * Structure dump_mbank[]: ++ * ++ * For CONFIG_DISCONTIGMEM systems this array specifies the ++ * memory banks/chunks that need to be dumped after a panic. ++ * ++ * For classic systems it specifies a single set of pages from ++ * 0 to max_mapnr. 
++ */ ++struct __dump_mbank { ++ u64 start; ++ u64 end; ++ int type; ++ int pad1; ++ long pad2; ++}; ++ ++#define DUMP_MBANK_TYPE_CONVENTIONAL_MEMORY 1 ++#define DUMP_MBANK_TYPE_OTHER 2 ++ ++#define MAXCHUNKS 256 ++extern int dump_mbanks; ++extern struct __dump_mbank dump_mbank[MAXCHUNKS]; ++ ++/* notification event codes */ ++#define DUMP_BEGIN 0x0001 /* dump beginning */ ++#define DUMP_END 0x0002 /* dump ending */ ++ ++/* Scheduler soft spin control. ++ * ++ * 0 - no dump in progress ++ * 1 - cpu0 is dumping, ... ++ */ ++extern unsigned long dump_oncpu; ++extern void dump_execute(const char *, const struct pt_regs *); ++ ++/* ++ * Notifier list for kernel code which wants to be called ++ * at kernel dump. ++ */ ++extern struct notifier_block *dump_notifier_list; ++static inline int register_dump_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_register(&dump_notifier_list, nb); ++} ++static inline int unregister_dump_notifier(struct notifier_block * nb) ++{ ++ return notifier_chain_unregister(&dump_notifier_list, nb); ++} ++ ++extern void (*dump_function_ptr)(const char *, const struct pt_regs *); ++static inline void dump(char * str, struct pt_regs * regs) ++{ ++ if (dump_function_ptr) ++ dump_function_ptr(str, regs); ++} ++ ++/* ++ * Common Arch Specific Functions should be declared here. ++ * This allows the C compiler to detect discrepancies. ++ */ ++extern void __dump_open(void); ++extern void __dump_cleanup(void); ++extern void __dump_clean_irq_state(void); ++extern void __dump_init(u64); ++extern void __dump_save_regs(struct pt_regs *, const struct pt_regs *); ++extern void __dump_save_context(int cpu, const struct pt_regs *, struct task_struct *tsk); ++extern int __dump_configure_header(const struct pt_regs *); ++extern int __dump_irq_enable(void); ++extern void __dump_irq_restore(void); ++extern int __dump_page_valid(unsigned long index); ++#ifdef CONFIG_SMP ++extern void __dump_save_other_cpus(void); ++#else ++#define __dump_save_other_cpus() ++#endif ++ ++extern int manual_handle_crashdump(void); ++ ++/* to track all used (compound + zero order) pages */ ++#define PageInuse(p) (PageCompound(p) || page_count(p)) ++ ++#endif /* __KERNEL__ */ ++ ++#else /* !CONFIG_CRASH_DUMP */ ++ ++/* If not configured then make code disappear! */ ++#define register_dump_watchdog(x) do { } while(0) ++#define unregister_dump_watchdog(x) do { } while(0) ++#define register_dump_notifier(x) do { } while(0) ++#define unregister_dump_notifier(x) do { } while(0) ++#define dump_in_progress() 0 ++#define dump(x, y) do { } while(0) ++ ++#endif /* !CONFIG_CRASH_DUMP */ ++ ++#endif /* _DUMP_H */ +Index: linux-2.6.10/include/linux/dumpdev.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dumpdev.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dumpdev.h 2005-04-05 16:47:53.890213160 +0800 +@@ -0,0 +1,163 @@ ++/* ++ * Generic dump device interfaces for flexible system dump ++ * (Enables variation of dump target types e.g disk, network, memory) ++ * ++ * These interfaces have evolved based on discussions on lkcd-devel. ++ * Eventually the intent is to support primary and secondary or ++ * alternate targets registered at the same time, with scope for ++ * situation based failover or multiple dump devices used for parallel ++ * dump i/o. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) ++ * ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. 
++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#ifndef _LINUX_DUMPDEV_H ++#define _LINUX_DUMPDEV_H ++ ++#include ++#include ++#include ++#include ++ ++/* Determined by the dump target (device) type */ ++ ++struct dump_dev; ++ ++struct dump_dev_ops { ++ int (*open)(struct dump_dev *, unsigned long); /* configure */ ++ int (*release)(struct dump_dev *); /* unconfigure */ ++ int (*silence)(struct dump_dev *); /* when dump starts */ ++ int (*resume)(struct dump_dev *); /* when dump is over */ ++ int (*seek)(struct dump_dev *, loff_t); ++ /* trigger a write (async in nature typically) */ ++ int (*write)(struct dump_dev *, void *, unsigned long); ++ /* not usually used during dump, but option available */ ++ int (*read)(struct dump_dev *, void *, unsigned long); ++ /* use to poll for completion */ ++ int (*ready)(struct dump_dev *, void *); ++ int (*ioctl)(struct dump_dev *, unsigned int, unsigned long); ++}; ++ ++struct dump_dev { ++ char type_name[32]; /* block, net-poll etc */ ++ unsigned long device_id; /* interpreted differently for various types */ ++ struct dump_dev_ops *ops; ++ struct list_head list; ++ loff_t curr_offset; ++ struct netpoll np; ++}; ++ ++/* ++ * dump_dev type variations: ++ */ ++ ++/* block */ ++struct dump_blockdev { ++ struct dump_dev ddev; ++ dev_t dev_id; ++ struct block_device *bdev; ++ struct bio *bio; ++ loff_t start_offset; ++ loff_t limit; ++ int err; ++}; ++ ++static inline struct dump_blockdev *DUMP_BDEV(struct dump_dev *dev) ++{ ++ return container_of(dev, struct dump_blockdev, ddev); ++} ++ ++ ++/* mem - for internal use by soft-boot based dumper */ ++struct dump_memdev { ++ struct dump_dev ddev; ++ unsigned long indirect_map_root; ++ unsigned long nr_free; ++ struct page *curr_page; ++ unsigned long *curr_map; ++ unsigned long curr_map_offset; ++ unsigned long last_offset; ++ unsigned long last_used_offset; ++ unsigned long last_bs_offset; ++}; ++ ++static inline struct dump_memdev *DUMP_MDEV(struct dump_dev *dev) ++{ ++ return container_of(dev, struct dump_memdev, ddev); ++} ++ ++/* Todo/future - meant for raw dedicated interfaces e.g. mini-ide driver */ ++struct dump_rdev { ++ struct dump_dev ddev; ++ char name[32]; ++ int (*reset)(struct dump_rdev *, unsigned int, ++ unsigned long); ++ /* ... to do ... */ ++}; ++ ++/* just to get the size right when saving config across a soft-reboot */ ++struct dump_anydev { ++ union { ++ struct dump_blockdev bddev; ++ /* .. add other types here .. 
*/ ++ }; ++}; ++ ++ ++ ++/* Dump device / target operation wrappers */ ++/* These assume that dump_dev is initiatized to dump_config.dumper->dev */ ++ ++extern struct dump_dev *dump_dev; ++ ++static inline int dump_dev_open(unsigned long arg) ++{ ++ return dump_dev->ops->open(dump_dev, arg); ++} ++ ++static inline int dump_dev_release(void) ++{ ++ return dump_dev->ops->release(dump_dev); ++} ++ ++static inline int dump_dev_silence(void) ++{ ++ return dump_dev->ops->silence(dump_dev); ++} ++ ++static inline int dump_dev_resume(void) ++{ ++ return dump_dev->ops->resume(dump_dev); ++} ++ ++static inline int dump_dev_seek(loff_t offset) ++{ ++ return dump_dev->ops->seek(dump_dev, offset); ++} ++ ++static inline int dump_dev_write(void *buf, unsigned long len) ++{ ++ return dump_dev->ops->write(dump_dev, buf, len); ++} ++ ++static inline int dump_dev_ready(void *buf) ++{ ++ return dump_dev->ops->ready(dump_dev, buf); ++} ++ ++static inline int dump_dev_ioctl(unsigned int cmd, unsigned long arg) ++{ ++ if (!dump_dev || !dump_dev->ops->ioctl) ++ return -EINVAL; ++ return dump_dev->ops->ioctl(dump_dev, cmd, arg); ++} ++ ++extern int dump_register_device(struct dump_dev *); ++extern void dump_unregister_device(struct dump_dev *); ++ ++#endif /* _LINUX_DUMPDEV_H */ +Index: linux-2.6.10/include/linux/dump_netdev.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dump_netdev.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/dump_netdev.h 2005-04-05 16:47:53.889213312 +0800 +@@ -0,0 +1,80 @@ ++/* ++ * linux/drivers/net/netconsole.h ++ * ++ * Copyright (C) 2001 Ingo Molnar ++ * ++ * This file contains the implementation of an IRQ-safe, crash-safe ++ * kernel console implementation that outputs kernel messages to the ++ * network. ++ * ++ * Modification history: ++ * ++ * 2001-09-17 started by Ingo Molnar. ++ */ ++ ++/**************************************************************** ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2, or (at your option) ++ * any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++ * ++ ****************************************************************/ ++ ++#define NETCONSOLE_VERSION 0x03 ++ ++enum netdump_commands { ++ COMM_NONE = 0, ++ COMM_SEND_MEM = 1, ++ COMM_EXIT = 2, ++ COMM_REBOOT = 3, ++ COMM_HELLO = 4, ++ COMM_GET_NR_PAGES = 5, ++ COMM_GET_PAGE_SIZE = 6, ++ COMM_START_NETDUMP_ACK = 7, ++ COMM_GET_REGS = 8, ++ COMM_GET_MAGIC = 9, ++ COMM_START_WRITE_NETDUMP_ACK = 10, ++}; ++ ++typedef struct netdump_req_s { ++ u64 magic; ++ u32 nr; ++ u32 command; ++ u32 from; ++ u32 to; ++} req_t; ++ ++enum netdump_replies { ++ REPLY_NONE = 0, ++ REPLY_ERROR = 1, ++ REPLY_LOG = 2, ++ REPLY_MEM = 3, ++ REPLY_RESERVED = 4, ++ REPLY_HELLO = 5, ++ REPLY_NR_PAGES = 6, ++ REPLY_PAGE_SIZE = 7, ++ REPLY_START_NETDUMP = 8, ++ REPLY_END_NETDUMP = 9, ++ REPLY_REGS = 10, ++ REPLY_MAGIC = 11, ++ REPLY_START_WRITE_NETDUMP = 12, ++}; ++ ++typedef struct netdump_reply_s { ++ u32 nr; ++ u32 code; ++ u32 info; ++} reply_t; ++ ++#define HEADER_LEN (1 + sizeof(reply_t)) ++ ++ +Index: linux-2.6.10/include/asm-parisc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-parisc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-parisc/kerntypes.h 2005-04-05 16:47:53.870216200 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-parisc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PA-RISC-specific header files */ ++#ifndef _PARISC_KERNTYPES_H ++#define _PARISC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PARISC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-h8300/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-h8300/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-h8300/kerntypes.h 2005-04-05 16:47:53.880214680 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-h8300/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* H8300-specific header files */ ++#ifndef _H8300_KERNTYPES_H ++#define _H8300_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _H8300_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc/kerntypes.h 2005-04-05 16:47:53.882214376 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ppc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. 
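
The req_t / reply_t pair above is the entire wire format of the netdump protocol: the client sends a command from netdump_commands, the kernel answers with a code from netdump_replies, and HEADER_LEN reserves one leading byte plus the packed reply for every outgoing frame. A minimal sketch of the reply side, assuming only the definitions above (the real driver additionally manages sequence windows and appends payload after the header):

/* Sketch: answer a COMM_HELLO request (hypothetical helper). */
static void netdump_fill_hello(const req_t *req, reply_t *reply)
{
	reply->nr   = req->nr;			/* echo the sequence number */
	reply->code = REPLY_HELLO;
	reply->info = NETCONSOLE_VERSION;	/* protocol version 0x03 */
}
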
Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PowerPC-specific header files */ ++#ifndef _PPC_KERNTYPES_H ++#define _PPC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PPC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-alpha/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-alpha/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-alpha/kerntypes.h 2005-04-05 16:47:53.876215288 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-alpha/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Alpha-specific header files */ ++#ifndef _ALPHA_KERNTYPES_H ++#define _ALPHA_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ALPHA_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-arm26/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-arm26/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-arm26/kerntypes.h 2005-04-05 16:47:53.865216960 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-arm26/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* ARM26-specific header files */ ++#ifndef _ARM26_KERNTYPES_H ++#define _ARM26_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ARM26_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sh/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sh/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sh/kerntypes.h 2005-04-05 16:47:53.877215136 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sh/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */
++
++/* Super-H-specific header files */
++#ifndef _SH_KERNTYPES_H
++#define _SH_KERNTYPES_H
++
++/* Use the default */
++#include
++
++#endif /* _SH_KERNTYPES_H */
+Index: linux-2.6.10/include/asm-ia64/nmi.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/nmi.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/nmi.h	2005-04-05 16:47:53.883214224 +0800
+@@ -0,0 +1,28 @@
++/*
++ *  linux/include/asm-ia64/nmi.h
++ */
++#ifndef ASM_NMI_H
++#define ASM_NMI_H
++
++#include
++
++struct pt_regs;
++
++typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
++
++/**
++ * set_nmi_callback
++ *
++ * Set a handler for an NMI. Only one handler may be
++ * set. Return 1 if the NMI was handled.
++ */
++void set_nmi_callback(nmi_callback_t callback);
++
++/**
++ * unset_nmi_callback
++ *
++ * Remove the handler previously set.
++ */
++void unset_nmi_callback(void);
++
++#endif /* ASM_NMI_H */
+Index: linux-2.6.10/include/asm-ia64/dump.h
+===================================================================
+--- linux-2.6.10.orig/include/asm-ia64/dump.h	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/include/asm-ia64/dump.h	2005-04-05 16:47:53.884214072 +0800
+@@ -0,0 +1,201 @@
++/*
++ * Kernel header file for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 - 2002 Silicon Graphics, Inc. All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/* This header file holds the architecture specific crash dump header */
++#ifndef _ASM_DUMP_H
++#define _ASM_DUMP_H
++
++/* definitions */
++#define DUMP_ASM_MAGIC_NUMBER	0xdeaddeadULL	/* magic number */
++#define DUMP_ASM_VERSION_NUMBER	0x4	/* version number */
++
++#ifdef __KERNEL__
++#include
++#include
++#include
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern int (*dump_ipi_function_ptr)(struct pt_regs *);
++extern void dump_send_ipi(void);
++#else /* !CONFIG_SMP */
++#define dump_send_ipi() do { } while(0)
++#endif
++
++#else /* !__KERNEL__ */
++/* necessary header files */
++#include	/* for pt_regs */
++#include
++#endif /* __KERNEL__ */
++
++/*
++ * mkswap.c calls getpagesize() to get the system page size,
++ * which is not necessarily the same as the hardware page size.
++ *
++ * For ia64 the kernel PAGE_SIZE can be configured from 4KB ... 16KB.
++ *
++ * The physical memory is laid out in hardware/minimal pages.
++ * This is the size we need to use for dumping physical pages.
++ *
++ * Note the hardware/minimal page size being used in:
++ *	arch/ia64/kernel/efi.c`efi_memmap_walk():
++ *		curr.end = curr.start + (md->num_pages << 12);
++ *
++ * Since the system page size could differ between the kernel we boot
++ * on and the kernel that caused the core dump, we may want to use
++ * something more constant, like the maximum system page size (see
++ * include/asm-ia64/page.h).
++ */
++/* IA64 manages the stack in a different manner from other architectures:
++ * the task_struct lies at the bottom of the stack.
++ */ ++#undef STACK_START_POSITION ++#define STACK_START_POSITION(tsk) (tsk) ++#define DUMP_MIN_PAGE_SHIFT 12 ++#define DUMP_MIN_PAGE_SIZE (1UL << DUMP_MIN_PAGE_SHIFT) ++#define DUMP_MIN_PAGE_MASK (~(DUMP_MIN_PAGE_SIZE - 1)) ++#define DUMP_MIN_PAGE_ALIGN(addr) (((addr) + DUMP_MIN_PAGE_SIZE - 1) & DUMP_MIN_PAGE_MASK) ++ ++#define DUMP_MAX_PAGE_SHIFT 16 ++#define DUMP_MAX_PAGE_SIZE (1UL << DUMP_MAX_PAGE_SHIFT) ++#define DUMP_MAX_PAGE_MASK (~(DUMP_MAX_PAGE_SIZE - 1)) ++#define DUMP_MAX_PAGE_ALIGN(addr) (((addr) + DUMP_MAX_PAGE_SIZE - 1) & DUMP_MAX_PAGE_MASK) ++ ++#define DUMP_EF_PAGE_SHIFT DUMP_MIN_PAGE_SHIFT ++ ++extern int _end,_start; ++ ++/* ++ * Structure: dump_header_asm_t ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. ++ */ ++/*typedef struct _dump_header_asm {*/ ++ ++typedef struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* pointer to pt_regs, (OLD: (struct pt_regs *, NEW: (uint64_t)) */ ++ uint64_t dha_pt_regs; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* the rnat register saved after flushrs */ ++ uint64_t dha_rnat; ++ ++ /* the pfs register saved after flushrs */ ++ uint64_t dha_pfs; ++ ++ /* the bspstore register saved after flushrs */ ++ uint64_t dha_bspstore; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ uint32_t dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++ ++} __attribute__((packed)) dump_header_asm_t; ++ ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ /* ++ * REMIND: Looking at functions/Macros like: ++ * DO_SAVE_SWITCH_STACK ++ * ia64_switch_to() ++ * ia64_save_extra() ++ * switch_to() ++ * to implement this new feature that Matt seem to have added ++ * to panic.c; seems all platforms are now expected to provide ++ * this function to dump the current registers into the pt_regs ++ * structure. 
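
The DUMP_MIN/MAX page macros above are the usual round-up-and-mask idiom; since DUMP_MIN_PAGE_SIZE is 4KB, an address one byte past a page boundary rounds up to the next page. A standalone check, in plain user-space C, duplicating the macros purely for illustration:

#include <assert.h>

#define MIN_PAGE_SIZE        (1UL << 12)
#define MIN_PAGE_MASK        (~(MIN_PAGE_SIZE - 1))
#define MIN_PAGE_ALIGN(addr) (((addr) + MIN_PAGE_SIZE - 1) & MIN_PAGE_MASK)

int main(void)
{
	assert(MIN_PAGE_ALIGN(0x1000UL) == 0x1000UL);	/* already aligned */
	assert(MIN_PAGE_ALIGN(0x1001UL) == 0x2000UL);	/* rounds up */
	assert((0x1fffUL & MIN_PAGE_MASK) == 0x1000UL);	/* mask truncates */
	return 0;
}
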
++ */ ++ volatile unsigned long rsc_value;/*for storing the rsc value*/ ++ volatile unsigned long ic_value; ++ ++ __asm__ __volatile__("mov %0=b6;;":"=r"(regs->b6)); ++ __asm__ __volatile__("mov %0=b7;;":"=r"(regs->b7)); ++ ++ __asm__ __volatile__("mov %0=ar.csd;;":"=r"(regs->ar_csd)); ++ __asm__ __volatile__("mov %0=ar.ssd;;":"=r"(regs->ar_ssd)); ++ __asm__ __volatile__("mov %0=psr;;":"=r"(ic_value)); ++ if(ic_value & 0x1000)/*Within an interrupt*/ ++ { ++ __asm__ __volatile__("mov %0=cr.ipsr;;":"=r"(regs->cr_ipsr)); ++ __asm__ __volatile__("mov %0=cr.iip;;":"=r"(regs->cr_iip)); ++ __asm__ __volatile__("mov %0=cr.ifs;;":"=r"(regs->cr_ifs)); ++ } ++ else ++ { ++ regs->cr_ipsr=regs->cr_iip=regs->cr_ifs=(unsigned long)-1; ++ } ++ __asm__ __volatile__("mov %0=ar.unat;;":"=r"(regs->ar_unat)); ++ __asm__ __volatile__("mov %0=ar.pfs;;":"=r"(regs->ar_pfs)); ++ __asm__ __volatile__("mov %0=ar.rsc;;":"=r"(rsc_value)); ++ regs->ar_rsc = rsc_value; ++ /*loadrs is from 16th bit to 29th bit of rsc*/ ++ regs->loadrs = rsc_value >> 16 & (unsigned long)0x3fff; ++ /*setting the rsc.mode value to 0 (rsc.mode is the last two bits of rsc)*/ ++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value & (unsigned long)(~3))); ++ __asm__ __volatile__("mov %0=ar.rnat;;":"=r"(regs->ar_rnat)); ++ __asm__ __volatile__("mov %0=ar.bspstore;;":"=r"(regs->ar_bspstore)); ++ /*copying the original value back*/ ++ __asm__ __volatile__("mov ar.rsc=%0;;"::"r"(rsc_value)); ++ __asm__ __volatile__("mov %0=pr;;":"=r"(regs->pr)); ++ __asm__ __volatile__("mov %0=ar.fpsr;;":"=r"(regs->ar_fpsr)); ++ __asm__ __volatile__("mov %0=ar.ccv;;":"=r"(regs->ar_ccv)); ++ ++ __asm__ __volatile__("mov %0=r2;;":"=r"(regs->r2)); ++ __asm__ __volatile__("mov %0=r3;;":"=r"(regs->r3)); ++ __asm__ __volatile__("mov %0=r8;;":"=r"(regs->r8)); ++ __asm__ __volatile__("mov %0=r9;;":"=r"(regs->r9)); ++ __asm__ __volatile__("mov %0=r10;;":"=r"(regs->r10)); ++ __asm__ __volatile__("mov %0=r11;;":"=r"(regs->r11)); ++ __asm__ __volatile__("mov %0=r12;;":"=r"(regs->r12)); ++ __asm__ __volatile__("mov %0=r13;;":"=r"(regs->r13)); ++ __asm__ __volatile__("mov %0=r14;;":"=r"(regs->r14)); ++ __asm__ __volatile__("mov %0=r15;;":"=r"(regs->r15)); ++ __asm__ __volatile__("mov %0=r16;;":"=r"(regs->r16)); ++ __asm__ __volatile__("mov %0=r17;;":"=r"(regs->r17)); ++ __asm__ __volatile__("mov %0=r18;;":"=r"(regs->r18)); ++ __asm__ __volatile__("mov %0=r19;;":"=r"(regs->r19)); ++ __asm__ __volatile__("mov %0=r20;;":"=r"(regs->r20)); ++ __asm__ __volatile__("mov %0=r21;;":"=r"(regs->r21)); ++ __asm__ __volatile__("mov %0=r22;;":"=r"(regs->r22)); ++ __asm__ __volatile__("mov %0=r23;;":"=r"(regs->r23)); ++ __asm__ __volatile__("mov %0=r24;;":"=r"(regs->r24)); ++ __asm__ __volatile__("mov %0=r25;;":"=r"(regs->r25)); ++ __asm__ __volatile__("mov %0=r26;;":"=r"(regs->r26)); ++ __asm__ __volatile__("mov %0=r27;;":"=r"(regs->r27)); ++ __asm__ __volatile__("mov %0=r28;;":"=r"(regs->r28)); ++ __asm__ __volatile__("mov %0=r29;;":"=r"(regs->r29)); ++ __asm__ __volatile__("mov %0=r30;;":"=r"(regs->r30)); ++ __asm__ __volatile__("mov %0=r31;;":"=r"(regs->r31)); ++} ++ ++/* Perhaps added to Common Arch Specific Functions and moved to dump.h some day */ ++extern void * __dump_memcpy(void *, const void *, size_t); ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-ia64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ia64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ 
linux-2.6.10/include/asm-ia64/kerntypes.h 2005-04-05 16:47:53.884214072 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ia64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* IA64-specific header files */ ++#ifndef _IA64_KERNTYPES_H ++#define _IA64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _IA64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc64/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc64/dump.h 2005-04-05 16:47:53.878214984 +0800 +@@ -0,0 +1,115 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Todd Inglett ++ * ++ * Copyright 2002 - 2004 International Business Machines ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include /* for pt_regs */ ++#include ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x5 /* version number */ ++ ++/* ++ * Structure: __dump_header_asm ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. ++ */ ++struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ int dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ unsigned long tmp1, tmp2; ++ ++ __asm__ __volatile__ ( ++ "std 0,0(%2)\n" ++ "std 1,8(%2)\n" ++ "std 2,16(%2)\n" ++ "std 3,24(%2)\n" ++ "std 4,32(%2)\n" ++ "std 5,40(%2)\n" ++ "std 6,48(%2)\n" ++ "std 7,56(%2)\n" ++ "std 8,64(%2)\n" ++ "std 9,72(%2)\n" ++ "std 10,80(%2)\n" ++ "std 11,88(%2)\n" ++ "std 12,96(%2)\n" ++ "std 13,104(%2)\n" ++ "std 14,112(%2)\n" ++ "std 15,120(%2)\n" ++ "std 16,128(%2)\n" ++ "std 17,136(%2)\n" ++ "std 18,144(%2)\n" ++ "std 19,152(%2)\n" ++ "std 20,160(%2)\n" ++ "std 21,168(%2)\n" ++ "std 22,176(%2)\n" ++ "std 23,184(%2)\n" ++ "std 24,192(%2)\n" ++ "std 25,200(%2)\n" ++ "std 26,208(%2)\n" ++ "std 27,216(%2)\n" ++ "std 28,224(%2)\n" ++ "std 29,232(%2)\n" ++ "std 30,240(%2)\n" ++ "std 31,248(%2)\n" ++ "mfmsr %0\n" ++ "std %0, 264(%2)\n" ++ "mfctr %0\n" ++ "std %0, 280(%2)\n" ++ "mflr %0\n" ++ "std %0, 288(%2)\n" ++ "bl 1f\n" ++ "1: mflr %1\n" ++ "std %1, 256(%2)\n" ++ "mtlr %0\n" ++ "mfxer %0\n" ++ "std %0, 296(%2)\n" ++ : "=&r" (tmp1), "=&r" (tmp2) ++ : "b" (regs)); ++} ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP 
++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-ppc64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-ppc64/kerntypes.h 2005-04-05 16:47:53.879214832 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-ppc64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* PPC64-specific header files */ ++#ifndef _PPC64_KERNTYPES_H ++#define _PPC64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _PPC64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-ppc64/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/kmap_types.h 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/kmap_types.h 2005-04-05 16:47:53.878214984 +0800 +@@ -16,7 +16,8 @@ + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, +- KM_TYPE_NR ++ KM_TYPE_NR, ++ KM_DUMP + }; + + #endif +Index: linux-2.6.10/include/asm-ppc64/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-ppc64/smp.h 2004-12-25 05:33:47.000000000 +0800 ++++ linux-2.6.10/include/asm-ppc64/smp.h 2005-04-05 16:47:53.877215136 +0800 +@@ -36,7 +36,7 @@ + extern void smp_send_debugger_break(int cpu); + struct pt_regs; + extern void smp_message_recv(int, struct pt_regs *); +- ++extern void dump_send_ipi(int (*dump_ipi_callback)(struct pt_regs *)); + + #define smp_processor_id() (get_paca()->paca_index) + #define hard_smp_processor_id() (get_paca()->hw_cpu_id) +Index: linux-2.6.10/include/asm-cris/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-cris/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-cris/kerntypes.h 2005-04-05 16:47:53.874215592 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-cris/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
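
The ppc64 get_current_regs() above stores straight to numeric offsets inside pt_regs (gpr[0] at 0 through gpr[31] at 248, then nip at 256, msr at 264, ctr at 280, link at 288, xer at 296). Those constants silently depend on the struct layout, so a compile-time guard is cheap insurance. A sketch using the classic negative-array-size trick; the field names assume the standard ppc64 pt_regs layout of this era:

#include <linux/stddef.h>
#include <asm/ptrace.h>

/* Fail the build if pt_regs drifts from the offsets hardcoded above. */
#define DUMP_CHECK_OFF(field, off) \
	typedef char dump_off_##field[ \
		(offsetof(struct pt_regs, field) == (off)) ? 1 : -1]

DUMP_CHECK_OFF(nip,  256);
DUMP_CHECK_OFF(msr,  264);
DUMP_CHECK_OFF(ctr,  280);
DUMP_CHECK_OFF(link, 288);
DUMP_CHECK_OFF(xer,  296);
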
++ */ ++ ++/* CRIS-specific header files */ ++#ifndef _CRIS_KERNTYPES_H ++#define _CRIS_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _CRIS_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-m68knommu/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-m68knommu/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-m68knommu/kerntypes.h 2005-04-05 16:47:53.870216200 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-m68knommu/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* m68k/no-MMU-specific header files */ ++#ifndef _M68KNOMMU_KERNTYPES_H ++#define _M68KNOMMU_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _M68KNOMMU_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-v850/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-v850/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-v850/kerntypes.h 2005-04-05 16:47:53.888213464 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-v850/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* V850-specific header files */ ++#ifndef _V850_KERNTYPES_H ++#define _V850_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _V850_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-x86_64/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-x86_64/dump.h 2005-04-05 16:47:53.868216504 +0800 +@@ -0,0 +1,93 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * x86_64 lkcd port Sachin Sant ( sachinp@in.ibm.com) ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include /* for pt_regs */ ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x2 /* version number */ ++ ++ ++/* ++ * Structure: dump_header_asm_t ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. 
++ */ ++struct __dump_header_asm { ++ ++ /* the dump magic number -- unique to verify dump is valid */ ++ uint64_t dha_magic_number; ++ ++ /* the version number of this dump */ ++ uint32_t dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ uint32_t dha_header_size; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ uint32_t dha_smp_num_cpus; ++ int dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ uint64_t dha_smp_current_task[NR_CPUS]; ++ uint64_t dha_stack[NR_CPUS]; ++ uint64_t dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ unsigned seg; ++ __asm__ __volatile__("movq %%r15,%0" : "=m"(regs->r15)); ++ __asm__ __volatile__("movq %%r14,%0" : "=m"(regs->r14)); ++ __asm__ __volatile__("movq %%r13,%0" : "=m"(regs->r13)); ++ __asm__ __volatile__("movq %%r12,%0" : "=m"(regs->r12)); ++ __asm__ __volatile__("movq %%r11,%0" : "=m"(regs->r11)); ++ __asm__ __volatile__("movq %%r10,%0" : "=m"(regs->r10)); ++ __asm__ __volatile__("movq %%r9,%0" : "=m"(regs->r9)); ++ __asm__ __volatile__("movq %%r8,%0" : "=m"(regs->r8)); ++ __asm__ __volatile__("movq %%rbx,%0" : "=m"(regs->rbx)); ++ __asm__ __volatile__("movq %%rcx,%0" : "=m"(regs->rcx)); ++ __asm__ __volatile__("movq %%rdx,%0" : "=m"(regs->rdx)); ++ __asm__ __volatile__("movq %%rsi,%0" : "=m"(regs->rsi)); ++ __asm__ __volatile__("movq %%rdi,%0" : "=m"(regs->rdi)); ++ __asm__ __volatile__("movq %%rbp,%0" : "=m"(regs->rbp)); ++ __asm__ __volatile__("movq %%rax,%0" : "=m"(regs->rax)); ++ __asm__ __volatile__("movq %%rsp,%0" : "=m"(regs->rsp)); ++ __asm__ __volatile__("movl %%ss, %0" :"=r"(seg)); ++ regs->ss = (unsigned long)seg; ++ __asm__ __volatile__("movl %%cs, %0" :"=r"(seg)); ++ regs->cs = (unsigned long)seg; ++ __asm__ __volatile__("pushfq; popq %0" :"=m"(regs->eflags)); ++ regs->rip = (unsigned long)current_text_addr(); ++ ++} ++ ++extern volatile int dump_in_progress; ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP ++ ++ ++extern void dump_send_ipi(void); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-x86_64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-x86_64/kerntypes.h 2005-04-05 16:47:53.869216352 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-x86_64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
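
Note that every per-arch header in this patch opens with the same three fields (dha_magic_number, dha_version, dha_header_size), so a dump reader can decide whether to trust the rest of the header before parsing it. A sketch of that gate, written against the x86_64 layout above:

/* Sketch: validate the arch header before using anything behind it. */
static int dump_header_asm_valid(const struct __dump_header_asm *dha)
{
	if (dha->dha_magic_number != DUMP_ASM_MAGIC_NUMBER)
		return 0;	/* not a dump, or wrong endianness */
	if (dha->dha_version != DUMP_ASM_VERSION_NUMBER)
		return 0;	/* written by a different format revision */
	if (dha->dha_header_size < sizeof(struct __dump_header_asm))
		return 0;	/* truncated or foreign header */
	return 1;
}
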
++ */ ++ ++/* x86_64-specific header files */ ++#ifndef _X86_64_KERNTYPES_H ++#define _X86_64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _X86_64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-x86_64/hw_irq.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/hw_irq.h 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/hw_irq.h 2005-04-05 16:47:53.869216352 +0800 +@@ -34,7 +34,6 @@ + + #define IA32_SYSCALL_VECTOR 0x80 + +- + /* + * Vectors 0x20-0x2f are used for ISA interrupts. + */ +@@ -55,6 +54,7 @@ + #define TASK_MIGRATION_VECTOR 0xfb + #define CALL_FUNCTION_VECTOR 0xfa + #define KDB_VECTOR 0xf9 ++#define DUMP_VECTOR 0xf8 + + #define THERMAL_APIC_VECTOR 0xf0 + +Index: linux-2.6.10/include/asm-x86_64/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/kmap_types.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/kmap_types.h 2005-04-05 16:47:53.868216504 +0800 +@@ -13,7 +13,8 @@ + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, +- KM_TYPE_NR ++ KM_DUMP, ++ KM_TYPE_NR, + }; + + #endif +Index: linux-2.6.10/include/asm-x86_64/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-x86_64/smp.h 2004-12-25 05:33:48.000000000 +0800 ++++ linux-2.6.10/include/asm-x86_64/smp.h 2005-04-05 16:47:53.867216656 +0800 +@@ -41,6 +41,7 @@ + extern int pic_mode; + extern int smp_num_siblings; + extern void smp_flush_tlb(void); ++extern void dump_send_ipi(void); + extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); + extern void smp_send_reschedule(int cpu); + extern void smp_invalidate_rcv(void); /* Process an NMI */ +Index: linux-2.6.10/include/asm-s390/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-s390/dump.h 2005-04-05 16:47:53.865216960 +0800 +@@ -0,0 +1,10 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ */ ++ ++/* Nothing to be done here, we have proper hardware support */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++#endif ++ +Index: linux-2.6.10/include/asm-s390/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-s390/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-s390/kerntypes.h 2005-04-05 16:47:53.866216808 +0800 +@@ -0,0 +1,46 @@ ++/* ++ * asm-s390/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* S/390 specific header files */ ++#ifndef _S390_KERNTYPES_H ++#define _S390_KERNTYPES_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* channel subsystem driver */ ++#include "../../drivers/s390/cio/cio.h" ++#include "../../drivers/s390/cio/chsc.h" ++#include "../../drivers/s390/cio/css.h" ++#include "../../drivers/s390/cio/device.h" ++#include "../../drivers/s390/cio/qdio.h" ++ ++/* dasd device driver */ ++#include "../../drivers/s390/block/dasd_int.h" ++#include "../../drivers/s390/block/dasd_diag.h" ++#include "../../drivers/s390/block/dasd_eckd.h" ++#include "../../drivers/s390/block/dasd_fba.h" ++ ++/* networking drivers */ ++#include "../../drivers/s390/net/fsm.h" ++#include "../../drivers/s390/net/iucv.h" ++#include "../../drivers/s390/net/lcs.h" ++ ++/* zfcp device driver */ ++#include "../../drivers/s390/scsi/zfcp_def.h" ++#include "../../drivers/s390/scsi/zfcp_fsf.h" ++ ++#endif /* _S390_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sparc64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sparc64/kerntypes.h 2005-04-05 16:47:53.872215896 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sparc64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* SPARC64-specific header files */ ++#ifndef _SPARC64_KERNTYPES_H ++#define _SPARC64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _SPARC64_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-mips/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-mips/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-mips/kerntypes.h 2005-04-05 16:47:53.881214528 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-mips/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* MIPS-specific header files */ ++#ifndef _MIPS_KERNTYPES_H ++#define _MIPS_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _MIPS_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-m68k/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-m68k/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-m68k/kerntypes.h 2005-04-05 16:47:53.875215440 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-m68k/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. 
Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* m68k-specific header files */ ++#ifndef _M68K_KERNTYPES_H ++#define _M68K_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _M68K_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-generic/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-generic/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-generic/kerntypes.h 2005-04-05 16:47:53.871216048 +0800 +@@ -0,0 +1,20 @@ ++/* ++ * asm-generic/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* Arch-independent header files */ ++#ifndef _GENERIC_KERNTYPES_H ++#define _GENERIC_KERNTYPES_H ++ ++#include ++ ++#endif /* _GENERIC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-i386/dump.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/dump.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/dump.h 2005-04-05 16:47:53.886213768 +0800 +@@ -0,0 +1,90 @@ ++/* ++ * Kernel header file for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* This header file holds the architecture specific crash dump header */ ++#ifndef _ASM_DUMP_H ++#define _ASM_DUMP_H ++ ++/* necessary header files */ ++#include ++#include ++#include ++#include ++ ++/* definitions */ ++#define DUMP_ASM_MAGIC_NUMBER 0xdeaddeadULL /* magic number */ ++#define DUMP_ASM_VERSION_NUMBER 0x3 /* version number */ ++ ++/* ++ * Structure: __dump_header_asm ++ * Function: This is the header for architecture-specific stuff. It ++ * follows right after the dump header. 
++ */ ++struct __dump_header_asm { ++ /* the dump magic number -- unique to verify dump is valid */ ++ u64 dha_magic_number; ++ ++ /* the version number of this dump */ ++ u32 dha_version; ++ ++ /* the size of this header (in case we can't read it) */ ++ u32 dha_header_size; ++ ++ /* the esp for i386 systems */ ++ u32 dha_esp; ++ ++ /* the eip for i386 systems */ ++ u32 dha_eip; ++ ++ /* the dump registers */ ++ struct pt_regs dha_regs; ++ ++ /* smp specific */ ++ u32 dha_smp_num_cpus; ++ u32 dha_dumping_cpu; ++ struct pt_regs dha_smp_regs[NR_CPUS]; ++ u32 dha_smp_current_task[NR_CPUS]; ++ u32 dha_stack[NR_CPUS]; ++ u32 dha_stack_ptr[NR_CPUS]; ++} __attribute__((packed)); ++ ++#ifdef __KERNEL__ ++ ++extern struct __dump_header_asm dump_header_asm; ++ ++#ifdef CONFIG_SMP ++extern cpumask_t irq_affinity[]; ++extern int (*dump_ipi_function_ptr)(struct pt_regs *); ++extern void dump_send_ipi(void); ++#else ++#define dump_send_ipi() do { } while(0) ++#endif ++ ++static inline void get_current_regs(struct pt_regs *regs) ++{ ++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); ++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); ++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); ++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); ++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); ++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); ++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); ++ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); ++ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); ++ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); ++ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); ++ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); ++ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); ++ regs->eip = (unsigned long)current_text_addr(); ++} ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _ASM_DUMP_H */ +Index: linux-2.6.10/include/asm-i386/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-i386/kerntypes.h 2005-04-05 16:47:53.887213616 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-i386/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* ix86-specific header files */ ++#ifndef _I386_KERNTYPES_H ++#define _I386_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _I386_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-i386/kmap_types.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/kmap_types.h 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/kmap_types.h 2005-04-05 16:47:53.886213768 +0800 +@@ -23,7 +23,8 @@ + D(10) KM_IRQ1, + D(11) KM_SOFTIRQ0, + D(12) KM_SOFTIRQ1, +-D(13) KM_TYPE_NR ++D(13) KM_DUMP, ++D(14) KM_TYPE_NR + }; + + #undef D +Index: linux-2.6.10/include/asm-i386/smp.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/smp.h 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/smp.h 2005-04-05 16:47:53.885213920 +0800 +@@ -37,6 +37,7 @@ + extern cpumask_t cpu_sibling_map[]; + + extern void smp_flush_tlb(void); ++extern void dump_send_ipi(void); + extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); + extern void smp_invalidate_rcv(void); /* Process an NMI */ + extern void (*mtrr_hook) (void); +Index: linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/mach-default/irq_vectors.h 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/mach-default/irq_vectors.h 2005-04-05 16:47:53.887213616 +0800 +@@ -48,6 +48,7 @@ + #define INVALIDATE_TLB_VECTOR 0xfd + #define RESCHEDULE_VECTOR 0xfc + #define CALL_FUNCTION_VECTOR 0xfb ++#define DUMP_VECTOR 0xfa + + #define THERMAL_APIC_VECTOR 0xf0 + /* +Index: linux-2.6.10/include/asm-arm/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-arm/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-arm/kerntypes.h 2005-04-05 16:47:53.873215744 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-arm/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* ARM-specific header files */ ++#ifndef _ARM_KERNTYPES_H ++#define _ARM_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _ARM_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-sparc/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-sparc/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-sparc/kerntypes.h 2005-04-05 16:47:53.874215592 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-sparc/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. 
++ */ ++ ++/* SPARC-specific header files */ ++#ifndef _SPARC_KERNTYPES_H ++#define _SPARC_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _SPARC_KERNTYPES_H */ +Index: linux-2.6.10/include/asm-mips64/kerntypes.h +=================================================================== +--- linux-2.6.10.orig/include/asm-mips64/kerntypes.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/asm-mips64/kerntypes.h 2005-04-05 16:47:53.881214528 +0800 +@@ -0,0 +1,21 @@ ++/* ++ * asm-mips64/kerntypes.h ++ * ++ * Arch-dependent header file that includes headers for all arch-specific ++ * types of interest. ++ * The kernel type information is used by the lcrash utility when ++ * analyzing system crash dumps or the live system. Using the type ++ * information for the running system, rather than kernel header files, ++ * makes for a more flexible and robust analysis tool. ++ * ++ * This source code is released under the GNU GPL. ++ */ ++ ++/* MIPS64-specific header files */ ++#ifndef _MIPS64_KERNTYPES_H ++#define _MIPS64_KERNTYPES_H ++ ++/* Use the default */ ++#include ++ ++#endif /* _MIPS64_KERNTYPES_H */ +Index: linux-2.6.10/net/Kconfig +=================================================================== +--- linux-2.6.10.orig/net/Kconfig 2005-04-05 16:29:27.896349784 +0800 ++++ linux-2.6.10/net/Kconfig 2005-04-05 16:47:53.895212400 +0800 +@@ -632,7 +632,7 @@ + endmenu + + config NETPOLL +- def_bool NETCONSOLE ++ def_bool NETCONSOLE || CRASH_DUMP_NETDEV + + config NETPOLL_RX + bool "Netpoll support for trapping incoming packets" +Index: linux-2.6.10/scripts/mkcompile_h +=================================================================== +--- linux-2.6.10.orig/scripts/mkcompile_h 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/scripts/mkcompile_h 2005-04-05 16:47:53.950204040 +0800 +@@ -33,7 +33,7 @@ + + UTS_LEN=64 + UTS_TRUNCATE="sed -e s/\(.\{1,$UTS_LEN\}\).*/\1/" +- ++LINUX_COMPILE_VERSION_ID="__linux_compile_version_id__`hostname | tr -c '[0-9A-Za-z\n]' '__'`_`LANG=C date | tr -c '[0-9A-Za-z\n]' '_'`" + # Generate a temporary compile.h + + ( echo /\* This file is auto generated, version $VERSION \*/ +@@ -55,6 +55,8 @@ + fi + + echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | tail -n 1`\" ++ echo \#define LINUX_COMPILE_VERSION_ID $LINUX_COMPILE_VERSION_ID ++ echo \#define LINUX_COMPILE_VERSION_ID_TYPE typedef char* "$LINUX_COMPILE_VERSION_ID""_t" + ) > .tmpcompile + + # Only replace the real compile.h if the new one is different, +Index: linux-2.6.10/mm/bootmem.c +=================================================================== +--- linux-2.6.10.orig/mm/bootmem.c 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/mm/bootmem.c 2005-04-05 16:47:53.903211184 +0800 +@@ -26,6 +26,7 @@ + */ + unsigned long max_low_pfn; + unsigned long min_low_pfn; ++EXPORT_SYMBOL(min_low_pfn); + unsigned long max_pfn; + + EXPORT_SYMBOL(max_pfn); /* This is exported so +@@ -284,6 +285,7 @@ + if (j + 16 < BITS_PER_LONG) + prefetchw(page + j + 16); + __ClearPageReserved(page + j); ++ set_page_count(page + j, 1); + } + __free_pages(page, ffs(BITS_PER_LONG)-1); + i += BITS_PER_LONG; +Index: linux-2.6.10/mm/page_alloc.c +=================================================================== +--- linux-2.6.10.orig/mm/page_alloc.c 2005-04-05 16:29:28.218300840 +0800 ++++ linux-2.6.10/mm/page_alloc.c 2005-04-05 16:47:53.902211336 +0800 +@@ -47,6 +47,11 @@ + EXPORT_SYMBOL(totalram_pages); + EXPORT_SYMBOL(nr_swap_pages); + ++#ifdef CONFIG_CRASH_DUMP_MODULE ++/* This symbol has to be exported to use 
'for_each_pgdat' macro by modules. */ ++EXPORT_SYMBOL(pgdat_list); ++#endif ++ + /* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags +@@ -281,8 +286,11 @@ + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); +- for (i = 0 ; i < (1 << order) ; ++i) ++ for (i = 0 ; i < (1 << order) ; ++i){ ++ if (unlikely(i)) ++ __put_page(page + i); + free_pages_check(__FUNCTION__, page + i); ++ } + list_add(&page->lru, &list); + kernel_map_pages(page, 1<mapping || page_mapped(page) || +- (page->flags & ( +- 1 << PG_private | +- 1 << PG_locked | +- 1 << PG_lru | +- 1 << PG_active | +- 1 << PG_dirty | +- 1 << PG_reclaim | +- 1 << PG_swapcache | +- 1 << PG_writeback ))) ++ int i; ++ ++ for(i = 0; i < (1 << order); i++){ ++ struct page *page = _page + i; ++ ++ if (page->mapping || page_mapped(page) || ++ (page->flags & ( ++ 1 << PG_private | ++ 1 << PG_locked | ++ 1 << PG_lru | ++ 1 << PG_active | ++ 1 << PG_dirty | ++ 1 << PG_reclaim | ++ 1 << PG_swapcache | ++ 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + +- page->flags &= ~(1 << PG_uptodate | 1 << PG_error | +- 1 << PG_referenced | 1 << PG_arch_1 | +- 1 << PG_checked | 1 << PG_mappedtodisk); +- page->private = 0; +- set_page_refs(page, order); ++ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | ++ 1 << PG_referenced | 1 << PG_arch_1 | ++ 1 << PG_checked | 1 << PG_mappedtodisk); ++ page->private = 0; ++ set_page_count(page, 1); ++ } + } + + /* +Index: linux-2.6.10/kernel/sched.c +=================================================================== +--- linux-2.6.10.orig/kernel/sched.c 2005-04-05 16:29:30.335978904 +0800 ++++ linux-2.6.10/kernel/sched.c 2005-04-05 16:47:53.901211488 +0800 +@@ -54,6 +54,10 @@ + #define cpu_to_node_mask(cpu) (cpu_online_map) + #endif + ++/* used to soft spin in sched while dump is in progress */ ++unsigned long dump_oncpu; ++EXPORT_SYMBOL(dump_oncpu); ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +@@ -184,109 +188,6 @@ + #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +-/* +- * These are the runqueue data structures: +- */ +- +-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +- +-typedef struct runqueue runqueue_t; +- +-struct prio_array { +- unsigned int nr_active; +- unsigned long bitmap[BITMAP_SIZE]; +- struct list_head queue[MAX_PRIO]; +-}; +- +-/* +- * This is the main, per-CPU runqueue data structure. +- * +- * Locking rule: those places that want to lock multiple runqueues +- * (such as the load balancing or the thread migration code), lock +- * acquire operations must be ordered by ascending &runqueue. +- */ +-struct runqueue { +- spinlock_t lock; +- +- /* +- * nr_running and cpu_load should be in the same cacheline because +- * remote CPUs use both these fields when doing load calculation. +- */ +- unsigned long nr_running; +-#ifdef CONFIG_SMP +- unsigned long cpu_load; +-#endif +- unsigned long long nr_switches; +- +- /* +- * This is part of a global counter where only the total sum +- * over all CPUs matters. A task can increase this counter on +- * one CPU and if it got migrated afterwards it may decrease +- * it on another CPU. 
Always updated under the runqueue lock: +- */ +- unsigned long nr_uninterruptible; +- +- unsigned long expired_timestamp; +- unsigned long long timestamp_last_tick; +- task_t *curr, *idle; +- struct mm_struct *prev_mm; +- prio_array_t *active, *expired, arrays[2]; +- int best_expired_prio; +- atomic_t nr_iowait; +- +-#ifdef CONFIG_SMP +- struct sched_domain *sd; +- +- /* For active balancing */ +- int active_balance; +- int push_cpu; +- +- task_t *migration_thread; +- struct list_head migration_queue; +-#endif +- +-#ifdef CONFIG_SCHEDSTATS +- /* latency stats */ +- struct sched_info rq_sched_info; +- +- /* sys_sched_yield() stats */ +- unsigned long yld_exp_empty; +- unsigned long yld_act_empty; +- unsigned long yld_both_empty; +- unsigned long yld_cnt; +- +- /* schedule() stats */ +- unsigned long sched_noswitch; +- unsigned long sched_switch; +- unsigned long sched_cnt; +- unsigned long sched_goidle; +- +- /* pull_task() stats */ +- unsigned long pt_gained[MAX_IDLE_TYPES]; +- unsigned long pt_lost[MAX_IDLE_TYPES]; +- +- /* active_load_balance() stats */ +- unsigned long alb_cnt; +- unsigned long alb_lost; +- unsigned long alb_gained; +- unsigned long alb_failed; +- +- /* try_to_wake_up() stats */ +- unsigned long ttwu_cnt; +- unsigned long ttwu_attempts; +- unsigned long ttwu_moved; +- +- /* wake_up_new_task() stats */ +- unsigned long wunt_cnt; +- unsigned long wunt_moved; +- +- /* sched_migrate_task() stats */ +- unsigned long smt_cnt; +- +- /* sched_balance_exec() stats */ +- unsigned long sbe_cnt; +-#endif +-}; + + static DEFINE_PER_CPU(struct runqueue, runqueues); + +@@ -2535,6 +2436,15 @@ + unsigned long run_time; + int cpu, idx; + ++ /* ++ * If crash dump is in progress, this other cpu's ++ * need to wait until it completes. ++ * NB: this code is optimized away for kernels without ++ * dumping enabled. ++ */ ++ if (unlikely(dump_oncpu)) ++ goto dump_scheduling_disabled; ++ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. +@@ -2698,6 +2608,16 @@ + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; ++ ++ return; ++ ++ dump_scheduling_disabled: ++ /* allow scheduling only if this is the dumping cpu */ ++ if (dump_oncpu != smp_processor_id()+1) { ++ while (dump_oncpu) ++ cpu_relax(); ++ } ++ return; + } + + EXPORT_SYMBOL(schedule); +Index: linux-2.6.10/kernel/panic.c +=================================================================== +--- linux-2.6.10.orig/kernel/panic.c 2004-12-25 05:35:29.000000000 +0800 ++++ linux-2.6.10/kernel/panic.c 2005-04-05 16:47:53.898211944 +0800 +@@ -18,12 +18,17 @@ + #include + #include + #include ++#ifdef CONFIG_KEXEC ++#include ++#endif + + int panic_timeout; + int panic_on_oops; + int tainted; ++void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0; + + EXPORT_SYMBOL(panic_timeout); ++EXPORT_SYMBOL(dump_function_ptr); + + struct notifier_block *panic_notifier_list; + +@@ -71,11 +76,12 @@ + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + bust_spinlocks(0); + ++ notifier_call_chain(&panic_notifier_list, 0, buf); ++ + #ifdef CONFIG_SMP + smp_send_stop(); + #endif + +- notifier_call_chain(&panic_notifier_list, 0, buf); + + if (!panic_blink) + panic_blink = no_blink; +@@ -87,6 +93,18 @@ + * We can't use the "normal" timers since we just panicked.. 
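
The schedule() changes above only make the other CPUs spin; the convention implied by the test dump_oncpu != smp_processor_id()+1 is that the dumping CPU stores its own id plus one for the duration of the dump, so that CPU 0 remains distinguishable from the idle value 0. A sketch of the bracketing the dump core would perform, inferred from that check:

#include <linux/smp.h>

extern unsigned long dump_oncpu;

static inline void dump_freeze_scheduling(void)
{
	dump_oncpu = smp_processor_id() + 1;	/* 0 means no dump running */
}

static inline void dump_thaw_scheduling(void)
{
	dump_oncpu = 0;		/* releases CPUs spinning in schedule() */
}
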
+ */
+	printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
++#ifdef CONFIG_KEXEC
++{
++	struct kimage *image;
++	image = xchg(&kexec_image, 0);
++	if (image) {
++		printk(KERN_EMERG "by starting a new kernel ..\n");
++		mdelay(panic_timeout*1000);
++		machine_kexec(image);
++	}
++}
++#endif
++
+	for (i = 0; i < panic_timeout*1000; ) {
+		touch_nmi_watchdog();
+		i += panic_blink(i);
+Index: linux-2.6.10/drivers/block/ll_rw_blk.c
+===================================================================
+--- linux-2.6.10.orig/drivers/block/ll_rw_blk.c	2005-04-05 16:29:30.310982704 +0800
++++ linux-2.6.10/drivers/block/ll_rw_blk.c	2005-04-05 16:47:53.949204192 +0800
+@@ -28,6 +28,7 @@
+ #include
+ #include
+ #include
++#include
+ 
+ /*
+  * for max sense size
+@@ -2628,7 +2629,8 @@
+ 	sector_t maxsector;
+ 	int ret, nr_sectors = bio_sectors(bio);
+ 
+-	might_sleep();
++	if (likely(!dump_oncpu))
++		might_sleep();
+ 	/* Test device or partition size, when known. */
+ 	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+ 	if (maxsector) {
+Index: linux-2.6.10/drivers/dump/dump_i386.c
+===================================================================
+--- linux-2.6.10.orig/drivers/dump/dump_i386.c	2005-04-05 19:01:49.158500672 +0800
++++ linux-2.6.10/drivers/dump/dump_i386.c	2005-04-05 16:47:53.940205560 +0800
+@@ -0,0 +1,372 @@
++/*
++ * Architecture specific (i386) functions for Linux crash dumps.
++ *
++ * Created by: Matt Robinson (yakker@sgi.com)
++ *
++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved.
++ *
++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com)
++ * Copyright 2000 TurboLinux, Inc.  All rights reserved.
++ *
++ * This code is released under version 2 of the GNU GPL.
++ */
++
++/*
++ * The hooks for dumping the kernel virtual memory to disk are in this
++ * file.  Any time a modification is made to the virtual memory mechanism,
++ * these routines must be changed to use the new mechanisms.
++ */
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include "dump_methods.h"
++#include
++
++#include
++#include
++#include
++#include
++
++static __s32	saved_irq_count;	/* saved preempt_count() flags */
++
++static int
++alloc_dha_stack(void)
++{
++	int i;
++	void *ptr;
++
++	if (dump_header_asm.dha_stack[0])
++		return 0;
++
++	ptr = vmalloc(THREAD_SIZE * num_online_cpus());
++	if (!ptr) {
++		printk("vmalloc for dha_stacks failed\n");
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < num_online_cpus(); i++) {
++		dump_header_asm.dha_stack[i] = (u32)((unsigned long)ptr +
++				(i * THREAD_SIZE));
++	}
++	return 0;
++}
++
++static int
++free_dha_stack(void)
++{
++	if (dump_header_asm.dha_stack[0]) {
++		vfree((void *)dump_header_asm.dha_stack[0]);
++		dump_header_asm.dha_stack[0] = 0;
++	}
++	return 0;
++}
++
++
++void
++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs)
++{
++	*dest_regs = *regs;
++
++	/* In the case of panic dumps, we collect regs on entry to panic,
++	 * so we shouldn't 'fix' ss/esp here again.  But it is hard to
++	 * tell just looking at regs whether ss/esp need fixing.  We make
++	 * this decision by looking at xss in regs.  If we have better
++	 * means to determine that ss/esp are valid (by some flag which
++	 * tells that we are here due to panic dump), then we can use
++	 * that instead of this kludge.
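
The panic() hunk above does two things for dumping: it exports dump_function_ptr as the hook a dump module fills in at load time, and it runs the panic notifier chain before smp_send_stop() so that the dump code still sees the other CPUs alive. A sketch of the module side, assuming dump_execute() has the matching signature (it is invoked as dump_execute("manual", &regs) later in this patch):

#include <linux/init.h>
#include <asm/ptrace.h>

extern void (*dump_function_ptr)(const char *, const struct pt_regs *);
extern void dump_execute(const char *panic_str, const struct pt_regs *regs);

static int __init dump_hook_init(void)
{
	dump_function_ptr = dump_execute;	/* arm the panic-path hook */
	return 0;
}

static void __exit dump_hook_exit(void)
{
	dump_function_ptr = NULL;		/* disarm before unload */
}
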
++	 */
++	if (!user_mode(regs)) {
++		if ((0xffff & regs->xss) == __KERNEL_DS)
++			/* already fixed up */
++			return;
++		dest_regs->esp = (unsigned long)&(regs->esp);
++		__asm__ __volatile__ ("movw %%ss, %%ax;"
++			:"=a"(dest_regs->xss));
++	}
++}
++
++void
++__dump_save_context(int cpu, const struct pt_regs *regs,
++	struct task_struct *tsk)
++{
++	dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk;
++	__dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs);
++
++	/* take a snapshot of the stack */
++	/* doing this enables us to tolerate slight drifts on this cpu */
++
++	if (dump_header_asm.dha_stack[cpu]) {
++		memcpy((void *)dump_header_asm.dha_stack[cpu],
++				STACK_START_POSITION(tsk),
++				THREAD_SIZE);
++	}
++	dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info);
++}
++
++#ifdef CONFIG_SMP
++extern cpumask_t irq_affinity[];
++extern irq_desc_t irq_desc[];
++extern void dump_send_ipi(void);
++
++static int dump_expect_ipi[NR_CPUS];
++static atomic_t waiting_for_dump_ipi;
++static cpumask_t saved_affinity[NR_IRQS];
++
++extern void stop_this_cpu(void *);	/* exported by i386 kernel */
++
++static int
++dump_nmi_callback(struct pt_regs *regs, int cpu)
++{
++	if (!dump_expect_ipi[cpu])
++		return 0;
++
++	dump_expect_ipi[cpu] = 0;
++
++	dump_save_this_cpu(regs);
++	atomic_dec(&waiting_for_dump_ipi);
++
++ level_changed:
++	switch (dump_silence_level) {
++	case DUMP_HARD_SPIN_CPUS:	/* Spin until dump is complete */
++		while (dump_oncpu) {
++			barrier();	/* paranoia */
++			if (dump_silence_level != DUMP_HARD_SPIN_CPUS)
++				goto level_changed;
++
++			cpu_relax();	/* kill time nicely */
++		}
++		break;
++
++	case DUMP_HALT_CPUS:		/* Execute halt */
++		stop_this_cpu(NULL);
++		break;
++
++	case DUMP_SOFT_SPIN_CPUS:
++		/* Mark the task so it spins in schedule */
++		set_tsk_thread_flag(current, TIF_NEED_RESCHED);
++		break;
++	}
++
++	return 1;
++}
++
++/* save registers on other processors */
++void
++__dump_save_other_cpus(void)
++{
++	int i, cpu = smp_processor_id();
++	int other_cpus = num_online_cpus()-1;
++
++	if (other_cpus > 0) {
++		atomic_set(&waiting_for_dump_ipi, other_cpus);
++
++		for (i = 0; i < NR_CPUS; i++) {
++			dump_expect_ipi[i] = (i != cpu && cpu_online(i));
++		}
++
++		/* short circuit normal NMI handling temporarily */
++		set_nmi_callback(dump_nmi_callback);
++		wmb();
++
++		dump_send_ipi();
++		/* Maybe we don't need to wait for the NMI to be processed:
++		   just write out the header at the end of dumping; if this
++		   IPI is not processed by then, there probably is a problem
++		   and we just fail to capture the state of the other cpus. */
++		while(atomic_read(&waiting_for_dump_ipi) > 0) {
++			cpu_relax();
++		}
++
++		unset_nmi_callback();
++	}
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++	int i;
++	cpumask_t cpu = CPU_MASK_NONE;
++
++	cpu_set(smp_processor_id(), cpu);
++	memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		irq_affinity[i] = cpu;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++	}
++}
++
++/*
++ * Restore old irq affinities.
++ */ ++static void ++reset_irq_affinity(void) ++{ ++ int i; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]); ++ } ++} ++ ++#else /* !CONFIG_SMP */ ++#define set_irq_affinity() do { } while (0) ++#define reset_irq_affinity() do { } while (0) ++#define save_other_cpu_states() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. ++ * ++ * This is not quite legal of course, as it has the side ++ * effect of making all interrupts & softirqs triggered ++ * while dump is in progress complete before currently ++ * pending softirqs and the currently executing interrupt ++ * code. ++ */ ++static inline void ++irq_bh_save(void) ++{ ++ saved_irq_count = irq_count(); ++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); ++} ++ ++static inline void ++irq_bh_restore(void) ++{ ++ preempt_count() |= saved_irq_count; ++} ++ ++/* ++ * Name: __dump_irq_enable ++ * Func: Reset system so interrupts are enabled. ++ * This is used for dump methods that require interrupts ++ * Eventually, all methods will have interrupts disabled ++ * and this code can be removed. ++ * ++ * Change irq affinities ++ * Re-enable interrupts ++ */ ++int ++__dump_irq_enable(void) ++{ ++ set_irq_affinity(); ++ irq_bh_save(); ++ local_irq_enable(); ++ return 0; ++} ++ ++/* ++ * Name: __dump_irq_restore ++ * Func: Resume the system state in an architecture-specific way. ++ ++ */ ++void ++__dump_irq_restore(void) ++{ ++ local_irq_disable(); ++ reset_irq_affinity(); ++ irq_bh_restore(); ++} ++ ++/* ++ * Name: __dump_configure_header() ++ * Func: Meant to fill in arch specific header fields except per-cpu state ++ * already captured via __dump_save_context for all CPUs. ++ */ ++int ++__dump_configure_header(const struct pt_regs *regs) ++{ ++ return (0); ++} ++ ++/* ++ * Name: __dump_init() ++ * Func: Initialize the dumping routine process. ++ */ ++void ++__dump_init(uint64_t local_memory_start) ++{ ++ return; ++} ++ ++/* ++ * Name: __dump_open() ++ * Func: Open the dump device (architecture specific). ++ */ ++void ++__dump_open(void) ++{ ++ alloc_dha_stack(); ++} ++ ++/* ++ * Name: __dump_cleanup() ++ * Func: Free any architecture specific data structures. This is called ++ * when the dump module is being removed. ++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++} ++ ++extern int pfn_is_ram(unsigned long); ++ ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return pfn_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. 
++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_ia64.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_ia64.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_ia64.c 2005-04-05 16:47:53.928207384 +0800 +@@ -0,0 +1,458 @@ ++/* ++ * Architecture specific (ia64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * Contributions from SGI, IBM, and others. ++ * ++ * 2.4 kernel modifications by: Matt D. Robinson (yakker@alacritech.com) ++ * ia64 kernel modifications by: Piet Delaney (piet@www.piet.net) ++ * ++ * Copyright (C) 2001 - 2002 Matt D. Robinson (yakker@alacritech.com) ++ * Copyright (C) 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static __s32 saved_irq_count; /* saved preempt_count() flags */ ++ ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ { ++ return 0; ++ } ++ ptr = vmalloc(THREAD_SIZE * num_online_cpus()); ++ if (!ptr) { ++ printk("vmalloc for dha_stacks failed\n"); ++ return -ENOMEM; ++ } ++ bzero(ptr,THREAD_SIZE ); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) ++ { ++ vfree((void*)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++ ++/* a structure to get arguments into the following callback routine */ ++struct unw_args { ++ int cpu; ++ struct task_struct *tsk; ++}; ++ ++static void ++do_save_sw(struct unw_frame_info *info, void *arg) ++{ ++ struct unw_args *uwargs = (struct unw_args *)arg; ++ int cpu = uwargs->cpu; ++ struct task_struct *tsk = uwargs->tsk; ++ ++ dump_header_asm.dha_stack_ptr[cpu] = (uint64_t)info->sw; ++ ++ if (tsk && dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ struct unw_args uwargs; ++ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ ++ if (regs) { ++ dump_header_asm.dha_smp_regs[cpu] = *regs; ++ } ++ ++ /* save a snapshot of the stack in a nice state for unwinding */ ++ uwargs.cpu = cpu; ++ uwargs.tsk = tsk; ++ ++ unw_init_running(do_save_sw, (void *)&uwargs); ++} ++ ++#ifdef CONFIG_SMP ++ ++extern cpumask_t irq_affinity[]; ++#define irq_desc _irq_desc ++extern irq_desc_t irq_desc[]; ++extern void dump_send_ipi(void); ++static cpumask_t saved_affinity[NR_IRQS]; ++ ++/* ++ * Routine to save the old irq affinities and change affinities of all irqs to ++ * the dumping cpu. 
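++ * (The point of the rerouting: while the other cpus are quiesced for the
++ * dump, the dumping cpu must still see the disk or network interrupts
++ * that complete dump i/o, so every irq is pointed at it for the duration;
++ * reset_irq_affinity() below puts the saved masks back afterwards.)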
++ */ ++static void ++set_irq_affinity(void) ++{ ++ int i; ++ cpumask_t cpu = CPU_MASK_NONE; ++ ++ cpu_set(smp_processor_id(), cpu); ++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ irq_affinity[i] = cpu; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, irq_affinity[i]); ++ } ++} ++ ++/* ++ * Restore old irq affinities. ++ */ ++static void ++reset_irq_affinity(void) ++{ ++ int i; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ for (i = 0; i < NR_IRQS; i++) { ++ if (irq_desc[i].handler == NULL) ++ continue; ++ if (irq_desc[i].handler->set_affinity != NULL) ++ irq_desc[i].handler->set_affinity(i, saved_affinity[i]); ++ } ++} ++ ++#else /* !CONFIG_SMP */ ++#define set_irq_affinity() do { } while (0) ++#define reset_irq_affinity() do { } while (0) ++#define save_other_cpu_states() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++#ifdef CONFIG_SMP ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++static int wait_for_dump_ipi = 2000; /* wait 2000 ms for ipi to be handled */ ++extern void (*dump_trace_ptr)(struct pt_regs *); ++ ++ ++extern void stop_this_cpu(void); ++ ++static int ++dump_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ if (!dump_expect_ipi[cpu]) ++ return 0; ++ ++ dump_expect_ipi[cpu] = 0; ++ ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++ level_changed: ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++int IPI_handler(struct pt_regs *regs) ++{ ++ int cpu; ++ cpu = task_cpu(current); ++ return(dump_nmi_callback(regs, cpu)); ++} ++ ++/* save registers on other processors */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus()-1; ++ int wait_time = wait_for_dump_ipi; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ } ++ ++ dump_ipi_function_ptr = IPI_handler; ++ ++ wmb(); ++ ++ dump_send_ipi(); ++ /* may be we dont need to wait for IPI to be processed. ++ * just write out the header at the end of dumping, if ++ * this IPI is not processed until then, there probably ++ * is a problem and we just fail to capture state of ++ * other cpus. */ ++ while(wait_time-- && (atomic_read(&waiting_for_dump_ipi) > 0)) { ++ barrier(); ++ mdelay(1); ++ } ++ if (wait_time <= 0) { ++ printk("dump ipi timeout, proceeding...\n"); ++ } ++ } ++} ++#endif ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. 
++ *
++ * This is not quite legal of course, as it has the side
++ * effect of making all interrupts & softirqs triggered
++ * while dump is in progress complete before currently
++ * pending softirqs and the currently executing interrupt
++ * code.
++ */
++static inline void
++irq_bh_save(void)
++{
++	saved_irq_count = irq_count();
++	preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++	preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++	return (0);
++}
++
++
++#define dim(x) (sizeof(x)/sizeof(*(x)))
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ *	This is used for dump methods that require interrupts
++ *	Eventually, all methods will have interrupts disabled
++ *	and this code can be removed.
++ *
++ *	Change irq affinities
++ *	Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++	set_irq_affinity();
++	irq_bh_save();
++	ia64_srlz_d();
++	/*
++	 * reduce the task priority level
++	 * to get disk interrupts
++	 */
++	ia64_setreg(_IA64_REG_CR_TPR, 0);
++	ia64_srlz_d();
++	local_irq_enable();
++	return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++
++ */
++void
++__dump_irq_restore(void)
++{
++	local_irq_disable();
++	reset_irq_affinity();
++	irq_bh_restore();
++}
++
++/*
++ * Name: __dump_page_valid()
++ * Func: Check if page is valid to dump.
++ */
++int
++__dump_page_valid(unsigned long index)
++{
++	if (!pfn_valid(index))
++	{
++		return 0;
++	}
++	return 1;
++}
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process. This is in case
++ *	it's necessary in the future.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++	return;
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ *	case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++	alloc_dha_stack();
++	return;
++}
++
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ *	when the dump module is being removed.
++ */
++void
++__dump_cleanup(void)
++{
++	free_dha_stack();
++
++	return;
++}
++
++
++
++int __dump_memcpy_mc_expected = 0;	/* Doesn't help yet */
++
++/*
++ * An ia64 version of memcpy() that tries to avoid machine checks.
++ *
++ * NB:
++ * By itself __dump_memcpy_mc_expected isn't providing any
++ * protection against Machine Checks. We are looking into the
++ * possibility of adding code to the arch/ia64/kernel/mca.c function
++ * ia64_mca_ucmc_handler() to restore state so that an IA64_MCA_CORRECTED
++ * can be returned to the firmware. Currently it always returns
++ * IA64_MCA_COLD_BOOT and reboots the machine.
++ */
++/*
++void * __dump_memcpy(void * dest, const void *src, size_t count)
++{
++	void *vp;
++
++	if (__dump_memcpy_mc_expected) {
++		ia64_pal_mc_expected((u64) 1, 0);
++	}
++
++	vp = memcpy(dest, src, count);
++
++	if (__dump_memcpy_mc_expected) {
++		ia64_pal_mc_expected((u64) 0, 0);
++	}
++	return(vp);
++}
++*/
++/*
++ * Name: manual_handle_crashdump()
++ * Func: Interface for the lkcd dump command.
Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ unsigned long saved_tpr; ++ unsigned long TPR_MASK = 0xFFFFFFFFFFFEFF0F; ++ ++ ++ /* Get the processors task priority register */ ++ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); ++ /* clear the mmi and mic bit's of the TPR to unmask interrupts */ ++ saved_tpr = saved_tpr & TPR_MASK; ++ ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); ++ ia64_srlz_d(); ++ ++ /* Tell the processor we're done with the interrupt ++ * that got us here. ++ */ ++ ++ ia64_eoi(); ++ ++ /* local implementation of irq_exit(); */ ++ preempt_count() -= IRQ_EXIT_OFFSET; ++ preempt_enable_no_resched(); ++ ++ return; ++} ++ +Index: linux-2.6.10/drivers/dump/dump_rle.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_rle.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_rle.c 2005-04-05 16:47:53.935206320 +0800 +@@ -0,0 +1,176 @@ ++/* ++ * RLE Compression functions for kernel crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Copyright 2001 Matt D. Robinson. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* header files */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Name: dump_compress_rle() ++ * Func: Compress a DUMP_PAGE_SIZE (hardware) page down to something more ++ * reasonable, if possible. This is the same routine we use in IRIX. ++ */ ++static u32 ++dump_compress_rle(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ u16 ri, wi, count = 0; ++ u_char value = 0, cur_byte; ++ ++ /* ++ * If the block should happen to "compress" to larger than the ++ * buffer size, allocate a larger one and change cur_buf_size. 
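++ *
++ * Output format, as produced below: a run of identical bytes is escaped
++ * with a zero marker and emitted as the triple <0, count, value>, where
++ * count is the run length minus one -- e.g. seven 0x41 bytes become the
++ * three bytes 0x00 0x06 0x41.  Because 0x00 doubles as the escape marker,
++ * literal zeros get special treatment (<0, 1, 0> for a pair, a doubled
++ * byte for a single zero), and a pair of equal nonzero bytes is cheaper
++ * emitted literally than as a triple.  Any other lone byte passes through
++ * unchanged; a block that would grow instead of shrink is handed back
++ * uncompressed (the early "return oldsize" exits).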
++ */ ++ ++ wi = ri = 0; ++ ++ while (ri < oldsize) { ++ if (!ri) { ++ cur_byte = value = old[ri]; ++ count = 0; ++ } else { ++ if (count == 255) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ value = cur_byte = old[ri]; ++ count = 0; ++ } else { ++ if ((cur_byte = old[ri]) == value) { ++ count++; ++ } else { ++ if (count > 1) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ } else if (count == 1) { ++ if (value == 0) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = 1; ++ new[wi++] = 0; ++ } else { ++ if (wi + 2 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ new[wi++] = value; ++ } ++ } else { /* count == 0 */ ++ if (value == 0) { ++ if (wi + 2 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ new[wi++] = value; ++ } else { ++ if (wi + 1 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = value; ++ } ++ } /* if count > 1 */ ++ ++ value = cur_byte; ++ count = 0; ++ ++ } /* if byte == value */ ++ ++ } /* if count == 255 */ ++ ++ } /* if ri == 0 */ ++ ri++; ++ ++ } ++ if (count > 1) { ++ if (wi + 3 > oldsize) { ++ return oldsize; ++ } ++ new[wi++] = 0; ++ new[wi++] = count; ++ new[wi++] = value; ++ } else if (count == 1) { ++ if (value == 0) { ++ if (wi + 3 > oldsize) ++ return oldsize; ++ new[wi++] = 0; ++ new[wi++] = 1; ++ new[wi++] = 0; ++ } else { ++ if (wi + 2 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ new[wi++] = value; ++ } ++ } else { /* count == 0 */ ++ if (value == 0) { ++ if (wi + 2 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ new[wi++] = value; ++ } else { ++ if (wi + 1 > oldsize) ++ return oldsize; ++ new[wi++] = value; ++ } ++ } /* if count > 1 */ ++ ++ value = cur_byte; ++ count = 0; ++ return wi; ++} ++ ++/* setup the rle compression functionality */ ++static struct __dump_compress dump_rle_compression = { ++ .compress_type = DUMP_COMPRESS_RLE, ++ .compress_func = dump_compress_rle, ++ .compress_name = "RLE", ++}; ++ ++/* ++ * Name: dump_compress_rle_init() ++ * Func: Initialize rle compression for dumping. ++ */ ++static int __init ++dump_compress_rle_init(void) ++{ ++ dump_register_compression(&dump_rle_compression); ++ return 0; ++} ++ ++/* ++ * Name: dump_compress_rle_cleanup() ++ * Func: Remove rle compression for dumping. ++ */ ++static void __exit ++dump_compress_rle_cleanup(void) ++{ ++ dump_unregister_compression(DUMP_COMPRESS_RLE); ++} ++ ++/* module initialization */ ++module_init(dump_compress_rle_init); ++module_exit(dump_compress_rle_cleanup); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("RLE compression module for crash dump driver"); +Index: linux-2.6.10/drivers/dump/dump_execute.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_execute.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_execute.c 2005-04-05 16:47:53.943205104 +0800 +@@ -0,0 +1,144 @@ ++/* ++ * The file has the common/generic dump execution code ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote high level dump execute code to make use ++ * of dump method interfaces. ++ * ++ * Derived from original code in dump_base.c created by ++ * Matt Robinson ) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. 
++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * Assumes dumper and dump config settings are in place ++ * (invokes corresponding dumper specific routines as applicable) ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++struct notifier_block *dump_notifier_list; /* dump started/ended callback */ ++ ++extern int panic_timeout; ++ ++/* Dump progress indicator */ ++void ++dump_speedo(int i) ++{ ++ static const char twiddle[4] = { '|', '\\', '-', '/' }; ++ printk("%c\b", twiddle[i&3]); ++} ++ ++/* Make the device ready and write out the header */ ++int dump_begin(void) ++{ ++ int err = 0; ++ ++ /* dump_dev = dump_config.dumper->dev; */ ++ dumper_reset(); ++ if ((err = dump_dev_silence())) { ++ /* quiesce failed, can't risk continuing */ ++ /* Todo/Future: switch to alternate dump scheme if possible */ ++ printk("dump silence dev failed ! error %d\n", err); ++ return err; ++ } ++ ++ pr_debug("Writing dump header\n"); ++ if ((err = dump_update_header())) { ++ printk("dump update header failed ! error %d\n", err); ++ dump_dev_resume(); ++ return err; ++ } ++ ++ dump_config.dumper->curr_offset = DUMP_BUFFER_SIZE; ++ ++ return 0; ++} ++ ++/* ++ * Write the dump terminator, a final header update and let go of ++ * exclusive use of the device for dump. ++ */ ++int dump_complete(void) ++{ ++ int ret = 0; ++ ++ if (dump_config.level != DUMP_LEVEL_HEADER) { ++ if ((ret = dump_update_end_marker())) { ++ printk("dump update end marker error %d\n", ret); ++ } ++ if ((ret = dump_update_header())) { ++ printk("dump update header error %d\n", ret); ++ } ++ } ++ ret = dump_dev_resume(); ++ ++ if ((panic_timeout > 0) && (!(dump_config.flags & (DUMP_FLAGS_SOFTBOOT | DUMP_FLAGS_NONDISRUPT)))) { ++ mdelay(panic_timeout * 1000); ++ machine_restart(NULL); ++ } ++ ++ return ret; ++} ++ ++/* Saves all dump data */ ++int dump_execute_savedump(void) ++{ ++ int ret = 0, err = 0; ++ ++ if ((ret = dump_begin())) { ++ return ret; ++ } ++ ++ if (dump_config.level != DUMP_LEVEL_HEADER) { ++ ret = dump_sequencer(); ++ } ++ if ((err = dump_complete())) { ++ printk("Dump complete failed. Error %d\n", err); ++ } ++ ++ return ret; ++} ++ ++extern void dump_calc_bootmap_pages(void); ++ ++/* Does all the real work: Capture and save state */ ++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs) ++{ ++ int ret = 0; ++ ++#ifdef CONFIG_DISCONTIGMEM ++ printk(KERN_INFO "Reconfiguring memory bank information....\n"); ++ printk(KERN_INFO "This may take a while....\n"); ++ dump_reconfigure_mbanks(); ++#endif ++ ++ if ((ret = dump_configure_header(panic_str, regs))) { ++ printk("dump config header failed ! 
error %d\n", ret); ++ return ret; ++ } ++ ++ dump_calc_bootmap_pages(); ++ /* tell interested parties that a dump is about to start */ ++ notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, ++ &dump_config.dump_device); ++ ++ if (dump_config.level != DUMP_LEVEL_NONE) ++ ret = dump_execute_savedump(); ++ ++ pr_debug("dumped %ld blocks of %d bytes each\n", ++ dump_config.dumper->count, DUMP_BUFFER_SIZE); ++ ++ /* tell interested parties that a dump has completed */ ++ notifier_call_chain(&dump_notifier_list, DUMP_END, ++ &dump_config.dump_device); ++ ++ return ret; ++} +Index: linux-2.6.10/drivers/dump/dump_netdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_netdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_netdev.c 2005-04-05 16:47:53.936206168 +0800 +@@ -0,0 +1,566 @@ ++/* ++ * Implements the dump driver interface for saving a dump via network ++ * interface. ++ * ++ * Some of this code has been taken/adapted from Ingo Molnar's netconsole ++ * code. LKCD team expresses its thanks to Ingo. ++ * ++ * Started: June 2002 - Mohamed Abbas ++ * Adapted netconsole code to implement LKCD dump over the network. ++ * ++ * Nov 2002 - Bharata B. Rao ++ * Innumerable code cleanups, simplification and some fixes. ++ * Netdump configuration done by ioctl instead of using module parameters. ++ * Oct 2003 - Prasanna S Panchamukhi ++ * Netdump code modified to use Netpoll API's. ++ * ++ * Copyright (C) 2001 Ingo Molnar ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static int startup_handshake; ++static int page_counter; ++static unsigned long flags_global; ++static int netdump_in_progress; ++ ++/* ++ * security depends on the trusted path between the netconsole ++ * server and netconsole client, since none of the packets are ++ * encrypted. The random magic number protects the protocol ++ * against spoofing. ++ */ ++static u64 dump_magic; ++ ++/* ++ * We maintain a small pool of fully-sized skbs, ++ * to make sure the message gets out even in ++ * extreme OOM situations. ++ */ ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int size); ++int new_req = 0; ++static req_t req; ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int size) ++{ ++ req_t * __req = (req_t *) msg; ++ /* ++ * First check if were are dumping or doing startup handshake, if ++ * not quickly return. ++ */ ++ ++ if (!netdump_in_progress) ++ return ; ++ ++ if ((ntohl(__req->command) != COMM_GET_MAGIC) && ++ (ntohl(__req->command) != COMM_HELLO) && ++ (ntohl(__req->command) != COMM_START_WRITE_NETDUMP_ACK) && ++ (ntohl(__req->command) != COMM_START_NETDUMP_ACK) && ++ (memcmp(&__req->magic, &dump_magic, sizeof(dump_magic)) != 0)) ++ goto out; ++ ++ req.magic = ntohl(__req->magic); ++ req.command = ntohl(__req->command); ++ req.from = ntohl(__req->from); ++ req.to = ntohl(__req->to); ++ req.nr = ntohl(__req->nr); ++ new_req = 1; ++out: ++ return ; ++} ++static char netdump_membuf[1024 + HEADER_LEN + 1]; ++/* ++ * Fill the netdump_membuf with the header information from reply_t structure ++ * and send it down to netpoll_send_udp() routine. 
++ */ ++static void ++netdump_send_packet(struct netpoll *np, reply_t *reply, size_t data_len) { ++ char *b; ++ ++ b = &netdump_membuf[1]; ++ netdump_membuf[0] = NETCONSOLE_VERSION; ++ put_unaligned(htonl(reply->nr), (u32 *) b); ++ put_unaligned(htonl(reply->code), (u32 *) (b + sizeof(reply->code))); ++ put_unaligned(htonl(reply->info), (u32 *) (b + sizeof(reply->code) + ++ sizeof(reply->info))); ++ netpoll_send_udp(np, netdump_membuf, data_len + HEADER_LEN); ++} ++ ++static void ++dump_send_mem(struct netpoll *np, req_t *req, const char* buff, size_t len) ++{ ++ int i; ++ ++ int nr_chunks = len/1024; ++ reply_t reply; ++ ++ reply.nr = req->nr; ++ reply.code = REPLY_MEM; ++ if ( nr_chunks <= 0) ++ nr_chunks = 1; ++ for (i = 0; i < nr_chunks; i++) { ++ unsigned int offset = i*1024; ++ reply.info = offset; ++ memcpy((netdump_membuf + HEADER_LEN), (buff + offset), 1024); ++ netdump_send_packet(np, &reply, 1024); ++ } ++} ++ ++/* ++ * This function waits for the client to acknowledge the receipt ++ * of the netdump startup reply, with the possibility of packets ++ * getting lost. We resend the startup packet if no ACK is received, ++ * after a 1 second delay. ++ * ++ * (The client can test the success of the handshake via the HELLO ++ * command, and send ACKs until we enter netdump mode.) ++ */ ++static int ++dump_handshake(struct dump_dev *net_dev) ++{ ++ reply_t reply; ++ int i, j; ++ size_t str_len; ++ ++ if (startup_handshake) { ++ sprintf((netdump_membuf + HEADER_LEN), ++ "NETDUMP start, waiting for start-ACK.\n"); ++ reply.code = REPLY_START_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ } else { ++ sprintf((netdump_membuf + HEADER_LEN), ++ "NETDUMP start, waiting for start-ACK.\n"); ++ reply.code = REPLY_START_WRITE_NETDUMP; ++ reply.nr = net_dev->curr_offset; ++ reply.info = net_dev->curr_offset; ++ } ++ str_len = strlen(netdump_membuf + HEADER_LEN); ++ ++ /* send 300 handshake packets before declaring failure */ ++ for (i = 0; i < 300; i++) { ++ netdump_send_packet(&net_dev->np, &reply, str_len); ++ ++ /* wait 1 sec */ ++ for (j = 0; j < 10000; j++) { ++ udelay(100); ++ netpoll_poll(&net_dev->np); ++ if (new_req) ++ break; ++ } ++ ++ /* ++ * if there is no new request, try sending the handshaking ++ * packet again ++ */ ++ if (!new_req) ++ continue; ++ ++ /* ++ * check if the new request is of the expected type, ++ * if so, return, else try sending the handshaking ++ * packet again ++ */ ++ if (startup_handshake) { ++ if (req.command == COMM_HELLO || req.command == ++ COMM_START_NETDUMP_ACK) { ++ return 0; ++ } else { ++ new_req = 0; ++ continue; ++ } ++ } else { ++ if (req.command == COMM_SEND_MEM) { ++ return 0; ++ } else { ++ new_req = 0; ++ continue; ++ } ++ } ++ } ++ return -1; ++} ++ ++static ssize_t ++do_netdump(struct dump_dev *net_dev, const char* buff, size_t len) ++{ ++ reply_t reply; ++ ssize_t ret = 0; ++ int repeatCounter, counter, total_loop; ++ size_t str_len; ++ ++ netdump_in_progress = 1; ++ ++ if (dump_handshake(net_dev) < 0) { ++ printk("network dump failed due to handshake failure\n"); ++ goto out; ++ } ++ ++ /* ++ * Ideally startup handshake should be done during dump configuration, ++ * i.e., in dump_net_open(). This will be done when I figure out ++ * the dependency between startup handshake, subsequent write and ++ * various commands wrt to net-server. 
++ */
++	if (startup_handshake)
++		startup_handshake = 0;
++
++	counter = 0;
++	repeatCounter = 0;
++	total_loop = 0;
++	while (1) {
++		if (!new_req) {
++			netpoll_poll(&net_dev->np);
++		}
++		if (!new_req) {
++			repeatCounter++;
++
++			if (repeatCounter > 5) {
++				counter++;
++				if (counter > 10000) {
++					if (total_loop >= 100000) {
++						printk("Timed out waiting for "
++							"the client, giving up\n");
++						goto out;
++					} else {
++						total_loop++;
++						printk("Try number %d out of "
++							"100000 before timing out\n",
++							total_loop);
++					}
++				}
++				mdelay(1);
++				repeatCounter = 0;
++			}
++			continue;
++		}
++		repeatCounter = 0;
++		counter = 0;
++		total_loop = 0;
++		new_req = 0;
++		switch (req.command) {
++		case COMM_NONE:
++			break;
++
++		case COMM_SEND_MEM:
++			dump_send_mem(&net_dev->np, &req, buff, len);
++			break;
++
++		case COMM_EXIT:
++		case COMM_START_WRITE_NETDUMP_ACK:
++			ret = len;
++			goto out;
++
++		case COMM_HELLO:
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Hello, this is netdump version " "0.%02d\n",
++				NETCONSOLE_VERSION);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			reply.code = REPLY_HELLO;
++			reply.nr = req.nr;
++			reply.info = net_dev->curr_offset;
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_PAGE_SIZE:
++			sprintf((netdump_membuf + HEADER_LEN),
++				"PAGE_SIZE: %ld\n", PAGE_SIZE);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			reply.code = REPLY_PAGE_SIZE;
++			reply.nr = req.nr;
++			reply.info = PAGE_SIZE;
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_NR_PAGES:
++			reply.code = REPLY_NR_PAGES;
++			reply.nr = req.nr;
++			reply.info = num_physpages;
++			reply.info = page_counter;
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Number of pages: %ld\n", num_physpages);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		case COMM_GET_MAGIC:
++			reply.code = REPLY_MAGIC;
++			reply.nr = req.nr;
++			reply.info = NETCONSOLE_VERSION;
++			/* the magic is raw binary data, not a format string,
++			 * so copy it out verbatim instead of sprintf()ing it */
++			memcpy((netdump_membuf + HEADER_LEN),
++				(char *)&dump_magic, sizeof(dump_magic));
++			str_len = sizeof(dump_magic);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++
++		default:
++			reply.code = REPLY_ERROR;
++			reply.nr = req.nr;
++			reply.info = req.command;
++			sprintf((netdump_membuf + HEADER_LEN),
++				"Got unknown command code %d!\n", req.command);
++			str_len = strlen(netdump_membuf + HEADER_LEN);
++			netdump_send_packet(&net_dev->np, &reply, str_len);
++			break;
++		}
++	}
++out:
++	netdump_in_progress = 0;
++	return ret;
++}
++
++static int
++dump_validate_config(struct netpoll *np)
++{
++	if (!np->local_ip) {
++		printk("network device %s has no local address, "
++			"aborting.\n", np->name);
++		return -1;
++	}
++
++#define IP(x) ((unsigned char *)&np->local_ip)[x]
++	printk("Source %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++	if (!np->local_port) {
++		printk("source_port parameter not specified, aborting.\n");
++		return -1;
++	}
++
++	if (!np->remote_ip) {
++		printk("target_ip parameter not specified, aborting.\n");
++		return -1;
++	}
++
++	np->remote_ip = ntohl(np->remote_ip);
++#define IP(x) ((unsigned char *)&np->remote_ip)[x]
++	printk("Target %d.%d.%d.%d", IP(0), IP(1), IP(2), IP(3));
++#undef IP
++
++	if (!np->remote_port) {
++		printk("target_port parameter not specified, aborting.\n");
++		return -1;
++	}
++	printk("Target Ethernet Address %02x:%02x:%02x:%02x:%02x:%02x",
++		np->remote_mac[0], np->remote_mac[1], np->remote_mac[2],
++		np->remote_mac[3], np->remote_mac[4], np->remote_mac[5]);
++
++	if
((np->remote_mac[0] & np->remote_mac[1] & np->remote_mac[2] & ++ np->remote_mac[3] & np->remote_mac[4] & np->remote_mac[5]) == 255) ++ printk("(Broadcast)"); ++ printk("\n"); ++ return 0; ++} ++ ++/* ++ * Prepares the dump device so we can take a dump later. ++ * Validates the netdump configuration parameters. ++ * ++ * TODO: Network connectivity check should be done here. ++ */ ++static int ++dump_net_open(struct dump_dev *net_dev, unsigned long arg) ++{ ++ int retval = 0; ++ ++ /* get the interface name */ ++ if (copy_from_user(net_dev->np.dev_name, (void *)arg, IFNAMSIZ)) ++ return -EFAULT; ++ net_dev->np.rx_hook = rx_hook; ++ retval = netpoll_setup(&net_dev->np); ++ ++ dump_validate_config(&net_dev->np); ++ net_dev->curr_offset = 0; ++ printk("Network device %s successfully configured for dumping\n", ++ net_dev->np.dev_name); ++ return retval; ++} ++ ++/* ++ * Close the dump device and release associated resources ++ * Invoked when unconfiguring the dump device. ++ */ ++static int ++dump_net_release(struct dump_dev *net_dev) ++{ ++ netpoll_cleanup(&net_dev->np); ++ return 0; ++} ++ ++/* ++ * Prepare the dump device for use (silence any ongoing activity ++ * and quiesce state) when the system crashes. ++ */ ++static int ++dump_net_silence(struct dump_dev *net_dev) ++{ ++ netpoll_set_trap(1); ++ local_irq_save(flags_global); ++ startup_handshake = 1; ++ net_dev->curr_offset = 0; ++ printk("Dumping to network device %s on CPU %d ...\n", net_dev->np.name, ++ smp_processor_id()); ++ return 0; ++} ++ ++/* ++ * Invoked when dumping is done. This is the time to put things back ++ * (i.e. undo the effects of dump_block_silence) so the device is ++ * available for normal use. ++ */ ++static int ++dump_net_resume(struct dump_dev *net_dev) ++{ ++ int indx; ++ size_t str_len; ++ reply_t reply; ++ ++ sprintf((netdump_membuf + HEADER_LEN), "NETDUMP end.\n"); ++ str_len = strlen(netdump_membuf + HEADER_LEN); ++ for( indx = 0; indx < 6; indx++) { ++ reply.code = REPLY_END_NETDUMP; ++ reply.nr = 0; ++ reply.info = 0; ++ netdump_send_packet(&net_dev->np, &reply, str_len); ++ } ++ printk("NETDUMP END!\n"); ++ local_irq_restore(flags_global); ++ netpoll_set_trap(0); ++ startup_handshake = 0; ++ return 0; ++} ++ ++/* ++ * Seek to the specified offset in the dump device. ++ * Makes sure this is a valid offset, otherwise returns an error. 
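++ * (In the network case there is nothing to validate against: the offset
++ * is only recorded in curr_offset and echoed back in the replies, and the
++ * client reassembles the dump by offset, so the routine below can always
++ * succeed.)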
++ */ ++static int ++dump_net_seek(struct dump_dev *net_dev, loff_t off) ++{ ++ net_dev->curr_offset = off; ++ return 0; ++} ++ ++/* ++ * ++ */ ++static int ++dump_net_write(struct dump_dev *net_dev, void *buf, unsigned long len) ++{ ++ int cnt, i, off; ++ ssize_t ret; ++ ++ cnt = len/ PAGE_SIZE; ++ ++ for (i = 0; i < cnt; i++) { ++ off = i* PAGE_SIZE; ++ ret = do_netdump(net_dev, buf+off, PAGE_SIZE); ++ if (ret <= 0) ++ return -1; ++ net_dev->curr_offset = net_dev->curr_offset + PAGE_SIZE; ++ } ++ return len; ++} ++ ++/* ++ * check if the last dump i/o is over and ready for next request ++ */ ++static int ++dump_net_ready(struct dump_dev *net_dev, void *buf) ++{ ++ return 0; ++} ++ ++/* ++ * ioctl function used for configuring network dump ++ */ ++static int ++dump_net_ioctl(struct dump_dev *net_dev, unsigned int cmd, unsigned long arg) ++{ ++ switch (cmd) { ++ case DIOSTARGETIP: ++ net_dev->np.remote_ip= arg; ++ break; ++ case DIOSTARGETPORT: ++ net_dev->np.remote_port = (u16)arg; ++ break; ++ case DIOSSOURCEPORT: ++ net_dev->np.local_port = (u16)arg; ++ break; ++ case DIOSETHADDR: ++ return copy_from_user(net_dev->np.remote_mac, (void *)arg, 6); ++ break; ++ case DIOGTARGETIP: ++ case DIOGTARGETPORT: ++ case DIOGSOURCEPORT: ++ case DIOGETHADDR: ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++struct dump_dev_ops dump_netdev_ops = { ++ .open = dump_net_open, ++ .release = dump_net_release, ++ .silence = dump_net_silence, ++ .resume = dump_net_resume, ++ .seek = dump_net_seek, ++ .write = dump_net_write, ++ /* .read not implemented */ ++ .ready = dump_net_ready, ++ .ioctl = dump_net_ioctl ++}; ++ ++static struct dump_dev default_dump_netdev = { ++ .type_name = "networkdev", ++ .ops = &dump_netdev_ops, ++ .curr_offset = 0, ++ .np.name = "netdump", ++ .np.dev_name = "eth0", ++ .np.rx_hook = rx_hook, ++ .np.local_port = 6688, ++ .np.remote_port = 6688, ++ .np.remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, ++}; ++ ++static int __init ++dump_netdev_init(void) ++{ ++ default_dump_netdev.curr_offset = 0; ++ ++ if (dump_register_device(&default_dump_netdev) < 0) { ++ printk("network dump device driver registration failed\n"); ++ return -1; ++ } ++ printk("network device driver for LKCD registered\n"); ++ ++ get_random_bytes(&dump_magic, sizeof(dump_magic)); ++ return 0; ++} ++ ++static void __exit ++dump_netdev_cleanup(void) ++{ ++ dump_unregister_device(&default_dump_netdev); ++} ++ ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Network Dump Driver for Linux Kernel Crash Dump (LKCD)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_netdev_init); ++module_exit(dump_netdev_cleanup); +Index: linux-2.6.10/drivers/dump/dump_x8664.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_x8664.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_x8664.c 2005-04-05 16:47:53.932206776 +0800 +@@ -0,0 +1,362 @@ ++/* ++ * Architecture specific (x86-64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) ++ * Copyright 2000 TurboLinux, Inc. All rights reserved. ++ * ++ * x86-64 port Copyright 2002 Andi Kleen, SuSE Labs ++ * x86-64 port Sachin Sant ( sachinp@in.ibm.com ) ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static __s32 saved_irq_count; /* saved preempt_count() flag */ ++ ++void (*dump_trace_ptr)(struct pt_regs *); ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ return 0; ++ ++ ptr = vmalloc(THREAD_SIZE * num_online_cpus()); ++ if (!ptr) { ++ printk("vmalloc for dha_stacks failed\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = ++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) { ++ vfree((void *)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++ ++void ++__dump_save_regs(struct pt_regs* dest_regs, const struct pt_regs* regs) ++{ ++ if (regs) ++ memcpy(dest_regs, regs, sizeof(struct pt_regs)); ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs); ++ ++ /* take a snapshot of the stack */ ++ /* doing this enables us to tolerate slight drifts on this cpu */ ++ ++ if (dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info); ++} ++ ++#ifdef CONFIG_SMP ++extern cpumask_t irq_affinity[]; ++extern irq_desc_t irq_desc[]; ++extern void dump_send_ipi(void); ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++static unsigned long saved_affinity[NR_IRQS]; ++ ++extern void stop_this_cpu(void *); ++ ++static int ++dump_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ if (!dump_expect_ipi[cpu]) { ++ return 0; ++ } ++ ++ dump_expect_ipi[cpu] = 0; ++ ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++level_changed: ++ ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(NULL); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++/* save registers on other processors */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus() - 1; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ ++ for (i = 0; i < NR_CPUS; i++) ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ ++ set_nmi_callback(dump_nmi_callback); ++ wmb(); ++ ++ dump_send_ipi(); ++ ++ /* may be we dont need to wait for NMI to be processed. 
++	   just write out the header at the end of dumping, if
++	   this IPI is not processed until then, there probably
++	   is a problem and we just fail to capture state of
++	   other cpus. */
++	while(atomic_read(&waiting_for_dump_ipi) > 0)
++		cpu_relax();
++
++	unset_nmi_callback();
++	}
++	return;
++}
++
++/*
++ * Routine to save the old irq affinities and change affinities of all irqs to
++ * the dumping cpu.
++ */
++static void
++set_irq_affinity(void)
++{
++	int i;
++	cpumask_t cpu = CPU_MASK_NONE;
++
++	cpu_set(smp_processor_id(), cpu);
++	memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		irq_affinity[i] = cpu;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
++	}
++}
++
++/*
++ * Restore old irq affinities.
++ */
++static void
++reset_irq_affinity(void)
++{
++	int i;
++
++	memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long));
++	for (i = 0; i < NR_IRQS; i++) {
++		if (irq_desc[i].handler == NULL)
++			continue;
++		if (irq_desc[i].handler->set_affinity != NULL)
++			irq_desc[i].handler->set_affinity(i, saved_affinity[i]);
++	}
++}
++
++#else /* !CONFIG_SMP */
++#define set_irq_affinity()	do { } while (0)
++#define reset_irq_affinity()	do { } while (0)
++#define save_other_cpu_states()	do { } while (0)
++#endif /* !CONFIG_SMP */
++
++static inline void
++irq_bh_save(void)
++{
++	saved_irq_count = irq_count();
++	preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK);
++}
++
++static inline void
++irq_bh_restore(void)
++{
++	preempt_count() |= saved_irq_count;
++}
++
++/*
++ * Name: __dump_irq_enable
++ * Func: Reset system so interrupts are enabled.
++ *	This is used for dump methods that require interrupts
++ *	Eventually, all methods will have interrupts disabled
++ *	and this code can be removed.
++ *
++ *	Change irq affinities
++ *	Re-enable interrupts
++ */
++int
++__dump_irq_enable(void)
++{
++	set_irq_affinity();
++	irq_bh_save();
++	local_irq_enable();
++	return 0;
++}
++
++/*
++ * Name: __dump_irq_restore
++ * Func: Resume the system state in an architecture-specific way.
++ *
++ */
++void
++__dump_irq_restore(void)
++{
++	local_irq_disable();
++	reset_irq_affinity();
++	irq_bh_restore();
++}
++
++/*
++ * Name: __dump_configure_header()
++ * Func: Configure the dump header with all proper values.
++ */
++int
++__dump_configure_header(const struct pt_regs *regs)
++{
++	/* Dummy function - return */
++	return (0);
++}
++
++static int notify(struct notifier_block *nb, unsigned long code, void *data)
++{
++	if (code == DIE_NMI_IPI && dump_oncpu)
++		return NOTIFY_BAD;
++	return NOTIFY_DONE;
++}
++
++static struct notifier_block dump_notifier = {
++	.notifier_call = notify,
++};
++
++/*
++ * Name: __dump_init()
++ * Func: Initialize the dumping routine process.
++ */
++void
++__dump_init(uint64_t local_memory_start)
++{
++	notifier_chain_register(&die_chain, &dump_notifier);
++}
++
++/*
++ * Name: __dump_open()
++ * Func: Open the dump device (architecture specific). This is in
++ *	case it's necessary in the future.
++ */
++void
++__dump_open(void)
++{
++	alloc_dha_stack();
++	/* return */
++	return;
++}
++
++/*
++ * Name: __dump_cleanup()
++ * Func: Free any architecture specific data structures. This is called
++ *	when the dump module is being removed.
++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++ notifier_chain_unregister(&die_chain, &dump_notifier); ++ synchronize_kernel(); ++ return; ++} ++ ++extern int page_is_ram(unsigned long); ++ ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return page_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) { ++ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_overlay.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_overlay.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_overlay.c 2005-04-05 16:47:53.934206472 +0800 +@@ -0,0 +1,890 @@ ++/* ++ * Two-stage soft-boot based dump scheme methods (memory overlay ++ * with post soft-boot writeout) ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * ++ * This approach of saving the dump in memory and writing it ++ * out after a softboot without clearing memory is derived from the ++ * Mission Critical Linux dump implementation. Credits and a big ++ * thanks for letting the lkcd project make use of the excellent ++ * piece of work and also for helping with clarifications and ++ * tips along the way are due to: ++ * Dave Winchell (primary author of mcore) ++ * and also to ++ * Jeff Moyer ++ * Josh Huber ++ * ++ * For those familiar with the mcore implementation, the key ++ * differences/extensions here are in allowing entire memory to be ++ * saved (in compressed form) through a careful ordering scheme ++ * on both the way down as well on the way up after boot, the latter ++ * for supporting the LKCD notion of passes in which most critical ++ * data is the first to be saved to the dump device. Also the post ++ * boot writeout happens from within the kernel rather than driven ++ * from userspace. ++ * ++ * The sequence is orchestrated through the abstraction of "dumpers", ++ * one for the first stage which then sets up the dumper for the next ++ * stage, providing for a smooth and flexible reuse of the singlestage ++ * dump scheme methods and a handle to pass dump device configuration ++ * information across the soft boot. ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * Disruptive dumping using the second kernel soft-boot option ++ * for issuing dump i/o operates in 2 stages: ++ * ++ * (1) - Saves the (compressed & formatted) dump in memory using a ++ * carefully ordered overlay scheme designed to capture the ++ * entire physical memory or selective portions depending on ++ * dump config settings, ++ * - Registers the stage 2 dumper and ++ * - Issues a soft reboot w/o clearing memory. 
++ * ++ * The overlay scheme starts with a small bootstrap free area ++ * and follows a reverse ordering of passes wherein it ++ * compresses and saves data starting with the least critical ++ * areas first, thus freeing up the corresponding pages to ++ * serve as destination for subsequent data to be saved, and ++ * so on. With a good compression ratio, this makes it feasible ++ * to capture an entire physical memory dump without significantly ++ * reducing memory available during regular operation. ++ * ++ * (2) Post soft-reboot, runs through the saved memory dump and ++ * writes it out to disk, this time around, taking care to ++ * save the more critical data first (i.e. pages which figure ++ * in early passes for a regular dump). Finally issues a ++ * clean reboot. ++ * ++ * Since the data was saved in memory after selection/filtering ++ * and formatted as per the chosen output dump format, at this ++ * stage the filter and format actions are just dummy (or ++ * passthrough) actions, except for influence on ordering of ++ * passes. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KEXEC ++#include ++#include ++#include ++#endif ++#include "dump_methods.h" ++ ++extern struct list_head dumper_list_head; ++extern struct dump_memdev *dump_memdev; ++extern struct dumper dumper_stage2; ++struct dump_config_block *dump_saved_config = NULL; ++extern struct dump_blockdev *dump_blockdev; ++static struct dump_memdev *saved_dump_memdev = NULL; ++static struct dumper *saved_dumper = NULL; ++ ++#ifdef CONFIG_KEXEC ++extern int panic_timeout; ++#endif ++ ++/* For testing ++extern void dump_display_map(struct dump_memdev *); ++*/ ++ ++struct dumper *dumper_by_name(char *name) ++{ ++#ifdef LATER ++ struct dumper *dumper; ++ list_for_each_entry(dumper, &dumper_list_head, dumper_list) ++ if (!strncmp(dumper->name, name, 32)) ++ return dumper; ++ ++ /* not found */ ++ return NULL; ++#endif ++ /* Temporary proof of concept */ ++ if (!strncmp(dumper_stage2.name, name, 32)) ++ return &dumper_stage2; ++ else ++ return NULL; ++} ++ ++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT ++extern void dump_early_reserve_map(struct dump_memdev *); ++ ++void crashdump_reserve(void) ++{ ++ extern unsigned long crashdump_addr; ++ ++ if (crashdump_addr == 0xdeadbeef) ++ return; ++ ++ /* reserve dump config and saved dump pages */ ++ dump_saved_config = (struct dump_config_block *)crashdump_addr; ++ /* magic verification */ ++ if (dump_saved_config->magic != DUMP_MAGIC_LIVE) { ++ printk("Invalid dump magic. Ignoring dump\n"); ++ dump_saved_config = NULL; ++ return; ++ } ++ ++ printk("Dump may be available from previous boot\n"); ++ ++#ifdef CONFIG_X86_64 ++ reserve_bootmem_node(NODE_DATA(0), ++ virt_to_phys((void *)crashdump_addr), ++ PAGE_ALIGN(sizeof(struct dump_config_block))); ++#else ++ reserve_bootmem(virt_to_phys((void *)crashdump_addr), ++ PAGE_ALIGN(sizeof(struct dump_config_block))); ++#endif ++ dump_early_reserve_map(&dump_saved_config->memdev); ++ ++} ++#endif ++ ++/* ++ * Loads the dump configuration from a memory block saved across soft-boot ++ * The ops vectors need fixing up as the corresp. routines may have ++ * relocated in the new soft-booted kernel. 
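++ * The fixup is done by name: the saved dumper, scheme, filter, format and
++ * device-type names are matched against the current kernel's instances,
++ * and on a match the stale ops/selector pointers in the saved block are
++ * overwritten with the current ones; any mismatch fails with -ENOENT.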
++ */ ++int dump_load_config(struct dump_config_block *config) ++{ ++ struct dumper *dumper; ++ struct dump_data_filter *filter_table, *filter; ++ struct dump_dev *dev; ++ int i; ++ ++ if (config->magic != DUMP_MAGIC_LIVE) ++ return -ENOENT; /* not a valid config */ ++ ++ /* initialize generic config data */ ++ memcpy(&dump_config, &config->config, sizeof(dump_config)); ++ ++ /* initialize dumper state */ ++ if (!(dumper = dumper_by_name(config->dumper.name))) { ++ printk("dumper name mismatch\n"); ++ return -ENOENT; /* dumper mismatch */ ++ } ++ ++ /* verify and fixup schema */ ++ if (strncmp(dumper->scheme->name, config->scheme.name, 32)) { ++ printk("dumper scheme mismatch\n"); ++ return -ENOENT; /* mismatch */ ++ } ++ config->scheme.ops = dumper->scheme->ops; ++ config->dumper.scheme = &config->scheme; ++ ++ /* verify and fixup filter operations */ ++ filter_table = dumper->filter; ++ for (i = 0, filter = config->filter_table; ++ ((i < MAX_PASSES) && filter_table[i].selector); ++ i++, filter++) { ++ if (strncmp(filter_table[i].name, filter->name, 32)) { ++ printk("dump filter mismatch\n"); ++ return -ENOENT; /* filter name mismatch */ ++ } ++ filter->selector = filter_table[i].selector; ++ } ++ config->dumper.filter = config->filter_table; ++ ++ /* fixup format */ ++ if (strncmp(dumper->fmt->name, config->fmt.name, 32)) { ++ printk("dump format mismatch\n"); ++ return -ENOENT; /* mismatch */ ++ } ++ config->fmt.ops = dumper->fmt->ops; ++ config->dumper.fmt = &config->fmt; ++ ++ /* fixup target device */ ++ dev = (struct dump_dev *)(&config->dev[0]); ++ if (dumper->dev == NULL) { ++ pr_debug("Vanilla dumper - assume default\n"); ++ if (dump_dev == NULL) ++ return -ENODEV; ++ dumper->dev = dump_dev; ++ } ++ ++ if (strncmp(dumper->dev->type_name, dev->type_name, 32)) { ++ printk("dump dev type mismatch %s instead of %s\n", ++ dev->type_name, dumper->dev->type_name); ++ return -ENOENT; /* mismatch */ ++ } ++ dev->ops = dumper->dev->ops; ++ config->dumper.dev = dev; ++ ++ /* fixup memory device containing saved dump pages */ ++ /* assume statically init'ed dump_memdev */ ++ config->memdev.ddev.ops = dump_memdev->ddev.ops; ++ /* switch to memdev from prev boot */ ++ saved_dump_memdev = dump_memdev; /* remember current */ ++ dump_memdev = &config->memdev; ++ ++ /* Make this the current primary dumper */ ++ dump_config.dumper = &config->dumper; ++ ++ return 0; ++} ++ ++/* Saves the dump configuration in a memory block for use across a soft-boot */ ++int dump_save_config(struct dump_config_block *config) ++{ ++ printk("saving dump config settings\n"); ++ ++ /* dump config settings */ ++ memcpy(&config->config, &dump_config, sizeof(dump_config)); ++ ++ /* dumper state */ ++ memcpy(&config->dumper, dump_config.dumper, sizeof(struct dumper)); ++ memcpy(&config->scheme, dump_config.dumper->scheme, ++ sizeof(struct dump_scheme)); ++ memcpy(&config->fmt, dump_config.dumper->fmt, sizeof(struct dump_fmt)); ++ memcpy(&config->dev[0], dump_config.dumper->dev, ++ sizeof(struct dump_anydev)); ++ memcpy(&config->filter_table, dump_config.dumper->filter, ++ sizeof(struct dump_data_filter)*MAX_PASSES); ++ ++ /* handle to saved mem pages */ ++ memcpy(&config->memdev, dump_memdev, sizeof(struct dump_memdev)); ++ ++ config->magic = DUMP_MAGIC_LIVE; ++ ++ return 0; ++} ++ ++int dump_init_stage2(struct dump_config_block *saved_config) ++{ ++ int err = 0; ++ ++ pr_debug("dump_init_stage2\n"); ++ /* Check if dump from previous boot exists */ ++ if (saved_config) { ++ printk("loading dumper from previous boot \n"); 
++ /* load and configure dumper from previous boot */ ++ if ((err = dump_load_config(saved_config))) ++ return err; ++ ++ if (!dump_oncpu) { ++ if ((err = dump_configure(dump_config.dump_device))) { ++ printk("Stage 2 dump configure failed\n"); ++ return err; ++ } ++ } ++ ++ dumper_reset(); ++ dump_dev = dump_config.dumper->dev; ++ /* write out the dump */ ++ err = dump_generic_execute(NULL, NULL); ++ ++ dump_saved_config = NULL; ++ ++ if (!dump_oncpu) { ++ dump_unconfigure(); ++ } ++ ++ return err; ++ ++ } else { ++ /* no dump to write out */ ++ printk("no dumper from previous boot \n"); ++ return 0; ++ } ++} ++ ++extern void dump_mem_markpages(struct dump_memdev *); ++ ++int dump_switchover_stage(void) ++{ ++ int ret = 0; ++ ++ /* trigger stage 2 rightaway - in real life would be after soft-boot */ ++ /* dump_saved_config would be a boot param */ ++ saved_dump_memdev = dump_memdev; ++ saved_dumper = dump_config.dumper; ++ ret = dump_init_stage2(dump_saved_config); ++ dump_memdev = saved_dump_memdev; ++ dump_config.dumper = saved_dumper; ++ return ret; ++} ++ ++int dump_activate_softboot(void) ++{ ++ int err = 0; ++#ifdef CONFIG_KEXEC ++ int num_cpus_online = 0; ++ struct kimage *image; ++#endif ++ ++ /* temporary - switchover to writeout previously saved dump */ ++#ifndef CONFIG_KEXEC ++ err = dump_switchover_stage(); /* non-disruptive case */ ++ if (dump_oncpu) ++ dump_config.dumper = &dumper_stage1; /* set things back */ ++ ++ return err; ++#else ++ ++ dump_silence_level = DUMP_HALT_CPUS; ++ /* wait till we become the only cpu */ ++ /* maybe by checking for online cpus ? */ ++ ++ while((num_cpus_online = num_online_cpus()) > 1); ++ ++ /* now call into kexec */ ++ ++ image = xchg(&kexec_image, 0); ++ if (image) { ++ mdelay(panic_timeout*1000); ++ machine_kexec(image); ++ } ++ ++ ++ /* TBD/Fixme: ++ * * should we call reboot notifiers ? inappropriate for panic ? ++ * * what about device_shutdown() ? ++ * * is explicit bus master disabling needed or can we do that ++ * * through driverfs ? ++ * */ ++ return 0; ++#endif ++} ++ ++/* --- DUMP SCHEME ROUTINES --- */ ++ ++static inline int dump_buf_pending(struct dumper *dumper) ++{ ++ return (dumper->curr_buf - dumper->dump_buf); ++} ++ ++/* Invoked during stage 1 of soft-reboot based dumping */ ++int dump_overlay_sequencer(void) ++{ ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ struct dump_data_filter *filter2 = dumper_stage2.filter; ++ int pass = 0, err = 0, save = 0; ++ int (*action)(unsigned long, unsigned long); ++ ++ /* Make sure gzip compression is being used */ ++ if (dump_config.dumper->compress->compress_type != DUMP_COMPRESS_GZIP) { ++ printk(" Please set GZIP compression \n"); ++ return -EINVAL; ++ } ++ ++ /* start filling in dump data right after the header */ ++ dump_config.dumper->curr_offset = ++ PAGE_ALIGN(dump_config.dumper->header_len); ++ ++ /* Locate the last pass */ ++ for (;filter->selector; filter++, pass++); ++ ++ /* ++ * Start from the end backwards: overlay involves a reverse ++ * ordering of passes, since less critical pages are more ++ * likely to be reusable as scratch space once we are through ++ * with them. 
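++ * (Concretely: if passes 0..N are configured, the loop below walks them
++ * as N, N-1, ..., 0, so the least critical pages are compressed and saved
++ * first and the frames they occupied become scratch space for the more
++ * critical passes that follow; stage 2 then writes the passes out in the
++ * usual most-critical-first order using the offsets recorded in filter2.)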
++ */ ++ for (--pass, --filter; pass >= 0; pass--, filter--) ++ { ++ /* Assumes passes are exclusive (even across dumpers) */ ++ /* Requires care when coding the selection functions */ ++ if ((save = filter->level_mask & dump_config.level)) ++ action = dump_save_data; ++ else ++ action = dump_skip_data; ++ ++ /* Remember the offset where this pass started */ ++ /* The second stage dumper would use this */ ++ if (dump_buf_pending(dump_config.dumper) & (PAGE_SIZE - 1)) { ++ pr_debug("Starting pass %d with pending data\n", pass); ++ pr_debug("filling dummy data to page-align it\n"); ++ dump_config.dumper->curr_buf = (void *)PAGE_ALIGN( ++ (unsigned long)dump_config.dumper->curr_buf); ++ } ++ ++ filter2[pass].start[0] = dump_config.dumper->curr_offset ++ + dump_buf_pending(dump_config.dumper); ++ ++ err = dump_iterator(pass, action, filter); ++ ++ filter2[pass].end[0] = dump_config.dumper->curr_offset ++ + dump_buf_pending(dump_config.dumper); ++ filter2[pass].num_mbanks = 1; ++ ++ if (err < 0) { ++ printk("dump_overlay_seq: failure %d in pass %d\n", ++ err, pass); ++ break; ++ } ++ printk("\n %d overlay pages %s of %d each in pass %d\n", ++ err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass); ++ } ++ ++ return err; ++} ++ ++/* from dump_memdev.c */ ++extern struct page *dump_mem_lookup(struct dump_memdev *dev, unsigned long loc); ++extern struct page *dump_mem_next_page(struct dump_memdev *dev); ++ ++static inline struct page *dump_get_saved_page(loff_t loc) ++{ ++ return (dump_mem_lookup(dump_memdev, loc >> PAGE_SHIFT)); ++} ++ ++static inline struct page *dump_next_saved_page(void) ++{ ++ return (dump_mem_next_page(dump_memdev)); ++} ++ ++/* ++ * Iterates over list of saved dump pages. Invoked during second stage of ++ * soft boot dumping ++ * ++ * Observation: If additional selection is desired at this stage then ++ * a different iterator could be written which would advance ++ * to the next page header everytime instead of blindly picking up ++ * the data. In such a case loc would be interpreted differently. ++ * At this moment however a blind pass seems sufficient, cleaner and ++ * faster. ++ */ ++int dump_saved_data_iterator(int pass, int (*action)(unsigned long, ++ unsigned long), struct dump_data_filter *filter) ++{ ++ loff_t loc, end; ++ struct page *page; ++ unsigned long count = 0; ++ int i, err = 0; ++ unsigned long sz; ++ ++ for (i = 0; i < filter->num_mbanks; i++) { ++ loc = filter->start[i]; ++ end = filter->end[i]; ++ printk("pass %d, start off 0x%llx end offset 0x%llx\n", pass, ++ loc, end); ++ ++ /* loc will get treated as logical offset into stage 1 */ ++ page = dump_get_saved_page(loc); ++ ++ for (; loc < end; loc += PAGE_SIZE) { ++ dump_config.dumper->curr_loc = loc; ++ if (!page) { ++ printk("no more saved data for pass %d\n", ++ pass); ++ break; ++ } ++ sz = (loc + PAGE_SIZE > end) ? end - loc : PAGE_SIZE; ++ ++ if (page && filter->selector(pass, (unsigned long)page, ++ PAGE_SIZE)) { ++ pr_debug("mem offset 0x%llx\n", loc); ++ if ((err = action((unsigned long)page, sz))) ++ break; ++ else ++ count++; ++ /* clear the contents of page */ ++ /* fixme: consider using KM_DUMP instead */ ++ clear_highpage(page); ++ ++ } ++ page = dump_next_saved_page(); ++ } ++ } ++ ++ return err ? 
err : count;
++}
++
++static inline int dump_overlay_pages_done(struct page *page, int nr)
++{
++	int ret = 0;
++
++	for (; nr; page++, nr--) {
++		if (dump_check_and_free_page(dump_memdev, page))
++			ret++;
++	}
++	return ret;
++}
++
++int dump_overlay_save_data(unsigned long loc, unsigned long len)
++{
++	int err = 0;
++	struct page *page = (struct page *)loc;
++	static unsigned long cnt = 0;
++
++	if ((err = dump_generic_save_data(loc, len)))
++		return err;
++
++	if (dump_overlay_pages_done(page, len >> PAGE_SHIFT)) {
++		cnt++;
++		if (!(cnt & 0x7f))
++			pr_debug("released page 0x%lx\n", page_to_pfn(page));
++	}
++
++	return err;
++}
++
++int dump_overlay_skip_data(unsigned long loc, unsigned long len)
++{
++	struct page *page = (struct page *)loc;
++
++	dump_overlay_pages_done(page, len >> PAGE_SHIFT);
++	return 0;
++}
++
++int dump_overlay_resume(void)
++{
++	int err = 0;
++
++	/*
++	 * switch to the stage 2 dumper, save the dump_config_block
++	 * and then trigger a soft-boot
++	 */
++	dumper_stage2.header_len = dump_config.dumper->header_len;
++	dump_config.dumper = &dumper_stage2;
++	if ((err = dump_save_config(dump_saved_config)))
++		return err;
++
++	dump_dev = dump_config.dumper->dev;
++
++#ifdef CONFIG_KEXEC
++	/* If we are doing a disruptive dump, activate softboot now */
++	if ((panic_timeout > 0) &&
++	    (!(dump_config.flags & DUMP_FLAGS_NONDISRUPT)))
++		err = dump_activate_softboot();
++#endif
++
++	return err;
++}
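++
++/*
++ * Overlay flow at a glance (all implemented in this file):
++ *
++ *   dump_overlay_configure() - redirect dump_dev to dump_memdev, so
++ *                              the intermediate dump target is memory
++ *   dump_overlay_sequencer() - run the filter passes in reverse,
++ *                              compressing pages into the memdev and
++ *                              releasing source pages as they save
++ *   dump_overlay_resume()    - switch to dumper_stage2, save the
++ *                              config block and kexec/soft-boot
++ *   dump_init_stage2()       - in the rebooted kernel, reload the
++ *                              config and replay the saved pages out
++ *                              to the real dump device
++ */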
++
++int dump_overlay_configure(unsigned long devid)
++{
++	struct dump_dev *dev;
++	struct dump_config_block *saved_config = dump_saved_config;
++	int err = 0;
++
++	/* If there is a previously saved dump, write it out first */
++	if (saved_config) {
++		printk("Processing old dump pending writeout\n");
++		err = dump_switchover_stage();
++		if (err) {
++			printk("failed to writeout saved dump\n");
++			return err;
++		}
++		dump_free_mem(saved_config); /* testing only: not after boot */
++	}
++
++	dev = dumper_stage2.dev = dump_config.dumper->dev;
++	/* From here on the intermediate dump target is memory-only */
++	dump_dev = dump_config.dumper->dev = &dump_memdev->ddev;
++	if ((err = dump_generic_configure(0))) {
++		printk("dump generic configure failed: err %d\n", err);
++		return err;
++	}
++	/* temporary */
++	dumper_stage2.dump_buf = dump_config.dumper->dump_buf;
++
++	/* Sanity check on the actual target dump device */
++	if (!dev || (err = dev->ops->open(dev, devid))) {
++		return err;
++	}
++	/* TBD: should we release the target if this is soft-boot only ? */
++
++	/* alloc a dump config block area to save across reboot */
++	if (!(dump_saved_config = dump_alloc_mem(sizeof(struct
++		dump_config_block)))) {
++		printk("dump config block alloc failed\n");
++		/* undo configure */
++		dump_generic_unconfigure();
++		return -ENOMEM;
++	}
++	dump_config.dump_addr = (unsigned long)dump_saved_config;
++	printk("Dump config block of size %zu set up at 0x%lx\n",
++		sizeof(*dump_saved_config), (unsigned long)dump_saved_config);
++	return 0;
++}
++
++int dump_overlay_unconfigure(void)
++{
++	struct dump_dev *dev = dumper_stage2.dev;
++	int err = 0;
++
++	pr_debug("dump_overlay_unconfigure\n");
++	/* Close the secondary device */
++	dev->ops->release(dev);
++	pr_debug("released secondary device\n");
++
++	err = dump_generic_unconfigure();
++	pr_debug("Unconfigured generic portions\n");
++	dump_free_mem(dump_saved_config);
++	dump_saved_config = NULL;
++	pr_debug("Freed saved config block\n");
++	dump_dev = dump_config.dumper->dev = dumper_stage2.dev;
++
++	printk("Unconfigured overlay dumper\n");
++	return err;
++}
++
++int dump_staged_unconfigure(void)
++{
++	int err = 0;
++	struct dump_config_block *saved_config = dump_saved_config;
++	struct dump_dev *dev;
++
++	pr_debug("dump_staged_unconfigure\n");
++	err = dump_generic_unconfigure();
++
++	/* now check if there is a saved dump waiting to be written out */
++	if (saved_config) {
++		printk("Processing saved dump pending writeout\n");
++		if ((err = dump_switchover_stage())) {
++			printk("Error in committing saved dump at 0x%lx\n",
++				(unsigned long)saved_config);
++			printk("Old dump may hog memory\n");
++		} else {
++			dump_free_mem(saved_config);
++			pr_debug("Freed saved config block\n");
++		}
++		dump_saved_config = NULL;
++	} else {
++		dev = &dump_memdev->ddev;
++		dev->ops->release(dev);
++	}
++	printk("Unconfigured second stage dumper\n");
++
++	return 0;
++}
++
++/* ----- PASSTHRU FILTER ROUTINE --------- */
++
++/* transparent - passes everything through */
++int dump_passthru_filter(int pass, unsigned long loc, unsigned long sz)
++{
++	return 1;
++}
++
++/* ----- PASSTHRU FORMAT ROUTINES ---- */
++
++int dump_passthru_configure_header(const char *panic_str,
++	const struct pt_regs *regs)
++{
++	dump_config.dumper->header_dirty++;
++	return 0;
++}
++
++/* Copies bytes of data from page(s) to the specified buffer */
++int dump_copy_pages(void *buf, struct page *page, unsigned long sz)
++{
++	unsigned long len = 0, bytes;
++	void *addr;
++
++	while (len < sz) {
++		addr = kmap_atomic(page, KM_DUMP);
++		bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len;
++		memcpy(buf, addr, bytes);
++		kunmap_atomic(addr, KM_DUMP);
++		buf += bytes;
++		len += bytes;
++		page++;
++	}
++	/* memset(dump_config.dumper->curr_buf, 0x57, len); temporary */
++
++	return sz - len;
++}
++
++int dump_passthru_update_header(void)
++{
++	long len = dump_config.dumper->header_len;
++	struct page *page;
++	void *buf = dump_config.dumper->dump_buf;
++	int err = 0;
++
++	if (!dump_config.dumper->header_dirty)
++		return 0;
++
++	pr_debug("Copying header of size %ld bytes from memory\n", len);
++	if (len > DUMP_BUFFER_SIZE)
++		return -E2BIG;
++
++	page = dump_mem_lookup(dump_memdev, 0);
++	for (; (len > 0) && page; buf += PAGE_SIZE, len -= PAGE_SIZE) {
++		if ((err = dump_copy_pages(buf, page, PAGE_SIZE)))
++			return err;
++		page = dump_mem_next_page(dump_memdev);
++	}
++	if (len > 0) {
++		printk("Incomplete header saved in mem\n");
++		return -ENOENT;
++	}
++
++	if ((err = dump_dev_seek(0))) {
++		printk("Unable to seek to dump header offset\n");
++		return err;
++	}
++	err = dump_ll_write(dump_config.dumper->dump_buf,
++		buf - dump_config.dumper->dump_buf);
++	if (err < dump_config.dumper->header_len)
++		return (err < 0) ? err : -ENOSPC;
++
++	dump_config.dumper->header_dirty = 0;
++	return 0;
++}
++
++static loff_t next_dph_offset = 0;
++
++static int dph_valid(struct __dump_page *dph)
++{
++	if ((dph->dp_address & (PAGE_SIZE - 1)) || (dph->dp_flags
++		> DUMP_DH_COMPRESSED) || (!dph->dp_flags) ||
++		(dph->dp_size > PAGE_SIZE)) {
++		printk("dp_address = 0x%llx, dp_size = 0x%x, dp_flags = 0x%x\n",
++			dph->dp_address, dph->dp_size, dph->dp_flags);
++		return 0;
++	}
++	return 1;
++}
++
++int dump_verify_lcrash_data(void *buf, unsigned long sz)
++{
++	struct __dump_page *dph;
++
++	/* sanity check for page headers */
++	while (next_dph_offset + sizeof(*dph) < sz) {
++		dph = (struct __dump_page *)(buf + next_dph_offset);
++		if (!dph_valid(dph)) {
++			printk("Invalid page hdr at offset 0x%llx\n",
++				next_dph_offset);
++			return -EINVAL;
++		}
++		next_dph_offset += dph->dp_size + sizeof(*dph);
++	}
++
++	next_dph_offset -= sz;
++	return 0;
++}
++
++/*
++ * TBD/Later: Consider avoiding the copy by using a scatter/gather
++ * vector representation for the dump buffer
++ */
++int dump_passthru_add_data(unsigned long loc, unsigned long sz)
++{
++	struct page *page = (struct page *)loc;
++	void *buf = dump_config.dumper->curr_buf;
++	int err = 0;
++
++	if ((err = dump_copy_pages(buf, page, sz))) {
++		printk("dump_copy_pages failed\n");
++		return err;
++	}
++
++	if ((err = dump_verify_lcrash_data(buf, sz))) {
++		printk("dump_verify_lcrash_data failed\n");
++		printk("Invalid data for pfn 0x%lx\n", page_to_pfn(page));
++		printk("Page flags 0x%lx\n", page->flags);
++		printk("Page count 0x%x\n", page_count(page));
++		return err;
++	}
++
++	dump_config.dumper->curr_buf = buf + sz;
++
++	return 0;
++}
++
++/* Stage 1 dumper: Saves compressed dump in memory and soft-boots system */
++
++/* Scheme to overlay saved data in memory for writeout after a soft-boot */
++struct dump_scheme_ops dump_scheme_overlay_ops = {
++	.configure	= dump_overlay_configure,
++	.unconfigure	= dump_overlay_unconfigure,
++	.sequencer	= dump_overlay_sequencer,
++	.iterator	= dump_page_iterator,
++	.save_data	= dump_overlay_save_data,
++	.skip_data	= dump_overlay_skip_data,
++	.write_buffer	= dump_generic_write_buffer
++};
++
++struct dump_scheme dump_scheme_overlay = {
++	.name	= "overlay",
++	.ops	= &dump_scheme_overlay_ops
++};
++
++/* Stage 1 must use a good compression scheme - default to gzip */
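++/*
++ * Illustrative sketch, not functional patch code: dumper_stage1 below
++ * is wired to dump_none_compression, while dump_overlay_sequencer()
++ * above refuses to run unless the active compressor reports
++ * DUMP_COMPRESS_GZIP. A setup path is therefore expected to swap the
++ * gzip compressor in, roughly:
++ *
++ *	if (dump_gzip_compression.compress_type == DUMP_COMPRESS_GZIP)
++ *		dumper_stage1.compress = &dump_gzip_compression;
++ *	else
++ *		return -EINVAL;		(no gzip support built in)
++ */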
++extern struct __dump_compress dump_gzip_compression; ++ ++struct dumper dumper_stage1 = { ++ .name = "stage1", ++ .scheme = &dump_scheme_overlay, ++ .fmt = &dump_fmt_lcrash, ++ .compress = &dump_none_compression, /* needs to be gzip */ ++ .filter = dump_filter_table, ++ .dev = NULL, ++}; ++ ++/* Stage 2 dumper: Activated after softboot to write out saved dump to device */ ++ ++/* Formatter that transfers data as is (transparent) w/o further conversion */ ++struct dump_fmt_ops dump_fmt_passthru_ops = { ++ .configure_header = dump_passthru_configure_header, ++ .update_header = dump_passthru_update_header, ++ .save_context = NULL, /* unused */ ++ .add_data = dump_passthru_add_data, ++ .update_end_marker = dump_lcrash_update_end_marker ++}; ++ ++struct dump_fmt dump_fmt_passthru = { ++ .name = "passthru", ++ .ops = &dump_fmt_passthru_ops ++}; ++ ++/* Filter that simply passes along any data within the range (transparent)*/ ++/* Note: The start and end ranges in the table are filled in at run-time */ ++ ++extern int dump_filter_none(int pass, unsigned long loc, unsigned long sz); ++ ++struct dump_data_filter dump_passthru_filtertable[MAX_PASSES] = { ++{.name = "passkern", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_KERN }, ++{.name = "passuser", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_USED }, ++{.name = "passunused", .selector = dump_passthru_filter, ++ .level_mask = DUMP_MASK_UNUSED }, ++{.name = "none", .selector = dump_filter_none, ++ .level_mask = DUMP_MASK_REST } ++}; ++ ++ ++/* Scheme to handle data staged / preserved across a soft-boot */ ++struct dump_scheme_ops dump_scheme_staged_ops = { ++ .configure = dump_generic_configure, ++ .unconfigure = dump_staged_unconfigure, ++ .sequencer = dump_generic_sequencer, ++ .iterator = dump_saved_data_iterator, ++ .save_data = dump_generic_save_data, ++ .skip_data = dump_generic_skip_data, ++ .write_buffer = dump_generic_write_buffer ++}; ++ ++struct dump_scheme dump_scheme_staged = { ++ .name = "staged", ++ .ops = &dump_scheme_staged_ops ++}; ++ ++/* The stage 2 dumper comprising all these */ ++struct dumper dumper_stage2 = { ++ .name = "stage2", ++ .scheme = &dump_scheme_staged, ++ .fmt = &dump_fmt_passthru, ++ .compress = &dump_none_compression, ++ .filter = dump_passthru_filtertable, ++ .dev = NULL, ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_memdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_memdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_memdev.c 2005-04-05 16:47:53.947204496 +0800 +@@ -0,0 +1,655 @@ ++/* ++ * Implements the dump driver interface for saving a dump in available ++ * memory areas. The saved pages may be written out to persistent storage ++ * after a soft reboot. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ * ++ * The approach of tracking pages containing saved dump using map pages ++ * allocated as needed has been derived from the Mission Critical Linux ++ * mcore dump implementation. 
++ * ++ * Credits and a big thanks for letting the lkcd project make use of ++ * the excellent piece of work and also helping with clarifications ++ * and tips along the way are due to: ++ * Dave Winchell (primary author of mcore) ++ * Jeff Moyer ++ * Josh Huber ++ * ++ * For those familiar with the mcore code, the main differences worth ++ * noting here (besides the dump device abstraction) result from enabling ++ * "high" memory pages (pages not permanently mapped in the kernel ++ * address space) to be used for saving dump data (because of which a ++ * simple virtual address based linked list cannot be used anymore for ++ * managing free pages), an added level of indirection for faster ++ * lookups during the post-boot stage, and the idea of pages being ++ * made available as they get freed up while dump to memory progresses ++ * rather than one time before starting the dump. The last point enables ++ * a full memory snapshot to be saved starting with an initial set of ++ * bootstrap pages given a good compression ratio. (See dump_overlay.c) ++ * ++ */ ++ ++/* ++ * -----------------MEMORY LAYOUT ------------------ ++ * The memory space consists of a set of discontiguous pages, and ++ * discontiguous map pages as well, rooted in a chain of indirect ++ * map pages (also discontiguous). Except for the indirect maps ++ * (which must be preallocated in advance), the rest of the pages ++ * could be in high memory. ++ * ++ * root ++ * | --------- -------- -------- ++ * --> | . . +|--->| . +|------->| . . | indirect ++ * --|--|--- ---|---- --|-|--- maps ++ * | | | | | ++ * ------ ------ ------- ------ ------- ++ * | . | | . | | . . | | . | | . . | maps ++ * --|--- --|--- --|--|-- --|--- ---|-|-- ++ * page page page page page page page data ++ * pages ++ * ++ * Writes to the dump device happen sequentially in append mode. ++ * The main reason for the existence of the indirect map is ++ * to enable a quick way to lookup a specific logical offset in ++ * the saved data post-soft-boot, e.g. to writeout pages ++ * with more critical data first, even though such pages ++ * would have been compressed and copied last, being the lowest ++ * ranked candidates for reuse due to their criticality. 
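++ *
++ * A worked example of the lookup arithmetic (illustrative, derived
++ * from dump_mem_lookup() below; with 4K pages and 4-byte longs,
++ * DUMP_MAP_SZ is 1024 and DUMP_IND_MAP_SZ is 1023). For a logical
++ * page number loc:
++ *
++ *	index    = loc / DUMP_MAP_SZ	(direct map covering loc)
++ *	chaining : one next_indirect_map() hop per DUMP_IND_MAP_SZ
++ *		   slots, while (i + DUMP_IND_MAP_SZ < index)
++ *	map pfn  = indirect_map[index - i]
++ *	data pfn = direct_map[loc - index * DUMP_MAP_SZ]
++ *
++ * e.g. loc = 5000 gives index = 4: no chaining is needed, the direct
++ * map is the pfn at slot 4 of the root indirect page, and the data
++ * page is the pfn at entry 904 (5000 - 4096) of that direct map.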
++ * (See dump_overlay.c)
++ */
++#include
++#include
++#include
++#include
++#include "dump_methods.h"
++
++#define DUMP_MAP_SZ	(PAGE_SIZE / sizeof(unsigned long)) /* direct map size */
++#define DUMP_IND_MAP_SZ	(DUMP_MAP_SZ - 1) /* indirect map size */
++#define DUMP_NR_BOOTSTRAP	64 /* no of bootstrap pages */
++
++extern int dump_low_page(struct page *);
++
++/* check if the next entry crosses a page boundary */
++static inline int is_last_map_entry(unsigned long *map)
++{
++	unsigned long addr = (unsigned long)(map + 1);
++
++	return (!(addr & (PAGE_SIZE - 1)));
++}
++
++/* Todo: should have some validation checks */
++/* The last entry in the indirect map points to the next indirect map */
++/* Indirect maps are referred to directly by virtual address */
++static inline unsigned long *next_indirect_map(unsigned long *map)
++{
++	return (unsigned long *)map[DUMP_IND_MAP_SZ];
++}
++
++#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
++/* Called during early bootup - fixme: make this __init */
++void dump_early_reserve_map(struct dump_memdev *dev)
++{
++	unsigned long *map1, *map2;
++	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++	int i, j;
++
++	printk("Reserve bootmap space holding previous dump of %lld pages\n",
++		last);
++	map1 = (unsigned long *)dev->indirect_map_root;
++
++	while (map1 && (off < last)) {
++#ifdef CONFIG_X86_64
++		reserve_bootmem_node(NODE_DATA(0), virt_to_phys((void *)map1),
++			PAGE_SIZE);
++#else
++		reserve_bootmem(virt_to_phys((void *)map1), PAGE_SIZE);
++#endif
++		for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++			i++, off += DUMP_MAP_SZ) {
++			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++			if (map1[i] >= max_low_pfn)
++				continue;
++#ifdef CONFIG_X86_64
++			reserve_bootmem_node(NODE_DATA(0),
++				map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#else
++			reserve_bootmem(map1[i] << PAGE_SHIFT, PAGE_SIZE);
++#endif
++			map2 = pfn_to_kaddr(map1[i]);
++			for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
++				(off + j < last); j++) {
++				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++					map2[j]);
++				if (map2[j] < max_low_pfn) {
++#ifdef CONFIG_X86_64
++					reserve_bootmem_node(NODE_DATA(0),
++						map2[j] << PAGE_SHIFT,
++						PAGE_SIZE);
++#else
++					reserve_bootmem(map2[j] << PAGE_SHIFT,
++						PAGE_SIZE);
++#endif
++				}
++			}
++		}
++		map1 = next_indirect_map(map1);
++	}
++	dev->nr_free = 0; /* these pages don't belong to this boot */
++}
++#endif
++
++/* mark dump pages so that they aren't used by this kernel */
++void dump_mark_map(struct dump_memdev *dev)
++{
++	unsigned long *map1, *map2;
++	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
++	struct page *page;
++	int i, j;
++
++	printk("Dump: marking pages in use by previous dump\n");
++	map1 = (unsigned long *)dev->indirect_map_root;
++
++	while (map1 && (off < last)) {
++		page = virt_to_page(map1);
++		set_page_count(page, 1);
++		for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
++			i++, off += DUMP_MAP_SZ) {
++			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
++			page = pfn_to_page(map1[i]);
++			set_page_count(page, 1);
++			map2 = kmap_atomic(page, KM_DUMP);
++			for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
++				(off + j < last); j++) {
++				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
++					map2[j]);
++				page = pfn_to_page(map2[j]);
++				set_page_count(page, 1);
++			}
++			kunmap_atomic(map2, KM_DUMP); /* balance kmap above */
++		}
++		map1 = next_indirect_map(map1);
++	}
++}
++
++/*
++ * Given a logical offset into the mem device, look up the
++ * corresponding page
++ * loc is specified in units of pages
++ * Note: affects curr_map (even in the case where lookup fails)
++ */
++struct page
*dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc) ++{ ++ unsigned long *map; ++ unsigned long i, index = loc / DUMP_MAP_SZ; ++ struct page *page = NULL; ++ unsigned long curr_pfn, curr_map, *curr_map_ptr = NULL; ++ ++ map = (unsigned long *)dump_mdev->indirect_map_root; ++ if (!map) ++ return NULL; ++ if (loc > dump_mdev->last_offset >> PAGE_SHIFT) ++ return NULL; ++ ++ /* ++ * first locate the right indirect map ++ * in the chain of indirect maps ++ */ ++ for (i = 0; i + DUMP_IND_MAP_SZ < index ; i += DUMP_IND_MAP_SZ) { ++ if (!(map = next_indirect_map(map))) ++ return NULL; ++ } ++ /* then the right direct map */ ++ /* map entries are referred to by page index */ ++ if ((curr_map = map[index - i])) { ++ page = pfn_to_page(curr_map); ++ /* update the current traversal index */ ++ /* dump_mdev->curr_map = &map[index - i];*/ ++ curr_map_ptr = &map[index - i]; ++ } ++ ++ if (page) ++ map = kmap_atomic(page, KM_DUMP); ++ else ++ return NULL; ++ ++ /* and finally the right entry therein */ ++ /* data pages are referred to by page index */ ++ i = index * DUMP_MAP_SZ; ++ if ((curr_pfn = map[loc - i])) { ++ page = pfn_to_page(curr_pfn); ++ dump_mdev->curr_map = curr_map_ptr; ++ dump_mdev->curr_map_offset = loc - i; ++ dump_mdev->ddev.curr_offset = loc << PAGE_SHIFT; ++ } else { ++ page = NULL; ++ } ++ kunmap_atomic(map, KM_DUMP); ++ ++ return page; ++} ++ ++/* ++ * Retrieves a pointer to the next page in the dump device ++ * Used during the lookup pass post-soft-reboot ++ */ ++struct page *dump_mem_next_page(struct dump_memdev *dev) ++{ ++ unsigned long i; ++ unsigned long *map; ++ struct page *page = NULL; ++ ++ if (dev->ddev.curr_offset + PAGE_SIZE >= dev->last_offset) { ++ return NULL; ++ } ++ ++ if ((i = (unsigned long)(++dev->curr_map_offset)) >= DUMP_MAP_SZ) { ++ /* move to next map */ ++ if (is_last_map_entry(++dev->curr_map)) { ++ /* move to the next indirect map page */ ++ printk("dump_mem_next_page: go to next indirect map\n"); ++ dev->curr_map = (unsigned long *)*dev->curr_map; ++ if (!dev->curr_map) ++ return NULL; ++ } ++ i = dev->curr_map_offset = 0; ++ pr_debug("dump_mem_next_page: next map 0x%lx, entry 0x%lx\n", ++ dev->curr_map, *dev->curr_map); ++ ++ }; ++ ++ if (*dev->curr_map) { ++ map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP); ++ if (map[i]) ++ page = pfn_to_page(map[i]); ++ kunmap_atomic(map, KM_DUMP); ++ dev->ddev.curr_offset += PAGE_SIZE; ++ }; ++ ++ return page; ++} ++ ++/* Copied from dump_filters.c */ ++static inline int kernel_page(struct page *p) ++{ ++ /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */ ++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p)); ++} ++ ++static inline int user_page(struct page *p) ++{ ++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); ++} ++ ++int dump_reused_by_boot(struct page *page) ++{ ++ /* Todo ++ * Checks: ++ * if PageReserved ++ * if < __end + bootmem_bootmap_pages for this boot + allowance ++ * if overwritten by initrd (how to check ?) ++ * Also, add more checks in early boot code ++ * e.g. bootmem bootmap alloc verify not overwriting dump, and if ++ * so then realloc or move the dump pages out accordingly. 
++ */ ++ ++ /* Temporary proof of concept hack, avoid overwriting kern pages */ ++ ++ return (kernel_page(page) || dump_low_page(page) || user_page(page)); ++} ++ ++ ++/* Uses the free page passed in to expand available space */ ++int dump_mem_add_space(struct dump_memdev *dev, struct page *page) ++{ ++ struct page *map_page; ++ unsigned long *map; ++ unsigned long i; ++ ++ if (!dev->curr_map) ++ return -ENOMEM; /* must've exhausted indirect map */ ++ ++ if (!*dev->curr_map || dev->curr_map_offset >= DUMP_MAP_SZ) { ++ /* add map space */ ++ *dev->curr_map = page_to_pfn(page); ++ dev->curr_map_offset = 0; ++ return 0; ++ } ++ ++ /* add data space */ ++ i = dev->curr_map_offset; ++ map_page = pfn_to_page(*dev->curr_map); ++ map = (unsigned long *)kmap_atomic(map_page, KM_DUMP); ++ map[i] = page_to_pfn(page); ++ kunmap_atomic(map, KM_DUMP); ++ dev->curr_map_offset = ++i; ++ dev->last_offset += PAGE_SIZE; ++ if (i >= DUMP_MAP_SZ) { ++ /* move to next map */ ++ if (is_last_map_entry(++dev->curr_map)) { ++ /* move to the next indirect map page */ ++ pr_debug("dump_mem_add_space: using next" ++ "indirect map\n"); ++ dev->curr_map = (unsigned long *)*dev->curr_map; ++ } ++ } ++ return 0; ++} ++ ++ ++/* Caution: making a dest page invalidates existing contents of the page */ ++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page) ++{ ++ int err = 0; ++ ++ /* ++ * the page can be used as a destination only if we are sure ++ * it won't get overwritten by the soft-boot, and is not ++ * critical for us right now. ++ */ ++ if (dump_reused_by_boot(page)) ++ return 0; ++ ++ if ((err = dump_mem_add_space(dev, page))) { ++ printk("Warning: Unable to extend memdev space. Err %d\n", ++ err); ++ return 0; ++ } ++ ++ dev->nr_free++; ++ return 1; ++} ++ ++ ++/* Set up the initial maps and bootstrap space */ ++/* Must be called only after any previous dump is written out */ ++int dump_mem_open(struct dump_dev *dev, unsigned long devid) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ unsigned long nr_maps, *map, *prev_map = &dump_mdev->indirect_map_root; ++ void *addr; ++ struct page *page; ++ unsigned long i = 0; ++ int err = 0; ++ ++ /* Todo: sanity check for unwritten previous dump */ ++ ++ /* allocate pages for indirect map (non highmem area) */ ++ nr_maps = num_physpages / DUMP_MAP_SZ; /* maps to cover entire mem */ ++ for (i = 0; i < nr_maps; i += DUMP_IND_MAP_SZ) { ++ if (!(map = (unsigned long *)dump_alloc_mem(PAGE_SIZE))) { ++ printk("Unable to alloc indirect map %ld\n", ++ i / DUMP_IND_MAP_SZ); ++ return -ENOMEM; ++ } ++ clear_page(map); ++ *prev_map = (unsigned long)map; ++ prev_map = &map[DUMP_IND_MAP_SZ]; ++ }; ++ ++ dump_mdev->curr_map = (unsigned long *)dump_mdev->indirect_map_root; ++ dump_mdev->curr_map_offset = 0; ++ ++ /* ++ * allocate a few bootstrap pages: at least 1 map and 1 data page ++ * plus enough to save the dump header ++ */ ++ i = 0; ++ do { ++ if (!(addr = dump_alloc_mem(PAGE_SIZE))) { ++ printk("Unable to alloc bootstrap page %ld\n", i); ++ return -ENOMEM; ++ } ++ ++ page = virt_to_page(addr); ++ if (dump_low_page(page)) { ++ dump_free_mem(addr); ++ continue; ++ } ++ ++ if (dump_mem_add_space(dump_mdev, page)) { ++ printk("Warning: Unable to extend memdev " ++ "space. 
Err %d\n", err); ++ dump_free_mem(addr); ++ continue; ++ } ++ i++; ++ } while (i < DUMP_NR_BOOTSTRAP); ++ ++ printk("dump memdev init: %ld maps, %ld bootstrap pgs, %ld free pgs\n", ++ nr_maps, i, dump_mdev->last_offset >> PAGE_SHIFT); ++ ++ dump_mdev->last_bs_offset = dump_mdev->last_offset; ++ ++ return 0; ++} ++ ++/* Releases all pre-alloc'd pages */ ++int dump_mem_release(struct dump_dev *dev) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ struct page *page, *map_page; ++ unsigned long *map, *prev_map; ++ void *addr; ++ int i; ++ ++ if (!dump_mdev->nr_free) ++ return 0; ++ ++ pr_debug("dump_mem_release\n"); ++ page = dump_mem_lookup(dump_mdev, 0); ++ for (i = 0; page && (i < DUMP_NR_BOOTSTRAP - 1); i++) { ++ if (PageHighMem(page)) ++ break; ++ addr = page_address(page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", page); ++ break; ++ } ++ pr_debug("Freeing page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ if (dump_mdev->curr_map_offset >= DUMP_MAP_SZ - 1) { ++ map_page = pfn_to_page(*dump_mdev->curr_map); ++ if (PageHighMem(map_page)) ++ break; ++ page = dump_mem_next_page(dump_mdev); ++ addr = page_address(map_page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", ++ map_page); ++ break; ++ } ++ pr_debug("Freeing map page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ i++; ++ } else { ++ page = dump_mem_next_page(dump_mdev); ++ } ++ } ++ ++ /* now for the last used bootstrap page used as a map page */ ++ if ((i < DUMP_NR_BOOTSTRAP) && (*dump_mdev->curr_map)) { ++ map_page = pfn_to_page(*dump_mdev->curr_map); ++ if ((map_page) && !PageHighMem(map_page)) { ++ addr = page_address(map_page); ++ if (!addr) { ++ printk("page_address(%p) = NULL\n", map_page); ++ } else { ++ pr_debug("Freeing map page at 0x%lx\n", addr); ++ dump_free_mem(addr); ++ i++; ++ } ++ } ++ } ++ ++ printk("Freed %d bootstrap pages\n", i); ++ ++ /* free the indirect maps */ ++ map = (unsigned long *)dump_mdev->indirect_map_root; ++ ++ i = 0; ++ while (map) { ++ prev_map = map; ++ map = next_indirect_map(map); ++ dump_free_mem(prev_map); ++ i++; ++ } ++ ++ printk("Freed %d indirect map(s)\n", i); ++ ++ /* Reset the indirect map */ ++ dump_mdev->indirect_map_root = 0; ++ dump_mdev->curr_map = 0; ++ ++ /* Reset the free list */ ++ dump_mdev->nr_free = 0; ++ ++ dump_mdev->last_offset = dump_mdev->ddev.curr_offset = 0; ++ dump_mdev->last_used_offset = 0; ++ dump_mdev->curr_map = NULL; ++ dump_mdev->curr_map_offset = 0; ++ return 0; ++} ++ ++/* ++ * Long term: ++ * It is critical for this to be very strict. Cannot afford ++ * to have anything running and accessing memory while we overwrite ++ * memory (potential risk of data corruption). ++ * If in doubt (e.g if a cpu is hung and not responding) just give ++ * up and refuse to proceed with this scheme. ++ * ++ * Note: I/O will only happen after soft-boot/switchover, so we can ++ * safely disable interrupts and force stop other CPUs if this is ++ * going to be a disruptive dump, no matter what they ++ * are in the middle of. ++ */ ++/* ++ * ATM Most of this is already taken care of in the nmi handler ++ * We may halt the cpus rightaway if we know this is going to be disruptive ++ * For now, since we've limited ourselves to overwriting free pages we ++ * aren't doing much here. 
Eventually, we'd have to wait to make sure other ++ * cpus aren't using memory we could be overwriting ++ */ ++int dump_mem_silence(struct dump_dev *dev) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ ++ if (dump_mdev->last_offset > dump_mdev->last_bs_offset) { ++ /* prefer to run lkcd config & start with a clean slate */ ++ return -EEXIST; ++ } ++ return 0; ++} ++ ++extern int dump_overlay_resume(void); ++ ++/* Trigger the next stage of dumping */ ++int dump_mem_resume(struct dump_dev *dev) ++{ ++ dump_overlay_resume(); ++ return 0; ++} ++ ++/* ++ * Allocate mem dev pages as required and copy buffer contents into it. ++ * Fails if the no free pages are available ++ * Keeping it simple and limited for starters (can modify this over time) ++ * Does not handle holes or a sparse layout ++ * Data must be in multiples of PAGE_SIZE ++ */ ++int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ struct page *page; ++ unsigned long n = 0; ++ void *addr; ++ unsigned long *saved_curr_map, saved_map_offset; ++ int ret = 0; ++ ++ pr_debug("dump_mem_write: offset 0x%llx, size %ld\n", ++ dev->curr_offset, len); ++ ++ if (dev->curr_offset + len > dump_mdev->last_offset) { ++ printk("Out of space to write\n"); ++ return -ENOSPC; ++ } ++ ++ if ((len & (PAGE_SIZE - 1)) || (dev->curr_offset & (PAGE_SIZE - 1))) ++ return -EINVAL; /* not aligned in units of page size */ ++ ++ saved_curr_map = dump_mdev->curr_map; ++ saved_map_offset = dump_mdev->curr_map_offset; ++ page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT); ++ ++ for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) { ++ addr = kmap_atomic(page, KM_DUMP); ++ /* memset(addr, 'x', PAGE_SIZE); */ ++ memcpy(addr, buf, PAGE_SIZE); ++ kunmap_atomic(addr, KM_DUMP); ++ /* dev->curr_offset += PAGE_SIZE; */ ++ page = dump_mem_next_page(dump_mdev); ++ } ++ ++ dump_mdev->curr_map = saved_curr_map; ++ dump_mdev->curr_map_offset = saved_map_offset; ++ ++ if (dump_mdev->last_used_offset < dev->curr_offset) ++ dump_mdev->last_used_offset = dev->curr_offset; ++ ++ return (len - n) ? (len - n) : ret ; ++} ++ ++/* dummy - always ready */ ++int dump_mem_ready(struct dump_dev *dev, void *buf) ++{ ++ return 0; ++} ++ ++/* ++ * Should check for availability of space to write upto the offset ++ * affects only the curr_offset; last_offset untouched ++ * Keep it simple: Only allow multiples of PAGE_SIZE for now ++ */ ++int dump_mem_seek(struct dump_dev *dev, loff_t offset) ++{ ++ struct dump_memdev *dump_mdev = DUMP_MDEV(dev); ++ ++ if (offset & (PAGE_SIZE - 1)) ++ return -EINVAL; /* allow page size units only for now */ ++ ++ /* Are we exceeding available space ? 
*/ ++ if (offset > dump_mdev->last_offset) { ++ printk("dump_mem_seek failed for offset 0x%llx\n", ++ offset); ++ return -ENOSPC; ++ } ++ ++ dump_mdev->ddev.curr_offset = offset; ++ return 0; ++} ++ ++struct dump_dev_ops dump_memdev_ops = { ++ .open = dump_mem_open, ++ .release = dump_mem_release, ++ .silence = dump_mem_silence, ++ .resume = dump_mem_resume, ++ .seek = dump_mem_seek, ++ .write = dump_mem_write, ++ .read = NULL, /* not implemented at the moment */ ++ .ready = dump_mem_ready ++}; ++ ++static struct dump_memdev default_dump_memdev = { ++ .ddev = {.type_name = "memdev", .ops = &dump_memdev_ops, ++ .device_id = 0x14} ++ /* assume the rest of the fields are zeroed by default */ ++}; ++ ++/* may be overwritten if a previous dump exists */ ++struct dump_memdev *dump_memdev = &default_dump_memdev; ++ +Index: linux-2.6.10/drivers/dump/dump_blockdev.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_blockdev.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_blockdev.c 2005-04-05 16:47:53.945204800 +0800 +@@ -0,0 +1,469 @@ ++/* ++ * Implements the dump driver interface for saving a dump to ++ * a block device through the kernel's generic low level block i/o ++ * routines. ++ * ++ * Started: June 2002 - Mohamed Abbas ++ * Moved original lkcd kiobuf dump i/o code from dump_base.c ++ * to use generic dump device interfaces ++ * ++ * Sept 2002 - Bharata B. Rao ++ * Convert dump i/o to directly use bio instead of kiobuf for 2.5 ++ * ++ * Oct 2002 - Suparna Bhattacharya ++ * Rework to new dumpdev.h structures, implement open/close/ ++ * silence, misc fixes (blocknr removal, bio_add_page usage) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++extern void *dump_page_buf; ++ ++/* The end_io callback for dump i/o completion */ ++static int ++dump_bio_end_io(struct bio *bio, unsigned int bytes_done, int error) ++{ ++ struct dump_blockdev *dump_bdev; ++ ++ if (bio->bi_size) { ++ /* some bytes still left to transfer */ ++ return 1; /* not complete */ ++ } ++ ++ dump_bdev = (struct dump_blockdev *)bio->bi_private; ++ if (error) { ++ printk("IO error while writing the dump, aborting\n"); ++ } ++ ++ dump_bdev->err = error; ++ ++ /* no wakeup needed, since caller polls for completion */ ++ return 0; ++} ++ ++/* Check if the dump bio is already mapped to the specified buffer */ ++static int ++dump_block_map_valid(struct dump_blockdev *dev, struct page *page, ++ int len) ++{ ++ struct bio *bio = dev->bio; ++ unsigned long bsize = 0; ++ ++ if (!bio->bi_vcnt) ++ return 0; /* first time, not mapped */ ++ ++ ++ if ((bio_page(bio) != page) || (len > bio->bi_vcnt << PAGE_SHIFT)) ++ return 0; /* buffer not mapped */ ++ ++ bsize = bdev_hardsect_size(bio->bi_bdev); ++ if ((len & (PAGE_SIZE - 1)) || (len & bsize)) ++ return 0; /* alignment checks needed */ ++ ++ /* quick check to decide if we need to redo bio_add_page */ ++ if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn) ++ return 0; /* device may have other restrictions */ ++ ++ return 1; /* already mapped */ ++} ++ ++/* ++ * Set up the dump bio for i/o from the specified buffer ++ * Return value indicates whether the full buffer could be mapped or not ++ */ ++static int ++dump_block_map(struct dump_blockdev *dev, void *buf, int len) ++{ ++ struct page *page = virt_to_page(buf); ++ struct bio *bio = dev->bio; ++ unsigned long bsize = 0; ++ ++ bio->bi_bdev = dev->bdev; ++ bio->bi_sector = (dev->start_offset + dev->ddev.curr_offset) >> 9; ++ bio->bi_idx = 0; /* reset index to the beginning */ ++ ++ if (dump_block_map_valid(dev, page, len)) { ++ /* already mapped and usable rightaway */ ++ bio->bi_size = len; /* reset size to the whole bio */ ++ bio->bi_vcnt = (len + PAGE_SIZE - 1) / PAGE_SIZE; /* Set the proper vector cnt */ ++ } else { ++ /* need to map the bio */ ++ bio->bi_size = 0; ++ bio->bi_vcnt = 0; ++ bsize = bdev_hardsect_size(bio->bi_bdev); ++ ++ /* first a few sanity checks */ ++ if (len < bsize) { ++ printk("map: len less than hardsect size \n"); ++ return -EINVAL; ++ } ++ ++ if ((unsigned long)buf & bsize) { ++ printk("map: not aligned \n"); ++ return -EINVAL; ++ } ++ ++ /* assume contig. page aligned low mem buffer( no vmalloc) */ ++ if ((page_address(page) != buf) || (len & (PAGE_SIZE - 1))) { ++ printk("map: invalid buffer alignment!\n"); ++ return -EINVAL; ++ } ++ /* finally we can go ahead and map it */ ++ while (bio->bi_size < len) ++ if (bio_add_page(bio, page++, PAGE_SIZE, 0) == 0) { ++ break; ++ } ++ ++ bio->bi_end_io = dump_bio_end_io; ++ bio->bi_private = dev; ++ } ++ ++ if (bio->bi_size != len) { ++ printk("map: bio size = %d not enough for len = %d!\n", ++ bio->bi_size, len); ++ return -E2BIG; ++ } ++ return 0; ++} ++ ++static void ++dump_free_bio(struct bio *bio) ++{ ++ if (bio) ++ kfree(bio->bi_io_vec); ++ kfree(bio); ++} ++ ++/* ++ * Prepares the dump device so we can take a dump later. ++ * The caller is expected to have filled up the dev_id field in the ++ * block dump dev structure. 
++ * ++ * At dump time when dump_block_write() is invoked it will be too ++ * late to recover, so as far as possible make sure obvious errors ++ * get caught right here and reported back to the caller. ++ */ ++static int ++dump_block_open(struct dump_dev *dev, unsigned long arg) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ struct block_device *bdev; ++ int retval = 0; ++ struct bio_vec *bvec; ++ ++ /* make sure this is a valid block device */ ++ if (!arg) { ++ retval = -EINVAL; ++ goto err; ++ } ++ ++ /* Convert it to the new dev_t format */ ++ arg = MKDEV((arg >> OLDMINORBITS), (arg & OLDMINORMASK)); ++ ++ /* get a corresponding block_dev struct for this */ ++ bdev = bdget((dev_t)arg); ++ if (!bdev) { ++ retval = -ENODEV; ++ goto err; ++ } ++ ++ /* get the block device opened */ ++ if ((retval = blkdev_get(bdev, O_RDWR | O_LARGEFILE, 0))) { ++ goto err1; ++ } ++ ++ if ((dump_bdev->bio = kmalloc(sizeof(struct bio), GFP_KERNEL)) ++ == NULL) { ++ printk("Cannot allocate bio\n"); ++ retval = -ENOMEM; ++ goto err2; ++ } ++ ++ bio_init(dump_bdev->bio); ++ ++ if ((bvec = kmalloc(sizeof(struct bio_vec) * ++ (DUMP_BUFFER_SIZE >> PAGE_SHIFT), GFP_KERNEL)) == NULL) { ++ retval = -ENOMEM; ++ goto err3; ++ } ++ ++ /* assign the new dump dev structure */ ++ dump_bdev->dev_id = (dev_t)arg; ++ dump_bdev->bdev = bdev; ++ ++ /* make a note of the limit */ ++ dump_bdev->limit = bdev->bd_inode->i_size; ++ ++ /* now make sure we can map the dump buffer */ ++ dump_bdev->bio->bi_io_vec = bvec; ++ dump_bdev->bio->bi_max_vecs = DUMP_BUFFER_SIZE >> PAGE_SHIFT; ++ ++ retval = dump_block_map(dump_bdev, dump_config.dumper->dump_buf, ++ DUMP_BUFFER_SIZE); ++ ++ if (retval) { ++ printk("open: dump_block_map failed, ret %d\n", retval); ++ goto err3; ++ } ++ ++ printk("Block device (%d,%d) successfully configured for dumping\n", ++ MAJOR(dump_bdev->dev_id), ++ MINOR(dump_bdev->dev_id)); ++ ++ ++ /* after opening the block device, return */ ++ return retval; ++ ++err3: dump_free_bio(dump_bdev->bio); ++ dump_bdev->bio = NULL; ++err2: if (bdev) blkdev_put(bdev); ++ goto err; ++err1: if (bdev) bdput(bdev); ++ dump_bdev->bdev = NULL; ++err: return retval; ++} ++ ++/* ++ * Close the dump device and release associated resources ++ * Invoked when unconfiguring the dump device. ++ */ ++static int ++dump_block_release(struct dump_dev *dev) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ ++ /* release earlier bdev if present */ ++ if (dump_bdev->bdev) { ++ blkdev_put(dump_bdev->bdev); ++ dump_bdev->bdev = NULL; ++ } ++ ++ dump_free_bio(dump_bdev->bio); ++ dump_bdev->bio = NULL; ++ ++ return 0; ++} ++ ++ ++/* ++ * Prepare the dump device for use (silence any ongoing activity ++ * and quiesce state) when the system crashes. 
++ */ ++static int ++dump_block_silence(struct dump_dev *dev) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ struct request_queue *q = bdev_get_queue(dump_bdev->bdev); ++ int ret; ++ ++ /* If we can't get request queue lock, refuse to take the dump */ ++ if (!spin_trylock(q->queue_lock)) ++ return -EBUSY; ++ ++ ret = elv_queue_empty(q); ++ spin_unlock(q->queue_lock); ++ ++ /* For now we assume we have the device to ourselves */ ++ /* Just a quick sanity check */ ++ if (!ret) { ++ /* Warn the user and move on */ ++ printk(KERN_ALERT "Warning: Non-empty request queue\n"); ++ printk(KERN_ALERT "I/O requests in flight at dump time\n"); ++ } ++ ++ /* ++ * Move to a softer level of silencing where no spin_lock_irqs ++ * are held on other cpus ++ */ ++ dump_silence_level = DUMP_SOFT_SPIN_CPUS; ++ ++ ret = __dump_irq_enable(); ++ if (ret) { ++ return ret; ++ } ++ ++ printk("Dumping to block device (%d,%d) on CPU %d ...\n", ++ MAJOR(dump_bdev->dev_id), MINOR(dump_bdev->dev_id), ++ smp_processor_id()); ++ ++ return 0; ++} ++ ++/* ++ * Invoked when dumping is done. This is the time to put things back ++ * (i.e. undo the effects of dump_block_silence) so the device is ++ * available for normal use. ++ */ ++static int ++dump_block_resume(struct dump_dev *dev) ++{ ++ __dump_irq_restore(); ++ return 0; ++} ++ ++ ++/* ++ * Seek to the specified offset in the dump device. ++ * Makes sure this is a valid offset, otherwise returns an error. ++ */ ++static int ++dump_block_seek(struct dump_dev *dev, loff_t off) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ loff_t offset = off + dump_bdev->start_offset; ++ ++ if (offset & ( PAGE_SIZE - 1)) { ++ printk("seek: non-page aligned\n"); ++ return -EINVAL; ++ } ++ ++ if (offset & (bdev_hardsect_size(dump_bdev->bdev) - 1)) { ++ printk("seek: not sector aligned \n"); ++ return -EINVAL; ++ } ++ ++ if (offset > dump_bdev->limit) { ++ printk("seek: not enough space left on device!\n"); ++ return -ENOSPC; ++ } ++ dev->curr_offset = off; ++ return 0; ++} ++ ++/* ++ * Write out a buffer after checking the device limitations, ++ * sector sizes, etc. Assumes the buffer is in directly mapped ++ * kernel address space (not vmalloc'ed). ++ * ++ * Returns: number of bytes written or -ERRNO. ++ */ ++static int ++dump_block_write(struct dump_dev *dev, void *buf, ++ unsigned long len) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ loff_t offset = dev->curr_offset + dump_bdev->start_offset; ++ int retval = -ENOSPC; ++ ++ if (offset >= dump_bdev->limit) { ++ printk("write: not enough space left on device!\n"); ++ goto out; ++ } ++ ++ /* don't write more blocks than our max limit */ ++ if (offset + len > dump_bdev->limit) ++ len = dump_bdev->limit - offset; ++ ++ ++ retval = dump_block_map(dump_bdev, buf, len); ++ if (retval){ ++ printk("write: dump_block_map failed! err %d\n", retval); ++ goto out; ++ } ++ ++ /* ++ * Write out the data to disk. ++ * Assumes the entire buffer mapped to a single bio, which we can ++ * submit and wait for io completion. In the future, may consider ++ * increasing the dump buffer size and submitting multiple bio s ++ * for better throughput. 
++ */ ++ dump_bdev->err = -EAGAIN; ++ submit_bio(WRITE, dump_bdev->bio); ++ ++ dump_bdev->ddev.curr_offset += len; ++ retval = len; ++ out: ++ return retval; ++} ++ ++/* ++ * Name: dump_block_ready() ++ * Func: check if the last dump i/o is over and ready for next request ++ */ ++static int ++dump_block_ready(struct dump_dev *dev, void *buf) ++{ ++ struct dump_blockdev *dump_bdev = DUMP_BDEV(dev); ++ request_queue_t *q = bdev_get_queue(dump_bdev->bio->bi_bdev); ++ ++ /* check for io completion */ ++ if (dump_bdev->err == -EAGAIN) { ++ q->unplug_fn(q); ++ return -EAGAIN; ++ } ++ ++ if (dump_bdev->err) { ++ printk("dump i/o err\n"); ++ return dump_bdev->err; ++ } ++ ++ return 0; ++} ++ ++ ++struct dump_dev_ops dump_blockdev_ops = { ++ .open = dump_block_open, ++ .release = dump_block_release, ++ .silence = dump_block_silence, ++ .resume = dump_block_resume, ++ .seek = dump_block_seek, ++ .write = dump_block_write, ++ /* .read not implemented */ ++ .ready = dump_block_ready ++}; ++ ++static struct dump_blockdev default_dump_blockdev = { ++ .ddev = {.type_name = "blockdev", .ops = &dump_blockdev_ops, ++ .curr_offset = 0}, ++ /* ++ * leave enough room for the longest swap header possibly written ++ * written by mkswap (likely the largest page size supported by ++ * the arch ++ */ ++ .start_offset = DUMP_HEADER_OFFSET, ++ .err = 0 ++ /* assume the rest of the fields are zeroed by default */ ++}; ++ ++struct dump_blockdev *dump_blockdev = &default_dump_blockdev; ++ ++static int __init ++dump_blockdev_init(void) ++{ ++ if (dump_register_device(&dump_blockdev->ddev) < 0) { ++ printk("block device driver registration failed\n"); ++ return -1; ++ } ++ ++ printk("block device driver for LKCD registered\n"); ++ return 0; ++} ++ ++static void __exit ++dump_blockdev_cleanup(void) ++{ ++ dump_unregister_device(&dump_blockdev->ddev); ++ printk("block device driver for LKCD unregistered\n"); ++} ++ ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Block Dump Driver for Linux Kernel Crash Dump (LKCD)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_blockdev_init); ++module_exit(dump_blockdev_cleanup); +Index: linux-2.6.10/drivers/dump/dump_fmt.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_fmt.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_fmt.c 2005-04-05 16:47:53.941205408 +0800 +@@ -0,0 +1,407 @@ ++/* ++ * Implements the routines which handle the format specific ++ * aspects of dump for the default dump format. ++ * ++ * Used in single stage dumping and stage 1 of soft-boot based dumping ++ * Saves data in LKCD (lcrash) format ++ * ++ * Previously a part of dump_base.c ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split off and reshuffled LKCD dump format code around generic ++ * dump method interfaces. ++ * ++ * Derived from original code created by ++ * Matt Robinson ) ++ * ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++/* ++ * SYSTEM DUMP LAYOUT ++ * ++ * System dumps are currently the combination of a dump header and a set ++ * of data pages which contain the system memory. The layout of the dump ++ * (for full dumps) is as follows: ++ * ++ * +-----------------------------+ ++ * | generic dump header | ++ * +-----------------------------+ ++ * | architecture dump header | ++ * +-----------------------------+ ++ * | page header | ++ * +-----------------------------+ ++ * | page data | ++ * +-----------------------------+ ++ * | page header | ++ * +-----------------------------+ ++ * | page data | ++ * +-----------------------------+ ++ * | | | ++ * | | | ++ * | | | ++ * | | | ++ * | V | ++ * +-----------------------------+ ++ * | PAGE_END header | ++ * +-----------------------------+ ++ * ++ * There are two dump headers, the first which is architecture ++ * independent, and the other which is architecture dependent. This ++ * allows different architectures to dump different data structures ++ * which are specific to their chipset, CPU, etc. ++ * ++ * After the dump headers come a succession of dump page headers along ++ * with dump pages. The page header contains information about the page ++ * size, any flags associated with the page (whether it's compressed or ++ * not), and the address of the page. After the page header is the page ++ * data, which is either compressed (or not). Each page of data is ++ * dumped in succession, until the final dump header (PAGE_END) is ++ * placed at the end of the dump, assuming the dump device isn't out ++ * of space. ++ * ++ * This mechanism allows for multiple compression types, different ++ * types of data structures, different page ordering, etc., etc., etc. ++ * It's a very straightforward mechanism for dumping system memory. ++ */ ++ ++struct __dump_header dump_header; /* the primary dump header */ ++struct __dump_header_asm dump_header_asm; /* the arch-specific dump header */ ++ ++/* Replace a runtime sanity check on the DUMP_BUFFER_SIZE with a ++ * compile-time check. The compile_time_assertions routine will not ++ * compile if the assertion is false. ++ * ++ * If you fail this assert you are most likely on a large machine and ++ * should use a special 6.0.0 version of LKCD or a version > 7.0.0. See ++ * the LKCD website for more information. ++ */ ++ ++#define COMPILE_TIME_ASSERT(const_expr) \ ++ switch(0){case 0: case (const_expr):;} ++ ++static inline void compile_time_assertions(void) ++{ ++ COMPILE_TIME_ASSERT((sizeof(struct __dump_header) + ++ sizeof(struct __dump_header_asm)) <= DUMP_BUFFER_SIZE); ++} ++ ++/* ++ * Set up common header fields (mainly the arch indep section) ++ * Per-cpu state is handled by lcrash_save_context ++ * Returns the size of the header in bytes. ++ */ ++static int lcrash_init_dump_header(const char *panic_str) ++{ ++ struct timeval dh_time; ++ u64 temp_memsz = dump_header.dh_memory_size; ++ ++ /* initialize the dump headers to zero */ ++ /* save dha_stack pointer because it may contains pointer for stack! 
*/ ++ memset(&dump_header, 0, sizeof(dump_header)); ++ memset(&dump_header_asm, 0, ++ offsetof(struct __dump_header_asm, dha_stack)); ++ memset(&dump_header_asm.dha_stack+1, 0, ++ sizeof(dump_header_asm) - ++ offsetof(struct __dump_header_asm, dha_stack) - ++ sizeof(dump_header_asm.dha_stack)); ++ dump_header.dh_memory_size = temp_memsz; ++ ++ /* configure dump header values */ ++ dump_header.dh_magic_number = DUMP_MAGIC_NUMBER; ++ dump_header.dh_version = DUMP_VERSION_NUMBER; ++ dump_header.dh_memory_start = PAGE_OFFSET; ++ dump_header.dh_memory_end = DUMP_MAGIC_NUMBER; ++ dump_header.dh_header_size = sizeof(struct __dump_header); ++ dump_header.dh_page_size = PAGE_SIZE; ++ dump_header.dh_dump_level = dump_config.level; ++ dump_header.dh_current_task = (unsigned long) current; ++ dump_header.dh_dump_compress = dump_config.dumper->compress-> ++ compress_type; ++ dump_header.dh_dump_flags = dump_config.flags; ++ dump_header.dh_dump_device = dump_config.dumper->dev->device_id; ++ ++#if DUMP_DEBUG >= 6 ++ dump_header.dh_num_bytes = 0; ++#endif ++ dump_header.dh_num_dump_pages = 0; ++ do_gettimeofday(&dh_time); ++ dump_header.dh_time.tv_sec = dh_time.tv_sec; ++ dump_header.dh_time.tv_usec = dh_time.tv_usec; ++ ++ memcpy((void *)&(dump_header.dh_utsname_sysname), ++ (const void *)&(system_utsname.sysname), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_nodename), ++ (const void *)&(system_utsname.nodename), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_release), ++ (const void *)&(system_utsname.release), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_version), ++ (const void *)&(system_utsname.version), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_machine), ++ (const void *)&(system_utsname.machine), __NEW_UTS_LEN + 1); ++ memcpy((void *)&(dump_header.dh_utsname_domainname), ++ (const void *)&(system_utsname.domainname), __NEW_UTS_LEN + 1); ++ ++ if (panic_str) { ++ memcpy((void *)&(dump_header.dh_panic_string), ++ (const void *)panic_str, DUMP_PANIC_LEN); ++ } ++ ++ dump_header_asm.dha_magic_number = DUMP_ASM_MAGIC_NUMBER; ++ dump_header_asm.dha_version = DUMP_ASM_VERSION_NUMBER; ++ dump_header_asm.dha_header_size = sizeof(dump_header_asm); ++#ifdef CONFIG_ARM ++ dump_header_asm.dha_physaddr_start = PHYS_OFFSET; ++#endif ++ ++ dump_header_asm.dha_smp_num_cpus = num_online_cpus(); ++ pr_debug("smp_num_cpus in header %d\n", ++ dump_header_asm.dha_smp_num_cpus); ++ ++ dump_header_asm.dha_dumping_cpu = smp_processor_id(); ++ ++ return sizeof(dump_header) + sizeof(dump_header_asm); ++} ++ ++ ++int dump_lcrash_configure_header(const char *panic_str, ++ const struct pt_regs *regs) ++{ ++ int retval = 0; ++ ++ dump_config.dumper->header_len = lcrash_init_dump_header(panic_str); ++ ++ /* capture register states for all processors */ ++ dump_save_this_cpu(regs); ++ __dump_save_other_cpus(); /* side effect:silence cpus */ ++ ++ /* configure architecture-specific dump header values */ ++ if ((retval = __dump_configure_header(regs))) ++ return retval; ++ ++ dump_config.dumper->header_dirty++; ++ return 0; ++} ++/* save register and task context */ ++void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ /* This level of abstraction might be redundantly redundant */ ++ __dump_save_context(cpu, regs, tsk); ++} ++ ++/* write out the header */ ++int dump_write_header(void) ++{ ++ int retval = 0, size; ++ void *buf = dump_config.dumper->dump_buf; ++ ++ /* accounts for DUMP_HEADER_OFFSET if 
applicable */
++	if ((retval = dump_dev_seek(0))) {
++		printk("Unable to seek to dump header offset: %d\n",
++			retval);
++		return retval;
++	}
++
++	memcpy(buf, (void *)&dump_header, sizeof(dump_header));
++	size = sizeof(dump_header);
++	memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
++	size += sizeof(dump_header_asm);
++	size = PAGE_ALIGN(size);
++	retval = dump_ll_write(buf, size);
++
++	if (retval < size)
++		return (retval >= 0) ? -ENOSPC : retval;
++	return 0;
++}
++
++int dump_generic_update_header(void)
++{
++	int err = 0;
++
++	if (dump_config.dumper->header_dirty) {
++		if ((err = dump_write_header())) {
++			printk("dump write header failed! err %d\n", err);
++		} else {
++			dump_config.dumper->header_dirty = 0;
++		}
++	}
++
++	return err;
++}
++
++static inline int is_curr_stack_page(struct page *page, unsigned long size)
++{
++	unsigned long thread_addr = (unsigned long)current_thread_info();
++	unsigned long addr = (unsigned long)page_address(page);
++
++	return !PageHighMem(page) && (addr < thread_addr + THREAD_SIZE)
++		&& (addr + size > thread_addr);
++}
++
++static inline int is_dump_page(struct page *page, unsigned long size)
++{
++	unsigned long addr = (unsigned long)page_address(page);
++	unsigned long dump_buf = (unsigned long)dump_config.dumper->dump_buf;
++
++	return !PageHighMem(page) && (addr < dump_buf + DUMP_BUFFER_SIZE)
++		&& (addr + size > dump_buf);
++}
++
++int dump_allow_compress(struct page *page, unsigned long size)
++{
++	/*
++	 * Don't compress the page if any part of it overlaps
++	 * with the current stack or dump buffer (since the contents
++	 * in these could be changing while compression is going on)
++	 */
++	return !is_curr_stack_page(page, size) && !is_dump_page(page, size);
++}
++
++void lcrash_init_pageheader(struct __dump_page *dp, struct page *page,
++	unsigned long sz)
++{
++	memset(dp, 0, sizeof(struct __dump_page));
++	dp->dp_flags = 0;
++	dp->dp_size = 0;
++	if (sz > 0)
++		dp->dp_address = (loff_t)page_to_pfn(page) << PAGE_SHIFT;
++
++#if DUMP_DEBUG > 6
++	dp->dp_page_index = dump_header.dh_num_dump_pages;
++	dp->dp_byte_offset = dump_header.dh_num_bytes + DUMP_BUFFER_SIZE
++		+ DUMP_HEADER_OFFSET; /* ?? */
++#endif /* DUMP_DEBUG */
++}
++
++int dump_lcrash_add_data(unsigned long loc, unsigned long len)
++{
++	struct page *page = (struct page *)loc;
++	void *addr, *buf = dump_config.dumper->curr_buf;
++	struct __dump_page *dp = (struct __dump_page *)buf;
++	int bytes, size;
++
++	if (buf > dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE)
++		return -ENOMEM;
++
++	lcrash_init_pageheader(dp, page, len);
++	buf += sizeof(struct __dump_page);
++
++	while (len) {
++		addr = kmap_atomic(page, KM_DUMP);
++		size = bytes = (len > PAGE_SIZE) ? PAGE_SIZE : len;
++		/* check for compression */
++		if (dump_allow_compress(page, bytes)) {
++			size = dump_compress_data((char *)addr, bytes,
++				(char *)buf, loc);
++		}
++		/* set the compressed flag if the page did compress */
++		if (size && (size < bytes)) {
++			dp->dp_flags |= DUMP_DH_COMPRESSED;
++		} else {
++			/* compression failed -- default to raw mode */
++			dp->dp_flags |= DUMP_DH_RAW;
++			memcpy(buf, addr, bytes);
++			size = bytes;
++		}
++		/* memset(buf, 'A', size); temporary: testing only !!
*/ ++ kunmap_atomic(addr, KM_DUMP); ++ dp->dp_size += size; ++ buf += size; ++ len -= bytes; ++ page++; ++ } ++ ++ /* now update the header */ ++#if DUMP_DEBUG > 6 ++ dump_header.dh_num_bytes += dp->dp_size + sizeof(*dp); ++#endif ++ dump_header.dh_num_dump_pages++; ++ dump_config.dumper->header_dirty++; ++ ++ dump_config.dumper->curr_buf = buf; ++ ++ return len; ++} ++ ++int dump_lcrash_update_end_marker(void) ++{ ++ struct __dump_page *dp = ++ (struct __dump_page *)dump_config.dumper->curr_buf; ++ unsigned long left; ++ int ret = 0; ++ ++ lcrash_init_pageheader(dp, NULL, 0); ++ dp->dp_flags |= DUMP_DH_END; /* tbd: truncation test ? */ ++ ++ /* now update the header */ ++#if DUMP_DEBUG > 6 ++ dump_header.dh_num_bytes += sizeof(*dp); ++#endif ++ dump_config.dumper->curr_buf += sizeof(*dp); ++ left = dump_config.dumper->curr_buf - dump_config.dumper->dump_buf; ++ ++ printk("\n"); ++ ++ while (left) { ++ if ((ret = dump_dev_seek(dump_config.dumper->curr_offset))) { ++ printk("Seek failed at offset 0x%llx\n", ++ dump_config.dumper->curr_offset); ++ return ret; ++ } ++ ++ if (DUMP_BUFFER_SIZE > left) ++ memset(dump_config.dumper->curr_buf, 'm', ++ DUMP_BUFFER_SIZE - left); ++ ++ if ((ret = dump_ll_write(dump_config.dumper->dump_buf, ++ DUMP_BUFFER_SIZE)) < DUMP_BUFFER_SIZE) { ++ return (ret < 0) ? ret : -ENOSPC; ++ } ++ ++ dump_config.dumper->curr_offset += DUMP_BUFFER_SIZE; ++ ++ if (left > DUMP_BUFFER_SIZE) { ++ left -= DUMP_BUFFER_SIZE; ++ memcpy(dump_config.dumper->dump_buf, ++ dump_config.dumper->dump_buf + DUMP_BUFFER_SIZE, left); ++ dump_config.dumper->curr_buf -= DUMP_BUFFER_SIZE; ++ } else { ++ left = 0; ++ } ++ } ++ return 0; ++} ++ ++ ++/* Default Formatter (lcrash) */ ++struct dump_fmt_ops dump_fmt_lcrash_ops = { ++ .configure_header = dump_lcrash_configure_header, ++ .update_header = dump_generic_update_header, ++ .save_context = dump_lcrash_save_context, ++ .add_data = dump_lcrash_add_data, ++ .update_end_marker = dump_lcrash_update_end_marker ++}; ++ ++struct dump_fmt dump_fmt_lcrash = { ++ .name = "lcrash", ++ .ops = &dump_fmt_lcrash_ops ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_setup.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_setup.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_setup.c 2005-04-05 16:47:53.939205712 +0800 +@@ -0,0 +1,923 @@ ++/* ++ * Standard kernel function entry points for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2000 - 2002 TurboLinux, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 Free Software Foundation, Inc. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * ----------------------------------------------------------------------- ++ * ++ * DUMP HISTORY ++ * ++ * This dump code goes back to SGI's first attempts at dumping system ++ * memory on SGI systems running IRIX. A few developers at SGI needed ++ * a way to take this system dump and analyze it, and created 'icrash', ++ * or IRIX Crash. The mechanism (the dumps and 'icrash') were used ++ * by support people to generate crash reports when a system failure ++ * occurred. 
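For reference, the lcrash format written above is deliberately simple: after the page-aligned header block, each selected set of pages appears as one struct __dump_page record followed immediately by its raw or compressed data, and a record flagged DUMP_DH_END closes the stream. A minimal reader sketch follows; the record struct is only a stand-in for the real __dump_page layout, and read_at()/process_page() are hypothetical helpers, so treat this as an illustration of the framing, not as lcrash's actual code.

#include <stdint.h>

#define DUMP_DH_COMPRESSED 0x1	/* stand-in values: the real bits live */
#define DUMP_DH_RAW        0x2	/* in the LKCD headers, not this hunk  */
#define DUMP_DH_END        0x4

struct dump_page_rec {		/* stand-in for struct __dump_page */
	uint64_t dp_address;	/* first physical address of the set */
	uint64_t dp_size;	/* bytes of (possibly compressed) data */
	uint32_t dp_flags;	/* DUMP_DH_* bits set by the dumper */
};

extern int read_at(uint64_t off, void *buf, uint64_t len);	/* hypothetical I/O */
extern void process_page(const struct dump_page_rec *r, uint64_t data_off);

int walk_pages(uint64_t off)
{
	struct dump_page_rec r;

	for (;;) {
		if (read_at(off, &r, sizeof(r)) < 0)
			return -1;
		if (r.dp_flags & DUMP_DH_END)	/* end marker: dp_size == 0 */
			return 0;
		process_page(&r, off + sizeof(r));	/* data follows the record */
		off += sizeof(r) + r.dp_size;
	}
}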
This was vital for large system configurations that ++ * couldn't apply patch after patch after fix just to hope that the ++ * problems would go away. So the system memory, along with the crash ++ * dump analyzer, allowed support people to quickly figure out what the ++ * problem was on the system with the crash dump. ++ * ++ * In comes Linux. SGI started moving towards the open source community, ++ * and upon doing so, SGI wanted to take its support utilities into Linux ++ * with the hopes that they would end up in the kernel and user space to ++ * be used by SGI's customers buying SGI Linux systems. One of the first ++ * few products to be open sourced by SGI was LKCD, or Linux Kernel Crash ++ * Dumps. LKCD comprises a patch to the kernel to enable system ++ * dumping, along with 'lcrash', or Linux Crash, to analyze the system ++ * memory dump. A few additional system scripts and kernel modifications ++ * are also included to make the dump mechanism and dump data easier to ++ * process and use. ++ * ++ * As soon as LKCD was released into the open source community, a number ++ * of larger companies started to take advantage of it. Today, there are ++ * many community members who contribute to LKCD, and it continues to ++ * flourish and grow as an open source project. ++ */ ++ ++/* ++ * DUMP TUNABLES (read/write with ioctl, readonly with /proc) ++ * ++ * This is the list of system tunables (via /proc) that are available ++ * for Linux systems. All the read, write, etc., functions are listed ++ * here. Currently, there are a few different tunables for dumps: ++ * ++ * dump_device (used to be dumpdev): ++ * The device for dumping the memory pages out to. This ++ * may be set to the primary swap partition for disruptive dumps, ++ * and must be an unused partition for non-disruptive dumps. ++ * Todo: In the case of network dumps, this may be interpreted ++ * as the IP address of the netdump server to connect to. ++ * ++ * dump_compress (used to be dump_compress_pages): ++ * This is the flag which indicates which compression mechanism ++ * to use. This is a BITMASK, not an index (0,1,2,4,8,16,etc.). ++ * This is the current set of values: ++ * ++ * 0: DUMP_COMPRESS_NONE -- Don't compress any pages. ++ * 1: DUMP_COMPRESS_RLE -- This uses RLE compression. ++ * 2: DUMP_COMPRESS_GZIP -- This uses GZIP compression. ++ * ++ * dump_level: ++ * The amount of effort the dump module should make to save ++ * information for post crash analysis. This value is now ++ * a BITMASK value, not an index: ++ * ++ * 0: Do nothing, no dumping. (DUMP_LEVEL_NONE) ++ * ++ * 1: Print out the dump information to the dump header, and ++ * write it out to the dump_device. (DUMP_LEVEL_HEADER) ++ * ++ * 2: Write out the dump header and all kernel memory pages. ++ * (DUMP_LEVEL_KERN) ++ * ++ * 4: Write out the dump header and all kernel and user ++ * memory pages. (DUMP_LEVEL_USED) ++ * ++ * 8: Write out the dump header and all conventional/cached ++ * memory (RAM) pages in the system (kernel, user, free). ++ * (DUMP_LEVEL_ALL_RAM) ++ * ++ * 16: Write out everything, including non-conventional memory ++ * like firmware, proms, I/O registers, uncached memory. ++ * (DUMP_LEVEL_ALL) ++ * ++ * The dump_level will default to 1. ++ * ++ * dump_flags: ++ * These are the flags to use when talking about dumps. There ++ * are lots of possibilities. This is a BITMASK value, not an index.
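Note that while dump_level is stored as a bitmask, the DIOSDUMPLEVEL ioctl below still accepts the historical index values and folds them into cumulative mask bits with a cascading switch. The same conversion, pulled out as a sketch with the fall-throughs made explicit (macro names as used elsewhere in this patch):

/* Sketch: DUMP_LEVEL_* index -> cumulative DUMP_MASK_* bits, mirroring
 * the DIOSDUMPLEVEL case in dump_ioctl() below. */
static unsigned long level_to_mask(int level)
{
	unsigned long mask = 0;

	switch (level) {
	case DUMP_LEVEL_ALL:
	case DUMP_LEVEL_ALL_RAM:
		mask |= DUMP_MASK_UNUSED;	/* fall through */
	case DUMP_LEVEL_USED:
		mask |= DUMP_MASK_USED;		/* fall through */
	case DUMP_LEVEL_KERN:
		mask |= DUMP_MASK_KERN;		/* fall through */
	case DUMP_LEVEL_HEADER:
		mask |= DUMP_MASK_HEADER;	/* fall through */
	case DUMP_LEVEL_NONE:
		break;
	}
	return mask;
}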
++ * ++ * ----------------------------------------------------------------------- ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * ----------------------------------------------------------------------- ++ * V A R I A B L E S ++ * ----------------------------------------------------------------------- ++ */ ++ ++/* Dump tunables */ ++struct dump_config dump_config = { ++ .level = 0, ++ .flags = 0, ++ .dump_device = 0, ++ .dump_addr = 0, ++ .dumper = NULL ++}; ++#ifdef CONFIG_ARM ++static _dump_regs_t all_regs; ++#endif ++ ++/* Global variables used in dump.h */ ++/* degree of system freeze when dumping */ ++enum dump_silence_levels dump_silence_level = DUMP_HARD_SPIN_CPUS; ++ ++/* Other global fields */ ++extern struct __dump_header dump_header; ++struct dump_dev *dump_dev = NULL; /* Active dump device */ ++static int dump_compress = 0; ++ ++static u32 dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc); ++struct __dump_compress dump_none_compression = { ++ .compress_type = DUMP_COMPRESS_NONE, ++ .compress_func = dump_compress_none, ++ .compress_name = "none", ++}; ++ ++/* our device operations and functions */ ++static int dump_ioctl(struct inode *i, struct file *f, ++ unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_COMPAT ++static int dw_long(unsigned int, unsigned int, unsigned long, struct file*); ++#endif ++ ++static struct file_operations dump_fops = { ++ .owner = THIS_MODULE, ++ .ioctl = dump_ioctl, ++}; ++ ++static struct miscdevice dump_miscdev = { ++ .minor = CRASH_DUMP_MINOR, ++ .name = "dump", ++ .fops = &dump_fops, ++}; ++MODULE_ALIAS_MISCDEV(CRASH_DUMP_MINOR); ++ ++/* static variables */ ++static int dump_okay = 0; /* can we dump out to disk? */ ++static spinlock_t dump_lock = SPIN_LOCK_UNLOCKED; ++ ++/* used for dump compressors */ ++static struct list_head dump_compress_list = LIST_HEAD_INIT(dump_compress_list); ++ ++/* list of registered dump targets */ ++static struct list_head dump_target_list = LIST_HEAD_INIT(dump_target_list); ++ ++/* lkcd info structure -- this is used by lcrash for basic system data */ ++struct __lkcdinfo lkcdinfo = { ++ .ptrsz = (sizeof(void *) * 8), ++#if defined(__LITTLE_ENDIAN) ++ .byte_order = __LITTLE_ENDIAN, ++#else ++ .byte_order = __BIG_ENDIAN, ++#endif ++ .page_shift = PAGE_SHIFT, ++ .page_size = PAGE_SIZE, ++ .page_mask = PAGE_MASK, ++ .page_offset = PAGE_OFFSET, ++}; ++ ++/* ++ * ----------------------------------------------------------------------- ++ * / P R O C T U N A B L E F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++static int proc_dump_device(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++static int proc_doulonghex(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++/* ++ * sysctl-tuning infrastructure. 
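The nested ctl_table registration that follows publishes these tunables under /proc/sys/kernel/dump/ ("kernel" from kernel_root, "dump" from dump_root, entry names from the DUMP_*_NAME macros in the LKCD headers). A small userspace sketch of a read-side consumer; the "level" entry name is an assumption here, since the macro values are not part of this hunk:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/sys/kernel/dump/level", O_RDONLY);	/* assumed name */

	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);	/* hex text, e.g. "0x2\n" (see proc_doulonghex) */
	if (n > 0) {
		buf[n] = '\0';
		printf("dump level: %s", buf);
	}
	close(fd);
	return 0;
}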
++ */ ++static ctl_table dump_table[] = { ++ { .ctl_name = CTL_DUMP_LEVEL, ++ .procname = DUMP_LEVEL_NAME, ++ .data = &dump_config.level, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_doulonghex, }, ++ ++ { .ctl_name = CTL_DUMP_FLAGS, ++ .procname = DUMP_FLAGS_NAME, ++ .data = &dump_config.flags, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_doulonghex, }, ++ ++ { .ctl_name = CTL_DUMP_COMPRESS, ++ .procname = DUMP_COMPRESS_NAME, ++ .data = &dump_compress, /* FIXME */ ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = proc_dointvec, }, ++ ++ { .ctl_name = CTL_DUMP_DEVICE, ++ .procname = DUMP_DEVICE_NAME, ++ .mode = 0444, ++ .data = &dump_config.dump_device, /* FIXME */ ++ .maxlen = sizeof(int), ++ .proc_handler = proc_dump_device }, ++ ++#ifdef CONFIG_CRASH_DUMP_MEMDEV ++ { .ctl_name = CTL_DUMP_ADDR, ++ .procname = DUMP_ADDR_NAME, ++ .mode = 0444, ++ .data = &dump_config.dump_addr, ++ .maxlen = sizeof(unsigned long), ++ .proc_handler = proc_doulonghex }, ++#endif ++ ++ { 0, } ++}; ++ ++static ctl_table dump_root[] = { ++ { .ctl_name = KERN_DUMP, ++ .procname = "dump", ++ .mode = 0555, ++ .child = dump_table }, ++ { 0, } ++}; ++ ++static ctl_table kernel_root[] = { ++ { .ctl_name = CTL_KERN, ++ .procname = "kernel", ++ .mode = 0555, ++ .child = dump_root, }, ++ { 0, } ++}; ++ ++static struct ctl_table_header *sysctl_header; ++ ++/* ++ * ----------------------------------------------------------------------- ++ * C O M P R E S S I O N F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++/* ++ * Name: dump_compress_none() ++ * Func: Don't do any compression, period. ++ */ ++static u32 ++dump_compress_none(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ /* just return the old size */ ++ return oldsize; ++} ++ ++ ++/* ++ * Name: dump_execute() ++ * Func: Execute the dumping process. This makes sure all the appropriate ++ * fields are updated correctly, and calls dump_execute_memdump(), ++ * which does the real work. ++ */ ++void ++dump_execute(const char *panic_str, const struct pt_regs *regs) ++{ ++ int state = -1; ++ unsigned long flags; ++ ++ /* make sure we can dump */ ++ if (!dump_okay) { ++ pr_info("LKCD not yet configured, can't take dump now\n"); ++ return; ++ } ++ ++ /* Exclude multiple dumps at the same time, ++ * and disable interrupts, some drivers may re-enable ++ * interrupts in with silence() ++ * ++ * Try and acquire spin lock. If successful, leave preempt ++ * and interrupts disabled. See spin_lock_irqsave in spinlock.h ++ */ ++ local_irq_save(flags); ++ if (!spin_trylock(&dump_lock)) { ++ local_irq_restore(flags); ++ pr_info("LKCD dump already in progress\n"); ++ return; ++ } ++ ++ /* What state are interrupts really in? 
*/ ++ if (in_interrupt()){ ++ if(in_irq()) ++ printk(KERN_ALERT "Dumping from interrupt handler!\n"); ++ else ++ printk(KERN_ALERT "Dumping from bottom half!\n"); ++ ++ __dump_clean_irq_state(); ++ } ++ ++ ++ /* Bring system into the strictest level of quiescing for min drift ++ * dump drivers can soften this as required in dev->ops->silence() ++ */ ++ dump_oncpu = smp_processor_id() + 1; ++ dump_silence_level = DUMP_HARD_SPIN_CPUS; ++ ++ state = dump_generic_execute(panic_str, regs); ++ ++ dump_oncpu = 0; ++ spin_unlock_irqrestore(&dump_lock, flags); ++ ++ if (state < 0) { ++ printk("Dump Incomplete or failed!\n"); ++ } else { ++ printk("Dump Complete; %d dump pages saved.\n", ++ dump_header.dh_num_dump_pages); ++ } ++} ++ ++/* ++ * Name: dump_register_compression() ++ * Func: Register a dump compression mechanism. ++ */ ++void ++dump_register_compression(struct __dump_compress *item) ++{ ++ if (item) ++ list_add(&(item->list), &dump_compress_list); ++} ++ ++/* ++ * Name: dump_unregister_compression() ++ * Func: Remove a dump compression mechanism, and re-assign the dump ++ * compression pointer if necessary. ++ */ ++void ++dump_unregister_compression(int compression_type) ++{ ++ struct list_head *tmp; ++ struct __dump_compress *dc; ++ ++ /* let's make sure our list is valid */ ++ if (compression_type != DUMP_COMPRESS_NONE) { ++ list_for_each(tmp, &dump_compress_list) { ++ dc = list_entry(tmp, struct __dump_compress, list); ++ if (dc->compress_type == compression_type) { ++ list_del(&(dc->list)); ++ break; ++ } ++ } ++ } ++} ++ ++/* ++ * Name: dump_compress_init() ++ * Func: Initialize (or re-initialize) compression scheme. ++ */ ++static int ++dump_compress_init(int compression_type) ++{ ++ struct list_head *tmp; ++ struct __dump_compress *dc; ++ ++ /* try to remove the compression item */ ++ list_for_each(tmp, &dump_compress_list) { ++ dc = list_entry(tmp, struct __dump_compress, list); ++ if (dc->compress_type == compression_type) { ++ dump_config.dumper->compress = dc; ++ dump_compress = compression_type; ++ pr_debug("Dump Compress %s\n", dc->compress_name); ++ return 0; ++ } ++ } ++ ++ /* ++ * nothing on the list -- return ENODATA to indicate an error ++ * ++ * NB: ++ * EAGAIN: reports "Resource temporarily unavailable" which ++ * isn't very enlightening. ++ */ ++ printk("compression_type:%d not found\n", compression_type); ++ ++ return -ENODATA; ++} ++ ++static int ++dumper_setup(unsigned long flags, unsigned long devid) ++{ ++ int ret = 0; ++ ++ /* unconfigure old dumper if it exists */ ++ dump_okay = 0; ++ if (dump_config.dumper) { ++ pr_debug("Unconfiguring current dumper\n"); ++ dump_unconfigure(); ++ } ++ /* set up new dumper */ ++ if (dump_config.flags & DUMP_FLAGS_SOFTBOOT) { ++ printk("Configuring softboot based dump \n"); ++#ifdef CONFIG_CRASH_DUMP_MEMDEV ++ dump_config.dumper = &dumper_stage1; ++#else ++ printk("Requires CONFIG_CRASHDUMP_MEMDEV. 
Can't proceed.\n"); ++ return -1; ++#endif ++ } else { ++ dump_config.dumper = &dumper_singlestage; ++ } ++ dump_config.dumper->dev = dump_dev; ++ ++ ret = dump_configure(devid); ++ if (!ret) { ++ dump_okay = 1; ++ pr_debug("%s dumper set up for dev 0x%lx\n", ++ dump_config.dumper->name, devid); ++ dump_config.dump_device = devid; ++ } else { ++ printk("%s dumper set up failed for dev 0x%lx\n", ++ dump_config.dumper->name, devid); ++ dump_config.dumper = NULL; ++ } ++ return ret; ++} ++ ++static int ++dump_target_init(int target) ++{ ++ char type[20]; ++ struct list_head *tmp; ++ struct dump_dev *dev; ++ ++ switch (target) { ++ case DUMP_FLAGS_DISKDUMP: ++ strcpy(type, "blockdev"); break; ++ case DUMP_FLAGS_NETDUMP: ++ strcpy(type, "networkdev"); break; ++ default: ++ return -1; ++ } ++ ++ /* ++ * This is a bit stupid, generating strings from the flag ++ * and doing strcmp. This is done because 'struct dump_dev' ++ * has a string 'type_name' and not an integer 'type'. ++ */ ++ list_for_each(tmp, &dump_target_list) { ++ dev = list_entry(tmp, struct dump_dev, list); ++ if (strcmp(type, dev->type_name) == 0) { ++ dump_dev = dev; ++ return 0; ++ } ++ } ++ return -1; ++} ++ ++/* ++ * Name: dump_ioctl() ++ * Func: Allow all dump tunables through a standard ioctl() mechanism. ++ * This is far better than before, where we'd go through /proc, ++ * because now this will work for multiple OSes and architectures. ++ */ ++static int ++dump_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg) ++{ ++ /* check capabilities */ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (!dump_config.dumper && cmd == DIOSDUMPCOMPRESS) ++ /* dump device must be configured first */ ++ return -ENODEV; ++ ++ /* ++ * This is the main mechanism for controlling get/set data ++ * for various dump device parameters. The real trick here ++ * is setting the dump device (DIOSDUMPDEV). That's what ++ * triggers everything else.
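From user space the whole interface is driven through the misc device registered above (i.e. /dev/dump). A sketch of the typical configuration sequence; the DIOSxxx numbers come from the LKCD ioctl header, which is not part of this hunk, and the device must be opened O_RDWR because every "set" command checks f_flags:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
/* plus the LKCD header that defines DIOSDUMPDEV and friends */

static int configure_dump(unsigned long devid, unsigned long level)
{
	int ret = -1;
	int fd = open("/dev/dump", O_RDWR);	/* set ioctls require O_RDWR */

	if (fd < 0)
		return -1;
	if (ioctl(fd, DIOSDUMPDEV, devid) == 0 &&	/* triggers dumper_setup() */
	    ioctl(fd, DIOSDUMPLEVEL, level) == 0)	/* index, folded to mask bits */
		ret = 0;
	close(fd);
	return ret;
}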
++ */ ++ switch (cmd) { ++ case DIOSDUMPDEV: /* set dump_device */ ++ pr_debug("Configuring dump device\n"); ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ __dump_open(); ++ return dumper_setup(dump_config.flags, arg); ++ ++ ++ case DIOGDUMPDEV: /* get dump_device */ ++ return put_user((long)dump_config.dump_device, (long *)arg); ++ ++ case DIOSDUMPLEVEL: /* set dump_level */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ /* make sure we have a positive value */ ++ if (arg < 0) ++ return -EINVAL; ++ ++ /* Fixme: clean this up */ ++ dump_config.level = 0; ++ switch ((int)arg) { ++ case DUMP_LEVEL_ALL: ++ case DUMP_LEVEL_ALL_RAM: ++ dump_config.level |= DUMP_MASK_UNUSED; ++ case DUMP_LEVEL_USED: ++ dump_config.level |= DUMP_MASK_USED; ++ case DUMP_LEVEL_KERN: ++ dump_config.level |= DUMP_MASK_KERN; ++ case DUMP_LEVEL_HEADER: ++ dump_config.level |= DUMP_MASK_HEADER; ++ case DUMP_LEVEL_NONE: ++ break; ++ default: ++ return (-EINVAL); ++ } ++ pr_debug("Dump Level 0x%lx\n", dump_config.level); ++ break; ++ ++ case DIOGDUMPLEVEL: /* get dump_level */ ++ /* fixme: handle conversion */ ++ return put_user((long)dump_config.level, (long *)arg); ++ ++ ++ case DIOSDUMPFLAGS: /* set dump_flags */ ++ /* check flags */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ /* make sure we have a positive value */ ++ if (arg < 0) ++ return -EINVAL; ++ ++ if (dump_target_init(arg & DUMP_FLAGS_TARGETMASK) < 0) ++ return -EINVAL; /* return proper error */ ++ ++ dump_config.flags = arg; ++ ++ pr_debug("Dump Flags 0x%lx\n", dump_config.flags); ++ break; ++ ++ case DIOGDUMPFLAGS: /* get dump_flags */ ++ return put_user((long)dump_config.flags, (long *)arg); ++ ++ case DIOSDUMPCOMPRESS: /* set the dump_compress status */ ++ if (!(f->f_flags & O_RDWR)) ++ return -EPERM; ++ ++ return dump_compress_init((int)arg); ++ ++ case DIOGDUMPCOMPRESS: /* get the dump_compress status */ ++ return put_user((long)(dump_config.dumper ? ++ dump_config.dumper->compress->compress_type : 0), ++ (long *)arg); ++ case DIOGDUMPOKAY: /* check if dump is configured */ ++ return put_user((long)dump_okay, (long *)arg); ++ ++ case DIOSDUMPTAKE: /* Trigger a manual dump */ ++ /* Do not proceed if lkcd not yet configured */ ++ if(!dump_okay) { ++ printk("LKCD not yet configured. Cannot take manual dump\n"); ++ return -ENODEV; ++ } ++ ++ /* Take the dump */ ++ return manual_handle_crashdump(); ++ ++ default: ++ /* ++ * these are network dump specific ioctls, let the ++ * module handle them. ++ */ ++ return dump_dev_ioctl(cmd, arg); ++ } ++ return 0; ++} ++ ++/* ++ * Handle special cases for dump_device ++ * changing dump device requires doing an opening the device ++ */ ++static int ++proc_dump_device(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int *valp = ctl->data; ++ int oval = *valp; ++ int ret = -EPERM; ++ ++ /* same permission checks as ioctl */ ++ if (capable(CAP_SYS_ADMIN)) { ++ ret = proc_doulonghex(ctl, write, f, buffer, lenp, ppos); ++ if (ret == 0 && write && *valp != oval) { ++ /* need to restore old value to close properly */ ++ dump_config.dump_device = (dev_t) oval; ++ __dump_open(); ++ ret = dumper_setup(dump_config.flags, (dev_t) *valp); ++ } ++ } ++ ++ return ret; ++} ++ ++/* All for the want of a proc_do_xxx routine which prints values in hex */ ++/* Write is not implemented correctly, so mode is set to 0444 above. 
*/ ++static int ++proc_doulonghex(ctl_table *ctl, int write, struct file *f, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++#define TMPBUFLEN 21 ++ unsigned long *i; ++ size_t len, left; ++ char buf[TMPBUFLEN]; ++ ++ if (!ctl->data || !ctl->maxlen || !*lenp || (*ppos && !write)) { ++ *lenp = 0; ++ return 0; ++ } ++ ++ i = (unsigned long *) ctl->data; ++ left = *lenp; ++ ++ sprintf(buf, "0x%lx\n", (*i)); ++ len = strlen(buf); ++ if (len > left) ++ len = left; ++ if(copy_to_user(buffer, buf, len)) ++ return -EFAULT; ++ ++ left -= len; ++ *lenp -= left; ++ *ppos += *lenp; ++ return 0; ++} ++ ++/* ++ * ----------------------------------------------------------------------- ++ * I N I T F U N C T I O N S ++ * ----------------------------------------------------------------------- ++ */ ++ ++#ifdef CONFIG_COMPAT ++static int dw_long(unsigned int fd, unsigned int cmd, unsigned long arg, ++ struct file *f) ++{ ++ mm_segment_t old_fs = get_fs(); ++ int err; ++ unsigned long val; ++ ++ set_fs (KERNEL_DS); ++ err = sys_ioctl(fd, cmd, (u64)&val); ++ set_fs (old_fs); ++ if (!err && put_user((unsigned int) val, (u32 *)arg)) ++ return -EFAULT; ++ return err; ++} ++#endif ++ ++/* ++ * These register and unregister routines are exported for modules ++ * to register their dump drivers (like block, net etc) ++ */ ++int ++dump_register_device(struct dump_dev *ddev) ++{ ++ struct list_head *tmp; ++ struct dump_dev *dev; ++ ++ list_for_each(tmp, &dump_target_list) { ++ dev = list_entry(tmp, struct dump_dev, list); ++ if (strcmp(ddev->type_name, dev->type_name) == 0) { ++ printk("Target type %s already registered\n", ++ dev->type_name); ++ return -1; /* return proper error */ ++ } ++ } ++ list_add(&(ddev->list), &dump_target_list); ++ ++ return 0; ++} ++ ++void ++dump_unregister_device(struct dump_dev *ddev) ++{ ++ list_del(&(ddev->list)); ++ if (ddev != dump_dev) ++ return; ++ ++ dump_okay = 0; ++ ++ if (dump_config.dumper) ++ dump_unconfigure(); ++ ++ dump_config.flags &= ~DUMP_FLAGS_TARGETMASK; ++ dump_okay = 0; ++ dump_dev = NULL; ++ dump_config.dumper = NULL; ++} ++ ++static int panic_event(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++#ifdef CONFIG_ARM ++ get_current_general_regs(&all_regs); ++ get_current_cp14_regs(&all_regs); ++ get_current_cp15_regs(&all_regs); ++ dump_execute((const char *)ptr, &all_regs); ++#else ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute((const char *)ptr, ®s); ++#endif ++ return 0; ++} ++ ++extern struct notifier_block *panic_notifier_list; ++static int panic_event(struct notifier_block *, unsigned long, void *); ++static struct notifier_block panic_block = { ++ .notifier_call = panic_event, ++}; ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++/* Sysrq handler */ ++static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs, ++ struct tty_struct *tty) { ++ if(!pt_regs) { ++ struct pt_regs regs; ++ get_current_regs(®s); ++ dump_execute("sysrq", ®s); ++ ++ } else { ++ dump_execute("sysrq", pt_regs); ++ } ++} ++ ++static struct sysrq_key_op sysrq_crashdump_op = { ++ .handler = sysrq_handle_crashdump, ++ .help_msg = "Dump", ++ .action_msg = "Starting crash dump", ++}; ++#endif ++ ++static inline void ++dump_sysrq_register(void) ++{ ++#ifdef CONFIG_MAGIC_SYSRQ ++ register_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); ++#endif ++} ++ ++static inline void ++dump_sysrq_unregister(void) ++{ ++#ifdef CONFIG_MAGIC_SYSRQ ++ unregister_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); ++#endif ++} ++ ++/* ++ * Name: dump_init() ++ * Func: 
Initialize the dump process. This will set up any architecture ++ * dependent code. The big key is that we need the memory offsets before ++ * the page table is initialized, because the base memory offset ++ * is changed after paging_init() is called. ++ */ ++static int __init ++dump_init(void) ++{ ++ struct sysinfo info; ++ int err; ++ ++ /* try to create our dump device */ ++ err = misc_register(&dump_miscdev); ++ if (err) { ++ printk("cannot register dump character device!\n"); ++ return err; ++ } ++ ++ __dump_init((u64)PAGE_OFFSET); ++ ++#ifdef CONFIG_COMPAT ++ err = register_ioctl32_conversion(DIOSDUMPDEV, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPDEV, NULL); ++ err |= register_ioctl32_conversion(DIOSDUMPLEVEL, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPLEVEL, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPFLAGS, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPFLAGS, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPCOMPRESS, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPCOMPRESS, dw_long); ++ err |= register_ioctl32_conversion(DIOSTARGETIP, NULL); ++ err |= register_ioctl32_conversion(DIOGTARGETIP, NULL); ++ err |= register_ioctl32_conversion(DIOSTARGETPORT, NULL); ++ err |= register_ioctl32_conversion(DIOGTARGETPORT, NULL); ++ err |= register_ioctl32_conversion(DIOSSOURCEPORT, NULL); ++ err |= register_ioctl32_conversion(DIOGSOURCEPORT, NULL); ++ err |= register_ioctl32_conversion(DIOSETHADDR, NULL); ++ err |= register_ioctl32_conversion(DIOGETHADDR, NULL); ++ err |= register_ioctl32_conversion(DIOGDUMPOKAY, dw_long); ++ err |= register_ioctl32_conversion(DIOSDUMPTAKE, NULL); ++ if (err) { ++ printk(KERN_ERR "LKCD: registering ioctl32 translations failed\n"); ++ } ++#endif ++ /* set the dump_compression_list structure up */ ++ dump_register_compression(&dump_none_compression); ++ ++ /* grab the total memory size now (not if/when we crash) */ ++ si_meminfo(&info); ++ ++ /* set the memory size */ ++ dump_header.dh_memory_size = (u64)info.totalram; ++ ++ sysctl_header = register_sysctl_table(kernel_root, 0); ++ dump_sysrq_register(); ++ ++ notifier_chain_register(&panic_notifier_list, &panic_block); ++ dump_function_ptr = dump_execute; ++ ++ pr_info("Crash dump driver initialized.\n"); ++ return 0; ++} ++ ++static void __exit ++dump_cleanup(void) ++{ ++ int err; ++ dump_okay = 0; ++ ++ if (dump_config.dumper) ++ dump_unconfigure(); ++ ++ /* arch-specific cleanup routine */ ++ __dump_cleanup(); ++ ++#ifdef CONFIG_COMPAT ++ err = unregister_ioctl32_conversion(DIOSDUMPDEV); ++ err |= unregister_ioctl32_conversion(DIOGDUMPDEV); ++ err |= unregister_ioctl32_conversion(DIOSDUMPLEVEL); ++ err |= unregister_ioctl32_conversion(DIOGDUMPLEVEL); ++ err |= unregister_ioctl32_conversion(DIOSDUMPFLAGS); ++ err |= unregister_ioctl32_conversion(DIOGDUMPFLAGS); ++ err |= unregister_ioctl32_conversion(DIOSDUMPCOMPRESS); ++ err |= unregister_ioctl32_conversion(DIOGDUMPCOMPRESS); ++ err |= unregister_ioctl32_conversion(DIOSTARGETIP); ++ err |= unregister_ioctl32_conversion(DIOGTARGETIP); ++ err |= unregister_ioctl32_conversion(DIOSTARGETPORT); ++ err |= unregister_ioctl32_conversion(DIOGTARGETPORT); ++ err |= unregister_ioctl32_conversion(DIOSSOURCEPORT); ++ err |= unregister_ioctl32_conversion(DIOGSOURCEPORT); ++ err |= unregister_ioctl32_conversion(DIOSETHADDR); ++ err |= unregister_ioctl32_conversion(DIOGETHADDR); ++ err |= unregister_ioctl32_conversion(DIOGDUMPOKAY); ++ err |= unregister_ioctl32_conversion(DIOSDUMPTAKE); ++ if (err) { ++ printk(KERN_ERR
"LKCD: Unregistering ioctl32 translations failed\n"); ++ } ++#endif ++ ++ /* ignore errors while unregistering -- since can't do anything */ ++ unregister_sysctl_table(sysctl_header); ++ misc_deregister(&dump_miscdev); ++ dump_sysrq_unregister(); ++ notifier_chain_unregister(&panic_notifier_list, &panic_block); ++ dump_function_ptr = NULL; ++} ++ ++EXPORT_SYMBOL(dump_register_compression); ++EXPORT_SYMBOL(dump_unregister_compression); ++EXPORT_SYMBOL(dump_register_device); ++EXPORT_SYMBOL(dump_unregister_device); ++EXPORT_SYMBOL(dump_config); ++EXPORT_SYMBOL(dump_silence_level); ++ ++EXPORT_SYMBOL(__dump_irq_enable); ++EXPORT_SYMBOL(__dump_irq_restore); ++ ++MODULE_AUTHOR("Matt D. Robinson "); ++MODULE_DESCRIPTION("Linux Kernel Crash Dump (LKCD) driver"); ++MODULE_LICENSE("GPL"); ++ ++module_init(dump_init); ++module_exit(dump_cleanup); +Index: linux-2.6.10/drivers/dump/dump_scheme.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_scheme.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_scheme.c 2005-04-05 16:47:53.944204952 +0800 +@@ -0,0 +1,430 @@ ++/* ++ * Default single stage dump scheme methods ++ * ++ * Previously a part of dump_base.c ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote LKCD dump scheme to generic dump method ++ * interfaces ++ * Derived from original code created by ++ * Matt Robinson ) ++ * ++ * Contributions from SGI, IBM, HP, MCL, and others. ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * Implements the default dump scheme, i.e. single-stage gathering and ++ * saving of dump data directly to the target device, which operates in ++ * a push mode, where the dumping system decides what data it saves ++ * taking into account pre-specified dump config options. ++ * ++ * Aside: The 2-stage dump scheme, where there is a soft-reset between ++ * the gathering and saving phases, also reuses some of these ++ * default routines (see dump_overlay.c) ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++extern int panic_timeout; /* time before reboot */ ++ ++extern void dump_speedo(int); ++ ++/* Default sequencer used during single stage dumping */ ++/* Also invoked during stage 2 of soft-boot based dumping */ ++int dump_generic_sequencer(void) ++{ ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ int pass = 0, err = 0, save = 0; ++ int (*action)(unsigned long, unsigned long); ++ ++ /* ++ * We want to save the more critical data areas first in ++ * case we run out of space, encounter i/o failures, or get ++ * interrupted otherwise and have to give up midway ++ * So, run through the passes in increasing order ++ */ ++ for (;filter->selector; filter++, pass++) ++ { ++ /* Assumes passes are exclusive (even across dumpers) */ ++ /* Requires care when coding the selection functions */ ++ if ((save = filter->level_mask & dump_config.level)) ++ action = dump_save_data; ++ else ++ action = dump_skip_data; ++ ++ if ((err = dump_iterator(pass, action, filter)) < 0) ++ break; ++ ++ printk("\n %d dump pages %s of %d each in pass %d\n", ++ err, save ? "saved" : "skipped", (int)DUMP_PAGE_SIZE, pass); ++ ++ } ++ ++ return (err < 0) ? 
err : 0; ++} ++ ++static inline struct page *dump_get_page(loff_t loc) ++{ ++ ++ unsigned long page_index = loc >> PAGE_SHIFT; ++ ++ /* todo: complete this to account for ia64/discontig mem */ ++ /* todo: and to check for validity, ram page, no i/o mem etc */ ++ /* need to use pfn/physaddr equiv of kern_addr_valid */ ++ ++ /* Important: ++ * On ARM/XScale system, the physical address starts from ++ * PHYS_OFFSET, and it maybe the situation that PHYS_OFFSET != 0. ++ * For example on Intel's PXA250, PHYS_OFFSET = 0xa0000000. And the ++ * page index starts from PHYS_PFN_OFFSET. When configuring ++ * filter, filter->start is assigned to 0 in dump_generic_configure. ++ * Here we want to adjust it by adding PHYS_PFN_OFFSET to it! ++ */ ++#ifdef CONFIG_ARM ++ page_index += PHYS_PFN_OFFSET; ++#endif ++ if (__dump_page_valid(page_index)) ++ return pfn_to_page(page_index); ++ else ++ return NULL; ++ ++} ++ ++/* Default iterator: for singlestage and stage 1 of soft-boot dumping */ ++/* Iterates over range of physical memory pages in DUMP_PAGE_SIZE increments */ ++int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned long), ++ struct dump_data_filter *filter) ++{ ++ /* Todo : fix unit, type */ ++ loff_t loc, start, end; ++ int i, count = 0, err = 0; ++ struct page *page; ++ ++ /* Todo: Add membanks code */ ++ /* TBD: Check if we need to address DUMP_PAGE_SIZE < PAGE_SIZE */ ++ ++ for (i = 0; i < filter->num_mbanks; i++) { ++ start = filter->start[i]; ++ end = filter->end[i]; ++ for (loc = start; loc < end; loc += DUMP_PAGE_SIZE) { ++ dump_config.dumper->curr_loc = loc; ++ page = dump_get_page(loc); ++ if (page && filter->selector(pass, ++ (unsigned long) page, DUMP_PAGE_SIZE)) { ++ if ((err = action((unsigned long)page, ++ DUMP_PAGE_SIZE))) { ++ printk("dump_page_iterator: err %d for " ++ "loc 0x%llx, in pass %d\n", ++ err, loc, pass); ++ return err ? err : count; ++ } else ++ count++; ++ } ++ } ++ } ++ ++ return err ? err : count; ++} ++ ++/* ++ * Base function that saves the selected block of data in the dump ++ * Action taken when iterator decides that data needs to be saved ++ */ ++int dump_generic_save_data(unsigned long loc, unsigned long sz) ++{ ++ void *buf; ++ void *dump_buf = dump_config.dumper->dump_buf; ++ int left, bytes, ret; ++ ++ if ((ret = dump_add_data(loc, sz))) { ++ return ret; ++ } ++ buf = dump_config.dumper->curr_buf; ++ ++ /* If we've filled up the buffer write it out */ ++ if ((left = buf - dump_buf) >= DUMP_BUFFER_SIZE) { ++ bytes = dump_write_buffer(dump_buf, DUMP_BUFFER_SIZE); ++ if (bytes < DUMP_BUFFER_SIZE) { ++ printk("dump_write_buffer failed %d\n", bytes); ++ return bytes ? 
-ENOSPC : bytes; ++ } ++ ++ left -= bytes; ++ ++ /* -- A few chores to do from time to time -- */ ++ dump_config.dumper->count++; ++ ++ if (!(dump_config.dumper->count & 0x3f)) { ++ /* Update the header every one in a while */ ++ memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE); ++ if ((ret = dump_update_header()) < 0) { ++ /* issue warning */ ++ return ret; ++ } ++ printk("."); ++ ++ touch_nmi_watchdog(); ++ } else if (!(dump_config.dumper->count & 0x7)) { ++ /* Show progress so the user knows we aren't hung */ ++ dump_speedo(dump_config.dumper->count >> 3); ++ } ++ /* Todo: Touch/Refresh watchdog */ ++ ++ /* --- Done with periodic chores -- */ ++ ++ /* ++ * extra bit of copying to simplify verification ++ * in the second kernel boot based scheme ++ */ ++ memcpy(dump_buf - DUMP_PAGE_SIZE, dump_buf + ++ DUMP_BUFFER_SIZE - DUMP_PAGE_SIZE, DUMP_PAGE_SIZE); ++ ++ /* now adjust the leftover bits back to the top of the page */ ++ /* this case would not arise during stage 2 (passthru) */ ++ memset(dump_buf, 'z', DUMP_BUFFER_SIZE); ++ if (left) { ++ memcpy(dump_buf, dump_buf + DUMP_BUFFER_SIZE, left); ++ } ++ buf -= DUMP_BUFFER_SIZE; ++ dump_config.dumper->curr_buf = buf; ++ } ++ ++ return 0; ++} ++ ++int dump_generic_skip_data(unsigned long loc, unsigned long sz) ++{ ++ /* dummy by default */ ++ return 0; ++} ++ ++/* ++ * Common low level routine to write a buffer to current dump device ++ * Expects checks for space etc to have been taken care of by the caller ++ * Operates serially at the moment for simplicity. ++ * TBD/Todo: Consider batching for improved throughput ++ */ ++int dump_ll_write(void *buf, unsigned long len) ++{ ++ long transferred = 0, last_transfer = 0; ++ int ret = 0; ++ ++ /* make sure device is ready */ ++ while ((ret = dump_dev_ready(NULL)) == -EAGAIN); ++ if (ret < 0) { ++ printk("dump_dev_ready failed !err %d\n", ret); ++ return ret; ++ } ++ ++ while (len) { ++ if ((last_transfer = dump_dev_write(buf, len)) <= 0) { ++ ret = last_transfer; ++ printk("dump_dev_write failed !err %d\n", ++ ret); ++ break; ++ } ++ /* wait till complete */ ++ while ((ret = dump_dev_ready(buf)) == -EAGAIN) ++ cpu_relax(); ++ ++ if (ret < 0) { ++ printk("i/o failed !err %d\n", ret); ++ break; ++ } ++ ++ len -= last_transfer; ++ buf += last_transfer; ++ transferred += last_transfer; ++ } ++ return (ret < 0) ? ret : transferred; ++} ++ ++/* default writeout routine for single dump device */ ++/* writes out the dump data ensuring enough space is left for the end marker */ ++int dump_generic_write_buffer(void *buf, unsigned long len) ++{ ++ long written = 0; ++ int err = 0; ++ ++ /* check for space */ ++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset + len + ++ 2*DUMP_BUFFER_SIZE)) < 0) { ++ printk("dump_write_buffer: insuff space after offset 0x%llx\n", ++ dump_config.dumper->curr_offset); ++ return err; ++ } ++ /* alignment check would happen as a side effect of this */ ++ if ((err = dump_dev_seek(dump_config.dumper->curr_offset)) < 0) ++ return err; ++ ++ written = dump_ll_write(buf, len); ++ ++ /* all or none */ ++ ++ if (written < len) ++ written = written ? 
-ENOSPC : written; ++ else ++ dump_config.dumper->curr_offset += len; ++ ++ return written; ++} ++ ++int dump_generic_configure(unsigned long devid) ++{ ++ struct dump_dev *dev = dump_config.dumper->dev; ++ struct dump_data_filter *filter; ++ void *buf; ++ int ret = 0; ++ ++ /* Allocate the dump buffer and initialize dumper state */ ++ /* Assume that we get aligned addresses */ ++ if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE))) ++ return -ENOMEM; ++ ++ if ((unsigned long)buf & (PAGE_SIZE - 1)) { ++ /* sanity check for page aligned address */ ++ dump_free_mem(buf); ++ return -ENOMEM; /* fixme: better error code */ ++ } ++ ++ /* Initialize the rest of the fields */ ++ dump_config.dumper->dump_buf = buf + DUMP_PAGE_SIZE; ++ dumper_reset(); ++ ++ /* Open the dump device */ ++ if (!dev) ++ return -ENODEV; ++ ++ if ((ret = dev->ops->open(dev, devid))) { ++ return ret; ++ } ++ ++ /* Initialise the memory ranges in the dump filter */ ++ for (filter = dump_config.dumper->filter ;filter->selector; filter++) { ++ if (!filter->start[0] && !filter->end[0]) { ++ pg_data_t *pgdat; ++ int i = 0; ++ for_each_pgdat(pgdat) { ++ filter->start[i] = ++ (loff_t)pgdat->node_start_pfn << PAGE_SHIFT; ++ filter->end[i] = ++ (loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT; ++ i++; ++ } ++ filter->num_mbanks = i; ++ } ++ } ++ ++ return 0; ++} ++ ++int dump_generic_unconfigure(void) ++{ ++ struct dump_dev *dev = dump_config.dumper->dev; ++ void *buf = dump_config.dumper->dump_buf; ++ int ret = 0; ++ ++ pr_debug("Generic unconfigure\n"); ++ /* Close the dump device */ ++ if (dev && (ret = dev->ops->release(dev))) ++ return ret; ++ ++ printk("Closed dump device\n"); ++ ++ if (buf) ++ dump_free_mem((buf - DUMP_PAGE_SIZE)); ++ ++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL; ++ pr_debug("Released dump buffer\n"); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_DISCONTIGMEM ++ ++void dump_reconfigure_mbanks(void) ++{ ++ pg_data_t *pgdat; ++ loff_t start, end, loc, loc_end; ++ int i=0; ++ struct dump_data_filter *filter = dump_config.dumper->filter; ++ ++ for_each_pgdat(pgdat) { ++ ++ start = (loff_t)(pgdat->node_start_pfn << PAGE_SHIFT); ++ end = ((loff_t)(pgdat->node_start_pfn + pgdat->node_spanned_pages) << PAGE_SHIFT); ++ for(loc = start; loc < end; loc += (DUMP_PAGE_SIZE)) { ++ ++ if(!(__dump_page_valid(loc >> PAGE_SHIFT))) ++ continue; ++ ++ /* We found a valid page. 
This is the start */ ++ filter->start[i] = loc; ++ ++ /* Now loop here till you find the end */ ++ for(loc_end = loc; loc_end < end; loc_end += (DUMP_PAGE_SIZE)) { ++ ++ if(__dump_page_valid(loc_end >> PAGE_SHIFT)) { ++ /* This page could very well be the last page */ ++ filter->end[i] = loc_end; ++ continue; ++ } ++ break; ++ } ++ i++; ++ loc = loc_end; ++ } ++ } ++ filter->num_mbanks = i; ++ ++ /* Propagate memory bank information to other filters */ ++ for (filter = dump_config.dumper->filter, filter++ ;filter->selector; filter++) { ++ for(i = 0; i < dump_config.dumper->filter->num_mbanks; i++) { ++ filter->start[i] = dump_config.dumper->filter->start[i]; ++ filter->end[i] = dump_config.dumper->filter->end[i]; ++ filter->num_mbanks = dump_config.dumper->filter->num_mbanks; ++ } ++ } ++} ++#endif ++ ++/* Set up the default dump scheme */ ++ ++struct dump_scheme_ops dump_scheme_singlestage_ops = { ++ .configure = dump_generic_configure, ++ .unconfigure = dump_generic_unconfigure, ++ .sequencer = dump_generic_sequencer, ++ .iterator = dump_page_iterator, ++ .save_data = dump_generic_save_data, ++ .skip_data = dump_generic_skip_data, ++ .write_buffer = dump_generic_write_buffer, ++}; ++ ++struct dump_scheme dump_scheme_singlestage = { ++ .name = "single-stage", ++ .ops = &dump_scheme_singlestage_ops ++}; ++ ++/* The single stage dumper comprising all these */ ++struct dumper dumper_singlestage = { ++ .name = "single-stage", ++ .scheme = &dump_scheme_singlestage, ++ .fmt = &dump_fmt_lcrash, ++ .compress = &dump_none_compression, ++ .filter = dump_filter_table, ++ .dev = NULL, ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_gzip.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_gzip.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_gzip.c 2005-04-05 16:47:53.937206016 +0800 +@@ -0,0 +1,174 @@ ++/* ++ * GZIP Compression functions for kernel crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sourceforge.net) ++ * Copyright 2001 Matt D. Robinson. All rights reserved. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* header files */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void *deflate_workspace; ++static unsigned long workspace_paddr[2]; ++ ++static u8 *safety_buffer; ++ ++/* ++ * Name: dump_compress_gzip() ++ * Func: Compress a DUMP_PAGE_SIZE page using gzip-style algorithms (the. ++ * deflate functions similar to what's used in PPP). ++ */ ++static u32 ++dump_compress_gzip(const u8 *old, u32 oldsize, u8 *new, u32 newsize, ++ unsigned long loc) ++{ ++ /* error code and dump stream */ ++ int err; ++ z_stream dump_stream; ++ struct page *pg = (struct page *)loc; ++ unsigned long paddr = page_to_pfn(pg) << PAGE_SHIFT; ++ static int warning = 0; ++ ++ dump_stream.workspace = deflate_workspace; ++ if ((paddr == workspace_paddr[0]) || (paddr == workspace_paddr[1])) { ++ /* ++ * This page belongs to deflate_workspace used as temporary ++ * buffer for compression. Hence, dump them without compression. 
++ */ ++ return(0); ++ } ++ if ((err = zlib_deflateInit(&dump_stream, Z_BEST_COMPRESSION)) != Z_OK) { ++ /* fall back to RLE compression */ ++ printk("dump_compress_gzip(): zlib_deflateInit() " ++ "failed (%d)!\n", err); ++ return 0; ++ } ++ ++ /* copy the old page to the safety buffer */ ++ if (oldsize <= DUMP_PAGE_SIZE) { ++ memcpy(safety_buffer, old, oldsize); ++ dump_stream.next_in = (u8 *) safety_buffer; ++ } else { ++ if (!warning) { ++ printk("dump_compress_gzip oversize input: %d\n", ++ oldsize); ++ warning++; ++ } ++ dump_stream.next_in = (u8 *) old; ++ } ++ ++ /* use old (page of memory) and size (DUMP_PAGE_SIZE) as in-streams */ ++ dump_stream.avail_in = oldsize; ++ ++ /* out streams are new (dpcpage) and new size (DUMP_DPC_PAGE_SIZE) */ ++ dump_stream.next_out = new; ++ dump_stream.avail_out = newsize; ++ ++ /* deflate the page -- check for error */ ++ err = zlib_deflate(&dump_stream, Z_FINISH); ++ if (err != Z_STREAM_END) { ++ /* zero is return code here */ ++ (void)zlib_deflateEnd(&dump_stream); ++ printk("dump_compress_gzip(): zlib_deflate() failed (%d)!\n", ++ err); ++ return 0; ++ } ++ ++ /* let's end the deflated compression stream */ ++ if ((err = zlib_deflateEnd(&dump_stream)) != Z_OK) { ++ printk("dump_compress_gzip(): zlib_deflateEnd() " ++ "failed (%d)!\n", err); ++ } ++ ++ /* return the compressed byte total (if it's smaller) */ ++ if (dump_stream.total_out >= oldsize) { ++ return oldsize; ++ } ++ return dump_stream.total_out; ++} ++ ++/* setup the gzip compression functionality */ ++static struct __dump_compress dump_gzip_compression = { ++ .compress_type = DUMP_COMPRESS_GZIP, ++ .compress_func = dump_compress_gzip, ++ .compress_name = "GZIP", ++}; ++ ++/* ++ * Name: dump_compress_gzip_init() ++ * Func: Initialize gzip as a compression mechanism. ++ */ ++static int __init ++dump_compress_gzip_init(void) ++{ ++ struct page *pg; ++ ++ deflate_workspace = vmalloc(zlib_deflate_workspacesize()); ++ if (!deflate_workspace) { ++ printk("dump_compress_gzip_init(): Failed to " ++ "alloc %d bytes for deflate workspace\n", ++ zlib_deflate_workspacesize()); ++ return -ENOMEM; ++ } ++ /* ++ * Need to find the (workspace) pages that are used for compression. ++ * Even though zlib_deflate_workspacesize() is roughly 64 pages ++ * (the exact size depends on the arch), only the first 2 pages are ++ * used here. Hence, record the physical addresses of these 2 pages ++ * and use them to avoid compressing those pages. ++ */ ++ pg = vmalloc_to_page(deflate_workspace); ++ workspace_paddr[0] = page_to_pfn(pg) << PAGE_SHIFT; ++ pg = vmalloc_to_page(deflate_workspace + DUMP_PAGE_SIZE); ++ workspace_paddr[1] = page_to_pfn(pg) << PAGE_SHIFT; ++ ++ /* Eliminate the possibility of real data getting a compression ++ * failure. ++ */ ++ ++ if (!(safety_buffer = (void *)__get_free_pages(GFP_KERNEL, ++ get_order(DUMP_PAGE_SIZE)))) ++ return -ENOMEM; ++ ++ printk("dump gzip safety buffer: %p, %d\n", safety_buffer, ++ (int)DUMP_PAGE_SIZE); ++ ++ dump_register_compression(&dump_gzip_compression); ++ return 0; ++} ++ ++/* ++ * Name: dump_compress_gzip_cleanup() ++ * Func: Remove gzip as a compression mechanism.
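The deflate call sequence above is the standard one-shot zlib pattern; in userland (where lcrash would decompress the pages again) the same shape looks like this, the only real difference being that the kernel variant supplies its own workspace instead of zalloc/zfree. A self-contained compression sketch against ordinary zlib:

#include <string.h>
#include <zlib.h>

/* Returns the compressed size, or srclen when compression failed or
 * did not shrink the data (mirroring dump_compress_gzip() above). */
static unsigned long one_shot_deflate(const unsigned char *src,
				      unsigned long srclen,
				      unsigned char *dst,
				      unsigned long dstlen)
{
	z_stream s;

	memset(&s, 0, sizeof(s));	/* zalloc/zfree/opaque = NULL -> defaults */
	if (deflateInit(&s, Z_BEST_COMPRESSION) != Z_OK)
		return srclen;

	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;
	s.next_out = dst;
	s.avail_out = dstlen;

	if (deflate(&s, Z_FINISH) != Z_STREAM_END) {	/* all-or-nothing */
		deflateEnd(&s);
		return srclen;
	}
	deflateEnd(&s);
	return (s.total_out < srclen) ? s.total_out : srclen;
}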
++ */ ++static void __exit ++dump_compress_gzip_cleanup(void) ++{ ++ vfree(deflate_workspace); ++ if (safety_buffer) { ++ free_pages((unsigned long)safety_buffer, ++ get_order(DUMP_PAGE_SIZE)); ++ safety_buffer = NULL; ++ } ++ ++ dump_unregister_compression(DUMP_COMPRESS_GZIP); ++} ++ ++/* module initialization */ ++module_init(dump_compress_gzip_init); ++module_exit(dump_compress_gzip_cleanup); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("LKCD Development Team "); ++MODULE_DESCRIPTION("Gzip compression module for crash dump driver"); +Index: linux-2.6.10/drivers/dump/dump_filters.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_filters.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_filters.c 2005-04-05 16:47:53.942205256 +0800 +@@ -0,0 +1,143 @@ ++/* ++ * Default filters to select data to dump for various passes. ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya ++ * Split and rewrote default dump selection logic to generic dump ++ * method interfaces ++ * Derived from a portion of dump_base.c created by ++ * Matt Robinson ) ++ * ++ * Copyright (C) 1999 - 2002 Silicon Graphics, Inc. All rights reserved. ++ * Copyright (C) 2001 - 2002 Matt D. Robinson. All rights reserved. ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * Used during single-stage dumping and during stage 1 of the 2-stage scheme ++ * (Stage 2 of the 2-stage scheme uses the fully transparent filters ++ * i.e. passthru filters in dump_overlay.c) ++ * ++ * Future: Custom selective dump may involve a different set of filters. ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++ ++#define DUMP_PFN_SAFETY_MARGIN 1024 /* 4 MB */ ++static unsigned long bootmap_pages; ++ ++/* Copied from mm/bootmem.c - FIXME */ ++/* return the number of _pages_ that will be allocated for the boot bitmap */ ++void dump_calc_bootmap_pages (void) ++{ ++ unsigned long mapsize; ++ unsigned long pages = num_physpages; ++ ++ mapsize = (pages+7)/8; ++ mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; ++ mapsize >>= PAGE_SHIFT; ++ bootmap_pages = mapsize + DUMP_PFN_SAFETY_MARGIN + 1; ++} ++ ++ ++/* temporary */ ++extern unsigned long min_low_pfn; ++ ++ ++int dump_low_page(struct page *p) ++{ ++ return ((page_to_pfn(p) >= min_low_pfn) && ++ (page_to_pfn(p) < (min_low_pfn + bootmap_pages))); ++} ++ ++static inline int kernel_page(struct page *p) ++{ ++ /* FIXME: Need to exclude hugetlb pages. 
Clue: reserved but inuse */ ++ return (PageReserved(p) && !PageInuse(p)) || (!PageLRU(p) && PageInuse(p)); ++} ++ ++static inline int user_page(struct page *p) ++{ ++ return PageInuse(p) && (!PageReserved(p) && PageLRU(p)); ++} ++ ++static inline int unreferenced_page(struct page *p) ++{ ++ return !PageInuse(p) && !PageReserved(p); ++} ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_kernpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ /* if any of the pages is a kernel page, select this set */ ++ while (sz) { ++ if (dump_low_page(page) || kernel_page(page)) ++ return 1; ++ sz -= PAGE_SIZE; ++ page++; ++ } ++ return 0; ++} ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_userpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ int ret = 0; ++ /* select if the set has any user page, and no kernel pages */ ++ while (sz) { ++ if (user_page(page) && !dump_low_page(page)) { ++ ret = 1; ++ } else if (kernel_page(page) || dump_low_page(page)) { ++ return 0; ++ } ++ page++; ++ sz -= PAGE_SIZE; ++ } ++ return ret; ++} ++ ++ ++ ++/* loc marks the beginning of a range of pages */ ++int dump_filter_unusedpages(int pass, unsigned long loc, unsigned long sz) ++{ ++ struct page *page = (struct page *)loc; ++ ++ /* select if the set does not have any used pages */ ++ while (sz) { ++ if (!unreferenced_page(page) || dump_low_page(page)) { ++ return 0; ++ } ++ page++; ++ sz -= PAGE_SIZE; ++ } ++ return 1; ++} ++ ++/* dummy: last (non-existent) pass */ ++int dump_filter_none(int pass, unsigned long loc, unsigned long sz) ++{ ++ return 0; ++} ++ ++/* TBD: resolve level bitmask ? */ ++struct dump_data_filter dump_filter_table[] = { ++ { .name = "kern", .selector = dump_filter_kernpages, ++ .level_mask = DUMP_MASK_KERN}, ++ { .name = "user", .selector = dump_filter_userpages, ++ .level_mask = DUMP_MASK_USED}, ++ { .name = "unused", .selector = dump_filter_unusedpages, ++ .level_mask = DUMP_MASK_UNUSED}, ++ { .name = "none", .selector = dump_filter_none, ++ .level_mask = DUMP_MASK_REST}, ++ { .name = "", .selector = NULL, .level_mask = 0} ++}; ++ +Index: linux-2.6.10/drivers/dump/dump_ppc64.c +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_ppc64.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_ppc64.c 2005-04-05 16:47:53.931206928 +0800 +@@ -0,0 +1,410 @@ ++/* ++ * Architecture specific (ppc64) functions for Linux crash dumps. ++ * ++ * Created by: Matt Robinson (yakker@sgi.com) ++ * ++ * Copyright 1999 Silicon Graphics, Inc. All rights reserved. ++ * ++ * 2.3 kernel modifications by: Matt D. Robinson (yakker@turbolinux.com) ++ * Copyright 2000 TurboLinux, Inc. All rights reserved. ++ * Copyright 2003, 2004 IBM Corporation ++ * ++ * This code is released under version 2 of the GNU GPL. ++ */ ++ ++/* ++ * The hooks for dumping the kernel virtual memory to disk are in this ++ * file. Any time a modification is made to the virtual memory mechanism, ++ * these routines must be changed to use the new mechanisms. 
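Putting the filter table together with the sequencer in dump_scheme.c: a page set is written in a given pass only when that pass's selector claims it and the filter's level_mask is enabled in dump_config.level. Condensed into a sketch (the real sequencer also runs dump_skip_data() for disabled passes so accounting stays consistent):

/* Sketch: the save/skip decision made per pass by
 * dump_generic_sequencer() using dump_filter_table above. */
static int should_save(int pass, unsigned long loc, unsigned long sz)
{
	struct dump_data_filter *f = &dump_filter_table[pass];

	if (!(f->level_mask & dump_config.level))
		return 0;		/* whole pass disabled by dump_level */
	return f->selector(pass, loc, sz);	/* e.g. dump_filter_kernpages() */
}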
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "dump_methods.h" ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++#include ++#endif ++ ++extern cpumask_t irq_affinity[]; ++ ++static cpumask_t saved_affinity[NR_IRQS]; ++ ++static __s32 saved_irq_count; /* saved preempt_count() flags */ ++ ++static int alloc_dha_stack(void) ++{ ++ int i; ++ void *ptr; ++ ++ if (dump_header_asm.dha_stack[0]) ++ return 0; ++ ++ ptr = (void *)vmalloc(THREAD_SIZE * num_possible_cpus()); ++ if (!ptr) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ dump_header_asm.dha_stack[i] = ++ (uint64_t)((unsigned long)ptr + (i * THREAD_SIZE)); ++ } ++ return 0; ++} ++ ++static int free_dha_stack(void) ++{ ++ if (dump_header_asm.dha_stack[0]) { ++ vfree((void*)dump_header_asm.dha_stack[0]); ++ dump_header_asm.dha_stack[0] = 0; ++ } ++ return 0; ++} ++#ifdef CONFIG_SMP ++static int dump_expect_ipi[NR_CPUS]; ++static atomic_t waiting_for_dump_ipi; ++ ++extern void stop_this_cpu(void *); ++static int ++dump_ipi_handler(struct pt_regs *regs) ++{ ++ int cpu = smp_processor_id(); ++ ++ if (!dump_expect_ipi[cpu]) ++ return 0; ++ dump_save_this_cpu(regs); ++ atomic_dec(&waiting_for_dump_ipi); ++ ++ level_changed: ++ switch (dump_silence_level) { ++ case DUMP_HARD_SPIN_CPUS: /* Spin until dump is complete */ ++ while (dump_oncpu) { ++ barrier(); /* paranoia */ ++ if (dump_silence_level != DUMP_HARD_SPIN_CPUS) ++ goto level_changed; ++ cpu_relax(); /* kill time nicely */ ++ } ++ break; ++ ++ case DUMP_HALT_CPUS: /* Execute halt */ ++ stop_this_cpu(NULL); ++ break; ++ ++ case DUMP_SOFT_SPIN_CPUS: ++ /* Mark the task so it spins in schedule */ ++ set_tsk_thread_flag(current, TIF_NEED_RESCHED); ++ break; ++ } ++ ++ return 1; ++} ++ ++/* save registers on other processors ++ * If the other cpus don't respond we simply do not get their states. ++ */ ++void ++__dump_save_other_cpus(void) ++{ ++ int i, cpu = smp_processor_id(); ++ int other_cpus = num_online_cpus()-1; ++ ++ if (other_cpus > 0) { ++ atomic_set(&waiting_for_dump_ipi, other_cpus); ++ for (i = 0; i < NR_CPUS; i++) ++ dump_expect_ipi[i] = (i != cpu && cpu_online(i)); ++ ++ printk(KERN_ALERT "sending IPI to other cpus...\n"); ++ dump_send_ipi(dump_ipi_handler); ++ /* ++ * may be we dont need to wait for IPI to be processed. ++ * just write out the header at the end of dumping, if ++ * this IPI is not processed until then, there probably ++ * is a problem and we just fail to capture state of ++ * other cpus. ++ * However, we will wait 10 secs for other CPUs to respond. ++ * If not, proceed the dump process even though we failed ++ * to capture other CPU states. ++ */ ++ i = 10000; /* wait max of 10 seconds */ ++ while ((atomic_read(&waiting_for_dump_ipi) > 0) && (--i > 0)) { ++ barrier(); ++ mdelay(1); ++ } ++ printk(KERN_ALERT "done waiting: %d cpus not responding\n", ++ atomic_read(&waiting_for_dump_ipi)); ++ dump_send_ipi(NULL); /* clear handler */ ++ } ++} ++ ++/* ++ * Restore old irq affinities. 
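The CPU rendezvous in __dump_save_other_cpus() above reduces to a bounded countdown: the initiator arms a counter, sends the IPI, and polls with mdelay() because nothing may sleep on the crash path; a wedged CPU therefore costs at most ten seconds, after which its state is simply missing from the dump. Condensed, with names as in the code above:

/* initiator side */
atomic_set(&waiting_for_dump_ipi, num_online_cpus() - 1);
dump_send_ipi(dump_ipi_handler);	/* responders save state, then atomic_dec() */

for (i = 10000; atomic_read(&waiting_for_dump_ipi) > 0 && --i > 0; )
	mdelay(1);			/* poll; sleeping is not an option here */

dump_send_ipi(NULL);			/* clear handler; stragglers are ignored */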
++ */ ++static void ++__dump_reset_irq_affinity(void) ++{ ++ int i; ++ irq_desc_t *irq_d; ++ ++ memcpy(irq_affinity, saved_affinity, NR_IRQS * sizeof(unsigned long)); ++ ++ for_each_irq(i) { ++ irq_d = get_irq_desc(i); ++ if (irq_d->handler == NULL) { ++ continue; ++ } ++ if (irq_d->handler->set_affinity != NULL) { ++ irq_d->handler->set_affinity(i, saved_affinity[i]); ++ } ++ } ++} ++ ++/* ++ * Routine to save the old irq affinities and change affinities of all irqs to ++ * the dumping cpu. ++ * ++ * NB: Need to be expanded to multiple nodes. ++ */ ++static void ++__dump_set_irq_affinity(void) ++{ ++ int i; ++ cpumask_t cpu = CPU_MASK_NONE; ++ irq_desc_t *irq_d; ++ ++ cpu_set(smp_processor_id(), cpu); ++ ++ memcpy(saved_affinity, irq_affinity, NR_IRQS * sizeof(unsigned long)); ++ ++ for_each_irq(i) { ++ irq_d = get_irq_desc(i); ++ if (irq_d->handler == NULL) { ++ continue; ++ } ++ irq_affinity[i] = cpu; ++ if (irq_d->handler->set_affinity != NULL) { ++ irq_d->handler->set_affinity(i, irq_affinity[i]); ++ } ++ } ++} ++#else /* !CONFIG_SMP */ ++#define __dump_save_other_cpus() do { } while (0) ++#define __dump_set_irq_affinity() do { } while (0) ++#define __dump_reset_irq_affinity() do { } while (0) ++#endif /* !CONFIG_SMP */ ++ ++void ++__dump_save_regs(struct pt_regs *dest_regs, const struct pt_regs *regs) ++{ ++ if (regs) { ++ memcpy(dest_regs, regs, sizeof(struct pt_regs)); ++ } ++} ++ ++void ++__dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_header_asm.dha_smp_current_task[cpu] = (unsigned long)tsk; ++ __dump_save_regs(&dump_header_asm.dha_smp_regs[cpu], regs); ++ ++ /* take a snapshot of the stack */ ++ /* doing this enables us to tolerate slight drifts on this cpu */ ++ ++ if (dump_header_asm.dha_stack[cpu]) { ++ memcpy((void *)dump_header_asm.dha_stack[cpu], ++ STACK_START_POSITION(tsk), ++ THREAD_SIZE); ++ } ++ dump_header_asm.dha_stack_ptr[cpu] = (unsigned long)(tsk->thread_info); ++} ++ ++/* ++ * Name: __dump_configure_header() ++ * Func: Configure the dump header with all proper values. ++ */ ++int ++__dump_configure_header(const struct pt_regs *regs) ++{ ++ return (0); ++} ++ ++#if defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++int ++kdb_sysdump(int argc, const char **argv, const char **envp, struct pt_regs *regs) ++{ ++ kdb_printf("Dumping to disk...\n"); ++ dump("dump from kdb", regs); ++ kdb_printf("Dump Complete\n"); ++ return 0; ++} ++#endif ++ ++/* ++ * Name: __dump_init() ++ * Func: Initialize the dumping routine process. This is in case ++ * it's necessary in the future. ++ */ ++void ++__dump_init(uint64_t local_memory_start) ++{ ++#if defined(FIXME) && defined(CONFIG_KDB) && !defined(CONFIG_DUMP_MODULE) ++ /* This won't currently work because interrupts are off in kdb ++ * and the dump process doesn't understand how to recover. ++ */ ++ /* ToDo: add a command to query/set dump configuration */ ++ kdb_register_repeat("sysdump", kdb_sysdump, "", "use lkcd to dump the system to disk (if configured)", 0, KDB_REPEAT_NONE); ++#endif ++ ++ /* return */ ++ return; ++} ++ ++/* ++ * Name: __dump_open() ++ * Func: Open the dump device (architecture specific). This is in ++ * case it's necessary in the future. ++ */ ++void ++__dump_open(void) ++{ ++ alloc_dha_stack(); ++} ++ ++ ++/* ++ * Name: __dump_cleanup() ++ * Func: Free any architecture specific data structures. This is called ++ * when the dump module is being removed. 
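One practical consequence of __dump_save_context() above: because dha_stack[cpu] holds a THREAD_SIZE byte copy of each CPU's task stack, an offline consumer can translate any on-stack address from the crashed context into the snapshot and walk frames there. A sketch, assuming stack_base is the original stack start that was copied (STACK_START_POSITION(tsk) at save time):

static void *snap_stack_addr(int cpu, unsigned long addr,
			     unsigned long stack_base)
{
	unsigned long off = addr - stack_base;	/* offset into the stack */

	if (off >= THREAD_SIZE)
		return NULL;			/* not on this task's stack */
	return (void *)((unsigned long)dump_header_asm.dha_stack[cpu] + off);
}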
++ */ ++void ++__dump_cleanup(void) ++{ ++ free_dha_stack(); ++} ++ ++/* ++ * Kludge - dump from interrupt context is unreliable (Fixme) ++ * ++ * We do this so that softirqs initiated for dump i/o ++ * get processed and we don't hang while waiting for i/o ++ * to complete or in any irq synchronization attempt. ++ * ++ * This is not quite legal of course, as it has the side ++ * effect of making all interrupts & softirqs triggered ++ * while dump is in progress complete before currently ++ * pending softirqs and the currently executing interrupt ++ * code. ++ */ ++static inline void ++irq_bh_save(void) ++{ ++ saved_irq_count = irq_count(); ++ preempt_count() &= ~(HARDIRQ_MASK|SOFTIRQ_MASK); ++} ++ ++static inline void ++irq_bh_restore(void) ++{ ++ preempt_count() |= saved_irq_count; ++} ++ ++/* ++ * Name: __dump_irq_enable ++ * Func: Reset system so interrupts are enabled. ++ * This is used for dump methods that require interrupts ++ * Eventually, all methods will have interrupts disabled ++ * and this code can be removed. ++ * ++ * Change irq affinities ++ * Re-enable interrupts ++ */ ++int ++__dump_irq_enable(void) ++{ ++ __dump_set_irq_affinity(); ++ irq_bh_save(); ++ local_irq_enable(); ++ return 0; ++} ++ ++/* ++ * Name: __dump_irq_restore ++ * Func: Resume the system state in an architecture-specific way. ++ */ ++void ++__dump_irq_restore(void) ++{ ++ local_irq_disable(); ++ __dump_reset_irq_affinity(); ++ irq_bh_restore(); ++} ++ ++#if 0 ++/* Cheap progress hack. It estimates pages to write and ++ * assumes all pages will go -- so it may get way off. ++ * As the progress is not displayed for other architectures, not used at this ++ * moment. ++ */ ++void ++__dump_progress_add_page(void) ++{ ++ unsigned long total_pages = nr_free_pages() + nr_inactive_pages + nr_active_pages; ++ unsigned int percent = (dump_header.dh_num_dump_pages * 100) / total_pages; ++ char buf[30]; ++ ++ if (percent > last_percent && percent <= 100) { ++ sprintf(buf, "Dump %3d%% ", percent); ++ ppc64_dump_msg(0x2, buf); ++ last_percent = percent; ++ } ++ ++} ++#endif ++ ++extern int dump_page_is_ram(unsigned long); ++/* ++ * Name: __dump_page_valid() ++ * Func: Check if page is valid to dump. ++ */ ++int ++__dump_page_valid(unsigned long index) ++{ ++ if (!pfn_valid(index)) ++ return 0; ++ ++ return dump_page_is_ram(index); ++} ++ ++/* ++ * Name: manual_handle_crashdump() ++ * Func: Interface for the lkcd dump command. Calls dump_execute() ++ */ ++int ++manual_handle_crashdump(void) ++{ ++ struct pt_regs regs; ++ ++ get_current_regs(®s); ++ dump_execute("manual", ®s); ++ return 0; ++} ++ ++/* ++ * Name: __dump_clean_irq_state() ++ * Func: Clean up from the previous IRQ handling state. Such as oops from ++ * interrupt handler or bottom half. ++ */ ++void ++__dump_clean_irq_state(void) ++{ ++ return; ++} +Index: linux-2.6.10/drivers/dump/dump_methods.h +=================================================================== +--- linux-2.6.10.orig/drivers/dump/dump_methods.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/dump_methods.h 2005-04-05 16:47:53.930207080 +0800 +@@ -0,0 +1,357 @@ ++/* ++ * Generic interfaces for flexible system dump ++ * ++ * Started: Oct 2002 - Suparna Bhattacharya (suparna@in.ibm.com) ++ * ++ * Copyright (C) 2002 International Business Machines Corp. ++ * ++ * This code is released under version 2 of the GNU GPL. 
++ */
++
++#ifndef _LINUX_DUMP_METHODS_H
++#define _LINUX_DUMP_METHODS_H
++
++/*
++ * Inspired by Matt Robinson's suggestion of introducing dump
++ * methods as a way to enable different crash dump facilities to
++ * coexist where each employs its own scheme or dumping policy.
++ *
++ * The code here creates a framework for flexible dump by defining
++ * a set of methods and providing associated helpers that differentiate
++ * between the underlying mechanism (how to dump), overall scheme
++ * (sequencing of stages and data dumped and associated quiescing),
++ * output format (what the dump output looks like), target type
++ * (where to save the dump; see dumpdev.h), and selection policy
++ * (state/data to dump).
++ *
++ * These sets of interfaces can be mixed and matched to build a
++ * dumper suitable for a given situation, allowing for
++ * flexibility as well as an appropriate degree of code reuse.
++ * For example, all features and options of lkcd (including
++ * granular selective dumping in the near future) should be
++ * available even when, say, the 2-stage soft-boot based mechanism
++ * is used for taking disruptive dumps.
++ *
++ * Todo: Additionally, modules or drivers may supply their own
++ * custom dumpers which extend dump with module-specific
++ * information or hardware state, and can even tweak the
++ * mechanism when it comes to saving state relevant to
++ * them.
++ */
++
++#include
++#include
++#include
++#include
++#include /* get_order */
++
++#define MAX_PASSES 6
++#define MAX_DEVS 4
++
++
++/* To customise selection of pages to be dumped in a given pass/group */
++struct dump_data_filter {
++	char name[32];
++	int (*selector)(int, unsigned long, unsigned long);
++	ulong level_mask; /* dump level(s) for which this filter applies */
++	loff_t start[MAX_NUMNODES], end[MAX_NUMNODES]; /* location range applicable */
++	ulong num_mbanks; /* Number of memory banks.
++			     Greater than one for discontig memory (NUMA) */
++};
++
++
++/*
++ * Determined by the kind of dump mechanism and appropriate
++ * overall scheme
++ */
++struct dump_scheme_ops {
++	/* sets aside memory, inits data structures etc */
++	int (*configure)(unsigned long devid);
++	/* releases resources */
++	int (*unconfigure)(void);
++
++	/* ordering of passes, invoking iterator */
++	int (*sequencer)(void);
++	/* iterates over system data, selects and acts on data to dump */
++	int (*iterator)(int, int (*)(unsigned long, unsigned long),
++			struct dump_data_filter *);
++	/* action when data is selected for dump */
++	int (*save_data)(unsigned long, unsigned long);
++	/* action when data is to be excluded from dump */
++	int (*skip_data)(unsigned long, unsigned long);
++	/* policies for space, multiple dump devices etc */
++	int (*write_buffer)(void *, unsigned long);
++};
++
++struct dump_scheme {
++	/* the name serves as an anchor to locate the scheme after reboot */
++	char name[32];
++	struct dump_scheme_ops *ops;
++	struct list_head list;
++};
++
++/* Quiescing/Silence levels (controls IPI callback behaviour) */
++extern enum dump_silence_levels {
++	DUMP_SOFT_SPIN_CPUS = 1,
++	DUMP_HARD_SPIN_CPUS = 2,
++	DUMP_HALT_CPUS = 3,
++} dump_silence_level;
++
++/* determined by the dump (file) format */
++struct dump_fmt_ops {
++	/* build header */
++	int (*configure_header)(const char *, const struct pt_regs *);
++	int (*update_header)(void); /* update header and write it out */
++	/* save curr context */
++	void (*save_context)(int, const struct pt_regs *,
++			     struct task_struct *);
++	/* typically called by the save_data action */
++	/* add formatted data to the dump buffer */
++	int (*add_data)(unsigned long, unsigned long);
++	int (*update_end_marker)(void);
++};
++
++struct dump_fmt {
++	unsigned long magic;
++	char name[32];	/* lcrash, crash, elf-core etc */
++	struct dump_fmt_ops *ops;
++	struct list_head list;
++};
++
++/*
++ * Modules will be able to add their own data capture schemes by
++ * registering their own dumpers. Typically they would use the
++ * primary dumper as a template and tune it with their routines.
++ * Still Todo.
++ */ ++ ++/* The combined dumper profile (mechanism, scheme, dev, fmt) */ ++struct dumper { ++ char name[32]; /* singlestage, overlay (stg1), passthru(stg2), pull */ ++ struct dump_scheme *scheme; ++ struct dump_fmt *fmt; ++ struct __dump_compress *compress; ++ struct dump_data_filter *filter; ++ struct dump_dev *dev; ++ /* state valid only for active dumper(s) - per instance */ ++ /* run time state/context */ ++ int curr_pass; ++ unsigned long count; ++ loff_t curr_offset; /* current logical offset into dump device */ ++ loff_t curr_loc; /* current memory location */ ++ void *curr_buf; /* current position in the dump buffer */ ++ void *dump_buf; /* starting addr of dump buffer */ ++ int header_dirty; /* whether the header needs to be written out */ ++ int header_len; ++ struct list_head dumper_list; /* links to other dumpers */ ++}; ++ ++/* Starting point to get to the current configured state */ ++struct dump_config { ++ ulong level; ++ ulong flags; ++ struct dumper *dumper; ++ unsigned long dump_device; ++ unsigned long dump_addr; /* relevant only for in-memory dumps */ ++ struct list_head dump_dev_list; ++}; ++ ++extern struct dump_config dump_config; ++ ++/* Used to save the dump config across a reboot for 2-stage dumps: ++ * ++ * Note: The scheme, format, compression and device type should be ++ * registered at bootup, for this config to be sharable across soft-boot. ++ * The function addresses could have changed and become invalid, and ++ * need to be set up again. ++ */ ++struct dump_config_block { ++ u64 magic; /* for a quick sanity check after reboot */ ++ struct dump_memdev memdev; /* handle to dump stored in memory */ ++ struct dump_config config; ++ struct dumper dumper; ++ struct dump_scheme scheme; ++ struct dump_fmt fmt; ++ struct __dump_compress compress; ++ struct dump_data_filter filter_table[MAX_PASSES]; ++ struct dump_anydev dev[MAX_DEVS]; /* target dump device */ ++}; ++ ++ ++/* Wrappers that invoke the methods for the current (active) dumper */ ++ ++/* Scheme operations */ ++ ++static inline int dump_sequencer(void) ++{ ++ return dump_config.dumper->scheme->ops->sequencer(); ++} ++ ++static inline int dump_iterator(int pass, int (*action)(unsigned long, ++ unsigned long), struct dump_data_filter *filter) ++{ ++ return dump_config.dumper->scheme->ops->iterator(pass, action, filter); ++} ++ ++#define dump_save_data dump_config.dumper->scheme->ops->save_data ++#define dump_skip_data dump_config.dumper->scheme->ops->skip_data ++ ++static inline int dump_write_buffer(void *buf, unsigned long len) ++{ ++ return dump_config.dumper->scheme->ops->write_buffer(buf, len); ++} ++ ++static inline int dump_configure(unsigned long devid) ++{ ++ return dump_config.dumper->scheme->ops->configure(devid); ++} ++ ++static inline int dump_unconfigure(void) ++{ ++ return dump_config.dumper->scheme->ops->unconfigure(); ++} ++ ++/* Format operations */ ++ ++static inline int dump_configure_header(const char *panic_str, ++ const struct pt_regs *regs) ++{ ++ return dump_config.dumper->fmt->ops->configure_header(panic_str, regs); ++} ++ ++static inline void dump_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk) ++{ ++ dump_config.dumper->fmt->ops->save_context(cpu, regs, tsk); ++} ++ ++static inline int dump_save_this_cpu(const struct pt_regs *regs) ++{ ++ int cpu = smp_processor_id(); ++ ++ dump_save_context(cpu, regs, current); ++ return 1; ++} ++ ++static inline int dump_update_header(void) ++{ ++ return dump_config.dumper->fmt->ops->update_header(); ++} ++ 
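/*
 * [Editorial sketch -- not part of the patch.] The inline wrappers above
 * are the entire dispatch mechanism: every operation funnels through
 * dump_config.dumper, so swapping the active dumper's scheme or format
 * changes behaviour without touching any caller. Assuming the generic
 * helpers declared further down in this header, a minimal custom scheme
 * might look like the following; the demo_* names are hypothetical, and
 * setup code would still have to attach the scheme to dump_config.dumper.
 */
static struct dump_scheme_ops demo_scheme_ops = {
	.configure	= dump_generic_configure,	/* set aside memory, init structures */
	.unconfigure	= dump_generic_unconfigure,	/* release those resources */
	.sequencer	= dump_generic_sequencer,	/* order the dump passes */
	.iterator	= dump_page_iterator,		/* walk pages, applying the filter */
	.save_data	= dump_generic_save_data,	/* page selected for the dump */
	.skip_data	= dump_generic_skip_data,	/* page excluded from the dump */
	.write_buffer	= dump_generic_write_buffer,	/* write out to the dump target */
};

static struct dump_scheme demo_scheme = {
	.name	= "demo",	/* anchor used to locate the scheme after reboot */
	.ops	= &demo_scheme_ops,
};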
++static inline int dump_update_end_marker(void) ++{ ++ return dump_config.dumper->fmt->ops->update_end_marker(); ++} ++ ++static inline int dump_add_data(unsigned long loc, unsigned long sz) ++{ ++ return dump_config.dumper->fmt->ops->add_data(loc, sz); ++} ++ ++/* Compression operation */ ++static inline int dump_compress_data(char *src, int slen, char *dst, ++ unsigned long loc) ++{ ++ return dump_config.dumper->compress->compress_func(src, slen, ++ dst, DUMP_DPC_PAGE_SIZE, loc); ++} ++ ++ ++/* Prototypes of some default implementations of dump methods */ ++ ++extern struct __dump_compress dump_none_compression; ++ ++/* Default scheme methods (dump_scheme.c) */ ++ ++extern int dump_generic_sequencer(void); ++extern int dump_page_iterator(int pass, int (*action)(unsigned long, unsigned ++ long), struct dump_data_filter *filter); ++extern int dump_generic_save_data(unsigned long loc, unsigned long sz); ++extern int dump_generic_skip_data(unsigned long loc, unsigned long sz); ++extern int dump_generic_write_buffer(void *buf, unsigned long len); ++extern int dump_generic_configure(unsigned long); ++extern int dump_generic_unconfigure(void); ++#ifdef CONFIG_DISCONTIGMEM ++extern void dump_reconfigure_mbanks(void); ++#endif ++ ++/* Default scheme template */ ++extern struct dump_scheme dump_scheme_singlestage; ++ ++/* Default dump format methods */ ++ ++extern int dump_lcrash_configure_header(const char *panic_str, ++ const struct pt_regs *regs); ++extern void dump_lcrash_save_context(int cpu, const struct pt_regs *regs, ++ struct task_struct *tsk); ++extern int dump_generic_update_header(void); ++extern int dump_lcrash_add_data(unsigned long loc, unsigned long sz); ++extern int dump_lcrash_update_end_marker(void); ++ ++/* Default format (lcrash) template */ ++extern struct dump_fmt dump_fmt_lcrash; ++ ++/* Default dump selection filter table */ ++ ++/* ++ * Entries listed in order of importance and correspond to passes ++ * The last entry (with a level_mask of zero) typically reflects data that ++ * won't be dumped -- this may for example be used to identify data ++ * that will be skipped for certain so the corresponding memory areas can be ++ * utilized as scratch space. ++ */ ++extern struct dump_data_filter dump_filter_table[]; ++ ++/* Some pre-defined dumpers */ ++extern struct dumper dumper_singlestage; ++extern struct dumper dumper_stage1; ++extern struct dumper dumper_stage2; ++ ++/* These are temporary */ ++#define DUMP_MASK_HEADER DUMP_LEVEL_HEADER ++#define DUMP_MASK_KERN DUMP_LEVEL_KERN ++#define DUMP_MASK_USED DUMP_LEVEL_USED ++#define DUMP_MASK_UNUSED DUMP_LEVEL_ALL_RAM ++#define DUMP_MASK_REST 0 /* dummy for now */ ++ ++/* Helpers - move these to dump.h later ? 
*/ ++ ++int dump_generic_execute(const char *panic_str, const struct pt_regs *regs); ++extern int dump_ll_write(void *buf, unsigned long len); ++int dump_check_and_free_page(struct dump_memdev *dev, struct page *page); ++ ++static inline void dumper_reset(void) ++{ ++ dump_config.dumper->curr_buf = dump_config.dumper->dump_buf; ++ dump_config.dumper->curr_loc = 0; ++ dump_config.dumper->curr_offset = 0; ++ dump_config.dumper->count = 0; ++ dump_config.dumper->curr_pass = 0; ++} ++ ++/* ++ * May later be moulded to perform boot-time allocations so we can dump ++ * earlier during bootup ++ */ ++static inline void *dump_alloc_mem(unsigned long size) ++{ ++ return (void *) __get_free_pages(GFP_KERNEL, get_order(size)); ++} ++ ++static inline void dump_free_mem(void *buf) ++{ ++ struct page *page; ++ ++ /* ignore reserved pages (e.g. post soft boot stage) */ ++ if (buf && (page = virt_to_page(buf))) { ++ if (PageReserved(page)) ++ return; ++ } ++ /* ++ * Allocated using __get_free_pages(). ++ */ ++ free_pages((unsigned long)buf, ++ get_order(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE)); ++} ++ ++ ++#endif /* _LINUX_DUMP_METHODS_H */ +Index: linux-2.6.10/drivers/dump/Makefile +=================================================================== +--- linux-2.6.10.orig/drivers/dump/Makefile 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/drivers/dump/Makefile 2005-04-05 16:47:53.947204496 +0800 +@@ -0,0 +1,22 @@ ++# ++# Makefile for the dump device drivers. ++# ++ ++dump-y := dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o ++ifeq ($(CONFIG_X86_64),) ++ifeq ($(CONFIG_X86),y) ++dump-$(CONFIG_X86) += dump_i386.o ++endif ++endif ++dump-$(CONFIG_ARM) += dump_arm.o ++dump-$(CONFIG_PPC64) += dump_ppc64.o ++dump-$(CONFIG_X86_64) += dump_x8664.o ++dump-$(CONFIG_IA64) += dump_ia64.o ++dump-$(CONFIG_CRASH_DUMP_MEMDEV) += dump_memdev.o dump_overlay.o ++dump-objs += $(dump-y) ++ ++obj-$(CONFIG_CRASH_DUMP) += dump.o ++obj-$(CONFIG_CRASH_DUMP_BLOCKDEV) += dump_blockdev.o ++obj-$(CONFIG_CRASH_DUMP_NETDEV) += dump_netdev.o ++obj-$(CONFIG_CRASH_DUMP_COMPRESS_RLE) += dump_rle.o ++obj-$(CONFIG_CRASH_DUMP_COMPRESS_GZIP) += dump_gzip.o +Index: linux-2.6.10/drivers/Makefile +=================================================================== +--- linux-2.6.10.orig/drivers/Makefile 2004-12-25 05:36:00.000000000 +0800 ++++ linux-2.6.10/drivers/Makefile 2005-04-05 16:47:53.950204040 +0800 +@@ -60,3 +60,4 @@ + obj-$(CONFIG_CPU_FREQ) += cpufreq/ + obj-$(CONFIG_MMC) += mmc/ + obj-y += firmware/ ++obj-$(CONFIG_CRASH_DUMP) += dump/ diff --git a/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch b/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch new file mode 100644 index 0000000..a5abf90 --- /dev/null +++ b/lustre/kernel_patches/patches/uml-2.6.10-fc3.patch @@ -0,0 +1,3746 @@ +Index: linux-2.6.10/include/asm-um/archparam-i386.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/archparam-i386.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/include/asm-um/archparam-i386.h 2005-04-05 12:40:36.075903800 +0800 +@@ -10,7 +10,8 @@ + + #include "user.h" + +-#define ELF_PLATFORM "i586" ++extern char * elf_aux_platform; ++#define ELF_PLATFORM (elf_aux_platform) + + #define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) + +@@ -56,15 +57,13 @@ + pr_reg[16] = PT_REGS_SS(regs); \ + } while(0); + +-#if 0 /* Turn this back on when UML has VSYSCALL working */ +-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) +-#else +-#define VSYSCALL_BASE 0 +-#endif + +-#define 
VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) +-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) +-extern void *__kernel_vsyscall; ++extern unsigned long vsyscall_ehdr; ++extern unsigned long vsyscall_end; ++extern unsigned long __kernel_vsyscall; ++ ++#define VSYSCALL_BASE vsyscall_ehdr ++#define VSYSCALL_END vsyscall_end + + /* + * Architecture-neutral AT_ values in 0-17, leave some room +@@ -75,8 +74,10 @@ + + #define ARCH_DLINFO \ + do { \ +- NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ +- NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++ if ( vsyscall_ehdr ) { \ ++ NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \ ++ } \ + } while (0) + + /* +@@ -87,22 +88,18 @@ + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. + */ +-#if 0 +-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) +-#endif +- +-#undef ELF_CORE_EXTRA_PHDRS ++#define ELF_CORE_EXTRA_PHDRS \ ++ (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 ) + +-#if 0 + #define ELF_CORE_WRITE_EXTRA_PHDRS \ +-do { \ +- const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++if ( vsyscall_ehdr ) { \ ++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ ++ const struct elf_phdr *const phdrp = \ ++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ + int i; \ + Elf32_Off ofs = 0; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ +- struct elf_phdr phdr = vsyscall_phdrs[i]; \ ++ for (i = 0; i < ehdrp->e_phnum; ++i) { \ ++ struct elf_phdr phdr = phdrp[i]; \ + if (phdr.p_type == PT_LOAD) { \ + ofs = phdr.p_offset = offset; \ + offset += phdr.p_filesz; \ +@@ -112,23 +109,19 @@ + phdr.p_paddr = 0; /* match other core phdrs */ \ + DUMP_WRITE(&phdr, sizeof(phdr)); \ + } \ +-} while (0) ++} + #define ELF_CORE_WRITE_EXTRA_DATA \ +-do { \ +- const struct elf_phdr *const vsyscall_phdrs = \ +- (const struct elf_phdr *) (VSYSCALL_BASE \ +- + VSYSCALL_EHDR->e_phoff); \ ++if ( vsyscall_ehdr ) { \ ++ const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ ++ const struct elf_phdr *const phdrp = \ ++ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ + int i; \ +- for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ +- if (vsyscall_phdrs[i].p_type == PT_LOAD) \ +- DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ +- vsyscall_phdrs[i].p_filesz); \ ++ for (i = 0; i < ehdrp->e_phnum; ++i) { \ ++ if (phdrp[i].p_type == PT_LOAD) \ ++ DUMP_WRITE((void *) phdrp[i].p_vaddr, \ ++ phdrp[i].p_filesz); \ + } \ +-} while (0) +-#endif +- +-#undef ELF_CORE_WRITE_EXTRA_PHDRS +-#undef ELF_CORE_WRITE_EXTRA_DATA ++} + + #define R_386_NONE 0 + #define R_386_32 1 +Index: linux-2.6.10/include/asm-um/elf.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/elf.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/include/asm-um/elf.h 2005-04-05 12:40:36.074903952 +0800 +@@ -3,7 +3,8 @@ + + #include "asm/archparam.h" + +-#define ELF_HWCAP (0) ++extern long elf_aux_hwcap; ++#define ELF_HWCAP (elf_aux_hwcap) + + #define SET_PERSONALITY(ex, ibcs2) do ; while(0) + +Index: linux-2.6.10/include/asm-um/fixmap.h +=================================================================== +--- linux-2.6.10.orig/include/asm-um/fixmap.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/include/asm-um/fixmap.h 2005-04-05 12:40:36.075903800 
+0800 +@@ -3,6 +3,7 @@ + + #include + #include ++#include + + /* + * Here we define all the compile-time 'special' virtual +@@ -34,7 +35,6 @@ + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + #endif +- FIX_VSYSCALL, + __end_of_fixed_addresses + }; + +@@ -68,8 +68,8 @@ + * This is the range that is readable by user mode, and things + * acting like user mode such as get_user_pages. + */ +-#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) +-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) ++#define FIXADDR_USER_START VSYSCALL_BASE ++#define FIXADDR_USER_END VSYSCALL_END + + extern void __this_fixmap_does_not_exist(void); + +Index: linux-2.6.10/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/thread_info.h 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/thread_info.h 2005-04-05 12:40:36.076903648 +0800 +@@ -139,6 +139,7 @@ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ + #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ + #define TIF_IRET 5 /* return with iret */ ++#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + +@@ -148,12 +149,14 @@ + #define _TIF_NEED_RESCHED (1< + #include + #include ++#include + + /* +- * Used for LDT copy/destruction. ++ * Used for LDT initialization/destruction. You cannot copy an LDT with ++ * init_new_context, since it thinks you are passing it a new LDT and won't ++ * deallocate its old content. + */ + int init_new_context(struct task_struct *tsk, struct mm_struct *mm); + void destroy_context(struct mm_struct *mm); + ++/* LDT initialization for a clean environment - needed for SKAS.*/ ++static inline void init_new_empty_context(struct mm_struct *mm) ++{ ++ init_MUTEX(&mm->context.sem); ++ mm->context.size = 0; ++} ++ ++/* LDT copy for SKAS - for the above problem.*/ ++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm); + + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + { +@@ -29,6 +41,10 @@ + { + int cpu = smp_processor_id(); + ++#ifdef CONFIG_SMP ++ prev = per_cpu(cpu_tlbstate, cpu).active_mm; ++#endif ++ + if (likely(prev != next)) { + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +@@ -50,7 +66,6 @@ + #ifdef CONFIG_SMP + else { + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; +- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled +Index: linux-2.6.10/include/asm-i386/ptrace.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/ptrace.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/ptrace.h 2005-04-05 12:40:36.077903496 +0800 +@@ -64,4 +64,26 @@ + #endif + #endif + ++/*For SKAS3 support.*/ ++#ifndef _LINUX_PTRACE_STRUCT_DEF ++#define _LINUX_PTRACE_STRUCT_DEF ++ ++#define PTRACE_FAULTINFO 52 ++#define PTRACE_SIGPENDING 53 ++#define PTRACE_LDT 54 ++#define PTRACE_SWITCH_MM 55 ++ ++struct ptrace_faultinfo { ++ int is_write; ++ unsigned long addr; ++}; ++ ++struct ptrace_ldt { ++ int func; ++ void *ptr; ++ unsigned long bytecount; ++}; ++ ++#endif /*ifndef _LINUX_PTRACE_STRUCT_DEF*/ ++ + #endif +Index: 
linux-2.6.10/include/asm-i386/desc.h +=================================================================== +--- linux-2.6.10.orig/include/asm-i386/desc.h 2005-03-31 16:20:09.000000000 +0800 ++++ linux-2.6.10/include/asm-i386/desc.h 2005-04-05 12:40:36.078903344 +0800 +@@ -126,6 +126,9 @@ + put_cpu(); + } + ++extern int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr, ++ unsigned long bytecount); ++ + #endif /* !__ASSEMBLY__ */ + + #endif +Index: linux-2.6.10/include/linux/ptrace.h +=================================================================== +--- linux-2.6.10.orig/include/linux/ptrace.h 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/include/linux/ptrace.h 2005-04-05 12:40:36.071904408 +0800 +@@ -20,6 +20,7 @@ + #define PTRACE_DETACH 0x11 + + #define PTRACE_SYSCALL 24 ++#define PTRACE_SYSEMU 31 + + /* 0x4200-0x4300 are reserved for architecture-independent additions. */ + #define PTRACE_SETOPTIONS 0x4200 +Index: linux-2.6.10/include/linux/mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mm.h 2005-03-31 16:10:15.000000000 +0800 ++++ linux-2.6.10/include/linux/mm.h 2005-04-05 12:40:36.072904256 +0800 +@@ -625,6 +625,9 @@ + extern struct shrinker *set_shrinker(int, shrinker_t); + extern void remove_shrinker(struct shrinker *shrinker); + ++extern long do_mprotect(struct mm_struct *mm, unsigned long start, ++ size_t len, unsigned long prot); ++ + /* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all +@@ -684,9 +687,15 @@ + + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ++extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flag, ++ unsigned long pgoff); ++static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +- unsigned long flag, unsigned long pgoff); ++ unsigned long flag, unsigned long pgoff) { ++ return __do_mmap_pgoff(current->mm, file, addr, len, prot, flag, pgoff); ++} + + static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +Index: linux-2.6.10/include/linux/proc_mm.h +=================================================================== +--- linux-2.6.10.orig/include/linux/proc_mm.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/include/linux/proc_mm.h 2005-04-05 12:40:36.073904104 +0800 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PROC_MM_H ++#define __PROC_MM_H ++ ++#include "linux/sched.h" ++ ++#define MM_MMAP 54 ++#define MM_MUNMAP 55 ++#define MM_MPROTECT 56 ++#define MM_COPY_SEGMENTS 57 ++ ++struct mm_mmap { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++struct mm_munmap { ++ unsigned long addr; ++ unsigned long len; ++}; ++ ++struct mm_mprotect { ++ unsigned long addr; ++ unsigned long len; ++ unsigned int prot; ++}; ++ ++struct proc_mm_op { ++ int op; ++ union { ++ struct mm_mmap mmap; ++ struct mm_munmap munmap; ++ struct mm_mprotect mprotect; ++ int copy_segments; ++ } u; ++}; ++ ++extern struct mm_struct *proc_mm_get_mm(int fd); ++ ++#endif 
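The proc_mm.h interface above is driven entirely through write(2): a tracer opens /proc/mm to obtain a fresh address space, then writes one struct proc_mm_op per operation (the implementation, write_proc_mm() in mm/proc_mm.c, appears further below). The following userspace sketch is illustrative only, with hypothetical names and minimal error handling; it assumes the MM_* constants and struct proc_mm_op above are visible to userspace, and it sets MAP_FIXED because write_proc_mm() rejects MM_MMAP requests without it.

/* Hypothetical userspace sketch -- not part of the patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/proc_mm.h>	/* struct proc_mm_op, MM_MMAP (assumed exported) */

/* Create a new address space and map one file into it at a fixed address.
 * Returns the /proc/mm fd that identifies the new mm, or -1 on error.
 */
int demo_new_mm_with_mapping(unsigned long addr, unsigned long len, int map_fd)
{
	struct proc_mm_op op;
	int mm_fd = open("/proc/mm", O_WRONLY);	/* the entry is mode 0222 */

	if (mm_fd < 0)
		return -1;

	op.op = MM_MMAP;
	op.u.mmap.addr = addr;			/* must be page aligned */
	op.u.mmap.len = len;
	op.u.mmap.prot = PROT_READ | PROT_WRITE;
	op.u.mmap.flags = MAP_FIXED | MAP_SHARED;	/* MAP_FIXED is mandatory here */
	op.u.mmap.fd = map_fd;
	op.u.mmap.offset = 0;			/* byte offset; the kernel shifts it */

	if (write(mm_fd, &op, sizeof(op)) != (ssize_t) sizeof(op)) {
		close(mm_fd);
		return -1;
	}
	return mm_fd;
}

The descriptor returned here is the handle a SKAS host later passes to PTRACE_SWITCH_MM (see the ptrace.c hunk below) to switch a traced child onto the new address space.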
+Index: linux-2.6.10/lib/Kconfig.debug +=================================================================== +--- linux-2.6.10.orig/lib/Kconfig.debug 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/lib/Kconfig.debug 2005-04-05 12:40:36.010913680 +0800 +@@ -23,7 +23,6 @@ + config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on DEBUG_KERNEL && (H8300 || M68KNOMMU || V850) +- depends (USERMODE && MCONSOLE) + help + Enables console device to interpret special characters as + commands to dump state information. +Index: linux-2.6.10/kernel/fork.c +=================================================================== +--- linux-2.6.10.orig/kernel/fork.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/kernel/fork.c 2005-04-05 12:40:36.070904560 +0800 +@@ -927,6 +927,9 @@ + * of CLONE_PTRACE. + */ + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); ++#ifdef TIF_SYSCALL_EMU ++ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); ++#endif + + /* Our parent execution domain becomes current domain + These must match for thread signalling to apply */ +Index: linux-2.6.10/mm/mmap.c +=================================================================== +--- linux-2.6.10.orig/mm/mmap.c 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/mm/mmap.c 2005-04-05 12:40:36.013913224 +0800 +@@ -759,11 +759,11 @@ + * The caller must hold down_write(current->mm->mmap_sem). + */ + +-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flags, unsigned long pgoff) ++unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file * file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long pgoff) + { +- struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; +@@ -1037,7 +1037,7 @@ + return error; + } + +-EXPORT_SYMBOL(do_mmap_pgoff); ++EXPORT_SYMBOL(__do_mmap_pgoff); + + /* Get an address range which is currently unmapped. + * For shmat() with addr=0. 
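The mm.h and mmap.c hunks above, and the mprotect.c hunk below, all apply the same refactoring idiom: the workhorse gains an explicit struct mm_struct * parameter, and the old entry point becomes a thin wrapper that passes current->mm, so existing callers compile and behave unchanged while /proc/mm and the SKAS ptrace extensions can drive another process's address space. Reduced to its shape (hypothetical names, a sketch rather than the patch's code):

#include <linux/sched.h>	/* current, struct mm_struct */

/* New workhorse: the target mm is an explicit parameter. */
extern long __do_operation(struct mm_struct *mm, unsigned long arg);

/* Old entry point: now a wrapper, so in-tree callers are untouched. */
static inline long do_operation(unsigned long arg)
{
	return __do_operation(current->mm, arg);
}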
+Index: linux-2.6.10/mm/proc_mm.c +=================================================================== +--- linux-2.6.10.orig/mm/proc_mm.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/mm/proc_mm.c 2005-04-05 12:40:36.014913072 +0800 +@@ -0,0 +1,181 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/mm.h" ++#include "linux/init.h" ++#include "linux/proc_fs.h" ++#include "linux/proc_mm.h" ++#include "linux/file.h" ++#include "linux/mman.h" ++#include "asm/uaccess.h" ++#include "asm/mmu_context.h" ++ ++static struct file_operations proc_mm_fops; ++ ++struct mm_struct *proc_mm_get_mm(int fd) ++{ ++ struct mm_struct *ret = ERR_PTR(-EBADF); ++ struct file *file; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ ret = ERR_PTR(-EINVAL); ++ if(file->f_op != &proc_mm_fops) ++ goto out_fput; ++ ++ ret = file->private_data; ++ out_fput: ++ fput(file); ++ out: ++ return(ret); ++} ++ ++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, ++ unsigned long len, unsigned long prot, ++ unsigned long flags, unsigned long fd, ++ unsigned long pgoff); ++ ++static ssize_t write_proc_mm(struct file *file, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct mm_struct *mm = file->private_data; ++ struct proc_mm_op req; ++ int n, ret; ++ ++ if(count > sizeof(req)) ++ return(-EINVAL); ++ ++ n = copy_from_user(&req, buffer, count); ++ if(n != 0) ++ return(-EFAULT); ++ ++ ret = count; ++ switch(req.op){ ++ case MM_MMAP: { ++ struct mm_mmap *map = &req.u.mmap; ++ ++ /* Nobody ever noticed it, but do_mmap_pgoff() calls ++ * get_unmapped_area() which checks current->mm, if ++ * MAP_FIXED is not set, so mmap() could replace ++ * an old mapping. ++ */ ++ if (! (map->flags & MAP_FIXED)) ++ return(-EINVAL); ++ ++ ret = do_mmap2(mm, map->addr, map->len, map->prot, ++ map->flags, map->fd, map->offset >> PAGE_SHIFT); ++ if((ret & ~PAGE_MASK) == 0) ++ ret = count; ++ ++ break; ++ } ++ case MM_MUNMAP: { ++ struct mm_munmap *unmap = &req.u.munmap; ++ ++ down_write(&mm->mmap_sem); ++ ret = do_munmap(mm, unmap->addr, unmap->len); ++ up_write(&mm->mmap_sem); ++ ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ case MM_MPROTECT: { ++ struct mm_mprotect *protect = &req.u.mprotect; ++ ++ ret = do_mprotect(mm, protect->addr, protect->len, ++ protect->prot); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ ++ case MM_COPY_SEGMENTS: { ++ struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments); ++ ++ if(IS_ERR(from)){ ++ ret = PTR_ERR(from); ++ break; ++ } ++ ++ ret = copy_context(mm, from); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return(ret); ++} ++ ++static int open_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = mm_alloc(); ++ int ret; ++ ++ ret = -ENOMEM; ++ if(mm == NULL) ++ goto out_mem; ++ ++ init_new_empty_context(mm); ++ arch_pick_mmap_layout(mm); ++ ++ spin_lock(&mmlist_lock); ++ list_add(&mm->mmlist, ¤t->mm->mmlist); ++ spin_unlock(&mmlist_lock); ++ ++ file->private_data = mm; ++ ++ return(0); ++ ++ out_mem: ++ return(ret); ++} ++ ++static int release_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = file->private_data; ++ ++ mmput(mm); ++ return(0); ++} ++ ++static struct file_operations proc_mm_fops = { ++ .open = open_proc_mm, ++ .release = release_proc_mm, ++ .write = write_proc_mm, ++}; ++ ++static int make_proc_mm(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = create_proc_entry("mm", 0222, 
&proc_root); ++ if(ent == NULL){ ++ printk("make_proc_mm : Failed to register /proc/mm\n"); ++ return(0); ++ } ++ ent->proc_fops = &proc_mm_fops; ++ ++ return(0); ++} ++ ++__initcall(make_proc_mm); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/mm/mprotect.c +=================================================================== +--- linux-2.6.10.orig/mm/mprotect.c 2005-03-31 16:20:10.000000000 +0800 ++++ linux-2.6.10/mm/mprotect.c 2005-04-05 12:40:36.011913528 +0800 +@@ -93,19 +93,20 @@ + { + pgd_t *dir; + unsigned long beg = start; ++ struct mm_struct * mm = vma->vm_mm; + +- dir = pgd_offset(current->mm, start); ++ dir = pgd_offset(mm, start); + flush_cache_range(vma, beg, end); + if (start >= end) + BUG(); +- spin_lock(¤t->mm->page_table_lock); ++ spin_lock(&mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + flush_tlb_range(vma, beg, end); +- spin_unlock(¤t->mm->page_table_lock); ++ spin_unlock(&mm->page_table_lock); + return; + } + +@@ -190,8 +191,9 @@ + return error; + } + +-asmlinkage long +-sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++long ++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, ++ unsigned long prot) + { + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct *vma, *prev; +@@ -220,9 +222,9 @@ + + vm_flags = calc_vm_prot_bits(prot); + +- down_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); + +- vma = find_vma_prev(current->mm, start, &prev); ++ vma = find_vma_prev(mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; +@@ -288,6 +290,11 @@ + } + } + out: +- up_write(¤t->mm->mmap_sem); ++ up_write(&mm->mmap_sem); + return error; + } ++ ++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++{ ++ return(do_mprotect(current->mm, start, len, prot)); ++} +Index: linux-2.6.10/mm/Makefile +=================================================================== +--- linux-2.6.10.orig/mm/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/mm/Makefile 2005-04-05 12:40:36.014913072 +0800 +@@ -18,3 +18,4 @@ + obj-$(CONFIG_SHMEM) += shmem.o + obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o + ++obj-$(CONFIG_PROC_MM) += proc_mm.o +Index: linux-2.6.10/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/entry.S 2005-03-31 16:20:08.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/entry.S 2005-04-05 12:40:36.064905472 +0800 +@@ -222,7 +222,7 @@ + SAVE_ALL + GET_THREAD_INFO(%ebp) + +- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +@@ -245,8 +245,8 @@ + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) +- # system call tracing in operation +- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ # system call tracing in operation / emulation ++ testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + 
cmpl $(nr_syscalls), %eax + jae syscall_badsys +@@ -307,6 +307,9 @@ + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace ++ cmpl $0, %eax ++ jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call +Index: linux-2.6.10/arch/i386/kernel/ptrace.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/ptrace.c 2004-12-25 05:34:29.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/ptrace.c 2005-04-05 12:40:36.061905928 +0800 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -406,15 +407,27 @@ + } + break; + ++ case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: /* restart after signal. */ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; ++ /* If we came here with PTRACE_SYSEMU and now continue with ++ * PTRACE_SYSCALL, entry.S used to intercept the syscall return. ++ * But it shouldn't! ++ * So we don't clear TIF_SYSCALL_EMU, which is always unused in ++ * this special case, to remember, we came from SYSEMU. That ++ * flag will be cleared by do_syscall_trace(). ++ */ ++ if (request == PTRACE_SYSEMU) { ++ set_tsk_thread_flag(child, TIF_SYSCALL_EMU); ++ } else if (request == PTRACE_CONT) { ++ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); ++ } + if (request == PTRACE_SYSCALL) { + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); +- } +- else { ++ } else { + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } + child->exit_code = data; +@@ -443,6 +456,8 @@ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; ++ /*See do_syscall_trace to know why we don't clear ++ * TIF_SYSCALL_EMU.*/ + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_singlestep(child); + child->exit_code = data; +@@ -542,6 +557,58 @@ + (struct user_desc __user *) data); + break; + ++#ifdef CONFIG_PROC_MM ++ case PTRACE_FAULTINFO: { ++ struct ptrace_faultinfo fault; ++ ++ fault = ((struct ptrace_faultinfo) ++ { .is_write = child->thread.error_code, ++ .addr = child->thread.cr2 }); ++ ret = copy_to_user((unsigned long *) data, &fault, ++ sizeof(fault)); ++ if(ret) ++ break; ++ break; ++ } ++ ++ case PTRACE_SIGPENDING: ++ ret = copy_to_user((unsigned long *) data, ++ &child->pending.signal, ++ sizeof(child->pending.signal)); ++ break; ++ ++ case PTRACE_LDT: { ++ struct ptrace_ldt ldt; ++ ++ if(copy_from_user(&ldt, (unsigned long *) data, ++ sizeof(ldt))){ ++ ret = -EIO; ++ break; ++ } ++ ret = __modify_ldt(child->mm, ldt.func, ldt.ptr, ldt.bytecount); ++ break; ++ } ++ ++ case PTRACE_SWITCH_MM: { ++ struct mm_struct *old = child->mm; ++ struct mm_struct *new = proc_mm_get_mm(data); ++ ++ if(IS_ERR(new)){ ++ ret = PTR_ERR(new); ++ break; ++ } ++ ++ atomic_inc(&new->mm_users); ++ task_lock(child); ++ child->mm = new; ++ child->active_mm = new; ++ task_unlock(child); ++ mmput(old); ++ ret = 0; ++ break; ++ } ++#endif ++ + default: + ret = ptrace_request(child, request, addr, data); + break; +@@ -557,8 +624,9 @@ + * - triggered by current->work.syscall_trace + */ + __attribute__((regparm(3))) +-void do_syscall_trace(struct pt_regs *regs, int entryexit) ++int do_syscall_trace(struct pt_regs *regs, int entryexit) + { ++ int is_sysemu, is_systrace, is_singlestep; + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(current, regs->orig_eax, +@@ -567,16 
+635,27 @@ + else + audit_syscall_exit(current, regs->eax); + } +- +- if (!test_thread_flag(TIF_SYSCALL_TRACE) && +- !test_thread_flag(TIF_SINGLESTEP)) +- return; ++ is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); ++ is_systrace = test_thread_flag(TIF_SYSCALL_TRACE); ++ is_singlestep = test_thread_flag(TIF_SINGLESTEP); ++ ++ if (!is_systrace && !is_singlestep && !is_sysemu) ++ return 0; ++ /* We can detect the case of coming from PTRACE_SYSEMU and now running ++ * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being ++ * set additionally. ++ * If so let's reset the flag and return without action. ++ */ ++ if (is_sysemu && (is_systrace || is_singlestep)) { ++ clear_thread_flag(TIF_SYSCALL_EMU); ++ return 0; ++ } + if (!(current->ptrace & PT_PTRACED)) +- return; ++ return 0; + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) && +- !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0)); ++ !is_singlestep ? 0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do +@@ -587,4 +666,6 @@ + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } ++ /* != 0 if nullifying the syscall, 0 if running it normally */ ++ return is_sysemu; + } +Index: linux-2.6.10/arch/i386/kernel/ldt.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/ldt.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/ldt.c 2005-04-05 12:40:36.062905776 +0800 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) +@@ -27,11 +28,12 @@ + } + #endif + +-static int alloc_ldt(mm_context_t *pc, int mincount, int reload) ++static int alloc_ldt(struct mm_struct *mm, int mincount, int reload) + { + void *oldldt; + void *newldt; + int oldsize; ++ mm_context_t * pc = &mm->context; + + if (mincount <= pc->size) + return 0; +@@ -58,13 +60,15 @@ + #ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); +- load_LDT(pc); ++ if (¤t->active_mm->context == pc) ++ load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); +- if (!cpus_equal(current->mm->cpu_vm_mask, mask)) ++ if (!cpus_equal(mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); + #else +- load_LDT(pc); ++ if (¤t->active_mm->context == pc) ++ load_LDT(pc); + #endif + } + if (oldsize) { +@@ -76,12 +80,12 @@ + return 0; + } + +-static inline int copy_ldt(mm_context_t *new, mm_context_t *old) ++static inline int copy_ldt(struct mm_struct *new, struct mm_struct *old) + { +- int err = alloc_ldt(new, old->size, 0); ++ int err = alloc_ldt(new, old->context.size, 0); + if (err < 0) + return err; +- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); ++ memcpy(new->context.ldt, old->context.ldt, old->context.size*LDT_ENTRY_SIZE); + return 0; + } + +@@ -89,22 +93,24 @@ + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
+ */ +-int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++int copy_context(struct mm_struct *mm, struct mm_struct *old_mm) + { +- struct mm_struct * old_mm; + int retval = 0; + +- init_MUTEX(&mm->context.sem); +- mm->context.size = 0; +- old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); +- retval = copy_ldt(&mm->context, &old_mm->context); ++ retval = copy_ldt(mm, old_mm); + up(&old_mm->context.sem); + } + return retval; + } + ++int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ init_new_empty_context(mm); ++ return copy_context(mm, current->mm); ++} ++ + /* + * No need to lock the MM as we are the last user + */ +@@ -121,11 +127,11 @@ + } + } + +-static int read_ldt(void __user * ptr, unsigned long bytecount) ++static int read_ldt(struct mm_struct * mm, void __user * ptr, ++ unsigned long bytecount) + { + int err; + unsigned long size; +- struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; +@@ -174,9 +180,8 @@ + return err; + } + +-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) ++static int write_ldt(struct mm_struct * mm, void __user * ptr, unsigned long bytecount, int oldmode) + { +- struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; +@@ -200,7 +205,7 @@ + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { +- error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); ++ error = alloc_ldt(mm, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } +@@ -233,23 +238,29 @@ + return error; + } + +-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++int __modify_ldt(struct mm_struct * mm, int func, void __user *ptr, ++ unsigned long bytecount) + { + int ret = -ENOSYS; + + switch (func) { + case 0: +- ret = read_ldt(ptr, bytecount); ++ ret = read_ldt(mm, ptr, bytecount); + break; + case 1: +- ret = write_ldt(ptr, bytecount, 1); ++ ret = write_ldt(mm, ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: +- ret = write_ldt(ptr, bytecount, 0); ++ ret = write_ldt(mm, ptr, bytecount, 0); + break; + } + return ret; + } ++ ++asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++{ ++ return __modify_ldt(current->mm, func, ptr, bytecount); ++} +Index: linux-2.6.10/arch/i386/kernel/sys_i386.c +=================================================================== +--- linux-2.6.10.orig/arch/i386/kernel/sys_i386.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/i386/kernel/sys_i386.c 2005-04-05 12:40:36.063905624 +0800 +@@ -41,7 +41,7 @@ + } + + /* common code for old and new mmaps */ +-static inline long do_mmap2( ++long do_mmap2(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +@@ -56,9 +56,9 @@ + goto out; + } + +- down_write(¤t->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); +- up_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); ++ error = __do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff); ++ up_write(&mm->mmap_sem); + + if (file) + fput(file); +@@ -70,7 +70,7 @@ + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) + { +- return do_mmap2(addr, len, prot, flags, fd, pgoff); ++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff); + } + + /* +@@ -101,7 +101,7 
@@ + if (a.offset & ~PAGE_MASK) + goto out; + +- err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); ++ err = do_mmap2(current->mm, a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + out: + return err; + } +Index: linux-2.6.10/arch/i386/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/i386/Kconfig 2005-03-31 15:35:23.000000000 +0800 ++++ linux-2.6.10/arch/i386/Kconfig 2005-04-05 12:40:36.066905168 +0800 +@@ -738,6 +738,10 @@ + depends on HIGHMEM64G + default y + ++config PROC_MM ++ bool "/proc/mm support" ++ default y ++ + # Common NUMA Features + config NUMA + bool "Numa Memory Allocation and Scheduler Support" +Index: linux-2.6.10/arch/um/include/frame.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame.h 2004-12-25 05:34:31.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,53 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_H_ +-#define __FRAME_H_ +- +-#include "sysdep/frame.h" +- +-struct frame_common { +- void *data; +- int len; +- int sig_index; +- int sr_index; +- int sr_relative; +- int sp_index; +- struct arch_frame_data arch; +-}; +- +-struct sc_frame { +- struct frame_common common; +- int sc_index; +-}; +- +-extern struct sc_frame signal_frame_sc; +- +-extern struct sc_frame signal_frame_sc_sr; +- +-struct si_frame { +- struct frame_common common; +- int sip_index; +- int si_index; +- int ucp_index; +- int uc_index; +-}; +- +-extern struct si_frame signal_frame_si; +- +-extern void capture_signal_stack(void); +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/frame_kern.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame_kern.h 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame_kern.h 2005-04-05 12:40:36.056906688 +0800 +@@ -6,8 +6,8 @@ + #ifndef __FRAME_KERN_H_ + #define __FRAME_KERN_H_ + +-#include "frame.h" +-#include "sysdep/frame_kern.h" ++#define _S(nr) (1<<((nr)-1)) ++#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) + + extern int setup_signal_stack_sc(unsigned long stack_top, int sig, + struct k_sigaction *ka, +Index: linux-2.6.10/arch/um/include/frame_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/frame_user.h 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/um/include/frame_user.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,23 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_USER_H_ +-#define __FRAME_USER_H_ +- +-#include "sysdep/frame_user.h" +-#include "frame.h" +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. 
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/ptrace_user.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/ptrace_user.h	2004-12-25 05:33:51.000000000 +0800
++++ linux-2.6.10/arch/um/include/ptrace_user.h	2005-04-05 12:40:36.057906536 +0800
+@@ -26,4 +26,35 @@
+ int get_using_sysemu(void);
+ extern int sysemu_supported;
+ 
++
++/* syscall emulation path in ptrace */
++
++#ifndef PTRACE_SYSEMU
++#define PTRACE_SYSEMU 31
++#endif
++
++/* On architectures that started to support PTRACE_O_TRACESYSGOOD
++ * in linux 2.4, there are two different definitions of
++ * PTRACE_SETOPTIONS: linux 2.4 uses 21 while linux 2.6 uses 0x4200.
++ * For binary compatibility, 2.6 also supports the old "21", named
++ * PTRACE_OLDSETOPTIONS. On these architectures, UML must always use
++ * "21", to ensure the kernel runs on both 2.4 and 2.6 hosts without
++ * recompilation. So, we use PTRACE_OLDSETOPTIONS in UML.
++ * We also want to be able to build the kernel on 2.4, which doesn't
++ * have PTRACE_OLDSETOPTIONS. So, if it is missing, we declare
++ * PTRACE_OLDSETOPTIONS to be the same as PTRACE_SETOPTIONS.
++ *
++ * On architectures that start to support PTRACE_O_TRACESYSGOOD only
++ * in linux 2.6, PTRACE_OLDSETOPTIONS is never defined, and also isn't
++ * supported by the host kernel. In that case, our trick lets us use
++ * the new 0x4200 with the name PTRACE_OLDSETOPTIONS.
++ */
++#ifndef PTRACE_OLDSETOPTIONS
++#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
++#endif
++
++void set_using_sysemu(int value);
++int get_using_sysemu(void);
++extern int sysemu_supported;
++
+ #endif
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame.h	2004-12-25 05:35:01.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame.h	2005-04-05 19:01:49.158500672 +0800
+@@ -1,29 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_I386_H
+-#define __FRAME_I386_H
+-
+-struct arch_frame_data_raw {
+-	unsigned long fp_start;
+-	unsigned long sr;
+-};
+-
+-struct arch_frame_data {
+-	int fpstate_size;
+-};
+-
+-#endif
+-
+-/*
+- * Overrides for Emacs so that we follow Linus's tabbing style.
+- * Emacs will notice this stuff at the end of the file and automatically
+- * adjust the settings for this buffer only. This must remain at the end
+- * of the file.
+- * ---------------------------------------------------------------------------
+- * Local variables:
+- * c-file-style: "linux"
+- * End:
+- */
+Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h
+===================================================================
+--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_kern.h	2004-12-25 05:34:26.000000000 +0800
++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_kern.h	2005-04-05 19:01:49.158500672 +0800
+@@ -1,69 +0,0 @@
+-/*
+- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+- * Licensed under the GPL
+- */
+-
+-#ifndef __FRAME_KERN_I386_H
+-#define __FRAME_KERN_I386_H
+-
+-/* This is called from sys_sigreturn. It takes the sp at the point of the
+- * sigreturn system call and returns the address of the sigcontext struct
+- * on the stack.
+- */ +- +-static inline void *sp_to_sc(unsigned long sp) +-{ +- return((void *) sp); +-} +- +-static inline void *sp_to_uc(unsigned long sp) +-{ +- unsigned long uc; +- +- uc = sp + signal_frame_si.uc_index - +- signal_frame_si.common.sp_index - 4; +- return((void *) uc); +-} +- +-static inline void *sp_to_rt_sc(unsigned long sp) +-{ +- unsigned long sc; +- +- sc = sp - signal_frame_si.common.sp_index + +- signal_frame_si.common.len - 4; +- return((void *) sc); +-} +- +-static inline void *sp_to_mask(unsigned long sp) +-{ +- unsigned long mask; +- +- mask = sp - signal_frame_sc.common.sp_index + +- signal_frame_sc.common.len - 8; +- return((void *) mask); +-} +- +-extern int sc_size(void *data); +- +-static inline void *sp_to_rt_mask(unsigned long sp) +-{ +- unsigned long mask; +- +- mask = sp - signal_frame_si.common.sp_index + +- signal_frame_si.common.len + +- sc_size(&signal_frame_si.common.arch) - 4; +- return((void *) mask); +-} +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/sysdep-i386/frame_user.h 2004-12-25 05:35:28.000000000 +0800 ++++ linux-2.6.10/arch/um/include/sysdep-i386/frame_user.h 2005-04-05 19:01:49.158500672 +0800 +@@ -1,91 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#ifndef __FRAME_USER_I386_H +-#define __FRAME_USER_I386_H +- +-#include +-#include "sysdep/frame.h" +- +-/* This stuff is to calculate the size of the fp state struct at runtime +- * because it has changed between 2.2 and 2.4 and it would be good for a +- * UML compiled on one to work on the other. +- * So, setup_arch_frame_raw fills in the arch struct with the raw data, which +- * just contains the address of the end of the sigcontext. This is invoked +- * from the signal handler. +- * setup_arch_frame uses that data to figure out what +- * arch_frame_data.fpstate_size should be. It really has no idea, since it's +- * not allowed to do sizeof(struct fpstate) but it's safe to consider that it's +- * everything from the end of the sigcontext up to the top of the stack. So, +- * it masks off the page number to get the offset within the page and subtracts +- * that from the page size, and that's how big the fpstate struct will be +- * considered to be. +- */ +- +-static inline void setup_arch_frame_raw(struct arch_frame_data_raw *data, +- void *end, unsigned long srp) +-{ +- unsigned long sr = *((unsigned long *) srp); +- +- data->fp_start = (unsigned long) end; +- if((sr & PAGE_MASK) == ((unsigned long) end & PAGE_MASK)) +- data->sr = sr; +- else data->sr = 0; +-} +- +-static inline void setup_arch_frame(struct arch_frame_data_raw *in, +- struct arch_frame_data *out) +-{ +- unsigned long fpstate_start = in->fp_start; +- +- if(in->sr == 0){ +- fpstate_start &= ~PAGE_MASK; +- out->fpstate_size = PAGE_SIZE - fpstate_start; +- } +- else { +- out->fpstate_size = in->sr - fpstate_start; +- } +-} +- +-/* This figures out where on the stack the SA_RESTORER function address +- * is stored. 
For i386, it's the signal handler return address, so it's +- * located next to the frame pointer. +- * This is inlined, so __builtin_frame_address(0) is correct. Otherwise, +- * it would have to be __builtin_frame_address(1). +- */ +- +-#define frame_restorer() \ +-({ \ +- unsigned long *fp; \ +-\ +- fp = __builtin_frame_address(0); \ +- ((unsigned long) (fp + 1)); \ +-}) +- +-/* Similarly, this returns the value of sp when the handler was first +- * entered. This is used to calculate the proper sp when delivering +- * signals. +- */ +- +-#define frame_sp() \ +-({ \ +- unsigned long *fp; \ +-\ +- fp = __builtin_frame_address(0); \ +- ((unsigned long) (fp + 1)); \ +-}) +- +-#endif +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/include/elf_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/elf_user.h 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/include/elf_user.h 2005-04-05 12:40:36.054906992 +0800 +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH ++ * Author: Bodo Stroesser ++ * Licensed under the GPL ++ */ ++ ++#ifndef __ELF_USER_H__ ++#define __ELF_USER_H__ ++ ++/* For compilation on a host that doesn't support AT_SYSINFO (Linux 2.4) */ ++ ++#ifndef AT_SYSINFO ++#define AT_SYSINFO 32 ++#endif ++#ifndef AT_SYSINFO_EHDR ++#define AT_SYSINFO_EHDR 33 ++#endif ++ ++#endif +Index: linux-2.6.10/arch/um/include/skas_ptrace.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/skas_ptrace.h 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/um/include/skas_ptrace.h 2005-04-05 12:40:36.056906688 +0800 +@@ -6,6 +6,7 @@ + #ifndef __SKAS_PTRACE_H + #define __SKAS_PTRACE_H + ++#ifndef PTRACE_FAULTINFO + struct ptrace_faultinfo { + int is_write; + unsigned long addr; +@@ -21,6 +22,7 @@ + #define PTRACE_SIGPENDING 53 + #define PTRACE_LDT 54 + #define PTRACE_SWITCH_MM 55 ++#endif + + #endif + +Index: linux-2.6.10/arch/um/include/signal_user.h +=================================================================== +--- linux-2.6.10.orig/arch/um/include/signal_user.h 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/include/signal_user.h 2005-04-05 12:40:36.055906840 +0800 +@@ -14,6 +14,8 @@ + extern int set_signals(int enable); + extern int get_signals(void); + ++#define SYSCALL_TRAP 0x80 ++ + #endif + + /* +Index: linux-2.6.10/arch/um/sys-i386/ptrace_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/ptrace_user.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/ptrace_user.c 2005-04-05 12:40:36.022911856 +0800 +@@ -17,17 +17,30 @@ + + int ptrace_getregs(long pid, unsigned long *regs_out) + { +- return(ptrace(PTRACE_GETREGS, pid, 0, regs_out)); ++ if(ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) ++ return(-errno); ++ return(0); + } + + int ptrace_setregs(long pid, unsigned long *regs) + { +- return(ptrace(PTRACE_SETREGS, pid, 0, regs)); ++ if(ptrace(PTRACE_SETREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); + } + + int ptrace_getfpregs(long pid, unsigned long *regs) + { 
+- return(ptrace(PTRACE_GETFPREGS, pid, 0, regs)); ++ if(ptrace(PTRACE_GETFPREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); ++} ++ ++int ptrace_setfpregs(long pid, unsigned long *regs) ++{ ++ if(ptrace(PTRACE_SETFPREGS, pid, 0, regs) < 0) ++ return(-errno); ++ return(0); + } + + static void write_debugregs(int pid, unsigned long *regs) +Index: linux-2.6.10/arch/um/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/sigcontext.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/sigcontext.c 2005-04-05 12:40:36.023911704 +0800 +@@ -9,22 +9,14 @@ + #include + #include "sysdep/ptrace.h" + #include "kern_util.h" +-#include "frame_user.h" +- +-int sc_size(void *data) +-{ +- struct arch_frame_data *arch = data; +- +- return(sizeof(struct sigcontext) + arch->fpstate_size); +-} + + void sc_to_sc(void *to_ptr, void *from_ptr) + { + struct sigcontext *to = to_ptr, *from = from_ptr; +- int size = sizeof(*to) + signal_frame_sc.common.arch.fpstate_size; + +- memcpy(to, from, size); +- if(from->fpstate != NULL) to->fpstate = (struct _fpstate *) (to + 1); ++ memcpy(to, from, sizeof(*to) + sizeof(struct _fpstate)); ++ if(from->fpstate != NULL) ++ to->fpstate = (struct _fpstate *) (to + 1); + } + + unsigned long *sc_sigmask(void *sc_ptr) +Index: linux-2.6.10/arch/um/sys-i386/sysrq.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/sysrq.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/sysrq.c 2005-04-05 12:40:36.022911856 +0800 +@@ -33,3 +33,13 @@ + + show_trace((unsigned long *) ®s); + } ++ ++/* Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/sys-i386/signal.c +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/signal.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/sys-i386/signal.c 2005-04-05 12:40:36.021912008 +0800 +@@ -0,0 +1,374 @@ ++/* ++ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/signal.h" ++#include "linux/ptrace.h" ++#include "asm/current.h" ++#include "asm/ucontext.h" ++#include "asm/uaccess.h" ++#include "asm/unistd.h" ++#include "frame_kern.h" ++#include "signal_user.h" ++#include "ptrace_user.h" ++#include "sigcontext.h" ++#include "mode.h" ++ ++#ifdef CONFIG_MODE_SKAS ++ ++#include "skas.h" ++ ++static int copy_sc_from_user_skas(struct pt_regs *regs, ++ struct sigcontext *from) ++{ ++ struct sigcontext sc; ++ unsigned long fpregs[HOST_FP_SIZE]; ++ int err; ++ ++ err = copy_from_user(&sc, from, sizeof(sc)); ++ err |= copy_from_user(fpregs, sc.fpstate, sizeof(fpregs)); ++ if(err) ++ return(err); ++ ++ REGS_GS(regs->regs.skas.regs) = sc.gs; ++ REGS_FS(regs->regs.skas.regs) = sc.fs; ++ REGS_ES(regs->regs.skas.regs) = sc.es; ++ REGS_DS(regs->regs.skas.regs) = sc.ds; ++ REGS_EDI(regs->regs.skas.regs) = sc.edi; ++ REGS_ESI(regs->regs.skas.regs) = sc.esi; ++ REGS_EBP(regs->regs.skas.regs) = sc.ebp; ++ REGS_SP(regs->regs.skas.regs) = sc.esp; ++ REGS_EBX(regs->regs.skas.regs) = sc.ebx; ++ REGS_EDX(regs->regs.skas.regs) = sc.edx; ++ REGS_ECX(regs->regs.skas.regs) = sc.ecx; ++ REGS_EAX(regs->regs.skas.regs) = sc.eax; ++ REGS_IP(regs->regs.skas.regs) = sc.eip; ++ REGS_CS(regs->regs.skas.regs) = sc.cs; ++ REGS_EFLAGS(regs->regs.skas.regs) = sc.eflags; ++ REGS_SS(regs->regs.skas.regs) = sc.ss; ++ regs->regs.skas.fault_addr = sc.cr2; ++ regs->regs.skas.fault_type = FAULT_WRITE(sc.err); ++ regs->regs.skas.trap_type = sc.trapno; ++ ++ err = ptrace_setfpregs(userspace_pid[0], fpregs); ++ if(err < 0){ ++ printk("copy_sc_from_user_skas - PTRACE_SETFPREGS failed, " ++ "errno = %d\n", err); ++ return(1); ++ } ++ ++ return(0); ++} ++ ++int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, ++ struct pt_regs *regs, unsigned long fault_addr, ++ int fault_type) ++{ ++ struct sigcontext sc; ++ unsigned long fpregs[HOST_FP_SIZE]; ++ int err; ++ ++ sc.gs = REGS_GS(regs->regs.skas.regs); ++ sc.fs = REGS_FS(regs->regs.skas.regs); ++ sc.es = REGS_ES(regs->regs.skas.regs); ++ sc.ds = REGS_DS(regs->regs.skas.regs); ++ sc.edi = REGS_EDI(regs->regs.skas.regs); ++ sc.esi = REGS_ESI(regs->regs.skas.regs); ++ sc.ebp = REGS_EBP(regs->regs.skas.regs); ++ sc.esp = REGS_SP(regs->regs.skas.regs); ++ sc.ebx = REGS_EBX(regs->regs.skas.regs); ++ sc.edx = REGS_EDX(regs->regs.skas.regs); ++ sc.ecx = REGS_ECX(regs->regs.skas.regs); ++ sc.eax = REGS_EAX(regs->regs.skas.regs); ++ sc.eip = REGS_IP(regs->regs.skas.regs); ++ sc.cs = REGS_CS(regs->regs.skas.regs); ++ sc.eflags = REGS_EFLAGS(regs->regs.skas.regs); ++ sc.esp_at_signal = regs->regs.skas.regs[UESP]; ++ sc.ss = regs->regs.skas.regs[SS]; ++ sc.cr2 = fault_addr; ++ sc.err = TO_SC_ERR(fault_type); ++ sc.trapno = regs->regs.skas.trap_type; ++ ++ err = ptrace_getfpregs(userspace_pid[0], fpregs); ++ if(err < 0){ ++ printk("copy_sc_to_user_skas - PTRACE_GETFPREGS failed, " ++ "errno = %d\n", err); ++ return(1); ++ } ++ to_fp = (to_fp ? 
to_fp : (struct _fpstate *) (to + 1)); ++ sc.fpstate = to_fp; ++ ++ if(err) ++ return(err); ++ ++ return(copy_to_user(to, &sc, sizeof(sc)) || ++ copy_to_user(to_fp, fpregs, sizeof(fpregs))); ++} ++#endif ++ ++#ifdef CONFIG_MODE_TT ++int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, ++ int fpsize) ++{ ++ struct _fpstate *to_fp, *from_fp; ++ unsigned long sigs; ++ int err; ++ ++ to_fp = to->fpstate; ++ from_fp = from->fpstate; ++ sigs = to->oldmask; ++ err = copy_from_user(to, from, sizeof(*to)); ++ to->oldmask = sigs; ++ if(to_fp != NULL){ ++ err |= copy_from_user(&to->fpstate, &to_fp, ++ sizeof(to->fpstate)); ++ err |= copy_from_user(to_fp, from_fp, fpsize); ++ } ++ return(err); ++} ++ ++int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, ++ struct sigcontext *from, int fpsize) ++{ ++ struct _fpstate *to_fp, *from_fp; ++ int err; ++ ++ to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); ++ from_fp = from->fpstate; ++ err = copy_to_user(to, from, sizeof(*to)); ++ if(from_fp != NULL){ ++ err |= copy_to_user(&to->fpstate, &to_fp, ++ sizeof(to->fpstate)); ++ err |= copy_to_user(to_fp, from_fp, fpsize); ++ } ++ return(err); ++} ++#endif ++ ++static int copy_sc_from_user(struct pt_regs *to, void *from) ++{ ++ int ret; ++ ++ ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, ++ sizeof(struct _fpstate)), ++ copy_sc_from_user_skas(to, from)); ++ return(ret); ++} ++ ++static int copy_sc_to_user(struct sigcontext *to, struct _fpstate *fp, ++ struct pt_regs *from) ++{ ++ return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), ++ sizeof(*fp)), ++ copy_sc_to_user_skas(to, fp, from, ++ current->thread.cr2, ++ current->thread.err))); ++} ++ ++static int copy_ucontext_to_user(struct ucontext *uc, struct _fpstate *fp, ++ sigset_t *set, unsigned long sp) ++{ ++ int err = 0; ++ ++ err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); ++ err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); ++ err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); ++ err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs); ++ err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); ++ return(err); ++} ++ ++struct sigframe ++{ ++ char *pretcode; ++ int sig; ++ struct sigcontext sc; ++ struct _fpstate fpstate; ++ unsigned long extramask[_NSIG_WORDS-1]; ++ char retcode[8]; ++}; ++ ++struct rt_sigframe ++{ ++ char *pretcode; ++ int sig; ++ struct siginfo *pinfo; ++ void *puc; ++ struct siginfo info; ++ struct ucontext uc; ++ struct _fpstate fpstate; ++ char retcode[8]; ++}; ++ ++int setup_signal_stack_sc(unsigned long stack_top, int sig, ++ struct k_sigaction *ka, struct pt_regs *regs, ++ sigset_t *mask) ++{ ++ struct sigframe __user *frame; ++ void *restorer; ++ int err = 0; ++ ++ stack_top &= -8UL; ++ frame = (struct sigframe *) stack_top - 1; ++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame))) ++ return(1); ++ ++ restorer = (void *) frame->retcode; ++ if(ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ ++ err |= __put_user(restorer, &frame->pretcode); ++ err |= __put_user(sig, &frame->sig); ++ err |= copy_sc_to_user(&frame->sc, NULL, regs); ++ err |= __put_user(mask->sig[0], &frame->sc.oldmask); ++ if (_NSIG_WORDS > 1) ++ err |= __copy_to_user(&frame->extramask, &mask->sig[1], ++ sizeof(frame->extramask)); ++ ++ /* ++ * This is popl %eax ; movl $,%eax ; int $0x80 ++ * ++ * WE DO NOT USE IT ANY MORE! 
It's only left here for historical ++ * reasons and because gdb uses it as a signature to notice ++ * signal handler stack frames. ++ */ ++ err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); ++ err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); ++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); ++ ++ if(err) ++ return(err); ++ ++ PT_REGS_SP(regs) = (unsigned long) frame; ++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; ++ PT_REGS_EAX(regs) = (unsigned long) sig; ++ PT_REGS_EDX(regs) = (unsigned long) 0; ++ PT_REGS_ECX(regs) = (unsigned long) 0; ++ ++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ++ ptrace_notify(SIGTRAP); ++ return(0); ++} ++ ++int setup_signal_stack_si(unsigned long stack_top, int sig, ++ struct k_sigaction *ka, struct pt_regs *regs, ++ siginfo_t *info, sigset_t *mask) ++{ ++ struct rt_sigframe __user *frame; ++ void *restorer; ++ int err = 0; ++ ++ stack_top &= -8UL; ++ frame = (struct rt_sigframe *) stack_top - 1; ++ if(verify_area(VERIFY_WRITE, frame, sizeof(*frame))) ++ return(1); ++ ++ restorer = (void *) frame->retcode; ++ if(ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ ++ err |= __put_user(restorer, &frame->pretcode); ++ err |= __put_user(sig, &frame->sig); ++ err |= __put_user(&frame->info, &frame->pinfo); ++ err |= __put_user(&frame->uc, &frame->puc); ++ err |= copy_siginfo_to_user(&frame->info, info); ++ err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, ++ PT_REGS_SP(regs)); ++ ++ /* ++ * This is movl $,%eax ; int $0x80 ++ * ++ * WE DO NOT USE IT ANY MORE! It's only left here for historical ++ * reasons and because gdb uses it as a signature to notice ++ * signal handler stack frames. ++ */ ++ err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); ++ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); ++ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); ++ ++ if(err) ++ return(err); ++ ++ PT_REGS_SP(regs) = (unsigned long) frame; ++ PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; ++ PT_REGS_EAX(regs) = (unsigned long) sig; ++ PT_REGS_EDX(regs) = (unsigned long) &frame->info; ++ PT_REGS_ECX(regs) = (unsigned long) &frame->uc; ++ ++ if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ++ ptrace_notify(SIGTRAP); ++ return(0); ++} ++ ++long sys_sigreturn(struct pt_regs regs) ++{ ++ unsigned long __user sp = PT_REGS_SP(¤t->thread.regs); ++ struct sigframe __user *frame = (struct sigframe *)(sp - 8); ++ sigset_t set; ++ struct sigcontext __user *sc = &frame->sc; ++ unsigned long __user *oldmask = &sc->oldmask; ++ unsigned long __user *extramask = &frame->extramask; ++ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); ++ ++ if(copy_from_user(&set.sig[0], oldmask, sizeof(&set.sig[0])) || ++ copy_from_user(&set.sig[1], extramask, sig_size)) ++ goto segfault; ++ ++ sigdelsetmask(&set, ~_BLOCKABLE); ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ current->blocked = set; ++ recalc_sigpending(); ++ spin_unlock_irq(¤t->sighand->siglock); ++ ++ if(copy_sc_from_user(¤t->thread.regs, sc)) ++ goto segfault; ++ ++ PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; /* Avoid ERESTART handling */ ++ return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); ++ ++ segfault: ++ force_sig(SIGSEGV, current); ++ return 0; ++} ++ ++long sys_rt_sigreturn(struct pt_regs regs) ++{ ++ unsigned long __user sp = PT_REGS_SP(¤t->thread.regs); ++ struct rt_sigframe __user *frame = (struct rt_sigframe *) (sp - 4); ++ sigset_t set; ++ struct 
ucontext __user *uc = &frame->uc; ++ int sig_size = _NSIG_WORDS * sizeof(unsigned long); ++ ++ if(copy_from_user(&set, &uc->uc_sigmask, sig_size)) ++ goto segfault; ++ ++ sigdelsetmask(&set, ~_BLOCKABLE); ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ current->blocked = set; ++ recalc_sigpending(); ++ spin_unlock_irq(¤t->sighand->siglock); ++ ++ if(copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) ++ goto segfault; ++ ++ PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; /* Avoid ERESTART handling */ ++ return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); ++ ++ segfault: ++ force_sig(SIGSEGV, current); ++ return 0; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/sys-i386/Makefile 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/arch/um/sys-i386/Makefile 2005-04-05 12:40:36.023911704 +0800 +@@ -1,5 +1,5 @@ + obj-y = bitops.o bugs.o checksum.o fault.o ksyms.o ldt.o ptrace.o \ +- ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o ++ ptrace_user.o semaphore.o signal.o sigcontext.o syscalls.o sysrq.o + + obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_MODULES) += module.o +Index: linux-2.6.10/arch/um/kernel/mem_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/mem_user.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/mem_user.c 2005-04-05 12:40:36.051907448 +0800 +@@ -101,6 +101,8 @@ + } + printf("OK\n"); + munmap(addr, UM_KERN_PAGE_SIZE); ++ ++ os_close_file(fd); + } + + static int have_devanon = 0; +@@ -261,6 +263,39 @@ + } + #endif + ++#if 0 ++/* Debugging facility for dumping stuff out to the host, avoiding the timing ++ * problems that come with printf and breakpoints. ++ * Enable in case of emergency. ++ */ ++ ++int logging = 1; ++int logging_fd = -1; ++ ++int logging_line = 0; ++char logging_buf[512]; ++ ++void log(char *fmt, ...) ++{ ++ va_list ap; ++ struct timeval tv; ++ struct openflags flags; ++ ++ if(logging == 0) return; ++ if(logging_fd < 0){ ++ flags = of_create(of_trunc(of_rdwr(OPENFLAGS()))); ++ logging_fd = os_open_file("log", flags, 0644); ++ } ++ gettimeofday(&tv, NULL); ++ sprintf(logging_buf, "%d\t %u.%u ", logging_line++, tv.tv_sec, ++ tv.tv_usec); ++ va_start(ap, fmt); ++ vsprintf(&logging_buf[strlen(logging_buf)], fmt, ap); ++ va_end(ap); ++ write(logging_fd, logging_buf, strlen(logging_buf)); ++} ++#endif ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +Index: linux-2.6.10/arch/um/kernel/time.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/time.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/time.c 2005-04-05 12:40:36.046908208 +0800 +@@ -60,6 +60,9 @@ + (setitimer(ITIMER_REAL, &disable, NULL) < 0)) + printk("disnable_timer - setitimer failed, errno = %d\n", + errno); ++ /* If there are signals already queued, after unblocking ignore them */ ++ set_handler(SIGALRM, SIG_IGN, 0, -1); ++ set_handler(SIGVTALRM, SIG_IGN, 0, -1); + } + + void switch_timers(int to_real) +Index: linux-2.6.10/arch/um/kernel/ksyms.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/ksyms.c 2004-12-25 05:33:50.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/ksyms.c 2005-04-05 12:40:36.049907752 +0800 +@@ -48,6 +48,7 @@ + EXPORT_SYMBOL(mode_tt); + EXPORT_SYMBOL(handle_page_fault); + EXPORT_SYMBOL(find_iomem); ++EXPORT_SYMBOL(end_iomem); + + #ifdef CONFIG_MODE_TT + EXPORT_SYMBOL(strncpy_from_user_tt); +Index: linux-2.6.10/arch/um/kernel/um_arch.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/um_arch.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/um_arch.c 2005-04-05 12:40:36.045908360 +0800 +@@ -44,11 +44,6 @@ + .ipi_pipe = { -1, -1 } + }; + +-/* Placeholder to make UML link until the vsyscall stuff is actually +- * implemented +- */ +-void *__kernel_vsyscall; +- + unsigned long thread_saved_pc(struct task_struct *task) + { + return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas, +@@ -326,6 +321,11 @@ + */ + check_tmpexec(); + ++ /* Need to check this early because mmapping happens before the ++ * kernel is running. 
++ */ ++ check_tmpexec(); ++ + brk_start = (unsigned long) sbrk(0); + CHOOSE_MODE_PROC(before_mem_tt, before_mem_skas, brk_start); + /* Increase physical memory size for exec-shield users +Index: linux-2.6.10/arch/um/kernel/process.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/process.c 2004-12-25 05:35:25.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/process.c 2005-04-05 12:40:36.025911400 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -285,6 +286,9 @@ + printk("Checking that ptrace can change system call numbers..."); + pid = start_ptraced_child(&stack); + ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) ++ panic("check_ptrace: PTRACE_SETOPTIONS failed, errno = %d", errno); ++ + while(1){ + if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) + panic("check_ptrace : ptrace failed, errno = %d", +@@ -292,8 +296,8 @@ + CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); + if(n < 0) + panic("check_ptrace : wait failed, errno = %d", errno); +- if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) +- panic("check_ptrace : expected SIGTRAP, " ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP))) ++ panic("check_ptrace : expected (SIGTRAP|SYSCALL_TRAP), " + "got status = %d", status); + + syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, +Index: linux-2.6.10/arch/um/kernel/process_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/process_kern.c 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/process_kern.c 2005-04-05 12:40:36.047908056 +0800 +@@ -291,8 +291,6 @@ + + EXPORT_SYMBOL(disable_hlt); + +-extern int signal_frame_size; +- + void *um_kmalloc(int size) + { + return(kmalloc(size, GFP_KERNEL)); +Index: linux-2.6.10/arch/um/kernel/signal_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/signal_user.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/signal_user.c 2005-04-05 12:40:36.050907600 +0800 +@@ -61,6 +61,10 @@ + * disable profiling; it's safe because the profiling code does not interact + * with the kernel code at all.*/ + ++/* Both here and in set/get_signal we don't touch SIGPROF, because we must not ++ * disable profiling; it's safe because the profiling code does not interact ++ * with the kernel code at all.*/ ++ + static void change_signals(int type) + { + sigset_t mask; +Index: linux-2.6.10/arch/um/kernel/initrd_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/initrd_user.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/initrd_user.c 2005-04-05 12:40:36.026911248 +0800 +@@ -29,6 +29,8 @@ + filename, -n); + return(-1); + } ++ ++ os_close_file(fd); + return(0); + } + +Index: linux-2.6.10/arch/um/kernel/dyn.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/dyn.lds.S 2004-12-25 05:34:48.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/dyn.lds.S 2005-04-05 12:40:36.044908512 +0800 +@@ -7,8 +7,11 @@ + + SECTIONS + { ++ PROVIDE (__executable_start = START); + . = START + SIZEOF_HEADERS; + .interp : { *(.interp) } ++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start ++ * is remapped.*/ + __binary_start = .; + . 
= ALIGN(4096); /* Init code and data */ + _stext = .; +Index: linux-2.6.10/arch/um/kernel/ptrace.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/ptrace.c 2004-12-25 05:35:50.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/ptrace.c 2005-04-05 12:40:36.044908512 +0800 +@@ -16,6 +16,7 @@ + #include "asm/uaccess.h" + #include "kern_util.h" + #include "ptrace_user.h" ++#include "signal_user.h" + + /* + * Called by kernel/ptrace.c when detaching.. +@@ -328,8 +329,10 @@ + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + tracesysgood = (current->ptrace & PT_TRACESYSGOOD) && !is_singlestep; +- ptrace_notify(SIGTRAP | (tracesysgood ? 0x80 : 0)); +- ++ ptrace_notify(SIGTRAP | (tracesysgood ? SYSCALL_TRAP : 0)); ++ if ( entryexit ) /* force do_signal() --> is_syscall() */ ++ set_thread_flag(TIF_SIGPENDING); ++ + /* force do_signal() --> is_syscall() */ + set_thread_flag(TIF_SIGPENDING); + +Index: linux-2.6.10/arch/um/kernel/uml.lds.S +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/uml.lds.S 2005-04-01 12:25:25.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/uml.lds.S 2005-04-05 12:40:36.049907752 +0800 +@@ -7,8 +7,12 @@ + + SECTIONS + { ++ /*This must contain the right address - not quite the default ELF one.*/ ++ PROVIDE (__executable_start = START); + . = START + SIZEOF_HEADERS; + ++ /* Used in arch/um/kernel/mem.c. Any memory between START and __binary_start ++ * is remapped.*/ + __binary_start = .; + #ifdef MODE_TT + .thread_private : { +@@ -20,9 +24,13 @@ + } + . = ALIGN(4096); + .remap : { arch/um/kernel/tt/unmap_fin.o (.text) } +-#endif + ++ /*If you put this after #endif, STATIC build without TT mode ++ gives a segfaulting binary. And after all, a hole just after ++ binary_start is not very polite to glibc.*/ + . = ALIGN(4096); /* Init code and data */ ++#endif ++ + _stext = .; + __init_begin = .; + .init.text : { +Index: linux-2.6.10/arch/um/kernel/main.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/main.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/main.c 2005-04-05 12:40:36.024911552 +0800 +@@ -81,6 +81,8 @@ + + extern int uml_exitcode; + ++extern void scan_elf_aux( char **envp); ++ + int main(int argc, char **argv, char **envp) + { + char **new_argv; +@@ -147,6 +149,8 @@ + set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); + set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); + ++ scan_elf_aux( envp); ++ + do_uml_initcalls(); + ret = linux_main(argc, argv); + +@@ -155,18 +159,20 @@ + int err; + + printf("\n"); +- +- /* Let any pending signals fire, then disable them. This +- * ensures that they won't be delivered after the exec, when +- * they are definitely not expected. +- */ +- unblock_signals(); ++ /* stop timers and set SIG*ALRM to be ignored */ + disable_timer(); ++ /* disable SIGIO for the fds and set SIGIO to be ignored */ + err = deactivate_all_fds(); + if(err) + printf("deactivate_all_fds failed, errno = %d\n", + -err); + ++ /* Let any pending signals fire now. This ensures ++ * that they won't be delivered after the exec, when ++ * they are definitely not expected. 
++ */ ++ unblock_signals(); ++ + execvp(new_argv[0], new_argv); + perror("Failed to exec kernel"); + ret = 1; +Index: linux-2.6.10/arch/um/kernel/irq_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/irq_user.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/irq_user.c 2005-04-05 12:40:36.028910944 +0800 +@@ -374,6 +374,8 @@ + if(err) + return(err); + } ++ /* If there is a signal already queued, after unblocking ignore it */ ++ set_handler(SIGIO, SIG_IGN, 0, -1); + + return(0); + } +Index: linux-2.6.10/arch/um/kernel/signal_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/signal_kern.c 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/signal_kern.c 2005-04-05 12:40:36.048907904 +0800 +@@ -230,53 +230,6 @@ + return(do_sigaltstack(uss, uoss, PT_REGS_SP(¤t->thread.regs))); + } + +-extern int userspace_pid[]; +- +-static int copy_sc_from_user(struct pt_regs *to, void *from, +- struct arch_frame_data *arch) +-{ +- int ret; +- +- ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, arch), +- copy_sc_from_user_skas(userspace_pid[0], +- &to->regs, from)); +- return(ret); +-} +- +-long sys_sigreturn(struct pt_regs regs) +-{ +- void __user *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); +- void __user *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); +- int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); +- +- spin_lock_irq(¤t->sighand->siglock); +- copy_from_user(¤t->blocked.sig[0], sc_sigmask(sc), +- sizeof(current->blocked.sig[0])); +- copy_from_user(¤t->blocked.sig[1], mask, sig_size); +- sigdelsetmask(¤t->blocked, ~_BLOCKABLE); +- recalc_sigpending(); +- spin_unlock_irq(¤t->sighand->siglock); +- copy_sc_from_user(¤t->thread.regs, sc, +- &signal_frame_sc.common.arch); +- return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); +-} +- +-long sys_rt_sigreturn(struct pt_regs regs) +-{ +- unsigned long sp = PT_REGS_SP(¤t->thread.regs); +- struct ucontext __user *uc = sp_to_uc(sp); +- int sig_size = _NSIG_WORDS * sizeof(unsigned long); +- +- spin_lock_irq(¤t->sighand->siglock); +- copy_from_user(¤t->blocked, &uc->uc_sigmask, sig_size); +- sigdelsetmask(¤t->blocked, ~_BLOCKABLE); +- recalc_sigpending(); +- spin_unlock_irq(¤t->sighand->siglock); +- copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext, +- &signal_frame_si.common.arch); +- return(PT_REGS_SYSCALL_RET(¤t->thread.regs)); +-} +- + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +Index: linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/uaccess-skas.h 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/uaccess-skas.h 2005-04-05 12:40:36.037909576 +0800 +@@ -7,6 +7,51 @@ + #define __SKAS_UACCESS_H + + #include "asm/errno.h" ++#include "asm/fixmap.h" ++ ++#define access_ok_skas(type, addr, size) \ ++ ((segment_eq(get_fs(), KERNEL_DS)) || \ ++ (((unsigned long) (addr) < TASK_SIZE) && \ ++ ((unsigned long) (addr) + (size) <= TASK_SIZE)) || \ ++ ((type == VERIFY_READ ) && \ ++ ((unsigned long) (addr) >= FIXADDR_USER_START) && \ ++ ((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \ ++ ((unsigned long) (addr) + (size) >= (unsigned long)(addr)))) ++ ++static inline int verify_area_skas(int type, const void * addr, ++ unsigned long size) ++{ ++ return(access_ok_skas(type, addr, size) ? 0 : -EFAULT); ++} ++ ++extern int copy_from_user_skas(void *to, const void *from, int n); ++extern int copy_to_user_skas(void *to, const void *from, int n); ++extern int strncpy_from_user_skas(char *dst, const char *src, int count); ++extern int __clear_user_skas(void *mem, int len); ++extern int clear_user_skas(void *mem, int len); ++extern int strnlen_user_skas(const void *str, int len); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_UACCESS_H ++#define __SKAS_UACCESS_H ++ ++#include "asm/errno.h" + + #define access_ok_skas(type, addr, size) \ + ((segment_eq(get_fs(), KERNEL_DS)) || \ +Index: linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/mmu-skas.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/mmu-skas.h 2005-04-05 12:40:36.035909880 +0800 +@@ -22,3 +22,27 @@ + * c-file-style: "linux" + * End: + */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_MMU_H ++#define __SKAS_MMU_H ++ ++struct mmu_context_skas { ++ int mm_fd; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +Index: linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/include/mode-skas.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/include/mode-skas.h 2005-04-05 12:40:36.036909728 +0800 +@@ -14,6 +14,40 @@ + extern int have_fpx_regs; + + extern void user_time_init_skas(void); ++extern void sig_handler_common_skas(int sig, void *sc_ptr); ++extern void halt_skas(void); ++extern void reboot_skas(void); ++extern void kill_off_processes_skas(void); ++extern int is_skas_winch(int pid, int fd, void *data); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_SKAS_H__ ++#define __MODE_SKAS_H__ ++ ++#include ++ ++extern unsigned long exec_regs[]; ++extern unsigned long exec_fp_regs[]; ++extern unsigned long exec_fpx_regs[]; ++extern int have_fpx_regs; ++ ++extern void user_time_init_skas(void); + extern int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs, + void *from_ptr); + extern int copy_sc_to_user_skas(int pid, void *to_ptr, void *fp, +Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/sigcontext.c 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,114 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#include +-#include +-#include +-#include +-#include "sysdep/ptrace.h" +-#include "sysdep/ptrace_user.h" +-#include "kern_util.h" +-#include "user.h" +-#include "sigcontext.h" +-#include "mode.h" +- +-int copy_sc_from_user_skas(int pid, union uml_pt_regs *regs, void *from_ptr) +-{ +- struct sigcontext sc, *from = from_ptr; +- unsigned long fpregs[FP_FRAME_SIZE]; +- int err; +- +- err = copy_from_user_proc(&sc, from, sizeof(sc)); +- err |= copy_from_user_proc(fpregs, sc.fpstate, sizeof(fpregs)); +- if(err) +- return(err); +- +- regs->skas.regs[GS] = sc.gs; +- regs->skas.regs[FS] = sc.fs; +- regs->skas.regs[ES] = sc.es; +- regs->skas.regs[DS] = sc.ds; +- regs->skas.regs[EDI] = sc.edi; +- regs->skas.regs[ESI] = sc.esi; +- regs->skas.regs[EBP] = sc.ebp; +- regs->skas.regs[UESP] = sc.esp; +- regs->skas.regs[EBX] = sc.ebx; +- regs->skas.regs[EDX] = sc.edx; +- regs->skas.regs[ECX] = sc.ecx; +- regs->skas.regs[EAX] = sc.eax; +- regs->skas.regs[EIP] = sc.eip; +- regs->skas.regs[CS] = sc.cs; +- regs->skas.regs[EFL] = sc.eflags; +- regs->skas.regs[SS] = sc.ss; +- regs->skas.fault_addr = sc.cr2; +- regs->skas.fault_type = FAULT_WRITE(sc.err); +- regs->skas.trap_type = sc.trapno; +- +- err = ptrace(PTRACE_SETFPREGS, pid, 0, fpregs); +- if(err < 0){ +- printk("copy_sc_to_user - PTRACE_SETFPREGS failed, " +- "errno = %d\n", errno); +- return(1); +- } +- +- return(0); +-} +- +-int copy_sc_to_user_skas(int 
pid, void *to_ptr, void *fp, +- union uml_pt_regs *regs, unsigned long fault_addr, +- int fault_type) +-{ +- struct sigcontext sc, *to = to_ptr; +- struct _fpstate *to_fp; +- unsigned long fpregs[FP_FRAME_SIZE]; +- int err; +- +- sc.gs = regs->skas.regs[GS]; +- sc.fs = regs->skas.regs[FS]; +- sc.es = regs->skas.regs[ES]; +- sc.ds = regs->skas.regs[DS]; +- sc.edi = regs->skas.regs[EDI]; +- sc.esi = regs->skas.regs[ESI]; +- sc.ebp = regs->skas.regs[EBP]; +- sc.esp = regs->skas.regs[UESP]; +- sc.ebx = regs->skas.regs[EBX]; +- sc.edx = regs->skas.regs[EDX]; +- sc.ecx = regs->skas.regs[ECX]; +- sc.eax = regs->skas.regs[EAX]; +- sc.eip = regs->skas.regs[EIP]; +- sc.cs = regs->skas.regs[CS]; +- sc.eflags = regs->skas.regs[EFL]; +- sc.esp_at_signal = regs->skas.regs[UESP]; +- sc.ss = regs->skas.regs[SS]; +- sc.cr2 = fault_addr; +- sc.err = TO_SC_ERR(fault_type); +- sc.trapno = regs->skas.trap_type; +- +- err = ptrace(PTRACE_GETFPREGS, pid, 0, fpregs); +- if(err < 0){ +- printk("copy_sc_to_user - PTRACE_GETFPREGS failed, " +- "errno = %d\n", errno); +- return(1); +- } +- to_fp = (struct _fpstate *) +- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to))); +- sc.fpstate = to_fp; +- +- if(err) +- return(err); +- +- return(copy_to_user_proc(to, &sc, sizeof(sc)) || +- copy_to_user_proc(to_fp, fpregs, sizeof(fpregs))); +-} +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. +- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/sys-i386/Makefile 2004-12-25 05:35:27.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800 +@@ -1,12 +0,0 @@ +-# +-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +-# Licensed under the GPL +-# +- +-obj-y = sigcontext.o +- +-USER_OBJS = sigcontext.o +-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +- +-$(USER_OBJS) : %.o: %.c +- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/kernel/skas/process.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/process.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/process.c 2005-04-05 12:40:36.030910640 +0800 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -60,15 +61,10 @@ + /*To use the same value of using_sysemu as the caller, ask it that value (in local_using_sysemu)*/ + static void handle_trap(int pid, union uml_pt_regs *regs, int local_using_sysemu) + { +- int err, syscall_nr, status; +- +- syscall_nr = PT_SYSCALL_NR(regs->skas.regs); +- UPT_SYSCALL_NR(regs) = syscall_nr; +- if(syscall_nr < 0){ +- relay_signal(SIGTRAP, regs); +- return; +- } + ++ int err, status; ++ ++ UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->skas.regs); /* Mark this as a syscall */ + if (!local_using_sysemu) + { + err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); +@@ -82,7 +78,8 @@ + "errno = %d\n", errno); + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED)); +- if((err < 0) || !WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) ++ if((err < 0) || 
!WIFSTOPPED(status) ||
++	   (WSTOPSIG(status) != (SIGTRAP|SYSCALL_TRAP)))
+ 		panic("handle_trap - failed to wait at end of syscall, "
+ 		      "errno = %d, status = %d\n", errno, status);
+ 	}
+@@ -131,6 +128,10 @@
+ 		panic("start_userspace : expected SIGSTOP, got status = %d",
+ 		      status);
+ 
++	if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, (void *)PTRACE_O_TRACESYSGOOD) < 0)
++		panic("start_userspace : PTRACE_SETOPTIONS failed, errno=%d\n",
++		      errno);
++
+ 	if(munmap(stack, PAGE_SIZE) < 0)
+ 		panic("start_userspace : munmap failed, errno = %d\n", errno);
+ 
+@@ -160,15 +161,19 @@
+ 
+ 		regs->skas.is_user = 1;
+ 		save_registers(regs);
++		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+ 
+ 		if(WIFSTOPPED(status)){
+ 			switch(WSTOPSIG(status)){
+ 			case SIGSEGV:
+ 				handle_segv(pid);
+ 				break;
+-			case SIGTRAP:
++			case (SIGTRAP|SYSCALL_TRAP):
+ 				handle_trap(pid, regs, local_using_sysemu);
+ 				break;
++			case SIGTRAP:
++				relay_signal(SIGTRAP, regs);
++				break;
+ 			case SIGIO:
+ 			case SIGVTALRM:
+ 			case SIGILL:
+@@ -222,9 +227,10 @@
+ 	block_signals();
+ 	if(sigsetjmp(fork_buf, 1) == 0)
+ 		new_thread_proc(stack, handler);
+-	set_signals(flags);
+ 
+ 	remove_sigstack();
++
++	set_signals(flags);
+ }
+ 
+ void thread_wait(void *sw, void *fb)
+Index: linux-2.6.10/arch/um/kernel/skas/process_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/process_kern.c	2004-12-25 05:35:50.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/process_kern.c	2005-04-05 12:40:36.032910336 +0800
+@@ -19,7 +19,6 @@
+ #include "os.h"
+ #include "user_util.h"
+ #include "tlb.h"
+-#include "frame.h"
+ #include "kern.h"
+ #include "mode.h"
+ #include "proc_mm.h"
+@@ -183,7 +182,6 @@
+ int start_uml_skas(void)
+ {
+ 	start_userspace(0);
+-	capture_signal_stack();
+ 
+ 	init_new_thread_signals(1);
+ 	uml_idle_timer();
+Index: linux-2.6.10/arch/um/kernel/skas/syscall_kern.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/syscall_kern.c	2004-12-25 05:35:00.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/syscall_kern.c	2005-04-05 12:40:36.034910032 +0800
+@@ -6,6 +6,7 @@
+ #include "linux/sys.h"
+ #include "linux/ptrace.h"
+ #include "asm/errno.h"
++#include "linux/ptrace.h"
+ #include "asm/unistd.h"
+ #include "asm/ptrace.h"
+ #include "asm/current.h"
+Index: linux-2.6.10/arch/um/kernel/skas/trap_user.c
+===================================================================
+--- linux-2.6.10.orig/arch/um/kernel/skas/trap_user.c	2004-12-25 05:34:32.000000000 +0800
++++ linux-2.6.10/arch/um/kernel/skas/trap_user.c	2005-04-05 12:40:36.033910184 +0800
+@@ -21,6 +21,14 @@
+ 	int save_errno = errno;
+ 	int save_user;
+ 
++	/* This is done to allow SIGSEGV to be delivered inside a SEGV
++	 * handler. This can happen in copy_user, and if SEGV is disabled,
++	 * the process will die.
++ * XXX Figure out why this is better than SA_NODEFER ++ */ ++ if(sig == SIGSEGV) ++ change_sig(SIGSEGV, 1); ++ + r = &TASK_REGS(get_current())->skas; + save_user = r->is_user; + r->is_user = 0; +Index: linux-2.6.10/arch/um/kernel/skas/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/skas/Makefile 2004-12-25 05:34:30.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/skas/Makefile 2005-04-05 12:40:36.034910032 +0800 +@@ -4,8 +4,7 @@ + # + + obj-y := exec_kern.o mem.o mem_user.o mmu.o process.o process_kern.o \ +- syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o \ +- sys-$(SUBARCH)/ ++ syscall_kern.o syscall_user.o time.o tlb.o trap_user.o uaccess.o + + subdir-y := util + +Index: linux-2.6.10/arch/um/kernel/helper.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/helper.c 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/helper.c 2005-04-05 12:40:36.027911096 +0800 +@@ -49,14 +49,14 @@ + return(0); + } + +-/* XXX The alloc_stack here breaks if this is called in the tracing thread */ +- ++/* Returns either the pid of the child process we run or -E* on failure. ++ * XXX The alloc_stack here breaks if this is called in the tracing thread */ + int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv, + unsigned long *stack_out) + { + struct helper_data data; + unsigned long stack, sp; +- int pid, fds[2], err, n; ++ int pid, fds[2], ret, n; + + if((stack_out != NULL) && (*stack_out != 0)) + stack = *stack_out; +@@ -64,16 +64,16 @@ + if(stack == 0) + return(-ENOMEM); + +- err = os_pipe(fds, 1, 0); +- if(err < 0){ +- printk("run_helper : pipe failed, err = %d\n", -err); ++ ret = os_pipe(fds, 1, 0); ++ if(ret < 0){ ++ printk("run_helper : pipe failed, ret = %d\n", -ret); + goto out_free; + } + +- err = os_set_exec_close(fds[1], 1); +- if(err < 0){ +- printk("run_helper : setting FD_CLOEXEC failed, err = %d\n", +- -err); ++ ret = os_set_exec_close(fds[1], 1); ++ if(ret < 0){ ++ printk("run_helper : setting FD_CLOEXEC failed, ret = %d\n", ++ -ret); + goto out_close; + } + +@@ -85,34 +85,36 @@ + pid = clone(helper_child, (void *) sp, CLONE_VM | SIGCHLD, &data); + if(pid < 0){ + printk("run_helper : clone failed, errno = %d\n", errno); +- err = -errno; ++ ret = -errno; + goto out_close; + } + + os_close_file(fds[1]); +- n = os_read_file(fds[0], &err, sizeof(err)); ++ fds[1] = -1; ++ ++ /*Read the errno value from the child.*/ ++ n = os_read_file(fds[0], &ret, sizeof(ret)); + if(n < 0){ +- printk("run_helper : read on pipe failed, err = %d\n", -n); +- err = n; +- goto out_kill; ++ printk("run_helper : read on pipe failed, ret = %d\n", -n); ++ ret = n; ++ os_kill_process(pid, 1); + } + else if(n != 0){ + CATCH_EINTR(n = waitpid(pid, NULL, 0)); +- pid = -errno; ++ ret = -errno; ++ } else { ++ ret = pid; + } + +- if(stack_out == NULL) free_stack(stack, 0); +- else *stack_out = stack; +- return(pid); +- +- out_kill: +- os_kill_process(pid, 1); + out_close: ++ if (fds[1] != -1) ++ os_close_file(fds[1]); + os_close_file(fds[0]); +- os_close_file(fds[1]); + out_free: +- free_stack(stack, 0); +- return(err); ++ if(stack_out == NULL) ++ free_stack(stack, 0); ++ else *stack_out = stack; ++ return(ret); + } + + int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, +Index: linux-2.6.10/arch/um/kernel/time_kern.c +=================================================================== +--- 
linux-2.6.10.orig/arch/um/kernel/time_kern.c 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/time_kern.c 2005-04-05 12:40:36.027911096 +0800 +@@ -170,7 +170,7 @@ + void timer_handler(int sig, union uml_pt_regs *regs) + { + local_irq_disable(); +- update_process_times(user_context(UPT_SP(regs))); ++ update_process_times(CHOOSE_MODE(user_context(UPT_SP(regs)), (regs)->skas.is_user)); + local_irq_enable(); + if(current_thread->cpu == 0) + timer_irq(regs); +Index: linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/include/mode-tt.h 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/include/mode-tt.h 2005-04-05 12:40:36.042908816 +0800 +@@ -14,6 +14,41 @@ + + extern int tracer(int (*init_proc)(void *), void *sp); + extern void user_time_init_tt(void); ++extern void sig_handler_common_tt(int sig, void *sc); ++extern void syscall_handler_tt(int sig, union uml_pt_regs *regs); ++extern void reboot_tt(void); ++extern void halt_tt(void); ++extern int is_tracer_winch(int pid, int fd, void *data); ++extern void kill_off_processes_tt(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_TT_H__ ++#define __MODE_TT_H__ ++ ++#include "sysdep/ptrace.h" ++ ++enum { OP_NONE, OP_EXEC, OP_FORK, OP_TRACE_ON, OP_REBOOT, OP_HALT, OP_CB }; ++ ++extern int tracing_pid; ++ ++extern int tracer(int (*init_proc)(void *), void *sp); ++extern void user_time_init_tt(void); + extern int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data); + extern int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, + void *data); +Index: linux-2.6.10/arch/um/kernel/tt/include/tt.h +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/include/tt.h 2004-12-25 05:34:58.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/include/tt.h 2005-04-05 12:40:36.043908664 +0800 +@@ -26,7 +26,8 @@ + extern int is_tracing(void *task); + extern void syscall_handler(int sig, union uml_pt_regs *regs); + extern void exit_kernel(int pid, void *task); +-extern int do_syscall(void *task, int pid, int local_using_sysemu); ++extern void do_syscall(void *task, int pid, int local_using_sysemu); ++extern void do_sigtrap(void *task); + extern int is_valid_pid(int pid); + extern void remap_data(void *segment_start, void *segment_end, int w); + +Index: linux-2.6.10/arch/um/kernel/tt/exec_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/exec_user.c 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/exec_user.c 2005-04-05 12:40:36.039909272 +0800 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include "user_util.h" + #include "kern_util.h" +@@ -35,7 +36,10 @@ + tracer_panic("do_exec failed to get registers - errno = %d", + errno); + +- kill(old_pid, SIGKILL); ++ os_kill_ptraced_process(old_pid, 0); ++ ++ if (ptrace(PTRACE_OLDSETOPTIONS, new_pid, 0, (void 
*)PTRACE_O_TRACESYSGOOD) < 0) ++ tracer_panic("do_exec: PTRACE_SETOPTIONS failed, errno = %d", errno); + + if(ptrace_setregs(new_pid, regs) < 0) + tracer_panic("do_exec failed to start new proc - errno = %d", +Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/sigcontext.c 2004-12-25 05:35:39.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/sigcontext.c 2005-04-05 19:01:49.158500672 +0800 +@@ -1,60 +0,0 @@ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- * Licensed under the GPL +- */ +- +-#include +-#include +-#include "kern_util.h" +-#include "sysdep/frame.h" +- +-int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data) +-{ +- struct arch_frame_data *arch = data; +- struct sigcontext *to = to_ptr, *from = from_ptr; +- struct _fpstate *to_fp, *from_fp; +- unsigned long sigs; +- int err; +- +- to_fp = to->fpstate; +- from_fp = from->fpstate; +- sigs = to->oldmask; +- err = copy_from_user_proc(to, from, sizeof(*to)); +- to->oldmask = sigs; +- if(to_fp != NULL){ +- err |= copy_from_user_proc(&to->fpstate, &to_fp, +- sizeof(to->fpstate)); +- err |= copy_from_user_proc(to_fp, from_fp, arch->fpstate_size); +- } +- return(err); +-} +- +-int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, void *data) +-{ +- struct arch_frame_data *arch = data; +- struct sigcontext *to = to_ptr, *from = from_ptr; +- struct _fpstate *to_fp, *from_fp; +- int err; +- +- to_fp = (struct _fpstate *) +- (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to))); +- from_fp = from->fpstate; +- err = copy_to_user_proc(to, from, sizeof(*to)); +- if(from_fp != NULL){ +- err |= copy_to_user_proc(&to->fpstate, &to_fp, +- sizeof(to->fpstate)); +- err |= copy_to_user_proc(to_fp, from_fp, arch->fpstate_size); +- } +- return(err); +-} +- +-/* +- * Overrides for Emacs so that we follow Linus's tabbing style. +- * Emacs will notice this stuff at the end of the file and automatically +- * adjust the settings for this buffer only. This must remain at the end +- * of the file. 
+- * --------------------------------------------------------------------------- +- * Local variables: +- * c-file-style: "linux" +- * End: +- */ +Index: linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/sys-i386/Makefile 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/sys-i386/Makefile 2005-04-05 19:01:49.158500672 +0800 +@@ -1,12 +0,0 @@ +-# +-# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +-# Licensed under the GPL +-# +- +-obj-y = sigcontext.o +- +-USER_OBJS = sigcontext.o +-USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +- +-$(USER_OBJS) : %.o: %.c +- $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/kernel/tt/syscall_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/syscall_user.c 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/syscall_user.c 2005-04-05 12:40:36.037909576 +0800 +@@ -42,37 +42,31 @@ + syscall_trace(regs, 1); + record_syscall_end(index, result); + } +- +-int do_syscall(void *task, int pid, int local_using_sysemu) +-{ +- unsigned long proc_regs[FRAME_SIZE]; +- union uml_pt_regs *regs; +- int syscall; +- +- if(ptrace_getregs(pid, proc_regs) < 0) +- tracer_panic("Couldn't read registers"); +- syscall = PT_SYSCALL_NR(proc_regs); +- +- regs = TASK_REGS(task); +- UPT_SYSCALL_NR(regs) = syscall; +- +- if(syscall < 0) +- return(0); +- +- if((syscall != __NR_sigreturn) && +- ((unsigned long *) PT_IP(proc_regs) >= &_stext) && +- ((unsigned long *) PT_IP(proc_regs) <= &_etext)) +- tracer_panic("I'm tracing myself and I can't get out"); +- +- if(local_using_sysemu) +- return(1); +- +- if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, +- __NR_getpid) < 0) +- tracer_panic("do_syscall : Nullifying syscall failed, " +- "errno = %d", errno); +- return(1); +-} ++ ++ void do_sigtrap(void *task) ++ { ++ UPT_SYSCALL_NR(TASK_REGS(task)) = -1; ++ } ++ ++ void do_syscall(void *task, int pid, int local_using_sysemu) ++ { ++ unsigned long proc_regs[FRAME_SIZE]; ++ ++ if(ptrace_getregs(pid, proc_regs) < 0) ++ tracer_panic("Couldn't read registers"); ++ ++ UPT_SYSCALL_NR(TASK_REGS(task)) = PT_SYSCALL_NR(proc_regs); ++ ++ if(((unsigned long *) PT_IP(proc_regs) >= &_stext) && ++ ((unsigned long *) PT_IP(proc_regs) <= &_etext)) ++ tracer_panic("I'm tracing myself and I can't get out"); ++ ++ /* syscall number -1 in sysemu skips syscall restarting in host */ ++ if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, ++ local_using_sysemu ? -1 : __NR_getpid) < 0) ++ tracer_panic("do_syscall : Nullifying syscall failed, " ++ "errno = %d", errno); ++ } + + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
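
The do_syscall() rework above nullifies an intercepted syscall via PTRACE_POKEUSER (or passes -1 under sysemu to skip host-side restarting), and the surrounding hunks all key on WSTOPSIG(status) == (SIGTRAP|SYSCALL_TRAP). That 0x80 convention is the host kernel's PTRACE_O_TRACESYSGOOD option. The following standalone host-side sketch is illustrative only, not part of the patch; SYSCALL_TRAP mirrors the value the patch #defines in signal_user.h, and everything else is the standard ptrace(2) API:

/* Illustrative only -- not part of the FC3 patch. Shows the host-side
 * PTRACE_O_TRACESYSGOOD convention the hunks above depend on: once the
 * option is set, a syscall stop reports WSTOPSIG == (SIGTRAP | 0x80),
 * so a tracer can tell syscall traps from ordinary SIGTRAPs.
 */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

#define SYSCALL_TRAP 0x80	/* same value the patch adds to signal_user.h */

int main(void)
{
	int status;
	pid_t pid = fork();

	if(pid == 0){
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* wait for the tracer to set options */
		getpid();		/* one traced syscall */
		_exit(0);
	}

	waitpid(pid, &status, 0);	/* the SIGSTOP raised above */
	ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *) PTRACE_O_TRACESYSGOOD);

	while(1){
		if(ptrace(PTRACE_SYSCALL, pid, NULL, NULL) < 0)
			break;
		if((waitpid(pid, &status, 0) < 0) || WIFEXITED(status))
			break;
		if(WIFSTOPPED(status) &&
		   (WSTOPSIG(status) == (SIGTRAP | SYSCALL_TRAP)))
			printf("syscall stop\n");	/* entry or exit */
		else
			printf("other stop, signal %d\n", WSTOPSIG(status));
	}
	return 0;
}

On old hosts without PTRACE_O_TRACESYSGOOD the PTRACE_SETOPTIONS call simply fails and a tracer sees plain SIGTRAP stops, which is why the patch probes the option at boot and keeps the PTRACE_OLDSETOPTIONS fallback described in ptrace_user.h.
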
+Index: linux-2.6.10/arch/um/kernel/tt/tracer.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/tracer.c 2005-04-01 01:16:47.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/tracer.c 2005-04-05 12:40:36.041908968 +0800 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include "user.h" +@@ -25,7 +26,6 @@ + #include "mem_user.h" + #include "process.h" + #include "kern_util.h" +-#include "frame.h" + #include "chan_user.h" + #include "ptrace_user.h" + #include "mode.h" +@@ -72,6 +72,8 @@ + (ptrace(PTRACE_CONT, pid, 0, 0) < 0)) + tracer_panic("OP_FORK failed to attach pid"); + wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL); ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) ++ tracer_panic("OP_FORK: PTRACE_SETOPTIONS failed, errno = %d", errno); + if(ptrace(PTRACE_CONT, pid, 0, 0) < 0) + tracer_panic("OP_FORK failed to continue process"); + } +@@ -141,7 +143,7 @@ + * any more, the trace of those will land here. So, we need to just + * PTRACE_SYSCALL it. + */ +- case SIGTRAP: ++ case (SIGTRAP|SYSCALL_TRAP): + if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) + tracer_panic("sleeping_process_signal : Failed to " + "PTRACE_SYSCALL pid %d, errno = %d\n", +@@ -184,9 +186,8 @@ + unsigned long eip = 0; + int status, pid = 0, sig = 0, cont_type, tracing = 0, op = 0; + int last_index, proc_id = 0, n, err, old_tracing = 0, strace = 0; +- int pt_syscall_parm, local_using_sysemu; ++ int pt_syscall_parm, local_using_sysemu = 0; + +- capture_signal_stack(); + signal(SIGPIPE, SIG_IGN); + setup_tracer_winch(); + tracing_pid = os_getpid(); +@@ -198,6 +199,10 @@ + printf("waitpid on idle thread failed, errno = %d\n", errno); + exit(1); + } ++ if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) { ++ printf("Failed to PTRACE_SETOPTIONS for idle thread, errno = %d\n", errno); ++ exit(1); ++ } + if((ptrace(PTRACE_CONT, pid, 0, 0) < 0)){ + printf("Failed to continue idle thread, errno = %d\n", errno); + exit(1); +@@ -315,7 +320,8 @@ + task = cpu_tasks[proc_id].task; + tracing = is_tracing(task); + old_tracing = tracing; +- ++ if ( tracing ) /* Assume: no syscall, when coming from user */ ++ do_sigtrap(task); + local_using_sysemu = get_using_sysemu(); + pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL; + +@@ -324,6 +330,15 @@ + sig = 0; + op = do_proc_op(task, proc_id); + switch(op){ ++ /* ++ * This is called when entering user mode; after ++ * this, we start intercepting syscalls. ++ * ++ * In fact, a process is started in kernel mode, ++ * so with is_tracing() == 0 (and that is reset ++ * when executing syscalls, since UML kernel has ++ * the right to do syscalls); ++ */ + case OP_TRACE_ON: + arch_leave_kernel(task, pid); + tracing = 1; +@@ -332,7 +347,13 @@ + case OP_HALT: + unmap_physmem(); + kmalloc_ok = 0; +- ptrace(PTRACE_KILL, pid, 0, 0); ++ os_kill_ptraced_process(pid, 0); ++ /* Now let's reap remaining zombies */ ++ errno = 0; ++ do { ++ waitpid(-1, &status, ++ WUNTRACED); ++ } while (errno != ECHILD); + return(op == OP_REBOOT); + case OP_NONE: + printf("Detaching pid %d\n", pid); +@@ -346,14 +367,26 @@ + */ + pid = cpu_tasks[proc_id].pid; + break; ++ case (SIGTRAP|SYSCALL_TRAP): ++ if(!tracing && (debugger_pid != -1)){ ++ child_signal(pid, W_STOPCODE(SIGTRAP)); ++ continue; ++ } ++ tracing = 0; ++ /* local_using_sysemu has been already set ++ * below, since if we are here, is_tracing() on ++ * the traced task was 1, i.e. 
the process had ++ * already run through one iteration of the ++ * loop which executed a OP_TRACE_ON request.*/ ++ do_syscall(task, pid, local_using_sysemu); ++ sig = SIGUSR2; ++ break; + case SIGTRAP: + if(!tracing && (debugger_pid != -1)){ + child_signal(pid, status); + continue; + } + tracing = 0; +- if(do_syscall(task, pid, local_using_sysemu)) +- sig = SIGUSR2; + break; + case SIGPROF: + if(tracing) sig = 0; +@@ -389,6 +422,9 @@ + continue; + } + ++ local_using_sysemu = get_using_sysemu(); ++ pt_syscall_parm = local_using_sysemu ? PTRACE_SYSEMU : PTRACE_SYSCALL; ++ + if(tracing){ + if(singlestepping(task)) + cont_type = PTRACE_SINGLESTEP; +Index: linux-2.6.10/arch/um/kernel/tt/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/tt/Makefile 2004-12-25 05:34:57.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/tt/Makefile 2005-04-05 12:40:36.041908968 +0800 +@@ -8,7 +8,7 @@ + + obj-y = exec_kern.o exec_user.o gdb.o ksyms.o mem.o mem_user.o process_kern.o \ + syscall_kern.o syscall_user.o time.o tlb.o tracer.o trap_user.o \ +- uaccess.o uaccess_user.o sys-$(SUBARCH)/ ++ uaccess.o uaccess_user.o + + obj-$(CONFIG_PT_PROXY) += gdb_kern.o ptproxy/ + +Index: linux-2.6.10/arch/um/kernel/trap_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/trap_user.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/trap_user.c 2005-04-05 12:40:36.047908056 +0800 +@@ -18,7 +18,6 @@ + #include "sigcontext.h" + #include "sysdep/sigcontext.h" + #include "irq_user.h" +-#include "frame_user.h" + #include "signal_user.h" + #include "time_user.h" + #include "task.h" +Index: linux-2.6.10/arch/um/kernel/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/Makefile 2004-12-25 05:35:01.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/Makefile 2005-04-05 12:40:36.051907448 +0800 +@@ -6,7 +6,7 @@ + extra-y := vmlinux.lds + clean-files := vmlinux.lds.S + +-obj-y = checksum.o config.o exec_kern.o exitcode.o frame_kern.o frame.o \ ++obj-y = checksum.o config.o exec_kern.o exitcode.o \ + helper.o init_task.o irq.o irq_user.o ksyms.o main.o mem.o mem_user.o \ + physmem.o process.o process_kern.o ptrace.o reboot.o resource.o \ + sigio_user.o sigio_kern.o signal_kern.o signal_user.o smp.o \ +Index: linux-2.6.10/arch/um/kernel/mem.c +=================================================================== +--- linux-2.6.10.orig/arch/um/kernel/mem.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/arch/um/kernel/mem.c 2005-04-05 12:40:36.029910792 +0800 +@@ -175,6 +175,30 @@ + } + #endif /* CONFIG_HIGHMEM */ + ++static void __init fixaddr_user_init( void) ++{ ++ long size = FIXADDR_USER_END - FIXADDR_USER_START; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long paddr, vaddr = FIXADDR_USER_START; ++ ++ if ( ! 
size ) ++ return; ++ ++ fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); ++ paddr = (unsigned long)alloc_bootmem_low_pages( size); ++ memcpy( (void *)paddr, (void *)FIXADDR_USER_START, size); ++ paddr = __pa(paddr); ++ for ( ; size > 0; size-=PAGE_SIZE, vaddr+=PAGE_SIZE, paddr+=PAGE_SIZE) { ++ pgd = swapper_pg_dir + pgd_index(vaddr); ++ pmd = pmd_offset(pgd, vaddr); ++ pte = pte_offset_kernel(pmd, vaddr); ++ /*pte_set_val( (*pte), paddr, PAGE_READONLY);*/ ++ pte_val(*pte) = paddr | pgprot_val(PAGE_READONLY); ++ } ++} ++ + void paging_init(void) + { + unsigned long zones_size[MAX_NR_ZONES], vaddr; +@@ -195,6 +219,8 @@ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); + ++ fixaddr_user_init(); ++ + #ifdef CONFIG_HIGHMEM + init_highmem(); + #endif +Index: linux-2.6.10/arch/um/os-Linux/user_syms.c +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/user_syms.c 2004-12-25 05:35:23.000000000 +0800 ++++ linux-2.6.10/arch/um/os-Linux/user_syms.c 2005-04-05 12:40:36.019912312 +0800 +@@ -26,6 +26,9 @@ + + EXPORT_SYMBOL(strstr); + ++EXPORT_SYMBOL(vsyscall_ehdr); ++EXPORT_SYMBOL(vsyscall_end); ++ + /* Here, instead, I can provide a fake prototype. Yes, someone cares: genksyms. + * However, the modules will use the CRC defined *here*, no matter if it is + * good; so the versions of these symbols will always match +Index: linux-2.6.10/arch/um/os-Linux/elf_aux.c +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/elf_aux.c 2005-04-05 19:01:49.158500672 +0800 ++++ linux-2.6.10/arch/um/os-Linux/elf_aux.c 2005-04-05 12:40:36.018912464 +0800 +@@ -0,0 +1,67 @@ ++/* ++ * arch/um/kernel/elf_aux.c ++ * ++ * Scan the Elf auxiliary vector provided by the host to extract ++ * information about vsyscall-page, etc. ++ * ++ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH ++ * Author: Bodo Stroesser (bodo.stroesser@fujitsu-siemens.com) ++ */ ++#include ++#include ++#include "init.h" ++#include "elf_user.h" ++ ++#if ELF_CLASS == ELFCLASS32 ++typedef Elf32_auxv_t elf_auxv_t; ++#else ++typedef Elf64_auxv_t elf_auxv_t; ++#endif ++ ++char * elf_aux_platform; ++long elf_aux_hwcap; ++ ++unsigned long vsyscall_ehdr; ++unsigned long vsyscall_end; ++ ++unsigned long __kernel_vsyscall; ++ ++ ++__init void scan_elf_aux( char **envp) ++{ ++ long page_size = 0; ++ elf_auxv_t * auxv; ++ ++ while ( *envp++ != NULL) ; ++ ++ for ( auxv = (elf_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++) { ++ switch ( auxv->a_type ) { ++ case AT_SYSINFO: ++ __kernel_vsyscall = auxv->a_un.a_val; ++ break; ++ case AT_SYSINFO_EHDR: ++ vsyscall_ehdr = auxv->a_un.a_val; ++ break; ++ case AT_HWCAP: ++ elf_aux_hwcap = auxv->a_un.a_val; ++ break; ++ case AT_PLATFORM: ++ elf_aux_platform = auxv->a_un.a_ptr; ++ break; ++ case AT_PAGESZ: ++ page_size = auxv->a_un.a_val; ++ break; ++ } ++ } ++ if ( ! __kernel_vsyscall || ! vsyscall_ehdr || ++ ! elf_aux_hwcap || ! elf_aux_platform || ++ ! 
page_size || (vsyscall_ehdr % page_size) ) { ++ __kernel_vsyscall = 0; ++ vsyscall_ehdr = 0; ++ elf_aux_hwcap = 0; ++ elf_aux_platform = "i586"; ++ } ++ else { ++ vsyscall_end = vsyscall_ehdr + page_size; ++ } ++} +Index: linux-2.6.10/arch/um/os-Linux/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/os-Linux/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/os-Linux/Makefile 2005-04-05 12:40:36.019912312 +0800 +@@ -3,9 +3,9 @@ + # Licensed under the GPL + # + +-obj-y = file.o process.o time.o tty.o user_syms.o drivers/ ++obj-y = elf_aux.o file.o process.o time.o tty.o user_syms.o drivers/ + +-USER_OBJS := $(foreach file,file.o process.o time.o tty.o,$(obj)/$(file)) ++USER_OBJS := $(foreach file,elf_aux.o file.o process.o time.o tty.o,$(obj)/$(file)) + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +Index: linux-2.6.10/arch/um/drivers/net_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/net_kern.c 2004-12-25 05:34:44.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/net_kern.c 2005-04-05 12:40:36.016912768 +0800 +@@ -126,10 +126,6 @@ + lp->tl.data = (unsigned long) &lp->user; + netif_start_queue(dev); + +- spin_lock(&opened_lock); +- list_add(&lp->list, &opened); +- spin_unlock(&opened_lock); +- + /* clear buffer - it can happen that the host side of the interface + * is full when we get here. In this case, new data is never queued, + * SIGIOs never arrive, and the net never works. +@@ -152,9 +148,6 @@ + free_irq(dev->irq, dev); + if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); + lp->fd = -1; +- spin_lock(&opened_lock); +- list_del(&lp->list); +- spin_unlock(&opened_lock); + + spin_unlock(&lp->lock); + return 0; +@@ -397,6 +390,11 @@ + + if (device->have_mac) + set_ether_mac(dev, device->mac); ++ ++ spin_lock(&opened_lock); ++ list_add(&lp->list, &opened); ++ spin_unlock(&opened_lock); ++ + return(0); + } + +@@ -705,7 +703,7 @@ + static void close_devices(void) + { + struct list_head *ele; +- struct uml_net_private *lp; ++ struct uml_net_private *lp; + + list_for_each(ele, &opened){ + lp = list_entry(ele, struct uml_net_private, list); +Index: linux-2.6.10/arch/um/drivers/mconsole_kern.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/mconsole_kern.c 2004-12-25 05:33:49.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/mconsole_kern.c 2005-04-05 12:40:36.015912920 +0800 +@@ -204,6 +204,68 @@ + } + #endif + ++/* This is a more convoluted version of mconsole_proc, which has some stability ++ * problems; however, we need it fixed, because it is expected that UML users ++ * mount HPPFS instead of procfs on /proc. 
And we want mconsole_proc to still ++ * show the real procfs content, not the ones from hppfs.*/ ++#if 0 ++void mconsole_proc(struct mc_request *req) ++{ ++ char path[64]; ++ char *buf; ++ int len; ++ int fd; ++ int first_chunk = 1; ++ char *ptr = req->request.data; ++ ++ ptr += strlen("proc"); ++ while(isspace(*ptr)) ptr++; ++ snprintf(path, sizeof(path), "/proc/%s", ptr); ++ ++ fd = sys_open(path, 0, 0); ++ if (fd < 0) { ++ mconsole_reply(req, "Failed to open file", 1, 0); ++ printk("open %s: %d\n",path,fd); ++ goto out; ++ } ++ ++ buf = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if(buf == NULL){ ++ mconsole_reply(req, "Failed to allocate buffer", 1, 0); ++ goto out_close; ++ } ++ ++ for (;;) { ++ len = sys_read(fd, buf, PAGE_SIZE-1); ++ if (len < 0) { ++ mconsole_reply(req, "Read of file failed", 1, 0); ++ goto out_free; ++ } ++ /*Begin the file content on his own line.*/ ++ if (first_chunk) { ++ mconsole_reply(req, "\n", 0, 1); ++ first_chunk = 0; ++ } ++ if (len == PAGE_SIZE-1) { ++ buf[len] = '\0'; ++ mconsole_reply(req, buf, 0, 1); ++ } else { ++ buf[len] = '\0'; ++ mconsole_reply(req, buf, 0, 0); ++ break; ++ } ++ } ++ /*END*/ ++ ++ out_free: ++ kfree(buf); ++ out_close: ++ sys_close(fd); ++ out: ++ /* nothing */; ++} ++#endif ++ + void mconsole_proc(struct mc_request *req) + { + char path[64]; +Index: linux-2.6.10/arch/um/drivers/net_user.c +=================================================================== +--- linux-2.6.10.orig/arch/um/drivers/net_user.c 2004-12-25 05:34:26.000000000 +0800 ++++ linux-2.6.10/arch/um/drivers/net_user.c 2005-04-05 12:40:36.017912616 +0800 +@@ -173,10 +173,12 @@ + pe_data.stdout = fds[1]; + pid = run_helper(change_pre_exec, &pe_data, argv, NULL); + +- os_close_file(fds[1]); + read_output(fds[0], output, output_len); ++ os_close_file(fds[0]); ++ os_close_file(fds[1]); + +- CATCH_EINTR(err = waitpid(pid, NULL, 0)); ++ if (pid > 0) ++ CATCH_EINTR(err = waitpid(pid, NULL, 0)); + return(pid); + } + +Index: linux-2.6.10/arch/um/Kconfig +=================================================================== +--- linux-2.6.10.orig/arch/um/Kconfig 2004-12-25 05:34:45.000000000 +0800 ++++ linux-2.6.10/arch/um/Kconfig 2005-04-05 12:40:36.053907144 +0800 +@@ -139,6 +139,25 @@ + + It is safe to say 'Y' here. + ++config MAGIC_SYSRQ ++ bool "Magic SysRq key" ++ depends on MCONSOLE ++ ---help--- ++ If you say Y here, you will have some control over the system even ++ if the system crashes for example during kernel debugging (e.g., you ++ will be able to flush the buffer cache to disk, reboot the system ++ immediately or dump some status information). A key for each of the ++ possible requests is provided. ++ ++ This is the feature normally accomplished by pressing a key ++ while holding SysRq (Alt+PrintScreen). ++ ++ On UML, this is accomplished by sending a "sysrq" command with ++ mconsole, followed by the letter for the requested command. ++ ++ The keys are documented in . Don't say Y ++ unless you really know what this hack does. ++ + config HOST_2G_2G + bool "2G/2G host address space split" + default n +@@ -153,28 +172,28 @@ + So, if you do not know what to do here, say 'N'. + + config SMP +- bool "Symmetric multi-processing support (EXPERIMENTAL)" +- default n +- depends on MODE_TT && EXPERIMENTAL +- help +- This option enables UML SMP support. +- It is NOT related to having a real SMP box. Not directly, at least. ++ bool "Symmetric multi-processing support (EXPERIMENTAL)" ++ default n ++ depends on MODE_TT && EXPERIMENTAL ++ help ++ This option enables UML SMP support. 
++ It is NOT related to having a real SMP box. Not directly, at least. ++ ++ UML implements virtual SMP by allowing as many processes to run ++ simultaneously on the host as there are virtual processors configured. ++ ++ Obviously, if the host is a uniprocessor, those processes will ++ timeshare, but, inside UML, will appear to be running simultaneously. ++ If the host is a multiprocessor, then UML processes may run ++ simultaneously, depending on the host scheduler. ++ ++ This, however, is supported only in TT mode. So, if you use the SKAS ++ patch on your host, switching to TT mode and enabling SMP usually gives ++ you worse performances. ++ Also, since the support for SMP has been under-developed, there could ++ be some bugs being exposed by enabling SMP. + +- UML implements virtual SMP by allowing as many processes to run +- simultaneously on the host as there are virtual processors configured. +- +- Obviously, if the host is a uniprocessor, those processes will +- timeshare, but, inside UML, will appear to be running simultaneously. +- If the host is a multiprocessor, then UML processes may run +- simultaneously, depending on the host scheduler. +- +- This, however, is supported only in TT mode. So, if you use the SKAS +- patch on your host, switching to TT mode and enabling SMP usually gives +- you worse performances. +- Also, since the support for SMP has been under-developed, there could +- be some bugs being exposed by enabling SMP. +- +- If you don't know what to do, say N. ++ If you don't know what to do, say N. + + config NR_CPUS + int "Maximum number of CPUs (2-32)" +@@ -282,4 +301,8 @@ + bool + default n + ++config INPUT ++ bool ++ default n ++ + source "arch/um/Kconfig.debug" +Index: linux-2.6.10/arch/um/Makefile +=================================================================== +--- linux-2.6.10.orig/arch/um/Makefile 2004-12-25 05:35:00.000000000 +0800 ++++ linux-2.6.10/arch/um/Makefile 2005-04-05 12:40:53.158306880 +0800 +@@ -77,6 +77,8 @@ + echo ' find in the kernel root.' + endef + ++.PHONY: linux ++ + prepare: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) \ + $(ARCH_DIR)/kernel/vmlinux.lds.S + +Index: linux-2.6.10/fs/hostfs/hostfs.h +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs.h 2004-12-25 05:35:24.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs.h 2005-04-05 12:40:36.068904864 +0800 +@@ -16,9 +16,30 @@ + #define HOSTFS_ATTR_CTIME 64 + #define HOSTFS_ATTR_ATIME_SET 128 + #define HOSTFS_ATTR_MTIME_SET 256 ++ ++/* These two are unused by hostfs. */ + #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ + #define HOSTFS_ATTR_ATTR_FLAG 1024 + ++/* If you are very careful, you'll notice that these two are missing: ++ * ++ * #define ATTR_KILL_SUID 2048 ++ * #define ATTR_KILL_SGID 4096 ++ * ++ * and this is because they were added in 2.5 development in this patch: ++ * ++ * http://linux.bkbits.net:8080/linux-2.5/ ++ * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html ++ * |src/.|src/include|src/include/linux|related/include/linux/fs.h ++ * ++ * Actually, they are not needed by most ->setattr() methods - they are set by ++ * callers of notify_change() to notify that the setuid/setgid bits must be ++ * dropped. ++ * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE ++ * is on, and remove the appropriate bits from attr->ia_mode (attr is a ++ * "struct iattr *"). 
-BlaisorBlade ++ */ ++ + struct hostfs_iattr { + unsigned int ia_valid; + mode_t ia_mode; +Index: linux-2.6.10/fs/hostfs/hostfs_kern.c +=================================================================== +--- linux-2.6.10.orig/fs/hostfs/hostfs_kern.c 2004-12-25 05:34:01.000000000 +0800 ++++ linux-2.6.10/fs/hostfs/hostfs_kern.c 2005-04-05 12:40:36.069904712 +0800 +@@ -393,6 +393,7 @@ + static struct file_operations hostfs_file_fops = { + .llseek = generic_file_llseek, + .read = generic_file_read, ++ .sendfile = generic_file_sendfile, + .write = generic_file_write, + .mmap = generic_file_mmap, + .open = hostfs_file_open, +@@ -818,6 +819,10 @@ + char *name; + int err; + ++ err = inode_change_ok(dentry->d_inode, attr); ++ if (err) ++ return err; ++ + if(append) + attr->ia_valid &= ~ATTR_SIZE; + diff --git a/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..16ae126 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch @@ -0,0 +1,113 @@ +Introduce lock-free versions of d_rehash and d_move. + + fs/dcache.c | 22 ++++++++++++++++++---- + include/linux/dcache.h | 2 ++ + 2 files changed, 20 insertions(+), 4 deletions(-) + +Index: linux-2.6.10/fs/dcache.c +=================================================================== +--- linux-2.6.10.orig/fs/dcache.c 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/fs/dcache.c 2005-03-31 19:16:50.807244880 +0800 +@@ -1116,29 +1116,23 @@ + spin_unlock(&dcache_lock); + } + +-static void __d_rehash(struct dentry * entry, struct hlist_head *list) ++void __d_rehash(struct dentry * entry) + { +- +- entry->d_flags &= ~DCACHE_UNHASHED; +- hlist_add_head_rcu(&entry->d_hash, list); ++ struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); ++ ++ spin_lock(&entry->d_lock); ++ entry->d_flags &= ~DCACHE_UNHASHED; ++ hlist_add_head_rcu(&entry->d_hash, list); ++ spin_unlock(&entry->d_lock); + } +- +-/** +- * d_rehash - add an entry back to the hash +- * @entry: dentry to add to the hash +- * +- * Adds a dentry to the hash according to its name. +- */ + ++EXPORT_SYMBOL(__d_rehash); ++ + void d_rehash(struct dentry * entry) + { +- struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); +- +- spin_lock(&dcache_lock); +- spin_lock(&entry->d_lock); +- __d_rehash(entry, list); +- spin_unlock(&entry->d_lock); +- spin_unlock(&dcache_lock); ++ spin_lock(&dcache_lock); ++ __d_rehash(entry); ++ spin_unlock(&dcache_lock); + } + + #define do_switch(x,y) do { \ +@@ -1213,14 +1207,13 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + struct hlist_head *list; + + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? 
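The point of the split above is that __d_rehash() and __d_move() no longer take dcache_lock themselves; the rebuilt d_rehash()/d_move() are now just that lock wrapped around the __-prefixed cores. A minimal sketch of the calling convention this exports (the batching motivation is an assumption about the Lustre caller, not something the patch shows):

	/* Caller is already inside a dcache_lock critical section,
	 * e.g. to combine several dcache updates into one atomic step: */
	spin_lock(&dcache_lock);
	__d_move(dentry, target);	/* takes only the per-dentry locks */
	__d_rehash(other);		/* likewise; no dcache_lock inside */
	spin_unlock(&dcache_lock);
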
+@@ -1241,7 +1234,8 @@ + + already_unhashed: + list = d_hash(target->d_parent, target->d_name.hash); +- __d_rehash(dentry, list); ++ dentry->d_flags &= ~DCACHE_UNHASHED; ++ hlist_add_head_rcu(&dentry->d_hash, list); + + /* Unhash the target: dput() will then get rid of it */ + __d_drop(target); +@@ -1280,6 +1274,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6.10/include/linux/dcache.h +=================================================================== +--- linux-2.6.10.orig/include/linux/dcache.h 2005-03-31 15:35:26.000000000 +0800 ++++ linux-2.6.10/include/linux/dcache.h 2005-03-31 19:15:49.684536944 +0800 +@@ -228,6 +228,7 @@ + * This adds the entry to the hash queues. + */ + extern void d_rehash(struct dentry *); ++extern void __d_rehash(struct dentry *); + + /** + * d_add - add dentry to hash queues +@@ -246,6 +247,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch new file mode 100644 index 0000000..85cb332 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch @@ -0,0 +1,34 @@ +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:03:37.000000000 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800 +@@ -365,7 +365,7 @@ + } + } + +-static int do_umount(struct vfsmount *mnt, int flags) ++int do_umount(struct vfsmount *mnt, int flags) + { + struct super_block * sb = mnt->mnt_sb; + int retval; +@@ -458,6 +458,8 @@ + return retval; + } + ++EXPORT_SYMBOL(do_umount); ++ + /* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. 
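Exporting do_umount() lets module code drop a mount it already holds a pointer to, without going back through sys_umount()'s path lookup. A hypothetical caller on the Lustre GNS side (the name is illustrative and reference counting is elided; the flags argument takes the same MNT_* values sys_umount() accepts, 0 for a plain unmount):

	static int gns_umount_mnt(struct vfsmount *mnt)
	{
		int err = do_umount(mnt, 0);

		if (err)
			printk(KERN_WARNING "GNS: do_umount: %d\n", err);
		return err;
	}
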
+Index: linux-2.6.10/include/linux/mount.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mount.h 2005-03-31 17:15:40.000000000 +0800 ++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:59:41.914943472 +0800 +@@ -70,6 +70,7 @@ + extern struct vfsmount *do_kern_mount(const char *fstype, int flags, + const char *name, void *data); + ++extern int do_umount(struct vfsmount *mnt, int flags); + struct nameidata; + + extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, diff --git a/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..dfcf347 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch @@ -0,0 +1,557 @@ +Index: linux-2.6.10/fs/open.c +=================================================================== +--- linux-2.6.10.orig/fs/open.c 2005-03-31 15:35:27.683586616 +0800 ++++ linux-2.6.10/fs/open.c 2005-03-31 17:13:48.440535208 +0800 +@@ -217,11 +217,12 @@ + struct inode * inode; + int error; + ++ intent_init(&nd.intent.open, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -476,6 +477,7 @@ + kernel_cap_t old_cap; + int res; + ++ intent_init(&nd.intent.open, IT_GETATTR); + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + +@@ -499,7 +501,7 @@ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ +@@ -521,7 +523,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -574,7 +577,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -759,6 +763,7 @@ + { + int namei_flags, error; + struct nameidata nd; ++ intent_init(&nd.intent.open, IT_OPEN); + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -768,14 +773,14 @@ + + error = open_namei(filename, namei_flags, mode, &nd); + if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open); + + return ERR_PTR(error); + } + + EXPORT_SYMBOL(filp_open); + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it) + { + struct file * f; + struct inode *inode; +@@ -787,6 +792,7 @@ + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -805,6 +811,7 @@ + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; 
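/* Sketch, not part of the patch: with f->f_it assigned above, an
 * intent-aware ->open() can see why the file is being opened, e.g.
 *
 *	if (f->f_it && (f->f_it->op & IT_OPEN))
 *		reuse_state_prepared_during_lookup(f->f_it);
 *
 * (the helper is assumed), and the intent_release() added below then
 * runs it->op_release, freeing whatever the filesystem stashed in
 * it->d.fs_data while walking the path. */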
++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -830,11 +837,20 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ struct open_intent it; ++ intent_init(&it, IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ + EXPORT_SYMBOL(dentry_open); + + /* +Index: linux-2.6.10/fs/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/xattr.c 2004-12-25 05:34:32.000000000 +0800 ++++ linux-2.6.10/fs/xattr.c 2005-03-31 17:03:37.148465728 +0800 +@@ -164,7 +164,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -179,7 +180,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -245,7 +247,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); +@@ -259,7 +262,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 15:35:26.294797744 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:12:26.403006808 +0800 +@@ -288,8 +288,19 @@ + return 0; + } + ++void intent_release(struct open_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->magic != INTENT_MAGIC) ++ return; ++ if (it->op_release) ++ it->op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent.open); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -448,6 +459,7 @@ + static inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct open_intent it = nd->intent.open; + char *name; + if (IS_ERR(link)) + goto fail; +@@ -458,6 +470,10 @@ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_release(&nd->intent.open); ++ intent_init(&nd->intent.open, it.op); ++ nd->intent.open.flags = it.flags; ++ nd->intent.open.create_mode = it.create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -876,8 +892,14 @@ + return err; + } + ++int fastcall path_walk_it(const char * name, struct nameidata *nd) ++{ ++ current->total_link_count = 0; ++ return link_path_walk(name, nd); ++} + int fastcall path_walk(const char * name, struct nameidata *nd) + { ++ intent_init(&nd->intent.open, IT_LOOKUP); + current->total_link_count = 0; + return link_path_walk(name, nd); + } +@@ -886,7 +908,7 @@ + /* returns 1 if everything is done */ + static int __emul_lookup_dentry(const char *name, struct nameidata *nd) + { +- if (path_walk(name, nd)) ++ if (path_walk_it(name, nd)) + return 0; /* something went wrong... 
*/ + + if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) { +@@ -947,7 +969,18 @@ + } + } + +-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++static inline int it_mode_from_lookup_flags(int flags) ++{ ++ int mode = IT_LOOKUP; ++ ++ if (flags & LOOKUP_OPEN) ++ mode = IT_OPEN; ++ if (flags & LOOKUP_CREATE) ++ mode |= IT_CREAT; ++ return mode; ++} ++ ++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd) + { + int retval; + +@@ -982,6 +1015,12 @@ + return retval; + } + ++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return path_lookup_it(name, flags, nd); ++} ++ + /* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. +@@ -1032,7 +1071,7 @@ + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1052,11 +1091,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1068,18 +1112,24 @@ + * that namei follows links, while lnamei does not. + * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { +- err = path_lookup(tmp, flags, nd); ++ err = path_lookup_it(tmp, flags, nd); + putname(tmp); + } + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1370,7 +1420,7 @@ + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd); + if (error) + return error; + goto ok; +@@ -1379,7 +1429,8 @@ + /* + * Create - we need to know the parent. 
+ */ +- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); ++ nd->intent.open.op |= IT_CREAT; ++ error = path_lookup_it(pathname, LOOKUP_PARENT, nd); + if (error) + return error; + +@@ -2344,6 +2395,7 @@ + } + } + ++ + int page_symlink(struct inode *inode, const char *symname, int len) + { + struct address_space *mapping = inode->i_mapping; +@@ -2405,8 +2457,10 @@ + EXPORT_SYMBOL(page_symlink); + EXPORT_SYMBOL(page_symlink_inode_operations); + EXPORT_SYMBOL(path_lookup); ++EXPORT_SYMBOL(path_lookup_it); + EXPORT_SYMBOL(path_release); + EXPORT_SYMBOL(path_walk); ++EXPORT_SYMBOL(path_walk_it); + EXPORT_SYMBOL(permission); + EXPORT_SYMBOL(unlock_rename); + EXPORT_SYMBOL(vfs_create); +Index: linux-2.6.10/fs/stat.c +=================================================================== +--- linux-2.6.10.orig/fs/stat.c 2004-12-25 05:34:02.000000000 +0800 ++++ linux-2.6.10/fs/stat.c 2005-03-31 17:03:37.144466336 +0800 +@@ -60,15 +60,15 @@ + } + return 0; + } +- + EXPORT_SYMBOL(vfs_getattr); + + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -82,8 +82,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -97,9 +98,12 @@ + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent.open, IT_GETATTR); + + if (f) { + error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ intent_release(&nd.intent.open); + fput(f); + } + return error; +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 15:35:26.295797592 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 17:03:37.145466184 +0800 +@@ -113,6 +113,7 @@ + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +Index: linux-2.6.10/fs/exec.c +=================================================================== +--- linux-2.6.10.orig/fs/exec.c 2005-03-31 16:20:09.692859232 +0800 ++++ linux-2.6.10/fs/exec.c 2005-03-31 17:03:37.147465880 +0800 +@@ -125,8 +125,9 @@ + struct nameidata nd; + int error; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ error = user_path_walk_it(library, &nd); + if (error) + goto out; + +@@ -138,7 +139,7 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -485,8 +486,9 @@ + int err; + struct file *file; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -499,7 +501,7 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = 
dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 15:35:26.317794248 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:03:37.135467704 +0800 +@@ -600,6 +600,7 @@ + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct open_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -1245,6 +1246,7 @@ + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 15:35:27.689585704 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800 +@@ -2,14 +2,41 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; + ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++#define INTENT_MAGIC 0x19620323 ++ ++ + struct open_intent { ++ int magic; ++ int op; ++ void (*op_release)(struct open_intent *); + int flags; + int create_mode; ++ union { ++ void *fs_data; /* FS-specific intent data */ ++ } d; + }; + ++static inline void intent_init(struct open_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->magic = INTENT_MAGIC; ++ it->op = op; ++} ++ + enum { MAX_NESTED_LINKS = 8 }; + + struct nameidata { +@@ -55,14 +82,22 @@ + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *)); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ + __user_walk(name, 0, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); ++extern int FASTCALL(path_walk_it(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); + extern void path_release(struct nameidata *); ++extern void intent_release(struct open_intent *); + extern void path_release_on_umount(struct nameidata *); + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); +Index: linux-2.6.10/include/linux/mount.h +=================================================================== +--- linux-2.6.10.orig/include/linux/mount.h 2004-12-25 05:33:51.000000000 +0800 ++++ linux-2.6.10/include/linux/mount.h 2005-03-31 17:15:40.613482328 +0800 +@@ -36,6 +36,8 @@ + struct list_head mnt_list; + 
struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git a/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..1bb1634 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch @@ -0,0 +1,78 @@ +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:12:26.403006808 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:20:37.388365688 +0800 +@@ -783,8 +783,11 @@ + goto out_dput; + + if (inode->i_op->follow_link) { ++ int saved_flags = nd->flags; + mntget(next.mnt); ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(next.dentry, nd); ++ nd->flags = saved_flags; + dput(next.dentry); + mntput(next.mnt); + if (err) +@@ -830,7 +833,9 @@ + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next, atomic); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + follow_mount(&next.mnt, &next.dentry); +@@ -876,10 +881,14 @@ + */ + if (nd->dentry && nd->dentry->d_sb && + (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { +- err = -ESTALE; ++ nd->flags |= LOOKUP_LAST; ++ err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + /* Note: we do not d_invalidate() */ +- if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) ++ if (err) { ++ err = -ESTALE; + break; ++ } + } + return_base: + return 0; +@@ -1446,7 +1455,9 @@ + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + + do_last: + error = PTR_ERR(dentry); +@@ -1559,7 +1570,9 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + putname(nd->last.name); + goto do_last; + } +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:10:14.746021712 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:21:41.178668088 +0800 +@@ -73,7 +73,9 @@ + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_ATOMIC 64 +- ++#define LOOKUP_LAST 128 ++#define LOOKUP_LINK_NOTLAST 256 +++ + /* + * Intent data + */ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch new file mode 100644 index 0000000..57098d2 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch @@ -0,0 +1,274 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +Index: linux-2.6.10/fs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/inode.c 2004-12-25 05:35:40.000000000 +0800 ++++ linux-2.6.10/fs/inode.c 2005-03-31 18:03:53.551688872 +0800 +@@ -166,6 +166,7 @@ + } + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; ++ 
dynlock_init(&inode->i_dcache_lock); + } + return inode; + } +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:57:10.767921312 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 18:05:52.839554360 +0800 +@@ -104,6 +104,38 @@ + * any extra contention... + */ + ++void *lock_dir(struct inode *dir, struct qstr *name) ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 0; ++ } ++ ++ /* OK. fs understands parallel directory operations. ++ * so, we try to acquire lock for hash of requested ++ * filename in order to prevent any operations with ++ * same name in same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++EXPORT_SYMBOL(lock_dir); ++ ++void unlock_dir(struct inode *dir, void *lock) ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++EXPORT_SYMBOL(unlock_dir); ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -390,8 +422,9 @@ + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ void *lock; + +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -417,7 +450,7 @@ + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -425,7 +458,7 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +@@ -1461,7 +1494,7 @@ + + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); + nd->flags &= ~LOOKUP_LAST; +@@ -1469,7 +1502,7 @@ + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + goto exit; + } + +@@ -1478,7 +1511,7 @@ + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current->fs->umask; + error = vfs_create(dir->d_inode, dentry, mode, nd); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + dput(nd->dentry); + nd->dentry = dentry; + if (error) +@@ -1492,7 +1525,7 @@ + /* + * It already exists. 
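/* Sketch, not from the patch: the discipline this function follows
 * under pdirops. lock_dir() hashes the last name component and takes
 * a dynlock on that hash, so operations on different names in one
 * directory run in parallel while two operations on the same name
 * still serialize:
 *
 *	nd->lock = lock_dir(dir->d_inode, &nd->last);
 *	... lookup/create under the per-name lock ...
 *	unlock_dir(dir->d_inode, nd->lock);
 *
 * Filesystems without S_PDIROPS fall back to i_sem, as before. */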
+ */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1576,7 +1609,7 @@ + goto exit; + } + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); + nd->flags &= ~LOOKUP_LAST; +@@ -1596,7 +1629,7 @@ + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1688,7 +1721,7 @@ + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1747,7 +1780,7 @@ + error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1852,14 +1885,14 @@ + error = -EBUSY; + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1925,7 +1958,7 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1939,7 +1972,7 @@ + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + if (inode) + iput(inode); /* truncate the inode here */ + exit1: +@@ -2005,7 +2038,7 @@ + error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -2094,7 +2127,7 @@ + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:57:13.330531736 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 18:08:59.645155592 +0800 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct iovec; + struct nameidata; +@@ -151,7 +152,7 @@ + #define S_DIRSYNC 64 /* Directory modifications are synchronous */ + #define S_NOCMTIME 128 /* Do not update file c/mtime */ + #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ +- ++#define S_PDIROPS 512 /* Parallel directory operations */ + /* + * Note that nosuid etc flags are inode-specific: setting some file-system + * flags just means all the inodes inherit those flags by default. 
It might be +@@ -181,6 +182,7 @@ + #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) + #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) + #define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) +@@ -482,6 +484,7 @@ + + atomic_t i_writecount; + void *i_security; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + void *generic_ip; + } u; +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 18:10:30.237383480 +0800 +@@ -63,7 +63,8 @@ + int last_type; + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; +- ++ ++ void *lock; + /* Intent data */ + union { + struct open_intent open; +@@ -91,7 +92,7 @@ + #define LOOKUP_ATOMIC 64 + #define LOOKUP_LAST 128 + #define LOOKUP_LINK_NOTLAST 256 +-+ ++ + /* + * Intent data + */ diff --git a/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch new file mode 100644 index 0000000..ad2d3ab --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch @@ -0,0 +1,235 @@ +Index: linux-2.6.10/fs/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/namei.c 2005-03-31 17:43:42.417809208 +0800 ++++ linux-2.6.10/fs/namei.c 2005-03-31 17:47:14.292599344 +0800 +@@ -474,6 +474,7 @@ + intent_init(&nd->intent.open, it.op); + nd->intent.open.flags = it.flags; + nd->intent.open.create_mode = it.create_mode; ++ nd->intent.open.create = it.create; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -866,14 +867,20 @@ + lookup_parent: + nd->last = this; + nd->last_type = LAST_NORM; +- if (this.name[0] != '.') +- goto return_base; +- if (this.len == 1) +- nd->last_type = LAST_DOT; +- else if (this.len == 2 && this.name[1] == '.') +- nd->last_type = LAST_DOTDOT; +- else +- goto return_base; ++ if (this.name[0] == '.') { ++ if (this.len == 1) ++ nd->last_type = LAST_DOT; ++ else if (this.len == 2 && this.name[1] == '.') ++ nd->last_type = LAST_DOTDOT; ++ } ++ ++ if ((nd->last_type == LAST_NORM) && inode->i_op && ++ inode->i_op->endparentlookup) { ++ err = inode->i_op->endparentlookup(nd); ++ if (err) ++ break; ++ } ++ goto return_base; + return_reval: + /* + * We bypassed the ordinary revalidation routines. 
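The hunk above is the heart of the raw-ops scheme: after resolving the parent, link_path_walk() gives the filesystem a chance to execute the whole operation itself through ->endparentlookup(), and IT_STATUS_RAW (defined in the namei.h hunk below) tells callers such as sys_mknod() that lookup's return value is already the status of the completed operation. A hypothetical filesystem-side hook, assuming a helper that ships the intent to a server (both names are illustrative, not from these patches):

	static int examplefs_endparentlookup(struct nameidata *nd)
	{
		int rc;

		/* only intercept what we can execute remotely */
		if (!(nd->intent.open.op & (IT_MKDIR | IT_UNLINK)))
			return 0;	/* fall back to the generic path */

		rc = examplefs_execute_intent(nd);	/* assumed helper */
		nd->intent.open.flags |= IT_STATUS_RAW;	/* work is done */
		return rc;
	}
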
+@@ -1646,9 +1653,16 @@ + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKNOD); ++ nd.intent.open.create_mode = mode; ++ nd.intent.open.create.dev = dev; ++ ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1675,6 +1689,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1717,9 +1732,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKDIR); ++ nd.intent.open.create_mode = mode; ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1729,6 +1748,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1814,9 +1834,12 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_RMDIR); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; + + switch(nd.last_type) { + case LAST_DOTDOT: +@@ -1892,9 +1915,13 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_UNLINK); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; ++ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; +@@ -1965,9 +1992,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_SYMLINK); ++ nd.intent.open.create.link = from; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1975,6 +2006,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(to); +@@ -2046,9 +2078,13 @@ + error = __user_walk(oldname, 0, &old_nd); + if (error) + goto exit; +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_LINK); ++ nd.intent.open.create.source_nd = &old_nd; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out_release; + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +@@ -2229,9 +2265,18 @@ + if (error) + goto exit; + +- error = path_lookup(newname, LOOKUP_PARENT, &newnd); ++ error = -EBUSY; ++ if (oldnd.last_type != LAST_NORM) ++ goto exit1; ++ ++ intent_init(&newnd.intent.open, IT_RENAME); ++ newnd.intent.open.create.source_nd = &oldnd; ++ error = path_lookup_it(newname, LOOKUP_PARENT, &newnd); + if (error) + goto exit1; ++ if (newnd.intent.open.flags & IT_STATUS_RAW) { ++ goto exit2; ++ } + + error = -EXDEV; + if (oldnd.mnt != newnd.mnt) +@@ -2239,8 +2284,6 @@ + + old_dir = oldnd.dentry; + error = -EBUSY; +- if (oldnd.last_type != LAST_NORM) +- goto exit2; + + new_dir = newnd.dentry; + if (newnd.last_type != LAST_NORM) +Index: linux-2.6.10/include/linux/fs.h 
+=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2005-03-31 17:03:37.000000000 +0800 ++++ linux-2.6.10/include/linux/fs.h 2005-03-31 17:46:35.715463960 +0800 +@@ -956,6 +956,7 @@ + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); ++ int (*endparentlookup) (struct nameidata *); + }; + + struct seq_file; +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-03-31 17:43:42.472800848 +0800 ++++ linux-2.6.10/include/linux/namei.h 2005-03-31 17:50:12.533502608 +0800 +@@ -15,8 +15,19 @@ + #define IT_UNLINK (1<<5) + #define IT_TRUNC (1<<6) + #define IT_GETXATTR (1<<7) +- ++#define IT_RMDIR (1<<8) ++#define IT_LINK (1<<9) ++#define IT_RENAME (1<<10) ++#define IT_MKDIR (1<<11) ++#define IT_MKNOD (1<<12) ++#define IT_SYMLINK (1<<13) ++#define IT_CHDIR (1<<14) ++ + #define INTENT_MAGIC 0x19620323 ++#define IT_STATUS_RAW (1<<10) /* Setting this in it_flags on exit from lookup ++ means everything was done already and return ++ value from lookup is in fact status of ++ already performed operation */ + + + struct open_intent { +@@ -26,6 +37,11 @@ + int flags; + int create_mode; + union { ++ unsigned dev; /* For mknod */ ++ char *link; /* For symlink */ ++ struct nameidata *source_nd; /* For link/rename */ ++ } create; ++ union { + void *fs_data; /* FS-specific intent data */ + } d; + }; diff --git a/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch b/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch new file mode 100644 index 0000000..07d1008 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch @@ -0,0 +1,22 @@ +Index: linux-2.6.10/fs/namespace.c +=================================================================== +--- linux-2.6.10.orig/fs/namespace.c 2005-03-31 17:58:42.827926064 +0800 ++++ linux-2.6.10/fs/namespace.c 2005-03-31 18:19:21.976546840 +0800 +@@ -62,6 +62,7 @@ + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); ++ INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -177,6 +178,9 @@ + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } -- 1.8.3.1
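
Taken together, the mount.h fields from the intent_api patch (mnt_lustre_list, mnt_last_used), the do_umount() export, and the vfs_gns bookkeeping above give a module all the pieces for GNS auto-umount. A hypothetical expiry pass, assuming a Lustre-side list threaded through mnt_lustre_list and an illustrative GNS_TIMEOUT; locking and mount reference counting are deliberately elided (do_umount() sleeps, so the real caller cannot hold dcache_lock across it):

	static void gns_expire_mounts(struct list_head *gns_list)
	{
		struct vfsmount *mnt, *next;

		list_for_each_entry_safe(mnt, next, gns_list,
					 mnt_lustre_list) {
			if (time_after(jiffies,
				       mnt->mnt_last_used + GNS_TIMEOUT))
				do_umount(mnt, 0);	/* exported above */
		}
	}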