Whamcloud - gitweb
merge b_llpmd into b_devel. the major highlights:
authorphil <phil>
Tue, 9 Sep 2003 03:55:05 +0000 (03:55 +0000)
committerphil <phil>
Tue, 9 Sep 2003 03:55:05 +0000 (03:55 +0000)
- new I/O backend
- new client page cache and llite/lov/osc plumbing
- pre-creation of OST objects
- most of the OBD protocol now revolves around exports, not obd_devices

56 files changed:
lustre/include/linux/lustre_compat25.h
lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config [new file with mode: 0644]
lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.4.18-2.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.6.0.patch [new file with mode: 0644]
lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch [new file with mode: 0644]
lustre/kernel_patches/patches/removepage-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/removepage-2.6.0.patch [new file with mode: 0644]
lustre/kernel_patches/patches/uml-2.6.0-fix.patch [new file with mode: 0644]
lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-5.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iopen-2.4.18-2.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iopen-2.6.0.pc [new file with mode: 0644]
lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc [new file with mode: 0644]
lustre/kernel_patches/pc/removepage-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/removepage-2.6.0.pc [new file with mode: 0644]
lustre/kernel_patches/pc/uml-2.6.0-fix.pc [new file with mode: 0644]
lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc [new file with mode: 0644]
lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/series/chaos-2.4.18
lustre/kernel_patches/series/chaos-2.4.18-pdirops [new file with mode: 0644]
lustre/kernel_patches/series/uml_2.6.0_test3 [new file with mode: 0644]
lustre/liblustre/file.c
lustre/liblustre/llite_lib.h
lustre/liblustre/rw.c
lustre/liblustre/super.c
lustre/mdc/mdc_locks.c [new file with mode: 0644]
lustre/obdfilter/filter_io_24.c [new file with mode: 0644]
lustre/obdfilter/filter_io_26.c [new file with mode: 0644]
lustre/osc/osc_create.c [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c
lustre/tests/replay-ost-single.sh [new file with mode: 0755]
lustre/tests/test-framework.sh [new file with mode: 0644]

index 96e52c4..1f26364 100644 (file)
 #include <linux/portals_compat25.h>
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-# define PGCACHE_WRLOCK(mapping)          write_lock(&mapping->page_lock)
-# define PGCACHE_WRUNLOCK(mapping)        write_unlock(&mapping->page_lock)
+
+/* XXX our code should be using the 2.6 calls, not the other way around */
+#define TryLockPage(page)                TestSetPageLocked(page)
+#define filemap_fdatasync(mapping)       filemap_fdatawrite(mapping)
+#define Page_Uptodate(page)              PageUptodate(page)
 
 #define KDEVT_INIT(val)                 { .value = val }
 
 
 #define ll_vfs_create(a,b,c,d)              vfs_create(a,b,c,d)
 
+#define ll_dev_t                        dev_t
+
+#include <linux/writeback.h>
+
 #else /* 2.4.. */
 
 #define ll_vfs_create(a,b,c,d)              vfs_create(a,b,c)
 #define ll_permission(a,b,c)                permission(a,b)
-# define PGCACHE_WRLOCK(mapping)          spin_lock(&pagecache_lock)
-# define PGCACHE_WRUNLOCK(mapping)        spin_unlock(&pagecache_lock)
+
+#define ll_dev_t                        int
+
+static inline void clear_page_dirty(struct page *page)
+{
+        if (PageDirty(page))
+                ClearPageDirty(page); 
+}
 
 /* 2.5 uses hlists for some things, like the d_hash.  we'll treat them
  * as 2.5 and let macros drop back.. */
@@ -94,20 +107,6 @@ static inline void __d_drop(struct dentry *dentry)
 
 #endif /* end of 2.4 compat macros */
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-# define filemap_fdatasync(mapping)       filemap_fdatawrite(mapping)
-#endif
-
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-# define TryLockPage(page)                TestSetPageLocked(page)
-#endif
-
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-# define Page_Uptodate(page)              PageUptodate(page)
-#endif
-
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
 #define  rb_node_s rb_node
 #define  rb_root_s rb_root
diff --git a/lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config b/lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config
new file mode 100644 (file)
index 0000000..f933188
--- /dev/null
@@ -0,0 +1,325 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_USERMODE=y
+CONFIG_MMU=y
+CONFIG_UID16=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+
+#
+# UML-specific options
+#
+CONFIG_MODE_TT=y
+# CONFIG_MODE_SKAS is not set
+CONFIG_NET=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+CONFIG_HOSTFS=y
+# CONFIG_HPPFS is not set
+CONFIG_MCONSOLE=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_HOST_2G_2G is not set
+# CONFIG_UML_SMP is not set
+# CONFIG_SMP is not set
+CONFIG_NEST_LEVEL=0
+CONFIG_KERNEL_HALF_GIGS=1
+# CONFIG_HIGHMEM is not set
+# CONFIG_PROC_MM is not set
+CONFIG_KERNEL_STACK_ORDER=2
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=14
+# CONFIG_EMBEDDED is not set
+CONFIG_KALLSYMS=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+
+#
+# Loadable module support
+#
+# CONFIG_MODULES is not set
+
+#
+# Generic Driver Options
+#
+# CONFIG_FW_LOADER is not set
+
+#
+# Character Devices
+#
+CONFIG_STDIO_CONSOLE=y
+CONFIG_SSL=y
+CONFIG_FD_CHAN=y
+# CONFIG_NULL_CHAN is not set
+CONFIG_PORT_CHAN=y
+CONFIG_PTY_CHAN=y
+CONFIG_TTY_CHAN=y
+CONFIG_XTERM_CHAN=y
+CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
+CONFIG_CON_CHAN="xterm"
+CONFIG_SSL_CHAN="pty"
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=256
+# CONFIG_WATCHDOG is not set
+# CONFIG_UML_SOUND is not set
+# CONFIG_SOUND is not set
+# CONFIG_HOSTAUDIO is not set
+
+#
+# Block Devices
+#
+CONFIG_BLK_DEV_UBD=y
+# CONFIG_BLK_DEV_UBD_SYNC is not set
+CONFIG_BLK_DEV_COW_COMMON=y
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_MMAPPER is not set
+CONFIG_NETDEVICES=y
+
+#
+# UML Network Devices
+#
+CONFIG_UML_NET=y
+CONFIG_UML_NET_ETHERTAP=y
+CONFIG_UML_NET_TUNTAP=y
+CONFIG_UML_NET_SLIP=y
+CONFIG_UML_NET_DAEMON=y
+CONFIG_UML_NET_MCAST=y
+# CONFIG_UML_NET_PCAP is not set
+# CONFIG_UML_NET_SLIRP is not set
+
+#
+# Networking support
+#
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+# CONFIG_NETLINK_DEV is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_IPV6 is not set
+# CONFIG_DECNET is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_NETFILTER is not set
+# CONFIG_XFRM_USER is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IPV6_SCTP__=y
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_LLC is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+CONFIG_TUN=y
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+# CONFIG_NET_ETHERNET is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_PPP=y
+# CONFIG_PPP_MULTILINK is not set
+# CONFIG_PPP_FILTER is not set
+# CONFIG_PPP_ASYNC is not set
+# CONFIG_PPP_SYNC_TTY is not set
+# CONFIG_PPP_DEFLATE is not set
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_SLIP=y
+# CONFIG_SLIP_COMPRESSED is not set
+# CONFIG_SLIP_SMART is not set
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices (depends on LLC=y)
+#
+# CONFIG_SHAPER is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+# CONFIG_EXT3_FS_POSIX_ACL is not set
+# CONFIG_EXT3_FS_SECURITY is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_FAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_DEVFS_FS=y
+CONFIG_DEVFS_MOUNT=y
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_DEVPTS_FS_XATTR is not set
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+# CONFIG_NFS_FS is not set
+# CONFIG_NFSD is not set
+# CONFIG_EXPORTFS is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Security options
+#
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Library routines
+#
+# CONFIG_CRC32 is not set
+
+#
+# SCSI support
+#
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_SLAB=y
+# CONFIG_DEBUG_SPINLOCK is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_FRAME_POINTER=y
+CONFIG_PT_PROXY=y
+# CONFIG_GPROF is not set
+# CONFIG_GCOV is not set
diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch b/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..a1cef3e
--- /dev/null
@@ -0,0 +1,212 @@
+ include/linux/dynlocks.h |   33 ++++++++++
+ lib/Makefile             |    4 -
+ lib/dynlocks.c           |  152 +++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 187 insertions(+), 2 deletions(-)
+
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/include/linux/dynlocks.h       2003-09-01 16:33:25.000000000 +0400
+@@ -0,0 +1,33 @@
++#ifndef _LINUX_DYNLOCKS_H
++#define _LINUX_DYNLOCKS_H
++
++#include <linux/list.h>
++#include <linux/wait.h>
++
++struct dynlock_member {
++      struct list_head        dl_list;
++      unsigned long           dl_value;       /* lock value */
++      int                     dl_refcount;    /* number of users */
++      int                     dl_readers;
++      int                     dl_writers;
++      int                     dl_pid;         /* holder of the lock */
++      wait_queue_head_t       dl_wait;
++};
++
++/*
++ * lock's namespace:
++ *   - list of locks
++ *   - lock to protect this list
++ */
++struct dynlock {
++      struct list_head dl_list;
++      spinlock_t dl_list_lock;
++};
++
++void dynlock_init(struct dynlock *dl);
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp);
++void dynlock_unlock(struct dynlock *dl, void *lock);
++
++
++#endif
++
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/lib/dynlocks.c 2003-09-01 16:36:00.000000000 +0400
+@@ -0,0 +1,152 @@
++/*
++ * Dynamic Locks
++ *
++ * struct dynlock is lockspace
++ * one may request lock (exclusive or shared) for some value
++ * in that lockspace
++ *
++ */
++
++#include <linux/dynlocks.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++
++/*
++ * dynlock_init
++ *
++ * initialize lockspace
++ *
++ */
++void dynlock_init(struct dynlock *dl)
++{
++      spin_lock_init(&dl->dl_list_lock);
++      INIT_LIST_HEAD(&dl->dl_list);
++}
++
++/*
++ * dynlock_lock
++ *
++ * acquires a lock (exclusive or shared) in the specified lockspace.
++ * each lock in the lockspace is allocated separately, so the user has
++ * to specify GFP flags.
++ * the routine returns a pointer to the lock (NULL if allocation fails).
++ * this pointer is intended to be passed to dynlock_unlock
++ *
++ */
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp)
++{
++      struct dynlock_member *nhl = NULL; 
++      struct dynlock_member *hl; 
++      struct list_head *cur;
++
++repeat:
++      /* find requested lock in lockspace */
++      spin_lock(&dl->dl_list_lock);
++      list_for_each(cur, &dl->dl_list) {
++              hl = list_entry(cur, struct dynlock_member, dl_list);
++              if (hl->dl_value == value) {
++                      /* lock is found */
++                      if (nhl) {
++                              /* someone else just added the lock
++                               * we didn't find and have just
++                               * allocated, so we drop our copy
++                               */
++                              kfree(nhl);
++                              nhl = NULL;
++                      }
++                      hl->dl_refcount++;
++                      goto found;
++              }
++      }
++      /* lock not found */
++      if (nhl) {
++              /* we already have allocated lock. use it */
++              hl = nhl;
++              nhl = NULL;
++              list_add(&hl->dl_list, &dl->dl_list);
++              goto found;
++      }
++      spin_unlock(&dl->dl_list_lock);
++      
++      /* lock not found and we haven't allocated lock yet. allocate it */
++      nhl = kmalloc(sizeof(struct dynlock_member), gfp);
++      if (nhl == NULL)
++              return NULL;
++      nhl->dl_refcount = 1;
++      nhl->dl_value = value;
++      nhl->dl_readers = 0;
++      nhl->dl_writers = 0;
++      init_waitqueue_head(&nhl->dl_wait);
++
++      /* while the lock was being allocated, someone else may have
++       * allocated it and put it onto the list. check for this situation
++       */
++      goto repeat;
++
++found:
++      if (rw) {
++              /* exclusive lock: the user doesn't want to share the lock
++               * at all. NOTE: one process may take the same lock several
++               * times; this functionality is useful for rename operations */
++              while ((hl->dl_writers && hl->dl_pid != current->pid) ||
++                              hl->dl_readers) {
++                      spin_unlock(&dl->dl_list_lock);
++                      wait_event(hl->dl_wait,
++                              hl->dl_writers == 0 && hl->dl_readers == 0);
++                      spin_lock(&dl->dl_list_lock);
++              }
++              hl->dl_writers++;
++      } else {
++              /* shared lock: user doesn't want to share lock with a writer */
++              while (hl->dl_writers) {
++                      spin_unlock(&dl->dl_list_lock);
++                      wait_event(hl->dl_wait, hl->dl_writers == 0);
++                      spin_lock(&dl->dl_list_lock);
++              }
++              hl->dl_readers++;
++      }
++      hl->dl_pid = current->pid;
++      spin_unlock(&dl->dl_list_lock);
++
++      return hl;
++}
++
++
++/*
++ * dynlock_unlock
++ *
++ * drops the lock returned by dynlock_lock() in lockspace dl. hl may be
++ * freed by others once dl_list_lock is dropped, so test refcount under it
++ */
++void dynlock_unlock(struct dynlock *dl, void *lock)
++{
++      struct dynlock_member *hl = lock;
++      int wakeup = 0, last;
++
++      spin_lock(&dl->dl_list_lock);
++      if (hl->dl_writers) {
++              hl->dl_writers--;
++              if (hl->dl_writers == 0)
++                      wakeup = 1;
++      } else {
++              hl->dl_readers--;
++              if (hl->dl_readers == 0)
++                      wakeup = 1;
++      }
++      if (wakeup) {
++              hl->dl_pid = 0;
++              wake_up(&hl->dl_wait);
++      }
++      last = (--(hl->dl_refcount) == 0);
++      if (last)
++              list_del(&hl->dl_list);
++      spin_unlock(&dl->dl_list_lock);
++      if (last)
++              kfree(hl);
++}
++
++EXPORT_SYMBOL(dynlock_init);
++EXPORT_SYMBOL(dynlock_lock);
++EXPORT_SYMBOL(dynlock_unlock);
++
+--- linux-2.4.18/lib/Makefile~dynamic-locks-2.4.18-chaos       2003-08-29 11:57:40.000000000 +0400
++++ linux-2.4.18-alexey/lib/Makefile   2003-09-01 16:35:23.000000000 +0400
+@@ -8,9 +8,9 @@
+ L_TARGET := lib.a
+-export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o
++export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o dynlocks.o
+-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o
++obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o dynlocks.o
+ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+ obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+
+_
diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch b/lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..59f0a3e
--- /dev/null
@@ -0,0 +1,217 @@
+ include/linux/dynlocks.h |   33 ++++++++++
+ lib/Makefile             |    4 -
+ lib/dynlocks.c           |  152 +++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 187 insertions(+), 2 deletions(-)
+
+Index: linux-2.4.20-rh/include/linux/dynlocks.h
+===================================================================
+--- linux-2.4.20-rh.orig/include/linux/dynlocks.h      2003-09-04 18:25:49.000000000 +0800
++++ linux-2.4.20-rh/include/linux/dynlocks.h   2003-09-04 18:25:49.000000000 +0800
+@@ -0,0 +1,33 @@
++#ifndef _LINUX_DYNLOCKS_H
++#define _LINUX_DYNLOCKS_H
++
++#include <linux/list.h>
++#include <linux/wait.h>
++
++struct dynlock_member {
++      struct list_head        dl_list;
++      unsigned long           dl_value;       /* lock value */
++      int                     dl_refcount;    /* number of users */
++      int                     dl_readers;
++      int                     dl_writers;
++      int                     dl_pid;         /* holder of the lock */
++      wait_queue_head_t       dl_wait;
++};
++
++/*
++ * lock's namespace:
++ *   - list of locks
++ *   - lock to protect this list
++ */
++struct dynlock {
++      struct list_head dl_list;
++      spinlock_t dl_list_lock;
++};
++
++void dynlock_init(struct dynlock *dl);
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp);
++void dynlock_unlock(struct dynlock *dl, void *lock);
++
++
++#endif
++
+Index: linux-2.4.20-rh/lib/dynlocks.c
+===================================================================
+--- linux-2.4.20-rh.orig/lib/dynlocks.c        2003-09-04 18:25:49.000000000 +0800
++++ linux-2.4.20-rh/lib/dynlocks.c     2003-09-04 18:25:49.000000000 +0800
+@@ -0,0 +1,152 @@
++/*
++ * Dynamic Locks
++ *
++ * struct dynlock is lockspace
++ * one may request lock (exclusive or shared) for some value
++ * in that lockspace
++ *
++ */
++
++#include <linux/dynlocks.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++
++/*
++ * dynlock_init
++ *
++ * initialize lockspace
++ *
++ */
++void dynlock_init(struct dynlock *dl)
++{
++      spin_lock_init(&dl->dl_list_lock);
++      INIT_LIST_HEAD(&dl->dl_list);
++}
++
++/*
++ * dynlock_lock
++ *
++ * acquires a lock (exclusive or shared) in the specified lockspace.
++ * each lock in the lockspace is allocated separately, so the user has
++ * to specify GFP flags.
++ * the routine returns a pointer to the lock (NULL if allocation fails).
++ * this pointer is intended to be passed to dynlock_unlock
++ *
++ */
++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp)
++{
++      struct dynlock_member *nhl = NULL; 
++      struct dynlock_member *hl; 
++      struct list_head *cur;
++
++repeat:
++      /* find requested lock in lockspace */
++      spin_lock(&dl->dl_list_lock);
++      list_for_each(cur, &dl->dl_list) {
++              hl = list_entry(cur, struct dynlock_member, dl_list);
++              if (hl->dl_value == value) {
++                      /* lock is found */
++                      if (nhl) {
++                              /* someone else just added the lock
++                               * we didn't find and have just
++                               * allocated, so we drop our copy
++                               */
++                              kfree(nhl);
++                              nhl = NULL;
++                      }
++                      hl->dl_refcount++;
++                      goto found;
++              }
++      }
++      /* lock not found */
++      if (nhl) {
++              /* we already have allocated lock. use it */
++              hl = nhl;
++              nhl = NULL;
++              list_add(&hl->dl_list, &dl->dl_list);
++              goto found;
++      }
++      spin_unlock(&dl->dl_list_lock);
++      
++      /* lock not found and we haven't allocated lock yet. allocate it */
++      nhl = kmalloc(sizeof(struct dynlock_member), gfp);
++      if (nhl == NULL)
++              return NULL;
++      nhl->dl_refcount = 1;
++      nhl->dl_value = value;
++      nhl->dl_readers = 0;
++      nhl->dl_writers = 0;
++      init_waitqueue_head(&nhl->dl_wait);
++
++      /* while the lock was being allocated, someone else may have
++       * allocated it and put it onto the list. check for this situation
++       */
++      goto repeat;
++
++found:
++      if (rw) {
++              /* exclusive lock: the user doesn't want to share the lock
++               * at all. NOTE: one process may take the same lock several
++               * times; this functionality is useful for rename operations */
++              while ((hl->dl_writers && hl->dl_pid != current->pid) ||
++                              hl->dl_readers) {
++                      spin_unlock(&dl->dl_list_lock);
++                      wait_event(hl->dl_wait,
++                              hl->dl_writers == 0 && hl->dl_readers == 0);
++                      spin_lock(&dl->dl_list_lock);
++              }
++              hl->dl_writers++;
++      } else {
++              /* shared lock: user doesn't want to share lock with a writer */
++              while (hl->dl_writers) {
++                      spin_unlock(&dl->dl_list_lock);
++                      wait_event(hl->dl_wait, hl->dl_writers == 0);
++                      spin_lock(&dl->dl_list_lock);
++              }
++              hl->dl_readers++;
++      }
++      hl->dl_pid = current->pid;
++      spin_unlock(&dl->dl_list_lock);
++
++      return hl;
++}
++
++
++/*
++ * dynlock_unlock
++ *
++ * drops the lock returned by dynlock_lock() in lockspace dl. hl may be
++ * freed by others once dl_list_lock is dropped, so test refcount under it
++ */
++void dynlock_unlock(struct dynlock *dl, void *lock)
++{
++      struct dynlock_member *hl = lock;
++      int wakeup = 0, last;
++
++      spin_lock(&dl->dl_list_lock);
++      if (hl->dl_writers) {
++              hl->dl_writers--;
++              if (hl->dl_writers == 0)
++                      wakeup = 1;
++      } else {
++              hl->dl_readers--;
++              if (hl->dl_readers == 0)
++                      wakeup = 1;
++      }
++      if (wakeup) {
++              hl->dl_pid = 0;
++              wake_up(&hl->dl_wait);
++      }
++      last = (--(hl->dl_refcount) == 0);
++      if (last)
++              list_del(&hl->dl_list);
++      spin_unlock(&dl->dl_list_lock);
++      if (last)
++              kfree(hl);
++}
++
++EXPORT_SYMBOL(dynlock_init);
++EXPORT_SYMBOL(dynlock_lock);
++EXPORT_SYMBOL(dynlock_unlock);
++
+Index: linux-2.4.20-rh/lib/Makefile
+===================================================================
+--- linux-2.4.20-rh.orig/lib/Makefile  2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-rh/lib/Makefile       2003-09-04 18:27:26.000000000 +0800
+@@ -8,10 +8,10 @@
+ L_TARGET := lib.a
+-export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o
++export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o dynlocks.o
+ obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \
+-       bust_spinlocks.o rbtree.o dump_stack.o
++       bust_spinlocks.o rbtree.o dump_stack.o dynlocks.o
+ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+ obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-5.patch b/lustre/kernel_patches/patches/ext-2.4-patch-5.patch
new file mode 100644 (file)
index 0000000..a65f6ed
--- /dev/null
@@ -0,0 +1,15 @@
+ include/linux/ext3_fs.h |    1 +
+ 1 files changed, 1 insertion(+)
+
+--- linux-2.4.18/include/linux/ext3_fs.h~ext-2.4-patch-5       2003-08-29 16:53:18.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs.h        2003-09-01 11:50:37.000000000 +0400
+@@ -344,6 +344,7 @@ struct ext3_inode {
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch
new file mode 100644 (file)
index 0000000..8343e54
--- /dev/null
@@ -0,0 +1,1461 @@
+--- ./fs/ext3/balloc.c.orig    Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/balloc.c Tue May  7 15:35:59 2002
+@@ -46,18 +46,18 @@ struct ext3_group_desc * ext3_get_group_
+       unsigned long desc;
+       struct ext3_group_desc * gdp;
+-      if (block_group >= sb->u.ext3_sb.s_groups_count) {
++      if (block_group >= EXT3_SB(sb)->s_groups_count) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+-                          block_group, sb->u.ext3_sb.s_groups_count);
++                          block_group, EXT3_SB(sb)->s_groups_count);
+               return NULL;
+       }
+       
+       group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
+       desc = block_group % EXT3_DESC_PER_BLOCK(sb);
+-      if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
++      if (!EXT3_SB(sb)->s_group_desc[group_desc]) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "Group descriptor not loaded - "
+                           "block_group = %d, group_desc = %lu, desc = %lu",
+@@ -66,9 +66,9 @@ struct ext3_group_desc * ext3_get_group_
+       }
+       
+       gdp = (struct ext3_group_desc *) 
+-            sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
++            EXT3_SB(sb)->s_group_desc[group_desc]->b_data;
+       if (bh)
+-              *bh = sb->u.ext3_sb.s_group_desc[group_desc];
++              *bh = EXT3_SB(sb)->s_group_desc[group_desc];
+       return gdp + desc;
+ }
+@@ -104,8 +104,8 @@ static int read_block_bitmap (struct sup
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+-      sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
+-      sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
++      EXT3_SB(sb)->s_block_bitmap_number[bitmap_nr] = block_group;
++      EXT3_SB(sb)->s_block_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+@@ -128,16 +128,17 @@ static int __load_block_bitmap (struct s
+       int i, j, retval = 0;
+       unsigned long block_bitmap_number;
+       struct buffer_head * block_bitmap;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
+-      if (block_group >= sb->u.ext3_sb.s_groups_count)
++      if (block_group >= sbi->s_groups_count)
+               ext3_panic (sb, "load_block_bitmap",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+-                          block_group, sb->u.ext3_sb.s_groups_count);
++                          block_group, EXT3_SB(sb)->s_groups_count);
+-      if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+-              if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
+-                      if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
++      if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {
++              if (sbi->s_block_bitmap[block_group]) {
++                      if (sbi->s_block_bitmap_number[block_group] ==
+                           block_group)
+                               return block_group;
+                       ext3_error (sb, "__load_block_bitmap",
+@@ -149,21 +150,20 @@ static int __load_block_bitmap (struct s
+               return block_group;
+       }
+-      for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+-                  sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
++      for (i = 0; i < sbi->s_loaded_block_bitmaps &&
++                  sbi->s_block_bitmap_number[i] != block_group; i++)
+               ;
+-      if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+-          sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
+-              block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
+-              block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
++      if (i < sbi->s_loaded_block_bitmaps &&
++          sbi->s_block_bitmap_number[i] == block_group) {
++              block_bitmap_number = sbi->s_block_bitmap_number[i];
++              block_bitmap = sbi->s_block_bitmap[i];
+               for (j = i; j > 0; j--) {
+-                      sb->u.ext3_sb.s_block_bitmap_number[j] =
+-                              sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+-                      sb->u.ext3_sb.s_block_bitmap[j] =
+-                              sb->u.ext3_sb.s_block_bitmap[j - 1];
++                      sbi->s_block_bitmap_number[j] =
++                              sbi->s_block_bitmap_number[j - 1];
++                      sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1];
+               }
+-              sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
+-              sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
++              sbi->s_block_bitmap_number[0] = block_bitmap_number;
++              sbi->s_block_bitmap[0] = block_bitmap;
+               /*
+                * There's still one special case here --- if block_bitmap == 0
+@@ -173,17 +173,14 @@ static int __load_block_bitmap (struct s
+               if (!block_bitmap)
+                       retval = read_block_bitmap (sb, block_group, 0);
+       } else {
+-              if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
+-                      sb->u.ext3_sb.s_loaded_block_bitmaps++;
++              if (sbi->s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
++                      sbi->s_loaded_block_bitmaps++;
+               else
+-                      brelse (sb->u.ext3_sb.s_block_bitmap
+-                                      [EXT3_MAX_GROUP_LOADED - 1]);
+-              for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
+-                                      j > 0;  j--) {
+-                      sb->u.ext3_sb.s_block_bitmap_number[j] =
+-                              sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+-                      sb->u.ext3_sb.s_block_bitmap[j] =
+-                              sb->u.ext3_sb.s_block_bitmap[j - 1];
++                      brelse(sbi->s_block_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
++              for (j = sbi->s_loaded_block_bitmaps - 1; j > 0;  j--) {
++                      sbi->s_block_bitmap_number[j] =
++                              sbi->s_block_bitmap_number[j - 1];
++                      sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1];
+               }
+               retval = read_block_bitmap (sb, block_group, 0);
+       }
+@@ -206,24 +203,25 @@ static int __load_block_bitmap (struct s
+ static inline int load_block_bitmap (struct super_block * sb,
+                                    unsigned int block_group)
+ {
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
+       int slot;
+-      
++
+       /*
+        * Do the lookup for the slot.  First of all, check if we're asking
+        * for the same slot as last time, and did we succeed that last time?
+        */
+-      if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
+-          sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
+-          sb->u.ext3_sb.s_block_bitmap[0]) {
++      if (sbi->s_loaded_block_bitmaps > 0 &&
++          sbi->s_block_bitmap_number[0] == block_group &&
++          sbi->s_block_bitmap[0]) {
+               return 0;
+       }
+       /*
+        * Or can we do a fast lookup based on a loaded group on a filesystem
+        * small enough to be mapped directly into the superblock?
+        */
+-      else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && 
+-               sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group
+-                      && sb->u.ext3_sb.s_block_bitmap[block_group]) {
++      else if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED &&
++               sbi->s_block_bitmap_number[block_group] == block_group
++                      && sbi->s_block_bitmap[block_group]) {
+               slot = block_group;
+       }
+       /*
+@@ -243,7 +241,7 @@ static inline int load_block_bitmap (str
+        * If it's a valid slot, we may still have cached a previous IO error,
+        * in which case the bh in the superblock cache will be zero.
+        */
+-      if (!sb->u.ext3_sb.s_block_bitmap[slot])
++      if (!sbi->s_block_bitmap[slot])
+               return -EIO;
+       
+       /*
+@@ -275,7 +273,7 @@ void ext3_free_blocks (handle_t *handle,
+               return;
+       }
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
+       if (block < le32_to_cpu(es->s_first_data_block) ||
+           block + count < block ||
+           (block + count) > le32_to_cpu(es->s_blocks_count)) {
+@@ -304,7 +302,7 @@ do_more:
+       if (bitmap_nr < 0)
+               goto error_return;
+       
+-      bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++      bitmap_bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+       gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
+       if (!gdp)
+               goto error_return;
+@@ -330,8 +328,8 @@ do_more:
+       if (err)
+               goto error_return;
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+       if (err)
+               goto error_return;
+@@ -341,7 +339,7 @@
+               if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
+                   block == le32_to_cpu(gdp->bg_inode_bitmap) ||
+                   in_range(block, le32_to_cpu(gdp->bg_inode_table),
+-                           sb->u.ext2_sb.s_itb_per_group)) {
++                           EXT3_SB(sb)->s_itb_per_group)) {
+                       ext3_error(sb, __FUNCTION__,
+                                  "Freeing block in system zone - block = %lu",
+                                  block);
+@@ -410,8 +407,8 @@ do_more:
+       if (!err) err = ret;
+       /* And the superblock */
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
+-      ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock");
++      ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+       if (!err) err = ret;
+       if (overflow && !err) {
+@@ -564,12 +560,12 @@ int ext3_new_block (handle_t *handle, st
+       }
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
+       if (le32_to_cpu(es->s_free_blocks_count) <=
+                       le32_to_cpu(es->s_r_blocks_count) &&
+-          ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
+-           (sb->u.ext3_sb.s_resgid == 0 ||
+-            !in_group_p (sb->u.ext3_sb.s_resgid)) && 
++          ((EXT3_SB(sb)->s_resuid != current->fsuid) &&
++           (EXT3_SB(sb)->s_resgid == 0 ||
++            !in_group_p (EXT3_SB(sb)->s_resgid)) &&
+            !capable(CAP_SYS_RESOURCE)))
+               goto out;
+@@ -598,7 +595,7 @@ int ext3_new_block (handle_t *handle, st
+               if (bitmap_nr < 0)
+                       goto io_error;
+               
+-              bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++              bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+               ext3_debug ("goal is at %d:%d.\n", i, j);
+@@ -621,9 +618,9 @@ int ext3_new_block (handle_t *handle, st
+        * Now search the rest of the groups.  We assume that 
+        * i and gdp correctly point to the last group visited.
+        */
+-      for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
++      for (k = 0; k < EXT3_SB(sb)->s_groups_count; k++) {
+               i++;
+-              if (i >= sb->u.ext3_sb.s_groups_count)
++              if (i >= EXT3_SB(sb)->s_groups_count)
+                       i = 0;
+               gdp = ext3_get_group_desc (sb, i, &bh2);
+               if (!gdp) {
+@@ -635,7 +632,7 @@ int ext3_new_block (handle_t *handle, st
+                       if (bitmap_nr < 0)
+                               goto io_error;
+       
+-                      bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
++                      bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+                       j = find_next_usable_block(-1, bh, 
+                                                  EXT3_BLOCKS_PER_GROUP(sb));
+                       if (j >= 0) 
+@@ -674,8 +671,8 @@ got_block:
+       fatal = ext3_journal_get_write_access(handle, bh2);
+       if (fatal) goto out;
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+-      fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
++      fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+       if (fatal) goto out;
+       tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
+@@ -796,7 +804,7 @@ got_block:
+       if (!fatal) fatal = err;
+       
+       BUFFER_TRACE(bh, "journal_dirty_metadata for superblock");
+-      err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++      err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+       if (!fatal) fatal = err;
+       sb->s_dirt = 1;
+@@ -829,11 +837,11 @@ unsigned long ext3_count_free_blocks (st
+       int i;
+       
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+-      for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++      for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+@@ -842,7 +850,7 @@ unsigned long ext3_count_free_blocks (st
+               if (bitmap_nr < 0)
+                       continue;
+               
+-              x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
++              x = ext3_count_free (EXT3_SB(sb)->s_block_bitmap[bitmap_nr],
+                                    sb->s_blocksize);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+@@ -853,7 +861,7 @@ unsigned long ext3_count_free_blocks (st
+       unlock_super (sb);
+       return bitmap_count;
+ #else
+-      return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
++      return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count);
+ #endif
+ }
+@@ -862,7 +870,7 @@ static inline int block_in_use (unsigned
+                               unsigned char * map)
+ {
+       return ext3_test_bit ((block -
+-              le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
++              le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
+                        EXT3_BLOCKS_PER_GROUP(sb), map);
+ }
+@@ -930,11 +938,11 @@ void ext3_check_blocks_bitmap (struct su
+       struct ext3_group_desc * gdp;
+       int i;
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+-      for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++      for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+@@ -968,7 +976,7 @@ void ext3_check_blocks_bitmap (struct su
+                                   "Inode bitmap for group %d is marked free",
+                                   i);
+-              for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
++              for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++)
+                       if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
+                                                       sb, bh->b_data))
+                               ext3_error (sb, "ext3_check_blocks_bitmap",
+--- ./fs/ext3/dir.c.orig       Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/dir.c    Tue May  7 14:54:13 2002
+@@ -52,7 +52,7 @@ int ext3_check_dir_entry (const char * f
+       else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+               error_msg = "directory entry across blocks";
+       else if (le32_to_cpu(de->inode) >
+-                      le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++                      le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+               error_msg = "inode out of bounds";
+       if (error_msg != NULL)
+--- ./fs/ext3/ialloc.c.orig    Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/ialloc.c Tue May  7 15:39:26 2002
+@@ -73,8 +73,8 @@ static int read_inode_bitmap (struct sup
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+-      sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
+-      sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
++      EXT3_SB(sb)->s_inode_bitmap_number[bitmap_nr] = block_group;
++      EXT3_SB(sb)->s_inode_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+@@ -225,7 +225,7 @@ void ext3_free_inode (handle_t *handle, 
+       clear_inode (inode);
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
+       if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_free_inode",
+                           "reserved or nonexistent inode %lu", ino);
+@@ -237,7 +237,7 @@ void ext3_free_inode (handle_t *handle, 
+       if (bitmap_nr < 0)
+               goto error_return;
+-      bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
++      bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
+       BUFFER_TRACE(bh, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, bh);
+@@ -255,8 +255,8 @@ void ext3_free_inode (handle_t *handle, 
+               fatal = ext3_journal_get_write_access(handle, bh2);
+               if (fatal) goto error_return;
+-              BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
+-              fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++              BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access");
++              fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+               if (fatal) goto error_return;
+               if (gdp) {
+@@ -271,9 +271,9 @@ void ext3_free_inode (handle_t *handle, 
+               if (!fatal) fatal = err;
+               es->s_free_inodes_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+-              BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
++              BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
+                                       "call ext3_journal_dirty_metadata");
+-              err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++              err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+               if (!fatal) fatal = err;
+       }
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+@@ -305,6 +305,8 @@ struct inode * ext3_new_inode (handle_t 
+       int i, j, avefreei;
+       struct inode * inode;
+       int bitmap_nr;
++      struct ext3_inode_info *ei;
++      struct ext3_sb_info *sbi;
+       struct ext3_group_desc * gdp;
+       struct ext3_group_desc * tmp;
+       struct ext3_super_block * es;
+@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t 
+       inode = new_inode(sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+-      init_rwsem(&inode->u.ext3_i.truncate_sem);
++      sbi = EXT3_SB(sb);
++      ei = EXT3_I(inode);
++      init_rwsem(&ei->truncate_sem);
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t 
+       if (S_ISDIR(mode)) {
+               avefreei = le32_to_cpu(es->s_free_inodes_count) /
+-                      sb->u.ext3_sb.s_groups_count;
++                      sbi->s_groups_count;
+               if (!gdp) {
+-                      for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
++                      for (j = 0; j < sbi->s_groups_count; j++) {
+                               struct buffer_head *temp_buffer;
+                               tmp = ext3_get_group_desc (sb, j, &temp_buffer);
+                               if (tmp &&
+@@ -350,7 +354,7 @@ repeat:
+               /*
+                * Try to place the inode in its parent directory
+                */
+-              i = dir->u.ext3_i.i_block_group;
++              i = EXT3_I(dir)->i_block_group;
+               tmp = ext3_get_group_desc (sb, i, &bh2);
+               if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
+                       gdp = tmp;
+@@ -360,10 +364,10 @@ repeat:
+                        * Use a quadratic hash to find a group with a
+                        * free inode
+                        */
+-                      for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
++                      for (j = 1; j < sbi->s_groups_count; j <<= 1) {
+                               i += j;
+-                              if (i >= sb->u.ext3_sb.s_groups_count)
+-                                      i -= sb->u.ext3_sb.s_groups_count;
++                              if (i >= sbi->s_groups_count)
++                                      i -= sbi->s_groups_count;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count)) {
+@@ -376,9 +380,9 @@ repeat:
+                       /*
+                        * That failed: try linear search for a free inode
+                        */
+-                      i = dir->u.ext3_i.i_block_group + 1;
+-                      for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
+-                              if (++i >= sb->u.ext3_sb.s_groups_count)
++                      i = EXT3_I(dir)->i_block_group + 1;
++                      for (j = 2; j < sbi->s_groups_count; j++) {
++                              if (++i >= sbi->s_groups_count)
+                                       i = 0;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+@@ -399,11 +403,11 @@ repeat:
+       if (bitmap_nr < 0)
+               goto fail;
+-      bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
++      bh = sbi->s_inode_bitmap[bitmap_nr];
+       if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
+-                                    EXT3_INODES_PER_GROUP(sb))) <
+-          EXT3_INODES_PER_GROUP(sb)) {
++                                    sbi->s_inodes_per_group)) <
++          sbi->s_inodes_per_group) {
+               BUFFER_TRACE(bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err) goto fail;
+@@ -457,13 +461,13 @@ repeat:
+       err = ext3_journal_dirty_metadata(handle, bh2);
+       if (err) goto fail;
+       
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(sbi->s_sbh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+       if (err) goto fail;
+       es->s_free_inodes_count =
+               cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
+-      err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(sbi->s_sbh, "call ext3_journal_dirty_metadata");
++      err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+       sb->s_dirt = 1;
+       if (err) goto fail;
+@@ -483,31 +487,31 @@ repeat:
+       inode->i_blksize = PAGE_SIZE;
+       inode->i_blocks = 0;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+-      inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
++      ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
+       if (S_ISLNK(mode))
+-              inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
++              ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
+ #ifdef EXT3_FRAGMENTS
+-      inode->u.ext3_i.i_faddr = 0;
+-      inode->u.ext3_i.i_frag_no = 0;
+-      inode->u.ext3_i.i_frag_size = 0;
++      ei->i_faddr = 0;
++      ei->i_frag_no = 0;
++      ei->i_frag_size = 0;
+ #endif
+-      inode->u.ext3_i.i_file_acl = 0;
+-      inode->u.ext3_i.i_dir_acl = 0;
+-      inode->u.ext3_i.i_dtime = 0;
+-      INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++      ei->i_file_acl = 0;
++      ei->i_dir_acl = 0;
++      ei->i_dtime = 0;
++      INIT_LIST_HEAD(&ei->i_orphan);
+ #ifdef EXT3_PREALLOCATE
+-      inode->u.ext3_i.i_prealloc_count = 0;
++      ei->i_prealloc_count = 0;
+ #endif
+-      inode->u.ext3_i.i_block_group = i;
++      ei->i_block_group = i;
+       
+-      if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
++      if (ei->i_flags & EXT3_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+       insert_inode_hash(inode);
+-      inode->i_generation = sb->u.ext3_sb.s_next_generation++;
++      inode->i_generation = sbi->s_next_generation++;
+-      inode->u.ext3_i.i_state = EXT3_STATE_NEW;
++      ei->i_state = EXT3_STATE_NEW;
+       err = ext3_mark_inode_dirty(handle, inode);
+       if (err) goto fail;
+       
+@@ -585,19 +589,19 @@ struct inode *ext3_orphan_get (struct su
+ unsigned long ext3_count_free_inodes (struct super_block * sb)
+ {
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct ext3_super_block *es = sbi->s_es;
+ #ifdef EXT3FS_DEBUG
+-      struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+-      for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++      for (i = 0; i < sbi->s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+@@ -606,8 +610,8 @@ unsigned long ext3_count_free_inodes (st
+               if (bitmap_nr < 0)
+                       continue;
+-              x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+-                                   EXT3_INODES_PER_GROUP(sb) / 8);
++              x = ext3_count_free(sbi->s_inode_bitmap[bitmap_nr],
++                                  sbi->s_inodes_per_group / 8);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+               bitmap_count += x;
+@@ -617,7 +621,7 @@ unsigned long ext3_count_free_inodes (st
+       unlock_super (sb);
+       return desc_count;
+ #else
+-      return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
++      return le32_to_cpu(es->s_free_inodes_count);
+ #endif
+ }
+@@ -626,16 +630,18 @@ unsigned long ext3_count_free_inodes (st
+ void ext3_check_inodes_bitmap (struct super_block * sb)
+ {
+       struct ext3_super_block * es;
++      struct ext3_sb_info *sbi;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+-      es = sb->u.ext3_sb.s_es;
++      sbi = EXT3_SB(sb);
++      es = sbi->s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+-      for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
++      for (i = 0; i < sbi->s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+@@ -644,7 +650,7 @@ void ext3_check_inodes_bitmap (struct su
+               if (bitmap_nr < 0)
+                       continue;
+-              x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
++              x = ext3_count_free (sbi->s_inode_bitmap[bitmap_nr],
+                                    EXT3_INODES_PER_GROUP(sb) / 8);
+               if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
+                       ext3_error (sb, "ext3_check_inodes_bitmap",
+--- ./fs/ext3/inode.c.orig     Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/inode.c  Tue May  7 15:41:23 2002
+@@ -196,7 +196,7 @@ void ext3_delete_inode (struct inode * i
+        * (Well, we could do this if we need to, but heck - it works)
+        */
+       ext3_orphan_del(handle, inode);
+-      inode->u.ext3_i.i_dtime = CURRENT_TIME;
++      EXT3_I(inode)->i_dtime = CURRENT_TIME;
+       /* 
+        * One subtle ordering requirement: if anything has gone wrong
+@@ -220,13 +220,14 @@ no_delete:
+ void ext3_discard_prealloc (struct inode * inode)
+ {
+ #ifdef EXT3_PREALLOCATE
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       lock_kernel();
+       /* Writer: ->i_prealloc* */
+-      if (inode->u.ext3_i.i_prealloc_count) {
+-              unsigned short total = inode->u.ext3_i.i_prealloc_count;
+-              unsigned long block = inode->u.ext3_i.i_prealloc_block;
+-              inode->u.ext3_i.i_prealloc_count = 0;
+-              inode->u.ext3_i.i_prealloc_block = 0;
++      if (ei->i_prealloc_count) {
++              unsigned short total = ei->i_prealloc_count;
++              unsigned long block = ei->i_prealloc_block;
++              ei->i_prealloc_count = 0;
++              ei->i_prealloc_block = 0;
+               /* Writer: end */
+               ext3_free_blocks (inode, block, total);
+       }
+@@ -243,13 +244,15 @@ static int ext3_alloc_block (handle_t *h
+       unsigned long result;
+ #ifdef EXT3_PREALLOCATE
++      struct ext3_inode_info *ei = EXT3_I(inode);
++
+       /* Writer: ->i_prealloc* */
+-      if (inode->u.ext3_i.i_prealloc_count &&
+-          (goal == inode->u.ext3_i.i_prealloc_block ||
+-           goal + 1 == inode->u.ext3_i.i_prealloc_block))
++      if (ei->i_prealloc_count &&
++          (goal == ei->i_prealloc_block ||
++           goal + 1 == ei->i_prealloc_block))
+       {
+-              result = inode->u.ext3_i.i_prealloc_block++;
+-              inode->u.ext3_i.i_prealloc_count--;
++              result = ei->i_prealloc_block++;
++              ei->i_prealloc_count--;
+               /* Writer: end */
+               ext3_debug ("preallocation hit (%lu/%lu).\n",
+                           ++alloc_hits, ++alloc_attempts);
+@@ -259,8 +262,8 @@ static int ext3_alloc_block (handle_t *h
+                           alloc_hits, ++alloc_attempts);
+               if (S_ISREG(inode->i_mode))
+                       result = ext3_new_block (inode, goal, 
+-                               &inode->u.ext3_i.i_prealloc_count,
+-                               &inode->u.ext3_i.i_prealloc_block, err);
++                               &ei->i_prealloc_count,
++                               &ei->i_prealloc_block, err);
+               else
+                       result = ext3_new_block (inode, goal, 0, 0, err);
+               /*
+@@ -394,7 +397,7 @@ static Indirect *ext3_get_branch(struct 
+       *err = 0;
+       /* i_data is not going away, no lock needed */
+-      add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
++      add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
+       if (!p->key)
+               goto no_block;
+       while (--depth) {
+@@ -437,7 +440,8 @@ no_block:
+ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+ {
+-      u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
+       u32 *p;
+       /* Try to find previous block */
+@@ -453,9 +456,8 @@ static inline unsigned long ext3_find_ne
+        * It is going to be refered from inode itself? OK, just put it into
+        * the same cylinder group then.
+        */
+-      return (inode->u.ext3_i.i_block_group * 
+-              EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+-             le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
++      return (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++             le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+ }
+ /**
+@@ -474,14 +477,15 @@
+ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+                         Indirect *partial, unsigned long *goal)
+ {
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       /* Writer: ->i_next_alloc* */
+-      if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+-              inode->u.ext3_i.i_next_alloc_block++;
+-              inode->u.ext3_i.i_next_alloc_goal++;
++      if (block == ei->i_next_alloc_block + 1) {
++              ei->i_next_alloc_block++;
++              ei->i_next_alloc_goal++;
+       }
+ #ifdef SEARCH_FROM_ZERO
+-      inode->u.ext3_i.i_next_alloc_block = 0;
+-      inode->u.ext3_i.i_next_alloc_goal = 0;
++      ei->i_next_alloc_block = 0;
++      ei->i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+       /* Reader: pointers, ->i_next_alloc* */
+@@ -490,8 +493,8 @@ static int ext3_find_goal(struct inode *
+                * try the heuristic for sequential allocation,
+                * failing that at least try to get decent locality.
+                */
+-              if (block == inode->u.ext3_i.i_next_alloc_block)
+-                      *goal = inode->u.ext3_i.i_next_alloc_goal;
++              if (block == ei->i_next_alloc_block)
++                      *goal = ei->i_next_alloc_goal;
+               if (!*goal)
+                       *goal = ext3_find_near(inode, partial);
+ #ifdef SEARCH_FROM_ZERO
+@@ -619,6 +621,7 @@
+ {
+       int i;
+       int err = 0;
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       /*
+        * If we're splicing into a [td]indirect block (as opposed to the
+@@ -641,11 +644,11 @@ static int ext3_splice_branch(handle_t *
+       /* That's it */
+       *where->p = where->key;
+-      inode->u.ext3_i.i_next_alloc_block = block;
+-      inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
++      ei->i_next_alloc_block = block;
++      ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ #ifdef SEARCH_FROM_ZERO
+-      inode->u.ext3_i.i_next_alloc_block = 0;
+-      inode->u.ext3_i.i_next_alloc_goal = 0;
++      ei->i_next_alloc_block = 0;
++      ei->i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+@@ -729,6 +732,7 @@
+       unsigned long goal;
+       int left;
+       int depth = ext3_block_to_path(inode, iblock, offsets);
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       loff_t new_size;
+       J_ASSERT(handle != NULL || create == 0);
+@@ -782,7 +785,7 @@ out:
+       /*
+        * Block out ext3_truncate while we alter the tree
+        */
+-      down_read(&inode->u.ext3_i.truncate_sem);
++      down_read(&ei->truncate_sem);
+       err = ext3_alloc_branch(handle, inode, left, goal,
+                                       offsets+(partial-chain), partial);
+@@ -794,7 +797,7 @@ out:
+       if (!err)
+               err = ext3_splice_branch(handle, inode, iblock, chain,
+                                        partial, left);
+-      up_read(&inode->u.ext3_i.truncate_sem);
++      up_read(&ei->truncate_sem);
+       if (err == -EAGAIN)
+               goto changed;
+       if (err)
+@@ -807,8 +810,8 @@ out:
+        * truncate is in progress.  It is racy between multiple parallel
+        * instances of get_block, but we have the BKL.
+        */
+-      if (new_size > inode->u.ext3_i.i_disksize)
+-              inode->u.ext3_i.i_disksize = new_size;
++      if (new_size > ei->i_disksize)
++              ei->i_disksize = new_size;
+       bh_result->b_state |= (1UL << BH_New);
+       goto got_it;
+@@ -921,7 +924,7 @@ struct buffer_head *ext3_bread(handle_t 
+               struct buffer_head *tmp_bh;
+               for (i = 1;
+-                   inode->u.ext3_i.i_prealloc_count &&
++                   EXT3_I(inode)->i_prealloc_count &&
+                    i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
+                    i++) {
+                       /*
+@@ -1131,8 +1134,8 @@ static int ext3_commit_write(struct file
+                       kunmap(page);
+               }
+       }
+-      if (inode->i_size > inode->u.ext3_i.i_disksize) {
+-              inode->u.ext3_i.i_disksize = inode->i_size;
++      if (inode->i_size > EXT3_I(inode)->i_disksize) {
++              EXT3_I(inode)->i_disksize = inode->i_size;
+               ret2 = ext3_mark_inode_dirty(handle, inode);
+               if (!ret) 
+                       ret = ret2;
+@@ -1832,7 +1835,8 @@ static void ext3_free_branches(handle_t 
+ void ext3_truncate(struct inode * inode)
+ {
+       handle_t *handle;
+-      u32 *i_data = inode->u.ext3_i.i_data;
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      u32 *i_data = EXT3_I(inode)->i_data;
+       int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+       int offsets[4];
+       Indirect chain[4];
+@@ -1884,13 +1887,13 @@ void ext3_truncate(struct inode * inode)
+        * on-disk inode. We do this via i_disksize, which is the value which
+        * ext3 *really* writes onto the disk inode.
+        */
+-      inode->u.ext3_i.i_disksize = inode->i_size;
++      ei->i_disksize = inode->i_size;
+       /*
+        * From here we block out all ext3_get_block() callers who want to
+        * modify the block allocation tree.
+        */
+-      down_write(&inode->u.ext3_i.truncate_sem);
++      down_write(&ei->truncate_sem);
+       if (n == 1) {           /* direct blocks */
+               ext3_free_data(handle, inode, NULL, i_data+offsets[0],
+@@ -1954,7 +1957,7 @@ do_indirects:
+               case EXT3_TIND_BLOCK:
+                       ;
+       }
+-      up_write(&inode->u.ext3_i.truncate_sem);
++      up_write(&ei->truncate_sem);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+@@ -1983,6 +1986,8 @@ out_stop:
+ int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
+ {
++      struct super_block *sb = inode->i_sb;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
+       struct buffer_head *bh = 0;
+       unsigned long block;
+       unsigned long block_group;
+@@ -1997,23 +2010,19 @@ int ext3_get_inode_loc (struct inode *in
+               inode->i_ino != EXT3_JOURNAL_INO &&
+-              inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+-              inode->i_ino > le32_to_cpu(
+-                      inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
+-              ext3_error (inode->i_sb, "ext3_get_inode_loc",
+-                          "bad inode number: %lu", inode->i_ino);
++              inode->i_ino < EXT3_FIRST_INO(sb)) ||
++              inode->i_ino > le32_to_cpu(sbi->s_es->s_inodes_count)) {
++              ext3_error (sb, __FUNCTION__, "bad inode #%lu", inode->i_ino);
+               goto bad_inode;
+       }
+-      block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
+-      if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
+-              ext3_error (inode->i_sb, "ext3_get_inode_loc",
+-                          "group >= groups count");
++      block_group = (inode->i_ino - 1) / sbi->s_inodes_per_group;
++      if (block_group >= sbi->s_groups_count) {
++              ext3_error(sb, __FUNCTION__, "group >= groups count");
+               goto bad_inode;
+       }
+-      group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
+-      desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
+-      bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
++      group_desc = block_group >> sbi->s_desc_per_block_bits;
++      desc = block_group & (sbi->s_desc_per_block - 1);
++      bh = sbi->s_group_desc[group_desc];
+       if (!bh) {
+-              ext3_error (inode->i_sb, "ext3_get_inode_loc",
+-                          "Descriptor not loaded");
++              ext3_error(sb, __FUNCTION__, "Descriptor not loaded");
+               goto bad_inode;
+       }
+@@ -2021,17 +2022,17 @@ int ext3_get_inode_loc (struct inode *in
+       /*
+        * Figure out the offset within the block group inode table
+        */
+-      offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
+-              EXT3_INODE_SIZE(inode->i_sb);
++      offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group) *
++              sbi->s_inode_size;
+       block = le32_to_cpu(gdp[desc].bg_inode_table) +
+-              (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+-      if (!(bh = sb_bread(inode->i_sb, block))) {
+-              ext3_error (inode->i_sb, "ext3_get_inode_loc",
++              (offset >> EXT3_BLOCK_SIZE_BITS(sb));
++      if (!(bh = sb_bread(sb, block))) {
++              ext3_error (sb, __FUNCTION__,
+                           "unable to read inode block - "
+                           "inode=%lu, block=%lu", inode->i_ino, block);
+               goto bad_inode;
+       }
+-      offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
++      offset &= (EXT3_BLOCK_SIZE(sb) - 1);
+       iloc->bh = bh;
+       iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
+@@ -2047,6 +2048,7 @@ void ext3_read_inode(struct inode * inod
+ {
+       struct ext3_iloc iloc;
+       struct ext3_inode *raw_inode;
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       struct buffer_head *bh;
+       int block;
+       
+@@ -2054,7 +2056,7 @@ void ext3_read_inode(struct inode * inod
+               goto bad_inode;
+       bh = iloc.bh;
+       raw_inode = iloc.raw_inode;
+-      init_rwsem(&inode->u.ext3_i.truncate_sem);
++      init_rwsem(&ei->truncate_sem);
+       inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+       inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+       inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+@@ -2067,7 +2069,7 @@ void ext3_read_inode(struct inode * inod
+       inode->i_atime = le32_to_cpu(raw_inode->i_atime);
+       inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
+       inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
+-      inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
++      ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+       /* We now have enough fields to check if the inode was active or not.
+        * This is needed because nfsd might try to access dead inodes
+        * the test is that same one that e2fsck uses
+@@ -2075,7 +2077,7 @@ void ext3_read_inode(struct inode * inod
+        */
+       if (inode->i_nlink == 0) {
+               if (inode->i_mode == 0 ||
+-                  !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
++                  !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
+                       /* this inode is deleted */
+                       brelse (bh);
+                       goto bad_inode;
+@@ -2090,33 +2092,33 @@ void ext3_read_inode(struct inode * inod
+                                        * size */  
+       inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+       inode->i_version = ++event;
+-      inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
++      ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ #ifdef EXT3_FRAGMENTS
+-      inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
+-      inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
+-      inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
++      ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
++      ei->i_frag_no = raw_inode->i_frag;
++      ei->i_frag_size = raw_inode->i_fsize;
+ #endif
+-      inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
++      ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+-              inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
++              ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+       } else {
+               inode->i_size |=
+                       ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+       }
+-      inode->u.ext3_i.i_disksize = inode->i_size;
++      ei->i_disksize = inode->i_size;
+       inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ #ifdef EXT3_PREALLOCATE
+-      inode->u.ext3_i.i_prealloc_count = 0;
++      ei->i_prealloc_count = 0;
+ #endif
+-      inode->u.ext3_i.i_block_group = iloc.block_group;
++      ei->i_block_group = iloc.block_group;
+       /*
+        * NOTE! The in-memory inode i_data array is in little-endian order
+        * even on big-endian machines: we do NOT byteswap the block numbers!
+        */
+       for (block = 0; block < EXT3_N_BLOCKS; block++)
+-              inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
+-      INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++              ei->i_data[block] = iloc.raw_inode->i_block[block];
++      INIT_LIST_HEAD(&ei->i_orphan);
+       brelse (iloc.bh);
+@@ -2143,17 +2145,17 @@ void ext3_read_inode(struct inode * inod
+       /* inode->i_attr_flags = 0;                             unused */
+-      if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
++      if (ei->i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+               inode->i_flags |= S_SYNC;
+       }
+-      if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
++      if (ei->i_flags & EXT3_APPEND_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_APPEND;     unused */
+               inode->i_flags |= S_APPEND;
+       }
+-      if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) {
++      if (ei->i_flags & EXT3_IMMUTABLE_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE;  unused */
+               inode->i_flags |= S_IMMUTABLE;
+       }
+-      if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
++      if (ei->i_flags & EXT3_NOATIME_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_NOATIME;    unused */
+               inode->i_flags |= S_NOATIME;
+       }
+@@ -2175,6 +2177,7 @@ static int ext3_do_update_inode(handle_t
+                               struct ext3_iloc *iloc)
+ {
+       struct ext3_inode *raw_inode = iloc->raw_inode;
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       struct buffer_head *bh = iloc->bh;
+       int err = 0, rc, block;
+@@ -2192,7 +2195,7 @@ static int ext3_do_update_inode(handle_t
+  * Fix up interoperability with old kernels. Otherwise, old inodes get
+  * re-used with the upper 16 bits of the uid/gid intact
+  */
+-              if(!inode->u.ext3_i.i_dtime) {
++              if(!ei->i_dtime) {
+                       raw_inode->i_uid_high =
+                               cpu_to_le16(high_16_bits(inode->i_uid));
+                       raw_inode->i_gid_high =
+@@ -2210,34 +2213,33 @@ static int ext3_do_update_inode(handle_t
+               raw_inode->i_gid_high = 0;
+       }
+       raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+-      raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
++      raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+       raw_inode->i_atime = cpu_to_le32(inode->i_atime);
+       raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
+       raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
+       raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+-      raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
+-      raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
++      raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
++      raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ #ifdef EXT3_FRAGMENTS
+-      raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
+-      raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
+-      raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
++      raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
++      raw_inode->i_frag = ei->i_frag_no;
++      raw_inode->i_fsize = ei->i_frag_size;
+ #else
+       /* If we are not tracking these fields in the in-memory inode,
+        * then preserve them on disk, but still initialise them to zero
+        * for new inodes. */
+-      if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
++      if (ei->i_state & EXT3_STATE_NEW) {
+               raw_inode->i_faddr = 0;
+               raw_inode->i_frag = 0;
+               raw_inode->i_fsize = 0;
+       }
+ #endif
+-      raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
++      raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+-              raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
++              raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+       } else {
+-              raw_inode->i_size_high =
+-                      cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
+-              if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
++              raw_inode->i_size_high = cpu_to_le32(ei->i_disksize >> 32);
++              if (ei->i_disksize > MAX_NON_LFS) {
+                       struct super_block *sb = inode->i_sb;
+                       if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
+@@ -2247,7 +2249,7 @@ static int ext3_do_update_inode(handle_t
+                               * created, add a flag to the superblock.
+                               */
+                               err = ext3_journal_get_write_access(handle,
+-                                              sb->u.ext3_sb.s_sbh);
++                                              EXT3_SB(sb)->s_sbh);
+                               if (err)
+                                       goto out_brelse;
+                               ext3_update_dynamic_rev(sb);
+@@ -2256,7 +2258,7 @@ static int ext3_do_update_inode(handle_t
+                               sb->s_dirt = 1;
+                               handle->h_sync = 1;
+                               err = ext3_journal_dirty_metadata(handle,
+-                                              sb->u.ext3_sb.s_sbh);
++                                              EXT3_SB(sb)->s_sbh);
+                       }
+               }
+       }
+@@ -2265,13 +2267,13 @@ static int ext3_do_update_inode(handle_t
+               raw_inode->i_block[0] =
+                       cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
+       else for (block = 0; block < EXT3_N_BLOCKS; block++)
+-              raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
++              raw_inode->i_block[block] = ei->i_data[block];
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+       rc = ext3_journal_dirty_metadata(handle, bh);
+       if (!err)
+               err = rc;
+-      EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
++      ei->i_state &= ~EXT3_STATE_NEW;
+ out_brelse:
+       brelse (bh);
+@@ -2379,7 +2381,7 @@ int ext3_setattr(struct dentry *dentry, 
+               }
+               
+               error = ext3_orphan_add(handle, inode);
+-              inode->u.ext3_i.i_disksize = attr->ia_size;
++              EXT3_I(inode)->i_disksize = attr->ia_size;
+               rc = ext3_mark_inode_dirty(handle, inode);
+               if (!error)
+                       error = rc;
+@@ -2622,9 +2624,9 @@ int ext3_change_inode_journal_flag(struc
+        */
+       if (val)
+-              inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
++              EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
+       else
+-              inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
++              EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
+       journal_unlock_updates(journal);
+--- ./fs/ext3/ioctl.c.orig     Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/ioctl.c  Tue May  7 15:20:52 2002
+@@ -18,13 +18,14 @@
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+               unsigned long arg)
+ {
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       unsigned int flags;
+       ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+       switch (cmd) {
+       case EXT3_IOC_GETFLAGS:
+-              flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
++              flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
+               return put_user(flags, (int *) arg);
+       case EXT3_IOC_SETFLAGS: {
+               handle_t *handle = NULL;
+@@ -42,7 +42,7 @@ int ext3_ioctl (struct inode * inode, st
+               if (get_user(flags, (int *) arg))
+                       return -EFAULT;
+-              oldflags = inode->u.ext3_i.i_flags;
++              oldflags = ei->i_flags;
+               /* The JOURNAL_DATA flag is modifiable only by root */
+               jflag = flags & EXT3_JOURNAL_DATA_FL;
+@@ -79,7 +79,7 @@ int ext3_ioctl (struct inode * inode, st
+               
+               flags = flags & EXT3_FL_USER_MODIFIABLE;
+               flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
+-              inode->u.ext3_i.i_flags = flags;
++              ei->i_flags = flags;
+               if (flags & EXT3_SYNC_FL)
+                       inode->i_flags |= S_SYNC;
+@@ -155,12 +155,12 @@ flags_err:
+                       int ret = 0;
+                       set_current_state(TASK_INTERRUPTIBLE);
+-                      add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+-                      if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
++                      add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
++                      if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
+                               schedule();
+                               ret = 1;
+                       }
+-                      remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
++                      remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
+                       return ret;
+               }
+ #endif
+--- ./fs/ext3/namei.c.orig     Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/namei.c  Tue May  7 16:05:51 2002
+@@ -1430,8 +1430,8 @@ int ext3_orphan_add(handle_t *handle, st
+       J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+               S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+-      err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+       if (err)
+               goto out_unlock;
+       
+@@ -1442,7 +1442,7 @@ int ext3_orphan_add(handle_t *handle, st
+       /* Insert this inode at the head of the on-disk orphan list... */
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+       EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+-      err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
++      err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+       rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       if (!err)
+               err = rc;
+@@ -1520,8 +1520,7 @@ int ext3_orphan_del(handle_t *handle, st
+               err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+       } else {
+               struct ext3_iloc iloc2;
+-              struct inode *i_prev =
+-                      list_entry(prev, struct inode, u.ext3_i.i_orphan);
++              struct inode *i_prev = orphan_list_entry(prev);
+               jbd_debug(4, "orphan inode %lu will point to %lu\n",
+                         i_prev->i_ino, ino_next);
+--- ./fs/ext3/super.c.orig     Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/super.c  Tue May  7 16:05:44 2002
+@@ -121,7 +121,7 @@ static int ext3_error_behaviour(struct s
+       /* If no overrides were specified on the mount, then fall back
+        * to the default behaviour set in the filesystem's superblock
+        * on disk. */
+-      switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
++      switch (le16_to_cpu(EXT3_SB(sb)->s_es->s_errors)) {
+       case EXT3_ERRORS_PANIC:
+               return EXT3_ERRORS_PANIC;
+       case EXT3_ERRORS_RO:
+@@ -269,9 +269,9 @@ void ext3_abort (struct super_block * sb
+               return;
+       
+       printk (KERN_CRIT "Remounting filesystem read-only\n");
+-      sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
++      EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+       sb->s_flags |= MS_RDONLY;
+-      sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
++      EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+       journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ }
+@@ -377,8 +377,6 @@ static int ext3_blkdev_remove(struct ext3
+       return ret;
+ }
+-#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
+-
+ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
+ {
+       struct list_head *l;
+@@ -818,7 +818,7 @@ static void ext3_orphan_cleanup (struct 
+               sb->s_flags &= ~MS_RDONLY;
+       }
+-      if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
++      if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
+               if (es->s_last_orphan)
+                       jbd_debug(1, "Errors on filesystem, "
+                                 "clearing orphan list.\n");
+@@ -1463,12 +1463,14 @@ static void ext3_commit_super (struct su
+                              struct ext3_super_block * es,
+                              int sync)
+ {
++      struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
++
+       es->s_wtime = cpu_to_le32(CURRENT_TIME);
+-      BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
+-      mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
++      BUFFER_TRACE(sbh, "marking dirty");
++      mark_buffer_dirty(sbh);
+       if (sync) {
+-              ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
+-              wait_on_buffer(sb->u.ext3_sb.s_sbh);
++              ll_rw_block(WRITE, 1, &sbh);
++              wait_on_buffer(sbh);
+       }
+ }
+@@ -1519,7 +1521,7 @@ static void ext3_clear_journal_err(struc
+               ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
+                            "filesystem check.");
+               
+-              sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
++              EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+               es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+               ext3_commit_super (sb, es, 1);
+--- ./fs/ext3/symlink.c.orig   Fri Apr 12 10:27:49 2002
++++ ./fs/ext3/symlink.c        Tue May  7 15:25:39 2002
+@@ -23,13 +23,13 @@
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+-      char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+-      return vfs_readlink(dentry, buffer, buflen, s);
++      struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); /* reach ext3 private data via the accessor, not inode->u directly */
++      return vfs_readlink(dentry, buffer, buflen, (char *)ei->i_data); /* i_data is reinterpreted as a char string for vfs_readlink() */
+ }
+ static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+-      char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+-      return vfs_follow_link(nd, s);
++      struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); /* reach ext3 private data via the accessor, not inode->u directly */
++      return vfs_follow_link(nd, (char*)ei->i_data); /* i_data is reinterpreted as a char string for vfs_follow_link() */
+ }
+--- ./include/linux/ext3_fs.h.orig     Tue Apr 16 14:27:25 2002
++++ ./include/linux/ext3_fs.h  Tue May  7 16:47:36 2002
+@@ -84,22 +84,25 @@
+ #define EXT3_MIN_BLOCK_SIZE           1024
+ #define       EXT3_MAX_BLOCK_SIZE             4096
+ #define EXT3_MIN_BLOCK_LOG_SIZE                 10
++
+ #ifdef __KERNEL__
+-# define EXT3_BLOCK_SIZE(s)           ((s)->s_blocksize)
+-#else
+-# define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+-#endif
+-#define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+-#ifdef __KERNEL__
+-# define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+-#else
+-# define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_log_block_size + 10)
+-#endif
+-#ifdef __KERNEL__
+-#define       EXT3_ADDR_PER_BLOCK_BITS(s)     ((s)->u.ext3_sb.s_addr_per_block_bits)
+-#define EXT3_INODE_SIZE(s)            ((s)->u.ext3_sb.s_inode_size)
+-#define EXT3_FIRST_INO(s)             ((s)->u.ext3_sb.s_first_ino)
++#define EXT3_SB(sb)   (&((sb)->u.ext3_sb))
++#define EXT3_I(inode) (&((inode)->u.ext3_i))
++
++#define EXT3_BLOCK_SIZE(s)            ((s)->s_blocksize)
++#define EXT3_BLOCK_SIZE_BITS(s)               ((s)->s_blocksize_bits)
++#define       EXT3_ADDR_PER_BLOCK_BITS(s)     (EXT3_SB(s)->s_addr_per_block_bits)
++#define EXT3_INODE_SIZE(s)            (EXT3_SB(s)->s_inode_size)
++#define EXT3_FIRST_INO(s)             (EXT3_SB(s)->s_first_ino)
+ #else
++
++/* Assume that user mode programs are passing in an ext3fs superblock, not
++ * a kernel struct super_block.  This will allow us to call the feature-test
++ * macros from user land. */
++#define EXT3_SB(sb)   (sb)
++
++#define EXT3_BLOCK_SIZE(s)    (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
++#define EXT3_BLOCK_SIZE_BITS(s)       ((s)->s_log_block_size + 10)
+ #define EXT3_INODE_SIZE(s)    (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+                                EXT3_GOOD_OLD_INODE_SIZE : \
+                                (s)->s_inode_size)
+@@ -108,6 +110,7 @@
+                                EXT3_GOOD_OLD_FIRST_INO : \
+                                (s)->s_first_ino)
+ #endif
++#define EXT3_ADDR_PER_BLOCK(s)        (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ /*
+  * Macro-instructions used to manage fragments
+@@ -116,8 +120,8 @@
+ #define       EXT3_MAX_FRAG_SIZE              4096
+ #define EXT3_MIN_FRAG_LOG_SIZE                  10
+ #ifdef __KERNEL__
+-# define EXT3_FRAG_SIZE(s)            ((s)->u.ext3_sb.s_frag_size)
+-# define EXT3_FRAGS_PER_BLOCK(s)      ((s)->u.ext3_sb.s_frags_per_block)
++# define EXT3_FRAG_SIZE(s)            (EXT3_SB(s)->s_frag_size)
++# define EXT3_FRAGS_PER_BLOCK(s)      (EXT3_SB(s)->s_frags_per_block)
+ #else
+ # define EXT3_FRAG_SIZE(s)            (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)      (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
+@@ -163,15 +167,13 @@
+ /*
+  * Macro-instructions used to manage group descriptors
+  */
++# define EXT3_BLOCKS_PER_GROUP(s)     (EXT3_SB(s)->s_blocks_per_group)
++# define EXT3_INODES_PER_GROUP(s)     (EXT3_SB(s)->s_inodes_per_group)
+ #ifdef __KERNEL__
+-# define EXT3_BLOCKS_PER_GROUP(s)     ((s)->u.ext3_sb.s_blocks_per_group)
+-# define EXT3_DESC_PER_BLOCK(s)               ((s)->u.ext3_sb.s_desc_per_block)
+-# define EXT3_INODES_PER_GROUP(s)     ((s)->u.ext3_sb.s_inodes_per_group)
+-# define EXT3_DESC_PER_BLOCK_BITS(s)  ((s)->u.ext3_sb.s_desc_per_block_bits)
++# define EXT3_DESC_PER_BLOCK(s)               (EXT3_SB(s)->s_desc_per_block)
++# define EXT3_DESC_PER_BLOCK_BITS(s)  (EXT3_SB(s)->s_desc_per_block_bits)
+ #else
+-# define EXT3_BLOCKS_PER_GROUP(s)     ((s)->s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)               (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
+-# define EXT3_INODES_PER_GROUP(s)     ((s)->s_inodes_per_group)
+ #endif
+ /*
+@@ -344,7 +347,7 @@
+ #ifndef _LINUX_EXT2_FS_H
+ #define clear_opt(o, opt)             o &= ~EXT3_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT3_MOUNT_##opt
+-#define test_opt(sb, opt)             ((sb)->u.ext3_sb.s_mount_opt & \
++#define test_opt(sb, opt)             (EXT3_SB(sb)->s_mount_opt & \
+                                        EXT3_MOUNT_##opt)
+ #else
+ #define EXT2_MOUNT_NOLOAD             EXT3_MOUNT_NOLOAD
+@@ -441,17 +443,11 @@
+ /*EC*/        __u32   s_reserved[197];        /* Padding to the end of the block */
+ };
+-#ifdef __KERNEL__
+-#define EXT3_SB(sb)   (&((sb)->u.ext3_sb))
+-#define EXT3_I(inode) (&((inode)->u.ext3_i))
+-#else
+-/* Assume that user mode programs are passing in an ext3fs superblock, not
+- * a kernel struct super_block.  This will allow us to call the feature-test
+- * macros from user land. */
+-#define EXT3_SB(sb)   (sb)
+-#endif
+-
+-#define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime
++#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
++static inline struct inode *orphan_list_entry(struct list_head *l) /* l must be the u.ext3_i.i_orphan node of a struct inode */
++{
++      return list_entry(l, struct inode, u.ext3_i.i_orphan); /* recover the struct inode that embeds this list node */
++}
+ /*
+  * Codes for operating systems
+--- ./include/linux/ext3_jbd.h.orig    Tue May  7 14:44:08 2002
++++ ./include/linux/ext3_jbd.h Tue May  7 14:44:43 2002
+@@ -291,7 +291,7 @@
+               return 1;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+               return 1;
+-      if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL)
++      if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+               return 1;
+       return 0;
+ }
diff --git a/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..7cd3384
--- /dev/null
@@ -0,0 +1,19 @@
+ fs/ext3/namei.c |    2 +-
+ 1 files changed, 1 insertion(+), 1 deletion(-)
+
+diff -puN fs/ext3/namei.c~ext3-compat-2.4.18-chaos fs/ext3/namei.c
+--- linux-2.4.18/fs/ext3/namei.c~ext3-compat-2.4.18-chaos      2003-08-28 20:14:27.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/namei.c        2003-08-28 20:14:27.000000000 +0400
+@@ -830,9 +830,9 @@ static int ext3_rmdir (struct inode * di
+        * recovery. */
+       inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+-      ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++      ext3_mark_inode_dirty(handle, inode);
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch
new file mode 100644 (file)
index 0000000..a173981
--- /dev/null
@@ -0,0 +1,478 @@
+
+Create a service thread to handle delete and truncate of inodes, to avoid
+long latency while truncating very large files.
+
+
+ fs/ext3/inode.c            |  116 ++++++++++++++++++++++
+ fs/ext3/super.c            |  231 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/ext3_fs.h    |    5 
+ include/linux/ext3_fs_sb.h |   10 +
+ 4 files changed, 362 insertions(+)
+
+--- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18  Tue Jun  3 17:26:21 2003
++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c    Wed Jul  2 23:49:40 2003
+@@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe
+       }
+ }
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ *
++ * 'data' is the struct super_block this thread serves.
++ *
++ * Startup: the thread drops its mm and files, reparents itself to init,
++ * names itself "kdelext3-<dev>", blocks all signals, initialises
++ * s_delete_list and wakes s_delete_waiter_queue.
++ *
++ * Main loop: sleep on s_delete_thread_queue until the list is non-empty or
++ * the ASYNCDEL mount option has been cleared.  If the list is empty when we
++ * wake up, clear ASYNCDEL, zero the list head (so s_delete_list.next reads
++ * as 0), wake s_delete_waiter_queue and leave the loop.  Otherwise unlink
++ * each queued inode (queued through i_dentry) and iput() it with
++ * s_delete_lock dropped, then subtract its blocks/inode from the
++ * s_delete_blocks/s_delete_inodes counters.  Counters that are not zero once
++ * the list has been drained are reported and reset.
++ *
++ * Always returns 0.
++ */
++int ext3_delete_thread(void *data)
++{
++      struct super_block *sb = data;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct task_struct *tsk = current;
++
++      /* Almost like daemonize, but not quite */
++      exit_mm(current);
++      tsk->session = 1;
++      tsk->pgrp = 1;
++      tsk->tty = NULL;
++      exit_files(current);
++      reparent_to_init();
++
++      sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++      sigfillset(&tsk->blocked);
++
++      /*tsk->flags |= PF_KERNTHREAD;*/
++
++      INIT_LIST_HEAD(&sbi->s_delete_list);
++      wake_up(&sbi->s_delete_waiter_queue);
++      ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
++
++      /* main loop */
++      for (;;) {
++              wait_event_interruptible(sbi->s_delete_thread_queue,
++                                       !list_empty(&sbi->s_delete_list) ||
++                                       !test_opt(sb, ASYNCDEL));
++              ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
++                         tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
++
++              spin_lock(&sbi->s_delete_lock);
++              if (list_empty(&sbi->s_delete_list)) {
++                      clear_opt(sbi->s_mount_opt, ASYNCDEL);
++                      memset(&sbi->s_delete_list, 0,
++                             sizeof(sbi->s_delete_list));
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("delete thread on %s exiting\n",
++                                 kdevname(sb->s_dev));
++                      wake_up(&sbi->s_delete_waiter_queue);
++                      break;
++              }
++
++              while (!list_empty(&sbi->s_delete_list)) {
++                      struct inode *inode=list_entry(sbi->s_delete_list.next,
++                                                     struct inode, i_dentry);
++                      unsigned long blocks = inode->i_blocks >>
++                                                      (inode->i_blkbits - 9);
++
++                      list_del_init(&inode->i_dentry);
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("%s delete ino %lu blk %lu\n",
++                                 tsk->comm, inode->i_ino, blocks);
++
++                      iput(inode);    /* called with s_delete_lock dropped */
++
++                      spin_lock(&sbi->s_delete_lock);
++                      sbi->s_delete_blocks -= blocks;
++                      sbi->s_delete_inodes--;
++              }
++              if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "%lu blocks, %lu inodes on list?\n",
++                                   sbi->s_delete_blocks,sbi->s_delete_inodes);
++                      sbi->s_delete_blocks = 0;
++                      sbi->s_delete_inodes = 0;
++              }
++              spin_unlock(&sbi->s_delete_lock);
++              wake_up(&sbi->s_delete_waiter_queue);
++      }
++
++      return 0;
++}
++
++/*
++ * Prepare the delete-thread lock and wait queues of this superblock and,
++ * when the ASYNCDEL mount option is set, spawn ext3_delete_thread() and
++ * wait until it has initialised s_delete_list.  A failure to create the
++ * thread is only logged.
++ */
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      int pid;
++
++      init_waitqueue_head(&sbi->s_delete_waiter_queue);
++      init_waitqueue_head(&sbi->s_delete_thread_queue);
++      spin_lock_init(&sbi->s_delete_lock);
++
++      if (test_opt(sb, ASYNCDEL)) {
++              pid = kernel_thread(ext3_delete_thread, sb,
++                                  CLONE_VM | CLONE_FILES);
++              if (pid >= 0) {
++                      /* the thread sets the list head, then wakes us */
++                      wait_event(sbi->s_delete_waiter_queue,
++                                 sbi->s_delete_list.next);
++              } else {
++                      printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++                             pid);
++              }
++      }
++}
++
++/*
++ * Ask the delete thread to exit and wait until it has actually done so.
++ *
++ * Does nothing if the thread is not running (s_delete_list.next is 0,
++ * either because it was never started or because it already exited).
++ * Otherwise clears the ASYNCDEL mount option, wakes the thread and sleeps
++ * on s_delete_waiter_queue.
++ */
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++      if (sbi->s_delete_list.next == 0)       /* thread never started */
++              return;
++
++      clear_opt(sbi->s_mount_opt, ASYNCDEL);
++      wake_up(&sbi->s_delete_thread_queue);
++      /* ext3_delete_thread() zeroes the list head right before it exits,
++       * so wait for that rather than for list_empty(): an initialised but
++       * empty list would satisfy list_empty() while the thread is still
++       * running, and a zeroed head never satisfies it at all.
++       */
++      wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next == 0);
++}
++
++/* Instead of playing games with the inode flags, destruction, etc we just
++ * create a new inode locally and put it on a list for the truncate thread.
++ * We need large parts of the inode struct in order to complete the
++ * truncate and unlink, so we may as well just have a real inode to do it.
++ *
++ * If we have any problem deferring the delete, just delete it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * delete thread when we run out of space.
++ *
++ * A bad inode is simply cleared.  The inode is deleted synchronously through
++ * ext3_delete_inode() when ASYNCDEL is off, when the delete thread is not
++ * running (s_delete_list.next is 0), when the inode is IS_SYNC or has at
++ * most EXT3_NDIR_BLOCKS blocks, when it is already marked EXT3_STATE_DELETE,
++ * when EXT3_ORPHAN_FS is set in s_mount_state, or when the second iget()
++ * fails or returns a bad inode.
++ *
++ * Otherwise, under s_orphan_lock, a second in-core inode for the same inode
++ * number is obtained with iget() (EXT3_ORPHAN_FS is set in s_mount_state
++ * only for the duration of that call), it takes over old_inode's place on
++ * the orphan list and is marked EXT3_STATE_DELETE.  old_inode is then
++ * cleared, and the new inode is queued on s_delete_list via i_dentry with
++ * s_delete_blocks/s_delete_inodes updated under s_delete_lock, after which
++ * the delete thread is woken.
++ */
++static void ext3_delete_inode_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
++      struct inode *new_inode;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (is_bad_inode(old_inode)) {
++              clear_inode(old_inode);
++              return;
++      }
++
++      if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
++              goto out_delete;
++
++      /* We may want to delete the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
++              goto out_delete;
++
++      /* We can't use the delete thread as-is during real orphan recovery,
++       * as we add to the orphan list here, causing ext3_orphan_cleanup()
++       * to loop endlessly.  It would be nice to do so, but needs work.
++       */
++      if (oei->i_state & EXT3_STATE_DELETE ||
++          sbi->s_mount_state & EXT3_ORPHAN_FS) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              goto out_delete;
++      }
++
++      /* We can iget this inode again here, because our caller has unhashed
++       * old_inode, so new_inode will be in a different inode struct.
++       *
++       * We need to ensure that the i_orphan pointers in the other inodes
++       * point at the new inode copy instead of the old one so the orphan
++       * list doesn't get corrupted when the old orphan inode is freed.
++       */
++      down(&sbi->s_orphan_lock);
++
++      sbi->s_mount_state |= EXT3_ORPHAN_FS;
++      new_inode = iget(old_inode->i_sb, old_inode->i_ino);
++      sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
++      if (is_bad_inode(new_inode)) {
++              printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
++              iput(new_inode);
++              new_inode = NULL;
++      }
++      if (!new_inode) {
++              up(&sbi->s_orphan_lock);
++              ext3_debug("delete inode %lu directly (bad read)\n",
++                         old_inode->i_ino);
++              goto out_delete;
++      }
++      J_ASSERT(new_inode != old_inode);
++
++      J_ASSERT(!list_empty(&oei->i_orphan));
++
++      nei = EXT3_I(new_inode);
++      /* Ugh.  We need to insert new_inode into the same spot on the list
++       * as old_inode was, to ensure the in-memory orphan list is still
++       * in the same order as the on-disk orphan list (badness otherwise).
++       */
++      nei->i_orphan = oei->i_orphan;
++      nei->i_orphan.next->prev = &nei->i_orphan;
++      nei->i_orphan.prev->next = &nei->i_orphan;
++      nei->i_state |= EXT3_STATE_DELETE;
++      up(&sbi->s_orphan_lock);
++
++      clear_inode(old_inode);
++
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_dentry));
++      list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++                 new_inode->i_ino, blocks);
++
++      wake_up(&sbi->s_delete_thread_queue);
++      return;
++
++out_delete:
++      ext3_delete_inode(old_inode);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -403,6 +617,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_stop_delete_thread(sbi);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -451,7 +666,11 @@ static struct super_operations ext3_sops
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
++#ifdef EXT3_DELETE_THREAD
++      delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
++#else
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
++#endif
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+@@ -511,6 +730,14 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef EXT3_DELETE_THREAD
++              if (!strcmp(this_char, "asyncdel"))
++                      set_opt(*mount_options, ASYNCDEL);
++              else if (!strcmp(this_char, "noasyncdel"))
++                      clear_opt(*mount_options, ASYNCDEL);
++              else
++#endif
++
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st
+       }
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++      ext3_start_delete_thread(sb);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+@@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s
+       if (!parse_options(data, &tmp, sbi, &tmp, 1))
+               return -EINVAL;
++      if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
++              ext3_stop_delete_thread(sbi);
++
+       if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+               ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+--- linux/fs/ext3/file.c.orig  Fri Jan 17 10:57:31 2003
++++ linux/fs/ext3/file.c       Mon Jun 30 13:28:52 2003
+@@ -121,7 +121,11 @@ struct file_operations ext3_file_operati
+ };
+ struct inode_operations ext3_file_inode_operations = {
++#ifdef EXT3_DELETE_THREAD
++      truncate:       ext3_truncate_thread,   /* BKL held */
++#else
+       truncate:       ext3_truncate,          /* BKL held */
++#endif
+       setattr:        ext3_setattr,           /* BKL held */
+ };
+--- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18  Wed Jul  2 23:13:58 2003
++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c    Wed Jul  2 23:50:29 2003
+@@ -2004,6 +2004,118 @@ out_stop:
+       ext3_journal_stop(handle, inode);
+ }
++#ifdef EXT3_DELETE_THREAD
++/* Move blocks from to-be-truncated inode over to a new inode, and delete
++ * that one from the delete thread instead.  This avoids a lot of latency
++ * when truncating large files.
++ *
++ * If we have any problem deferring the truncate, just truncate it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * delete thread when we run out of space.
++ *
++ * The truncate is done synchronously through ext3_truncate() when ASYNCDEL
++ * is off, when the delete thread is not running (s_delete_list.next is 0),
++ * when the new size is not zero, when the inode is IS_SYNC or has at most
++ * EXT3_NDIR_BLOCKS blocks, when it is already marked EXT3_STATE_DELETE, when
++ * EXT3_ORPHAN_FS is set in s_mount_state, or when starting the journal
++ * handle, allocating the new inode or updating the orphan list fails.
++ *
++ * Otherwise a new inode with i_nlink == 0 is allocated, old_inode's i_data,
++ * size, block count, owner and i_disksize are moved to it under
++ * old_inode's truncate_sem, the new inode replaces old_inode on the orphan
++ * list, and it is queued on s_delete_list for the delete thread.
++ */
++void ext3_truncate_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
++      struct inode *new_inode;
++      handle_t *handle;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
++              goto out_truncate;
++
++      /* XXX This is a temporary limitation for code simplicity.
++       *     We could truncate to arbitrary sizes at some later time.
++       */
++      if (old_inode->i_size != 0)
++              goto out_truncate;
++
++      /* We may want to truncate the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++          old_inode->i_size > oei->i_disksize)
++              goto out_truncate;
++
++      /* We can't use the delete thread as-is during real orphan recovery,
++       * as we add to the orphan list here, causing ext3_orphan_cleanup()
++       * to loop endlessly.  It would be nice to do so, but needs work.
++       */
++      if (oei->i_state & EXT3_STATE_DELETE ||
++          sbi->s_mount_state & EXT3_ORPHAN_FS) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              goto out_truncate;
++      }
++
++      ext3_discard_prealloc(old_inode);
++
++      /* old_inode   = 1
++       * new_inode   = sb + GDT + ibitmap
++       * orphan list = 1 inode/superblock for add, 2 inodes for del
++       * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++       */
++      handle = ext3_journal_start(old_inode, 7);
++      if (IS_ERR(handle))
++              goto out_truncate;
++
++      new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
++      if (IS_ERR(new_inode)) {
++              ext3_debug("truncate inode %lu directly (no new inodes)\n",
++                         old_inode->i_ino);
++              goto out_journal;
++      }
++
++      nei = EXT3_I(new_inode);
++
++      down_write(&oei->truncate_sem);
++      new_inode->i_size = old_inode->i_size;
++      new_inode->i_blocks = old_inode->i_blocks;
++      new_inode->i_uid = old_inode->i_uid;
++      new_inode->i_gid = old_inode->i_gid;
++      new_inode->i_nlink = 0;
++
++      /* FIXME when we do arbitrary truncates */
++      old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
++      old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
++
++      memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
++      memset(oei->i_data, 0, sizeof(oei->i_data));
++
++      nei->i_disksize = oei->i_disksize;
++      nei->i_state |= EXT3_STATE_DELETE;
++      up_write(&oei->truncate_sem);
++
++      /* NOTE(review): on this failure the blocks have already been moved to
++       * new_inode, yet unlike the ext3_orphan_del() failure below there is
++       * no iput(new_inode) here - looks like a leaked inode; confirm.
++       */
++      if (ext3_orphan_add(handle, new_inode) < 0)
++              goto out_journal;
++
++      if (ext3_orphan_del(handle, old_inode) < 0) {
++              ext3_orphan_del(handle, new_inode);
++              iput(new_inode);
++              goto out_journal;
++      }
++
++      ext3_journal_stop(handle, old_inode);
++
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_dentry));
++      list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++                 new_inode->i_ino, blocks);
++
++      wake_up(&sbi->s_delete_thread_queue);
++      return;
++
++out_journal:
++      ext3_journal_stop(handle, old_inode);
++out_truncate:
++      ext3_truncate(old_inode);
++}
++#endif /* EXT3_DELETE_THREAD */
++
+ /* 
+  * ext3_get_inode_loc returns with an extra refcount against the
+  * inode's underlying buffer_head on success. 
+--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18  Tue Jun  3 17:26:20 2003
++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h    Wed Jul  2 23:19:09 2003
+@@ -190,6 +190,7 @@ struct ext3_group_desc
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE             0x00000010 /* deferred delete inode */
+ /*
+  * ioctl commands
+@@ -317,6 +318,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
++#define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+ extern void ext3_truncate (struct inode *);
++#ifdef EXT3_DELETE_THREAD
++extern void ext3_truncate_thread(struct inode *inode);
++#endif
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18       Tue Jun  3 17:26:21 2003
++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h Wed Jul  2 23:19:09 2003
+@@ -29,6 +29,8 @@
+ #define EXT3_MAX_GROUP_LOADED 32
++#define EXT3_DELETE_THREAD
++
+ /*
+  * third extended-fs super-block data in memory
+  */
+@@ -74,6 +76,14 @@ struct ext3_sb_info {
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++      spinlock_t s_delete_lock;
++      struct list_head s_delete_list;
++      unsigned long s_delete_blocks;
++      unsigned long s_delete_inodes;
++      wait_queue_head_t s_delete_thread_queue;
++      wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..d0c315b
--- /dev/null
@@ -0,0 +1,1831 @@
+ fs/ext3/Makefile           |    3 
+ fs/ext3/extents.c          | 1573 +++++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/ialloc.c           |    4 
+ fs/ext3/inode.c            |   26 
+ fs/ext3/super.c            |    9 
+ include/linux/ext3_fs.h    |   18 
+ include/linux/ext3_fs_i.h  |    4 
+ include/linux/ext3_fs_sb.h |   10 
+ 8 files changed, 1641 insertions(+), 6 deletions(-)
+
+diff -puN /dev/null fs/ext3/extents.c
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c        2003-08-25 21:11:58.000000000 +0400
+@@ -0,0 +1,1573 @@
++/*
++ *
++ * linux/fs/ext3/extents.c
++ *
++ * Extents support for EXT3
++ *
++ * 07/08/2003    Alex Tomas <bzzz@tmi.comex.ru>
++ * 
++ * TODO:
++ *   - ext3*_error() should be used in some situations
++ *   - find_goal() [to be tested and improved]
++ *   - error handling
++ *   - we could leak allocated block in some error cases
++ *   - quick search for index/leaf in ext3_ext_find_extent()
++ *   - tree reduction
++ *   - cache last found extent
++ *   - arch-independent
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++
++/*
++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks
++ * become very little, so index split, in-depth growing and
++ * other hard changes happens much more often
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if EXT_DEBUG defined you can use 'extdebug' mount option
++ * to get lots of info what's going on
++ */
++#define EXT_DEBUG
++#ifdef EXT_DEBUG
++/*
++ * printk the message only when the filesystem of 'inode' has the EXTDEBUG
++ * mount option set.  There is deliberately no ';' after while (0): the
++ * caller's own ';' terminates the statement, which keeps the macro usable
++ * as the body of an unbraced if/else.
++ */
++#define ext_debug(inode,fmt,a...)             \
++do {                                          \
++      if (test_opt((inode)->i_sb, EXTDEBUG))  \
++              printk(fmt, ##a);               \
++} while (0)
++#else
++#define ext_debug(inode,fmt,a...)
++#endif
++
++#define EXT3_ALLOC_NEEDED     2       /* block bitmap + group descriptor */
++
++/*
++ * ext3_inode has i_block array (total 60 bytes)
++ * first 4 bytes are used to store:
++ *  - tree depth (0 mean there is no tree yet. all extents in the inode)
++ *  - number of alive extents in the inode
++ */
++
++/*
++ * this is the on-disk extent structure
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++      __u32   e_block;        /* first logical block the extent covers */
++      __u32   e_start;        /* first physical block of the extent */
++      __u32   e_num;          /* number of blocks covered by extent */
++};
++
++/*
++ * this is the on-disk index structure
++ * it's used at all the levels, but the bottom
++ */
++struct ext3_extent_idx {
++      __u32   e_block;        /* index covers logical blocks from 'block' */
++      __u32   e_leaf;         /* pointer to the physical block of the next *
++                               * level. leaf or next index could be here */
++};
++
++/*
++ * each block (leaves and indexes), even the inode-stored one, has a header
++ * e_max of the inode-stored header is filled in by ext3_ext_find_extent()
++ * the first time it is found to be 0
++ */
++struct ext3_extent_header {   
++      __u16   e_num;          /* number of valid entries */
++      __u16   e_max;          /* capacity of store in entries */
++};
++
++/*
++ * array of ext3_ext_path contains path to some extent
++ * creation/lookup routines use it for traversal/splitting/etc
++ * truncate uses it to simulate recursive walking
++ *
++ * entry 0 describes the root stored in the inode, the last entry the leaf;
++ * the fields below are filled in by ext3_ext_find_extent()
++ */
++struct ext3_ext_path {
++      __u32                           p_block;  /* e_leaf of the chosen index */
++      __u16                           p_depth;  /* levels below this one */
++      struct ext3_extent              *p_ext;   /* chosen extent (leaf only) */
++      struct ext3_extent_idx          *p_idx;   /* chosen index (non-leaf) */
++      struct ext3_extent_header       *p_hdr;   /* header of this level */
++      struct buffer_head              *p_bh;    /* block of this level, NULL
++                                                 * when it is in the inode */
++};
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++      ((struct ext3_extent *) (((char *) (__hdr__)) +         \
++                               sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++      ((struct ext3_extent_idx *) (((char *) (__hdr__)) +     \
++                                   sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++      ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1)
++
++
++/*
++ * BUG() when the condition is false.  Wrapped in do/while (0) so that the
++ * macro expands to a single statement and cannot capture a following else.
++ */
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++/*
++ * Get journal write access to the tree level described by 'path'.
++ * Nothing is done for a level that lives in the inode body.
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *path)
++{
++      /* no buffer: the leaf/index is stored in the inode body */
++      if (!path->p_bh)
++              return 0;
++
++      /* the level lives in its own block */
++      return ext3_journal_get_write_access(handle, path->p_bh);
++}
++
++/*
++ * Mark the tree level described by 'path' dirty: the inode itself when the
++ * level is stored in the inode body, the block's buffer otherwise.
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ *  - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *path)
++{
++      /* no buffer: the leaf/index is stored in the inode body */
++      if (!path->p_bh)
++              return ext3_mark_inode_dirty(handle, inode);
++
++      /* the level lives in its own block */
++      return ext3_journal_dirty_metadata(handle, path->p_bh);
++}
++
++/* number of extents that fit into one filesystem block after the header */
++static inline int ext3_ext_space_block(struct inode *inode)
++{
++#ifdef AGRESSIVE_TEST
++      return 6; /* FIXME: for debug, remove this line */
++#else
++      return (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header))
++              / sizeof(struct ext3_extent);
++#endif
++}
++
++/* number of extents that fit into the inode's i_data after the header */
++static inline int ext3_ext_space_inode(struct inode *inode)
++{
++#ifdef AGRESSIVE_TEST
++      return 3; /* FIXME: for debug, remove this line */
++#else
++      return (sizeof(EXT3_I(inode)->i_data) -
++                      sizeof(struct ext3_extent_header))
++                      / sizeof(struct ext3_extent);
++#endif
++}
++
++/* number of index entries that fit into the inode's i_data after the header */
++static inline int ext3_ext_space_inode_idx(struct inode *inode)
++{
++#ifdef AGRESSIVE_TEST
++      return 4; /* FIXME: for debug, remove this line */
++#else
++      return (sizeof(EXT3_I(inode)->i_data) -
++                      sizeof(struct ext3_extent_header))
++                      / sizeof(struct ext3_extent_idx);
++#endif
++}
++
++/*
++ * Debug helper: print every level of 'path' through ext_debug(), as
++ * "block->leaf" for an index entry, "block:start:num" for an extent and
++ * "[]" for a level with neither.  The fields are __u32, hence %u.
++ */
++static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path)
++{
++      int k, l = path->p_depth;
++
++      ext_debug(inode, "path:");
++      for (k = 0; k <= l; k++, path++) {
++              if (path->p_idx) {
++                      ext_debug(inode, "  %u->%u", path->p_idx->e_block,
++                                      path->p_idx->e_leaf);
++              } else if (path->p_ext) {
++                      ext_debug(inode, "  %u:%u:%u",
++                                      path->p_ext->e_block,
++                                      path->p_ext->e_start,
++                                      path->p_ext->e_num);
++              } else
++                      ext_debug(inode, "  []");
++      }
++      ext_debug(inode, "\n");
++}
++
++/*
++ * Debug helper: print all extents of the leaf of 'path' (the entry at index
++ * i_depth) through ext_debug() as "block:start:num".  The fields are __u32,
++ * hence %u.
++ */
++static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path)
++{
++      int depth = EXT3_I(inode)->i_depth;
++      struct ext3_extent_header *eh = path[depth].p_hdr;
++      struct ext3_extent *ex = EXT_FIRST_EXTENT(eh);
++      int i;
++
++      for (i = 0; i < eh->e_num; i++, ex++) {
++              ext_debug(inode, "%u:%u:%u ",
++                              ex->e_block, ex->e_start, ex->e_num);
++      }
++      ext_debug(inode, "\n");
++}
++
++/*
++ * Release the buffer of every level of 'path' (entries 0 .. path->p_depth)
++ * and reset the p_bh pointers to NULL.
++ */
++static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path)
++{
++      struct ext3_ext_path *last = path + path->p_depth;
++
++      for (; path <= last; path++) {
++              if (!path->p_bh)
++                      continue;
++              brelse(path->p_bh);
++              path->p_bh = NULL;
++      }
++}
++
++/*
++ * Pick a goal block for a new allocation.  With a path: the last block of
++ * the extent found at the leaf, or else the block holding the leaf itself.
++ * Without one (or when the leaf offers neither): a pid-dependent offset
++ * into the inode's block group.
++ */
++static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      unsigned long group_first;
++      unsigned long offset;
++
++      if (path) {
++              struct ext3_ext_path *leaf = path + path->p_depth;
++
++              /* try to find previous block */
++              if (leaf->p_ext)
++                      return leaf->p_ext->e_start + leaf->p_ext->e_num - 1;
++
++              /* it looks index is empty
++               * try to find starting from index itself */
++              if (leaf->p_bh)
++                      return leaf->p_bh->b_blocknr;
++      }
++
++      /* OK. use inode's group */
++      group_first = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++              le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++      offset = (current->pid % 16) *
++                      (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++
++      return group_first + offset;
++}
++
++/*
++ * Walk the extent tree of 'inode' from the root stored in i_data down to a
++ * leaf, recording one ext3_ext_path entry per level.  At each level the
++ * last entry whose e_block is not above 'block' is chosen (the first entry
++ * when all of them are above it).
++ *
++ * 'path' may be NULL, in which case an array of i_depth + 2 entries is
++ * allocated (one spare for a possible depth increase); otherwise the
++ * caller's array is reused.
++ *
++ * Returns the path, ERR_PTR(-ENOMEM) when the allocation fails, or
++ * ERR_PTR(-EIO) when a tree block cannot be read or an index block has no
++ * entries.  On -EIO all buffer references are dropped and the path array is
++ * freed, even when it was supplied by the caller.
++ */
++static struct ext3_ext_path *
++ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      struct ext3_extent_header *eh = (void *) ei->i_data;
++      struct ext3_extent_idx *ix;
++      struct buffer_head *bh;
++      struct ext3_extent *ex;
++      int depth, i, k, ppos = 0;
++
++      /* initialize capacity of leaf in inode for first time */
++      if (eh->e_max == 0)
++              eh->e_max = ext3_ext_space_inode(inode);
++      i = depth = ei->i_depth;
++      EXT_ASSERT(i == 0 || eh->e_num > 0);
++
++      /* account possible depth increase */
++      if (!path) {
++              path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++                              GFP_NOFS);
++              if (!path)
++                      return ERR_PTR(-ENOMEM);
++      }
++      memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++
++      /* walk through the tree */
++      while (i) {
++              ext_debug(inode, "depth %d: num %d, max %d\n",
++                              ppos, eh->e_num, eh->e_max);
++              ix = EXT_FIRST_INDEX(eh);
++              if (eh->e_num)
++                      path[ppos].p_idx = ix;
++              EXT_ASSERT(eh->e_num <= eh->e_max);
++              for (k = 0; k < eh->e_num; k++, ix++) {
++                      ext_debug(inode, "index: %u -> %u\n",
++                                      ix->e_block, ix->e_leaf);
++                      if (block < ix->e_block)
++                              break;
++                      path[ppos].p_idx = ix;
++              }
++              path[ppos].p_depth = i;
++              path[ppos].p_hdr = eh;
++              path[ppos].p_ext = NULL;
++
++              /* an index block read from disk with no entries leaves p_idx
++               * NULL; fail the lookup instead of dereferencing it */
++              if (!path[ppos].p_idx)
++                      goto err;
++              path[ppos].p_block = path[ppos].p_idx->e_leaf;
++
++              bh = sb_bread(inode->i_sb, path[ppos].p_block);
++              if (!bh)
++                      goto err;
++              eh = (struct ext3_extent_header *) bh->b_data;
++              ppos++;
++              EXT_ASSERT(ppos <= depth);
++              path[ppos].p_bh = bh;
++              i--;
++      }
++
++      path[ppos].p_depth = i;
++      path[ppos].p_hdr = eh;
++      path[ppos].p_ext = NULL;
++
++      /* find extent */
++      ex = EXT_FIRST_EXTENT(eh);
++      if (eh->e_num)
++              path[ppos].p_ext = ex;
++      EXT_ASSERT(eh->e_num <= eh->e_max);
++      for (k = 0; k < eh->e_num; k++, ex++) {
++              if (block < ex->e_block)
++                      break;
++              path[ppos].p_ext = ex;
++      }
++
++      ext3_ext_show_path(inode, path);
++
++      return path;
++
++err:
++      ext3_ext_drop_refs(inode, path);
++      kfree(path);
++      return ERR_PTR(-EIO);
++}
++
++/*
++ * sanity check for the memmove()s done on index/extent arrays:
++ * BUG()s if [addr, addr + len) does not fit into the area that
++ * starts at curp->p_hdr. a zero length is always accepted.
++ */
++static void ext3_ext_check_boundary(struct inode *inode,
++                                      struct ext3_ext_path *curp,
++                                      void *addr, int len)
++{
++      void *limit;
++
++      /* an empty move cannot leave the area */
++      if (len == 0)
++              return;
++
++      /*
++       * with a buffer attached the area is one filesystem block,
++       * otherwise it is as large as the inode's i_data
++       */
++      limit = (void *) curp->p_hdr;
++      if (curp->p_bh)
++              limit += inode->i_sb->s_blocksize;
++      else
++              limit += sizeof(EXT3_I(inode)->i_data);
++
++      /* the end of the range must not pass the end of the area ... */
++      if (((unsigned long) addr) + len > (unsigned long) limit) {
++              printk("overflow! 0x%p > 0x%p\n", addr + len, limit);
++              BUG();
++      }
++
++      /* ... and the range must not start in front of the header */
++      if ((unsigned long) addr < (unsigned long) curp->p_hdr) {
++              printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr);
++              BUG();
++      }
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp
++ * it checks where to insert: before curp->p_idx or after it
++ *
++ * 'logical' is stored into e_block and 'ptr' into e_leaf of the new
++ * entry. returns 0 or the error reported by ext3_ext_get_access() /
++ * ext3_ext_dirty().
++ */
++static int ext3_ext_insert_index(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *curp, int logical,
++                              int ptr)
++{
++      struct ext3_extent_idx *ix;
++      int len, err;
++
++      /* the block is going to be modified */
++      if ((err = ext3_ext_get_access(handle, inode, curp)))
++              return err;
++
++      /* an index for exactly this logical block must not exist yet */
++      EXT_ASSERT(logical != curp->p_idx->e_block);
++      /* number of entries between p_idx and the max index slot */
++      len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++      if (logical > curp->p_idx->e_block) {
++              /* insert after */
++              /* everything behind p_idx is shifted one slot right */
++              len = (len - 1) * sizeof(struct ext3_extent_idx);
++              len = len < 0 ? 0 : len;
++              ext_debug(inode, "insert new index %d after: %d. "
++                              "move %d from 0x%p to 0x%p\n",
++                              logical, ptr, len,
++                              (curp->p_idx + 1), (curp->p_idx + 2));
++
++              ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len);
++              memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++              ix = curp->p_idx + 1;
++      } else {
++              /* insert before */
++              /* p_idx itself and everything behind it is shifted */
++              len = len * sizeof(struct ext3_extent_idx);
++              len = len < 0 ? 0 : len;
++              ext_debug(inode, "insert new index %d before: %d. "
++                              "move %d from 0x%p to 0x%p\n",
++                              logical, ptr, len,
++                              curp->p_idx, (curp->p_idx + 1));
++
++              ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len);
++              memmove(curp->p_idx + 1, curp->p_idx, len);
++              ix = curp->p_idx;
++      }
++
++      /* fill the freed slot and account for it in the header */
++      ix->e_block = logical;
++      ix->e_leaf = ptr;
++      curp->p_hdr->e_num++;
++
++      err = ext3_ext_dirty(handle, inode, curp);
++      ext3_std_error(inode->i_sb, err);
++
++      return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
++ *  - makes decision where to split
++ *  - moves remaining extents and index entries (right to the split point)
++ *    into the newly allocated blocks
++ *  - initialize subtree
++ *
++ * the first block is taken from newext (e_start is advanced and e_num
++ * is decremented), the remaining ones come from ext3_new_block().
++ * returns 0 on success or a negative errno; on error every block
++ * recorded in ablocks[] is given back with ext3_free_blocks().
++ */
++static int ext3_ext_split(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *path,
++                              struct ext3_extent *newext, int at)
++{
++      struct buffer_head *bh = NULL;
++      int depth = EXT3_I(inode)->i_depth;
++      struct ext3_extent_header *neh;
++      struct ext3_extent_idx *fidx;
++      struct ext3_extent *ex;
++      int i = at, k, m, a;
++      long newblock, oldblock, border;
++      int *ablocks = NULL; /* array of allocated blocks */
++      int err = 0;
++
++      /* make decision: where to split? */
++      /* FIXME: now decision is simplest: at current extent */
++
++      /* if current leaf will be split, then we should use
++       * border from split point */
++      if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++              border = path[depth].p_ext[1].e_block;
++              ext_debug(inode, "leaf will be splitted."
++                              " next leaf starts at %d\n",
++                              (int)border);
++      } else {
++              border = newext->e_block;
++              ext_debug(inode, "leaf will be added."
++                              " next leaf starts at %d\n",
++                              (int)border);
++      }
++
++      /*
++       * if error occurs, then we break processing
++       * and turn filesystem read-only. so, index won't
++       * be inserted and tree will be in consistent
++       * state. next mount will repair buffers too
++       */
++
++      /*
++       * get array to track all allocated blocks
++       * we need this to handle errors and free blocks
++       * upon them
++       */
++      ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS);
++      if (!ablocks)
++              return -ENOMEM;
++      memset(ablocks, 0, sizeof(long) * depth);
++
++      /* allocate all needed blocks */
++      ext_debug(inode, "allocate %d blocks for indexes and leaf\n",
++                      depth - at);
++      /* the first one is the block the caller has already allocated */
++      ablocks[0] = newext->e_start++;
++      newext->e_num--;
++      for (a = 1; a < depth - at; a++) {
++              newblock = ext3_new_block(handle, inode, newext->e_start,
++                                              0, 0, &err);
++              if (newblock == 0)
++                      goto cleanup;
++              ablocks[a] = newblock;
++      }
++
++      /* initialize new leaf */
++      newblock = ablocks[--a];
++      EXT_ASSERT(newblock);
++      bh = sb_getblk(inode->i_sb, newblock);
++      if (!bh) {
++              err = -EIO;
++              goto cleanup;
++      }
++      lock_buffer(bh);
++
++      if ((err = ext3_journal_get_create_access(handle, bh)))
++              goto cleanup;
++
++      neh = (struct ext3_extent_header *) bh->b_data;
++      neh->e_num = 0;
++      neh->e_max = ext3_ext_space_block(inode);
++      ex = EXT_FIRST_EXTENT(neh);
++
++      /* move remain of path[depth] to the new leaf */
++      EXT_ASSERT(path[depth].p_hdr->e_num ==
++                      path[depth].p_hdr->e_max);
++      /* start copy from next extent */
++      /* TODO: we could do it by single memmove */
++      m = 0;
++      path[depth].p_ext++;
++      while (path[depth].p_ext <=
++                      EXT_MAX_EXTENT(path[depth].p_hdr)) {
++              ext_debug(inode, "move %d:%d:%d in new leaf\n",
++                              path[depth].p_ext->e_block,
++                              path[depth].p_ext->e_start,
++                              path[depth].p_ext->e_num);
++              memmove(ex++, path[depth].p_ext++,
++                              sizeof(struct ext3_extent));
++              neh->e_num++;
++              m++;
++      }
++      mark_buffer_uptodate(bh, 1);
++      unlock_buffer(bh);
++
++      if ((err = ext3_journal_dirty_metadata(handle, bh)))
++              goto cleanup;
++      brelse(bh);
++      bh = NULL;
++
++      /* correct old leaf */
++      if (m) {
++              /*
++               * the header being changed is the leaf's one, so the
++               * journal access and the dirtying must be done on
++               * path + depth, not on the root of the path
++               */
++              if ((err = ext3_ext_get_access(handle, inode, path + depth)))
++                      goto cleanup;
++              path[depth].p_hdr->e_num -= m;
++              if ((err = ext3_ext_dirty(handle, inode, path + depth)))
++                      goto cleanup;
++
++      }
++
++      /* create intermediate indexes */
++      k = depth - at - 1;
++      EXT_ASSERT(k >= 0);
++      if (k)
++              ext_debug(inode,
++                              "create %d intermediate indices\n", k);
++      /* insert new index into current index block */
++      /* current depth stored in i var */
++      i = depth - 1;
++      while (k--) {
++              oldblock = newblock;
++              newblock = ablocks[--a];
++              bh = sb_getblk(inode->i_sb, newblock);
++              if (!bh) {
++                      err = -EIO;
++                      goto cleanup;
++              }
++              lock_buffer(bh);
++
++              if ((err = ext3_journal_get_create_access(handle, bh)))
++                      goto cleanup;
++
++              /* new index block starts with a pointer to the block
++               * created on the previous step */
++              neh = (struct ext3_extent_header *) bh->b_data;
++              neh->e_num = 1;
++              neh->e_max = ext3_ext_space_block(inode);
++              fidx = EXT_FIRST_INDEX(neh);
++              fidx->e_block = border;
++              fidx->e_leaf = oldblock;
++
++              ext_debug(inode,
++                              "int.index at %d (block %u): %d -> %d\n",
++                              i, (unsigned) newblock,
++                              (int) border,
++                              (int) oldblock);
++              /* copy indexes */
++              m = 0;
++              path[i].p_idx++;
++              EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++                              EXT_LAST_INDEX(path[i].p_hdr));
++              ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++                              EXT_MAX_INDEX(path[i].p_hdr));
++              while (path[i].p_idx <=
++                              EXT_MAX_INDEX(path[i].p_hdr)) {
++                      ext_debug(inode, "%d: move %d:%d in new index\n",
++                                      i, path[i].p_idx->e_block,
++                                      path[i].p_idx->e_leaf);
++                      memmove(++fidx, path[i].p_idx++,
++                                      sizeof(struct ext3_extent_idx));
++                      neh->e_num++;
++                      m++;
++              }
++
++              mark_buffer_uptodate(bh, 1);
++              unlock_buffer(bh);
++
++              if ((err = ext3_journal_dirty_metadata(handle, bh)))
++                      goto cleanup;
++              brelse(bh);
++              bh = NULL;
++
++              /* correct old index */
++              if (m) {
++                      err = ext3_ext_get_access(handle,inode,path+i);
++                      if (err)
++                              goto cleanup;
++                      path[i].p_hdr->e_num -= m;
++                      err = ext3_ext_dirty(handle, inode, path + i);
++                      if (err)
++                              goto cleanup;
++              }
++
++              i--;
++      }
++
++      /* insert new index */
++      if (!err)
++              err = ext3_ext_insert_index(handle, inode, path + at,
++                                              border, newblock);
++
++cleanup:
++      if (bh) {
++              /* error paths may arrive here with the buffer locked */
++              if (buffer_locked(bh))
++                      unlock_buffer(bh);
++              brelse(bh);
++      }
++
++      if (err) {
++              /*
++               * free all allocated blocks in error case.
++               * the braces matter: the free has to be done for
++               * every used slot inside the loop, never for
++               * ablocks[depth] after it
++               */
++              for (i = 0; i < depth; i++) {
++                      if (!ablocks[i])
++                              continue;
++                      ext3_free_blocks(handle, inode, ablocks[i], 1);
++              }
++      }
++      kfree(ablocks);
++
++      return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ *  - allocates new block
++ *  - moves top-level data (index block or leaf) into the new block
++ *  - initialize new top-level, creating index that points to the
++ *    just created block
++ *
++ * the block is not allocated here: the first block of newext is
++ * consumed (e_start is advanced, e_num is decremented).
++ * returns 0 or a negative errno.
++ */
++static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode,
++                                      struct ext3_ext_path *path,
++                                      struct ext3_extent *newext)
++{
++      struct buffer_head *bh;
++      struct ext3_ext_path *curp = path;
++      struct ext3_extent_header *neh;
++      struct ext3_extent_idx *fidx;
++      int len, err = 0;
++      long newblock;
++
++      /*
++       * use block already allocated by the caller for new root block
++       */
++      newblock = newext->e_start++;
++      newext->e_num--;
++      
++      /* NOTE(review): newext stays modified on the error paths below */
++      bh = sb_getblk(inode->i_sb, newblock);
++      if (!bh) {
++              err = -EIO;
++              ext3_std_error(inode->i_sb, err);
++              return err;
++      }
++      lock_buffer(bh);
++
++      if ((err = ext3_journal_get_create_access(handle, bh))) {
++              unlock_buffer(bh);
++              goto out;       
++      }
++
++      /* move top-level index/leaf into new block */
++      /* header plus e_max entries, i.e. the whole capacity is copied */
++      len = sizeof(struct ext3_extent_header) +
++              sizeof(struct ext3_extent) * curp->p_hdr->e_max;
++      EXT_ASSERT(len >= 0 && len < 4096);
++      memmove(bh->b_data, curp->p_hdr, len);
++
++      /* set size of new block */
++      neh = (struct ext3_extent_header *) bh->b_data;
++      neh->e_max = ext3_ext_space_block(inode);
++      mark_buffer_uptodate(bh, 1);
++      unlock_buffer(bh);
++
++      if ((err = ext3_journal_dirty_metadata(handle, bh)))
++              goto out;
++
++      /* create index in new top-level index: num,max,pointer */
++      if ((err = ext3_ext_get_access(handle, inode, curp)))
++              goto out;
++
++      /* the old top-level header now describes a single index entry */
++      curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode);
++      curp->p_hdr->e_num = 1;
++      curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++      curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block;
++      curp->p_idx->e_leaf = newblock;
++
++      /* only used for the debug output below */
++      neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
++      fidx = EXT_FIRST_INDEX(neh);
++      ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n",
++                      neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); 
++
++      /* the tree is one level deeper from now on */
++      EXT3_I(inode)->i_depth++;
++      err = ext3_ext_dirty(handle, inode, curp);
++out:
++      brelse(bh);
++
++      return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index found
++ * then it requests in-depth growing
++ *
++ * on success the path is refilled for newext->e_block. if the block
++ * carried by newext got consumed (e_num dropped to 0), a fresh one is
++ * allocated for it. returns 0 or a negative errno.
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode,
++                                      struct ext3_ext_path *path,
++                                      struct ext3_extent *newext)
++{
++      int depth = EXT3_I(inode)->i_depth;
++      struct ext3_ext_path *curp;
++      int i = depth, err = 0;
++      /* remembered as allocation goal: e_start may be advanced below */
++      long newblock = newext->e_start;
++
++      /* walk up to the tree and look for free index entry */
++      curp = path + depth;
++      while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++              i--;
++              curp--;
++      }
++
++      /* we use already allocated block for index block
++       * so, subsequent data blocks should be contiguous */
++      if (EXT_HAS_FREE_INDEX(curp)) {
++              /* if we found index with free entry, then use that
++               * entry: create all needed subtree and add new leaf */
++              err = ext3_ext_split(handle, inode, path, newext, i);
++      } else {
++              /* tree is full, time to grow in depth */
++              err = ext3_ext_grow_indepth(handle, inode, path, newext);
++      }
++
++      if (!err) {
++              /* refill path */
++              ext3_ext_drop_refs(inode, path);
++              /*
++               * NOTE(review): ext3_ext_find_extent() kfree()s the path
++               * when sb_bread() fails, but only the local pointer is
++               * replaced here -- the caller keeps the freed one.
++               * confirm callers do not touch/free it after an error
++               */
++              path = ext3_ext_find_extent(inode, newext->e_block, path);
++              if (IS_ERR(path))
++                      err = PTR_ERR(path);
++
++              /*
++               * probably we've used some blocks from extent
++               * let's allocate new block for it
++               */
++              if (newext->e_num == 0 && !err) {
++                      /* a failure is reported through err only */
++                      newext->e_start =
++                              ext3_new_block(handle, inode, newblock,
++                                              0, 0, &err);
++                      newext->e_num = 1;
++              }
++      }
++
++      return err;
++}
++
++/*
++ * returns the logical block the next allocated area starts at,
++ * or 0xffffffff if there is nothing behind the given path
++ * NOTE: block numbers stored in index entries are trusted to be
++ * allocated blocks, thus index entries have to be consistent
++ * with leaves
++ */
++static inline unsigned ext3_ext_next_allocated_block(struct inode *inode,
++                                               struct ext3_ext_path *path)
++{
++      int leaf, level;
++
++      EXT_ASSERT(path != NULL);
++      leaf = path->p_depth;
++
++      /* tree consists of the root only and the root is empty */
++      if (leaf == 0 && path->p_ext == NULL)
++              return 0xffffffff;
++
++      /* FIXME: what if index isn't full ?! */
++
++      /* the leaf: is there an extent right after the found one? */
++      if (leaf >= 0 && path[leaf].p_ext !=
++                      EXT_LAST_EXTENT(path[leaf].p_hdr))
++              return path[leaf].p_ext[1].e_block;
++
++      /* then climb up and look for a right neighbour among indexes */
++      for (level = leaf - 1; level >= 0; level--) {
++              if (path[level].p_idx == EXT_LAST_INDEX(path[level].p_hdr))
++                      continue;
++              return path[level].p_idx[1].e_block;
++      }
++
++      return 0xffffffff;
++}
++
++/*
++ * returns the logical block the next leaf starts at (taken from the
++ * nearest index entry to the right of the path) or 0xffffffff
++ */
++static unsigned ext3_ext_next_leaf_block(struct inode *inode,
++                                               struct ext3_ext_path *path)
++{
++      int level;
++
++      EXT_ASSERT(path != NULL);
++
++      /* zero-tree has no leaf blocks at all */
++      if (path->p_depth == 0)
++              return 0xffffffff;
++
++      /* start at the index level right above the leaf and go up */
++      for (level = path->p_depth - 1; level >= 0; level--) {
++              if (path[level].p_idx == EXT_LAST_INDEX(path[level].p_hdr))
++                      continue;
++              return path[level].p_idx[1].e_block;
++      }
++
++      return 0xffffffff;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ *
++ * the new border is read from path[depth].p_ext->e_block, so that
++ * extent has to hold its final value when this routine is called.
++ * returns 0 or the error from ext3_ext_get_access()/ext3_ext_dirty().
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *path)
++{
++      int depth = EXT3_I(inode)->i_depth;     
++      struct ext3_extent_header *eh;
++      struct ext3_extent *ex;
++      long border;
++      int k, err = 0;
++      
++      eh = path[depth].p_hdr;
++      ex = path[depth].p_ext;
++
++      EXT_ASSERT(ex);
++      EXT_ASSERT(eh);
++      
++      if (depth == 0) {
++              /* there is no tree at all */
++              return 0;
++      }
++      
++      if (ex != EXT_FIRST_EXTENT(eh)) {
++              /* we correct tree if first leaf got modified only */
++              return 0;
++      }
++      
++      /* the index right above the leaf is always updated */
++      k = depth - 1;
++      border = path[depth].p_ext->e_block;
++      if ((err = ext3_ext_get_access(handle, inode, path + k)))
++              return err;
++      path[k].p_idx->e_block = border;
++      if ((err = ext3_ext_dirty(handle, inode, path + k)))
++              return err;
++
++      /* then the levels above, from the bottom up to the root */
++      while (k--) {
++              /* change all left-side indexes */
++              /* stop at a non-first index, except on the root level */
++              if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr)
++                              && k != 0)
++                      break;
++              if ((err = ext3_ext_get_access(handle, inode, path + k)))
++                      break;
++              path[k].p_idx->e_block = border;
++              if ((err = ext3_ext_dirty(handle, inode, path + k)))
++                      break;
++      }
++
++      return err;
++}
++
++/*
++ * this routine tries to merge requsted extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
++                              struct ext3_ext_path *path,
++                              struct ext3_extent *newext)
++{
++      int depth, len;
++      struct ext3_extent_header * eh;
++      struct ext3_extent *ex;
++      struct ext3_extent *nearex; /* nearest extent */
++      struct ext3_ext_path *npath = NULL;
++      int err;
++
++      depth = EXT3_I(inode)->i_depth; 
++      if ((ex = path[depth].p_ext)) {
++              /* try to insert block into found extent and return */
++              if (ex->e_block + ex->e_num == newext->e_block &&
++                              ex->e_start + ex->e_num == newext->e_start) {
++#ifdef AGRESSIVE_TEST
++                      if (ex->e_num >= 2)
++                              goto repeat;
++#endif
++                      if ((err = ext3_ext_get_access(handle, inode,
++                                                      path + depth)))
++                              return err;
++                      ext_debug(inode, "append %d block to %d:%d (from %d)\n",
++                                      newext->e_num, ex->e_block, ex->e_num,
++                                      ex->e_start);
++                      ex->e_num += newext->e_num;
++                      err = ext3_ext_dirty(handle, inode, path + depth);
++                      return err;
++              }
++      }
++
++repeat:
++      depth = EXT3_I(inode)->i_depth; 
++      eh = path[depth].p_hdr;
++      if (eh->e_num == eh->e_max) {
++              /* probably next leaf has space for us? */
++              int next = ext3_ext_next_leaf_block(inode, path);
++              if (next != 0xffffffff) {
++                      ext_debug(inode, "next leaf block - %d\n", next);
++                      EXT_ASSERT(!npath);
++                      npath = ext3_ext_find_extent(inode, next, NULL);
++                      if (IS_ERR(npath))
++                              return PTR_ERR(npath);
++                      EXT_ASSERT(npath->p_depth == path->p_depth);
++                      eh = npath[depth].p_hdr;
++                      if (eh->e_num < eh->e_max) {
++                              ext_debug(inode,
++                                              "next leaf has free ext(%d)\n",
++                                              eh->e_num);
++                              path = npath;
++                              goto repeat;
++                      }
++                      ext_debug(inode, "next leaf hasno free space(%d,%d)\n",
++                                      eh->e_num, eh->e_max);
++              }
++              /*
++               * there is no free space in found leaf
++               * we're gonna add new leaf in the tree
++               */
++              err = ext3_ext_create_new_leaf(handle, inode, path, newext);
++              if (err)
++                      goto cleanup;
++              goto repeat;
++      }
++
++      nearex = path[depth].p_ext;
++
++      if ((err = ext3_ext_get_access(handle, inode, path + depth)))
++              goto cleanup;
++
++      if (!nearex) {
++              /* there is no extent in this leaf, create first one */
++              ext_debug(inode, "first extent in the leaf: %d:%d:%d\n",
++                              newext->e_block, newext->e_start,
++                              newext->e_num);
++              eh->e_num++;
++              path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++
++      } else if (newext->e_block > nearex->e_block) {
++              EXT_ASSERT(newext->e_block != nearex->e_block);
++              len = EXT_MAX_EXTENT(eh) - nearex;
++              len = (len - 1) * sizeof(struct ext3_extent);
++              len = len < 0 ? 0 : len;
++              ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, "
++                              "move %d from 0x%p to 0x%p\n",
++                              newext->e_block, newext->e_start, newext->e_num,
++                              nearex, len, nearex + 1, nearex + 2);
++              ext3_ext_check_boundary(inode, path + depth, nearex + 2, len);
++              memmove(nearex + 2, nearex + 1, len);
++              path[depth].p_ext = nearex + 1;
++              eh->e_num++;
++      } else {
++              EXT_ASSERT(newext->e_block != nearex->e_block);
++              len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++              len = len < 0 ? 0 : len;
++              ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, "
++                              "move %d from 0x%p to 0x%p\n",
++                              newext->e_block, newext->e_start, newext->e_num,
++                              nearex, len, nearex + 1, nearex + 2);
++              memmove(nearex + 1, nearex, len);
++              path[depth].p_ext = nearex;
++              eh->e_num++;
++
++              /* time to correct all indexes above */
++              err = ext3_ext_correct_indexes(handle, inode, path);
++      }
++
++      if (!err) {
++              nearex = path[depth].p_ext;
++              nearex->e_block = newext->e_block;
++              nearex->e_start = newext->e_start;
++              nearex->e_num = newext->e_num;
++      }
++
++      err = ext3_ext_dirty(handle, inode, path + depth);
++
++cleanup:
++      if (npath) {
++              ext3_ext_drop_refs(inode, npath);
++              kfree(npath);
++      }
++              
++      return err;
++}
++
++/*
++ * maps logical block iblock of the inode onto a physical block using
++ * the extent tree. the tree is protected by i_ext_sem during the call.
++ *  - if an extent covers iblock, bh_result gets b_dev/b_blocknr set
++ *  - else, if 'create' is set, a block is allocated, inserted into
++ *    the tree as a one-block extent and BH_New is set in bh_result
++ *  - else bh_result is left unmapped and 0 is returned
++ * returns 0 or a negative errno.
++ */
++int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock,
++                      struct buffer_head *bh_result, int create,
++                      int extend_disksize)
++{
++      struct ext3_ext_path *path;
++      int depth;
++      struct ext3_extent newex;
++      struct ext3_extent *ex;
++      int goal, newblock, err = 0;
++
++      ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n",
++                      (int) iblock, (unsigned) inode->i_ino, bh_result);
++      bh_result->b_state &= ~(1UL << BH_New);
++
++      down(&EXT3_I(inode)->i_ext_sem);
++
++      /* depth is sampled under the semaphore, so that it matches
++       * the path built right below */
++      depth = EXT3_I(inode)->i_depth;
++
++      /* find extent for this block */
++      path = ext3_ext_find_extent(inode, iblock, NULL);
++      if (IS_ERR(path)) {
++              err = PTR_ERR(path);
++              /* an error pointer must never reach drop_refs/kfree */
++              path = NULL;
++              goto out2;
++      }
++
++      if ((ex = path[depth].p_ext)) {
++              /* if found extent covers block, simply return it */
++              if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
++                      newblock = iblock - ex->e_block + ex->e_start;
++                      ext_debug(inode, "%d fit into %d:%d -> %d\n",
++                                      (int) iblock, ex->e_block, ex->e_num,
++                                      newblock);
++                      goto out;
++              }
++      }
++
++      /*
++       * we couldn't try to create block if create flag is zero
++       */
++      if (!create)
++              goto out2;
++
++      /* allocate new block */
++      goal = ext3_ext_find_goal(inode, path);
++      newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++      if (!newblock)
++              goto out2;
++      ext_debug(inode, "allocate new block: goal %d, found %d\n",
++                      goal, newblock);
++
++      /* try to insert new extent into found leaf and return */
++      newex.e_block = iblock;
++      newex.e_start = newblock;
++      newex.e_num = 1;
++      /* NOTE(review): newblock is not freed if the insertion fails */
++      err = ext3_ext_insert_extent(handle, inode, path, &newex);
++      if (err)
++              goto out2;
++
++      /* previous routine could use block we allocated */
++      newblock = newex.e_start;
++      bh_result->b_state |= (1UL << BH_New);
++
++out:
++      ext3_ext_show_leaf(inode, path);
++      bh_result->b_dev = inode->i_dev;
++      bh_result->b_blocknr = newblock;
++out2:
++      if (path) {
++              ext3_ext_drop_refs(inode, path);
++              kfree(path);
++      }
++      up(&EXT3_I(inode)->i_ext_sem);
++
++      return err;
++}
++
++/*
++ * tells whether the current index still has to be processed by
++ * truncate: returns 1 if it has to be freed (even partially),
++ * 0 otherwise. p_block is (re)used to remember the number of
++ * entries seen at the previous call.
++ */
++static int ext3_ext_more_to_truncate(struct inode *inode,
++                              struct ext3_ext_path *path)
++{
++      struct ext3_extent_header *hdr;
++
++      EXT_ASSERT(path->p_idx);
++      hdr = path->p_hdr;
++
++      /*
++       * done if:
++       *  - we have stepped in front of the first index, or
++       *  - the number of entries is the same as remembered, i.e.
++       *    truncate on the deeper level did not remove anything
++       */
++      if (path->p_idx < EXT_FIRST_INDEX(hdr) ||
++                      hdr->e_num == path->p_block)
++              return 0;
++
++      /*
++       * remember actual number of indexes to see at the next
++       * iteration whether it has changed
++       */
++      path->p_block = hdr->e_num;
++
++      return 1;
++}
++
++/*
++ * routine removes index from the index block
++ * it's used in truncate case only. thus all requests are for
++ * last index in the block only
++ *
++ * 'path' points at the level being removed; the entry that gets
++ * dropped lives one level above (path - 1). the block that entry
++ * points to is forgotten and freed.
++ * returns 0 or the error from ext3_ext_get_access()/ext3_ext_dirty().
++ */
++int ext3_ext_remove_index(handle_t *handle, struct inode *inode,
++                                      struct ext3_ext_path *path)
++{
++      struct buffer_head *bh;
++      int err;
++      
++      /* free index block */
++      /* step to the parent level which holds the pointer */
++      path--;
++      EXT_ASSERT(path->p_hdr->e_num);
++      if ((err = ext3_ext_get_access(handle, inode, path)))
++              return err;
++      path->p_hdr->e_num--;
++      if ((err = ext3_ext_dirty(handle, inode, path)))
++              return err;
++      /* drop the block from the journal, then release it */
++      bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf);
++      ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf);
++      ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1);
++
++      ext_debug(inode, "index is empty, remove it, free block %d\n",
++                      path->p_idx->e_leaf);
++      return err;
++}
++
++/*
++ * returns 1 if current extent (path->p_ext) needs to be freed,
++ * even partially; otherwise returns 0
++ */
++int ext3_ext_more_leaves_to_truncate(struct inode *inode,
++                                      struct ext3_ext_path *path)
++{
++      struct ext3_extent *ex = path->p_ext;
++      unsigned blocksize = inode->i_sb->s_blocksize;
++      int last_block;
++
++      EXT_ASSERT(ex);
++
++      /* walked in front of the first extent: the leaf is done */
++      if (ex < EXT_FIRST_EXTENT(path->p_hdr))
++              return 0;
++
++      /* first block behind i_size, i.e. i_size rounded up to blocks */
++      last_block = (inode->i_size + blocksize-1)
++                      >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
++
++      /* the extent has to go iff it reaches behind that block */
++      return last_block < ex->e_block + ex->e_num;
++}
++
++/*
++ * makes sure the handle has more than 'needed' buffer credits:
++ *  - nothing is done if it already has them
++ *  - else the handle is extended
++ *  - if extending fails, the handle is restarted
++ * returns the handle, or ERR_PTR() with the error reported by
++ * ext3_journal_restart(), so that IS_ERR() checks in callers work
++ */
++handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++      int err;
++
++      if (handle->h_buffer_credits > needed)
++              return handle;
++      if (!ext3_journal_extend(handle, needed))
++              return handle;
++      err = ext3_journal_restart(handle, needed);
++      if (err)
++              return ERR_PTR(err);
++
++      return handle;
++}
++
++/*
++ * this routine calculates max number of blocks to be modified
++ * while freeing 'num' blocks of the extent at path->p_ext; it is
++ * intended to be used in truncate path. with path == NULL the
++ * worst case is returned.
++ */
++static int ext3_ext_calc_credits(struct inode *inode,
++                                      struct ext3_ext_path *path,
++                                      int num)
++{
++      int depth = EXT3_I(inode)->i_depth;
++
++      /*
++       * extent couldn't cross group, so we will modify
++       * single bitmap block and single group descriptor.
++       * that is all we need unless the whole first extent
++       * of the leaf is going away
++       */
++      if (path) {
++              if (num != path->p_ext->e_num ||
++                              path->p_ext != EXT_FIRST_EXTENT(path->p_hdr))
++                      return 2;
++      }
++
++      /*
++       * if this is last extent in a leaf, then we have to
++       * free leaf block and remove pointer from index above.
++       * that pointer could be last in index block, so we'll
++       * have to remove it too. this way we could modify/free
++       * the whole path + root index (inode stored) will be
++       * modified
++       */
++      return 2 + (depth * EXT3_ALLOC_NEEDED) + 1;
++}
++
++/*
++ * core of the truncate procedure:
++ * - calculates which part of each extent in the given leaf
++ *   lies beyond i_size and needs to be freed
++ * - forgets and frees these blocks, shrinking the extent
++ *
++ * TODO: we could optimize and free several extents during
++ *       a single journal_restart()-journal_restart() cycle
++ */
++static int ext3_ext_truncate_leaf(handle_t *handle,
++                                      struct inode *inode,
++                                      struct ext3_ext_path *path,
++                                      int depth)
++{
++      unsigned blocksize = inode->i_sb->s_blocksize;
++      int last_block; 
++      int i, err = 0, sf, num;
++
++      ext_debug(inode, "level %d - leaf\n", depth);
++      if (!path->p_hdr)
++              path->p_hdr =
++                      (struct ext3_extent_header *) path->p_bh->b_data;
++
++      EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max);
++      
++      last_block = (inode->i_size + blocksize-1)
++                                      >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
++      path->p_ext = EXT_LAST_EXTENT(path->p_hdr);
++      while (ext3_ext_more_leaves_to_truncate(inode, path)) {
++
++              /* first logical block of the extent that has to be freed */
++              sf = last_block > path->p_ext->e_block ?
++                      last_block : path->p_ext->e_block;
++
++              /* number of blocks from extent to be freed */
++              num = path->p_ext->e_block + path->p_ext->e_num - sf;
++
++              /* translate it to the first physical block to be freed */
++              sf = path->p_ext->e_start + (sf - path->p_ext->e_block);
++
++              i = ext3_ext_calc_credits(inode, path, num);
++              handle = ext3_ext_journal_restart(handle, i);
++              if (IS_ERR(handle))
++                      return PTR_ERR(handle);
++              
++              ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n",
++                              path->p_ext->e_block, path->p_ext->e_start,
++                              path->p_ext->e_num, sf, num);
++              for (i = 0; i < num; i++) {
++                      struct buffer_head *bh =
++                              sb_get_hash_table(inode->i_sb, sf + i);
++                      ext3_forget(handle, 0, inode, bh, sf + i);
++              }
++              ext3_free_blocks(handle, inode, sf, num);
++
++              /* collect extents usage stats */
++              spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock);
++              EXT3_SB(inode->i_sb)->s_ext_extents++;
++              EXT3_SB(inode->i_sb)->s_ext_blocks += num;
++              spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock);
++
++              /* shrink the extent; drop it from the header once empty */
++              if ((err = ext3_ext_get_access(handle, inode, path)))
++                      return err;
++              path->p_ext->e_num -= num;
++              if (path->p_ext->e_num == 0)
++                      path->p_hdr->e_num--;
++              if ((err = ext3_ext_dirty(handle, inode, path)))
++                      return err;
++
++              path->p_ext--;
++      }
++      
++      /* if this leaf became empty, then we should
++       * remove it from the index block above */
++      if (path->p_hdr->e_num == 0 && depth > 0) 
++              err = ext3_ext_remove_index(handle, inode, path);
++
++      return err;
++}
++
++static void ext3_ext_collect_stats(struct inode *inode)
++{
++      int depth;
++      
++      /* skip inodes that do not have the extents flag set */
++      if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++              return;
++      
++      /* collect on truncate to zero size only */
++      if (inode->i_size)
++              return;
++
++      depth = EXT3_I(inode)->i_depth;
++      if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth)
++               EXT3_SB(inode->i_sb)->s_ext_mindepth = depth;
++      if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth)
++               EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth;
++      EXT3_SB(inode->i_sb)->s_ext_sum += depth;
++      EXT3_SB(inode->i_sb)->s_ext_count++;
++      
++}
++
++void ext3_ext_truncate(struct inode * inode)
++{
++      struct address_space *mapping = inode->i_mapping;
++      struct ext3_ext_path *path;
++      struct page * page;
++      handle_t *handle;
++      int i, depth, err = 0;
++
++      down(&EXT3_I(inode)->i_ext_sem);
++      ext3_ext_collect_stats(inode);
++
++      /*
++       * We have to lock the EOF page here, because lock_page() nests
++       * outside journal_start().
++       */
++      if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) {
++              /* Block boundary? Nothing to do */
++              page = NULL;
++      } else {
++              page = grab_cache_page(mapping,
++                              inode->i_size >> PAGE_CACHE_SHIFT);
++              if (!page) {
++                      up(&EXT3_I(inode)->i_ext_sem);
++                      return;
++              }
++      }
++
++      /*
++       * probably first extent we're gonna free will be last in block
++       */
++      i = ext3_ext_calc_credits(inode, NULL, 0);
++      handle = ext3_journal_start(inode, i);
++      if (IS_ERR(handle)) {
++              if (page) {
++                      clear_highpage(page);
++                      flush_dcache_page(page);
++                      unlock_page(page);
++                      page_cache_release(page);
++              }
++              up(&EXT3_I(inode)->i_ext_sem);
++              return;
++      }
++
++      if (page)
++              ext3_block_truncate_page(handle, mapping, inode->i_size, page,
++                                              inode->i_sb->s_blocksize);
++
++      /*
++       * TODO: optimization is possible here
++       * probably we need no scanning at all,
++       * because page truncation is enough
++       */
++      if (ext3_orphan_add(handle, inode))
++              goto out_stop;
++
++      /* we have to know where to truncate from in crash case */
++      EXT3_I(inode)->i_disksize = inode->i_size;
++      ext3_mark_inode_dirty(handle, inode);
++
++      /*
++       * we start scanning from the right side, freeing all the blocks
++       * after i_size and descending into the tree
++       */
++      i = 0;
++      depth = EXT3_I(inode)->i_depth;
++      path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++      if (!path) {
++              ext3_error(inode->i_sb, "ext3_ext_truncate",
++                              "Can't allocate path array");
++              goto out_stop;
++      }
++      memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++
++      path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
++      while (i >= 0 && err == 0) {
++              if (i == depth) {
++                      /* this is leaf block */
++                      err = ext3_ext_truncate_leaf(handle, inode,
++                                                      path + i, i);
++                      /* root level has p_bh == NULL, brelse() eats this */
++                      brelse(path[i].p_bh);
++                      i--;
++                      continue;
++              }
++              
++              /* this is index block */
++              if (!path[i].p_hdr) {
++                      path[i].p_hdr =
++                              (struct ext3_extent_header *) path[i].p_bh->b_data;
++                      ext_debug(inode, "initialize header\n");
++              }
++
++              EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max);
++              
++              if (!path[i].p_idx) {
++                      /* this level hasn't been touched yet */
++                      path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
++                      path[i].p_block = path[i].p_hdr->e_num + 1;
++                      ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
++                                      path[i].p_hdr, path[i].p_hdr->e_num);
++              } else {
++                      /* we've already been here, look at the next index */
++                      path[i].p_idx--;
++              }
++
++              ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
++                              i, EXT_FIRST_INDEX(path[i].p_hdr),
++                              path[i].p_idx);
++              if (ext3_ext_more_to_truncate(inode, path + i)) {
++                      /* go to the next level */
++                      ext_debug(inode, "move to level %d (block %d)\n", i+1,
++                                      path[i].p_idx->e_leaf);
++                      memset(path + i + 1, 0, sizeof(*path));
++                      path[i+1].p_bh = sb_bread(inode->i_sb,
++                                                      path[i].p_idx->e_leaf);
++                      if (!path[i+1].p_bh) {
++                              /* should we reset i_size? */
++                              err = -EIO;
++                              break;
++                      }
++                      i++;
++              } else {
++                      /* we finished processing this index, go up */
++                      if (path[i].p_hdr->e_num == 0 && i > 0) {
++                              /* index is empty, remove it
++                               * handle must be already prepared by the
++                               * truncate_leaf()
++                               */
++                              err = ext3_ext_remove_index(handle, inode,
++                                                              path + i);
++                      }
++                      /* root level has p_bh == NULL, brelse() eats this */
++                      brelse(path[i].p_bh);
++                      i--;
++                      ext_debug(inode, "return to level %d\n", i);
++              }
++      }
++
++      /* TODO: flexible tree reduction should be here */
++      if (path->p_hdr->e_num == 0) {
++              /*
++               * truncate to zero freed all the tree
++               * so, we need to correct i_depth
++               */
++              EXT3_I(inode)->i_depth = 0;
++              path->p_hdr->e_max = 0;
++              ext3_mark_inode_dirty(handle, inode);
++      }
++
++      kfree(path);
++
++      /* In a multi-transaction truncate, we only make the final
++       * transaction synchronous */
++      if (IS_SYNC(inode))
++              handle->h_sync = 1;
++
++out_stop:
++      /*
++       * If this was a simple ftruncate(), and the file will remain alive
++       * then we need to clear up the orphan record which we created above.
++       * However, if this was a real unlink then we were called by
++       * ext3_delete_inode(), and we allow that function to clean up the
++       * orphan info for us.
++       */
++      if (inode->i_nlink)
++              ext3_orphan_del(handle, inode);
++
++      up(&EXT3_I(inode)->i_ext_sem);
++      ext3_journal_stop(handle, inode);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate 'num' new blocks for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      int depth = ei->i_depth + 1;
++      int needed;
++      
++      /*
++       * the worst case we're expecting is creation of the
++       * new root (growing in depth) with index splitting
++       * for splitting we have to consider depth + 1 because
++       * previous growing could increase it
++       */
++
++      /*
++       * growing in depth:
++       * block allocation + new root + old root
++       */
++      needed = EXT3_ALLOC_NEEDED + 2;
++
++      /* index split. we may need:
++       *   allocate intermediate indexes and new leaf
++       *   change two blocks at each level, but root
++       *   modify root block (inode)
++       */
++      needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++      /* caller wants to allocate num blocks */
++      needed *= num;
++      
++#ifdef CONFIG_QUOTA
++      /*
++       * FIXME: real calculation should be here
++       * it depends on blockmap format of quota file
++       */
++      needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return needed;
++}
++
++/*
++ * called at mount time: sets up the lock guarding the extent usage stats
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++      /*
++       * further initialization could go here
++       */
++
++      if (test_opt(sb, EXTENTS))
++              printk("EXT3-fs: file extents enabled\n");
++      spin_lock_init(&EXT3_SB(sb)->s_ext_lock);
++}
++
++/*
++ * called at umount time: prints the extent statistics gathered on truncate
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++      /* show collected stats; skipped when either divisor is zero */
++      if (sbi->s_ext_count && sbi->s_ext_extents)
++              printk("EXT3-fs: min depth - %d, max depth - %d, "
++                              "ave. depth - %d, ave. blocks/extent - %d\n",
++                              sbi->s_ext_mindepth,
++                              sbi->s_ext_maxdepth,
++                              sbi->s_ext_sum / sbi->s_ext_count,
++                              sbi->s_ext_blocks / sbi->s_ext_extents);
++}
++
+diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c
+--- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-extents   2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-08-25 21:12:14.000000000 +0400
+@@ -571,6 +571,10 @@ repeat:
+       ei->i_prealloc_count = 0;
+ #endif
+       ei->i_block_group = i;
++      if (test_opt(sb, EXTENTS))
++              EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++      ei->i_depth = 0;
++      sema_init(&ei->i_ext_sem, 1);
+       if (ei->i_flags & EXT3_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c
+--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-extents    2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c  2003-08-25 20:09:59.000000000 +0400
+@@ -842,6 +842,15 @@ changed:
+       goto reread;
+ }
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++              struct buffer_head *bh, int create, int extend_disksize)
++{
++      if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++              return ext3_ext_get_block(handle, inode, block, bh, create, extend_disksize);
++      return ext3_get_block_handle(handle, inode, block, bh, create, extend_disksize);
++}
++
+ /*
+  * The BKL is not held on entry here.
+  */
+@@ -855,7 +864,7 @@ static int ext3_get_block(struct inode *
+               handle = ext3_journal_current_handle();
+               J_ASSERT(handle != 0);
+       }
+-      ret = ext3_get_block_handle(handle, inode, iblock,
++      ret = ext3_get_block_wrap(handle, inode, iblock,
+                               bh_result, create, 1);
+       return ret;
+ }
+@@ -882,7 +891,7 @@ ext3_direct_io_get_block(struct inode *i
+               }
+       }
+       if (ret == 0)
+-              ret = ext3_get_block_handle(handle, inode, iblock,
++              ret = ext3_get_block_wrap(handle, inode, iblock,
+                                       bh_result, create, 0);
+       if (ret == 0)
+               bh_result->b_size = (1 << inode->i_blkbits);
+@@ -904,7 +913,7 @@ struct buffer_head *ext3_getblk(handle_t
+       dummy.b_state = 0;
+       dummy.b_blocknr = -1000;
+       buffer_trace_init(&dummy.b_history);
+-      *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++      *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+       if (!*errp && buffer_mapped(&dummy)) {
+               struct buffer_head *bh;
+               bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1520,7 +1529,7 @@ ext3_block_truncate_page_prepare(struct 
+  * This required during truncate. We need to physically zero the tail end
+  * of that block so it doesn't yield old data if the file is later grown.
+  */
+-static int ext3_block_truncate_page(handle_t *handle,
++int ext3_block_truncate_page(handle_t *handle,
+                                   struct address_space *mapping, loff_t from,
+                                   struct page *page, unsigned blocksize)
+ {
+@@ -2040,6 +2049,9 @@ void ext3_truncate(struct inode * inode)
+        */
+       ei->i_disksize = inode->i_size;
++      if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++              return ext3_ext_truncate(inode);
++
+       /*
+        * From here we block out all ext3_get_block() callers who want to
+        * modify the block allocation tree.
+@@ -2436,6 +2448,8 @@ void ext3_read_inode(struct inode * inod
+       ei->i_prealloc_count = 0;
+ #endif
+       ei->i_block_group = iloc.block_group;
++      ei->i_depth = raw_inode->osd2.linux2.l_i_depth;
++      sema_init(&ei->i_ext_sem, 1);
+       /*
+        * NOTE! The in-memory inode i_data array is in little-endian order
+@@ -2556,6 +2570,7 @@ static int ext3_do_update_inode(handle_t
+               raw_inode->i_fsize = 0;
+       }
+ #endif
++      raw_inode->osd2.linux2.l_i_depth = ei->i_depth;
+       raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+               raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+@@ -2759,6 +2774,9 @@ int ext3_writepage_trans_blocks(struct i
+       int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+       int ret;
+       
++      if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++              return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+       if (ext3_should_journal_data(inode))
+               ret = 3 * (bpp + indirects) + 2;
+       else
+diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile
+--- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents   2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-08-25 20:09:59.000000000 +0400
+@@ -12,7 +12,8 @@ O_TARGET := ext3.o
+ export-objs :=        ext3-exports.o
+ obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o
++              ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o \
++              extents.o
+ obj-m    := $(O_TARGET)
+ include $(TOPDIR)/Rules.make
+diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c
+--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-extents    2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c  2003-08-25 20:09:59.000000000 +0400
+@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_ext_release(sb);
+       ext3_stop_delete_thread(sbi);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+@@ -741,6 +742,12 @@ static int parse_options (char * options
+               else
+ #endif
++              if (!strcmp (this_char, "extents"))
++                      set_opt (sbi->s_mount_opt, EXTENTS);
++              else
++              if (!strcmp (this_char, "extdebug"))
++                      set_opt (sbi->s_mount_opt, EXTDEBUG);
++              else
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -1711,6 +1718,8 @@ static int ext3_create_journal(struct su
+       /* Make sure we flush the recovery flag to disk. */
+       ext3_commit_super(sb, es, 1);
++      ext3_ext_init(sb);
++
+       return 0;
+ }
+diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h
+--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-extents    2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h  2003-08-25 21:12:14.000000000 +0400
+@@ -183,6 +183,7 @@ struct ext3_group_desc
+ #define EXT3_IMAGIC_FL                        0x00002000 /* AFS directory */
+ #define EXT3_JOURNAL_DATA_FL          0x00004000 /* file data should be journaled */
+ #define EXT3_RESERVED_FL              0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL                       0x00080000 /* Inode uses extents */
+ #define EXT3_FL_USER_VISIBLE          0x00005FFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE               0x000000FF /* User modifiable flags */
+@@ -243,7 +244,7 @@ struct ext3_inode {
+               struct {
+                       __u8    l_i_frag;       /* Fragment number */
+                       __u8    l_i_fsize;      /* Fragment size */
+-                      __u16   i_pad1;
++                      __u16   l_i_depth;
+                       __u16   l_i_uid_high;   /* these 2 fields    */
+                       __u16   l_i_gid_high;   /* were reserved2[0] */
+                       __u32   l_i_reserved2;
+@@ -324,6 +325,8 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN              0x8000  /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV               0x10000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
++#define EXT3_MOUNT_EXTENTS            0x40000 /* Extents support */
++#define EXT3_MOUNT_EXTDEBUG           0x80000 /* Extents debug */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -663,6 +666,12 @@ extern void ext3_discard_prealloc (struc
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+ extern void ext3_truncate (struct inode *);
++extern int ext3_block_truncate_page(handle_t *handle,
++                                  struct address_space *mapping, loff_t from,
++                                  struct page *page, unsigned blocksize);
++extern int ext3_forget(handle_t *handle, int is_metadata,
++                     struct inode *inode, struct buffer_head *bh,
++                     int blocknr);
+ #ifdef EXT3_DELETE_THREAD
+ extern void ext3_truncate_thread(struct inode *inode);
+ #endif
+@@ -722,6 +731,13 @@ extern struct inode_operations ext3_dir_
+ /* symlink.c */
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++                              struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
+ #endif        /* __KERNEL__ */
+diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h
+--- linux-2.4.18-chaos/include/linux/ext3_fs_i.h~ext3-extents  2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_i.h        2003-08-25 20:09:59.000000000 +0400
+@@ -73,6 +73,10 @@ struct ext3_inode_info {
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
++
++      /* extents-related data */
++      struct semaphore i_ext_sem;
++      __u16 i_depth;
+ };
+ #endif        /* _LINUX_EXT3_FS_I */
+diff -puN include/linux/ext3_fs_sb.h~ext3-extents include/linux/ext3_fs_sb.h
+--- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400
++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h       2003-08-25 20:09:59.000000000 +0400
+@@ -84,6 +84,16 @@ struct ext3_sb_info {
+       wait_queue_head_t s_delete_thread_queue;
+       wait_queue_head_t s_delete_waiter_queue;
+ #endif
++
++      /* extents */
++      int s_ext_debug;
++      int s_ext_mindepth;
++      int s_ext_maxdepth;
++      int s_ext_sum;
++      int s_ext_count;
++      spinlock_t s_ext_lock;
++      int s_ext_extents;
++      int s_ext_blocks;
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..c12e397
--- /dev/null
@@ -0,0 +1,291 @@
+ fs/ext3/ialloc.c            |    5 +++--
+ fs/ext3/inode.c             |    2 +-
+ fs/ext3/namei.c             |   38 ++++++++++++++++++++++++++++++++++----
+ include/asm-alpha/fcntl.h   |    1 +
+ include/asm-arm/fcntl.h     |    1 +
+ include/asm-cris/fcntl.h    |    1 +
+ include/asm-i386/fcntl.h    |    1 +
+ include/asm-ia64/fcntl.h    |    1 +
+ include/asm-m68k/fcntl.h    |    1 +
+ include/asm-mips/fcntl.h    |    1 +
+ include/asm-mips64/fcntl.h  |    1 +
+ include/asm-parisc/fcntl.h  |    1 +
+ include/asm-ppc/fcntl.h     |    1 +
+ include/asm-s390/fcntl.h    |    1 +
+ include/asm-s390x/fcntl.h   |    1 +
+ include/asm-sh/fcntl.h      |    1 +
+ include/asm-sparc/fcntl.h   |    1 +
+ include/asm-sparc64/fcntl.h |    1 +
+ include/linux/ext3_fs.h     |    2 +-
+ 19 files changed, 54 insertions(+), 8 deletions(-)
+
+--- linux-2.4.18/fs/ext3/ialloc.c~ext3-extents-oflag-2.4.18-chaos      2003-09-08 23:12:48.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/ialloc.c       2003-09-08 23:12:56.000000000 +0400
+@@ -331,7 +331,8 @@ int ext3_itable_block_used(struct super_
+  */
+ struct inode * ext3_new_inode (handle_t *handle,
+                               const struct inode * dir, int mode,
+-                              unsigned long goal)
++                              unsigned long goal,
++                              struct lookup_intent *it)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh;
+@@ -573,7 +574,7 @@ repeat:
+       ei->i_prealloc_count = 0;
+ #endif
+       ei->i_block_group = i;
+-      if (test_opt(sb, EXTENTS))
++      if (test_opt(sb, EXTENTS) && it && (it->it_flags & O_EXTENTS))
+               EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
+       ei->i_depth = 0;
+       sema_init(&ei->i_ext_sem, 1);
+--- linux-2.4.18/fs/ext3/namei.c~ext3-extents-oflag-2.4.18-chaos       2003-09-08 23:12:28.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/namei.c        2003-09-08 23:12:56.000000000 +0400
+@@ -1225,7 +1225,36 @@ static int ext3_create (struct inode * d
+               handle->h_sync = 1;
+       inode = ext3_new_inode (handle, dir, mode,
+-                              (unsigned long)dentry->d_fsdata);
++                              (unsigned long)dentry->d_fsdata, NULL);
++      err = PTR_ERR(inode);
++      if (!IS_ERR(inode)) {
++              inode->i_op = &ext3_file_inode_operations;
++              inode->i_fop = &ext3_file_operations;
++              inode->i_mapping->a_ops = &ext3_aops;
++              err = ext3_add_nondir(handle, dentry, inode);
++              ext3_mark_inode_dirty(handle, inode);
++      }
++      ext3_journal_stop(handle, dir);
++      return err;
++}
++
++static int ext3_create_it (struct inode * dir, struct dentry * dentry, int mode,
++                              struct lookup_intent *it)
++{
++      handle_t *handle; 
++      struct inode * inode;
++      int err;
++
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      if (IS_SYNC(dir))
++              handle->h_sync = 1;
++
++      inode = ext3_new_inode (handle, dir, mode,
++                              (unsigned long)dentry->d_fsdata, it);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &ext3_file_inode_operations;
+@@ -1254,7 +1283,7 @@ static int ext3_mknod (struct inode * di
+               handle->h_sync = 1;
+       inode = ext3_new_inode (handle, dir, mode,
+-                              (unsigned long)dentry->d_fsdata);
++                              (unsigned long)dentry->d_fsdata, NULL);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, mode, rdev);
+@@ -1285,7 +1314,7 @@ static int ext3_mkdir(struct inode * dir
+               handle->h_sync = 1;
+       inode = ext3_new_inode (handle, dir, S_IFDIR | mode,
+-                              (unsigned long)dentry->d_fsdata);
++                              (unsigned long)dentry->d_fsdata, NULL);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1678,7 +1707,7 @@ static int ext3_symlink (struct inode * 
+               handle->h_sync = 1;
+       inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO,
+-                              (unsigned long)dentry->d_fsdata);
++                              (unsigned long)dentry->d_fsdata, NULL);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1882,6 +1911,7 @@ end_rename:
+  * directories can handle most operations...
+  */
+ struct inode_operations ext3_dir_inode_operations = {
++      create_it:      ext3_create_it,         /* BKL held */
+       create:         ext3_create,            /* BKL held */
+       lookup:         ext3_lookup,            /* BKL held */
+       link:           ext3_link,              /* BKL held */
+--- linux-2.4.18/include/asm-alpha/fcntl.h~ext3-extents-oflag-2.4.18-chaos     2003-07-28 17:52:07.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-alpha/fcntl.h      2003-09-08 23:12:56.000000000 +0400
+@@ -22,6 +22,7 @@
+ #define O_LARGEFILE   0400000 /* will be set by the kernel on every open */
+ #define O_ATOMICLOOKUP        01000000 /* do atomic file lookup */
+ #define O_DIRECT      02000000 /* direct disk access - should check with OSF/1 */
++#define O_EXTENTS     04000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-arm/fcntl.h~ext3-extents-oflag-2.4.18-chaos       2003-07-28 17:52:07.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-arm/fcntl.h        2003-09-08 23:12:56.000000000 +0400
+@@ -21,6 +21,7 @@
+ #define O_DIRECT      0200000 /* direct disk access hint - currently ignored */
+ #define O_LARGEFILE   0400000
+ #define O_ATOMICLOOKUP 01000000
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-cris/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2001-02-09 03:32:44.000000000 +0300
++++ linux-2.4.18-alexey/include/asm-cris/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -22,6 +22,7 @@
+ #define O_LARGEFILE   0100000
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
++#define O_EXTENTS     01000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get f_flags */
+--- linux-2.4.18/include/asm-i386/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2003-07-28 17:52:09.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-i386/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -21,6 +21,7 @@
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
+ #define O_ATOMICLOOKUP        01000000 /* do atomic file lookup */
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-ia64/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2003-07-28 17:52:09.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-ia64/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -29,6 +29,7 @@
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
+ #define O_ATOMICLOOKUP  01000000 /* do atomic file lookup */
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-m68k/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2000-11-28 05:00:49.000000000 +0300
++++ linux-2.4.18-alexey/include/asm-m68k/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -20,6 +20,7 @@
+ #define O_NOFOLLOW    0100000 /* don't follow links */
+ #define O_DIRECT      0200000 /* direct disk access hint - currently ignored */
+ #define O_LARGEFILE   0400000
++#define O_EXTENTS     01000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-mips64/fcntl.h~ext3-extents-oflag-2.4.18-chaos    2003-07-28 17:52:15.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-mips64/fcntl.h     2003-09-08 23:12:56.000000000 +0400
+@@ -27,6 +27,7 @@
+ #define O_DIRECTORY   0x10000 /* must be a directory */
+ #define O_NOFOLLOW    0x20000 /* don't follow links */
+ #define O_ATOMICLOOKUP        0x40000
++#define O_EXTENTS     0x80000 /* create file with extents if possible */
+ #define O_NDELAY      O_NONBLOCK
+--- linux-2.4.18/include/asm-mips/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2003-07-28 17:52:14.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-mips/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -27,6 +27,7 @@
+ #define O_DIRECTORY   0x10000 /* must be a directory */
+ #define O_NOFOLLOW    0x20000 /* don't follow links */
+ #define O_ATOMICLOOKUP        0x40000
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define O_NDELAY      O_NONBLOCK
+--- linux-2.4.18/include/asm-parisc/fcntl.h~ext3-extents-oflag-2.4.18-chaos    2000-12-05 23:29:39.000000000 +0300
++++ linux-2.4.18-alexey/include/asm-parisc/fcntl.h     2003-09-08 23:12:56.000000000 +0400
+@@ -19,6 +19,7 @@
+ #define O_NOCTTY      00400000 /* not fcntl */
+ #define O_DSYNC               01000000 /* HPUX only */
+ #define O_RSYNC               02000000 /* HPUX only */
++#define O_EXTENTS     04000000 /* create file with extents if possible */
+ #define FASYNC                00020000 /* fcntl, for BSD compatibility */
+ #define O_DIRECT      00040000 /* direct disk access hint - currently ignored */
+--- linux-2.4.18/include/asm-ppc/fcntl.h~ext3-extents-oflag-2.4.18-chaos       2003-07-28 17:52:15.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-ppc/fcntl.h        2003-09-08 23:12:56.000000000 +0400
+@@ -24,6 +24,7 @@
+ #define O_LARGEFILE     0200000
+ #define O_DIRECT      0400000 /* direct disk access hint */
+ #define O_ATOMICLOOKUP 01000000       /* do atomic file lookup */
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-s390/fcntl.h~ext3-extents-oflag-2.4.18-chaos      2003-07-28 17:52:15.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-s390/fcntl.h       2003-09-08 23:12:56.000000000 +0400
+@@ -28,6 +28,7 @@
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
+ #define O_ATOMICLOOKUP        01000000 /* do atomic file lookup */
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-s390x/fcntl.h~ext3-extents-oflag-2.4.18-chaos     2003-07-28 17:52:15.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-s390x/fcntl.h      2003-09-08 23:12:56.000000000 +0400
+@@ -28,6 +28,7 @@
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
+ #define O_ATOMICLOOKUP 01000000       /* do atomic file lookup */
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-sh/fcntl.h~ext3-extents-oflag-2.4.18-chaos        2003-07-28 17:52:15.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-sh/fcntl.h 2003-09-08 23:12:56.000000000 +0400
+@@ -21,6 +21,7 @@
+ #define O_DIRECTORY   0200000 /* must be a directory */
+ #define O_NOFOLLOW    0400000 /* don't follow links */
+ #define O_ATOMICLOOKUP  01000000
++#define O_EXTENTS     02000000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/asm-sparc64/fcntl.h~ext3-extents-oflag-2.4.18-chaos   2003-07-28 17:52:16.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-sparc64/fcntl.h    2003-09-08 23:12:56.000000000 +0400
+@@ -22,6 +22,7 @@
+ #define O_LARGEFILE   0x40000
+ #define O_ATOMICLOOKUP        0x80000 /* do atomic file lookup */
+ #define O_DIRECT        0x100000 /* direct disk access hint */
++#define O_EXTENTS     0x200000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+--- linux-2.4.18/include/asm-sparc/fcntl.h~ext3-extents-oflag-2.4.18-chaos     2003-07-28 17:52:16.000000000 +0400
++++ linux-2.4.18-alexey/include/asm-sparc/fcntl.h      2003-09-08 23:12:56.000000000 +0400
+@@ -22,6 +22,7 @@
+ #define O_LARGEFILE   0x40000
+ #define O_ATOMICLOOKUP        0x80000 /* do atomic file lookup */
+ #define O_DIRECT        0x100000 /* direct disk access hint */
++#define O_EXTENTS     0x200000 /* create file with extents if possible */
+ #define F_DUPFD               0       /* dup */
+ #define F_GETFD               1       /* get close_on_exec */
+--- linux-2.4.18/include/linux/ext3_fs.h~ext3-extents-oflag-2.4.18-chaos       2003-09-08 23:12:48.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs.h        2003-09-08 23:12:56.000000000 +0400
+@@ -641,7 +641,7 @@ extern int ext3_sync_file (struct file *
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int,
+-                                    unsigned long);
++                                    unsigned long, struct lookup_intent *);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+--- linux-2.4.18/fs/ext3/inode.c~ext3-extents-oflag-2.4.18-chaos       2003-09-08 23:12:48.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/inode.c        2003-09-08 23:13:15.000000000 +0400
+@@ -2204,7 +2204,7 @@ void ext3_truncate_thread(struct inode *
+       if (IS_ERR(handle))
+               goto out_truncate;
+-      new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0);
++      new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0, 0);
+       if (IS_ERR(new_inode)) {
+               ext3_debug("truncate inode %lu directly (no new inodes)\n",
+                          old_inode->i_ino);
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch b/lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch
new file mode 100644 (file)
index 0000000..4695c4f
--- /dev/null
@@ -0,0 +1,76 @@
+ fs/ext3/inode.c |   52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/super.c |    3 +++
+ 2 files changed, 55 insertions(+)
+
+--- linux-2.6.0-test3/fs/ext3/inode.c~ext3-map_inode_page-2.6.0        2003-09-02 14:48:43.000000000 +0400
++++ linux-2.6.0-test3-alexey/fs/ext3/inode.c   2003-09-08 17:50:16.000000000 +0400
+@@ -3129,3 +3129,55 @@ int ext3_prep_san_write(struct inode *in
+               ret = ret2;
+       return ret;
+ }
++
++/*
++ * Fill blocks[] with the fs block numbers backing @page.  created[i] is 0
++ * for an already-mapped block, -1 for a hole, or 1 if allocated here (only
++ * when @create is set).  Returns 0, or the journal/allocation error code.
++ */
++int ext3_map_inode_page(struct inode *inode, struct page *page,
++                        unsigned long *blocks, int *created, int create)
++{
++        unsigned int blocks_per_page;
++        unsigned long iblock;
++        struct buffer_head dummy;
++        void *handle;
++        int i, rc = 0, failed = 0, needed_blocks;
++
++        blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
++        /* first fs block of the page == page index * blocks_per_page */
++        iblock = page->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
++        for (i = 0; i < blocks_per_page; i++, iblock++) {
++                blocks[i] = ext3_bmap(inode->i_mapping, iblock);
++                if (blocks[i] == 0) {
++                        failed++;
++                        created[i] = -1;
++                } else {
++                        created[i] = 0;
++                }
++        }
++
++        if (failed == 0 || create == 0)
++                return 0;
++        needed_blocks = ext3_writepage_trans_blocks(inode) * failed;
++        handle = ext3_journal_start(inode, needed_blocks);
++        if (IS_ERR(handle))
++                return PTR_ERR(handle);
++        iblock = page->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
++        for (i = 0; i < blocks_per_page; i++, iblock++) {
++                if (blocks[i] != 0)
++                        continue;
++                rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1);
++                if (rc) {
++                        printk(KERN_INFO "ext3_map_inode_page: error reading "
++                               "block %ld\n", iblock);
++                        goto out;
++                }
++                blocks[i] = dummy.b_blocknr;
++                created[i] = 1;
++        }
++ out:
++      ext3_journal_stop(handle);
++        return rc;
++}
++
+--- linux-2.6.0-test3/fs/ext3/super.c~ext3-map_inode_page-2.6.0        2003-09-02 14:48:43.000000000 +0400
++++ linux-2.6.0-test3-alexey/fs/ext3/super.c   2003-09-08 17:48:33.000000000 +0400
+@@ -2094,6 +2094,9 @@ static void __exit exit_ext3_fs(void)
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+                         int nblocks, loff_t newsize);
+ EXPORT_SYMBOL(ext3_prep_san_write);
++int ext3_map_inode_page(struct inode *inode, struct page *page,
++                        unsigned long *blocks, int *created, int create);
++EXPORT_SYMBOL(ext3_map_inode_page);   /* defined in fs/ext3/inode.c */
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch b/lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch
new file mode 100644 (file)
index 0000000..37a5d7a
--- /dev/null
@@ -0,0 +1,15 @@
+ fs/ext3/super.c |    1 -
+ 1 files changed, 1 deletion(-)
+
+--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-no-write-super-chaos       2003-08-24 21:34:53.000000000 +0400
++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c  2003-08-24 21:40:47.000000000 +0400
+@@ -1818,7 +1818,6 @@ void ext3_write_super (struct super_bloc
+       if (down_trylock(&sb->s_lock) == 0)
+               BUG();
+       sb->s_dirt = 0;
+-      log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+ }
+ static int ext3_sync_fs(struct super_block *sb)
+
+_
diff --git a/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..f0b7d7e
--- /dev/null
@@ -0,0 +1,197 @@
+
+Index: linux-2.4.20-rh/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.20-rh.orig/fs/ext3/inode.c       2003-09-04 18:01:41.000000000 +0800
++++ linux-2.4.20-rh/fs/ext3/inode.c    2003-09-04 18:18:54.000000000 +0800
+@@ -27,6 +27,7 @@
+ #include <linux/ext3_jbd.h>
+ #include <linux/jbd.h>
+ #include <linux/locks.h>
++#include <linux/iobuf.h>
+ #include <linux/smp_lock.h>
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+@@ -743,9 +744,9 @@
+  * The BKL may not be held on entry here.  Be sure to take it early.
+  */
+-static int ext3_get_block_handle(handle_t *handle, struct inode *inode, 
+-                               long iblock,
+-                               struct buffer_head *bh_result, int create)
++static int
++ext3_get_block_handle(handle_t *handle, struct inode *inode, long iblock,
++              struct buffer_head *bh_result, int create, int extend_disksize)
+ {
+       int err = -EIO;
+       int offsets[4];
+@@ -825,15 +826,18 @@
+       if (err)
+               goto cleanup;
+-      new_size = inode->i_size;
+-      /*
+-       * This is not racy against ext3_truncate's modification of i_disksize
+-       * because VM/VFS ensures that the file cannot be extended while
+-       * truncate is in progress.  It is racy between multiple parallel
+-       * instances of get_block, but we have the BKL.
+-       */
+-      if (new_size > inode->u.ext3_i.i_disksize)
+-              inode->u.ext3_i.i_disksize = new_size;
++      if (extend_disksize) {
++              /*
++               * This is not racy against ext3_truncate's modification of
++               * i_disksize because VM/VFS ensures that the file cannot be
++               * extended while truncate is in progress.  It is racy between
++               * multiple parallel instances of get_block, but we have BKL.
++               */
++              struct ext3_inode_info *ei = EXT3_I(inode);
++              new_size = inode->i_size;
++              if (new_size > ei->i_disksize)
++                      ei->i_disksize = new_size;
++      }
+       bh_result->b_state |= (1UL << BH_New);
+       goto got_it;
+@@ -861,7 +865,38 @@
+               handle = ext3_journal_current_handle();
+               J_ASSERT(handle != 0);
+       }
+-      ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
++      ret = ext3_get_block_handle(handle, inode, iblock,
++                              bh_result, create, 1);
++      return ret;
++}
++
++#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
++
++/* get_block callback passed to generic_direct_IO(): makes sure a running
++ * handle has credits, then maps @iblock without extending i_disksize.
++ */
++static int
++ext3_direct_io_get_block(struct inode *inode, long iblock,
++              struct buffer_head *bh_result, int create)
++{
++      handle_t *handle = journal_current_handle();
++      int ret = 0;
++
++      lock_kernel();
++      if (handle && handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
++              /* Getting low on buffer credits... */
++              ret = ext3_journal_extend(handle, DIO_CREDITS);
++              if (ret > 0) {
++                      /* Couldn't extend the transaction.  Start a new one */
++                      ret = ext3_journal_restart(handle, DIO_CREDITS);
++              }
++      }
++      if (ret == 0)
++              ret = ext3_get_block_handle(handle, inode, iblock,
++                                      bh_result, create, 0);
++      if (ret == 0)
++              bh_result->b_size = (1 << inode->i_blkbits);
++      unlock_kernel();
+       return ret;
+ }
+@@ -879,7 +914,7 @@
+       dummy.b_state = 0;
+       dummy.b_blocknr = -1000;
+       buffer_trace_init(&dummy.b_history);
+-      *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
++      *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
+       if (!*errp && buffer_mapped(&dummy)) {
+               struct buffer_head *bh;
+               bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1387,6 +1422,67 @@
+       return journal_try_to_free_buffers(journal, page, wait);
+ }
++/*
++ * direct_IO method of ext3_aops.  A write runs under one journal handle; if
++ * it would grow the file, the inode sits on the orphan list during the I/O.
++ */
++static int
++ext3_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
++              unsigned long blocknr, int blocksize)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      handle_t *handle = NULL;
++      int ret;
++      int orphan = 0;
++      loff_t offset = (loff_t)blocknr << inode->i_blkbits;    /* ugh */
++      ssize_t count = iobuf->length;                  /* ditto */
++
++      if (rw == WRITE) {
++              loff_t final_size = offset + count;
++              lock_kernel();
++              handle = ext3_journal_start(inode, DIO_CREDITS);
++              unlock_kernel();
++              if (IS_ERR(handle)) {
++                      ret = PTR_ERR(handle);
++                      goto out;
++              }
++              if (final_size > inode->i_size) {
++                      lock_kernel();
++                      ret = ext3_orphan_add(handle, inode);
++                      unlock_kernel();
++                      if (ret)
++                              goto out_stop;
++                      orphan = 1;
++                      ei->i_disksize = inode->i_size;
++              }
++      }
++
++      ret = generic_direct_IO(rw, inode, iobuf, blocknr,
++                              blocksize, ext3_direct_io_get_block);
++out_stop:
++      if (handle) {
++              int err;
++              lock_kernel();
++              if (orphan)
++                      ext3_orphan_del(handle, inode);
++              if (orphan && ret > 0) {
++                      loff_t end = offset + ret;
++                      if (end > inode->i_size) {
++                              ei->i_disksize = end;
++                              inode->i_size = end;
++                              err = ext3_mark_inode_dirty(handle, inode);
++                              if (!ret) /* NOTE(review): ret > 0 here, never true */
++                                      ret = err;
++                      }
++              }
++              err = ext3_journal_stop(handle, inode);
++              if (ret == 0)
++                      ret = err;
++              unlock_kernel();
++      }
++out:
++      return ret;
++}
+ struct address_space_operations ext3_aops = {
+       readpage:       ext3_readpage,          /* BKL not held.  Don't need */
+@@ -1397,6 +1493,7 @@
+       bmap:           ext3_bmap,              /* BKL held */
+       flushpage:      ext3_flushpage,         /* BKL not held.  Don't need */
+       releasepage:    ext3_releasepage,       /* BKL not held.  Don't need */
++      direct_IO:      ext3_direct_IO,         /* BKL not held.  Don't need */
+ };
+ /*
+@@ -2970,7 +3067,7 @@
+       /* alloc blocks one by one */
+       for (i = 0; i < nblocks; i++) {
+               ret = ext3_get_block_handle(handle, inode, blocks[i],
+-                                              &bh_tmp, 1);
++                                              &bh_tmp, 1, 1);
+               if (ret)
+                       break;
+@@ -3030,7 +3127,7 @@
+                 if (blocks[i] != 0)
+                         continue;
+-                rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1);
++                rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1);
+                 if (rc) {
+                         printk(KERN_INFO "ext3_map_inode_page: error reading "
+                                "block %ld\n", iblock);
diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..f8f514b
--- /dev/null
@@ -0,0 +1,1238 @@
+ fs/ext3/ialloc.c          |    3 
+ fs/ext3/inode.c           |    3 
+ fs/ext3/namei.c           |  582 +++++++++++++++++++++++++++++++++++++---------
+ fs/ext3/super.c           |   14 +
+ include/linux/ext3_fs.h   |    1 
+ include/linux/ext3_fs_i.h |    6 
+ 6 files changed, 500 insertions(+), 109 deletions(-)
+
+--- linux-2.4.18/fs/ext3/namei.c~ext3-pdirops-2.4.18-chaos     2003-09-01 14:58:06.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/namei.c        2003-09-02 11:46:15.000000000 +0400
+@@ -52,6 +52,9 @@ static struct buffer_head *ext3_append(h
+ {
+       struct buffer_head *bh;
++      /* with parallel dir operations all appends
++       * have to be serialized -bzzz */
++      down(&EXT3_I(inode)->i_append_sem);
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+@@ -59,6 +62,8 @@ static struct buffer_head *ext3_append(h
+               EXT3_I(inode)->i_disksize = inode->i_size;
+               ext3_journal_get_write_access(handle,bh);
+       }
++      up(&EXT3_I(inode)->i_append_sem);
++      
+       return bh;
+ }
+@@ -135,6 +140,8 @@ struct dx_frame
+       struct buffer_head *bh;
+       struct dx_entry *entries;
+       struct dx_entry *at;
++      unsigned long leaf;
++      unsigned int curidx;
+ };
+ struct dx_map_entry
+@@ -143,6 +150,30 @@ struct dx_map_entry
+       u32 offs;
+ };
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock     25      /* bit number used in bh->b_state */
++/* Busy-wait until BH_DXLock is taken in bh->b_state; empty unless SMP. */
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++#ifdef CONFIG_SMP
++        while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++                while (test_bit(BH_DXLock, &bh->b_state))
++                        cpu_relax();
++        }
++#endif
++}
++/* Clear BH_DXLock in bh->b_state after a barrier; empty unless SMP. */
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++        smp_mb__before_clear_bit();
++        clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -154,7 +185,7 @@ static void dx_set_count (struct dx_entr
+ static void dx_set_limit (struct dx_entry *entries, unsigned value);
+ static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+ static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
++static struct dx_frame *dx_probe(struct qstr *name,
+                                struct inode *dir,
+                                struct dx_hash_info *hinfo,
+                                struct dx_frame *frame,
+@@ -166,15 +197,18 @@ static void dx_sort_map(struct dx_map_en
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+                                struct dx_frame *frames, int *err,
+                                __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+-                     struct ext3_dir_entry_2 **res_dir, int *err);
++                     struct ext3_dir_entry_2 **res_dir, int *err,
++                     int rwlock, void **lock);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
++static inline void *ext3_lock_htree(struct inode *, unsigned long, int);
++static inline void ext3_unlock_htree(struct inode *, void *);
+ /*
+  * Future: use high four bits of block for coalesce-on-delete flags
+@@ -307,6 +341,94 @@ struct stats dx_show_entries(struct dx_h
+ #endif /* DX_DEBUG */
+ /*
++ * dx_find_position
++ *
++ * search position of specified hash in index
++ *
++ */
++
++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
++{
++      struct dx_entry *p, *q, *m;
++      int count;
++      /* Binary search of entries[1..count-1]; entries[0] is never compared. */
++      count = dx_get_count(entries);
++      p = entries + 1;
++      q = entries + count - 1;
++      while (p <= q)
++      {
++              m = p + (q - p)/2;
++              if (dx_get_hash(m) > hash)
++                      q = m - 1;
++              else
++                      p = m + 1;
++      }
++      return p - 1;   /* last entry with hash <= @hash, else entries[0] */
++}
++
++/* Look @hash up again in frame->entries under the bh lock: returns 1 if it
++ * still resolves to frame->leaf (path unchanged), 0 otherwise.
++ */
++int dx_check_path(struct dx_frame *frame, u32 hash)
++{
++      struct dx_entry *p;
++      int ret = 1;
++
++      dx_lock_bh(frame->bh);
++      p = dx_find_position(frame->entries, hash);
++      if (frame->leaf != dx_get_block(p))
++              ret = 0;
++      dx_unlock_bh(frame->bh);
++      
++      return ret;
++}
++
++/*
++ * Re-validate the recorded leaf of frames[0] and, if present, frames[1]
++ * for hinfo->hash.  Returns 0 if either changed, 1 if both still match.
++ */
++static int
++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
++{
++      struct dx_entry *p;
++      struct dx_frame *frame = frames;
++      u32 leaf;
++
++      /* check first level */
++      dx_lock_bh(frame->bh);
++      p = dx_find_position(frame->entries, hinfo->hash);
++      leaf = dx_get_block(p);
++      dx_unlock_bh(frame->bh);
++      
++      if (leaf != frame->leaf) 
++              return 0;
++      
++      /* is there 2nd level? (frames[1].bh == NULL means single level) */
++      frame++;
++      if (frame->bh == NULL)
++              return 1;
++
++      /* check second level */
++      dx_lock_bh(frame->bh);
++
++      /* 1st level may have changed meanwhile; re-check under 2nd-level lock */
++      if (!dx_check_path(frames, hinfo->hash)) {
++              /* path changed */
++              dx_unlock_bh(frame->bh);
++              return 0;
++      }
++
++      p = dx_find_position(frame->entries, hinfo->hash);
++      leaf = dx_get_block(p);
++      dx_unlock_bh(frame->bh);
++      
++      if (leaf != frame->leaf)
++              return 0;
++
++      return 1;
++}
++
++/*
+  * Probe for a directory leaf block to search.
+  *
+  * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+@@ -316,19 +438,20 @@ struct stats dx_show_entries(struct dx_h
+  * back to userspace.
+  */
+ static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
++dx_probe(struct qstr *name, struct inode *dir,
+        struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+ {
+-      unsigned count, indirect;
+-      struct dx_entry *at, *entries, *p, *q, *m;
++      unsigned indirect;
++      struct dx_entry *at, *entries;
+       struct dx_root *root;
+       struct buffer_head *bh;
+       struct dx_frame *frame = frame_in;
+       u32 hash;
++      unsigned int curidx;
+       frame->bh = NULL;
+-      if (dentry)
+-              dir = dentry->d_parent->d_inode;
++      frame[1].bh = NULL;
++
+       if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+               goto fail;
+       root = (struct dx_root *) bh->b_data;
+@@ -344,8 +467,8 @@ dx_probe(struct dentry *dentry, struct i
+       }
+       hinfo->hash_version = root->info.hash_version;
+       hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
+-      if (dentry)
+-              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++      if (name)
++              ext3fs_dirhash(name->name, name->len, hinfo);
+       hash = hinfo->hash;
+       if (root->info.unused_flags & 1) {
+@@ -357,7 +480,19 @@ dx_probe(struct dentry *dentry, struct i
+               goto fail;
+       }
++repeat:
++      curidx = 0;
++      entries = (struct dx_entry *) (((char *)&root->info) +
++                                     root->info.info_length);
++      assert(dx_get_limit(entries) == dx_root_limit(dir,
++                                                    root->info.info_length));
++      dxtrace (printk("Look up %x", hash));
++      dx_lock_bh(bh);
++      /* indirect must be initialized under bh lock because
++       * 2nd level creation procedure may change it and dx_probe()
++       * will suggest htree is still single-level -bzzz */
+       if ((indirect = root->info.indirect_levels) > 1) {
++              dx_unlock_bh(bh);
+               ext3_warning(dir->i_sb, __FUNCTION__,
+                            "Unimplemented inode hash depth: %#06x",
+                            root->info.indirect_levels);
+@@ -365,56 +500,46 @@ dx_probe(struct dentry *dentry, struct i
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+       }
+-
+-      entries = (struct dx_entry *) (((char *)&root->info) +
+-                                     root->info.info_length);
+-      assert(dx_get_limit(entries) == dx_root_limit(dir,
+-                                                    root->info.info_length));
+-      dxtrace (printk("Look up %x", hash));
++      
+       while (1)
+       {
+-              count = dx_get_count(entries);
+-              assert (count && count <= dx_get_limit(entries));
+-              p = entries + 1;
+-              q = entries + count - 1;
+-              while (p <= q)
+-              {
+-                      m = p + (q - p)/2;
+-                      dxtrace(printk("."));
+-                      if (dx_get_hash(m) > hash)
+-                              q = m - 1;
+-                      else
+-                              p = m + 1;
+-              }
+-
+-              if (0) // linear search cross check
+-              {
+-                      unsigned n = count - 1;
+-                      at = entries;
+-                      while (n--)
+-                      {
+-                              dxtrace(printk(","));
+-                              if (dx_get_hash(++at) > hash)
+-                              {
+-                                      at--;
+-                                      break;
+-                              }
+-                      }
+-                      assert (at == p - 1);
+-              }
+-
+-              at = p - 1;
+-              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++              at = dx_find_position(entries, hinfo->hash);
++              dxtrace(printk(" %x->%u\n",
++                              at == entries? 0: dx_get_hash(at),
++                              dx_get_block(at)));
+               frame->bh = bh;
+               frame->entries = entries;
+               frame->at = at;
+-              if (!indirect--) return frame;
+-              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++              frame->curidx = curidx;
++              frame->leaf = dx_get_block(at);
++              if (!indirect--) {
++                      dx_unlock_bh(bh);
++                      return frame;
++              }
++              
++              /* step into next htree level */
++              curidx = dx_get_block(at);
++              dx_unlock_bh(bh);
++              if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
+                       goto fail2;
++              
++              dx_lock_bh(bh);
++              /* splitting may change root index block and move
++               * hash we're looking for into another index block
++               * so, we have to check this situation and repeat
++               * from beginning if path got changed -bzzz */
++              if (!dx_check_path(frame, hash)) {
++                      dx_unlock_bh(bh);
++                      bh = frame->bh;
++                      indirect++;
++                      goto repeat;
++              }
++              
+               at = entries = ((struct dx_node *) bh->b_data)->entries;
+               assert (dx_get_limit(entries) == dx_node_limit (dir));
+               frame++;
+       }
++      dx_unlock_bh(bh);
+ fail2:
+       while (frame >= frame_in) {
+               brelse(frame->bh);
+@@ -428,8 +553,7 @@ static void dx_release (struct dx_frame 
+ {
+       if (frames[0].bh == NULL)
+               return;
+-
+-      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++      if (frames[1].bh != NULL)
+               brelse(frames[1].bh);
+       brelse(frames[0].bh);
+ }
+@@ -471,8 +595,10 @@ static int ext3_htree_next_block(struct 
+        * nodes need to be read.
+        */
+       while (1) {
+-              if (++(p->at) < p->entries + dx_get_count(p->entries))
++              if (++(p->at) < p->entries + dx_get_count(p->entries)) {
++                      p->leaf = dx_get_block(p->at);
+                       break;
++              }
+               if (p == frames)
+                       return 0;
+               num_frames++;
+@@ -498,13 +624,17 @@ static int ext3_htree_next_block(struct 
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+-                                    0, err)))
++              u32 idx;
++              
++              idx = p->leaf = dx_get_block(p->at);
++              if (!(bh = ext3_bread(NULL, dir, idx, 0, err)))
+                       return -1; /* Failure */
+               p++;
+               brelse (p->bh);
+               p->bh = bh;
+               p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++              p->curidx = idx;
++              p->leaf = dx_get_block(p->at);
+       }
+       return 1;
+ }
+@@ -544,7 +674,7 @@ int ext3_htree_fill_tree(struct file *di
+       dir = dir_file->f_dentry->d_inode;
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++      frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -626,7 +756,8 @@ static int dx_make_map (struct ext3_dir_
+                       count++;
+               }
+               /* XXX: do we need to check rec_len == 0 case? -Chris */
+-              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++              de = (struct ext3_dir_entry_2 *)((char*)de +
++                              le16_to_cpu(de->rec_len));
+       }
+       return count;
+ }
+@@ -659,7 +790,8 @@ static void dx_sort_map (struct dx_map_e
+       } while(more);
+ }
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
++                      u32 hash, u32 block, u32 idx)
+ {
+       struct dx_entry *entries = frame->entries;
+       struct dx_entry *old = frame->at, *new = old + 1;
+@@ -671,6 +803,7 @@ static void dx_insert_block(struct dx_fr
+       dx_set_hash(new, hash);
+       dx_set_block(new, block);
+       dx_set_count(entries, count + 1);
++      
+ }
+ #endif
+@@ -753,7 +886,8 @@ static int inline search_dirblock(struct
+       
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+-                                      struct ext3_dir_entry_2 ** res_dir)
++                                      struct ext3_dir_entry_2 ** res_dir,
++                                      int rwlock, void **lock)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh_use[NAMEI_RA_SIZE];
+@@ -769,6 +903,7 @@ static struct buffer_head * ext3_find_en
+       int namelen;
+       const u8 *name;
+       unsigned blocksize;
++      int do_not_use_dx = 0;
+       *res_dir = NULL;
+       sb = dir->i_sb;
+@@ -777,9 +912,10 @@ static struct buffer_head * ext3_find_en
+       name = dentry->d_name.name;
+       if (namelen > EXT3_NAME_LEN)
+               return NULL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+       if (is_dx(dir)) {
+-              bh = ext3_dx_find_entry(dentry, res_dir, &err);
++              bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
+               /*
+                * On success, or if the error was file not found,
+                * return.  Otherwise, fall back to doing a search the
+@@ -788,8 +924,14 @@ static struct buffer_head * ext3_find_en
+               if (bh || (err != ERR_BAD_DX_DIR))
+                       return bh;
+               dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++              do_not_use_dx = 1;
+       }
+ #endif
++      *lock = ext3_lock_htree(dir, 0, rwlock);
++      if (is_dx(dir) && !do_not_use_dx) {
++              ext3_unlock_htree(dir, *lock);
++              goto repeat;
++      }
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       start = EXT3_I(dir)->i_dir_start_lookup;
+       if (start >= nblocks)
+@@ -861,12 +1003,17 @@ cleanup_and_exit:
+       /* Clean up the read-ahead blocks */
+       for (; ra_ptr < ra_max; ra_ptr++)
+               brelse (bh_use[ra_ptr]);
++      if (!ret) {
++              ext3_unlock_htree(dir, *lock);
++              *lock = NULL;
++      }
+       return ret;
+ }
+ #ifdef CONFIG_EXT3_INDEX
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+-                     struct ext3_dir_entry_2 **res_dir, int *err)
++                     struct ext3_dir_entry_2 **res_dir, int *err,
++                     int rwlock, void **lock)
+ {
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+@@ -881,11 +1028,22 @@ static struct buffer_head * ext3_dx_find
+       struct inode *dir = dentry->d_parent->d_inode;
+       
+       sb = dir->i_sb;
+-      if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++repeat:
++      if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
+               return NULL;
++      
++      *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
++      /* while we were locking it, the leaf we just found may have been
++       * split, so we may need another leaf. check this */
++      if (!dx_check_full_path(frames, &hinfo)) {
++              ext3_unlock_htree(dir, *lock);
++              dx_release(frames);
++              goto repeat;
++      }
++
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(frame->at);
++              block = frame->leaf;
+               if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -919,6 +1077,8 @@ static struct buffer_head * ext3_dx_find
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
++      ext3_unlock_htree(dir, *lock);
++      *lock = NULL;
+       dx_release (frames);
+       return NULL;
+ }
+@@ -931,6 +1091,7 @@ static struct dentry *ext3_lookup(struct
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
+       struct dentry *alternate = NULL;
++      void *lock = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+@@ -938,10 +1099,11 @@ static struct dentry *ext3_lookup(struct
+       if (ext3_check_for_iopen(dir, dentry))
+               return NULL;
+-      bh = ext3_find_entry(dentry, &de);
++      bh = ext3_find_entry(dentry, &de, 0, &lock);
+       inode = NULL;
+       if (bh) {
+               unsigned long ino = le32_to_cpu(de->inode);
++              ext3_unlock_htree(dir, lock);
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+@@ -984,7 +1146,8 @@ dx_move_dirents(char *from, char *to, st
+       unsigned rec_len = 0;
+       while (count--) {
+-              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++              struct ext3_dir_entry_2 *de =
++                      (struct ext3_dir_entry_2 *) (from + map->offs);
+               rec_len = EXT3_DIR_REC_LEN(de->name_len);
+               memcpy (to, de, rec_len);
+               ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
+@@ -997,7 +1160,8 @@ dx_move_dirents(char *from, char *to, st
+ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+ {
+-      struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++      struct ext3_dir_entry_2 *next, *to, *prev;
++      struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
+       unsigned rec_len = 0;
+       prev = to = de;
+@@ -1019,7 +1183,8 @@ static struct ext3_dir_entry_2* dx_pack_
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+                       struct buffer_head **bh,struct dx_frame *frame,
+-                      struct dx_hash_info *hinfo, int *error)
++                      struct dx_hash_info *hinfo, void **target,
++                      int *error)
+ {
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+@@ -1066,23 +1231,30 @@ static struct ext3_dir_entry_2 *do_split
+       hash2 = map[split].hash;
+       continued = hash2 == map[split - 1].hash;
+       dxtrace(printk("Split block %i at %x, %i/%i\n",
+-              dx_get_block(frame->at), hash2, split, count-split));
+-
++              frame->leaf, hash2, split, count-split));
++      
+       /* Fancy dance to stay within two buffers */
+       de2 = dx_move_dirents(data1, data2, map + split, count - split);
+       de = dx_pack_dirents(data1,blocksize);
+       de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+       de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
+       /* Which block gets the new entry? */
++      *target = NULL;
+       if (hinfo->hash >= hash2)
+       {
+               swap(*bh, bh2);
+               de = de2;
+-      }
+-      dx_insert_block (frame, hash2 + continued, newblock);
++
++              /* entry will be stored into new block
++               * we have to lock it before add_dirent_to_buf */
++              *target = ext3_lock_htree(dir, newblock, 1);
++      }
++      dx_lock_bh(frame->bh);
++      dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
++      dx_unlock_bh(frame->bh);
+       err = ext3_journal_dirty_metadata (handle, bh2);
+       if (err)
+               goto journal_error;
+@@ -1156,7 +1328,8 @@ static int add_dirent_to_buf(handle_t *h
+       nlen = EXT3_DIR_REC_LEN(de->name_len);
+       rlen = le16_to_cpu(de->rec_len);
+       if (de->inode) {
+-              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              struct ext3_dir_entry_2 *de1 =
++                      (struct ext3_dir_entry_2 *)((char *)de + nlen);
+               de1->rec_len = cpu_to_le16(rlen - nlen);
+               de->rec_len = cpu_to_le16(nlen);
+               de = de1;
+@@ -1214,7 +1387,8 @@ static int make_indexed_dir(handle_t *ha
+       unsigned        blocksize;
+       struct dx_hash_info hinfo;
+       u32             block;
+-              
++      void            *lock, *new_lock;
++
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1225,7 +1399,6 @@ static int make_indexed_dir(handle_t *ha
+       }
+       root = (struct dx_root *) bh->b_data;
+               
+-      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
+       bh2 = ext3_append (handle, dir, &block, &retval);
+       if (!(bh2)) {
+               brelse(bh);
+@@ -1233,6 +1406,8 @@ static int make_indexed_dir(handle_t *ha
+       }
+       data1 = bh2->b_data;
++      lock = ext3_lock_htree(dir, block, 1);
++
+       /* The 0th block becomes the root, move the dirents out */
+       de = (struct ext3_dir_entry_2 *) &root->info;
+       len = ((char *) root) + blocksize - (char *) de;
+@@ -1261,13 +1436,25 @@ static int make_indexed_dir(handle_t *ha
+       frame->entries = entries;
+       frame->at = entries;
+       frame->bh = bh;
++      frame->curidx = 0;
++      frame->leaf = 0;
++      frame[1].bh = NULL;
+       bh = bh2;
+-      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++      de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
+       dx_release (frames);
+       if (!(de))
+-              return retval;
++              goto cleanup;
++
++      retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++cleanup:
++      if (new_lock)
++              ext3_unlock_htree(dir, new_lock);
++      /* we mark directory indexed in order to
++       * avoid races while htree being created -bzzz */
++      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++      ext3_unlock_htree(dir, lock);
+-      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++      return retval;
+ }
+ #endif
+@@ -1296,11 +1483,13 @@ static int ext3_add_entry (handle_t *han
+       unsigned blocksize;
+       unsigned nlen, rlen;
+       u32 block, blocks;
++      void *lock;
+       sb = dir->i_sb;
+       blocksize = sb->s_blocksize;
+       if (!dentry->d_name.len)
+               return -EINVAL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+       if (is_dx(dir)) {
+               retval = ext3_dx_add_entry(handle, dentry, inode);
+@@ -1311,36 +1500,53 @@ static int ext3_add_entry (handle_t *han
+               ext3_mark_inode_dirty(handle, dir);
+       }
+ #endif
++      lock = ext3_lock_htree(dir, 0, 1);
++      if (is_dx(dir)) {
++              /* we got lock for block 0
++               * probably previous holder of the lock
++               * created htree -bzzz */
++              ext3_unlock_htree(dir, lock);
++              goto repeat;
++      }
++      
+       blocks = dir->i_size >> sb->s_blocksize_bits;
+       for (block = 0, offset = 0; block < blocks; block++) {
+               bh = ext3_bread(handle, dir, block, 0, &retval);
+-              if(!bh)
++              if(!bh) {
++                      ext3_unlock_htree(dir, lock);
+                       return retval;
++              }
+               retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
+-              if (retval != -ENOSPC)
++              if (retval != -ENOSPC) {
++                      ext3_unlock_htree(dir, lock);
+                       return retval;
++              }
+ #ifdef CONFIG_EXT3_INDEX
+               if (blocks == 1 && !dx_fallback &&
+-                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+-                      return make_indexed_dir(handle, dentry, inode, bh);
++                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
++                      retval = make_indexed_dir(handle, dentry, inode, bh);
++                      ext3_unlock_htree(dir, lock);
++                      return retval;
++              }
+ #endif
+               brelse(bh);
+       }
+       bh = ext3_append(handle, dir, &block, &retval);
+-      if (!bh)
++      if (!bh) {
++              ext3_unlock_htree(dir, lock);
+               return retval;
++      }
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       de->inode = 0;
+       de->rec_len = cpu_to_le16(rlen = blocksize);
+       nlen = 0;
+-      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++      retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++      ext3_unlock_htree(dir, lock);
++      return retval;
+ }
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+@@ -1352,15 +1558,28 @@ static int ext3_dx_add_entry(handle_t *h
+       struct super_block * sb = dir->i_sb;
+       struct ext3_dir_entry_2 *de;
+       int err;
+-
+-      frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++      int curidx;
++      void *idx_lock, *leaf_lock, *newleaf_lock;
++      
++repeat:
++      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+-      entries = frame->entries;
+-      at = frame->at;
+-      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++      /* we're going to change the leaf, so lock it first */
++      leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
++
++      /* while we were locking it, the leaf we just found may have been
++       * split, so we need to check this */
++      if (!dx_check_full_path(frames, &hinfo)) {
++              ext3_unlock_htree(dir, leaf_lock);
++              dx_release(frames);
++              goto repeat;
++      }
++      if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
++              printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
+               goto cleanup;
++      }
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh);
+@@ -1373,6 +1592,35 @@ static int ext3_dx_add_entry(handle_t *h
+               goto cleanup;
+       }
++      /* our leaf does not have enough space. hence, we have to
++       * split it. so lock the index for this leaf first */
++      curidx = frame->curidx;
++      idx_lock = ext3_lock_htree(dir, curidx, 1);
++
++      /* now check did path get changed? */
++      dx_release(frames);
++
++      frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
++                      &hinfo, frames, &err);
++      if (!frame) {
++              /* FIXME: error handling here */
++              brelse(bh);
++              ext3_unlock_htree(dir, idx_lock);
++              return err;
++      }
++      
++      if (frame->curidx != curidx) {
++              /* path has been changed. we have to drop old lock
++               * and repeat */
++              brelse(bh);
++              ext3_unlock_htree(dir, idx_lock);
++              ext3_unlock_htree(dir, leaf_lock);
++              dx_release(frames);
++              goto repeat;
++      }
++      entries = frame->entries;
++      at = frame->at;
++
+       /* Block full, should compress but for now just split */
+       dxtrace(printk("using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+@@ -1384,7 +1632,8 @@ static int ext3_dx_add_entry(handle_t *h
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-
++              void *nb_lock;
++              
+               if (levels && (dx_get_count(frames->entries) ==
+                              dx_get_limit(frames->entries))) {
+                       ext3_warning(sb, __FUNCTION__,
+@@ -1395,6 +1644,7 @@ static int ext3_dx_add_entry(handle_t *h
+               bh2 = ext3_append (handle, dir, &newblock, &err);
+               if (!(bh2))
+                       goto cleanup;
++              nb_lock = ext3_lock_htree(dir, newblock, 1);
+               node2 = (struct dx_node *)(bh2->b_data);
+               entries2 = node2->entries;
+               node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+@@ -1406,27 +1656,73 @@ static int ext3_dx_add_entry(handle_t *h
+               if (levels) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
++                      void *ri_lock;
++
++                      /* we have to protect root htree index against
++                       * another dx_add_entry() which would want to
++                       * split it too -bzzz */
++                      ri_lock = ext3_lock_htree(dir, 0, 1);
++
++                      /* as root index block blocked we must repeat
++                       * searching for current position of our 2nd index -bzzz */
++                      dx_lock_bh(frame->bh);
++                      frames->at = dx_find_position(frames->entries, hinfo.hash);
++                      dx_unlock_bh(frame->bh);
++                      
+                       dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-                              
+-                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++      
++                      BUFFER_TRACE(frame->bh, "get_write_access");
+                       err = ext3_journal_get_write_access(handle,
+                                                            frames[0].bh);
+                       if (err)
+                               goto journal_error;
+-                              
++                      
++                      /* copy index into new one */
+                       memcpy ((char *) entries2, (char *) (entries + icount1),
+                               icount2 * sizeof(struct dx_entry));
+-                      dx_set_count (entries, icount1);
+                       dx_set_count (entries2, icount2);
+                       dx_set_limit (entries2, dx_node_limit(dir));
+                       /* Which index block gets the new entry? */
+                       if (at - entries >= icount1) {
++                              /* unlock index we won't use */
++                              ext3_unlock_htree(dir, idx_lock);
++                              idx_lock = nb_lock;
+                               frame->at = at = at - entries - icount1 + entries2;
+-                              frame->entries = entries = entries2;
++                              frame->entries = entries2;
++                              frame->curidx = curidx = newblock;
+                               swap(frame->bh, bh2);
++                      } else {
++                              /* we'll use old index,so new one may be freed */
++                              ext3_unlock_htree(dir, nb_lock);
+                       }
+-                      dx_insert_block (frames + 0, hash2, newblock);
++              
++                      /* NOTE: very subtle piece of code
++                       * competing dx_probe() may find 2nd level index in root
++                       * index, then we insert new index here and set new count
++                       * in that 2nd level index. so, dx_probe() may see 2nd
++                       * level index w/o hash it looks for. the solution is
++                       * to check root index after we locked the just-found 2nd
++                       * level index -bzzz */
++                      dx_lock_bh(frames[0].bh);
++                      dx_insert_block (dir, frames + 0, hash2, newblock, 0);
++                      dx_unlock_bh(frames[0].bh);
++                      
++                      /* now old and new 2nd level index blocks contain
++                       * all pointers, so dx_probe() may find it in the both.
++                       * it's OK -bzzz */
++                      
++                      dx_lock_bh(frame->bh);
++                      dx_set_count(entries, icount1);
++                      dx_unlock_bh(frame->bh);
++
++                      /* now old 2nd level index block points to first half
++                       * of leaves. it's important that dx_probe() must
++                       * check root index block for changes under
++                       * dx_lock_bh(frame->bh) -bzzz */
++
++                      ext3_unlock_htree(dir, ri_lock);
++              
+                       dxtrace(dx_show_index ("node", frames[1].entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+@@ -1435,38 +1731,61 @@ static int ext3_dx_add_entry(handle_t *h
+                               goto journal_error;
+                       brelse (bh2);
+               } else {
++                      unsigned long leaf = frame->leaf;
++
+                       dxtrace(printk("Creating second level index...\n"));
+                       memcpy((char *) entries2, (char *) entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+                       /* Set up root */
++                      dx_lock_bh(frames[0].bh);
+                       dx_set_count(entries, 1);
+                       dx_set_block(entries + 0, newblock);
+                       ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++                      dx_unlock_bh(frames[0].bh);
+                       /* Add new access path frame */
+                       frame = frames + 1;
+                       frame->at = at = at - entries + entries2;
+                       frame->entries = entries = entries2;
+                       frame->bh = bh2;
++                      frame->curidx = newblock;
++                      frame->leaf = leaf;
+                       err = ext3_journal_get_write_access(handle,
+                                                            frame->bh);
+                       if (err)
+                               goto journal_error;
++
++                      /* first level index was root. it's already initialized */
++                      /* we may unlock it now */
++                      ext3_unlock_htree(dir, idx_lock);
++
++                      /* current index is just created 2nd level index */
++                      curidx = newblock;
++                      idx_lock = nb_lock;
+               }
+               ext3_journal_dirty_metadata(handle, frames[0].bh);
+       }
+-      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
+       if (!de)
+               goto cleanup;
++
++      /* index splitted */
++      ext3_unlock_htree(dir, idx_lock);
++      
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++
++      if (newleaf_lock)
++              ext3_unlock_htree(dir, newleaf_lock);
++      
+       bh = 0;
+       goto cleanup;
+       
+ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
++      ext3_unlock_htree(dir, leaf_lock);
+       if (bh)
+               brelse(bh);
+       dx_release(frames);
+@@ -1899,6 +2218,7 @@ static int ext3_rmdir (struct inode * di
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
++      void *lock;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle)) {
+@@ -1906,7 +2226,7 @@ static int ext3_rmdir (struct inode * di
+       }
+       retval = -ENOENT;
+-      bh = ext3_find_entry (dentry, &de);
++      bh = ext3_find_entry (dentry, &de, 1, &lock);
+       if (!bh)
+               goto end_rmdir;
+@@ -1917,14 +2237,19 @@ static int ext3_rmdir (struct inode * di
+       DQUOT_INIT(inode);
+       retval = -EIO;
+-      if (le32_to_cpu(de->inode) != inode->i_ino)
++      if (le32_to_cpu(de->inode) != inode->i_ino) {
++              ext3_unlock_htree(dir, lock);
+               goto end_rmdir;
++      }
+       retval = -ENOTEMPTY;
+-      if (!empty_dir (inode))
++      if (!empty_dir (inode)) {
++              ext3_unlock_htree(dir, lock);
+               goto end_rmdir;
++      }
+       retval = ext3_delete_entry(handle, dir, de, bh);
++      ext3_unlock_htree(dir, lock);
+       if (retval)
+               goto end_rmdir;
+       if (inode->i_nlink != 2)
+@@ -1957,6 +2282,7 @@ static int ext3_unlink(struct inode * di
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
++      void *lock;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle)) {
+@@ -1967,7 +2293,7 @@ static int ext3_unlink(struct inode * di
+               handle->h_sync = 1;
+       retval = -ENOENT;
+-      bh = ext3_find_entry (dentry, &de);
++      bh = ext3_find_entry (dentry, &de, 1, &lock);
+       if (!bh)
+               goto end_unlink;
+@@ -1975,8 +2301,10 @@ static int ext3_unlink(struct inode * di
+       DQUOT_INIT(inode);
+       retval = -EIO;
+-      if (le32_to_cpu(de->inode) != inode->i_ino)
++      if (le32_to_cpu(de->inode) != inode->i_ino) {
++              ext3_unlock_htree(dir, lock);
+               goto end_unlink;
++      }
+       
+       if (!inode->i_nlink) {
+               ext3_warning (inode->i_sb, "ext3_unlink",
+@@ -1985,6 +2313,7 @@ static int ext3_unlink(struct inode * di
+               inode->i_nlink = 1;
+       }
+       retval = ext3_delete_entry(handle, dir, de, bh);
++      ext3_unlock_htree(dir, lock);
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2106,6 +2435,7 @@ static int ext3_rename (struct inode * o
+       struct buffer_head * old_bh, * new_bh, * dir_bh;
+       struct ext3_dir_entry_2 * old_de, * new_de;
+       int retval;
++      void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
+       old_bh = new_bh = dir_bh = NULL;
+@@ -2118,7 +2448,10 @@ static int ext3_rename (struct inode * o
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+-      old_bh = ext3_find_entry (old_dentry, &old_de);
++      if (old_dentry->d_parent == new_dentry->d_parent)
++              down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
++
++      old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
+       /*
+        *  Check for inode number is _not_ due to possible IO errors.
+        *  We might rmdir the source, keep it as pwd of some process
+@@ -2131,7 +2464,7 @@ static int ext3_rename (struct inode * o
+               goto end_rename;
+       new_inode = new_dentry->d_inode;
+-      new_bh = ext3_find_entry (new_dentry, &new_de);
++      new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
+       if (new_bh) {
+               if (!new_inode) {
+                       brelse (new_bh);
+@@ -2194,7 +2527,7 @@ static int ext3_rename (struct inode * o
+               struct buffer_head *old_bh2;
+               struct ext3_dir_entry_2 *old_de2;
+               
+-              old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++              old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
+               if (old_bh2) {
+                       retval = ext3_delete_entry(handle, old_dir,
+                                                  old_de2, old_bh2);
+@@ -2237,6 +2570,14 @@ static int ext3_rename (struct inode * o
+       retval = 0;
+ end_rename:
++      if (lock1)
++              ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
++      if (lock2)
++              ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
++      if (lock3)
++              ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
++      if (old_dentry->d_parent == new_dentry->d_parent)
++              up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
+       brelse (dir_bh);
+       brelse (old_bh);
+       brelse (new_bh);
+@@ -2245,6 +2586,29 @@ end_rename:
+ }
+ /*
++ * these locking primitives are used to protect parts
++ * of dir's htree. protection unit is block: leaf or index
++ */
++static inline void *ext3_lock_htree(struct inode *dir,
++                                      unsigned long value, int rwlock)
++{
++      void *lock;
++      /* NOTE(review): rwlock is never read; dynlock_lock() always gets 1 */
++      if (!test_opt(dir->i_sb, PDIROPS))
++              return NULL;    /* PDIROPS mount option not set: take no lock */
++      lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
++      return lock;
++}
++
++static inline void ext3_unlock_htree(struct inode *dir,
++                                      void *lock)
++{
++      if (!test_opt(dir->i_sb, PDIROPS) || !lock)
++              return; /* PDIROPS not set or NULL lock: nothing to release */
++      dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
++}
++
++/*
+  * directories can handle most operations...
+  */
+ struct inode_operations ext3_dir_inode_operations = {
+--- linux-2.4.18/fs/ext3/super.c~ext3-pdirops-2.4.18-chaos     2003-09-01 16:33:25.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/super.c        2003-09-02 12:46:29.000000000 +0400
+@@ -786,6 +786,8 @@ static int parse_options (char * options
+                               return 0;
+                       }
+               }
++              else if (!strcmp (this_char, "pdirops"))
++                      set_opt (sbi->s_mount_opt, PDIROPS);
+               else if (!strcmp (this_char, "grpid") ||
+                        !strcmp (this_char, "bsdgroups"))
+                       set_opt (*mount_options, GRPID);
+@@ -812,6 +814,9 @@ static int parse_options (char * options
+                       if (want_numeric(value, "sb", sb_block))
+                               return 0;
+               }
++              else if (!strcmp (this_char, "pdirops")) {
++                      set_opt (sbi->s_mount_opt, PDIROPS);
++              }
+ #ifdef CONFIG_JBD_DEBUG
+               else if (!strcmp (this_char, "ro-after")) {
+                       unsigned long v;
+@@ -969,6 +974,10 @@ static int ext3_setup_super(struct super
+               ext3_check_inodes_bitmap (sb);
+       }
+ #endif
++#ifdef S_PDIROPS
++      if (test_opt (sb, PDIROPS))
++              sb->s_flags |= S_PDIROPS;
++#endif
+       setup_ro_after(sb);
+       return res;
+ }
+@@ -1463,6 +1472,11 @@ struct super_block * ext3_read_super (st
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+               "writeback");
++      if (test_opt(sb, PDIROPS)) {
++              printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n");
++              sb->s_flags |= S_PDIROPS;
++      }
++              
+       return sb;
+ failed_mount3:
+--- linux-2.4.18/include/linux/ext3_fs.h~ext3-pdirops-2.4.18-chaos     2003-09-01 14:58:06.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs.h        2003-09-02 11:46:15.000000000 +0400
+@@ -310,6 +310,7 @@ struct ext3_inode {
+ /*
+  * Mount flags
+  */
++#define EXT3_MOUNT_PDIROPS            0x800000/* Parallel dir operations */
+ #define EXT3_MOUNT_CHECK              0x0001  /* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID              0x0004  /* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG              0x0008  /* Some debugging messages */
+--- linux-2.4.18/include/linux/ext3_fs_i.h~ext3-pdirops-2.4.18-chaos   2003-08-29 11:57:30.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs_i.h      2003-09-02 11:46:15.000000000 +0400
+@@ -17,6 +17,7 @@
+ #define _LINUX_EXT3_FS_I
+ #include <linux/rwsem.h>
++#include <linux/dynlocks.h>
+ /*
+  * second extended file system inode data in memory
+@@ -73,6 +74,11 @@ struct ext3_inode_info {
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
++
++      /* following fields for parallel directory operations -bzzz */
++      struct dynlock i_htree_lock;
++      struct semaphore i_append_sem;
++      struct semaphore i_rename_sem;
+ };
+ #endif        /* _LINUX_EXT3_FS_I */
+--- linux-2.4.18/fs/ext3/inode.c~ext3-pdirops-2.4.18-chaos     2003-09-01 16:33:25.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/inode.c        2003-09-02 11:46:15.000000000 +0400
+@@ -2454,6 +2454,9 @@ void ext3_read_inode(struct inode * inod
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
++              dynlock_init(&EXT3_I(inode)->i_htree_lock);
++              sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++              sema_init(&EXT3_I(inode)->i_append_sem, 1);
+       } else if (S_ISLNK(inode->i_mode)) {
+               if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+--- linux-2.4.18/fs/ext3/ialloc.c~ext3-pdirops-2.4.18-chaos    2003-09-01 14:58:05.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/ialloc.c       2003-09-02 11:46:15.000000000 +0400
+@@ -601,6 +601,9 @@ repeat:
+               return ERR_PTR(-EDQUOT);
+       }
+       ext3_debug ("allocating inode %lu\n", inode->i_ino);
++      dynlock_init(&EXT3_I(inode)->i_htree_lock);
++      sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++      sema_init(&EXT3_I(inode)->i_append_sem, 1);
+       return inode;
+ fail:
+
+_
diff --git a/lustre/kernel_patches/patches/iopen-2.4.18-2.patch b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch
new file mode 100644 (file)
index 0000000..3d9a864
--- /dev/null
@@ -0,0 +1,422 @@
+ Documentation/filesystems/ext2.txt |   16 ++
+ fs/ext3/Makefile                   |    2 
+ fs/ext3/inode.c                    |    4 
+ fs/ext3/iopen.c                    |  259 +++++++++++++++++++++++++++++++++++++
+ fs/ext3/iopen.h                    |   13 +
+ fs/ext3/namei.c                    |   12 +
+ fs/ext3/super.c                    |   11 +
+ include/linux/ext3_fs.h            |    2 
+ 8 files changed, 318 insertions(+), 1 deletion(-)
+
+--- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600
++++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt        2003-07-09 17:13:02.000000000 -0600
+@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+ sb=n                          Use alternate superblock at this location.
++iopen                         Makes an invisible pseudo-directory called 
++                              __iopen__ available in the root directory
++                              of the filesystem.  Allows open-by-inode-
++                              number.  i.e., inode 3145 can be accessed
++                              via /mntpt/__iopen__/3145
++
++iopen_nopriv                  This option makes the iopen directory be
++                              world-readable.  This may be safer since it
++                              allows daemons to run as an unprivileged user,
++                              however it significantly changes the security
++                              model of a Unix filesystem, since previously
++                              all files under a mode 700 directory were not
++                              generally available even if the
++                              permissions on the file itself were
++                              world-readable.
++
+ grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
+--- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18   2003-07-09 17:12:12.000000000 -0600
++++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile  2003-07-09 17:13:15.000000000 -0600
+@@ -11,7 +11,7 @@ O_TARGET := ext3.o
+ export-objs :=        super.o inode.o xattr.o ext3-exports.o
+-obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+               ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
+--- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18    2003-07-09 17:11:19.000000000 -0600
++++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c   2003-07-09 17:13:02.000000000 -0600
+@@ -31,6 +31,7 @@
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
++#include "iopen.h"
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod
+       struct buffer_head *bh;
+       int block;
+       
++      if (ext3_iopen_get_inode(inode))
++              return;
++      
+       if(ext3_get_inode_loc(inode, &iloc))
+               goto bad_inode;
+       bh = iloc.bh;
+--- /dev/null  2003-01-30 03:24:37.000000000 -0700
++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c   2003-07-09 17:13:02.000000000 -0600
+@@ -0,0 +1,259 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN        32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++      unsigned long ino;
++      struct list_head *lp;
++      struct dentry *alternate;
++      char buf[IOPEN_NAME_LEN];
++      
++      if (dentry->d_name.len >= IOPEN_NAME_LEN)
++              return ERR_PTR(-ENAMETOOLONG);
++
++      memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++      buf[dentry->d_name.len] = 0;
++
++      if (strcmp(buf, ".") == 0)
++              ino = dir->i_ino;
++      else if (strcmp(buf, "..") == 0)
++              ino = EXT3_ROOT_INO;
++      else
++              ino = simple_strtoul(buf, 0, 0);
++
++      if ((ino != EXT3_ROOT_INO &&
++           //ino != EXT3_ACL_IDX_INO &&
++           //ino != EXT3_ACL_DATA_INO &&
++           ino < EXT3_FIRST_INO(dir->i_sb)) ||
++          ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++              return ERR_PTR(-ENOENT);
++
++      inode = iget(dir->i_sb, ino);
++      if (!inode)
++              return ERR_PTR(-EACCES);
++      if (is_bad_inode(inode)) {
++              iput(inode);
++              return ERR_PTR(-ENOENT);
++      }
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      list_for_each(lp, &inode->i_dentry) {
++              alternate = list_entry(lp, struct dentry, d_alias);
++              assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
++      }
++
++      if (!list_empty(&inode->i_dentry)) {
++              alternate = list_entry(inode->i_dentry.next, 
++                                     struct dentry, d_alias);
++              dget_locked(alternate);
++              alternate->d_vfs_flags |= DCACHE_REFERENCED;
++              iput(inode);
++              spin_unlock(&dcache_lock);
++              return alternate;
++      }
++      dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
++      spin_unlock(&dcache_lock);
++
++      d_add(dentry, inode);
++      return NULL;
++}
++
++#define do_switch(x,y) do { \
++      __typeof__ (x) __tmp = x; \
++      x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++      const unsigned char *old_name, *new_name;
++
++      memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); 
++      old_name = target->d_name.name;
++      new_name = dentry->d_name.name;
++      if (old_name == target->d_iname)
++              old_name = dentry->d_iname;
++      if (new_name == dentry->d_iname)
++              new_name = target->d_iname;
++      target->d_name.name = new_name;
++      dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++{
++      struct dentry *tmp, *goal = NULL;
++      struct list_head *lp;
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      /* verify this dentry is really new */
++      assert(!de->d_inode);
++      assert(list_empty(&de->d_subdirs));
++      assert(list_empty(&de->d_alias));
++
++
++      list_for_each(lp, &inode->i_dentry) {
++              tmp = list_entry(lp, struct dentry, d_alias);
++              if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
++                      assert(tmp->d_alias.next == &inode->i_dentry);
++                      assert(tmp->d_alias.prev == &inode->i_dentry);
++                      goal = tmp;
++                      dget_locked(goal);
++                      break;
++              }
++      }
++
++      if (!goal) { 
++              spin_unlock(&dcache_lock);
++              return NULL; 
++      }
++
++      /* Move the goal to the de hash queue - like d_move() */
++      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
++      list_del(&goal->d_hash);
++      list_add(&goal->d_hash, &de->d_hash);
++
++      list_del(&goal->d_child);
++      list_del(&de->d_child);
++
++      /* Switch the parents and the names.. */
++      switch_names(goal, de);
++      do_switch(goal->d_parent, de->d_parent);
++      do_switch(goal->d_name.len, de->d_name.len);
++      do_switch(goal->d_name.hash, de->d_name.hash);
++
++      /* And add them back to the (new) parent lists */
++      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
++      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      spin_unlock(&dcache_lock);
++
++      return goal;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++      lookup:         iopen_lookup,           /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++      read:           generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++      int     len;
++
++      len = strlen(name);
++      if (dentry->d_name.len != len)
++              return 0;
++      if (strncmp(dentry->d_name.name, name, len))
++              return 0;
++      return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++
++      if (dir->i_ino != EXT3_ROOT_INO ||
++          !test_opt(dir->i_sb, IOPEN) ||
++          !match_dentry(dentry, "__iopen__"))
++              return 0;
++
++      inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++      if (!inode) 
++              return 0;
++      d_add(dentry, inode);
++      return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++      if (inode->i_ino != EXT3_BAD_INO)
++              return 0;
++
++      inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++      if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++              inode->i_mode |= 0777;
++      inode->i_uid = 0;
++      inode->i_gid = 0;
++      inode->i_nlink = 1;
++      inode->i_size = 4096;
++      inode->i_atime = CURRENT_TIME;
++      inode->i_ctime = CURRENT_TIME;
++      inode->i_mtime = CURRENT_TIME;
++      inode->u.ext3_i.i_dtime = 0;
++      inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
++                                       * (for stat), not the fs block
++                                       * size */  
++      inode->i_blocks = 0;
++      inode->i_version = 1;
++      inode->i_generation = 0;
++
++      inode->i_op = &iopen_inode_operations;
++      inode->i_fop = &iopen_file_operations;
++      inode->i_mapping->a_ops = 0;
++
++      return 1;
++}
+--- /dev/null  2003-01-30 03:24:37.000000000 -0700
++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h   2003-07-09 17:13:02.000000000 -0600
+@@ -0,0 +1,13 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ * 
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
+--- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
++++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c   2003-07-09 17:13:02.000000000 -0600
+@@ -34,6 +34,7 @@
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
++#include "iopen.h"
+ /*
+  * define how far ahead to read directories while searching them.
+@@ -703,16 +704,21 @@ cleanup_and_exit:
+       return NULL;
+ }
+ #endif
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
++      struct dentry *alternate = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
++
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+@@ -723,6 +729,12 @@ static struct dentry *ext3_lookup(struct
+               if (!inode)
+                       return ERR_PTR(-EACCES);
+       }
++
++      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
++              iput(inode);
++              return alternate;
++      }
++
+       d_add(dentry, inode);
+       return NULL;
+ }
+--- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
++++ linux-2.4.18-p4smp-braam/fs/ext3/super.c   2003-07-09 17:13:02.000000000 -0600
+@@ -831,6 +831,17 @@ static int parse_options (char * options
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
++              else if (!strcmp (this_char, "iopen")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              } else if (!strcmp (this_char, "noiopen")) {
++                      clear_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
++              else if (!strcmp (this_char, "iopen_nopriv")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+--- linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
++++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h   2003-07-09 17:13:02.000000000 -0600
+@@ -321,6 +321,8 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
++#define EXT3_MOUNT_IOPEN              0x8000  /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV               0x10000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+
+_
diff --git a/lustre/kernel_patches/patches/iopen-2.6.0.patch b/lustre/kernel_patches/patches/iopen-2.6.0.patch
new file mode 100644 (file)
index 0000000..af67758
--- /dev/null
@@ -0,0 +1,403 @@
+ Documentation/filesystems/ext2.txt |   16 ++
+ fs/ext3/Makefile                   |    2 
+ fs/ext3/inode.c                    |    3 
+ fs/ext3/iopen.c                    |  239 +++++++++++++++++++++++++++++++++++++
+ fs/ext3/iopen.h                    |   15 ++
+ fs/ext3/namei.c                    |   13 ++
+ fs/ext3/super.c                    |   11 +
+ include/linux/ext3_fs.h            |    2 
+ 8 files changed, 300 insertions(+), 1 deletion(-)
+
+--- linux-2.6.0-test1/Documentation/filesystems/ext2.txt~iopen-2.6.0   2002-11-11 06:28:06.000000000 +0300
++++ linux-2.6.0-test1-alexey/Documentation/filesystems/ext2.txt        2003-08-24 13:02:02.000000000 +0400
+@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+ sb=n                          Use alternate superblock at this location.
++iopen                         Makes an invisible pseudo-directory called 
++                              __iopen__ available in the root directory
++                              of the filesystem.  Allows open-by-inode-
++                              number.  i.e., inode 3145 can be accessed
++                              via /mntpt/__iopen__/3145
++
++iopen_nopriv                  This option makes the iopen directory be
++                              world-readable.  This may be safer since it
++                              allows daemons to run as an unprivileged user,
++                              however it significantly changes the security
++                              model of a Unix filesystem, since previously
++                              all files under a mode 700 directory were not
++                              generally available even if the
++                              permissions on the file itself were
++                              world-readable.
++
+ grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
+--- linux-2.6.0-test1/fs/ext3/inode.c~iopen-2.6.0      2003-08-24 13:00:36.000000000 +0400
++++ linux-2.6.0-test1-alexey/fs/ext3/inode.c   2003-08-24 13:02:02.000000000 +0400
+@@ -37,6 +37,7 @@
+ #include <linux/mpage.h>
+ #include <linux/uio.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+ /*
+@@ -2477,6 +2478,8 @@ void ext3_read_inode(struct inode * inod
+       ei->i_acl = EXT3_ACL_NOT_CACHED;
+       ei->i_default_acl = EXT3_ACL_NOT_CACHED;
+ #endif
++      if (ext3_iopen_get_inode(inode))
++              return;
+       if (ext3_get_inode_loc(inode, &iloc, 0))
+               goto bad_inode;
+       bh = iloc.bh;
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.0-test1-alexey/fs/ext3/iopen.c   2003-08-24 13:02:02.000000000 +0400
+@@ -0,0 +1,239 @@
++
++
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN        32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
++{
++      struct inode * inode;
++      unsigned long ino;
++        struct list_head *lp;
++        struct dentry *alternate;
++      char buf[IOPEN_NAME_LEN];
++      
++      if (dentry->d_name.len >= IOPEN_NAME_LEN)
++              return ERR_PTR(-ENAMETOOLONG);
++
++      memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++      buf[dentry->d_name.len] = 0;
++
++      if (strcmp(buf, ".") == 0)
++              ino = dir->i_ino;
++      else if (strcmp(buf, "..") == 0)
++              ino = EXT3_ROOT_INO;
++      else
++              ino = simple_strtoul(buf, 0, 0);
++
++      if ((ino != EXT3_ROOT_INO &&
++           //ino != EXT3_ACL_IDX_INO &&
++           //ino != EXT3_ACL_DATA_INO &&
++           ino < EXT3_FIRST_INO(dir->i_sb)) ||
++          ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
++              return ERR_PTR(-ENOENT);
++
++      inode = iget(dir->i_sb, ino);
++      if (!inode)
++              return ERR_PTR(-EACCES);
++      if (is_bad_inode(inode)) {
++              iput(inode);
++              return ERR_PTR(-ENOENT);
++      }
++
++        /* preferably return a connected dentry */
++        spin_lock(&dcache_lock);
++        list_for_each(lp, &inode->i_dentry) {
++                alternate = list_entry(lp, struct dentry, d_alias);
++                assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++        }
++
++        if (!list_empty(&inode->i_dentry)) {
++                alternate = list_entry(inode->i_dentry.next, 
++                                       struct dentry, d_alias);
++                dget_locked(alternate);
++                alternate->d_vfs_flags |= DCACHE_REFERENCED;
++                iput(inode);
++                spin_unlock(&dcache_lock);
++                return alternate;
++        }
++        dentry->d_flags |= DCACHE_DISCONNECTED;
++        spin_unlock(&dcache_lock);
++
++      d_add(dentry, inode);
++      return NULL;
++}
++
++#define do_switch(x,y) do { \
++      __typeof__ (x) __tmp = x; \
++      x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry * dentry, struct dentry * target)
++{
++      const unsigned char *old_name, *new_name;
++
++      memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); 
++      old_name = target->d_name.name;
++      new_name = dentry->d_name.name;
++      if (old_name == target->d_iname)
++              old_name = dentry->d_iname;
++      if (new_name == dentry->d_iname)
++              new_name = target->d_iname;
++      target->d_name.name = new_name;
++      dentry->d_name.name = old_name;
++}
++
++
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++{
++        struct dentry *tmp, *goal = NULL;
++        struct list_head *lp;
++
++        /* preferably return a connected dentry */
++        spin_lock(&dcache_lock);
++        /* verify this dentry is really new */
++        assert(!de->d_inode);
++        assert(list_empty(&de->d_subdirs));
++        assert(list_empty(&de->d_alias));
++
++
++        list_for_each(lp, &inode->i_dentry) {
++                tmp = list_entry(lp, struct dentry, d_alias);
++                if (tmp->d_flags & DCACHE_DISCONNECTED) {
++                        assert(tmp->d_alias.next == &inode->i_dentry);
++                        assert(tmp->d_alias.prev == &inode->i_dentry);
++                        goal = tmp;
++                        dget_locked(goal);
++                        break;
++                }
++        }
++
++        if (!goal) { 
++                spin_unlock(&dcache_lock);
++                return NULL; 
++        }
++
++        /* Move the goal to the de hash queue */
++        goal->d_flags &= ~DCACHE_DISCONNECTED;
++      hlist_add_before(&goal->d_hash, &de->d_hash);
++      hlist_del(&goal->d_hash);
++
++      list_del(&goal->d_child);
++      list_del(&de->d_child);
++
++      /* Switch the parents and the names.. */
++      switch_names(goal, de);
++      do_switch(goal->d_parent, de->d_parent);
++      do_switch(goal->d_name.len, de->d_name.len);
++      do_switch(goal->d_name.hash, de->d_name.hash);
++
++      /* And add them back to the (new) parent lists */
++      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
++      list_add(&de->d_child, &de->d_parent->d_subdirs);
++
++        spin_unlock(&dcache_lock);
++        return goal;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++      lookup:         iopen_lookup,           /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++      read:           generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++      int     len;
++
++      len = strlen(name);
++      if (dentry->d_name.len != len)
++              return 0;
++      if (strncmp(dentry->d_name.name, name, len))
++              return 0;
++      return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry)
++{
++      struct inode * inode;
++
++      if (dir->i_ino != EXT3_ROOT_INO ||
++          !test_opt(dir->i_sb, IOPEN) ||
++          !match_dentry(dentry, "__iopen__"))
++              return 0;
++
++      inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++      if (!inode) 
++              return 0;
++      d_add(dentry, inode);
++      return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode * inode)
++{
++      if (inode->i_ino != EXT3_BAD_INO)
++              return 0;
++
++      inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++      if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++              inode->i_mode |= 0777;
++      inode->i_uid = 0;
++      inode->i_gid = 0;
++      inode->i_nlink = 1;
++      inode->i_size = 4096;
++      inode->i_atime = CURRENT_TIME;
++      inode->i_ctime = CURRENT_TIME;
++      inode->i_mtime = CURRENT_TIME;
++      EXT3_I(inode)->i_dtime = 0;
++      inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
++                                       * (for stat), not the fs block
++                                       * size */  
++      inode->i_blocks = 0;
++      inode->i_version = 1;
++      inode->i_generation = 0;
++
++      inode->i_op = &iopen_inode_operations;
++      inode->i_fop = &iopen_file_operations;
++      inode->i_mapping->a_ops = 0;
++
++      return 1;
++}
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.0-test1-alexey/fs/ext3/iopen.h   2003-08-24 13:02:02.000000000 +0400
+@@ -0,0 +1,15 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ * 
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode * inode);
++
++
+--- linux-2.6.0-test1/fs/ext3/Makefile~iopen-2.6.0     2003-08-24 12:58:32.000000000 +0400
++++ linux-2.6.0-test1-alexey/fs/ext3/Makefile  2003-08-24 13:02:40.000000000 +0400
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+ ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-         ioctl.o namei.o super.o symlink.o hash.o
++         iopen.o ioctl.o namei.o super.o symlink.o hash.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+--- linux-2.6.0-test1/fs/ext3/namei.c~iopen-2.6.0      2003-07-24 15:52:30.000000000 +0400
++++ linux-2.6.0-test1-alexey/fs/ext3/namei.c   2003-08-24 13:02:02.000000000 +0400
+@@ -37,6 +37,7 @@
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+ /*
+@@ -970,15 +971,21 @@ errout:
+ }
+ #endif
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
++ 
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
++      struct dentry *alternate = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
++
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+@@ -991,6 +998,12 @@ static struct dentry *ext3_lookup(struct
+       }
+       if (inode)
+               return d_splice_alias(inode, dentry);
++
++      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
++              iput(inode);
++              return alternate;
++      }
++
+       d_add(dentry, inode);
+       return NULL;
+ }
+--- linux-2.6.0-test1/fs/ext3/super.c~iopen-2.6.0      2003-08-24 13:00:36.000000000 +0400
++++ linux-2.6.0-test1-alexey/fs/ext3/super.c   2003-08-24 13:02:02.000000000 +0400
+@@ -755,6 +755,17 @@ static int parse_options (char * options
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
++              else if (!strcmp (this_char, "iopen")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              } else if (!strcmp (this_char, "noiopen")) {
++                      clear_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
++              else if (!strcmp (this_char, "iopen_nopriv")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+--- linux-2.6.0-test1/include/linux/ext3_fs.h~iopen-2.6.0      2003-08-24 12:58:57.000000000 +0400
++++ linux-2.6.0-test1-alexey/include/linux/ext3_fs.h   2003-08-24 13:02:02.000000000 +0400
+@@ -324,6 +324,8 @@ struct ext3_inode {
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ #define EXT3_MOUNT_POSIX_ACL          0x8000  /* POSIX Access Control Lists */
++#define EXT3_MOUNT_IOPEN             0x10000  /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV              0x20000  /* Make iopen world-readable */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+
+_
diff --git a/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch
new file mode 100644 (file)
index 0000000..c7d06a8
--- /dev/null
@@ -0,0 +1,1775 @@
+ fs/ext3/Makefile           |    4 
+ fs/ext3/ext3-exports.c     |   13 
+ fs/ext3/ialloc.c           |    2 
+ fs/ext3/inode.c            |   29 -
+ fs/ext3/namei.c            |    8 
+ fs/ext3/super.c            |   23 
+ fs/ext3/xattr.c            | 1242 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/ext3_fs.h    |   46 -
+ include/linux/ext3_jbd.h   |    8 
+ include/linux/ext3_xattr.h |  155 +++++
+ include/linux/xattr.h      |   15 
+ 11 files changed, 1494 insertions(+), 51 deletions(-)
+
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/fs/ext3/ext3-exports.c 2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+--- linux-2.4.18/fs/ext3/ialloc.c~linux-2.4.18ea-0.8.26-2      2003-07-28 17:52:04.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/ialloc.c       2003-09-01 14:55:39.000000000 +0400
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, 
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
++      ext3_xattr_drop_inode(handle, inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+--- linux-2.4.18/fs/ext3/inode.c~linux-2.4.18ea-0.8.26-2       2003-07-28 17:52:04.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/inode.c        2003-09-01 14:55:39.000000000 +0400
+@@ -39,6 +39,18 @@
+  */
+ #undef SEARCH_FROM_ZERO
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = EXT3_I(inode)->i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases. 
+@@ -48,7 +60,7 @@
+  * still needs to be revoked.
+  */
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i
+ {
+       handle_t *handle;
+       
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       lock_kernel();
+@@ -1877,6 +1887,8 @@ void ext3_truncate(struct inode * inode)
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext3_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -2038,8 +2050,6 @@ int ext3_get_inode_loc (struct inode *in
+       struct ext3_group_desc * gdp;
+               
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+-              inode->i_ino != EXT3_ACL_IDX_INO &&
+-              inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+@@ -2166,10 +2176,7 @@ void ext3_read_inode(struct inode * inod
+       brelse (iloc.bh);
+-      if (inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+@@ -2177,7 +2184,7 @@ void ext3_read_inode(struct inode * inod
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+                       inode->i_op = &page_symlink_inode_operations;
+--- linux-2.4.18/fs/ext3/Makefile~linux-2.4.18ea-0.8.26-2      2003-08-29 16:53:17.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/Makefile       2003-09-01 14:55:50.000000000 +0400
+@@ -9,10 +9,10 @@
+ O_TARGET := ext3.o
+-export-objs :=        super.o inode.o
++export-objs :=        ext3-exports.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o hash.o
++              ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.18/fs/ext3/namei.c~linux-2.4.18ea-0.8.26-2       2003-09-01 11:50:59.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/namei.c        2003-09-01 14:55:39.000000000 +0400
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1524,6 +1525,7 @@ static int ext3_add_nondir(handle_t *han
+               d_instantiate(dentry, inode);
+               return 0;
+       }
++      ext3_xattr_drop_inode(handle, inode);
+       ext3_dec_count(handle, inode);
+       iput(inode);
+       return err;
+@@ -1612,7 +1614,7 @@ static int ext3_mkdir(struct inode * dir
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFDIR);
++      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1620,7 +1622,6 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+-      inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+@@ -1647,9 +1648,6 @@ static int ext3_mkdir(struct inode * dir
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+-      inode->i_mode = S_IFDIR | mode;
+-      if (dir->i_mode & S_ISGID)
+-              inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err) {
+--- linux-2.4.18/fs/ext3/super.c~linux-2.4.18ea-0.8.26-2       2003-08-29 16:53:17.000000000 +0400
++++ linux-2.4.18-alexey/fs/ext3/super.c        2003-09-01 14:55:39.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -1743,18 +1745,27 @@ int ext3_statfs (struct super_block * sb
+ static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
+-static int __init init_ext3_fs(void)
++static void exit_ext3_fs(void)
+ {
+-        return register_filesystem(&ext3_fs_type);
++      unregister_filesystem(&ext3_fs_type);
++      exit_ext3_xattr_user();
++      exit_ext3_xattr();
+ }
+-static void __exit exit_ext3_fs(void)
++static int __init init_ext3_fs(void)
+ {
+-      unregister_filesystem(&ext3_fs_type);
++      int error = init_ext3_xattr();
++      if (!error)
++              error = init_ext3_xattr_user();
++      if (!error)
++              error = register_filesystem(&ext3_fs_type);
++      if (!error)
++              return 0;
++
++      exit_ext3_fs();
++      return error;
+ }
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/fs/ext3/xattr.c        2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,1242 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++#include <linux/mbcache.h>
++#endif
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++#include <linux/module.h>
++
++/* These symbols may be needed by a module. */
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++                         struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++                                               struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++                            struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) do {} while(0)
++# define ext3_xattr_rehash(header, entry) do {} while(0)
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline void
++ext3_xattr_lock(void)
++{
++      down(&ext3_xattr_sem);
++}
++
++static inline void
++ext3_xattr_unlock(void)
++{
++      up(&ext3_xattr_sem);
++}
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++                   int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++              EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++                    unsigned long block)
++{
++      ext3_free_blocks(handle, inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++      ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              if (!ext3_xattr_handlers[name_index-1]) {
++                      ext3_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext3_handler_lock);
++      }
++      return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { /* 1-based; reject out-of-range */
++              write_lock(&ext3_handler_lock);
++              ext3_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext3_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext3_handler_lock);
++      for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++              if (ext3_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext3_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext3_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext3_handler_lock);
++      return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              read_lock(&ext3_handler_lock);
++              handler = ext3_xattr_handlers[name_index-1];
++              read_unlock(&ext3_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++            void *value, size_t size, int flags)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT3_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT3_I(inode)->i_file_acl)
++              return 0;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler) {
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len) + 1;
++              }
++      }
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler) {
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++                      *buf++ = '\0';
++              }
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++                                        struct super_block *sb)
++{
++      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++      ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT3_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_header *header = NULL;
++      struct ext3_xattr_entry *here, *last;
++      unsigned int name_len;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      ext3_xattr_lock();
++
++      if (EXT3_I(inode)->i_file_acl) {
++              /* The inode already has an extended attribute block. */
++              int block = EXT3_I(inode)->i_file_acl;
++
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext3_error(sb, "ext3_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext3_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT3_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT3_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT3_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext3_xattr_cache_remove(bh);
++                      error = ext3_journal_get_write_access(handle, bh);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT3_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT3_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT3_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext3_xattr_set2(handle, inode, bh,NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT3_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT3_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT3_XATTR_PAD, 0,
++                             EXT3_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext3_xattr_rehash(header, here);
++
++      error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      ext3_xattr_unlock();
++
++      return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ *
++ * old_bh is the buffer of the attribute block the inode uses so far (may
++ * be NULL); header is the new attribute block contents, or NULL if the
++ * inode is to be left without an attribute block.
++ *
++ * With a non-NULL header one of three things happens: an identical block
++ * found by ext3_xattr_cache_find() is reused and its refcount is
++ * incremented; old_bh is kept if header points into it; otherwise a new
++ * block is allocated and header is copied into it.  i_file_acl is then
++ * set to the resulting block number (0 if there is none), and an old
++ * block that is no longer used is either freed (refcount == 1) or has
++ * its refcount decremented.
++ *
++ * Returns 0 on success or a negative error number.  old_bh is not
++ * released here; that is left to the caller.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++              struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext3_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext3_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      error = ext3_journal_get_write_access(handle, new_bh);
++                      if (error)
++                              goto cleanup;
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block: it was modified in place. */
++                      new_bh = old_bh;
++                      (void)ext3_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT3_I(inode)->i_file_acl != 0;
++                      int block = ext3_xattr_new_block(handle, inode,
++                                                       &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++getblk_failed:                        ext3_xattr_free_block(handle, inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      error = ext3_journal_get_create_access(handle, new_bh);
++                      if (error) {
++                              unlock_buffer(new_bh);
++                              goto getblk_failed;
++                      }
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      (void)ext3_xattr_cache_insert(new_bh);
++                      ext3_xattr_update_super_block(handle, sb);
++              }
++              error = ext3_journal_dirty_metadata(handle, new_bh);
++              if (error)
++                      goto cleanup;
++      }
++
++      /* Update the inode: point it at the new block, or at none. */
++      EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      ext3_mark_inode_dirty(handle, inode);
++      if (IS_SYNC(inode))
++              handle->h_sync = 1;
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++               */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              error = ext3_journal_get_write_access(handle, old_bh);
++              if (error)
++                      goto cleanup;
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++                      /* ext3_forget() calls bforget() for us, but we
++                         let our caller release old_bh, so we need to
++                         take an extra buffer reference first. */
++                      get_bh(old_bh);
++                      ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext3_xattr_quota_free(inode);
++                      ext3_journal_dirty_metadata(handle, old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)   /* old_bh itself is released by the caller */
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_drop_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ *
++ * Reads the block named by i_file_acl and checks its header.  If this
++ * inode holds the only reference (h_refcount == 1), the block is removed
++ * from the cache and freed; otherwise the refcount is decremented and
++ * ext3_xattr_quota_free() is called.  i_file_acl is then cleared.
++ *
++ * On a read error, a bad header, or a failure to obtain journal write
++ * access, the function returns early and leaves i_file_acl untouched.
++ */
++void
++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT3_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      ext3_xattr_lock();
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext3_error(inode->i_sb, "ext3_xattr_drop_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext3_error(inode->i_sb, "ext3_xattr_drop_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      /* Do not modify or dirty the buffer unless the journal granted
++         write access to it. */
++      if (ext3_journal_get_write_access(handle, bh) != 0)
++              goto cleanup;
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              /* Last reference: drop the block from the cache and free it. */
++              ext3_xattr_cache_remove(bh);
++              ext3_xattr_free_block(handle, inode, block);
++              ext3_forget(handle, 1, inode, bh, block);
++              bh = NULL;      /* ext3_forget() released the buffer for us */
++      } else {
++              /* Block is still shared: give up only our reference. */
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              ext3_journal_dirty_metadata(handle, bh);
++              if (IS_SYNC(inode))
++                      handle->h_sync = 1;
++              ext3_xattr_quota_free(inode);
++      }
++      EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      ext3_xattr_unlock();
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ *
++ * With CONFIG_EXT3_FS_XATTR_SHARING it calls mb_cache_shrink() on the
++ * attribute block cache for the device of this super block (sb->s_dev);
++ * without it, this function does nothing.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++      mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * The entry is inserted under the buffer's device and block number,
++ * together with the block hash read from the block header (h_hash).
++ *
++ * Returns 0, or a negative error number on failure: -ENOMEM if no
++ * cache entry could be allocated, or the error returned by
++ * mb_cache_entry_insert().  An -EBUSY result from the insert is
++ * reported as success ("already in cache").
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext3_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext3_xattr_cache->c_entry_count));
++                      error = 0;      /* not a failure for our callers */
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext3_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * The entry tables of both blocks are walked in parallel.  The blocks
++ * are equal only if they hold the same number of entries and each pair
++ * of entries agrees in hash, name index, name length, name, value size
++ * and value bytes.  The name index has to be compared explicitly: it is
++ * not covered by e_hash, so without the check two attributes with the
++ * same name and value but different name indexes would be treated as
++ * identical.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors (-EIO if an entry refers to a
++ * value stored in a separate block).
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++             struct ext3_xattr_header *header2)
++{
++      struct ext3_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_index != entry2->e_name_index ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT3_XATTR_NEXT(entry1);
++              entry2 = EXT3_XATTR_NEXT(entry2);
++      }
++      /* header2 must not have entries left over. */
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Cache entries with the same block hash on the inode's device are
++ * visited in turn; each candidate block is read and compared with
++ * ext3_xattr_cmp().  Candidates that cannot be read, whose refcount
++ * exceeds EXT3_XATTR_REFCOUNT_MAX, or for which the comparison returns
++ * anything but 0 are skipped.  A header whose h_hash is 0 is never
++ * looked up.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.  The buffer is returned without
++ * being released here, so releasing it is left to the caller.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT3_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT3_XATTR_REFCOUNT_MAX);
++              } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);     /* candidate rejected (bh may be NULL here) */
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ *
++ * The entry is looked up by the buffer's device and block number with
++ * mb_cache_entry_get() and freed if one exists; if there is none, only
++ * a debug message is emitted.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5       /* rotate distance per name byte */
++#define VALUE_HASH_SHIFT 16     /* rotate distance per value word */
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ *
++ * Each byte of the name is mixed in by rotating the 32-bit hash left by
++ * NAME_HASH_SHIFT bits and XOR-ing in the byte.  If the value is stored
++ * in this block (e_value_block == 0) and is not empty, it is mixed in
++ * the same way, one little-endian 32-bit word at a time, rotating by
++ * VALUE_HASH_SHIFT; the number of words is the value size rounded up
++ * to EXT3_XATTR_PAD bytes.  The name index is not part of the hash.
++ * The result is stored in entry->e_hash in little-endian byte order.
++ *
++ * NOTE(review): name bytes are read through a plain char pointer, so
++ * the result for bytes >= 0x80 presumably depends on whether char is
++ * signed on the target -- confirm before relying on it across
++ * architectures.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++                                       struct ext3_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16     /* rotate distance per entry hash */
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ *
++ * The hash of `entry' is recomputed first; then the e_hash values of
++ * all entries in the block are folded into header->h_hash, rotating
++ * the running hash left by BLOCK_HASH_SHIFT bits before each XOR.  If
++ * any entry has a hash of 0, the block hash is set to 0 as well, which
++ * ext3_xattr_cache_find() treats as "never share".
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++                            struct ext3_xattr_entry *entry)
++{
++      struct ext3_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext3_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT3_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext3_xattr(void)   /* creates the "ext3_xattr" block cache */
++{
++      ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext3_xattr_cache)
++              return -ENOMEM; /* cache could not be created */
++
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)   /* destroys the cache created by init_ext3_xattr() */
++{
++      if (ext3_xattr_cache)
++              mb_cache_destroy(ext3_xattr_cache);
++      ext3_xattr_cache = NULL;        /* makes a repeated call a no-op */
++}
++
++#else  /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++      return 0;       /* no cache to create without CONFIG_EXT3_FS_XATTR_SHARING */
++}
++
++void
++exit_ext3_xattr(void)   /* nothing to tear down without CONFIG_EXT3_FS_XATTR_SHARING */
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
+--- linux-2.4.18/include/linux/ext3_fs.h~linux-2.4.18ea-0.8.26-2       2003-09-01 11:51:00.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs.h        2003-09-01 14:55:39.000000000 +0400
+@@ -63,8 +63,6 @@
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+-#define EXT3_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+@@ -521,7 +496,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+-#define EXT3_FEATURE_COMPAT_SUPP      0
++#define EXT3_FEATURE_COMPAT_SUPP      EXT3_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -623,6 +598,24 @@ struct dx_hash_info
+ #define HASH_NB_ALWAYS                1
++/* Defined for extended attributes */
++#define CONFIG_EXT3_FS_XATTR y
++#ifndef ENOATTR
++#define ENOATTR ENODATA               /* No such attribute */
++#endif
++#ifndef ENOTSUP
++#define ENOTSUP EOPNOTSUPP    /* Operation not supported */
++#endif
++#ifndef XATTR_NAME_MAX
++#define XATTR_NAME_MAX   255  /* # chars in an extended attribute name */
++#define XATTR_SIZE_MAX 65536  /* size of an extended attribute value (64k) */
++#define XATTR_LIST_MAX 65536  /* size of extended attribute namelist (64k) */
++#endif
++#ifndef XATTR_CREATE
++#define XATTR_CREATE  1       /* set value, fail if attr already exists */
++#define XATTR_REPLACE 2       /* set value, fail if attr does not exist */
++#endif
++
+ /*
+  * Describe an inode's exact location on disk and in memory
+  */
+@@ -704,6 +697,7 @@ extern void ext3_check_inodes_bitmap (st
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+--- linux-2.4.18/include/linux/ext3_jbd.h~linux-2.4.18ea-0.8.26-2      2003-08-29 16:53:17.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_jbd.h       2003-09-01 14:55:39.000000000 +0400
+@@ -30,13 +30,19 @@
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
++/* Extended attributes may touch two data buffers, two bitmap buffers,
++ * and two group summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS               8
++
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+-#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++                                       EXT3_XATTR_TRANS_BLOCKS - 2)
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/include/linux/ext3_xattr.h     2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,155 @@
++/*
++  File: linux/ext3_xattr.h
++
++  On-disk format of extended attributes for the ext3 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX                  10
++#define EXT3_XATTR_INDEX_USER                 1
++
++struct ext3_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext3_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS           2
++#define EXT3_XATTR_PAD                (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND              (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++      (((name_len) + EXT3_XATTR_ROUND + \
++      sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++      ( (struct ext3_xattr_entry *)( \
++        (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++      (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, void *, size_t, int);
++
++extern void ext3_xattr_drop_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr               NULL
++#  define ext3_getxattr               NULL
++#  define ext3_listxattr      NULL
++#  define ext3_removexattr    NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
+--- /dev/null  2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/include/linux/xattr.h  2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,15 @@
++/*
++  File: linux/xattr.h
++
++  Extended attributes handling.
++
++  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
++  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
++*/
++#ifndef _LINUX_XATTR_H
++#define _LINUX_XATTR_H
++
++#define XATTR_CREATE  1       /* set value, fail if attr already exists */
++#define XATTR_REPLACE 2       /* set value, fail if attr does not exist */
++
++#endif        /* _LINUX_XATTR_H */
+
+_
diff --git a/lustre/kernel_patches/patches/removepage-2.4.20.patch b/lustre/kernel_patches/patches/removepage-2.4.20.patch
new file mode 100644 (file)
index 0000000..cc721e1
--- /dev/null
@@ -0,0 +1,28 @@
+ include/linux/fs.h |    1 +
+ mm/filemap.c       |    3 +++
+ 2 files changed, 4 insertions(+)
+
+--- linux-2.4.20-b_llpmd-l24/include/linux/fs.h~removepage-2.4.20      2003-09-05 11:45:42.000000000 -0700
++++ linux-2.4.20-b_llpmd-l24-zab/include/linux/fs.h    2003-09-05 11:46:25.000000000 -0700
+@@ -402,6 +402,7 @@ struct address_space_operations {
+       int (*releasepage) (struct page *, int);
+ #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
++      void (*removepage)(struct page *); /* called when page gets removed from the inode */
+ };
+ struct address_space {
+--- linux-2.4.20-b_llpmd-l24/mm/filemap.c~removepage-2.4.20    2003-09-05 11:45:42.000000000 -0700
++++ linux-2.4.20-b_llpmd-l24-zab/mm/filemap.c  2003-09-05 11:46:25.000000000 -0700
+@@ -95,6 +95,9 @@ static inline void remove_page_from_inod
+ {
+       struct address_space * mapping = page->mapping;
++      if (mapping->a_ops->removepage)
++              mapping->a_ops->removepage(page);
++      
+       mapping->nrpages--;
+       list_del(&page->list);
+       page->mapping = NULL;
+
+_
diff --git a/lustre/kernel_patches/patches/removepage-2.6.0.patch b/lustre/kernel_patches/patches/removepage-2.6.0.patch
new file mode 100644 (file)
index 0000000..268ca97
--- /dev/null
@@ -0,0 +1,28 @@
+ include/linux/fs.h |    1 +
+ mm/filemap.c       |    3 +++
+ 2 files changed, 4 insertions(+)
+
+--- linux-2.6.0-test3-l25/include/linux/fs.h~removepage-2.6.0  2003-09-05 15:31:52.000000000 -0700
++++ linux-2.6.0-test3-l25-zab/include/linux/fs.h       2003-09-08 10:47:30.000000000 -0700
+@@ -311,6 +311,7 @@ struct address_space_operations {
+       int (*releasepage) (struct page *, int);
+       int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+                       loff_t offset, unsigned long nr_segs);
++      void (*removepage)(struct page *); /* called when page gets removed from the inode */
+ };
+ struct backing_dev_info;
+--- linux-2.6.0-test3-l25/mm/filemap.c~removepage-2.6.0        2003-08-08 21:34:39.000000000 -0700
++++ linux-2.6.0-test3-l25-zab/mm/filemap.c     2003-09-08 10:48:10.000000000 -0700
+@@ -81,6 +81,9 @@ void __remove_from_page_cache(struct pag
+ {
+       struct address_space *mapping = page->mapping;
++      if (mapping->a_ops->removepage)
++              mapping->a_ops->removepage(page);
++
+       radix_tree_delete(&mapping->page_tree, page->index);
+       list_del(&page->list);
+       page->mapping = NULL;
+
+_
diff --git a/lustre/kernel_patches/patches/uml-2.6.0-fix.patch b/lustre/kernel_patches/patches/uml-2.6.0-fix.patch
new file mode 100644 (file)
index 0000000..2910f97
--- /dev/null
@@ -0,0 +1,19 @@
+ include/asm-um/unistd.h |    2 ++
+ 1 files changed, 2 insertions(+)
+
+diff -puN include/asm-um/unistd.h~uml-2.6.0-fix include/asm-um/unistd.h
+--- linux-2.6.0-test3/include/asm-um/unistd.h~uml-2.6.0-fix    2003-09-04 18:39:45.000000000 +0400
++++ linux-2.6.0-test3-alexey/include/asm-um/unistd.h   2003-09-04 18:39:59.000000000 +0400
+@@ -6,8 +6,10 @@
+ #ifndef _UM_UNISTD_H_
+ #define _UM_UNISTD_H_
++#ifdef __KERNEL__
+ #include "linux/resource.h"
+ #include "asm/uaccess.h"
++#endif
+ extern long sys_open(const char *filename, int flags, int mode);
+ extern long sys_dup(unsigned int fildes);
+
+_
diff --git a/lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch b/lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch
new file mode 100644 (file)
index 0000000..8ea5a43
--- /dev/null
@@ -0,0 +1,8716 @@
+diff -Naur a/arch/um/Kconfig b/arch/um/Kconfig
+--- a/arch/um/Kconfig  Fri Aug 15 15:05:57 2003
++++ b/arch/um/Kconfig  Fri Aug 15 15:11:53 2003
+@@ -61,6 +61,20 @@
+ config NET
+       bool "Networking support"
++      help
++      Unless you really know what you are doing, you should say Y here.
++      The reason is that some programs need kernel networking support even
++      when running on a stand-alone machine that isn't connected to any
++      other computer. If you are upgrading from an older kernel, you
++      should consider updating your networking tools too because changes
++      in the kernel and the tools often go hand in hand. The tools are
++      contained in the package net-tools, the location and version number
++      of which are given in Documentation/Changes.
++
++      For a general introduction to Linux networking, it is highly
++      recommended to read the NET-HOWTO, available from
++      <http://www.tldp.org/docs.html#howto>.
++
+ source "fs/Kconfig.binfmt"
+@@ -85,6 +99,19 @@
+         If you'd like to be able to work with files stored on the host, 
+         say Y or M here; otherwise say N.
++config HPPFS
++      tristate "HoneyPot ProcFS"
++      help
++      hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc 
++      entries to be overridden, removed, or fabricated from the host.
++      Its purpose is to allow a UML to appear to be a physical machine
++      by removing or changing anything in /proc which gives away the
++      identity of a UML.
++
++      See http://user-mode-linux.sf.net/hppfs.html for more information.
++
++      You only need this if you are setting up a UML honeypot.  Otherwise,
++      it is safe to say 'N' here.
+ config MCONSOLE
+       bool "Management console"
+@@ -105,6 +132,16 @@
+ config MAGIC_SYSRQ
+       bool "Magic SysRq key"
+       depends on MCONSOLE
++      help
++      If you say Y here, you will have some control over the system even
++      if the system crashes for example during kernel debugging (e.g., you
++      will be able to flush the buffer cache to disk, reboot the system
++      immediately or dump some status information). This is accomplished
++      by pressing various keys while holding SysRq (Alt+PrintScreen). It
++      also works on a serial console (on PC hardware at least), if you
++      send a BREAK and then within 5 seconds a command keypress. The
++      keys are documented in Documentation/sysrq.txt. Don't say Y
++      unless you really know what this hack does.
+ config HOST_2G_2G
+       bool "2G/2G host address space split"
+@@ -159,6 +196,9 @@
+ config HIGHMEM
+       bool "Highmem support"
++config PROC_MM
++      bool "/proc/mm support"
++
+ config KERNEL_STACK_ORDER
+       int "Kernel stack size order"
+       default 2
+@@ -239,6 +279,10 @@
+ config PT_PROXY
+       bool "Enable ptrace proxy"
+       depends on XTERM_CHAN && DEBUG_INFO
++      help
++      This option enables a debugging interface which allows gdb to debug
++      the kernel without needing to actually attach to kernel threads.
++      If you want to do kernel debugging, say Y here; otherwise say N.
+ config GPROF
+       bool "Enable gprof support"
+diff -Naur a/arch/um/Kconfig_block b/arch/um/Kconfig_block
+--- a/arch/um/Kconfig_block    Fri Aug 15 15:07:32 2003
++++ b/arch/um/Kconfig_block    Fri Aug 15 15:12:56 2003
+@@ -29,6 +29,20 @@
+         wise choice too.  In all other cases (for example, if you're just
+         playing around with User-Mode Linux) you can choose N.
++# Turn this back on when the driver actually works
++#
++#config BLK_DEV_COW
++#     tristate "COW block device"
++#     help
++#     This is a layered driver which sits above two other block devices.
++#     One is read-only, and the other is a read-write layer which stores
++#     all changes.  This provides the illusion that the read-only layer
++#     can be mounted read-write and changed.
++
++config BLK_DEV_COW_COMMON
++      bool
++      default BLK_DEV_COW || BLK_DEV_UBD
++
+ config BLK_DEV_LOOP
+       tristate "Loopback device support"
+diff -Naur a/arch/um/Kconfig_net b/arch/um/Kconfig_net
+--- a/arch/um/Kconfig_net      Fri Aug 15 15:06:52 2003
++++ b/arch/um/Kconfig_net      Fri Aug 15 15:12:43 2003
+@@ -1,5 +1,5 @@
+-menu "Network Devices"
++menu "UML Network Devices"
+       depends on NET
+ # UML virtual driver
+@@ -176,73 +176,5 @@
+       
+         Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp"
+-
+-# Below are hardware-independent drivers mirrored from
+-# drivers/net/Config.in. It would be nice if Linux
+-# had HW independent drivers separated from the other
+-# but it does not. Until then each non-ISA/PCI arch
+-# needs to provide it's own menu of network drivers
+-config DUMMY
+-      tristate "Dummy net driver support"
+-
+-config BONDING
+-      tristate "Bonding driver support"
+-
+-config EQUALIZER
+-      tristate "EQL (serial line load balancing) support"
+-
+-config TUN
+-      tristate "Universal TUN/TAP device driver support"
+-
+-config ETHERTAP
+-      tristate "Ethertap network tap (OBSOLETE)"
+-      depends on EXPERIMENTAL && NETLINK
+-
+-config PPP
+-      tristate "PPP (point-to-point protocol) support"
+-
+-config PPP_MULTILINK
+-      bool "PPP multilink support (EXPERIMENTAL)"
+-      depends on PPP && EXPERIMENTAL
+-
+-config PPP_FILTER
+-      bool "PPP filtering"
+-      depends on PPP && FILTER
+-
+-config PPP_ASYNC
+-      tristate "PPP support for async serial ports"
+-      depends on PPP
+-
+-config PPP_SYNC_TTY
+-      tristate "PPP support for sync tty ports"
+-      depends on PPP
+-
+-config PPP_DEFLATE
+-      tristate "PPP Deflate compression"
+-      depends on PPP
+-
+-config PPP_BSDCOMP
+-      tristate "PPP BSD-Compress compression"
+-      depends on PPP
+-
+-config PPPOE
+-      tristate "PPP over Ethernet (EXPERIMENTAL)"
+-      depends on PPP && EXPERIMENTAL
+-
+-config SLIP
+-      tristate "SLIP (serial line) support"
+-
+-config SLIP_COMPRESSED
+-      bool "CSLIP compressed headers"
+-      depends on SLIP=y
+-
+-config SLIP_SMART
+-      bool "Keepalive and linefill"
+-      depends on SLIP=y
+-
+-config SLIP_MODE_SLIP6
+-      bool "Six bit SLIP encapsulation"
+-      depends on SLIP=y
+-
+ endmenu
+diff -Naur a/arch/um/Makefile b/arch/um/Makefile
+--- a/arch/um/Makefile Fri Aug 15 15:07:18 2003
++++ b/arch/um/Makefile Fri Aug 15 15:12:45 2003
+@@ -24,15 +24,17 @@
+ # Have to precede the include because the included Makefiles reference them.
+ SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \
+       include/asm-um/sigcontext.h include/asm-um/processor.h \
+-      include/asm-um/ptrace.h include/asm-um/arch-signal.h
++      include/asm-um/ptrace.h include/asm-um/arch-signal.h \
++      include/asm-um/module.h
+ ARCH_SYMLINKS = include/asm-um/arch $(ARCH_DIR)/include/sysdep $(ARCH_DIR)/os \
+       $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h
+ GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h
+-include $(ARCH_DIR)/Makefile-$(SUBARCH)
+-include $(ARCH_DIR)/Makefile-os-$(OS)
++.PHONY: sys_prepare
++sys_prepare:
++      @:
+ MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt
+ MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas
+@@ -41,6 +43,9 @@
+   include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y))
+ endif
++include $(ARCH_DIR)/Makefile-$(SUBARCH)
++include $(ARCH_DIR)/Makefile-os-$(OS)
++
+ EXTRAVERSION := $(EXTRAVERSION)-1um
+ ARCH_INCLUDE = -I$(ARCH_DIR)/include
+@@ -52,14 +57,14 @@
+ CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \
+       -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \
+-      $(MODE_INCLUDE)
++      -Dsigprocmask=kernel_sigprocmask $(MODE_INCLUDE)
+ LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
+ SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000)
+ ifeq ($(CONFIG_MODE_SKAS), y)
+-$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h
++$(SYS_HEADERS) : $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h
+ endif
+ include/linux/version.h: arch/$(ARCH)/Makefile
+@@ -116,6 +121,7 @@
+ USER_CFLAGS := $(patsubst -I%,,$(CFLAGS))
+ USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS))
++USER_CFLAGS := $(patsubst -Dsigprocmask=kernel_sigprocmask,,$(USER_CFLAGS))
+ USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \
+       $(MODE_INCLUDE)
+@@ -123,9 +129,10 @@
+ USER_CFLAGS += -D_GNU_SOURCE
+ CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/uml.lds.s \
+-      $(ARCH_DIR)/dyn_link.ld.s $(GEN_HEADERS)
++      $(ARCH_DIR)/dyn_link.ld.s $(ARCH_DIR)/include/uml-config.h \
++      $(GEN_HEADERS)
+-$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c
++$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c sys_prepare
+       $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $<
+ archmrproper:
+@@ -161,19 +168,23 @@
+ $(ARCH_DIR)/os:
+       cd $(ARCH_DIR) && ln -sf os-$(OS) os
+-$(ARCH_DIR)/include/uml-config.h :
++$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h
+       sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@
++filechk_$(ARCH_DIR)/include/task.h := $(ARCH_DIR)/util/mk_task
++
+ $(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task
+-      $< > $@
++      $(call filechk,$@)
++
++filechk_$(ARCH_DIR)/include/kern_constants.h := $(ARCH_DIR)/util/mk_constants
+ $(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants
+-      $< > $@
++      $(call filechk,$@)
+-$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h \
+-      $(ARCH_DIR)/util FORCE ;
++$(ARCH_DIR)/util/mk_task $(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util \
++      sys_prepare FORCE ;
+ $(ARCH_DIR)/util: FORCE
+-      @$(call descend,$@,)
++      $(MAKE) -f scripts/Makefile.build obj=$@
+-export SUBARCH USER_CFLAGS OS
++export SUBARCH USER_CFLAGS OS 
+diff -Naur a/arch/um/Makefile-i386 b/arch/um/Makefile-i386
+--- a/arch/um/Makefile-i386    Fri Aug 15 15:07:46 2003
++++ b/arch/um/Makefile-i386    Fri Aug 15 15:13:14 2003
+@@ -16,22 +16,28 @@
+ SYS_HEADERS = $(SYS_DIR)/sc.h $(SYS_DIR)/thread.h
++sys_prepare: $(SYS_DIR)/sc.h
++
+ prepare: $(SYS_HEADERS)
++filechk_$(SYS_DIR)/sc.h := $(SYS_UTIL_DIR)/mk_sc
++
+ $(SYS_DIR)/sc.h: $(SYS_UTIL_DIR)/mk_sc
+-      $< > $@
++      $(call filechk,$@)
++
++filechk_$(SYS_DIR)/thread.h := $(SYS_UTIL_DIR)/mk_thread 
+ $(SYS_DIR)/thread.h: $(SYS_UTIL_DIR)/mk_thread 
+-      $< > $@
++      $(call filechk,$@)
+-$(SYS_UTIL_DIR)/mk_sc: FORCE ; 
+-      @$(call descend,$(SYS_UTIL_DIR),$@)
++$(SYS_UTIL_DIR)/mk_sc: scripts/fixdep include/config/MARKER FORCE ; 
++      +@$(call descend,$(SYS_UTIL_DIR),$@)
+-$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) FORCE ; 
+-      @$(call descend,$(SYS_UTIL_DIR),$@)
++$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) sys_prepare FORCE ; 
++      +@$(call descend,$(SYS_UTIL_DIR),$@)
+ $(SYS_UTIL_DIR): include/asm FORCE
+-      @$(call descend,$@,)
++      +@$(call descend,$@,)
+ sysclean :
+       rm -f $(SYS_HEADERS)
+diff -Naur a/arch/um/Makefile-skas b/arch/um/Makefile-skas
+--- a/arch/um/Makefile-skas    Fri Aug 15 15:05:43 2003
++++ b/arch/um/Makefile-skas    Fri Aug 15 15:11:52 2003
+@@ -14,7 +14,7 @@
+ LINK_SKAS = -Wl,-rpath,/lib 
+ LD_SCRIPT_SKAS = dyn.lds.s
+-GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h
++GEN_HEADERS += $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h
+-$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h :
+-      $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h
++$(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h :
++      $(call descend,$(ARCH_DIR)/kernel/skas,$@)
+diff -Naur a/arch/um/config.release b/arch/um/config.release
+--- a/arch/um/config.release   Fri Aug 15 15:09:05 2003
++++ b/arch/um/config.release   Fri Aug 15 15:13:48 2003
+@@ -228,7 +228,6 @@
+ CONFIG_EXT2_FS=y
+ CONFIG_SYSV_FS=m
+ CONFIG_UDF_FS=m
+-# CONFIG_UDF_RW is not set
+ CONFIG_UFS_FS=m
+ # CONFIG_UFS_FS_WRITE is not set
+diff -Naur a/arch/um/defconfig b/arch/um/defconfig
+--- a/arch/um/defconfig        Fri Aug 15 15:07:30 2003
++++ b/arch/um/defconfig        Fri Aug 15 15:12:54 2003
+@@ -6,7 +6,6 @@
+ CONFIG_SWAP=y
+ CONFIG_UID16=y
+ CONFIG_RWSEM_GENERIC_SPINLOCK=y
+-CONFIG_CONFIG_LOG_BUF_SHIFT=14
+ #
+ # Code maturity level options
+@@ -116,7 +115,6 @@
+ CONFIG_PACKET_MMAP=y
+ # CONFIG_NETLINK_DEV is not set
+ # CONFIG_NETFILTER is not set
+-# CONFIG_FILTER is not set
+ CONFIG_UNIX=y
+ # CONFIG_NET_KEY is not set
+ CONFIG_INET=y
+@@ -385,7 +383,6 @@
+ #
+ # Disk-On-Chip Device Drivers
+ #
+-# CONFIG_MTD_DOC1000 is not set
+ # CONFIG_MTD_DOC2000 is not set
+ # CONFIG_MTD_DOC2001 is not set
+diff -Naur a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
+--- a/arch/um/drivers/Makefile Fri Aug 15 15:06:42 2003
++++ b/arch/um/drivers/Makefile Fri Aug 15 15:12:40 2003
+@@ -1,5 +1,5 @@
+ # 
+-# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++# Copyright (C) 2000, 2002, 2003 Jeff Dike (jdike@karaya.com)
+ # Licensed under the GPL
+ #
+@@ -39,6 +39,8 @@
+ obj-$(CONFIG_TTY_CHAN) += tty.o 
+ obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o
+ obj-$(CONFIG_UML_WATCHDOG) += harddog.o
++obj-$(CONFIG_BLK_DEV_COW) += cow_kern.o
++obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o
+ obj-y += stdio_console.o $(CHAN_OBJS)
+@@ -46,7 +48,7 @@
+ USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \
+       null.o pty.o tty.o xterm.o
+-USER_OBJS := $(foreach file,$(USER_OBJS),arch/um/drivers/$(file))
++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+ $(USER_OBJS) : %.o: %.c
+       $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
+diff -Naur a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
+--- a/arch/um/drivers/chan_kern.c      Fri Aug 15 15:09:13 2003
++++ b/arch/um/drivers/chan_kern.c      Fri Aug 15 15:13:51 2003
+@@ -8,6 +8,7 @@
+ #include <linux/list.h>
+ #include <linux/slab.h>
+ #include <linux/tty.h>
++#include <linux/string.h>
+ #include <linux/tty_flip.h>
+ #include <asm/irq.h>
+ #include "chan_kern.h"
+diff -Naur a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c
+--- a/arch/um/drivers/chan_user.c      Fri Aug 15 15:03:46 2003
++++ b/arch/um/drivers/chan_user.c      Fri Aug 15 15:10:09 2003
+@@ -188,8 +188,8 @@
+       if(!isatty(fd)) return;
+       pid = tcgetpgrp(fd);
+-      if(!CHOOSE_MODE(is_tracer_winch(pid, fd, device_data), 0) && 
+-         (pid == -1)){
++      if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, 
++                           device_data) && (pid == -1)){
+               thread = winch_tramp(fd, device_data, &thread_fd);
+               if(fd != -1){
+                       register_winch_irq(thread_fd, fd, thread, device_data);
+diff -Naur a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h
+--- a/arch/um/drivers/cow.h    Wed Dec 31 19:00:00 1969
++++ b/arch/um/drivers/cow.h    Fri Aug 15 15:10:34 2003
+@@ -0,0 +1,40 @@
++#ifndef __COW_H__
++#define __COW_H__
++
++#include <asm/types.h>
++
++#if __BYTE_ORDER == __BIG_ENDIAN
++# define ntohll(x) (x)
++# define htonll(x) (x)
++#elif __BYTE_ORDER == __LITTLE_ENDIAN
++# define ntohll(x)  bswap_64(x)
++# define htonll(x)  bswap_64(x)
++#else
++#error "__BYTE_ORDER not defined"
++#endif
++
++extern int init_cow_file(int fd, char *cow_file, char *backing_file, 
++                       int sectorsize, int *bitmap_offset_out, 
++                       unsigned long *bitmap_len_out, int *data_offset_out);
++
++extern int file_reader(__u64 offset, char *buf, int len, void *arg);
++extern int read_cow_header(int (*reader)(__u64, char *, int, void *), 
++                         void *arg, __u32 *magic_out, 
++                         char **backing_file_out, time_t *mtime_out, 
++                         __u64 *size_out, int *sectorsize_out, 
++                         int *bitmap_offset_out);
++
++extern int write_cow_header(char *cow_file, int fd, char *backing_file, 
++                          int sectorsize, long long *size);
++
++extern void cow_sizes(__u64 size, int sectorsize, int bitmap_offset, 
++                    unsigned long *bitmap_len_out, int *data_offset_out);
++
++#endif
++
++/*
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/arch/um/drivers/cow_kern.c b/arch/um/drivers/cow_kern.c
+--- a/arch/um/drivers/cow_kern.c       Wed Dec 31 19:00:00 1969
++++ b/arch/um/drivers/cow_kern.c       Fri Aug 15 15:13:51 2003
+@@ -0,0 +1,628 @@
++#define COW_MAJOR 60
++#define MAJOR_NR COW_MAJOR
++
++#include <linux/stddef.h>
++#include <linux/kernel.h>
++#include <linux/ctype.h>
++#include <linux/stat.h>
++#include <linux/vmalloc.h>
++#include <linux/blkdev.h>
++#include <linux/blk.h>
++#include <linux/fs.h>
++#include <linux/genhd.h>
++#include <linux/devfs_fs.h>
++#include <asm/uaccess.h>
++#include "2_5compat.h"
++#include "cow.h"
++#include "ubd_user.h"
++
++#define COW_SHIFT 4
++
++struct cow { /* per-device state; one entry of cow_dev[] per COW device */
++      int count; /* tested against 0 in cow_open before first-time setup */
++      char *cow_path; /* NULL means this slot is unused (see cow_add) */
++      dev_t cow_dev; /* set from name_to_kdev_t(cow_path) in cow_open */
++      struct block_device *cow_bdev; /* bdget(cow_dev) */
++      char *backing_path;
++      dev_t backing_dev; /* set from name_to_kdev_t(backing_path) in cow_open */
++      struct block_device *backing_bdev; /* bdget(backing_dev) */
++      int sectorsize;
++      unsigned long *bitmap; /* tested/set per sector via ubd_test_bit/ubd_set_bit */
++      unsigned long bitmap_len;
++      int bitmap_offset; /* presumably byte offset of the bitmap in the COW device - TODO confirm */
++      int data_offset; /* divided by sectorsize and added to block numbers sent to cow_dev */
++      devfs_handle_t devfs; /* handle returned by devfs_register in cow_add */
++      struct semaphore sem; /* initialised locked; cow_thread up()s it at start and at exit */
++      struct semaphore io_sem; /* up()ed per queued write; cow_thread sleeps on it */
++      atomic_t working; /* cow_thread leaves its loop when this reads or drops to zero */
++      spinlock_t io_lock; /* guards the bh/bhtail pending list */
++      struct buffer_head *bh; /* head of pending list, linked through b_reqnext */
++      struct buffer_head *bhtail; /* tail of pending list, NULL when empty */
++      void *end_io; /* only ever set to NULL in the code visible here */
++};
++
++#define DEFAULT_COW { \
++      .count                  = 0, \
++      .cow_path               = NULL, \
++      .cow_dev                = 0, \
++      .backing_path           = NULL, \
++      .backing_dev            = 0, \
++        .bitmap                       = NULL, \
++      .bitmap_len             = 0, \
++      .bitmap_offset          = 0, \
++        .data_offset          = 0, \
++      .devfs                  = NULL, \
++      .working                = ATOMIC_INIT(0), \
++      .io_lock                = SPIN_LOCK_UNLOCKED, \
++}
++
++#define MAX_DEV (8)
++#define MAX_MINOR (MAX_DEV << COW_SHIFT)
++
++struct cow cow_dev[MAX_DEV] = { [ 0 ... MAX_DEV - 1 ] = DEFAULT_COW };
++
++/* Not modified by this driver */
++static int blk_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = BLOCK_SIZE };
++static int hardsect_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 512 };
++
++/* Protected by cow_lock */
++static int sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 0 };
++
++static struct hd_struct       cow_part[MAX_MINOR] =
++      { [ 0 ... MAX_MINOR - 1 ] = { 0, 0, 0 } };
++
++/* Protected by io_request_lock */
++static request_queue_t *cow_queue;
++
++static int cow_open(struct inode *inode, struct file *filp);
++static int cow_release(struct inode * inode, struct file * file);
++static int cow_ioctl(struct inode * inode, struct file * file,
++                   unsigned int cmd, unsigned long arg);
++static int cow_revalidate(kdev_t rdev);
++
++static struct block_device_operations cow_blops = {
++       .open          = cow_open,
++       .release       = cow_release,
++       .ioctl         = cow_ioctl,
++       .revalidate    = cow_revalidate,
++};
++
++/* Initialized in an initcall, and unchanged thereafter */
++devfs_handle_t cow_dir_handle;
++
++#define INIT_GENDISK(maj, name, parts, shift, bsizes, max, blops) \
++{ \
++      .major          = maj, \
++      .major_name     = name, \
++      .minor_shift    = shift, \
++      .max_p          = 1 << shift, \
++      .part           = parts, \
++      .sizes          = bsizes, \
++      .nr_real        = max, \
++      .real_devices   = NULL, \
++      .next           = NULL, \
++      .fops           = blops, \
++      .de_arr         = NULL, \
++      .flags          = 0 \
++}
++
++static spinlock_t cow_lock = SPIN_LOCK_UNLOCKED;
++
++static struct gendisk cow_gendisk = INIT_GENDISK(MAJOR_NR, "cow", cow_part,
++                                               COW_SHIFT, sizes, MAX_DEV, 
++                                               &cow_blops);
++
++static int cow_add(int n) /* set up devfs node and semaphores for cow_dev[n] */
++{
++      struct cow *dev = &cow_dev[n];
++      char name[sizeof("nnnnnn\0")];
++      int err = -ENODEV;
++
++      if(dev->cow_path == NULL) /* slot has no COW path: return -ENODEV */
++              goto out;
++
++      sprintf(name, "%d", n); /* devfs entry is named after the device index */
++      dev->devfs = devfs_register(cow_dir_handle, name, DEVFS_FL_REMOVABLE,
++                                  MAJOR_NR, n << COW_SHIFT, S_IFBLK | 
++                                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP,
++                                  &cow_blops, NULL);
++      /* NOTE(review): the devfs_register() result is stored but not checked */
++      init_MUTEX_LOCKED(&dev->sem); /* starts locked; cow_thread up()s it once running */
++      init_MUTEX(&dev->io_sem);
++
++      return(0);
++
++out:
++      return(err);
++}
++
++/*
++* Append bh to the tail of cow's pending list, holding cow->io_lock
++*/
++static void cow_add_bh(struct cow *cow, struct buffer_head *bh)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&cow->io_lock, flags);
++      if(cow->bhtail != NULL){
++              cow->bhtail->b_reqnext = bh; /* link after the current tail */
++              cow->bhtail = bh;
++      }
++      else { /* list was empty: bh becomes both head and tail */
++              cow->bh = bh;
++              cow->bhtail = bh;
++      }
++      spin_unlock_irqrestore(&cow->io_lock, flags);
++}
++
++/*
++* Detach and return the first pending buffer, or NULL if the list is empty
++*/
++static struct buffer_head *cow_get_bh(struct cow *cow)
++{
++      struct buffer_head *bh;
++
++      spin_lock_irq(&cow->io_lock);
++      bh = cow->bh;
++      if(bh != NULL){
++              if(bh == cow->bhtail) /* removing the only element empties the list */
++                      cow->bhtail = NULL;
++              cow->bh = bh->b_reqnext;
++              bh->b_reqnext = NULL; /* returned buffer no longer points into the list */
++      }
++      spin_unlock_irq(&cow->io_lock);
++
++      return(bh);
++}
++
++static void cow_handle_bh(struct cow *cow, struct buffer_head *bh, 
++                        struct buffer_head **cow_bh, int ncow_bh)
++{
++      int i;
++      /* Write the ncow_bh buffers in cow_bh[] first, then submit bh itself. */
++      if(ncow_bh > 0)
++              ll_rw_block(WRITE, ncow_bh, cow_bh);
++      /* Wait for each of those writes to finish before dropping its reference. */
++      for(i = 0; i < ncow_bh ; i++){
++              wait_on_buffer(cow_bh[i]);
++              brelse(cow_bh[i]);
++      }
++      /* bh is submitted and released without waiting; the cow argument is unused here. */
++      ll_rw_block(WRITE, 1, &bh);
++      brelse(bh);
++}
++
++static struct buffer_head *cow_new_bh(struct cow *dev, int sector)
++{
++      struct buffer_head *bh;
++      /* Build a buffer on dev->cow_dev holding one sectorsize chunk of the in-memory bitmap. */
++      sector = (dev->bitmap_offset + sector / 8) / dev->sectorsize; /* presumably the bitmap block holding this sector's bit - TODO confirm */
++      bh = getblk(dev->cow_dev, sector, dev->sectorsize); /* NOTE(review): result is used without a NULL check */
++      memcpy(bh->b_data, dev->bitmap + sector / (8 * sizeof(dev->bitmap[0])), /* NOTE(review): sector was overwritten above and is reused as the bitmap index - confirm intended */
++             dev->sectorsize);
++      return(bh);
++}
++
++/* Per-device worker thread, copied from loop.c; queued writes are handled
++ * here instead of in make_request, which is needed to avoid deadlocking. */
++static int cow_thread(void *data)
++{
++      struct cow *dev = data;
++      struct buffer_head *bh;
++
++      daemonize();
++      exit_files(current);
++
++      sprintf(current->comm, "cow%d", (int) (dev - cow_dev)); /* index in cow_dev[] */
++
++      spin_lock_irq(&current->sigmask_lock);
++      sigfillset(&current->blocked);
++      flush_signals(current);
++      spin_unlock_irq(&current->sigmask_lock);
++
++      atomic_inc(&dev->working);
++
++      current->policy = SCHED_OTHER;
++      current->nice = -20;
++
++      current->flags |= PF_NOIO;
++
++      /*
++       * up sem, we are running
++       */
++      up(&dev->sem);
++
++      for(;;){
++              int start, len, nbh, i, update_bitmap = 0;
++              struct buffer_head *cow_bh[2];
++
++              down_interruptible(&dev->io_sem);
++              /*
++               * could be upped because of tear-down, not because of
++               * pending work
++               */
++              if(!atomic_read(&dev->working))
++                      break;
++
++              bh = cow_get_bh(dev);
++              if(bh == NULL){
++                      printk(KERN_ERR "cow: missing bh\n");
++                      continue;
++              }
++              /* Mark every sector covered by bh; note whether any bit was newly set. */
++              start = bh->b_blocknr * bh->b_size / dev->sectorsize;
++              len = bh->b_size / dev->sectorsize;
++              for(i = 0; i < len ; i++){
++                      if(ubd_test_bit(start + i, 
++                                      (unsigned char *) dev->bitmap))
++                              continue;
++
++                      update_bitmap = 1;
++                      ubd_set_bit(start + i, (unsigned char *) dev->bitmap);
++              }
++              /* If the bitmap changed, queue one or two bitmap buffers to write first. */
++              cow_bh[0] = NULL;
++              cow_bh[1] = NULL;
++              nbh = 0;
++              if(update_bitmap){
++                      cow_bh[0] = cow_new_bh(dev, start);
++                      nbh++;
++                      if(start / dev->sectorsize != 
++                         (start + len) / dev->sectorsize){
++                              cow_bh[1] = cow_new_bh(dev, start + len);
++                              nbh++;
++                      }
++              }
++              
++              bh->b_dev = dev->cow_dev;
++              bh->b_blocknr += dev->data_offset / dev->sectorsize;
++
++              cow_handle_bh(dev, bh, cow_bh, nbh);
++
++              /*
++               * upped both for pending work and tear-down, dev->working
++               * will hit zero then
++               */ /* NOTE(review): working is incremented once above, not per queued bh - confirm */
++              if(atomic_dec_and_test(&dev->working))
++                      break;
++      }
++
++      up(&dev->sem);
++      return(0);
++}
++
++static int cow_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
++{
++      struct cow *dev;
++      int n, minor;
++      /* Route bh to the COW device, the backing device, or the worker's pending list. */
++      minor = MINOR(bh->b_rdev);
++      n = minor >> COW_SHIFT; /* NOTE(review): n is not checked against MAX_DEV here */
++      dev = &cow_dev[n];
++
++      dev->end_io = NULL;
++      if(ubd_test_bit(bh->b_rsector, (unsigned char *) dev->bitmap)){ /* bit set: redirect to the COW device */
++              bh->b_rdev = dev->cow_dev;
++              bh->b_rsector += dev->data_offset / dev->sectorsize;
++      }
++      else if(rw == WRITE){ /* bit clear and writing: queue bh and wake the worker via io_sem */
++              bh->b_dev = dev->cow_dev;
++              bh->b_blocknr += dev->data_offset / dev->sectorsize; /* NOTE(review): cow_thread adds this offset to b_blocknr again - confirm */
++
++              cow_add_bh(dev, bh);
++              up(&dev->io_sem);
++              return(0);
++      }
++      else { /* bit clear and not a write: redirect to the backing device */
++              bh->b_rdev = dev->backing_dev;
++      }
++
++      return(1); /* presumably asks the block layer to resubmit bh to the new b_rdev - TODO confirm */
++}
++
++int cow_init(void) /* initcall: register the cow major, queue and gendisk, then add each device */
++{
++      int i;
++
++      cow_dir_handle = devfs_mk_dir (NULL, "cow", NULL);
++      if (devfs_register_blkdev(MAJOR_NR, "cow", &cow_blops)) {
++              printk(KERN_ERR "cow: unable to get major %d\n", MAJOR_NR);
++              return -1;
++      }
++      read_ahead[MAJOR_NR] = 8;               /* 8 sector (4kB) read-ahead */
++      blksize_size[MAJOR_NR] = blk_sizes;
++      blk_size[MAJOR_NR] = sizes;
++      INIT_HARDSECT(hardsect_size, MAJOR_NR, hardsect_sizes);
++      /* Requests go through cow_make_request rather than a request function (NULL passed). */
++      cow_queue = BLK_DEFAULT_QUEUE(MAJOR_NR);
++      blk_init_queue(cow_queue, NULL);
++      INIT_ELV(cow_queue, &cow_queue->elevator);
++      blk_queue_make_request(cow_queue, cow_make_request);
++
++       add_gendisk(&cow_gendisk);
++      /* cow_add() returns -ENODEV for slots without a cow_path; the result is ignored. */
++      for(i=0;i<MAX_DEV;i++) 
++              cow_add(i);
++
++      return(0);
++}
++
++__initcall(cow_init);
++
++static int reader(__u64 start, char *buf, int count, void *arg)
++{
++      dev_t dev = *((dev_t *) arg); /* arg points at the device number to read from */
++      struct buffer_head *bh;
++      __u64 block;
++      int cur, offset, left, n, blocksize = get_hardsect_size(dev);
++
++      if(blocksize == 0)
++              panic("Zero blocksize");
++      /* Copy count bytes starting at byte offset start into buf, one block per bread(). */
++      block = start / blocksize;
++      offset = start % blocksize; /* non-zero only for the first block */
++      left = count;
++      cur = 0;
++      while(left > 0){
++              n = (left > blocksize) ? blocksize : left;
++
++              bh = bread(dev, block, (n < 512) ? 512 : n);
++              if(bh == NULL)
++                      return(-EIO);
++
++              n -= offset; /* NOTE(review): goes negative if offset exceeds n - confirm callers */
++              memcpy(&buf[cur], bh->b_data + offset, n);
++              block++;
++              left -= n;
++              cur += n;
++              offset = 0;
++              brelse(bh);
++      }
++
++      return(count); /* full count on success; -EIO is returned above if a bread() fails */
++}
++
++/*
++ * open() handler for a COW block device.
++ *
++ * On the first open of a unit (dev->count == 0) this resolves cow_path and
++ * backing_path to devices, opens both, reads the COW header and the COW
++ * bitmap from the COW device, asks the backing driver for its size (stored
++ * in sizes[]), starts cow_thread and waits on dev->sem.  Every successful
++ * open increments dev->count.
++ *
++ * Returns 0 on success, -ENODEV for an out-of-range unit or an unresolvable
++ * path, -ENOMEM on allocation failure, or the error of the failing call.
++ *
++ * NOTE(review): everything below runs with the cow_lock spinlock held,
++ * including bdget, blkdev_get, vmalloc, bread (via reader) and down() -
++ * presumably some of these can sleep; confirm.
++ * NOTE(review): the error exits after bdget/blkdev_get have succeeded go
++ * straight to 'out' without releasing the block devices, and a BLKGETSIZE
++ * failure leaves dev->bitmap allocated - looks like leaks, confirm.
++ */
++static int cow_open(struct inode *inode, struct file *filp)
++{
++      int (*dev_ioctl)(struct inode *, struct file *, unsigned int, 
++                       unsigned long);
++      mm_segment_t fs;
++      struct cow *dev;
++      __u64 size;
++      __u32 magic;
++      time_t mtime;
++      char *backing_file;
++      int n, offset, err = 0;
++
++      n = DEVICE_NR(inode->i_rdev);
++      if(n >= MAX_DEV)
++              return(-ENODEV);
++      dev = &cow_dev[n];
++      /* index of this unit's entry in sizes[] */
++      offset = n << COW_SHIFT;
++
++      spin_lock(&cow_lock);
++
++      if(dev->count == 0){
++              /* Both paths are resolved before bailing out so that a
++               * problem with either one gets reported.
++               */
++              dev->cow_dev = name_to_kdev_t(dev->cow_path);
++              if(dev->cow_dev == 0){
++                      printk(KERN_ERR "cow_open - name_to_kdev_t(\"%s\") "
++                             "failed\n", dev->cow_path);
++                      err = -ENODEV;
++              }
++
++              dev->backing_dev = name_to_kdev_t(dev->backing_path);
++              if(dev->backing_dev == 0){
++                      printk(KERN_ERR "cow_open - name_to_kdev_t(\"%s\") "
++                             "failed\n", dev->backing_path);
++                      err = -ENODEV;
++              }
++
++              if(err) 
++                      goto out;
++
++              dev->cow_bdev = bdget(dev->cow_dev);
++              if(dev->cow_bdev == NULL){
++                      printk(KERN_ERR "cow_open - bdget(\"%s\") failed\n", 
++                             dev->cow_path);
++                      err = -ENOMEM;
++              }
++              dev->backing_bdev = bdget(dev->backing_dev);
++              if(dev->backing_bdev == NULL){
++                      printk(KERN_ERR "cow_open - bdget(\"%s\") failed\n", 
++                             dev->backing_path);
++                      err = -ENOMEM;
++              }
++
++              if(err) 
++                      goto out;
++
++              /* The COW device is opened read/write, the backing device
++               * read-only.
++               */
++              err = blkdev_get(dev->cow_bdev, FMODE_READ|FMODE_WRITE, 0, 
++                               BDEV_RAW);
++              if(err){
++                      printk("cow_open - blkdev_get of COW device failed, "
++                             "error = %d\n", err);
++                      goto out;
++              }
++              
++              err = blkdev_get(dev->backing_bdev, FMODE_READ, 0, BDEV_RAW);
++              if(err){
++                      printk("cow_open - blkdev_get of backing device "
++                             "failed, error = %d\n", err);
++                      goto out;
++              }
++              
++              /* NOTE(review): magic, mtime and backing_file are filled in
++               * here but not used afterwards in this function;
++               * backing_file is a fresh copy made by read_cow_header and
++               * is never freed here - confirm.
++               */
++              err = read_cow_header(reader, &dev->cow_dev, &magic, 
++                                    &backing_file, &mtime, &size,
++                                    &dev->sectorsize, &dev->bitmap_offset);
++              if(err){
++                      printk(KERN_ERR "cow_open - read_cow_header failed, "
++                             "err = %d\n", err);
++                      goto out;
++              }
++
++              cow_sizes(size, dev->sectorsize, dev->bitmap_offset, 
++                        &dev->bitmap_len, &dev->data_offset);
++              dev->bitmap = (void *) vmalloc(dev->bitmap_len);
++              if(dev->bitmap == NULL){
++                      err = -ENOMEM;
++                      printk(KERN_ERR "Failed to vmalloc COW bitmap\n");
++                      goto out;
++              }
++              flush_tlb_kernel_vm();
++              
++              /* reader() returns the byte count on success, so err is
++               * positive here until the ioctl result below overwrites it.
++               */
++              err = reader(dev->bitmap_offset, (char *) dev->bitmap, 
++                           dev->bitmap_len, &dev->cow_dev);
++              if(err < 0){
++                      printk(KERN_ERR "Failed to read COW bitmap\n");
++                      vfree(dev->bitmap);
++                      goto out;
++              }
++
++              /* Ask the backing driver for its size.  The address limit
++               * is raised to KERNEL_DS around the call because the result
++               * pointer is a kernel address.
++               * NOTE(review): the ioctl is passed this device's inode and
++               * filp, not ones belonging to the backing device - confirm
++               * that the backing driver copes with that.
++               */
++              dev_ioctl = dev->backing_bdev->bd_op->ioctl;
++              fs = get_fs();
++              set_fs(KERNEL_DS);
++              err = (*dev_ioctl)(inode, filp, BLKGETSIZE, 
++                                 (unsigned long) &sizes[offset]);
++              set_fs(fs);
++              if(err){
++                      printk(KERN_ERR "cow_open - BLKGETSIZE failed, "
++                             "error = %d\n", err);
++                      goto out;
++              }
++
++              /* Start the per-device thread, then block on dev->sem -
++               * presumably until the thread signals it is running; verify
++               * against cow_thread.
++               */
++              kernel_thread(cow_thread, dev, 
++                            CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
++              down(&dev->sem);
++      }
++      dev->count++;
++out:
++      spin_unlock(&cow_lock);
++      return(err);
++}
++
++/*
++ * release() handler for a COW block device.  Drops one reference; when the
++ * last one goes away the COW and backing block devices are put and their
++ * pointers cleared.  Failures of blkdev_put are only logged.
++ *
++ * Returns -ENODEV for an out-of-range unit, 0 otherwise.
++ */
++static int cow_release(struct inode * inode, struct file * file)
++{
++      int unit = DEVICE_NR(inode->i_rdev);
++      struct cow *dev;
++      int ret;
++
++      if(unit >= MAX_DEV)
++              return(-ENODEV);
++      dev = &cow_dev[unit];
++
++      spin_lock(&cow_lock);
++
++      dev->count--;
++      if(dev->count <= 0){
++              /* Last user is gone - let go of the COW device ... */
++              ret = blkdev_put(dev->cow_bdev, BDEV_RAW);
++              if(ret != 0)
++                      printk("cow_release - blkdev_put of cow device failed, "
++                             "error = %d\n", ret);
++              bdput(dev->cow_bdev);
++              dev->cow_bdev = 0;
++
++              /* ... and of the backing device */
++              ret = blkdev_put(dev->backing_bdev, BDEV_RAW);
++              if(ret != 0)
++                      printk("cow_release - blkdev_put of backing device failed, "
++                             "error = %d\n", ret);
++              bdput(dev->backing_bdev);
++              dev->backing_bdev = 0;
++      }
++
++      spin_unlock(&cow_lock);
++      return(0);
++}
++
++/*
++ * ioctl() handler for a COW block device - the request is handed unchanged
++ * to the ioctl method of the backing block device.
++ *
++ * Returns -ENODEV for an out-of-range unit or when no backing device is
++ * attached, -EINVAL when the backing driver has no ioctl method, otherwise
++ * whatever the backing driver returns.
++ */
++static int cow_ioctl(struct inode * inode, struct file * file,
++                   unsigned int cmd, unsigned long arg)
++{
++      struct cow *dev;
++      int (*dev_ioctl)(struct inode *, struct file *, unsigned int, 
++                       unsigned long);
++      int n;
++
++      n = DEVICE_NR(inode->i_rdev);
++      if(n >= MAX_DEV)
++              return(-ENODEV);
++      dev = &cow_dev[n];
++
++      /* cow_release resets backing_bdev to 0, so don't follow it blindly */
++      if((dev->backing_bdev == NULL) || (dev->backing_bdev->bd_op == NULL))
++              return(-ENODEV);
++
++      /* A driver is free to leave the ioctl method unset */
++      dev_ioctl = dev->backing_bdev->bd_op->ioctl;
++      if(dev_ioctl == NULL)
++              return(-EINVAL);
++
++      return((*dev_ioctl)(inode, file, cmd, arg));
++}
++
++/*
++ * Revalidate hook for the COW device.  Not implemented: it logs a reminder
++ * and reports success.  'rdev' is unused.
++ */
++static int cow_revalidate(kdev_t rdev)
++{
++      printk(KERN_ERR "Need to implement cow_revalidate\n");
++
++      return 0;
++}
++
++/*
++ * Parse a unit specifier at *ptr: either a number (any base accepted by
++ * simple_strtoul with base 0) or a single letter 'a'..'h' standing for
++ * 0..7.  On success *ptr is advanced past the specifier.
++ *
++ * Returns the unit number, or -1 (with *ptr untouched) if nothing usable
++ * was found.
++ */
++static int parse_unit(char **ptr)
++{
++      char *cur = *ptr, *after;
++      int unit;
++
++      if(isdigit(*cur)){
++              unit = simple_strtoul(cur, &after, 0);
++              if(after == cur)
++                      return(-1);
++              *ptr = after;
++              return(unit);
++      }
++
++      if((*cur < 'a') || (*cur > 'h'))
++              return(-1);
++
++      *ptr = cur + 1;
++      return(*cur - 'a');
++}
++
++/*
++ * Handler for the "cow" boot option, given as <unit>=<cow>,<backing>.
++ * Records the two path names for the unit; they are not copied, the
++ * pointers refer into 'str', which is split in place at the comma.
++ *
++ * Returns 1 when the unit is missing, out of range or not followed by '=',
++ * and 0 otherwise (including the missing-backing-name case).
++ * NOTE(review): the 0-versus-1 return for a missing backing name is kept
++ * as it was; confirm which value the option parser expects.
++ */
++static int cow_setup(char *str)
++{
++      struct cow *dev;
++      char *cow_name, *backing_name;
++      int unit;
++
++      unit = parse_unit(&str);
++      if(unit < 0){
++              printk(KERN_ERR "cow_setup - Couldn't parse unit number\n");
++              return(1);
++      }
++
++      /* parse_unit accepts any number, so keep it inside cow_dev[] */
++      if(unit >= MAX_DEV){
++              printk(KERN_ERR "cow_setup - unit number %d is too large\n",
++                     unit);
++              return(1);
++      }
++
++      if(*str != '='){
++              printk(KERN_ERR "cow_setup - Missing '=' after unit "
++                     "number\n");
++              return(1);
++      }
++      str++;
++
++      cow_name = str;
++      backing_name = strchr(str, ',');
++      if(backing_name == NULL){
++              printk(KERN_ERR "cow_setup - missing backing device name\n");
++              return(0);
++      }
++      /* Cut the string in two at the comma */
++      *backing_name = '\0';
++      backing_name++;
++
++      spin_lock(&cow_lock);
++
++      dev = &cow_dev[unit];
++      dev->cow_path = cow_name;
++      dev->backing_path = backing_name;
++      
++      spin_unlock(&cow_lock);
++      return(0);
++}
++
++__setup("cow", cow_setup);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/arch/um/drivers/cow_sys.h b/arch/um/drivers/cow_sys.h
+--- a/arch/um/drivers/cow_sys.h        Wed Dec 31 19:00:00 1969
++++ b/arch/um/drivers/cow_sys.h        Fri Aug 15 15:12:37 2003
+@@ -0,0 +1,48 @@
++#ifndef __COW_SYS_H__
++#define __COW_SYS_H__
++
++#include "kern_util.h"
++#include "user_util.h"
++#include "os.h"
++#include "user.h"
++
++/* Allocate 'size' bytes for the COW code by way of um_kmalloc(). */
++static inline void *cow_malloc(int size)
++{
++      void *mem = um_kmalloc(size);
++
++      return mem;
++}
++
++/* Give memory obtained from cow_malloc() back with kfree(). */
++static inline void cow_free(void *ptr)
++{
++      kfree(ptr);
++
++      return;
++}
++
++#define cow_printf printk
++
++/* Duplicate 'str' with uml_strdup() and return the copy. */
++static inline char *cow_strdup(char *str)
++{
++      char *copy = uml_strdup(str);
++
++      return copy;
++}
++
++/* Position 'fd' at 'offset'; the result of os_seek_file() is passed on. */
++static inline int cow_seek_file(int fd, __u64 offset)
++{
++      int ret = os_seek_file(fd, offset);
++
++      return ret;
++}
++
++/* Store the size of 'file' in *size_out; returns what os_file_size() does. */
++static inline int cow_file_size(char *file, __u64 *size_out)
++{
++      int ret = os_file_size(file, size_out);
++
++      return ret;
++}
++
++/* Write 'size' bytes from 'buf' to 'fd'; returns what os_write_file() does. */
++static inline int cow_write_file(int fd, char *buf, int size)
++{
++      int ret = os_write_file(fd, buf, size);
++
++      return ret;
++}
++
++#endif
++
++/*
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/arch/um/drivers/cow_user.c b/arch/um/drivers/cow_user.c
+--- a/arch/um/drivers/cow_user.c       Wed Dec 31 19:00:00 1969
++++ b/arch/um/drivers/cow_user.c       Fri Aug 15 15:12:34 2003
+@@ -0,0 +1,296 @@
++#include <stddef.h>
++#include <string.h>
++#include <errno.h>
++#include <unistd.h>
++#include <byteswap.h>
++#include <sys/stat.h>
++#include <sys/time.h>
++#include <sys/param.h>
++#include <netinet/in.h>
++
++#include "cow.h"
++#include "cow_sys.h"
++
++#define PATH_LEN_V1 256
++
++/*
++ * Version 1 COW file header, read from offset 0 of the COW file.
++ * read_cow_header() takes mtime, size and sectorsize from it as stored,
++ * without byte-order conversion.  This is the layout of existing files -
++ * do not reorder or resize the members.
++ */
++struct cow_header_v1 {
++      int magic;                      /* COW_MAGIC */
++      int version;                    /* 1 for this layout */
++      char backing_file[PATH_LEN_V1]; /* name of the backing file */
++      time_t mtime;                   /* backing file mtime */
++      __u64 size;                     /* backing file size */
++      int sectorsize;
++};
++
++#define PATH_LEN_V2 MAXPATHLEN
++
++/*
++ * Version 2 COW file header, written at offset 0 by write_cow_header().
++ * magic, version, mtime and sectorsize go through htonl() and size through
++ * htonll() before being written; read_cow_header() undoes that.  This is
++ * the layout of existing files - do not reorder or resize the members.
++ * NOTE(review): magic and version are unsigned long, so the layout
++ * presumably depends on the word size of the build - confirm.
++ */
++struct cow_header_v2 {
++      unsigned long magic;            /* COW_MAGIC */
++      unsigned long version;          /* COW_VERSION */
++      char backing_file[PATH_LEN_V2]; /* absolute name of the backing file */
++      time_t mtime;                   /* backing file mtime */
++      __u64 size;                     /* backing file size */
++      int sectorsize;
++};
++
++/*
++ * Buffer big enough for either header version.  read_cow_header() reads
++ * into it and inspects magic/version through the v1 view before deciding
++ * which member describes the rest.
++ */
++union cow_header {
++      struct cow_header_v1 v1;
++      struct cow_header_v2 v2;
++};
++
++#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
++#define COW_VERSION 2
++
++/*
++ * Work out where the pieces of a COW file live.
++ *
++ * size            - size of the backing file in bytes
++ * sectorsize      - sector size in bytes
++ * bitmap_offset   - byte offset of the bitmap within the COW file
++ * bitmap_len_out  - out: bitmap length in bytes, one bit per sector; the
++ *                   division truncates, exactly as before
++ * data_offset_out - out: end of the bitmap rounded up to a multiple of
++ *                   sectorsize
++ */
++void cow_sizes(__u64 size, int sectorsize, int bitmap_offset, 
++             unsigned long *bitmap_len_out, int *data_offset_out)
++{
++      unsigned long bitmap_len;
++      int data_offset;
++
++      bitmap_len = (size + sectorsize - 1) / (8 * sectorsize);
++
++      /* The data area begins on the first sector boundary at or after
++       * the end of the bitmap.
++       */
++      data_offset = bitmap_offset + bitmap_len;
++      data_offset = (data_offset + sectorsize - 1) / sectorsize;
++      data_offset *= sectorsize;
++
++      *bitmap_len_out = bitmap_len;
++      *data_offset_out = data_offset;
++}
++
++/*
++ * Turn the file name 'from' into an absolute path in 'to'.
++ *
++ * to   - output buffer
++ * size - size of 'to' in bytes
++ * from - file name to convert; it is modified temporarily but restored
++ *        before returning
++ *
++ * When 'from' has a directory part, the function changes into that
++ * directory, asks getcwd() for its name and appends the last component.
++ * Otherwise the current directory is prepended.  The working directory is
++ * put back on every path that changed it.
++ *
++ * Returns 0 on success, -1 on failure.
++ */
++static int absolutize(char *to, int size, char *from)
++{
++      char save_cwd[256], *slash;
++      int remaining, failed, err = -1;
++
++      if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) {
++              cow_printf("absolutize : unable to get cwd - errno = %d\n", 
++                         errno);
++              return(-1);
++      }
++      slash = strrchr(from, '/');
++      if(slash != NULL){
++              /* Cut off the last component to get the directory.  If the
++               * only slash is the leading one, the directory is "/" -
++               * the cut would leave an empty string that chdir rejects.
++               */
++              *slash = '\0';
++              failed = chdir((slash == from) ? "/" : from);
++              *slash = '/';
++              if(failed){
++                      cow_printf("absolutize : Can't cd to '%s' - " 
++                                 "errno = %d\n", from, errno);
++                      return(-1);
++              }
++
++              /* From here on the cwd has changed, so every exit has to
++               * go through out_restore.
++               */
++              if(getcwd(to, size) == NULL){
++                      cow_printf("absolutize : unable to get cwd of '%s' - "
++                             "errno = %d\n", from, errno);
++                      goto out_restore;
++              }
++              remaining = size - strlen(to);
++              if((int) strlen(slash) + 1 > remaining){
++                      cow_printf("absolutize : unable to fit '%s' into %d "
++                             "chars\n", from, size);
++                      goto out_restore;
++              }
++              strcat(to, slash);
++      }
++      else {
++              if((size <= 0) || 
++                 (strlen(save_cwd) + 1 + strlen(from) + 1 > (size_t) size)){
++                      cow_printf("absolutize : unable to fit '%s' into %d "
++                             "chars\n", from, size);
++                      return(-1);
++              }
++              strcpy(to, save_cwd);
++              strcat(to, "/");
++              strcat(to, from);
++      }
++      err = 0;
++
++ out_restore:
++      if(chdir(save_cwd)){
++              cow_printf("absolutize : Can't cd back to '%s' - "
++                         "errno = %d\n", save_cwd, errno);
++              err = -1;
++      }
++      return(err);
++}
++
++/*
++ * Write a version 2 COW header at the start of the COW file open on 'fd'.
++ *
++ * cow_file     - name of the COW file, used in error messages only
++ * fd           - descriptor of the COW file
++ * backing_file - name of the backing file; its absolute form is stored
++ * sectorsize   - sector size to record
++ * size         - out: size of the backing file as reported by
++ *                cow_file_size()
++ *
++ * Returns 0 on success and a non-zero error value otherwise.
++ */
++int write_cow_header(char *cow_file, int fd, char *backing_file, 
++                   int sectorsize, long long *size)
++{
++      struct cow_header_v2 *header;
++      struct stat64 buf;
++      int err, n;
++
++      err = cow_seek_file(fd, 0);
++      if(err != 0){
++              cow_printf("write_cow_header - lseek failed, errno = %d\n", 
++                         errno);
++              return(-errno);
++      }
++
++      err = -ENOMEM;
++      header = cow_malloc(sizeof(*header));
++      if(header == NULL){
++              cow_printf("Failed to allocate COW V2 header\n");
++              goto out;
++      }
++      /* The whole structure ends up in the file, so don't let whatever
++       * was in the allocation (padding, the unused tail of backing_file)
++       * go out with it.
++       */
++      memset(header, 0, sizeof(*header));
++      header->magic = htonl(COW_MAGIC);
++      header->version = htonl(COW_VERSION);
++
++      err = -EINVAL;
++      if(strlen(backing_file) > sizeof(header->backing_file) - 1){
++              /* sizeof yields a size_t; cast it to match the %d */
++              cow_printf("Backing file name \"%s\" is too long - names are "
++                         "limited to %d characters\n", backing_file, 
++                         (int) (sizeof(header->backing_file) - 1));
++              goto out_free;
++      }
++
++      if(absolutize(header->backing_file, sizeof(header->backing_file), 
++                    backing_file))
++              goto out_free;
++
++      err = stat64(header->backing_file, &buf);
++      if(err < 0){
++              cow_printf("Stat of backing file '%s' failed, errno = %d\n",
++                         header->backing_file, errno);
++              err = -errno;
++              goto out_free;
++      }
++
++      err = cow_file_size(header->backing_file, size);
++      if(err){
++              /* *size is a long long; cast it to match the %d */
++              cow_printf("Couldn't get size of backing file '%s', "
++                         "errno = %d\n", header->backing_file, 
++                         (int) -*size);
++              goto out_free;
++      }
++
++      header->mtime = htonl(buf.st_mtime);
++      header->size = htonll(*size);
++      header->sectorsize = htonl(sectorsize);
++
++      n = write(fd, header, sizeof(*header));
++      if(n != sizeof(*header)){
++              /* Hand back a negative error code rather than the raw
++               * write() result; a short write sets no errno, so it is
++               * reported as EIO.
++               */
++              err = (n < 0) ? -errno : -EIO;
++              cow_printf("Write of header to new COW file '%s' failed, "
++                         "errno = %d\n", cow_file, -err);
++              goto out_free;
++      }
++      err = 0;
++ out_free:
++      cow_free(header);
++ out:
++      return(err);
++}
++
++/*
++ * Reader callback for read_cow_header() that reads from a file: 'arg'
++ * points to the file descriptor, and 'len' bytes at 'offset' are read into
++ * 'buf' with pread().  Returns the result of pread().
++ */
++int file_reader(__u64 offset, char *buf, int len, void *arg)
++{
++      int *fd_ptr = arg;
++
++      return pread(*fd_ptr, buf, len, offset);
++}
++
++/*
++ * Read and decode a COW header.
++ *
++ * reader            - callback that fetches (offset, buf, len, arg); it
++ *                     returns the number of bytes read or a negative error
++ * arg               - passed through to 'reader'
++ * magic_out         - out: set to COW_MAGIC once the magic is recognised
++ * backing_file_out  - out: newly allocated copy of the backing file name
++ * mtime_out         - out: recorded mtime of the backing file
++ * size_out          - out: recorded size of the backing file
++ * sectorsize_out    - out: recorded sector size
++ * bitmap_offset_out - out: offset of the bitmap, i.e. the header size of
++ *                     the version found
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure and -EINVAL when the
++ * data is too short, is not a COW header or has an unknown version.
++ */
++int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, 
++                  __u32 *magic_out, char **backing_file_out, 
++                  time_t *mtime_out, __u64 *size_out, 
++                  int *sectorsize_out, int *bitmap_offset_out)
++{
++      union cow_header *header;
++      char *file;
++      int err, n;
++      unsigned long version, magic;
++
++      header = cow_malloc(sizeof(*header));
++      if(header == NULL){
++              cow_printf("read_cow_header - Failed to allocate header\n");
++              return(-ENOMEM);
++      }
++      err = -EINVAL;
++      n = (*reader)(0, (char *) header, sizeof(*header), arg);
++      /* n is signed and offsetof()/sizeof are not.  Left alone, a
++       * negative n would be converted to a huge unsigned value and sail
++       * through the length checks, so test the sign first and compare as
++       * ints.
++       */
++      if((n < 0) || 
++         (n < (int) offsetof(typeof(header->v1), backing_file))){
++              cow_printf("read_cow_header - short header\n");
++              goto out;
++      }
++
++      magic = header->v1.magic;
++      if(magic == COW_MAGIC) {
++              version = header->v1.version;
++      }
++      else if(magic == ntohl(COW_MAGIC)){
++              version = ntohl(header->v1.version);
++      }
++      /* No error printed because the non-COW case comes through here */
++      else goto out;
++
++      *magic_out = COW_MAGIC;
++
++      if(version == 1){
++              if(n < (int) sizeof(header->v1)){
++                      cow_printf("read_cow_header - failed to read V1 "
++                                 "header\n");
++                      goto out;
++              }
++              *mtime_out = header->v1.mtime;
++              *size_out = header->v1.size;
++              *sectorsize_out = header->v1.sectorsize;
++              *bitmap_offset_out = sizeof(header->v1);
++              /* The name comes from the file - make sure it ends */
++              header->v1.backing_file[sizeof(header->v1.backing_file) - 1] =
++                      '\0';
++              file = header->v1.backing_file;
++      }
++      else if(version == 2){
++              if(n < (int) sizeof(header->v2)){
++                      cow_printf("read_cow_header - failed to read V2 "
++                                 "header\n");
++                      goto out;
++              }
++              *mtime_out = ntohl(header->v2.mtime);
++              *size_out = ntohll(header->v2.size);
++              *sectorsize_out = ntohl(header->v2.sectorsize);
++              *bitmap_offset_out = sizeof(header->v2);
++              /* The name comes from the file - make sure it ends */
++              header->v2.backing_file[sizeof(header->v2.backing_file) - 1] =
++                      '\0';
++              file = header->v2.backing_file;
++      }
++      else {
++              cow_printf("read_cow_header - invalid COW version\n");
++              goto out;
++      }
++      err = -ENOMEM;
++      *backing_file_out = cow_strdup(file);
++      if(*backing_file_out == NULL){
++              cow_printf("read_cow_header - failed to allocate backing "
++                         "file\n");
++              goto out;
++      }
++      err = 0;
++ out:
++      cow_free(header);
++      return(err);
++}
++
++/*
++ * Set up a new COW file on 'fd': write the V2 header, report where the
++ * bitmap and the data area are, and extend the file to its full length.
++ *
++ * fd                - descriptor of the new COW file
++ * cow_file          - its name, for error messages
++ * backing_file      - name of the backing file
++ * sectorsize        - sector size to use
++ * bitmap_offset_out - out: offset of the bitmap (the V2 header size)
++ * bitmap_len_out    - out: bitmap length, from cow_sizes()
++ * data_offset_out   - out: offset of the data area, from cow_sizes()
++ *
++ * Returns 0 on success, the error from write_cow_header() or
++ * cow_seek_file(), or -EINVAL if the final write fails.
++ */
++int init_cow_file(int fd, char *cow_file, char *backing_file, int sectorsize,
++                int *bitmap_offset_out, unsigned long *bitmap_len_out, 
++                int *data_offset_out)
++{
++      __u64 size, eof_pos;
++      char zero = 0;
++      int ret;
++
++      ret = write_cow_header(cow_file, fd, backing_file, sectorsize, &size);
++      if(ret)
++              return(ret);
++
++      cow_sizes(size, sectorsize, sizeof(struct cow_header_v2), 
++                bitmap_len_out, data_offset_out);
++      *bitmap_offset_out = sizeof(struct cow_header_v2);
++
++      /* Position of the very last byte the file will have */
++      eof_pos = *data_offset_out + size - sizeof(zero);
++      ret = cow_seek_file(fd, eof_pos);
++      if(ret != 0){
++              cow_printf("cow bitmap lseek failed : errno = %d\n", errno);
++              return(ret);
++      }
++
++      /* One byte written at the end is enough to set EOF; it does not
++       * matter how much is written.  This also leaves the whole COW
++       * bitmap zero without ever having to allocate it.
++       */
++      ret = cow_write_file(fd, &zero, sizeof(zero));
++      if(ret == sizeof(zero))
++              return(0);
++
++      cow_printf("Write of bitmap to new COW file '%s' failed, "
++                 "errno = %d\n", cow_file, errno);
++      return(-EINVAL);
++}
++
++/*
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c
+--- a/arch/um/drivers/hostaudio_kern.c Fri Aug 15 15:09:05 2003
++++ b/arch/um/drivers/hostaudio_kern.c Fri Aug 15 15:13:48 2003
+@@ -11,6 +11,7 @@
+ #include "linux/fs.h"
+ #include "linux/sound.h"
+ #include "linux/soundcard.h"
++#include "asm/uaccess.h"
+ #include "kern_util.h"
+ #include "init.h"
+ #include "hostaudio.h"
+@@ -22,7 +23,7 @@
+ #ifndef MODULE
+ static int set_dsp(char *name, int *add)
+ {
+-      dsp = uml_strdup(name);
++      dsp = name;
+       return(0);
+ }
+@@ -34,7 +35,7 @@
+ static int set_mixer(char *name, int *add)
+ {
+-      mixer = uml_strdup(name);
++      mixer = name;
+       return(0);
+ }
+@@ -51,23 +52,55 @@
+                             loff_t *ppos)
+ {
+         struct hostaudio_state *state = file->private_data;
++      void *kbuf;
++      int err;
+ #ifdef DEBUG
+         printk("hostaudio: read called, count = %d\n", count);
+ #endif
+-        return(hostaudio_read_user(state, buffer, count, ppos));
++      kbuf = kmalloc(count, GFP_KERNEL);
++      if(kbuf == NULL)
++              return(-ENOMEM);
++
++        err = hostaudio_read_user(state, kbuf, count, ppos);
++      if(err < 0)
++              goto out;
++
++      if(copy_to_user(buffer, kbuf, err))
++              err = -EFAULT;
++
++ out:
++      kfree(kbuf);
++      return(err);
+ }
+ static ssize_t hostaudio_write(struct file *file, const char *buffer, 
+                              size_t count, loff_t *ppos)
+ {
+         struct hostaudio_state *state = file->private_data;
++      void *kbuf;
++      int err;
+ #ifdef DEBUG
+         printk("hostaudio: write called, count = %d\n", count);
+ #endif
+-        return(hostaudio_write_user(state, buffer, count, ppos));
++
++      kbuf = kmalloc(count, GFP_KERNEL);
++      if(kbuf == NULL)
++              return(-ENOMEM);
++
++      err = -EFAULT;
++      if(copy_from_user(kbuf, buffer, count))
++              goto out;
++
++        err = hostaudio_write_user(state, kbuf, count, ppos);
++      if(err < 0)
++              goto out;
++
++ out:
++      kfree(kbuf);
++      return(err);
+ }
+ static unsigned int hostaudio_poll(struct file *file, 
+@@ -86,12 +119,43 @@
+                          unsigned int cmd, unsigned long arg)
+ {
+         struct hostaudio_state *state = file->private_data;
++      unsigned long data = 0;
++      int err;
+ #ifdef DEBUG
+         printk("hostaudio: ioctl called, cmd = %u\n", cmd);
+ #endif
++      switch(cmd){
++      case SNDCTL_DSP_SPEED:
++      case SNDCTL_DSP_STEREO:
++      case SNDCTL_DSP_GETBLKSIZE:
++      case SNDCTL_DSP_CHANNELS:
++      case SNDCTL_DSP_SUBDIVIDE:
++      case SNDCTL_DSP_SETFRAGMENT:
++              if(get_user(data, (int *) arg))
++                      return(-EFAULT);
++              break;
++      default:
++              break;
++      }
++
++        err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data);
++
++      switch(cmd){
++      case SNDCTL_DSP_SPEED:
++      case SNDCTL_DSP_STEREO:
++      case SNDCTL_DSP_GETBLKSIZE:
++      case SNDCTL_DSP_CHANNELS:
++      case SNDCTL_DSP_SUBDIVIDE:
++      case SNDCTL_DSP_SETFRAGMENT:
++              if(put_user(data, (int *) arg))
++                      return(-EFAULT);
++              break;
++      default:
++              break;
++      }
+-        return(hostaudio_ioctl_user(state, cmd, arg));
++      return(err);
+ }
+ static int hostaudio_open(struct inode *inode, struct file *file)
+@@ -225,7 +289,8 @@
+ static int __init hostaudio_init_module(void)
+ {
+-        printk(KERN_INFO "UML Audio Relay\n");
++        printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n",
++             dsp, mixer);
+       module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1);
+         if(module_data.dev_audio < 0){
+diff -Naur a/arch/um/drivers/line.c b/arch/um/drivers/line.c
+--- a/arch/um/drivers/line.c   Fri Aug 15 15:08:24 2003
++++ b/arch/um/drivers/line.c   Fri Aug 15 15:13:28 2003
+@@ -6,8 +6,8 @@
+ #include "linux/sched.h"
+ #include "linux/slab.h"
+ #include "linux/list.h"
++#include "linux/interrupt.h"
+ #include "linux/devfs_fs_kernel.h"
+-#include "asm/irq.h"
+ #include "asm/uaccess.h"
+ #include "chan_kern.h"
+ #include "irq_user.h"
+@@ -16,16 +16,18 @@
+ #include "user_util.h"
+ #include "kern_util.h"
+ #include "os.h"
++#include "irq_kern.h"
+ #define LINE_BUFSIZE 4096
+-void line_interrupt(int irq, void *data, struct pt_regs *unused)
++irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
+ {
+       struct line *dev = data;
+       if(dev->count > 0) 
+               chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq, 
+                              dev);
++      return IRQ_HANDLED;
+ }
+ void line_timer_cb(void *arg)
+@@ -136,20 +138,22 @@
+       return(len);
+ }
+-void line_write_interrupt(int irq, void *data, struct pt_regs *unused)
++irqreturn_t line_write_interrupt(int irq, void *data, struct pt_regs *unused)
+ {
+       struct line *dev = data;
+       struct tty_struct *tty = dev->tty;
+       int err;
+       err = flush_buffer(dev);
+-      if(err == 0) return;
++      if(err == 0) 
++              return(IRQ_NONE);
+       else if(err < 0){
+               dev->head = dev->buffer;
+               dev->tail = dev->buffer;
+       }
+-      if(tty == NULL) return;
++      if(tty == NULL) 
++              return(IRQ_NONE);
+       if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) &&
+          (tty->ldisc.write_wakeup != NULL))
+@@ -161,9 +165,9 @@
+        * writes.
+        */
+-      if (waitqueue_active(&tty->write_wait))
++      if(waitqueue_active(&tty->write_wait))
+               wake_up_interruptible(&tty->write_wait);
+-
++      return(IRQ_HANDLED);
+ }
+ int line_write_room(struct tty_struct *tty)
+@@ -369,7 +373,7 @@
+       dev = simple_strtoul(name, &end, 0);
+       if((*end != '\0') || (end == name)){
+-              *error_out = "line_setup failed to parse device number";
++              *error_out = "line_get_config failed to parse device number";
+               return(0);
+       }
+@@ -379,15 +383,15 @@
+       }
+       line = &lines[dev];
++
+       down(&line->sem);
+-      
+       if(!line->valid)
+               CONFIG_CHUNK(str, size, n, "none", 1);
+       else if(line->count == 0)
+               CONFIG_CHUNK(str, size, n, line->init_str, 1);
+       else n = chan_config_string(&line->chan_list, str, size, error_out);
+-
+       up(&line->sem);
++
+       return(n);
+ }
+@@ -412,7 +416,8 @@
+               return NULL;
+       driver->driver_name = line_driver->name;
+-      driver->name = line_driver->devfs_name;
++      driver->name = line_driver->device_name;
++      driver->devfs_name = line_driver->devfs_name;
+       driver->major = line_driver->major;
+       driver->minor_start = line_driver->minor_start;
+       driver->type = line_driver->type;
+@@ -432,7 +437,7 @@
+       for(i = 0; i < nlines; i++){
+               if(!lines[i].valid) 
+-                      tty_unregister_devfs(driver, i);
++                      tty_unregister_device(driver, i);
+       }
+       mconsole_register_dev(&line_driver->mc);
+@@ -465,24 +470,25 @@
+       struct line *line;
+ };
+-void winch_interrupt(int irq, void *data, struct pt_regs *unused)
++irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused)
+ {
+       struct winch *winch = data;
+       struct tty_struct *tty;
+       int err;
+       char c;
+-      err = generic_read(winch->fd, &c, NULL);
+-      if(err < 0){
+-              if(err != -EAGAIN){
+-                      printk("winch_interrupt : read failed, errno = %d\n", 
+-                             -err);
+-                      printk("fd %d is losing SIGWINCH support\n", 
+-                             winch->tty_fd);
+-                      free_irq(irq, data);
+-                      return;
++      if(winch->fd != -1){
++              err = generic_read(winch->fd, &c, NULL);
++              if(err < 0){
++                      if(err != -EAGAIN){
++                              printk("winch_interrupt : read failed, "
++                                     "errno = %d\n", -err);
++                              printk("fd %d is losing SIGWINCH support\n", 
++                                     winch->tty_fd);
++                              return(IRQ_HANDLED);
++                      }
++                      goto out;
+               }
+-              goto out;
+       }
+       tty = winch->line->tty;
+       if(tty != NULL){
+@@ -492,7 +498,9 @@
+               kill_pg(tty->pgrp, SIGWINCH, 1);
+       }
+  out:
+-      reactivate_fd(winch->fd, WINCH_IRQ);
++      if(winch->fd != -1)
++              reactivate_fd(winch->fd, WINCH_IRQ);
++      return(IRQ_HANDLED);
+ }
+ DECLARE_MUTEX(winch_handler_sem);
+@@ -529,7 +537,10 @@
+       list_for_each(ele, &winch_handlers){
+               winch = list_entry(ele, struct winch, list);
+-              close(winch->fd);
++              if(winch->fd != -1){
++                      deactivate_fd(winch->fd, WINCH_IRQ);
++                      close(winch->fd);
++              }
+               if(winch->pid != -1) 
+                       os_kill_process(winch->pid, 1);
+       }
+diff -Naur a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
+--- a/arch/um/drivers/mconsole_kern.c  Fri Aug 15 15:03:47 2003
++++ b/arch/um/drivers/mconsole_kern.c  Fri Aug 15 15:10:11 2003
+@@ -27,6 +27,7 @@
+ #include "init.h"
+ #include "os.h"
+ #include "umid.h"
++#include "irq_kern.h"
+ static int do_unlink_socket(struct notifier_block *notifier, 
+                           unsigned long what, void *data)
+@@ -67,7 +68,7 @@
+ DECLARE_WORK(mconsole_work, mc_work_proc, NULL);
+-void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++irqreturn_t mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+ {
+       int fd;
+       struct mconsole_entry *new;
+@@ -88,6 +89,7 @@
+       }
+       if(!list_empty(&mc_requests)) schedule_work(&mconsole_work);
+       reactivate_fd(fd, MCONSOLE_IRQ);
++      return(IRQ_HANDLED);
+ }
+ void mconsole_version(struct mc_request *req)
+@@ -100,20 +102,34 @@
+       mconsole_reply(req, version, 0, 0);
+ }
++/*
++ * mconsole "log <string>" command: enter <string> into the kernel log and
++ * send back an empty reply.
++ */
++void mconsole_log(struct mc_request *req)
++{
++      char *ptr = req->request.data;
++
++      /* Step over the command name and the whitespace following it */
++      ptr += strlen("log");
++      while(isspace(*ptr)) ptr++;
++
++      /* Print the rest of the request.  The old code limited the output
++       * to the length of the skipped prefix, which cut the message short.
++       * Like the scan above, this relies on the request data being NUL
++       * terminated - TODO confirm against the code that receives requests.
++       */
++      printk("%s", ptr);
++      mconsole_reply(req, "", 0, 0);
++}
++
+ #define UML_MCONSOLE_HELPTEXT \
+-"Commands:
+-    version - Get kernel version
+-    help - Print this message
+-    halt - Halt UML
+-    reboot - Reboot UML
+-    config <dev>=<config> - Add a new device to UML; 
+-      same syntax as command line
+-    config <dev> - Query the configuration of a device
+-    remove <dev> - Remove a device from UML
+-    sysrq <letter> - Performs the SysRq action controlled by the letter
+-    cad - invoke the Ctl-Alt-Del handler
+-    stop - pause the UML; it will do nothing until it receives a 'go'
+-    go - continue the UML after a 'stop'
++"Commands: \n\
++    version - Get kernel version \n\
++    help - Print this message \n\
++    halt - Halt UML \n\
++    reboot - Reboot UML \n\
++    config <dev>=<config> - Add a new device to UML;  \n\
++      same syntax as command line \n\
++    config <dev> - Query the configuration of a device \n\
++    remove <dev> - Remove a device from UML \n\
++    sysrq <letter> - Performs the SysRq action controlled by the letter \n\
++    cad - invoke the Ctl-Alt-Del handler \n\
++    stop - pause the UML; it will do nothing until it receives a 'go' \n\
++    go - continue the UML after a 'stop' \n\
++    log <string> - make UML enter <string> into the kernel log\n\
+ "
+ void mconsole_help(struct mc_request *req)
+@@ -302,7 +318,7 @@
+       if(umid_file_name("mconsole", file, sizeof(file))) return(-1);
+       snprintf(mconsole_socket_name, sizeof(file), "%s", file);
+-      sock = create_unix_socket(file, sizeof(file));
++      sock = create_unix_socket(file, sizeof(file), 1);
+       if (sock < 0){
+               printk("Failed to initialize management console\n");
+               return(1);
+diff -Naur a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
+--- a/arch/um/drivers/mconsole_user.c  Fri Aug 15 15:04:47 2003
++++ b/arch/um/drivers/mconsole_user.c  Fri Aug 15 15:10:35 2003
+@@ -28,6 +28,7 @@
+       { "cad", mconsole_cad, 1 },
+       { "stop", mconsole_stop, 0 },
+       { "go", mconsole_go, 1 },
++      { "log", mconsole_log, 1 },
+ };
+ /* Initialized in mconsole_init, which is an initcall */
+@@ -139,6 +140,7 @@
+               memcpy(reply.data, str, len);
+               reply.data[len] = '\0';
+               total -= len;
++              str += len;
+               reply.len = len + 1;
+               len = sizeof(reply) + reply.len - sizeof(reply.data);
+diff -Naur a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c
+--- a/arch/um/drivers/mmapper_kern.c   Fri Aug 15 15:04:33 2003
++++ b/arch/um/drivers/mmapper_kern.c   Fri Aug 15 15:10:32 2003
+@@ -120,7 +120,10 @@
+       printk(KERN_INFO "Mapper v0.1\n");
+       v_buf = (char *) find_iomem("mmapper", &mmapper_size);
+-      if(mmapper_size == 0) return(0);
++      if(mmapper_size == 0){
++              printk(KERN_ERR "mmapper_init - find_iomem failed\n");
++              return(0);
++      }
+       p_buf = __pa(v_buf);
+diff -Naur a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
+--- a/arch/um/drivers/net_kern.c       Fri Aug 15 15:05:49 2003
++++ b/arch/um/drivers/net_kern.c       Fri Aug 15 15:11:52 2003
+@@ -26,6 +26,7 @@
+ #include "mconsole_kern.h"
+ #include "init.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED;
+ LIST_HEAD(opened);
+@@ -61,14 +62,14 @@
+       return pkt_len;
+ }
+-void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++irqreturn_t uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+ {
+       struct net_device *dev = dev_id;
+       struct uml_net_private *lp = dev->priv;
+       int err;
+       if(!netif_running(dev))
+-              return;
++              return(IRQ_NONE);
+       spin_lock(&lp->lock);
+       while((err = uml_net_rx(dev)) > 0) ;
+@@ -83,6 +84,7 @@
+  out:
+       spin_unlock(&lp->lock);
++      return(IRQ_HANDLED);
+ }
+ static int uml_net_open(struct net_device *dev)
+@@ -292,7 +294,7 @@
+       struct uml_net *device;
+       struct net_device *dev;
+       struct uml_net_private *lp;
+-      int err, size;
++      int save, err, size;
+       size = transport->private_size + sizeof(struct uml_net_private) + 
+               sizeof(((struct uml_net_private *) 0)->user);
+@@ -362,21 +364,29 @@
+               return 1;
+       lp = dev->priv;
+-      INIT_LIST_HEAD(&lp->list);
+-      spin_lock_init(&lp->lock);
+-      lp->dev = dev;
+-      lp->fd = -1;
+-      lp->mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0 };
+-      lp->have_mac = device->have_mac;
+-      lp->protocol = transport->kern->protocol;
+-      lp->open = transport->user->open;
+-      lp->close = transport->user->close;
+-      lp->remove = transport->user->remove;
+-      lp->read = transport->kern->read;
+-      lp->write = transport->kern->write;
+-      lp->add_address = transport->user->add_address;
+-      lp->delete_address = transport->user->delete_address;
+-      lp->set_mtu = transport->user->set_mtu;
++      /* lp.user is the first four bytes of the transport data, which
++       * has already been initialized.  This structure assignment will
++       * overwrite that, so we make sure that .user gets overwritten with
++       * what it already has.
++       */
++      save = lp->user[0];
++      *lp = ((struct uml_net_private) 
++              { .list                 = LIST_HEAD_INIT(lp->list),
++                .lock                 = SPIN_LOCK_UNLOCKED,
++                .dev                  = dev,
++                .fd                   = -1,
++                .mac                  = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0},
++                .have_mac             = device->have_mac,
++                .protocol             = transport->kern->protocol,
++                .open                 = transport->user->open,
++                .close                = transport->user->close,
++                .remove               = transport->user->remove,
++                .read                 = transport->kern->read,
++                .write                = transport->kern->write,
++                .add_address          = transport->user->add_address,
++                .delete_address       = transport->user->delete_address,
++                .set_mtu              = transport->user->set_mtu,
++                .user                 = { save } });
+       init_timer(&lp->tl);
+       lp->tl.function = uml_net_user_timer_expire;
+diff -Naur a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
+--- a/arch/um/drivers/port_kern.c      Fri Aug 15 15:04:01 2003
++++ b/arch/um/drivers/port_kern.c      Fri Aug 15 15:10:18 2003
+@@ -6,6 +6,7 @@
+ #include "linux/list.h"
+ #include "linux/sched.h"
+ #include "linux/slab.h"
++#include "linux/interrupt.h"
+ #include "linux/irq.h"
+ #include "linux/spinlock.h"
+ #include "linux/errno.h"
+@@ -14,6 +15,7 @@
+ #include "kern_util.h"
+ #include "kern.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ #include "port.h"
+ #include "init.h"
+ #include "os.h"
+@@ -44,7 +46,7 @@
+       struct port_list *port;
+ };
+-static void pipe_interrupt(int irq, void *data, struct pt_regs *regs)
++static irqreturn_t pipe_interrupt(int irq, void *data, struct pt_regs *regs)
+ {
+       struct connection *conn = data;
+       int fd;
+@@ -52,7 +54,7 @@
+       fd = os_rcv_fd(conn->socket[0], &conn->helper_pid);
+       if(fd < 0){
+               if(fd == -EAGAIN)
+-                      return;
++                      return(IRQ_NONE);
+               printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", 
+                      -fd);
+@@ -65,6 +67,7 @@
+       list_add(&conn->list, &conn->port->connections);
+       up(&conn->port->sem);
++      return(IRQ_HANDLED);
+ }
+ static int port_accept(struct port_list *port)
+@@ -138,12 +141,13 @@
+ DECLARE_WORK(port_work, port_work_proc, NULL);
+-static void port_interrupt(int irq, void *data, struct pt_regs *regs)
++static irqreturn_t port_interrupt(int irq, void *data, struct pt_regs *regs)
+ {
+       struct port_list *port = data;
+       port->has_connection = 1;
+       schedule_work(&port_work);
++      return(IRQ_HANDLED);
+ } 
+ void *port_data(int port_num)
+diff -Naur a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
+--- a/arch/um/drivers/ssl.c    Fri Aug 15 15:06:09 2003
++++ b/arch/um/drivers/ssl.c    Fri Aug 15 15:12:30 2003
+@@ -53,8 +53,9 @@
+ static struct line_driver driver = {
+       .name                   = "UML serial line",
+-      .devfs_name             = "tts/%d",
+-      .major                  = TTYAUX_MAJOR,
++      .device_name            = "ttS",
++      .devfs_name             = "tts/",
++      .major                  = TTY_MAJOR,
+       .minor_start            = 64,
+       .type                   = TTY_DRIVER_TYPE_SERIAL,
+       .subtype                = 0,
+diff -Naur a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
+--- a/arch/um/drivers/stdio_console.c  Fri Aug 15 15:04:51 2003
++++ b/arch/um/drivers/stdio_console.c  Fri Aug 15 15:10:56 2003
+@@ -83,7 +83,8 @@
+ static struct line_driver driver = {
+       .name                   = "UML console",
+-      .devfs_name             = "vc/%d",
++      .device_name            = "tty",
++      .devfs_name             = "vc/",
+       .major                  = TTY_MAJOR,
+       .minor_start            = 0,
+       .type                   = TTY_DRIVER_TYPE_CONSOLE,
+@@ -159,6 +160,15 @@
+ static int con_init_done = 0;
++static struct tty_operations console_ops = {
++      .open                   = con_open,
++      .close                  = con_close,
++      .write                  = con_write,
++      .chars_in_buffer        = chars_in_buffer,
++      .set_termios            = set_termios,
++      .write_room             = line_write_room,
++};
++
+ int stdio_init(void)
+ {
+       char *new_title;
+@@ -166,7 +176,8 @@
+       printk(KERN_INFO "Initializing stdio console driver\n");
+       console_driver = line_register_devfs(&console_lines, &driver,
+-                              &console_ops, vts, sizeof(vts)/sizeof(vts[0]));
++                                           &console_ops, vts,
++                                           sizeof(vts)/sizeof(vts[0]));
+       lines_init(vts, sizeof(vts)/sizeof(vts[0]));
+@@ -188,15 +199,6 @@
+       if(con_init_done) up(&vts[console->index].sem);
+ }
+-static struct tty_operations console_ops = {
+-      .open                   = con_open,
+-      .close                  = con_close,
+-      .write                  = con_write,
+-      .chars_in_buffer        = chars_in_buffer,
+-      .set_termios            = set_termios,
+-      .write_room             = line_write_room,
+-};
+-
+ static struct tty_driver *console_device(struct console *c, int *index)
+ {
+       *index = c->index;
+@@ -212,12 +214,14 @@
+                                              console_device, console_setup,
+                                              CON_PRINTBUFFER);
+-static void __init stdio_console_init(void)
++static int __init stdio_console_init(void)
+ {
+       INIT_LIST_HEAD(&vts[0].chan_list);
+       list_add(&init_console_chan.list, &vts[0].chan_list);
+       register_console(&stdiocons);
++      return(0);
+ }
++
+ console_initcall(stdio_console_init);
+ static int console_chan_setup(char *str)
+diff -Naur a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
+--- a/arch/um/drivers/ubd_kern.c       Fri Aug 15 15:05:56 2003
++++ b/arch/um/drivers/ubd_kern.c       Fri Aug 15 15:11:53 2003
+@@ -8,6 +8,13 @@
+  * old style ubd by setting UBD_SHIFT to 0
+  * 2002-09-27...2002-10-18 massive tinkering for 2.5
+  * partitions have changed in 2.5
++ * 2003-01-29 more tinkering for 2.5.59-1
++ * This should now address the sysfs problems and has
++ * the symlink for devfs to allow for booting with
++ * the common /dev/ubd/discX/... names rather than
++ * only /dev/ubdN/discN this version also has lots of
++ * clean ups preparing for ubd-many.
++ * James McMechan
+  */
+ #define MAJOR_NR UBD_MAJOR
+@@ -40,6 +47,7 @@
+ #include "mconsole_kern.h"
+ #include "init.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ #include "ubd_user.h"
+ #include "2_5compat.h"
+ #include "os.h"
+@@ -70,7 +78,7 @@
+ static request_queue_t *ubd_queue;
+ /* Protected by ubd_lock */
+-static int fake_major = 0;
++static int fake_major = MAJOR_NR;
+ static struct gendisk *ubd_gendisk[MAX_DEV];
+ static struct gendisk *fake_gendisk[MAX_DEV];
+@@ -99,12 +107,12 @@
+ struct ubd {
+       char *file;
+-      int is_dir;
+       int count;
+       int fd;
+       __u64 size;
+       struct openflags boot_openflags;
+       struct openflags openflags;
++      int no_cow;
+       struct cow cow;
+ };
+@@ -118,12 +126,12 @@
+ #define DEFAULT_UBD { \
+       .file =                 NULL, \
+-      .is_dir =               0, \
+       .count =                0, \
+       .fd =                   -1, \
+       .size =                 -1, \
+       .boot_openflags =       OPEN_FLAGS, \
+       .openflags =            OPEN_FLAGS, \
++        .no_cow =               0, \
+         .cow =                        DEFAULT_COW, \
+ }
+@@ -131,8 +139,10 @@
+ static int ubd0_init(void)
+ {
+-      if(ubd_dev[0].file == NULL)
+-              ubd_dev[0].file = "root_fs";
++      struct ubd *dev = &ubd_dev[0];
++
++      if(dev->file == NULL)
++              dev->file = "root_fs";
+       return(0);
+ }
+@@ -199,19 +209,39 @@
+ "    Create ide0 entries that map onto ubd devices.\n\n"
+ );
++static int parse_unit(char **ptr)
++{
++      char *str = *ptr, *end;
++      int n = -1;
++
++      if(isdigit(*str)) {
++              n = simple_strtoul(str, &end, 0);
++              if(end == str)
++                      return(-1);
++              *ptr = end;
++      }
++      else if (('a' <= *str) && (*str <= 'h')) {
++              n = *str - 'a';
++              str++;
++              *ptr = str;
++      }
++      return(n);
++}
++
+ static int ubd_setup_common(char *str, int *index_out)
+ {
++      struct ubd *dev;
+       struct openflags flags = global_openflags;
+       char *backing_file;
+       int n, err;
+       if(index_out) *index_out = -1;
+-      n = *str++;
++      n = *str;
+       if(n == '='){
+-              static int fake_major_allowed = 1;
+               char *end;
+               int major;
++              str++;
+               if(!strcmp(str, "sync")){
+                       global_openflags.s = 1;
+                       return(0);
+@@ -223,20 +253,14 @@
+                       return(1);
+               }
+-              if(!fake_major_allowed){
+-                      printk(KERN_ERR "Can't assign a fake major twice\n");
+-                      return(1);
+-              }
+-
+               err = 1;
+               spin_lock(&ubd_lock);
+-              if(!fake_major_allowed){
++              if(fake_major != MAJOR_NR){
+                       printk(KERN_ERR "Can't assign a fake major twice\n");
+                       goto out1;
+               }
+  
+               fake_major = major;
+-              fake_major_allowed = 0;
+               printk(KERN_INFO "Setting extra ubd major number to %d\n",
+                      major);
+@@ -246,25 +270,23 @@
+               return(err);
+       }
+-      if(n < '0'){
+-              printk(KERN_ERR "ubd_setup : index out of range\n"); }
+-
+-      if((n >= '0') && (n <= '9')) n -= '0';
+-      else if((n >= 'a') && (n <= 'z')) n -= 'a';
+-      else {
+-              printk(KERN_ERR "ubd_setup : device syntax invalid\n");
++      n = parse_unit(&str);
++      if(n < 0){
++              printk(KERN_ERR "ubd_setup : couldn't parse unit number "
++                     "'%s'\n", str);
+               return(1);
+       }
+       if(n >= MAX_DEV){
+-              printk(KERN_ERR "ubd_setup : index out of range "
+-                     "(%d devices)\n", MAX_DEV);      
++              printk(KERN_ERR "ubd_setup : index %d out of range "
++                     "(%d devices)\n", n, MAX_DEV);
+               return(1);
+       }
+       err = 1;
+       spin_lock(&ubd_lock);
+-      if(ubd_dev[n].file != NULL){
++      dev = &ubd_dev[n];
++      if(dev->file != NULL){
+               printk(KERN_ERR "ubd_setup : device already configured\n");
+               goto out2;
+       }
+@@ -279,6 +301,11 @@
+               flags.s = 1;
+               str++;
+       }
++      if (*str == 'd'){
++              dev->no_cow = 1;
++              str++;
++      }
++
+       if(*str++ != '='){
+               printk(KERN_ERR "ubd_setup : Expected '='\n");
+               goto out2;
+@@ -287,14 +314,17 @@
+       err = 0;
+       backing_file = strchr(str, ',');
+       if(backing_file){
+-              *backing_file = '\0';
+-              backing_file++;
++              if(dev->no_cow)
++                      printk(KERN_ERR "Can't specify both 'd' and a "
++                             "cow file\n");
++              else {
++                      *backing_file = '\0';
++                      backing_file++;
++              }
+       }
+-      ubd_dev[n].file = str;
+-      if(ubd_is_dir(ubd_dev[n].file))
+-              ubd_dev[n].is_dir = 1;
+-      ubd_dev[n].cow.file = backing_file;
+-      ubd_dev[n].boot_openflags = flags;
++      dev->file = str;
++      dev->cow.file = backing_file;
++      dev->boot_openflags = flags;
+  out2:
+       spin_unlock(&ubd_lock);
+       return(err);
+@@ -324,8 +354,7 @@
+ static int fakehd_set = 0;
+ static int fakehd(char *str)
+ {
+-      printk(KERN_INFO 
+-             "fakehd : Changing ubd name to \"hd\".\n");
++      printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n");
+       fakehd_set = 1;
+       return 1;
+ }
+@@ -394,9 +423,10 @@
+       do_ubd_request(ubd_queue);
+ }
+-static void ubd_intr(int irq, void *dev, struct pt_regs *unused)
++static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused)
+ {
+       ubd_handler();
++      return(IRQ_HANDLED);
+ }
+ /* Only changed by ubd_init, which is an initcall. */
+@@ -432,16 +462,18 @@
+ static int ubd_open_dev(struct ubd *dev)
+ {
+       struct openflags flags;
+-      int err, n, create_cow, *create_ptr;
++      char **back_ptr;
++      int err, create_cow, *create_ptr;
++      dev->openflags = dev->boot_openflags;
+       create_cow = 0;
+       create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL;
+-      dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file,
++      back_ptr = dev->no_cow ? NULL : &dev->cow.file;
++      dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr,
+                               &dev->cow.bitmap_offset, &dev->cow.bitmap_len, 
+                               &dev->cow.data_offset, create_ptr);
+       if((dev->fd == -ENOENT) && create_cow){
+-              n = dev - ubd_dev;
+               dev->fd = create_cow_file(dev->file, dev->cow.file, 
+                                         dev->openflags, 1 << 9,
+                                         &dev->cow.bitmap_offset, 
+@@ -458,7 +490,10 @@
+       if(dev->cow.file != NULL){
+               err = -ENOMEM;
+               dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len);
+-              if(dev->cow.bitmap == NULL) goto error;
++              if(dev->cow.bitmap == NULL){
++                      printk(KERN_ERR "Failed to vmalloc COW bitmap\n");
++                      goto error;
++              }
+               flush_tlb_kernel_vm();
+               err = read_cow_bitmap(dev->fd, dev->cow.bitmap, 
+@@ -484,17 +519,31 @@
+                       
+ {
+       struct gendisk *disk;
++      char from[sizeof("ubd/nnnnn\0")], to[sizeof("discnnnnn/disc\0")];
++      int err;
+       disk = alloc_disk(1 << UBD_SHIFT);
+-      if (!disk)
+-              return -ENOMEM;
++      if(disk == NULL)
++              return(-ENOMEM);
+       disk->major = major;
+       disk->first_minor = unit << UBD_SHIFT;
+       disk->fops = &ubd_blops;
+       set_capacity(disk, size / 512);
+-      sprintf(disk->disk_name, "ubd");
+-      sprintf(disk->devfs_name, "ubd/disc%d", unit);
++      if(major == MAJOR_NR){
++              sprintf(disk->disk_name, "ubd%d", unit);
++              sprintf(disk->devfs_name, "ubd/disc%d", unit);
++              sprintf(from, "ubd/%d", unit);
++              sprintf(to, "disc%d/disc", unit);
++              err = devfs_mk_symlink(from, to);
++              if(err)
++                      printk("ubd_new_disk failed to make link from %s to "
++                             "%s, error = %d\n", from, to, err);
++      }
++      else {
++              sprintf(disk->disk_name, "ubd_fake%d", unit);
++              sprintf(disk->devfs_name, "ubd_fake/disc%d", unit);
++      }
+       disk->private_data = &ubd_dev[unit];
+       disk->queue = ubd_queue;
+@@ -509,10 +558,7 @@
+       struct ubd *dev = &ubd_dev[n];
+       int err;
+-      if(dev->is_dir)
+-              return(-EISDIR);
+-
+-      if (!dev->file)
++      if(dev->file == NULL)
+               return(-ENODEV);
+       if (ubd_open_dev(dev))
+@@ -526,7 +572,7 @@
+       if(err) 
+               return(err);
+  
+-      if(fake_major)
++      if(fake_major != MAJOR_NR)
+               ubd_new_disk(fake_major, dev->size, n, 
+                            &fake_gendisk[n]);
+@@ -564,42 +610,42 @@
+       return(err);
+ }
+-static int ubd_get_config(char *dev, char *str, int size, char **error_out)
++static int ubd_get_config(char *name, char *str, int size, char **error_out)
+ {
+-      struct ubd *ubd;
++      struct ubd *dev;
+       char *end;
+-      int major, n = 0;
++      int n, len = 0;
+-      major = simple_strtoul(dev, &end, 0);
+-      if((*end != '\0') || (end == dev)){
+-              *error_out = "ubd_get_config : didn't parse major number";
++      n = simple_strtoul(name, &end, 0);
++      if((*end != '\0') || (end == name)){
++              *error_out = "ubd_get_config : didn't parse device number";
+               return(-1);
+       }
+-      if((major >= MAX_DEV) || (major < 0)){
+-              *error_out = "ubd_get_config : major number out of range";
++      if((n >= MAX_DEV) || (n < 0)){
++              *error_out = "ubd_get_config : device number out of range";
+               return(-1);
+       }
+-      ubd = &ubd_dev[major];
++      dev = &ubd_dev[n];
+       spin_lock(&ubd_lock);
+-      if(ubd->file == NULL){
+-              CONFIG_CHUNK(str, size, n, "", 1);
++      if(dev->file == NULL){
++              CONFIG_CHUNK(str, size, len, "", 1);
+               goto out;
+       }
+-      CONFIG_CHUNK(str, size, n, ubd->file, 0);
++      CONFIG_CHUNK(str, size, len, dev->file, 0);
+-      if(ubd->cow.file != NULL){
+-              CONFIG_CHUNK(str, size, n, ",", 0);
+-              CONFIG_CHUNK(str, size, n, ubd->cow.file, 1);
++      if(dev->cow.file != NULL){
++              CONFIG_CHUNK(str, size, len, ",", 0);
++              CONFIG_CHUNK(str, size, len, dev->cow.file, 1);
+       }
+-      else CONFIG_CHUNK(str, size, n, "", 1);
++      else CONFIG_CHUNK(str, size, len, "", 1);
+  out:
+       spin_unlock(&ubd_lock);
+-      return(n);
++      return(len);
+ }
+ static int ubd_remove(char *str)
+@@ -607,11 +653,9 @@
+       struct ubd *dev;
+       int n, err = -ENODEV;
+-      if(!isdigit(*str))
+-              return(err);    /* it should be a number 0-7/a-h */
++      n = parse_unit(&str);
+-      n = *str - '0';
+-      if(n >= MAX_DEV) 
++      if((n < 0) || (n >= MAX_DEV))
+               return(err);
+       dev = &ubd_dev[n];
+@@ -672,7 +716,7 @@
+               
+       elevator_init(ubd_queue, &elevator_noop);
+-      if (fake_major != 0) {
++      if (fake_major != MAJOR_NR) {
+               char name[sizeof("ubd_nnn\0")];
+               snprintf(name, sizeof(name), "ubd_%d", fake_major);
+@@ -717,15 +761,9 @@
+ {
+       struct gendisk *disk = inode->i_bdev->bd_disk;
+       struct ubd *dev = disk->private_data;
+-      int err = -EISDIR;
+-
+-      if(dev->is_dir == 1)
+-              goto out;
++      int err = 0;
+-      err = 0;
+       if(dev->count == 0){
+-              dev->openflags = dev->boot_openflags;
+-
+               err = ubd_open_dev(dev);
+               if(err){
+                       printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
+@@ -799,15 +837,6 @@
+       if(req->rq_status == RQ_INACTIVE) return(1);
+-      if(dev->is_dir){
+-              strcpy(req->buffer, "HOSTFS:");
+-              strcat(req->buffer, dev->file);
+-              spin_lock(&ubd_io_lock);
+-              end_request(req, 1);
+-              spin_unlock(&ubd_io_lock);
+-              return(1);
+-      }
+-
+       if((rq_data_dir(req) == WRITE) && !dev->openflags.w){
+               printk("Write attempted on readonly ubd device %s\n", 
+                      disk->disk_name);
+diff -Naur a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c
+--- a/arch/um/drivers/ubd_user.c       Fri Aug 15 15:04:51 2003
++++ b/arch/um/drivers/ubd_user.c       Fri Aug 15 15:10:54 2003
+@@ -24,142 +24,24 @@
+ #include "user.h"
+ #include "ubd_user.h"
+ #include "os.h"
++#include "cow.h"
+ #include <endian.h>
+ #include <byteswap.h>
+-#if __BYTE_ORDER == __BIG_ENDIAN
+-# define ntohll(x) (x)
+-# define htonll(x) (x)
+-#elif __BYTE_ORDER == __LITTLE_ENDIAN
+-# define ntohll(x)  bswap_64(x)
+-# define htonll(x)  bswap_64(x)
+-#else
+-#error "__BYTE_ORDER not defined"
+-#endif
+-
+-#define PATH_LEN_V1 256
+-
+-struct cow_header_v1 {
+-      int magic;
+-      int version;
+-      char backing_file[PATH_LEN_V1];
+-      time_t mtime;
+-      __u64 size;
+-      int sectorsize;
+-};
+-
+-#define PATH_LEN_V2 MAXPATHLEN
+-
+-struct cow_header_v2 {
+-      unsigned long magic;
+-      unsigned long version;
+-      char backing_file[PATH_LEN_V2];
+-      time_t mtime;
+-      __u64 size;
+-      int sectorsize;
+-};
+-
+-union cow_header {
+-      struct cow_header_v1 v1;
+-      struct cow_header_v2 v2;
+-};
+-
+-#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
+-#define COW_VERSION 2
+-
+-static void sizes(__u64 size, int sectorsize, int bitmap_offset, 
+-                unsigned long *bitmap_len_out, int *data_offset_out)
+-{
+-      *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize);
+-
+-      *data_offset_out = bitmap_offset + *bitmap_len_out;
+-      *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize;
+-      *data_offset_out *= sectorsize;
+-}
+-
+-static int read_cow_header(int fd, int *magic_out, char **backing_file_out, 
+-                         time_t *mtime_out, __u64 *size_out, 
+-                         int *sectorsize_out, int *bitmap_offset_out)
+-{
+-      union cow_header *header;
+-      char *file;
+-      int err, n;
+-      unsigned long version, magic;
+-
+-      header = um_kmalloc(sizeof(*header));
+-      if(header == NULL){
+-              printk("read_cow_header - Failed to allocate header\n");
+-              return(-ENOMEM);
+-      }
+-      err = -EINVAL;
+-      n = read(fd, header, sizeof(*header));
+-      if(n < offsetof(typeof(header->v1), backing_file)){
+-              printk("read_cow_header - short header\n");
+-              goto out;
+-      }
+-
+-      magic = header->v1.magic;
+-      if(magic == COW_MAGIC) {
+-              version = header->v1.version;
+-      }
+-      else if(magic == ntohl(COW_MAGIC)){
+-              version = ntohl(header->v1.version);
+-      }
+-      else goto out;
+-
+-      *magic_out = COW_MAGIC;
+-
+-      if(version == 1){
+-              if(n < sizeof(header->v1)){
+-                      printk("read_cow_header - failed to read V1 header\n");
+-                      goto out;
+-              }
+-              *mtime_out = header->v1.mtime;
+-              *size_out = header->v1.size;
+-              *sectorsize_out = header->v1.sectorsize;
+-              *bitmap_offset_out = sizeof(header->v1);
+-              file = header->v1.backing_file;
+-      }
+-      else if(version == 2){
+-              if(n < sizeof(header->v2)){
+-                      printk("read_cow_header - failed to read V2 header\n");
+-                      goto out;
+-              }
+-              *mtime_out = ntohl(header->v2.mtime);
+-              *size_out = ntohll(header->v2.size);
+-              *sectorsize_out = ntohl(header->v2.sectorsize);
+-              *bitmap_offset_out = sizeof(header->v2);
+-              file = header->v2.backing_file;
+-      }
+-      else {
+-              printk("read_cow_header - invalid COW version\n");
+-              goto out;
+-      }
+-      err = -ENOMEM;
+-      *backing_file_out = uml_strdup(file);
+-      if(*backing_file_out == NULL){
+-              printk("read_cow_header - failed to allocate backing file\n");
+-              goto out;
+-      }
+-      err = 0;
+- out:
+-      kfree(header);
+-      return(err);
+-}
+ static int same_backing_files(char *from_cmdline, char *from_cow, char *cow)
+ {
+-      struct stat buf1, buf2;
++      struct stat64 buf1, buf2;
+       if(from_cmdline == NULL) return(1);
+       if(!strcmp(from_cmdline, from_cow)) return(1);
+-      if(stat(from_cmdline, &buf1) < 0){
++      if(stat64(from_cmdline, &buf1) < 0){
+               printk("Couldn't stat '%s', errno = %d\n", from_cmdline, 
+                      errno);
+               return(1);
+       }
+-      if(stat(from_cow, &buf2) < 0){
++      if(stat64(from_cow, &buf2) < 0){
+               printk("Couldn't stat '%s', errno = %d\n", from_cow, errno);
+               return(1);
+       }
+@@ -178,6 +60,7 @@
+       long long actual;
+       int err;
++      printk("%ld", htonll(size));
+       if(stat64(file, &buf) < 0){
+               printk("Failed to stat backing file \"%s\", errno = %d\n",
+                      file, errno);
+@@ -215,118 +98,6 @@
+       return(0);
+ }
+-static int absolutize(char *to, int size, char *from)
+-{
+-      char save_cwd[256], *slash;
+-      int remaining;
+-
+-      if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) {
+-              printk("absolutize : unable to get cwd - errno = %d\n", errno);
+-              return(-1);
+-      }
+-      slash = strrchr(from, '/');
+-      if(slash != NULL){
+-              *slash = '\0';
+-              if(chdir(from)){
+-                      *slash = '/';
+-                      printk("absolutize : Can't cd to '%s' - errno = %d\n",
+-                             from, errno);
+-                      return(-1);
+-              }
+-              *slash = '/';
+-              if(getcwd(to, size) == NULL){
+-                      printk("absolutize : unable to get cwd of '%s' - "
+-                             "errno = %d\n", from, errno);
+-                      return(-1);
+-              }
+-              remaining = size - strlen(to);
+-              if(strlen(slash) + 1 > remaining){
+-                      printk("absolutize : unable to fit '%s' into %d "
+-                             "chars\n", from, size);
+-                      return(-1);
+-              }
+-              strcat(to, slash);
+-      }
+-      else {
+-              if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){
+-                      printk("absolutize : unable to fit '%s' into %d "
+-                             "chars\n", from, size);
+-                      return(-1);
+-              }
+-              strcpy(to, save_cwd);
+-              strcat(to, "/");
+-              strcat(to, from);
+-      }
+-      chdir(save_cwd);
+-      return(0);
+-}
+-
+-static int write_cow_header(char *cow_file, int fd, char *backing_file, 
+-                          int sectorsize, long long *size)
+-{
+-        struct cow_header_v2 *header;
+-      struct stat64 buf;
+-      int err;
+-
+-      err = os_seek_file(fd, 0);
+-      if(err != 0){
+-              printk("write_cow_header - lseek failed, errno = %d\n", errno);
+-              return(-errno);
+-      }
+-
+-      err = -ENOMEM;
+-      header = um_kmalloc(sizeof(*header));
+-      if(header == NULL){
+-              printk("Failed to allocate COW V2 header\n");
+-              goto out;
+-      }
+-      header->magic = htonl(COW_MAGIC);
+-      header->version = htonl(COW_VERSION);
+-
+-      err = -EINVAL;
+-      if(strlen(backing_file) > sizeof(header->backing_file) - 1){
+-              printk("Backing file name \"%s\" is too long - names are "
+-                     "limited to %d characters\n", backing_file, 
+-                     sizeof(header->backing_file) - 1);
+-              goto out_free;
+-      }
+-
+-      if(absolutize(header->backing_file, sizeof(header->backing_file), 
+-                    backing_file))
+-              goto out_free;
+-
+-      err = stat64(header->backing_file, &buf);
+-      if(err < 0){
+-              printk("Stat of backing file '%s' failed, errno = %d\n",
+-                     header->backing_file, errno);
+-              err = -errno;
+-              goto out_free;
+-      }
+-
+-      err = os_file_size(header->backing_file, size);
+-      if(err){
+-              printk("Couldn't get size of backing file '%s', errno = %d\n",
+-                     header->backing_file, -*size);
+-              goto out_free;
+-      }
+-
+-      header->mtime = htonl(buf.st_mtime);
+-      header->size = htonll(*size);
+-      header->sectorsize = htonl(sectorsize);
+-
+-      err = write(fd, header, sizeof(*header));
+-      if(err != sizeof(*header)){
+-              printk("Write of header to new COW file '%s' failed, "
+-                     "errno = %d\n", cow_file, errno);
+-              goto out_free;
+-      }
+-      err = 0;
+- out_free:
+-      kfree(header);
+- out:
+-      return(err);
+-}
+-
+ int open_ubd_file(char *file, struct openflags *openflags, 
+                 char **backing_file_out, int *bitmap_offset_out, 
+                 unsigned long *bitmap_len_out, int *data_offset_out, 
+@@ -346,10 +117,17 @@
+                 if((fd = os_open_file(file, *openflags, mode)) < 0) 
+                       return(fd);
+         }
++
++      err = os_lock_file(fd, openflags->w);
++      if(err){
++              printk("Failed to lock '%s', errno = %d\n", file, -err);
++              goto error;
++      }
++      
+       if(backing_file_out == NULL) return(fd);
+-      err = read_cow_header(fd, &magic, &backing_file, &mtime, &size, 
+-                            &sectorsize, bitmap_offset_out);
++      err = read_cow_header(file_reader, &fd, &magic, &backing_file, &mtime, 
++                            &size, &sectorsize, bitmap_offset_out);
+       if(err && (*backing_file_out != NULL)){
+               printk("Failed to read COW header from COW file \"%s\", "
+                      "errno = %d\n", file, err);
+@@ -376,12 +154,12 @@
+               if(err) goto error;
+       }
+-      sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, 
+-            data_offset_out);
++      cow_sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, 
++                data_offset_out);
+         return(fd);
+  error:
+-      close(fd);
++      os_close_file(fd);
+       return(err);
+ }
+@@ -389,10 +167,7 @@
+                   int sectorsize, int *bitmap_offset_out, 
+                   unsigned long *bitmap_len_out, int *data_offset_out)
+ {
+-      __u64 blocks;
+-      long zero;
+-      int err, fd, i;
+-      long long size;
++      int err, fd;
+       flags.c = 1;
+       fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL);
+@@ -403,29 +178,12 @@
+               goto out;
+       }
+-      err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size);
+-      if(err) goto out_close;
+-
+-      blocks = (size + sectorsize - 1) / sectorsize;
+-      blocks = (blocks + sizeof(long) * 8 - 1) / (sizeof(long) * 8);
+-      zero = 0;
+-      for(i = 0; i < blocks; i++){
+-              err = write(fd, &zero, sizeof(zero));
+-              if(err != sizeof(zero)){
+-                      printk("Write of bitmap to new COW file '%s' failed, "
+-                             "errno = %d\n", cow_file, errno);
+-                      goto out_close;
+-              }
+-      }
+-
+-      sizes(size, sectorsize, sizeof(struct cow_header_v2), 
+-            bitmap_len_out, data_offset_out);
+-      *bitmap_offset_out = sizeof(struct cow_header_v2);
+-
+-      return(fd);
+-
+- out_close:
+-      close(fd);
++      err = init_cow_file(fd, cow_file, backing_file, sectorsize, 
++                          bitmap_offset_out, bitmap_len_out, 
++                          data_offset_out);
++      if(!err)
++              return(fd);
++      os_close_file(fd);
+  out:
+       return(err);
+ }
+@@ -448,14 +206,6 @@
+       else return(n);
+ }
+-int ubd_is_dir(char *file)
+-{
+-      struct stat64 buf;
+-
+-      if(stat64(file, &buf) < 0) return(0);
+-      return(S_ISDIR(buf.st_mode));
+-}
+-
+ void do_io(struct io_thread_req *req)
+ {
+       char *buf;
+diff -Naur a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c
+--- a/arch/um/drivers/xterm.c  Fri Aug 15 15:04:00 2003
++++ b/arch/um/drivers/xterm.c  Fri Aug 15 15:10:18 2003
+@@ -108,7 +108,7 @@
+       }
+       close(fd);
+-      fd = create_unix_socket(file, sizeof(file));
++      fd = create_unix_socket(file, sizeof(file), 1);
+       if(fd < 0){
+               printk("xterm_open : create_unix_socket failed, errno = %d\n", 
+                      -fd);
+diff -Naur a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c
+--- a/arch/um/drivers/xterm_kern.c     Fri Aug 15 15:07:37 2003
++++ b/arch/um/drivers/xterm_kern.c     Fri Aug 15 15:13:03 2003
+@@ -5,9 +5,12 @@
+ #include "linux/errno.h"
+ #include "linux/slab.h"
++#include "linux/signal.h"
++#include "linux/interrupt.h"
+ #include "asm/semaphore.h"
+ #include "asm/irq.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ #include "kern_util.h"
+ #include "os.h"
+ #include "xterm.h"
+@@ -19,17 +22,18 @@
+       int new_fd;
+ };
+-static void xterm_interrupt(int irq, void *data, struct pt_regs *regs)
++static irqreturn_t xterm_interrupt(int irq, void *data, struct pt_regs *regs)
+ {
+       struct xterm_wait *xterm = data;
+       int fd;
+       fd = os_rcv_fd(xterm->fd, &xterm->pid);
+       if(fd == -EAGAIN)
+-              return;
++              return(IRQ_NONE);
+       xterm->new_fd = fd;
+       up(&xterm->sem);
++      return(IRQ_HANDLED);
+ }
+ int xterm_fd(int socket, int *pid_out)
+diff -Naur a/arch/um/dyn.lds.S b/arch/um/dyn.lds.S
+--- a/arch/um/dyn.lds.S        Fri Aug 15 15:06:20 2003
++++ b/arch/um/dyn.lds.S        Fri Aug 15 15:12:31 2003
+@@ -15,7 +15,11 @@
+   . = ALIGN(4096);            /* Init code and data */
+   _stext = .;
+   __init_begin = .;
+-  .text.init : { *(.text.init) }
++  .init.text : { 
++      _sinittext = .;
++      *(.init.text)
++      _einittext = .;
++  }
+   . = ALIGN(4096);
+@@ -67,7 +71,7 @@
+   #include "asm/common.lds.S"
+-  .data.init : { *(.data.init) }
++  init.data : { *(.init.data) }
+   /* Ensure the __preinit_array_start label is properly aligned.  We
+      could instead move the label definition inside the section, but
+diff -Naur a/arch/um/include/irq_kern.h b/arch/um/include/irq_kern.h
+--- a/arch/um/include/irq_kern.h       Wed Dec 31 19:00:00 1969
++++ b/arch/um/include/irq_kern.h       Fri Aug 15 15:11:53 2003
+@@ -0,0 +1,28 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __IRQ_KERN_H__
++#define __IRQ_KERN_H__
++
++#include "linux/interrupt.h"
++
++extern int um_request_irq(unsigned int irq, int fd, int type,
++                        irqreturn_t (*handler)(int, void *, 
++                                               struct pt_regs *),
++                        unsigned long irqflags,  const char * devname,
++                        void *dev_id);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
+--- a/arch/um/include/kern_util.h      Fri Aug 15 15:05:04 2003
++++ b/arch/um/include/kern_util.h      Fri Aug 15 15:11:18 2003
+@@ -63,10 +63,9 @@
+ extern void *syscall_sp(void *t);
+ extern void syscall_trace(void);
+ extern int hz(void);
+-extern void idle_timer(void);
++extern void uml_idle_timer(void);
+ extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs);
+ extern int external_pid(void *t);
+-extern int pid_to_processor_id(int pid);
+ extern void boot_timer_handler(int sig);
+ extern void interrupt_end(void);
+ extern void initial_thread_cb(void (*proc)(void *), void *arg);
+@@ -90,9 +89,7 @@
+ extern char *uml_strdup(char *string);
+ extern void unprotect_kernel_mem(void);
+ extern void protect_kernel_mem(void);
+-extern void set_kmem_end(unsigned long);
+ extern void uml_cleanup(void);
+-extern int pid_to_processor_id(int pid);
+ extern void set_current(void *t);
+ extern void lock_signalled_task(void *t);
+ extern void IPI_handler(int cpu);
+@@ -101,7 +98,9 @@
+ extern int clear_user_proc(void *buf, int size);
+ extern int copy_to_user_proc(void *to, void *from, int size);
+ extern int copy_from_user_proc(void *to, void *from, int size);
++extern int strlen_user_proc(char *str);
+ extern void bus_handler(int sig, union uml_pt_regs *regs);
++extern void winch(int sig, union uml_pt_regs *regs);
+ extern long execute_syscall(void *r);
+ extern int smp_sigio_handler(void);
+ extern void *get_current(void);
+diff -Naur a/arch/um/include/line.h b/arch/um/include/line.h
+--- a/arch/um/include/line.h   Fri Aug 15 15:07:40 2003
++++ b/arch/um/include/line.h   Fri Aug 15 15:13:11 2003
+@@ -9,12 +9,14 @@
+ #include "linux/list.h"
+ #include "linux/workqueue.h"
+ #include "linux/tty.h"
++#include "linux/interrupt.h"
+ #include "asm/semaphore.h"
+ #include "chan_user.h"
+ #include "mconsole_kern.h"
+ struct line_driver {
+       char *name;
++      char *device_name;
+       char *devfs_name;
+       short major;
+       short minor_start;
+@@ -67,8 +69,9 @@
+ #define LINES_INIT(n) {  num :                n }
+-extern void line_interrupt(int irq, void *data, struct pt_regs *unused);
+-extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused);
++extern irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused);
++extern irqreturn_t line_write_interrupt(int irq, void *data, 
++                                      struct pt_regs *unused);
+ extern void line_close(struct line *lines, struct tty_struct *tty);
+ extern int line_open(struct line *lines, struct tty_struct *tty, 
+                    struct chan_opts *opts);
+diff -Naur a/arch/um/include/mconsole.h b/arch/um/include/mconsole.h
+--- a/arch/um/include/mconsole.h       Fri Aug 15 15:05:26 2003
++++ b/arch/um/include/mconsole.h       Fri Aug 15 15:11:43 2003
+@@ -77,6 +77,7 @@
+ extern void mconsole_cad(struct mc_request *req);
+ extern void mconsole_stop(struct mc_request *req);
+ extern void mconsole_go(struct mc_request *req);
++extern void mconsole_log(struct mc_request *req);
+ extern int mconsole_get_request(int fd, struct mc_request *req);
+ extern int mconsole_notify(char *sock_name, int type, const void *data, 
+diff -Naur a/arch/um/include/mem.h b/arch/um/include/mem.h
+--- a/arch/um/include/mem.h    Fri Aug 15 15:09:22 2003
++++ b/arch/um/include/mem.h    Fri Aug 15 15:14:01 2003
+@@ -13,7 +13,6 @@
+ };
+ extern void set_usable_vm(unsigned long start, unsigned long end);
+-extern void set_kmem_end(unsigned long new);
+ #endif
+diff -Naur a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h
+--- a/arch/um/include/mem_user.h       Fri Aug 15 15:07:31 2003
++++ b/arch/um/include/mem_user.h       Fri Aug 15 15:12:54 2003
+@@ -51,9 +51,6 @@
+ extern int init_mem_user(void);
+ extern int create_mem_file(unsigned long len);
+-extern void setup_range(int fd, char *driver, unsigned long start,
+-                      unsigned long pfn, unsigned long total, int need_vm, 
+-                      struct mem_region *region, void *reserved);
+ extern void setup_memory(void *entry);
+ extern unsigned long find_iomem(char *driver, unsigned long *len_out);
+ extern int init_maps(struct mem_region *region);
+diff -Naur a/arch/um/include/os.h b/arch/um/include/os.h
+--- a/arch/um/include/os.h     Fri Aug 15 15:04:50 2003
++++ b/arch/um/include/os.h     Fri Aug 15 15:10:48 2003
+@@ -103,10 +103,11 @@
+ extern int os_shutdown_socket(int fd, int r, int w);
+ extern void os_close_file(int fd);
+ extern int os_rcv_fd(int fd, int *helper_pid_out);
+-extern int create_unix_socket(char *file, int len);
++extern int create_unix_socket(char *file, int len, int close_on_exec);
+ extern int os_connect_socket(char *name);
+ extern int os_file_type(char *file);
+ extern int os_file_mode(char *file, struct openflags *mode_out);
++extern int os_lock_file(int fd, int excl);
+ extern unsigned long os_process_pc(int pid);
+ extern int os_process_parent(int pid);
+@@ -120,6 +121,7 @@
+ extern int os_protect_memory(void *addr, unsigned long len, 
+                            int r, int w, int x);
+ extern int os_unmap_memory(void *addr, int len);
++extern void os_flush_stdout(void);
+ #endif
+diff -Naur a/arch/um/include/sysdep-i386/sigcontext.h b/arch/um/include/sysdep-i386/sigcontext.h
+--- a/arch/um/include/sysdep-i386/sigcontext.h Fri Aug 15 15:07:37 2003
++++ b/arch/um/include/sysdep-i386/sigcontext.h Fri Aug 15 15:13:03 2003
+@@ -28,8 +28,8 @@
+  */
+ #define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0)
+-/* These are General Protection and Page Fault */
+-#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14))
++/* This is Page Fault */
++#define SEGV_IS_FIXABLE(trap) (trap == 14)
+ #define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc)))
+diff -Naur a/arch/um/include/ubd_user.h b/arch/um/include/ubd_user.h
+--- a/arch/um/include/ubd_user.h       Fri Aug 15 15:06:34 2003
++++ b/arch/um/include/ubd_user.h       Fri Aug 15 15:12:37 2003
+@@ -39,7 +39,6 @@
+ extern int write_ubd_fs(int fd, char *buffer, int len);
+ extern int start_io_thread(unsigned long sp, int *fds_out);
+ extern void do_io(struct io_thread_req *req);
+-extern int ubd_is_dir(char *file);
+ static inline int ubd_test_bit(__u64 bit, unsigned char *data)
+ {
+diff -Naur a/arch/um/include/user.h b/arch/um/include/user.h
+--- a/arch/um/include/user.h   Fri Aug 15 15:03:58 2003
++++ b/arch/um/include/user.h   Fri Aug 15 15:10:14 2003
+@@ -14,7 +14,7 @@
+ extern void kfree(void *ptr);
+ extern int in_aton(char *str);
+ extern int open_gdb_chan(void);
+-
++extern int strlcpy(char *, const char *, int);
+ #endif
+ /*
+diff -Naur a/arch/um/include/user_util.h b/arch/um/include/user_util.h
+--- a/arch/um/include/user_util.h      Fri Aug 15 15:04:33 2003
++++ b/arch/um/include/user_util.h      Fri Aug 15 15:10:32 2003
+@@ -59,7 +59,6 @@
+ extern void *add_signal_handler(int sig, void (*handler)(int));
+ extern int start_fork_tramp(void *arg, unsigned long temp_stack, 
+                           int clone_flags, int (*tramp)(void *));
+-extern int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags);
+ extern int linux_main(int argc, char **argv);
+ extern void set_cmdline(char *cmd);
+ extern void input_cb(void (*proc)(void *), void *arg, int arg_len);
+@@ -90,7 +89,8 @@
+ extern int arch_fixup(unsigned long address, void *sc_ptr);
+ extern void forward_pending_sigio(int target);
+ extern int can_do_skas(void);
+- 
++extern void arch_init_thread(void);
++
+ #endif
+ /*
+diff -Naur a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
+--- a/arch/um/kernel/Makefile  Fri Aug 15 15:07:32 2003
++++ b/arch/um/kernel/Makefile  Fri Aug 15 15:12:57 2003
+@@ -19,6 +19,8 @@
+ obj-$(CONFIG_MODE_TT) += tt/
+ obj-$(CONFIG_MODE_SKAS) += skas/
++clean-files   := config.c
++
+ user-objs-$(CONFIG_TTY_LOG) += tty_log.o
+ USER_OBJS := $(filter %_user.o,$(obj-y))  $(user-objs-y) config.o helper.o \
+@@ -43,17 +45,13 @@
+ $(obj)/frame.o: $(src)/frame.c
+       $(CC) $(CFLAGS_$(notdir $@)) -c -o $@ $<
+-QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }'
++QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }'
+ $(obj)/config.c : $(src)/config.c.in $(TOPDIR)/.config
+       $(PERL) -e $(QUOTE) < $(src)/config.c.in > $@
+ $(obj)/config.o : $(obj)/config.c
+-clean:
+-      rm -f config.c
+-      for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done
+-
+ modules:
+ fastdep:
+diff -Naur a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in
+--- a/arch/um/kernel/config.c.in       Fri Aug 15 15:07:37 2003
++++ b/arch/um/kernel/config.c.in       Fri Aug 15 15:13:03 2003
+@@ -7,9 +7,7 @@
+ #include <stdlib.h>
+ #include "init.h"
+-static __initdata char *config = "
+-CONFIG
+-";
++static __initdata char *config = "CONFIG";
+ static int __init print_config(char *line, int *add)
+ {
+diff -Naur a/arch/um/kernel/exec_kern.c b/arch/um/kernel/exec_kern.c
+--- a/arch/um/kernel/exec_kern.c       Fri Aug 15 15:04:54 2003
++++ b/arch/um/kernel/exec_kern.c       Fri Aug 15 15:11:03 2003
+@@ -32,10 +32,15 @@
+       CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp);
+ }
++extern void log_exec(char **argv, void *tty);
++
+ static int execve1(char *file, char **argv, char **env)
+ {
+         int error;
++#ifdef CONFIG_TTY_LOG
++      log_exec(argv, current->tty);
++#endif
+         error = do_execve(file, argv, env, &current->thread.regs);
+         if (error == 0){
+                 current->ptrace &= ~PT_DTRACE;
+diff -Naur a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
+--- a/arch/um/kernel/init_task.c       Fri Aug 15 15:09:24 2003
++++ b/arch/um/kernel/init_task.c       Fri Aug 15 15:14:04 2003
+@@ -17,6 +17,7 @@
+ struct mm_struct init_mm = INIT_MM(init_mm);
+ static struct files_struct init_files = INIT_FILES;
+ static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
++static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+ /*
+  * Initial task structure.
+@@ -38,26 +39,12 @@
+ __attribute__((__section__(".data.init_task"))) = 
+ { INIT_THREAD_INFO(init_task) };
+-struct task_struct *alloc_task_struct(void)
+-{
+-      return((struct task_struct *) 
+-             __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER));
+-}
+-
+ void unprotect_stack(unsigned long stack)
+ {
+       protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE, 
+                      1, 1, 0, 1);
+ }
+-void free_task_struct(struct task_struct *task)
+-{
+-      /* free_pages decrements the page counter and only actually frees
+-       * the pages if they are now not accessed by anything.
+-       */
+-      free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER);
+-}
+-
+ /*
+  * Overrides for Emacs so that we follow Linus's tabbing style.
+  * Emacs will notice this stuff at the end of the file and automatically
+diff -Naur a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
+--- a/arch/um/kernel/irq.c     Fri Aug 15 15:07:53 2003
++++ b/arch/um/kernel/irq.c     Fri Aug 15 15:13:18 2003
+@@ -28,6 +28,7 @@
+ #include "user_util.h"
+ #include "kern_util.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ static void register_irq_proc (unsigned int irq);
+@@ -82,65 +83,52 @@
+       end_none
+ };
+-/* Not changed */
+-volatile unsigned long irq_err_count;
+-
+ /*
+  * Generic, controller-independent functions:
+  */
+-int get_irq_list(char *buf)
++int show_interrupts(struct seq_file *p, void *v)
+ {
+       int i, j;
+-      unsigned long flags;
+       struct irqaction * action;
+-      char *p = buf;
++      unsigned long flags;
+-      p += sprintf(p, "           ");
+-      for (j=0; j<num_online_cpus(); j++)
+-              p += sprintf(p, "CPU%d       ",j);
+-      *p++ = '\n';
++      seq_printf(p, "           ");
++      for (j=0; j<NR_CPUS; j++)
++              if (cpu_online(j))
++                      seq_printf(p, "CPU%d       ",j);
++      seq_putc(p, '\n');
+       for (i = 0 ; i < NR_IRQS ; i++) {
+               spin_lock_irqsave(&irq_desc[i].lock, flags);
+               action = irq_desc[i].action;
+               if (!action) 
+-                      goto end;
+-              p += sprintf(p, "%3d: ",i);
++                      goto skip;
++              seq_printf(p, "%3d: ",i);
+ #ifndef CONFIG_SMP
+-              p += sprintf(p, "%10u ", kstat_irqs(i));
++              seq_printf(p, "%10u ", kstat_irqs(i));
+ #else
+-              for (j = 0; j < num_online_cpus(); j++)
+-                      p += sprintf(p, "%10u ",
+-                              kstat_cpu(cpu_logical_map(j)).irqs[i]);
++              for (j = 0; j < NR_CPUS; j++)
++                      if (cpu_online(j))
++                              seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ #endif
+-              p += sprintf(p, " %14s", irq_desc[i].handler->typename);
+-              p += sprintf(p, "  %s", action->name);
++              seq_printf(p, " %14s", irq_desc[i].handler->typename);
++              seq_printf(p, "  %s", action->name);
+               for (action=action->next; action; action = action->next)
+-                      p += sprintf(p, ", %s", action->name);
+-              *p++ = '\n';
+-      end:
++                      seq_printf(p, ", %s", action->name);
++
++              seq_putc(p, '\n');
++skip:
+               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+       }
+-      p += sprintf(p, "\n");
+-#ifdef notdef
+-#ifdef CONFIG_SMP
+-      p += sprintf(p, "LOC: ");
+-      for (j = 0; j < num_online_cpus(); j++)
+-              p += sprintf(p, "%10u ",
+-                      apic_timer_irqs[cpu_logical_map(j)]);
+-      p += sprintf(p, "\n");
+-#endif
+-#endif
+-      p += sprintf(p, "ERR: %10lu\n", irq_err_count);
+-      return p - buf;
+-}
+-
++      seq_printf(p, "NMI: ");
++      for (j = 0; j < NR_CPUS; j++)
++              if (cpu_online(j))
++                      seq_printf(p, "%10u ", nmi_count(j));
++      seq_putc(p, '\n');
+-int show_interrupts(struct seq_file *p, void *v)
+-{
+-      return(0);
++      return 0;
+ }
+ /*
+@@ -281,13 +269,12 @@
+        * 0 return value means that this irq is already being
+        * handled by some other CPU. (or is disabled)
+        */
+-      int cpu = smp_processor_id();
+       irq_desc_t *desc = irq_desc + irq;
+       struct irqaction * action;
+       unsigned int status;
+       irq_enter();
+-      kstat_cpu(cpu).irqs[irq]++;
++      kstat_this_cpu.irqs[irq]++;
+       spin_lock(&desc->lock);
+       desc->handler->ack(irq);
+       /*
+@@ -384,7 +371,7 @@
+  */
+  
+ int request_irq(unsigned int irq,
+-              void (*handler)(int, void *, struct pt_regs *),
++              irqreturn_t (*handler)(int, void *, struct pt_regs *),
+               unsigned long irqflags, 
+               const char * devname,
+               void *dev_id)
+@@ -430,15 +417,19 @@
+ }
+ int um_request_irq(unsigned int irq, int fd, int type,
+-                 void (*handler)(int, void *, struct pt_regs *),
++                 irqreturn_t (*handler)(int, void *, struct pt_regs *),
+                  unsigned long irqflags, const char * devname,
+                  void *dev_id)
+ {
+-      int retval;
++      int err;
+-      retval = request_irq(irq, handler, irqflags, devname, dev_id);
+-      if(retval) return(retval);
+-      return(activate_fd(irq, fd, type, dev_id));
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if(err) 
++              return(err);
++
++      if(fd != -1)
++              err = activate_fd(irq, fd, type, dev_id);
++      return(err);
+ }
+ /* this was setup_x86_irq but it seems pretty generic */
+diff -Naur a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
+--- a/arch/um/kernel/mem.c     Fri Aug 15 15:05:20 2003
++++ b/arch/um/kernel/mem.c     Fri Aug 15 15:11:21 2003
+@@ -119,11 +119,6 @@
+       return(kmem_top);
+ }
+-void set_kmem_end(unsigned long new)
+-{
+-      kmem_top = new;
+-}
+-
+ #ifdef CONFIG_HIGHMEM
+ /* Changed during early boot */
+ pte_t *kmap_pte;
+@@ -218,7 +213,7 @@
+               if(regions[i] == NULL) break;           
+       }
+       if(i == NREGIONS){
+-              printk("setup_range : no free regions\n");
++              printk("setup_one_range : no free regions\n");
+               i = -1;
+               goto out;
+       }
+@@ -227,7 +222,9 @@
+               fd = create_mem_file(len);
+       if(region == NULL){
+-              region = alloc_bootmem_low_pages(sizeof(*region));
++              if(kmalloc_ok)
++                      region = kmalloc(sizeof(*region), GFP_KERNEL);
++              else region = alloc_bootmem_low_pages(sizeof(*region));
+               if(region == NULL)
+                       panic("Failed to allocating mem_region");
+       }
+@@ -528,9 +525,9 @@
+       return(NREGIONS);
+ }
+-void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn,
+-               unsigned long len, int need_vm, struct mem_region *region, 
+-               void *reserved)
++static void setup_range(int fd, char *driver, unsigned long start, 
++                      unsigned long pfn, unsigned long len, int need_vm, 
++                      struct mem_region *region, void *reserved)
+ {
+       int i, cur;
+diff -Naur a/arch/um/kernel/mem_user.c b/arch/um/kernel/mem_user.c
+--- a/arch/um/kernel/mem_user.c        Fri Aug 15 15:06:25 2003
++++ b/arch/um/kernel/mem_user.c        Fri Aug 15 15:12:36 2003
+@@ -111,6 +111,11 @@
+               offset = 0;
+       }
++      if(offset >= region->len){
++              printf("%d bytes of physical memory is insufficient\n",
++                     region->len);
++              exit(1);
++      }
+       loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE, 
+                  MAP_SHARED | MAP_FIXED, region->fd, offset);
+       if(loc != start){
+@@ -122,26 +127,26 @@
+ static int __init parse_iomem(char *str, int *add)
+ {
+-      struct stat buf;
++      struct stat64 buf;
+       char *file, *driver;
+       int fd;
+       driver = str;
+       file = strchr(str,',');
+       if(file == NULL){
+-              printk("parse_iomem : failed to parse iomem\n");
++              printf("parse_iomem : failed to parse iomem\n");
+               return(1);
+       }
+       *file = '\0';
+       file++;
+       fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0);
+       if(fd < 0){
+-              printk("parse_iomem - Couldn't open io file, errno = %d\n", 
++              printf("parse_iomem - Couldn't open io file, errno = %d\n", 
+                      errno);
+               return(1);
+       }
+-      if(fstat(fd, &buf) < 0) {
+-              printk("parse_iomem - cannot fstat file, errno = %d\n", errno);
++      if(fstat64(fd, &buf) < 0) {
++              printf("parse_iomem - cannot fstat file, errno = %d\n", errno);
+               return(1);
+       }
+       add_iomem(driver, fd, buf.st_size);
+diff -Naur a/arch/um/kernel/process.c b/arch/um/kernel/process.c
+--- a/arch/um/kernel/process.c Fri Aug 15 15:08:15 2003
++++ b/arch/um/kernel/process.c Fri Aug 15 15:13:26 2003
+@@ -72,7 +72,6 @@
+                   SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
+       set_handler(SIGUSR2, (__sighandler_t) sig_handler, 
+                   SA_NOMASK | flags, -1);
+-      (void) CHOOSE_MODE(signal(SIGCHLD, SIG_IGN), (void *) 0);
+       signal(SIGHUP, SIG_IGN);
+       init_irq_signals(altstack);
+@@ -127,7 +126,8 @@
+       if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", 
+                         errno);
+       if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL))
+-              panic("outer trampoline didn't exit with SIGKILL");
++              panic("outer trampoline didn't exit with SIGKILL, "
++                    "status = %d", status);
+       return(arg.pid);
+ }
+diff -Naur a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
+--- a/arch/um/kernel/process_kern.c    Fri Aug 15 15:06:24 2003
++++ b/arch/um/kernel/process_kern.c    Fri Aug 15 15:12:35 2003
+@@ -52,17 +52,12 @@
+ struct task_struct *get_task(int pid, int require)
+ {
+-        struct task_struct *task, *ret;
++        struct task_struct *ret;
+-        ret = NULL;
+         read_lock(&tasklist_lock);
+-        for_each_process(task){
+-                if(task->pid == pid){
+-                        ret = task;
+-                        break;
+-                }
+-        }
++      ret = find_task_by_pid(pid);
+         read_unlock(&tasklist_lock);
++
+         if(require && (ret == NULL)) panic("get_task couldn't find a task\n");
+         return(ret);
+ }
+@@ -103,13 +98,14 @@
+ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ {
+-      struct task_struct *p;
++      int pid;
+       current->thread.request.u.thread.proc = fn;
+       current->thread.request.u.thread.arg = arg;
+-      p = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL);
+-      if(IS_ERR(p)) panic("do_fork failed in kernel_thread");
+-      return(p->pid);
++      pid = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL);
++      if(pid < 0)
++              panic("do_fork failed in kernel_thread, errno = %d", pid);
++      return(pid);
+ }
+ void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
+@@ -157,6 +153,10 @@
+       return(current);
+ }
++void prepare_to_copy(struct task_struct *tsk)
++{
++}
++
+ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+               unsigned long stack_top, struct task_struct * p, 
+               struct pt_regs *regs)
+@@ -190,7 +190,7 @@
+ void default_idle(void)
+ {
+-      idle_timer();
++      uml_idle_timer();
+       atomic_inc(&init_mm.mm_count);
+       current->mm = &init_mm;
+@@ -363,6 +363,11 @@
+       return(clear_user(buf, size));
+ }
++int strlen_user_proc(char *str)
++{
++      return(strlen_user(str));
++}
++
+ int smp_sigio_handler(void)
+ {
+ #ifdef CONFIG_SMP
+diff -Naur a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
+--- a/arch/um/kernel/ptrace.c  Fri Aug 15 15:04:36 2003
++++ b/arch/um/kernel/ptrace.c  Fri Aug 15 15:10:33 2003
+@@ -311,11 +311,8 @@
+       /* the 0x80 provides a way for the tracing parent to distinguish
+          between a syscall stop and SIGTRAP delivery */
+-      current->exit_code = SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+-                                      ? 0x80 : 0);
+-      current->state = TASK_STOPPED;
+-      notify_parent(current, SIGCHLD);
+-      schedule();
++      ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
++                               ? 0x80 : 0));
+       /*
+        * this isn't the same as continuing with a signal, but it will do
+diff -Naur a/arch/um/kernel/sigio_kern.c b/arch/um/kernel/sigio_kern.c
+--- a/arch/um/kernel/sigio_kern.c      Fri Aug 15 15:04:52 2003
++++ b/arch/um/kernel/sigio_kern.c      Fri Aug 15 15:10:59 2003
+@@ -6,18 +6,21 @@
+ #include "linux/kernel.h"
+ #include "linux/list.h"
+ #include "linux/slab.h"
+-#include "asm/irq.h"
++#include "linux/signal.h"
++#include "linux/interrupt.h"
+ #include "init.h"
+ #include "sigio.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ /* Protected by sigio_lock() called from write_sigio_workaround */
+ static int sigio_irq_fd = -1;
+-void sigio_interrupt(int irq, void *data, struct pt_regs *unused)
++irqreturn_t sigio_interrupt(int irq, void *data, struct pt_regs *unused)
+ {
+       read_sigio_fd(sigio_irq_fd);
+       reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ);
++      return(IRQ_HANDLED);
+ }
+ int write_sigio_irq(int fd)
+diff -Naur a/arch/um/kernel/signal_kern.c b/arch/um/kernel/signal_kern.c
+--- a/arch/um/kernel/signal_kern.c     Fri Aug 15 15:06:38 2003
++++ b/arch/um/kernel/signal_kern.c     Fri Aug 15 15:12:40 2003
+@@ -36,7 +36,7 @@
+       if(sig == SIGSEGV){
+               struct k_sigaction *ka;
+-              ka = &current->sig->action[SIGSEGV - 1];
++              ka = &current->sighand->action[SIGSEGV - 1];
+               ka->sa.sa_handler = SIG_DFL;
+       }
+       force_sig(SIGSEGV, current);
+@@ -142,7 +142,7 @@
+               return(0);
+       /* Whee!  Actually deliver the signal.  */
+-      ka = &current->sig->action[sig -1 ];
++      ka = &current->sighand->action[sig -1 ];
+       err = handle_signal(regs, sig, ka, &info, oldset, error);
+       if(!err) return(1);
+@@ -201,7 +201,7 @@
+       }
+ }
+-int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize)
++int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
+ {
+       sigset_t saveset, newset;
+@@ -227,6 +227,42 @@
+       }
+ }
++int sys_sigaction(int sig, const struct old_sigaction __user *act,
++                       struct old_sigaction __user *oact)
++{
++      struct k_sigaction new_ka, old_ka;
++      int ret;
++
++      if (act) {
++              old_sigset_t mask;
++              if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
++                  __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
++                  __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
++                      return -EFAULT;
++              __get_user(new_ka.sa.sa_flags, &act->sa_flags);
++              __get_user(mask, &act->sa_mask);
++              siginitset(&new_ka.sa.sa_mask, mask);
++      }
++
++      ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
++
++      if (!ret && oact) {
++              if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
++                  __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
++                  __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
++                      return -EFAULT;
++              __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
++              __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
++      }
++
++      return ret;
++}
++
++int sys_sigaltstack(const stack_t *uss, stack_t *uoss)
++{
++      return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
++}
++
+ static int copy_sc_from_user(struct pt_regs *to, void *from, 
+                            struct arch_frame_data *arch)
+ {
+@@ -239,8 +275,8 @@
+ int sys_sigreturn(struct pt_regs regs)
+ {
+-      void *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
+-      void *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
++      void __user *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
++      void __user *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
+       int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
+       spin_lock_irq(&current->sighand->siglock);
+@@ -257,7 +293,8 @@
+ int sys_rt_sigreturn(struct pt_regs regs)
+ {
+-      struct ucontext *uc = sp_to_uc(PT_REGS_SP(&current->thread.regs));
++      unsigned long sp = PT_REGS_SP(&current->thread.regs);
++      struct ucontext __user *uc = sp_to_uc(sp);
+       void *fp;
+       int sig_size = _NSIG_WORDS * sizeof(unsigned long);
+diff -Naur a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile
+--- a/arch/um/kernel/skas/Makefile     Fri Aug 15 15:05:00 2003
++++ b/arch/um/kernel/skas/Makefile     Fri Aug 15 15:11:08 2003
+@@ -7,18 +7,22 @@
+       process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o \
+       sys-$(SUBARCH)/
++host-progs    := util/mk_ptregs
++clean-files   := include/skas_ptregs.h
++
+ USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o
+ USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+-include/skas_ptregs.h : util/mk_ptregs
+-      util/mk_ptregs > $@
+-
+-util/mk_ptregs :
+-      $(MAKE) -C util
++$(TOPDIR)/arch/um/include/skas_ptregs.h : $(src)/util/mk_ptregs
++      @echo -n '  Generating $@'
++      @$< > $@.tmp
++      @if [ -r $@ ] && cmp -s $@ $@.tmp; then \
++              echo ' (unchanged)'; \
++              rm -f $@.tmp; \
++      else \
++              echo ' (updated)'; \
++              mv -f $@.tmp $@; \
++      fi
+ $(USER_OBJS) : %.o: %.c
+       $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
+-
+-clean :
+-      $(MAKE) -C util clean
+-      $(RM) -f include/skas_ptregs.h
+diff -Naur a/arch/um/kernel/skas/include/mode.h b/arch/um/kernel/skas/include/mode.h
+--- a/arch/um/kernel/skas/include/mode.h       Fri Aug 15 15:06:34 2003
++++ b/arch/um/kernel/skas/include/mode.h       Fri Aug 15 15:12:37 2003
+@@ -20,6 +20,7 @@
+ extern void halt_skas(void);
+ extern void reboot_skas(void);
+ extern void kill_off_processes_skas(void);
++extern int is_skas_winch(int pid, int fd, void *data);
+ #endif
+diff -Naur a/arch/um/kernel/skas/include/uaccess.h b/arch/um/kernel/skas/include/uaccess.h
+--- a/arch/um/kernel/skas/include/uaccess.h    Fri Aug 15 15:05:28 2003
++++ b/arch/um/kernel/skas/include/uaccess.h    Fri Aug 15 15:11:44 2003
+@@ -19,7 +19,7 @@
+ #define access_ok_skas(type, addr, size) \
+       ((segment_eq(get_fs(), KERNEL_DS)) || \
+        (((unsigned long) (addr) < TASK_SIZE) && \
+-        ((unsigned long) (addr) + (size) < TASK_SIZE)))
++        ((unsigned long) (addr) + (size) <= TASK_SIZE)))
+ static inline int verify_area_skas(int type, const void * addr, 
+                                  unsigned long size)
+diff -Naur a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
+--- a/arch/um/kernel/skas/process.c    Fri Aug 15 15:08:54 2003
++++ b/arch/um/kernel/skas/process.c    Fri Aug 15 15:13:46 2003
+@@ -4,6 +4,7 @@
+  */
+ #include <stdlib.h>
++#include <unistd.h>
+ #include <errno.h>
+ #include <signal.h>
+ #include <setjmp.h>
+@@ -24,6 +25,16 @@
+ #include "os.h"
+ #include "proc_mm.h"
+ #include "skas_ptrace.h"
++#include "chan_user.h"
++
++int is_skas_winch(int pid, int fd, void *data)
++{
++      if(pid != getpid())
++              return(0);
++
++      register_winch_irq(-1, fd, -1, data);
++      return(1);
++}
+ unsigned long exec_regs[FRAME_SIZE];
+ unsigned long exec_fp_regs[HOST_FP_SIZE];
+@@ -72,8 +83,6 @@
+       handle_syscall(regs);
+ }
+-int userspace_pid;
+-
+ static int userspace_tramp(void *arg)
+ {
+       init_new_thread_signals(0);
+@@ -83,6 +92,8 @@
+       return(0);
+ }
++int userspace_pid;
++
+ void start_userspace(void)
+ {
+       void *stack;
+@@ -149,6 +160,7 @@
+                       case SIGILL:
+                       case SIGBUS:
+                       case SIGFPE:
++                      case SIGWINCH:
+                               user_signal(WSTOPSIG(status), regs);
+                               break;
+                       default:
+@@ -328,7 +340,8 @@
+ int new_mm(int from)
+ {
+       struct proc_mm_op copy;
+-      int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0);
++      int n, fd = os_open_file("/proc/mm", 
++                               of_cloexec(of_write(OPENFLAGS())), 0);
+       if(fd < 0)
+               return(-errno);
+@@ -342,6 +355,7 @@
+                       printk("new_mm : /proc/mm copy_segments failed, "
+                              "errno = %d\n", errno);
+       }
++
+       return(fd);
+ }
+diff -Naur a/arch/um/kernel/skas/process_kern.c b/arch/um/kernel/skas/process_kern.c
+--- a/arch/um/kernel/skas/process_kern.c       Fri Aug 15 15:04:51 2003
++++ b/arch/um/kernel/skas/process_kern.c       Fri Aug 15 15:10:56 2003
+@@ -61,9 +61,8 @@
+       thread_wait(&current->thread.mode.skas.switch_buf, 
+                   current->thread.mode.skas.fork_buf);
+-#ifdef CONFIG_SMP
+-      schedule_tail(NULL);
+-#endif
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
+       current->thread.prev_sched = NULL;
+       n = run_kernel_thread(fn, arg, &current->thread.exec_buf);
+@@ -93,9 +92,8 @@
+                   current->thread.mode.skas.fork_buf);
+       
+       force_flush_all();
+-#ifdef CONFIG_SMP
+-      schedule_tail(current->thread.prev_sched);
+-#endif
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
+       current->thread.prev_sched = NULL;
+       unblock_signals();
+@@ -164,7 +162,7 @@
+       capture_signal_stack();
+       init_new_thread_signals(1);
+-      idle_timer();
++      uml_idle_timer();
+       init_task.thread.request.u.thread.proc = start_kernel_proc;
+       init_task.thread.request.u.thread.arg = NULL;
+diff -Naur a/arch/um/kernel/skas/util/mk_ptregs.c b/arch/um/kernel/skas/util/mk_ptregs.c
+--- a/arch/um/kernel/skas/util/mk_ptregs.c     Fri Aug 15 15:05:20 2003
++++ b/arch/um/kernel/skas/util/mk_ptregs.c     Fri Aug 15 15:11:21 2003
+@@ -1,3 +1,4 @@
++#include <stdio.h>
+ #include <asm/ptrace.h>
+ #include <asm/user.h>
+diff -Naur a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
+--- a/arch/um/kernel/smp.c     Fri Aug 15 15:04:50 2003
++++ b/arch/um/kernel/smp.c     Fri Aug 15 15:10:52 2003
+@@ -140,8 +140,10 @@
+         current->thread.request.u.thread.proc = idle_proc;
+         current->thread.request.u.thread.arg = (void *) cpu;
+-      new_task = do_fork(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, NULL);
+-      if(IS_ERR(new_task)) panic("do_fork failed in idle_thread");
++      new_task = copy_process(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, 
++                              NULL);
++      if(IS_ERR(new_task)) 
++              panic("copy_process failed in idle_thread");
+       cpu_tasks[cpu] = ((struct cpu_task) 
+                         { .pid =      new_task->thread.mode.tt.extern_pid,
+@@ -150,6 +152,7 @@
+       CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, 
+                         sizeof(c)),
+                   ({ panic("skas mode doesn't support SMP"); }));
++      wake_up_forked_process(new_task);
+       return(new_task);
+ }
+@@ -254,15 +257,19 @@
+       atomic_inc(&scf_finished);
+ }
+-int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, 
+-                    int wait)
++int smp_call_function_on_cpu(void (*_func)(void *info), void *_info, int wait,
++                              unsigned long mask)
+ {
+-      int cpus = num_online_cpus() - 1;
+-      int i;
+-
+-      if (!cpus)
+-              return 0;
++      int i, cpu, num_cpus;
++      cpu = get_cpu();
++      mask &= ~(1UL << cpu);
++      num_cpus = hweight32(mask);
++      if(num_cpus == 0){
++              put_cpu_no_resched();
++              return(0);
++      }
++      
+       spin_lock_bh(&call_lock);
+       atomic_set(&scf_started, 0);
+       atomic_set(&scf_finished, 0);
+@@ -270,19 +277,25 @@
+       info = _info;
+       for (i=0;i<NR_CPUS;i++)
+-              if((i != current->thread_info->cpu) && 
+-                 test_bit(i, &cpu_online_map))
++              if(cpu_online(i) && ((1UL << i) & mask))
+                       write(cpu_data[i].ipi_pipe[1], "C", 1);
+-      while (atomic_read(&scf_started) != cpus)
++      while(atomic_read(&scf_started) != num_cpus)
+               barrier();
+-      if (wait)
+-              while (atomic_read(&scf_finished) != cpus)
++      if(wait)
++              while(atomic_read(&scf_finished) != num_cpus)
+                       barrier();
+       spin_unlock_bh(&call_lock);
+-      return 0;
++      put_cpu_no_resched();
++      return(0);
++}
++
++int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, 
++                    int wait)
++{
++      return(smp_call_function_on_cpu(_func, _info, wait, cpu_online_map));
+ }
+ #endif
+diff -Naur a/arch/um/kernel/sys_call_table.c b/arch/um/kernel/sys_call_table.c
+--- a/arch/um/kernel/sys_call_table.c  Fri Aug 15 15:07:57 2003
++++ b/arch/um/kernel/sys_call_table.c  Fri Aug 15 15:13:24 2003
+@@ -219,6 +219,18 @@
+ extern syscall_handler_t sys_gettid;
+ extern syscall_handler_t sys_readahead;
+ extern syscall_handler_t sys_tkill;
++extern syscall_handler_t sys_setxattr;
++extern syscall_handler_t sys_lsetxattr;
++extern syscall_handler_t sys_fsetxattr;
++extern syscall_handler_t sys_getxattr;
++extern syscall_handler_t sys_lgetxattr;
++extern syscall_handler_t sys_fgetxattr;
++extern syscall_handler_t sys_listxattr;
++extern syscall_handler_t sys_llistxattr;
++extern syscall_handler_t sys_flistxattr;
++extern syscall_handler_t sys_removexattr;
++extern syscall_handler_t sys_lremovexattr;
++extern syscall_handler_t sys_fremovexattr;
+ extern syscall_handler_t sys_sendfile64;
+ extern syscall_handler_t sys_futex;
+ extern syscall_handler_t sys_sched_setaffinity;
+@@ -235,6 +247,19 @@
+ extern syscall_handler_t sys_epoll_wait;
+ extern syscall_handler_t sys_remap_file_pages;
+ extern syscall_handler_t sys_set_tid_address;
++extern syscall_handler_t sys_timer_create;
++extern syscall_handler_t sys_timer_settime;
++extern syscall_handler_t sys_timer_gettime;
++extern syscall_handler_t sys_timer_getoverrun;
++extern syscall_handler_t sys_timer_delete;
++extern syscall_handler_t sys_clock_settime;
++extern syscall_handler_t sys_clock_gettime;
++extern syscall_handler_t sys_clock_getres;
++extern syscall_handler_t sys_clock_nanosleep;
++extern syscall_handler_t sys_statfs64;
++extern syscall_handler_t sys_fstatfs64;
++extern syscall_handler_t sys_tgkill;
++extern syscall_handler_t sys_utimes;
+ #ifdef CONFIG_NFSD
+ #define NFSSERVCTL sys_nfsservctl
+@@ -459,18 +484,18 @@
+       [ __NR_getdents64 ] = sys_getdents64,
+       [ __NR_gettid ] = sys_gettid,
+       [ __NR_readahead ] = sys_readahead,
+-      [ __NR_setxattr ] = sys_ni_syscall,
+-      [ __NR_lsetxattr ] = sys_ni_syscall,
+-      [ __NR_fsetxattr ] = sys_ni_syscall,
+-      [ __NR_getxattr ] = sys_ni_syscall,
+-      [ __NR_lgetxattr ] = sys_ni_syscall,
+-      [ __NR_fgetxattr ] = sys_ni_syscall,
+-      [ __NR_listxattr ] = sys_ni_syscall,
+-      [ __NR_llistxattr ] = sys_ni_syscall,
+-      [ __NR_flistxattr ] = sys_ni_syscall,
+-      [ __NR_removexattr ] = sys_ni_syscall,
+-      [ __NR_lremovexattr ] = sys_ni_syscall,
+-      [ __NR_fremovexattr ] = sys_ni_syscall,
++      [ __NR_setxattr ] = sys_setxattr,
++      [ __NR_lsetxattr ] = sys_lsetxattr,
++      [ __NR_fsetxattr ] = sys_fsetxattr,
++      [ __NR_getxattr ] = sys_getxattr,
++      [ __NR_lgetxattr ] = sys_lgetxattr,
++      [ __NR_fgetxattr ] = sys_fgetxattr,
++      [ __NR_listxattr ] = sys_listxattr,
++      [ __NR_llistxattr ] = sys_llistxattr,
++      [ __NR_flistxattr ] = sys_flistxattr,
++      [ __NR_removexattr ] = sys_removexattr,
++      [ __NR_lremovexattr ] = sys_lremovexattr,
++      [ __NR_fremovexattr ] = sys_fremovexattr,
+       [ __NR_tkill ] = sys_tkill,
+       [ __NR_sendfile64 ] = sys_sendfile64,
+       [ __NR_futex ] = sys_futex,
+@@ -488,6 +513,19 @@
+       [ __NR_epoll_wait ] = sys_epoll_wait,
+         [ __NR_remap_file_pages ] = sys_remap_file_pages,
+         [ __NR_set_tid_address ] = sys_set_tid_address,
++      [ __NR_timer_create ] = sys_timer_create,
++      [ __NR_timer_settime ] = sys_timer_settime,
++      [ __NR_timer_gettime ] = sys_timer_gettime,
++      [ __NR_timer_getoverrun ] = sys_timer_getoverrun,
++      [ __NR_timer_delete ] = sys_timer_delete,
++      [ __NR_clock_settime ] = sys_clock_settime,
++      [ __NR_clock_gettime ] = sys_clock_gettime,
++      [ __NR_clock_getres ] = sys_clock_getres,
++      [ __NR_clock_nanosleep ] = sys_clock_nanosleep,
++      [ __NR_statfs64 ] = sys_statfs64,
++      [ __NR_fstatfs64 ] = sys_fstatfs64,
++      [ __NR_tgkill ] = sys_tgkill,
++      [ __NR_utimes ] = sys_utimes,
+       ARCH_SYSCALLS
+       [ LAST_SYSCALL + 1 ... NR_syscalls ] = 
+diff -Naur a/arch/um/kernel/syscall_kern.c b/arch/um/kernel/syscall_kern.c
+--- a/arch/um/kernel/syscall_kern.c    Fri Aug 15 15:07:37 2003
++++ b/arch/um/kernel/syscall_kern.c    Fri Aug 15 15:13:03 2003
+@@ -35,39 +35,40 @@
+ long sys_fork(void)
+ {
+-      struct task_struct *p;
++      long ret;
+       current->thread.forking = 1;
+-        p = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL);
++        ret = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL);
+       current->thread.forking = 0;
+-      return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
++      return(ret);
+ }
+-long sys_clone(unsigned long clone_flags, unsigned long newsp)
++long sys_clone(unsigned long clone_flags, unsigned long newsp, 
++             int *parent_tid, int *child_tid)
+ {
+-      struct task_struct *p;
++      long ret;
+       current->thread.forking = 1;
+-      p = do_fork(clone_flags, newsp, NULL, 0, NULL, NULL);
++      ret = do_fork(clone_flags, newsp, NULL, 0, parent_tid, child_tid);
+       current->thread.forking = 0;
+-      return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
++      return(ret);
+ }
+ long sys_vfork(void)
+ {
+-      struct task_struct *p;
++      long ret;
+       current->thread.forking = 1;
+-      p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, NULL);
++      ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, 
++                    NULL);
+       current->thread.forking = 0;
+-      return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
++      return(ret);
+ }
+ /* common code for old and new mmaps */
+-static inline long do_mmap2(
+-      unsigned long addr, unsigned long len,
+-      unsigned long prot, unsigned long flags,
+-      unsigned long fd, unsigned long pgoff)
++long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len,
++            unsigned long prot, unsigned long flags, unsigned long fd,
++            unsigned long pgoff)
+ {
+       int error = -EBADF;
+       struct file * file = NULL;
+@@ -79,9 +80,9 @@
+                       goto out;
+       }
+-      down_write(&current->mm->mmap_sem);
+-      error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+-      up_write(&current->mm->mmap_sem);
++      down_write(&mm->mmap_sem);
++      error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff);
++      up_write(&mm->mmap_sem);
+       if (file)
+               fput(file);
+@@ -93,7 +94,7 @@
+              unsigned long prot, unsigned long flags,
+              unsigned long fd, unsigned long pgoff)
+ {
+-      return do_mmap2(addr, len, prot, flags, fd, pgoff);
++      return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff);
+ }
+ /*
+@@ -120,7 +121,8 @@
+       if (offset & ~PAGE_MASK)
+               goto out;
+-      err = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT);
++      err = do_mmap2(current->mm, addr, len, prot, flags, fd, 
++                     offset >> PAGE_SHIFT);
+  out:
+       return err;
+ }
+@@ -141,37 +143,6 @@
+         return error;
+ }
+-int sys_sigaction(int sig, const struct old_sigaction *act,
+-                       struct old_sigaction *oact)
+-{
+-      struct k_sigaction new_ka, old_ka;
+-      int ret;
+-
+-      if (act) {
+-              old_sigset_t mask;
+-              if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
+-                  __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+-                  __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+-                      return -EFAULT;
+-              __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+-              __get_user(mask, &act->sa_mask);
+-              siginitset(&new_ka.sa.sa_mask, mask);
+-      }
+-
+-      ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+-
+-      if (!ret && oact) {
+-              if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
+-                  __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+-                  __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+-                      return -EFAULT;
+-              __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+-              __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+-      }
+-
+-      return ret;
+-}
+-
+ /*
+  * sys_ipc() is the de-multiplexer for the SysV IPC calls..
+  *
+@@ -253,7 +224,7 @@
+               return sys_shmctl (first, second,
+                                  (struct shmid_ds *) ptr);
+       default:
+-              return -EINVAL;
++              return -ENOSYS;
+       }
+ }
+@@ -302,11 +273,6 @@
+       return error;
+ }
+-int sys_sigaltstack(const stack_t *uss, stack_t *uoss)
+-{
+-      return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
+-}
+-
+ long execute_syscall(void *r)
+ {
+       return(CHOOSE_MODE_PROC(execute_syscall_tt, execute_syscall_skas, r));
+diff -Naur a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
+--- a/arch/um/kernel/sysrq.c   Fri Aug 15 15:05:01 2003
++++ b/arch/um/kernel/sysrq.c   Fri Aug 15 15:11:13 2003
+@@ -11,6 +11,14 @@
+ #include "sysrq.h"
+ #include "user_util.h"
++void show_stack(struct task_struct *task, unsigned long *sp)
++{
++      if(task)
++              show_trace_task(task);
++      else
++              show_trace(sp);
++}
++
+ void show_trace(unsigned long * stack)
+ {
+         int i;
+diff -Naur a/arch/um/kernel/time.c b/arch/um/kernel/time.c
+--- a/arch/um/kernel/time.c    Fri Aug 15 15:04:49 2003
++++ b/arch/um/kernel/time.c    Fri Aug 15 15:10:46 2003
+@@ -15,12 +15,16 @@
+ #include "process.h"
+ #include "signal_user.h"
+ #include "time_user.h"
++#include "kern_constants.h"
+ extern struct timeval xtime;
++struct timeval local_offset = { 0, 0 };
++
+ void timer(void)
+ {
+       gettimeofday(&xtime, NULL);
++      timeradd(&xtime, &local_offset, &xtime);
+ }
+ void set_interval(int timer_type)
+@@ -65,7 +69,7 @@
+                      errno);
+ }
+-void idle_timer(void)
++void uml_idle_timer(void)
+ {
+       if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR)
+               panic("Couldn't unset SIGVTALRM handler");
+@@ -82,8 +86,6 @@
+       set_interval(ITIMER_VIRTUAL);
+ }
+-struct timeval local_offset = { 0, 0 };
+-
+ void do_gettimeofday(struct timeval *tv)
+ {
+       unsigned long flags;
+@@ -100,7 +102,7 @@
+       unsigned long flags;
+       struct timeval tv_in;
+-      if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
++      if ((unsigned long) tv->tv_nsec >= UM_NSEC_PER_SEC)
+               return -EINVAL;
+       tv_in.tv_sec = tv->tv_sec;
+@@ -110,6 +112,8 @@
+       gettimeofday(&now, NULL);
+       timersub(&tv_in, &now, &local_offset);
+       time_unlock(flags);
++
++      return(0);
+ }
+ void idle_sleep(int secs)
+diff -Naur a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c
+--- a/arch/um/kernel/time_kern.c       Fri Aug 15 15:07:19 2003
++++ b/arch/um/kernel/time_kern.c       Fri Aug 15 15:12:46 2003
+@@ -55,12 +55,13 @@
+       do_timer(&regs);
+ }
+-void um_timer(int irq, void *dev, struct pt_regs *regs)
++irqreturn_t um_timer(int irq, void *dev, struct pt_regs *regs)
+ {
+       do_timer(regs);
+-      write_seqlock(&xtime_lock);
++      write_seqlock_irq(&xtime_lock);
+       timer();
+-      write_sequnlock(&xtime_lock);
++      write_sequnlock_irq(&xtime_lock);
++      return(IRQ_HANDLED);
+ }
+ long um_time(int * tloc)
+@@ -78,12 +79,12 @@
+ long um_stime(int * tptr)
+ {
+       int value;
+-      struct timeval new;
++      struct timespec new;
+       if (get_user(value, tptr))
+                 return -EFAULT;
+       new.tv_sec = value;
+-      new.tv_usec = 0;
++      new.tv_nsec = 0;
+       do_settimeofday(&new);
+       return 0;
+ }
+@@ -122,7 +123,9 @@
+ void timer_handler(int sig, union uml_pt_regs *regs)
+ {
+ #ifdef CONFIG_SMP
++      local_irq_disable();
+       update_process_times(user_context(UPT_SP(regs)));
++      local_irq_enable();
+ #endif
+       if(current->thread_info->cpu == 0)
+               timer_irq(regs);
+diff -Naur a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c
+--- a/arch/um/kernel/trap_kern.c       Fri Aug 15 15:04:01 2003
++++ b/arch/um/kernel/trap_kern.c       Fri Aug 15 15:10:18 2003
+@@ -16,6 +16,7 @@
+ #include "asm/tlbflush.h"
+ #include "asm/a.out.h"
+ #include "asm/current.h"
++#include "asm/irq.h"
+ #include "user_util.h"
+ #include "kern_util.h"
+ #include "kern.h"
+@@ -180,6 +181,11 @@
+       else relay_signal(sig, regs);
+ }
++void winch(int sig, union uml_pt_regs *regs)
++{
++      do_IRQ(WINCH_IRQ, regs);
++}
++
+ void trap_init(void)
+ {
+ }
+diff -Naur a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c
+--- a/arch/um/kernel/trap_user.c       Fri Aug 15 15:05:45 2003
++++ b/arch/um/kernel/trap_user.c       Fri Aug 15 15:11:52 2003
+@@ -82,6 +82,8 @@
+                    .is_irq            = 0 },
+       [ SIGILL ] { .handler           = relay_signal,
+                    .is_irq            = 0 },
++      [ SIGWINCH ] { .handler         = winch,
++                     .is_irq          = 1 },
+       [ SIGBUS ] { .handler           = bus_handler,
+                    .is_irq            = 0 },
+       [ SIGSEGV] { .handler           = segv_handler,
+diff -Naur a/arch/um/kernel/tt/include/uaccess.h b/arch/um/kernel/tt/include/uaccess.h
+--- a/arch/um/kernel/tt/include/uaccess.h      Fri Aug 15 15:07:25 2003
++++ b/arch/um/kernel/tt/include/uaccess.h      Fri Aug 15 15:12:52 2003
+@@ -46,18 +46,20 @@
+ static inline int copy_from_user_tt(void *to, const void *from, int n)
+ {
+-      return(access_ok_tt(VERIFY_READ, from, n) ?
+-             __do_copy_from_user(to, from, n, 
+-                                 &current->thread.fault_addr,
+-                                 &current->thread.fault_catcher) : n);
++      if(!access_ok_tt(VERIFY_READ, from, n)) 
++              return(n);
++
++      return(__do_copy_from_user(to, from, n, &current->thread.fault_addr,
++                                 &current->thread.fault_catcher));
+ }
+ static inline int copy_to_user_tt(void *to, const void *from, int n)
+ {
+-      return(access_ok_tt(VERIFY_WRITE, to, n) ?
+-             __do_copy_to_user(to, from, n, 
+-                                 &current->thread.fault_addr,
+-                                 &current->thread.fault_catcher) : n);
++      if(!access_ok_tt(VERIFY_WRITE, to, n))
++              return(n);
++              
++      return(__do_copy_to_user(to, from, n, &current->thread.fault_addr,
++                               &current->thread.fault_catcher));
+ }
+ extern int __do_strncpy_from_user(char *dst, const char *src, size_t n,
+@@ -67,7 +69,9 @@
+ {
+       int n;
+-      if(!access_ok_tt(VERIFY_READ, src, 1)) return(-EFAULT);
++      if(!access_ok_tt(VERIFY_READ, src, 1)) 
++              return(-EFAULT);
++
+       n = __do_strncpy_from_user(dst, src, count, 
+                                  &current->thread.fault_addr,
+                                  &current->thread.fault_catcher);
+@@ -87,10 +91,11 @@
+ static inline int clear_user_tt(void *mem, int len)
+ {
+-      return(access_ok_tt(VERIFY_WRITE, mem, len) ? 
+-             __do_clear_user(mem, len, 
+-                             &current->thread.fault_addr,
+-                             &current->thread.fault_catcher) : len);
++      if(!access_ok_tt(VERIFY_WRITE, mem, len))
++              return(len);
++
++      return(__do_clear_user(mem, len, &current->thread.fault_addr,
++                             &current->thread.fault_catcher));
+ }
+ extern int __do_strnlen_user(const char *str, unsigned long n,
+diff -Naur a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c
+--- a/arch/um/kernel/tt/process_kern.c Fri Aug 15 15:07:55 2003
++++ b/arch/um/kernel/tt/process_kern.c Fri Aug 15 15:13:23 2003
+@@ -104,7 +104,10 @@
+ void release_thread_tt(struct task_struct *task)
+ {
+-      os_kill_process(task->thread.mode.tt.extern_pid, 0);
++      int pid = task->thread.mode.tt.extern_pid;
++
++      if(os_getpid() != pid)
++              os_kill_process(pid, 0);
+ }
+ void exit_thread_tt(void)
+@@ -125,27 +128,27 @@
+       UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
+       suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);
+-      block_signals();
++      force_flush_all();
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++
+       init_new_thread_signals(1);
+-#ifdef CONFIG_SMP
+-      schedule_tail(current->thread.prev_sched);
+-#endif
+       enable_timer();
+       free_page(current->thread.temp_stack);
+       set_cmdline("(kernel thread)");
+-      force_flush_all();
+-      current->thread.prev_sched = NULL;
+       change_sig(SIGUSR1, 1);
+       change_sig(SIGVTALRM, 1);
+       change_sig(SIGPROF, 1);
+-      unblock_signals();
++      local_irq_enable();
+       if(!run_kernel_thread(fn, arg, &current->thread.exec_buf))
+               do_exit(0);
+ }
+ static int new_thread_proc(void *stack)
+ {
++      local_irq_disable();
+       init_new_thread_stack(stack, new_thread_handler);
+       os_usr1_process(os_getpid());
+       return(0);
+@@ -165,35 +168,32 @@
+       UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
+       suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);
+-#ifdef CONFIG_SMP     
+-      schedule_tail(NULL);
+-#endif
++      force_flush_all();
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++
+       enable_timer();
+       change_sig(SIGVTALRM, 1);
+       local_irq_enable();
+-      force_flush_all();
+       if(current->mm != current->parent->mm)
+               protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 
+                              1, 0, 1);
+       task_protections((unsigned long) current->thread_info);
+-      current->thread.prev_sched = NULL;
+-
+       free_page(current->thread.temp_stack);
++      local_irq_disable();
+       change_sig(SIGUSR1, 0);
+       set_user_mode(current);
+ }
+-static int sigusr1 = SIGUSR1;
+-
+ int fork_tramp(void *stack)
+ {
+-      int sig = sigusr1;
+-
+       local_irq_disable();
++      arch_init_thread();
+       init_new_thread_stack(stack, finish_fork_handler);
+-      kill(os_getpid(), sig);
++      os_usr1_process(os_getpid());
+       return(0);
+ }
+diff -Naur a/arch/um/kernel/tt/ptproxy/proxy.c b/arch/um/kernel/tt/ptproxy/proxy.c
+--- a/arch/um/kernel/tt/ptproxy/proxy.c        Fri Aug 15 15:07:01 2003
++++ b/arch/um/kernel/tt/ptproxy/proxy.c        Fri Aug 15 15:12:44 2003
+@@ -293,10 +293,10 @@
+ }
+ char gdb_init_string[] = 
+-"att 1
+-b panic
+-b stop
+-handle SIGWINCH nostop noprint pass
++"att 1 \n\
++b panic \n\
++b stop \n\
++handle SIGWINCH nostop noprint pass \n\
+ ";
+ int start_debugger(char *prog, int startup, int stop, int *fd_out)
+diff -Naur a/arch/um/kernel/tt/tracer.c b/arch/um/kernel/tt/tracer.c
+--- a/arch/um/kernel/tt/tracer.c       Fri Aug 15 15:03:51 2003
++++ b/arch/um/kernel/tt/tracer.c       Fri Aug 15 15:10:12 2003
+@@ -39,7 +39,7 @@
+               return(0);
+       register_winch_irq(tracer_winch[0], fd, -1, data);
+-      return(0);
++      return(1);
+ }
+ static void tracer_winch_handler(int sig)
+@@ -401,7 +401,7 @@
+               
+               if(!strcmp(line, "go")) debug_stop = 0;
+               else if(!strcmp(line, "parent")) debug_parent = 1;
+-              else printk("Unknown debug option : '%s'\n", line);
++              else printf("Unknown debug option : '%s'\n", line);
+               line = next;
+       }
+diff -Naur a/arch/um/kernel/tt/uaccess_user.c b/arch/um/kernel/tt/uaccess_user.c
+--- a/arch/um/kernel/tt/uaccess_user.c Fri Aug 15 15:05:00 2003
++++ b/arch/um/kernel/tt/uaccess_user.c Fri Aug 15 15:11:10 2003
+@@ -8,15 +8,20 @@
+ #include <string.h>
+ #include "user_util.h"
+ #include "uml_uaccess.h"
++#include "task.h"
++#include "kern_util.h"
+ int __do_copy_from_user(void *to, const void *from, int n,
+                       void **fault_addr, void **fault_catcher)
+ {
++      struct tt_regs save = TASK_REGS(get_current())->tt;
+       unsigned long fault;
+       int faulted;
+       fault = __do_user_copy(to, from, n, fault_addr, fault_catcher,
+                              __do_copy, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
+       if(!faulted) return(0);
+       else return(n - (fault - (unsigned long) from));
+ }
+@@ -29,11 +34,14 @@
+ int __do_strncpy_from_user(char *dst, const char *src, unsigned long count,
+                          void **fault_addr, void **fault_catcher)
+ {
++      struct tt_regs save = TASK_REGS(get_current())->tt;
+       unsigned long fault;
+       int faulted;
+       fault = __do_user_copy(dst, src, count, fault_addr, fault_catcher,
+                              __do_strncpy, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
+       if(!faulted) return(strlen(dst));
+       else return(-1);
+ }
+@@ -46,11 +54,14 @@
+ int __do_clear_user(void *mem, unsigned long len,
+                   void **fault_addr, void **fault_catcher)
+ {
++      struct tt_regs save = TASK_REGS(get_current())->tt;
+       unsigned long fault;
+       int faulted;
+       fault = __do_user_copy(mem, NULL, len, fault_addr, fault_catcher,
+                              __do_clear, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
+       if(!faulted) return(0);
+       else return(len - (fault - (unsigned long) mem));
+ }
+@@ -58,6 +69,7 @@
+ int __do_strnlen_user(const char *str, unsigned long n,
+                     void **fault_addr, void **fault_catcher)
+ {
++      struct tt_regs save = TASK_REGS(get_current())->tt;
+       int ret;
+       unsigned long *faddrp = (unsigned long *)fault_addr;
+       jmp_buf jbuf;
+@@ -71,6 +83,8 @@
+       }
+       *fault_addr = NULL;
+       *fault_catcher = NULL;
++
++      TASK_REGS(get_current())->tt = save;
+       return ret;
+ }
+diff -Naur a/arch/um/kernel/tty_log.c b/arch/um/kernel/tty_log.c
+--- a/arch/um/kernel/tty_log.c Fri Aug 15 15:07:04 2003
++++ b/arch/um/kernel/tty_log.c Fri Aug 15 15:12:44 2003
+@@ -13,6 +13,7 @@
+ #include <sys/time.h>
+ #include "init.h"
+ #include "user.h"
++#include "kern_util.h"
+ #include "os.h"
+ #define TTY_LOG_DIR "./"
+@@ -24,29 +25,40 @@
+ #define TTY_LOG_OPEN 1
+ #define TTY_LOG_CLOSE 2
+ #define TTY_LOG_WRITE 3
++#define TTY_LOG_EXEC 4
++
++#define TTY_READ 1
++#define TTY_WRITE 2
+ struct tty_log_buf {
+       int what;
+       unsigned long tty;
+       int len;
++      int direction;
++      unsigned long sec;
++      unsigned long usec;
+ };
+-int open_tty_log(void *tty)
++int open_tty_log(void *tty, void *current_tty)
+ {
+       struct timeval tv;
+       struct tty_log_buf data;
+       char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")];
+       int fd;
++      gettimeofday(&tv, NULL);
+       if(tty_log_fd != -1){
+-              data = ((struct tty_log_buf) { what :   TTY_LOG_OPEN,
+-                                             tty : (unsigned long) tty,
+-                                             len : 0 });
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_OPEN,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = sizeof(current_tty),
++                                             .direction = 0,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
+               write(tty_log_fd, &data, sizeof(data));
++              write(tty_log_fd, &current_tty, data.len);
+               return(tty_log_fd);
+       }
+-      gettimeofday(&tv, NULL);
+       sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, 
+               (unsigned int) tv.tv_usec);
+@@ -62,30 +74,114 @@
+ void close_tty_log(int fd, void *tty)
+ {
+       struct tty_log_buf data;
++      struct timeval tv;
+       if(tty_log_fd != -1){
+-              data = ((struct tty_log_buf) { what :   TTY_LOG_CLOSE,
+-                                             tty : (unsigned long) tty,
+-                                             len : 0 });
++              gettimeofday(&tv, NULL);
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_CLOSE,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = 0,
++                                             .direction = 0,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
+               write(tty_log_fd, &data, sizeof(data));
+               return;
+       }
+       close(fd);
+ }
+-int write_tty_log(int fd, char *buf, int len, void *tty)
++static int log_chunk(int fd, const char *buf, int len)
+ {
++      int total = 0, try, missed, n;
++      char chunk[64];
++      /* Stage buf through chunk piece by piece; returns bytes logged or -errno. */
++      while(len > 0){
++              try = (len > sizeof(chunk)) ? sizeof(chunk) : len;
++              missed = copy_from_user_proc(chunk, (char *) buf, try);
++              try -= missed;  /* missed is treated as the uncopied byte count */
++              n = write(fd, chunk, try);
++              if(n != try)    /* errno is only meaningful when write() failed */
++                      return((n < 0) ? -errno : -EIO);
++              if(missed != 0)
++                      return(-EFAULT);
++              /* Piece fully logged; advance to the next one. */
++              len -= try;
++              total += try;
++              buf += try;
++      }
++      /* Every requested byte was copied and written. */
++      return(total);
++}
++
++int write_tty_log(int fd, const char *buf, int len, void *tty, int is_read)
++{
++      struct timeval tv;
+       struct tty_log_buf data;
++      int direction;
+       if(fd == tty_log_fd){
+-              data = ((struct tty_log_buf) { what :   TTY_LOG_WRITE,
+-                                             tty : (unsigned long) tty,
+-                                             len : len });
++              gettimeofday(&tv, NULL);
++              direction = is_read ? TTY_READ : TTY_WRITE;
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_WRITE,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = len,
++                                             .direction = direction,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
+               write(tty_log_fd, &data, sizeof(data));
+       }
+-      return(write(fd, buf, len));
++
++      return(log_chunk(fd, buf, len));
+ }
++void log_exec(char **argv, void *tty)
++{
++      struct timeval tv;
++      struct tty_log_buf data;
++      char **ptr,*arg;
++      int len;
++      /* Log an exec: one TTY_LOG_EXEC header, then each argv string. */
++      if(tty_log_fd == -1) return;    /* no shared log fd (-1): nothing to do */
++      /* Timestamp that goes into the record header. */
++      gettimeofday(&tv, NULL);
++      /* Pass 1: sum strlen_user_proc() over argv for the header's len. */
++      len = 0;
++      for(ptr = argv; ; ptr++){
++              if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
++                      return; /* argv slot unreadable: give up silently */
++              if(arg == NULL) break;
++              len += strlen_user_proc(arg);
++      }
++      /* Emit the header before any argument bytes. */
++      data = ((struct tty_log_buf) { .what    = TTY_LOG_EXEC,
++                                     .tty  = (unsigned long) tty,
++                                     .len  = len,
++                                     .direction = 0,
++                                     .sec = tv.tv_sec,
++                                     .usec = tv.tv_usec } );
++      write(tty_log_fd, &data, sizeof(data));
++      /* Pass 2: re-walk argv and append each string via log_chunk(). */
++      for(ptr = argv; ; ptr++){
++              if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
++                      return;
++              if(arg == NULL) break;
++              log_chunk(tty_log_fd, arg, strlen_user_proc(arg));
++      }
++}
++
++extern void register_tty_logger(int (*opener)(void *, void *),
++                              int (*writer)(int, const char *, int, 
++                                            void *, int),
++                              void (*closer)(int, void *));
++/* Initcall: hand this file's open/write/close handlers to the tty logger. */
++static int register_logger(void)
++{
++      register_tty_logger(open_tty_log, write_tty_log, close_tty_log);
++      return(0);
++}
++
++__uml_initcall(register_logger);
++
+ static int __init set_tty_log_dir(char *name, int *add)
+ {
+       tty_log_dir = name;
+@@ -104,7 +200,7 @@
+       tty_log_fd = strtoul(name, &end, 0);
+       if((*end != '\0') || (end == name)){
+-              printk("set_tty_log_fd - strtoul failed on '%s'\n", name);
++              printf("set_tty_log_fd - strtoul failed on '%s'\n", name);
+               tty_log_fd = -1;
+       }
+       return 0;
+diff -Naur a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
+--- a/arch/um/kernel/um_arch.c Fri Aug 15 15:07:48 2003
++++ b/arch/um/kernel/um_arch.c Fri Aug 15 15:13:14 2003
+@@ -38,13 +38,18 @@
+ #include "mode_kern.h"
+ #include "mode.h"
+-#define DEFAULT_COMMAND_LINE "root=6200"
++#define DEFAULT_COMMAND_LINE "root=ubd0"
+ struct cpuinfo_um boot_cpu_data = { 
+       .loops_per_jiffy        = 0,
+       .ipi_pipe               = { -1, -1 }
+ };
++/* Placeholder to make UML link until the vsyscall stuff is actually 
++ * implemented
++ */
++void *__kernel_vsyscall;
++
+ unsigned long thread_saved_pc(struct task_struct *task)
+ {
+       return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas,
+@@ -61,10 +66,14 @@
+               return 0;
+ #endif
+-      seq_printf(m, "bogomips\t: %lu.%02lu\n",
++      seq_printf(m, "processor\t: %d\n", index);
++      seq_printf(m, "vendor_id\t: User Mode Linux\n");
++      seq_printf(m, "model name\t: UML\n");
++      seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas"));
++      seq_printf(m, "host\t\t: %s\n", host_info);
++      seq_printf(m, "bogomips\t: %lu.%02lu\n\n",
+                  loops_per_jiffy/(500000/HZ),
+                  (loops_per_jiffy/(5000/HZ)) % 100);
+-      seq_printf(m, "host\t\t: %s\n", host_info);
+       return(0);
+ }
+@@ -134,12 +143,12 @@
+       if(umid != NULL){
+               snprintf(argv1_begin, 
+                        (argv1_end - argv1_begin) * sizeof(*ptr), 
+-                       "(%s)", umid);
++                       "(%s) ", umid);
+               ptr = &argv1_begin[strlen(argv1_begin)];
+       }
+       else ptr = argv1_begin;
+-      snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), " [%s]", cmd);
++      snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd);
+       memset(argv1_begin + strlen(argv1_begin), '\0', 
+              argv1_end - argv1_begin - strlen(argv1_begin));
+ #endif
+@@ -179,7 +188,7 @@
+ static int __init uml_ncpus_setup(char *line, int *add)
+ {
+        if (!sscanf(line, "%d", &ncpus)) {
+-               printk("Couldn't parse [%s]\n", line);
++               printf("Couldn't parse [%s]\n", line);
+                return -1;
+        }
+@@ -210,7 +219,7 @@
+ static int __init mode_tt_setup(char *line, int *add)
+ {
+-      printk("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
++      printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
+       return(0);
+ }
+@@ -221,7 +230,7 @@
+ static int __init mode_tt_setup(char *line, int *add)
+ {
+-      printk("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
++      printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
+       return(0);
+ }
+@@ -369,6 +378,7 @@
+               2 * PAGE_SIZE;
+       task_protections((unsigned long) &init_thread_info);
++      os_flush_stdout();
+       return(CHOOSE_MODE(start_uml_tt(), start_uml_skas()));
+ }
+diff -Naur a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
+--- a/arch/um/kernel/umid.c    Fri Aug 15 15:08:44 2003
++++ b/arch/um/kernel/umid.c    Fri Aug 15 15:13:39 2003
+@@ -33,18 +33,19 @@
+ static int umid_is_random = 1;
+ static int umid_inited = 0;
+-static int make_umid(void);
++static int make_umid(int (*printer)(const char *fmt, ...));
+-static int __init set_umid(char *name, int is_random)
++static int __init set_umid(char *name, int is_random, 
++                         int (*printer)(const char *fmt, ...))
+ {
+       if(umid_inited){
+-              printk("Unique machine name can't be set twice\n");
++              (*printer)("Unique machine name can't be set twice\n");
+               return(-1);
+       }
+       if(strlen(name) > UMID_LEN - 1)
+-              printk("Unique machine name is being truncated to %s "
+-                     "characters\n", UMID_LEN);
++              (*printer)("Unique machine name is being truncated to %d "
++                         "characters\n", UMID_LEN);   /* UMID_LEN is a number */
+       strlcpy(umid, name, sizeof(umid));
+       umid_is_random = is_random;
+@@ -54,7 +55,7 @@
+ static int __init set_umid_arg(char *name, int *add)
+ {
+-      return(set_umid(name, 0));
++      return(set_umid(name, 0, printf));
+ }
+ __uml_setup("umid=", set_umid_arg,
+@@ -67,7 +68,7 @@
+ {
+       int n;
+-      if(!umid_inited && make_umid()) return(-1);
++      if(!umid_inited && make_umid(printk)) return(-1);
+       n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
+       if(n > len){
+@@ -92,14 +93,14 @@
+       fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))), 
+                         0644);
+       if(fd < 0){
+-              printk("Open of machine pid file \"%s\" failed - "
++              printf("Open of machine pid file \"%s\" failed - "
+                      "errno = %d\n", file, -fd);
+               return 0;
+       }
+       sprintf(pid, "%d\n", os_getpid());
+       if(write(fd, pid, strlen(pid)) != strlen(pid))
+-              printk("Write of pid file failed - errno = %d\n", errno);
++              printf("Write of pid file failed - errno = %d\n", errno);
+       close(fd);
+       return 0;
+ }
+@@ -197,7 +198,7 @@
+       if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
+               uml_dir = malloc(strlen(name) + 1);
+               if(uml_dir == NULL){
+-                      printk("Failed to malloc uml_dir - error = %d\n",
++                      printf("Failed to malloc uml_dir - error = %d\n",
+                              errno);
+                       uml_dir = name;
+                       return(0);
+@@ -217,7 +218,7 @@
+               char *home = getenv("HOME");
+               if(home == NULL){
+-                      printk("make_uml_dir : no value in environment for "
++                      printf("make_uml_dir : no value in environment for "
+                              "$HOME\n");
+                       exit(1);
+               }
+@@ -239,25 +240,25 @@
+       strcpy(uml_dir, dir);
+       
+       if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
+-              printk("Failed to mkdir %s - errno = %i\n", uml_dir, errno);
++              printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno);
+               return(-1);
+       }
+       return 0;
+ }
+-static int __init make_umid(void)
++static int __init make_umid(int (*printer)(const char *fmt, ...))
+ {
+       int fd, err;
+       char tmp[strlen(uml_dir) + UMID_LEN + 1];
+       strlcpy(tmp, uml_dir, sizeof(tmp));
+-      if(*umid == 0){
++      if(!umid_inited){
+               strcat(tmp, "XXXXXX");
+               fd = mkstemp(tmp);
+               if(fd < 0){
+-                      printk("make_umid - mkstemp failed, errno = %d\n",
+-                             errno);
++                      (*printer)("make_umid - mkstemp failed, errno = %d\n",
++                                 errno);
+                       return(1);
+               }
+@@ -267,7 +268,7 @@
+                * for directories.
+                */
+               unlink(tmp);
+-              set_umid(&tmp[strlen(uml_dir)], 1);
++              set_umid(&tmp[strlen(uml_dir)], 1, printer);
+       }
+       
+       sprintf(tmp, "%s%s", uml_dir, umid);
+@@ -275,14 +276,14 @@
+       if((err = mkdir(tmp, 0777)) < 0){
+               if(errno == EEXIST){
+                       if(not_dead_yet(tmp)){
+-                              printk("umid '%s' is in use\n", umid);
++                              (*printer)("umid '%s' is in use\n", umid);
+                               return(-1);
+                       }
+                       err = mkdir(tmp, 0777);
+               }
+       }
+       if(err < 0){
+-              printk("Failed to create %s - errno = %d\n", umid, errno);
++              (*printer)("Failed to create %s - errno = %d\n", umid, errno);
+               return(-1);
+       }
+@@ -295,7 +296,13 @@
+ );
+ __uml_postsetup(make_uml_dir);
+-__uml_postsetup(make_umid);
++
++static int __init make_umid_setup(void)
++{
++      return(make_umid(printf));      /* argument-less wrapper; reports via printf */
++}
++
++__uml_postsetup(make_umid_setup);
+ __uml_postsetup(create_pid_file);
+ /*
+diff -Naur a/arch/um/kernel/user_util.c b/arch/um/kernel/user_util.c
+--- a/arch/um/kernel/user_util.c       Fri Aug 15 15:04:48 2003
++++ b/arch/um/kernel/user_util.c       Fri Aug 15 15:10:41 2003
+@@ -119,17 +119,6 @@
+       }
+ }
+-int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags)
+-{
+-      int pid;
+-
+-      pid = clone(fn, sp, flags, arg);
+-      if(pid < 0) return(-1);
+-      wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
+-      ptrace(PTRACE_CONT, pid, 0, 0);
+-      return(pid);
+-}
+-
+ int raw(int fd, int complain)
+ {
+       struct termios tt;
+diff -Naur a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
+--- a/arch/um/os-Linux/drivers/tuntap_user.c   Fri Aug 15 15:09:23 2003
++++ b/arch/um/os-Linux/drivers/tuntap_user.c   Fri Aug 15 15:14:02 2003
+@@ -142,7 +142,7 @@
+                       return(-errno);
+               }
+               memset(&ifr, 0, sizeof(ifr));
+-              ifr.ifr_flags = IFF_TAP;
++              ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+               strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name));
+               if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){
+                       printk("TUNSETIFF failed, errno = %d", errno);
+diff -Naur a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
+--- a/arch/um/os-Linux/file.c  Fri Aug 15 15:09:15 2003
++++ b/arch/um/os-Linux/file.c  Fri Aug 15 15:13:54 2003
+@@ -315,7 +315,7 @@
+       return(new);
+ }
+-int create_unix_socket(char *file, int len)
++int create_unix_socket(char *file, int len, int close_on_exec)
+ {
+       struct sockaddr_un addr;
+       int sock, err;
+@@ -327,6 +327,10 @@
+               return(-errno);
+       }
++      if(close_on_exec && fcntl(sock, F_SETFD, FD_CLOEXEC) < 0)
++              printk("create_unix_socket : Setting FD_CLOEXEC failed, "
++                     "errno = %d\n", errno);  /* logged only; setup continues */
++
+       addr.sun_family = AF_UNIX;
+       /* XXX Be more careful about overflow */
+@@ -342,6 +346,37 @@
+       return(sock);
+ }
++void os_flush_stdout(void)
++{
++      fflush(stdout); /* push out whatever is still buffered on stdout */
++}
++
++int os_lock_file(int fd, int excl)
++{
++      int type = excl ? F_WRLCK : F_RDLCK;
++      struct flock lock = ((struct flock) { .l_type   = type,
++                                            .l_whence = SEEK_SET,
++                                            .l_start  = 0,
++                                            .l_len    = 0 } );
++      int err, save;
++      /* Non-blocking attempt: write lock if excl, else read lock; 0 on success. */
++      err = fcntl(fd, F_SETLK, &lock);
++      if(!err)
++              goto out;
++      /* Keep the F_SETLK errno, then ask F_GETLK about the conflicting lock. */
++      save = -errno;
++      err = fcntl(fd, F_GETLK, &lock);
++      if(err){
++              err = -errno;   /* the query itself failed: report that instead */
++              goto out;
++      }
++      /* Name the holder, but return the original F_SETLK error. */
++      printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid);
++      err = save;
++ out:
++      return(err);
++}
++
+ /*
+  * Overrides for Emacs so that we follow Linus's tabbing style.
+  * Emacs will notice this stuff at the end of the file and automatically
+diff -Naur a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile
+--- a/arch/um/sys-i386/Makefile        Fri Aug 15 15:04:47 2003
++++ b/arch/um/sys-i386/Makefile        Fri Aug 15 15:10:35 2003
+@@ -1,7 +1,8 @@
+-obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o module.o \
+-      ptrace.o ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o
++obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \
++      ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o
+ obj-$(CONFIG_HIGHMEM) += highmem.o
++obj-$(CONFIG_MODULES) += module.o
+ USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o
+ USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
+@@ -9,6 +10,8 @@
+ SYMLINKS = semaphore.c highmem.c module.c
+ SYMLINKS := $(foreach f,$(SYMLINKS),$(src)/$f)
++clean-files := $(SYMLINKS)
++
+ semaphore.c-dir = kernel
+ highmem.c-dir = mm
+ module.c-dir = kernel
+@@ -24,8 +27,7 @@
+ $(SYMLINKS): 
+       $(call make_link,$@)
+-clean:
+-      $(MAKE) -C util clean
++subdir- := util
+ fastdep:
+diff -Naur a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
+--- a/arch/um/sys-i386/bugs.c  Fri Aug 15 15:07:41 2003
++++ b/arch/um/sys-i386/bugs.c  Fri Aug 15 15:13:14 2003
+@@ -8,6 +8,7 @@
+ #include <errno.h>
+ #include <string.h>
+ #include <sys/signal.h>
++#include <asm/ldt.h>
+ #include "kern_util.h"
+ #include "user.h"
+ #include "sysdep/ptrace.h"
+@@ -16,8 +17,8 @@
+ #define MAXTOKEN 64
+ /* Set during early boot */
+-int cpu_has_cmov = 1;
+-int cpu_has_xmm = 0;
++int host_has_cmov = 1;
++int host_has_xmm = 0;
+ static char token(int fd, char *buf, int len, char stop)
+ {
+@@ -104,6 +105,25 @@
+       return(1);
+ }
++static void disable_lcall(void)
++{
++      struct modify_ldt_ldt_s ldt;
++      int err;
++      /* Write a zeroed descriptor (base 0, limit 0) into LDT entry 7. */
++      bzero(&ldt, sizeof(ldt));
++      ldt.entry_number = 7;
++      ldt.base_addr = 0;
++      ldt.limit = 0;
++      err = modify_ldt(1, &ldt, sizeof(ldt));
++      if(err) /* failure is only reported, not propagated */
++              printk("Failed to disable lcall7 - errno = %d\n", errno);
++}
++
++void arch_init_thread(void)
++{
++      disable_lcall();        /* the only arch-specific thread setup done here */
++}
++
+ void arch_check_bugs(void)
+ {
+       int have_it;
+@@ -113,8 +133,8 @@
+                      "checks\n");
+               return;
+       }
+-      if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it;
+-      if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it;
++      if(check_cpu_feature("cmov", &have_it)) host_has_cmov = have_it;
++      if(check_cpu_feature("xmm", &have_it)) host_has_xmm = have_it;
+ }
+ int arch_handle_signal(int sig, union uml_pt_regs *regs)
+@@ -130,18 +150,18 @@
+       if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40))
+               return(0);
+-      if(cpu_has_cmov == 0)
++      if(host_has_cmov == 0)
+               panic("SIGILL caused by cmov, which this processor doesn't "
+                     "implement, boot a filesystem compiled for older "
+                     "processors");
+-      else if(cpu_has_cmov == 1)
++      else if(host_has_cmov == 1)
+               panic("SIGILL caused by cmov, which this processor claims to "
+                     "implement");
+-      else if(cpu_has_cmov == -1)
++      else if(host_has_cmov == -1)
+               panic("SIGILL caused by cmov, couldn't tell if this processor "
+                     "implements it, boot a filesystem compiled for older "
+                     "processors");
+-      else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov);
++      else panic("Bad value for host_has_cmov (%d)", host_has_cmov);
+       return(0);
+ }
+diff -Naur a/arch/um/uml.lds.S b/arch/um/uml.lds.S
+--- a/arch/um/uml.lds.S        Fri Aug 15 15:05:37 2003
++++ b/arch/um/uml.lds.S        Fri Aug 15 15:11:48 2003
+@@ -26,7 +26,11 @@
+   . = ALIGN(4096);            /* Init code and data */
+   _stext = .;
+   __init_begin = .;
+-  .text.init : { *(.text.init) }
++  .init.text : { 
++      _sinittext = .;
++      *(.init.text)
++      _einittext = .;
++  }
+   . = ALIGN(4096);
+   .text      :
+   {
+@@ -38,7 +42,7 @@
+   #include "asm/common.lds.S"
+-  .data.init : { *(.data.init) }
++  init.data : { *(init.data) }
+   .data    :
+   {
+     . = ALIGN(KERNEL_STACK_SIZE);             /* init_task */
+diff -Naur a/arch/um/util/mk_constants_kern.c b/arch/um/util/mk_constants_kern.c
+--- a/arch/um/util/mk_constants_kern.c Fri Aug 15 15:04:15 2003
++++ b/arch/um/util/mk_constants_kern.c Fri Aug 15 15:10:27 2003
+@@ -1,5 +1,6 @@
+ #include "linux/kernel.h"
+ #include "linux/stringify.h"
++#include "linux/time.h"
+ #include "asm/page.h"
+ extern void print_head(void);
+@@ -11,6 +12,7 @@
+ {
+   print_head();
+   print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE);
++
+   print_constant_str("UM_KERN_EMERG", KERN_EMERG);
+   print_constant_str("UM_KERN_ALERT", KERN_ALERT);
+   print_constant_str("UM_KERN_CRIT", KERN_CRIT);
+@@ -19,6 +21,8 @@
+   print_constant_str("UM_KERN_NOTICE", KERN_NOTICE);
+   print_constant_str("UM_KERN_INFO", KERN_INFO);
+   print_constant_str("UM_KERN_DEBUG", KERN_DEBUG);
++
++  print_constant_int("UM_NSEC_PER_SEC", NSEC_PER_SEC);
+   print_tail();
+   return(0);
+ }
+diff -Naur a/fs/Makefile b/fs/Makefile
+--- a/fs/Makefile      Fri Aug 15 15:06:45 2003
++++ b/fs/Makefile      Fri Aug 15 15:12:41 2003
+@@ -91,3 +91,5 @@
+ obj-$(CONFIG_XFS_FS)          += xfs/
+ obj-$(CONFIG_AFS_FS)          += afs/
+ obj-$(CONFIG_BEFS_FS)         += befs/
++obj-$(CONFIG_HOSTFS)          += hostfs/
++obj-$(CONFIG_HPPFS)           += hppfs/
+diff -Naur a/fs/hostfs/Makefile b/fs/hostfs/Makefile
+--- a/fs/hostfs/Makefile       Wed Dec 31 19:00:00 1969
++++ b/fs/hostfs/Makefile       Fri Aug 15 15:10:07 2003
+@@ -0,0 +1,36 @@
++# 
++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino
++# to __st_ino.  It stayed in the same place, so as long as the correct name
++# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa.
++
++STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \
++                              echo __)st_ino
++
++hostfs-objs := hostfs_kern.o hostfs_user.o
++
++obj-y = 
++obj-$(CONFIG_HOSTFS) += hostfs.o
++
++SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs))
++
++USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(SINGLE_OBJS))
++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
++
++USER_CFLAGS += -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD)
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
++
++clean:
++
++modules:
++
++fastdep:
++
++dep:
++
++archmrproper: clean
+diff -Naur a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
+--- a/fs/hostfs/hostfs.h       Wed Dec 31 19:00:00 1969
++++ b/fs/hostfs/hostfs.h       Fri Aug 15 15:10:06 2003
+@@ -0,0 +1,79 @@
++#ifndef __UM_FS_HOSTFS
++#define __UM_FS_HOSTFS
++
++#include "os.h"
++
++/* These are exactly the same definitions as in fs.h, but the names are 
++ * changed so that this file can be included in both kernel and user files.
++ */
++
++#define HOSTFS_ATTR_MODE      1
++#define HOSTFS_ATTR_UID       2
++#define HOSTFS_ATTR_GID       4
++#define HOSTFS_ATTR_SIZE      8
++#define HOSTFS_ATTR_ATIME     16
++#define HOSTFS_ATTR_MTIME     32
++#define HOSTFS_ATTR_CTIME     64
++#define HOSTFS_ATTR_ATIME_SET 128
++#define HOSTFS_ATTR_MTIME_SET 256
++#define HOSTFS_ATTR_FORCE     512     /* Not a change, but a change it */
++#define HOSTFS_ATTR_ATTR_FLAG 1024
++
++struct hostfs_iattr {
++      unsigned int    ia_valid;
++      mode_t          ia_mode;
++      uid_t           ia_uid;
++      gid_t           ia_gid;
++      loff_t          ia_size;
++      struct timespec ia_atime;
++      struct timespec ia_mtime;
++      struct timespec ia_ctime;
++      unsigned int    ia_attr_flags;
++};
++
++extern int stat_file(const char *path, unsigned long long *inode_out, 
++                   int *mode_out, int *nlink_out, int *uid_out, int *gid_out,
++                   unsigned long long *size_out, struct timespec *atime_out, 
++                   struct timespec *mtime_out, struct timespec *ctime_out, 
++                   int *blksize_out, unsigned long long *blocks_out);
++extern int access_file(char *path, int r, int w, int x);
++extern int open_file(char *path, int r, int w, int append);
++extern int file_type(const char *path, int *rdev);
++extern void *open_dir(char *path, int *err_out);
++extern char *read_dir(void *stream, unsigned long long *pos, 
++                    unsigned long long *ino_out, int *len_out);
++extern void close_file(void *stream);
++extern void close_dir(void *stream);
++extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
++extern int write_file(int fd, unsigned long long *offset, const char *buf,
++                    int len);
++extern int lseek_file(int fd, long long offset, int whence);
++extern int file_create(char *name, int ur, int uw, int ux, int gr, 
++                     int gw, int gx, int or, int ow, int ox);
++extern int set_attr(const char *file, struct hostfs_iattr *attrs);
++extern int make_symlink(const char *from, const char *to);
++extern int unlink_file(const char *file);
++extern int do_mkdir(const char *file, int mode);
++extern int do_rmdir(const char *file);
++extern int do_mknod(const char *file, int mode, int dev);
++extern int link_file(const char *from, const char *to);
++extern int do_readlink(char *file, char *buf, int size);
++extern int rename_file(char *from, char *to);
++extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
++                   long long *bfree_out, long long *bavail_out, 
++                   long long *files_out, long long *ffree_out, 
++                   void *fsid_out, int fsid_size, long *namelen_out, 
++                   long *spare_out);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
+--- a/fs/hostfs/hostfs_kern.c  Wed Dec 31 19:00:00 1969
++++ b/fs/hostfs/hostfs_kern.c  Fri Aug 15 15:10:12 2003
+@@ -0,0 +1,1010 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ *
++ * Ported the filesystem routines to 2.5.
++ * 2003-02-10 Petr Baudis <pasky@ucw.cz>
++ */
++
++#include <linux/stddef.h>
++#include <linux/fs.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/pagemap.h>
++#include <linux/blkdev.h>
++#include <linux/list.h>
++#include <linux/buffer_head.h>
++#include <linux/root_dev.h>
++#include <linux/statfs.h>
++#include <asm/uaccess.h>
++#include "hostfs.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "user_util.h"
++#include "2_5compat.h"
++#include "init.h"
++
++struct hostfs_inode_info {
++      char *host_filename;    /* dentry_name() reads the root's copy as the path prefix */
++      int fd;
++      int mode;
++      struct inode vfs_inode; /* embedded inode; HOSTFS_I() maps it back to this struct */
++};
++
++static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
++{      /* Recover the hostfs_inode_info that embeds inode as its vfs_inode member. */
++      return(list_entry(inode, struct hostfs_inode_info, vfs_inode));
++}
++
++#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode)
++
++int hostfs_d_delete(struct dentry *dentry)
++{
++      return(1);      /* always 1; presumably keeps unused dentries uncached - confirm */
++}
++
++struct dentry_operations hostfs_dentry_ops = {
++      .d_delete               = hostfs_d_delete,      /* the only operation set */
++};
++
++/* Changed in hostfs_args before the kernel starts running */
++static char *root_ino = "/";
++static int append = 0;
++
++#define HOSTFS_SUPER_MAGIC 0x00c0ffee
++
++static struct inode_operations hostfs_iops;
++static struct inode_operations hostfs_dir_iops;
++static struct address_space_operations hostfs_link_aops;
++
++static int __init hostfs_args(char *options, int *add)
++{
++      char *ptr;
++      /* Field 1 (up to the first ','), if non-empty, replaces root_ino. */
++      ptr = strchr(options, ',');
++      if(ptr != NULL)
++              *ptr++ = '\0';  /* split in place; ptr now starts the flags */
++      if(*options != '\0')
++              root_ino = options;
++      /* Remaining comma-separated fields are flags; only "append" is known. */
++      options = ptr;
++      while(options){
++              ptr = strchr(options, ',');
++              if(ptr != NULL)
++                      *ptr++ = '\0';
++              if(*options != '\0'){   /* empty fields are skipped */
++                      if(!strcmp(options, "append"))
++                              append = 1;
++                      else printf("hostfs_args - unsupported option - %s\n",
++                                  options);
++              }
++              options = ptr;
++      }
++      return(0);      /* always 0, even after an unsupported flag */
++}
++
++__uml_setup("hostfs=", hostfs_args,
++"hostfs=<root dir>,<flags>,...\n"
++"    This is used to set hostfs parameters.  The root directory argument\n"
++"    is used to confine all hostfs mounts to within the specified directory\n"
++"    tree on the host.  If this isn't specified, then a user inside UML can\n"
++"    mount anything on the host that's accessible to the user that's running\n"
++"    it.\n"
++"    The only flag currently supported is 'append', which specifies that all\n"
++"    files opened by hostfs will be opened in append mode.\n\n"
++);
++
++static char *dentry_name(struct dentry *dentry, int extra)
++{
++      struct dentry *parent;
++      char *root, *name;
++      int len;
++      /* Pass 1: climb to the self-parented root, summing "/<component>" sizes. */
++      len = 0;
++      parent = dentry;
++      while(parent->d_parent != parent){
++              len += parent->d_name.len + 1;
++              parent = parent->d_parent;
++      }
++      /* Prefix is the root's host_filename; extra spare bytes are allocated too. */
++      root = HOSTFS_I(parent->d_inode)->host_filename;
++      len += strlen(root);
++      name = kmalloc(len + extra + 1, GFP_KERNEL);
++      if(name == NULL) return(NULL);
++      /* Pass 2: terminate, then fill components in from the tail backwards. */
++      name[len] = '\0';
++      parent = dentry;
++      while(parent->d_parent != parent){
++              len -= parent->d_name.len + 1;
++              name[len] = '/';
++              strncpy(&name[len + 1], parent->d_name.name, 
++                      parent->d_name.len);
++              parent = parent->d_parent;
++      }
++      strncpy(name, root, strlen(root));      /* exact-length copy: adds no NUL */
++      return(name);   /* kmalloc'd path string */
++}
++
++static char *inode_name(struct inode *ino, int extra)
++{
++      struct dentry *dentry;
++      /* Name the inode via the first entry on its i_dentry list (not checked for empty). */
++      dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias);
++      return(dentry_name(dentry, extra));
++}
++
++static int read_name(struct inode *ino, char *name)
++{
++      /* The non-int inode fields are copied into ints by stat_file and
++       * then copied into the inode because passing the actual pointers
++       * in and having them treated as int * breaks on big-endian machines
++       */
++      int err;
++      int i_mode, i_nlink, i_blksize;
++      unsigned long long i_size;
++      unsigned long long i_ino;
++      unsigned long long i_blocks;
++      /* Stat the host path name; a nonzero result is returned unchanged. */
++      err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid, 
++                      &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime, 
++                      &ino->i_ctime, &i_blksize, &i_blocks);
++      if(err) 
++              return(err);
++      /* Copy the staged values into the inode. */
++      ino->i_ino = i_ino;
++      ino->i_mode = i_mode;
++      ino->i_nlink = i_nlink;
++      ino->i_size = i_size;
++      ino->i_blksize = i_blksize;
++      ino->i_blocks = i_blocks;
++      if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid()))
++              ino->i_uid = 0; /* on the root device, getuid()'s files show as uid 0 */
++      return(0);
++}
++
++static char *follow_link(char *link)
++{
++      int len, n;
++      char *name, *resolved, *end;
++      /* Read the link target, doubling the buffer until the result is < len. */
++      len = 64;
++      while(1){
++              n = -ENOMEM;
++              name = kmalloc(len, GFP_KERNEL);
++              if(name == NULL)
++                      goto out;
++
++              n = do_readlink(link, name, len);
++              if(n < len)
++                      break;
++              len *= 2;
++              kfree(name);
++      }
++      if(n < 0)
++              goto out_free;
++      /* An absolute target is returned as-is; link is not freed on this path. */
++      if(*name == '/')
++              return(name);
++      /* A link path with no '/' also returns the raw target, link not freed. */
++      end = strrchr(link, '/');
++      if(end == NULL)
++              return(name);
++      /* Cut link just after its last '/' so it becomes the directory prefix. */
++      *(end + 1) = '\0';
++      len = strlen(link) + strlen(name) + 1;
++
++      resolved = kmalloc(len, GFP_KERNEL);
++      if(resolved == NULL){
++              n = -ENOMEM;
++              goto out_free;
++      }
++      /* NOTE(review): only this path frees link; confirm no caller leaks it. */
++      sprintf(resolved, "%s%s", link, name);
++      kfree(name);
++      kfree(link);
++      return(resolved);
++
++ out_free:
++      kfree(name);
++ out:
++      return(ERR_PTR(n));     /* failure: negative errno as ERR_PTR, link untouched */
++}
++
++/* Fill in "ino" from the host file it names.
++ *
++ * Returns 0 when the inode has no dentry yet, -ENOMEM when the host path
++ * cannot be built, the follow_link() error when a symlink cannot be
++ * resolved, and otherwise the result of read_name().
++ */
++static int read_inode(struct inode *ino)
++{
++      char *path;
++      int ret;
++
++      /* Unfortunately, we are called from iget() when we don't have a dentry
++       * allocated yet.
++       */
++      if(list_empty(&ino->i_dentry))
++              return(0);
++
++      path = inode_name(ino, 0);
++      if(path == NULL)
++              return(-ENOMEM);
++
++      if(file_type(path, NULL) == OS_TYPE_SYMLINK){
++              path = follow_link(path);
++              if(IS_ERR(path))
++                      return(PTR_ERR(path));
++      }
++
++      ret = read_name(ino, path);
++      kfree(path);
++      return(ret);
++}
++
++/* statfs for a hostfs mount: asks do_statfs() about the host path stored
++ * on the root inode and stamps the result with HOSTFS_SUPER_MAGIC.
++ * Returns 0 or the do_statfs() error.
++ */
++int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
++{
++      /* do_statfs uses struct statfs64 internally, but the linux kernel
++       * struct statfs still has 32-bit versions for most of these fields,
++       * so we convert them here
++       */
++      long long blocks, bfree, bavail, files, ffree;
++      int ret;
++
++      ret = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
++                      &sf->f_bsize, &blocks, &bfree, &bavail, &files,
++                      &ffree, &sf->f_fsid, sizeof(sf->f_fsid),
++                      &sf->f_namelen, sf->f_spare);
++      if(ret != 0)
++              return(ret);
++
++      sf->f_blocks = blocks;
++      sf->f_bfree = bfree;
++      sf->f_bavail = bavail;
++      sf->f_files = files;
++      sf->f_ffree = ffree;
++      sf->f_type = HOSTFS_SUPER_MAGIC;
++      return(0);
++}
++
++/* Allocate a hostfs_inode_info with no host filename, no open descriptor
++ * (fd == -1) and no access mode, and return its embedded VFS inode.
++ * Returns NULL when the allocation fails.
++ */
++static struct inode *hostfs_alloc_inode(struct super_block *sb)
++{
++      struct hostfs_inode_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
++
++      if(info == NULL)
++              return(NULL);
++
++      *info = ((struct hostfs_inode_info) { .host_filename      = NULL,
++                                            .fd                 = -1,
++                                            .mode               = 0 });
++      inode_init_once(&info->vfs_inode);
++      return(&info->vfs_inode);
++}
++
++/* Release everything hanging off a hostfs inode: the host filename, the
++ * host descriptor if one is open, and the hostfs_inode_info itself.
++ */
++static void hostfs_destroy_inode(struct inode *inode)
++{
++      struct hostfs_inode_info *info = HOSTFS_I(inode);
++
++      if(info->host_filename != NULL)
++              kfree(info->host_filename);
++
++      if(info->fd != -1)
++              close_file(&info->fd);
++
++      kfree(info);
++}
++
++/* super_operations.read_inode hook: delegates to read_inode().  The hook
++ * returns void, so the result of read_inode() is dropped here.
++ */
++static void hostfs_read_inode(struct inode *inode)
++{
++      read_inode(inode);
++}
++
++/* Superblock operations for hostfs: inode allocation and teardown, inode
++ * fill-in, and statfs.
++ */
++static struct super_operations hostfs_sbops = { 
++      .alloc_inode    = hostfs_alloc_inode,
++      .destroy_inode  = hostfs_destroy_inode,
++      .read_inode     = hostfs_read_inode,
++      .statfs         = hostfs_statfs,
++};
++
++/* readdir for hostfs directories.
++ *
++ * Opens the host directory behind "file", then feeds entries to "filldir"
++ * starting at file->f_pos.  f_pos is advanced only after an entry has been
++ * accepted.  Returns -ENOMEM if the path cannot be built, the negated
++ * open_dir() error if the directory cannot be opened, and 0 otherwise.
++ */
++int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
++{
++      unsigned long long pos, ino;
++      void *dir;
++      char *path, *entry;
++      int error, len;
++
++      path = dentry_name(file->f_dentry, 0);
++      if(path == NULL)
++              return(-ENOMEM);
++
++      dir = open_dir(path, &error);
++      kfree(path);
++      if(dir == NULL)
++              return(-error);
++
++      pos = file->f_pos;
++      for(;;){
++              entry = read_dir(dir, &pos, &ino, &len);
++              if(entry == NULL)
++                      break;
++
++              error = (*filldir)(ent, entry, len, file->f_pos,
++                                 ino, DT_UNKNOWN);
++              if(error)
++                      break;
++              file->f_pos = pos;
++      }
++      close_dir(dir);
++      return(0);
++}
++
++/* Open (or reopen with wider access) the host file behind an inode.
++ *
++ * If the access already recorded on the inode covers what "file" asks for,
++ * nothing is done.  Otherwise a new host descriptor is opened with the
++ * union of the old and the requested access, and only once that has
++ * succeeded is the old descriptor closed and the inode state updated.  A
++ * failed open therefore leaves the inode's descriptor and mode untouched
++ * instead of recording access bits with no descriptor behind them.
++ *
++ * Returns 0 on success, -ENOMEM if the path cannot be built, or the
++ * negative error from open_file().
++ */
++int hostfs_file_open(struct inode *ino, struct file *file)
++{
++      char *name;
++      int mode, new_mode, r, w, fd;
++
++      mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
++      if((mode & HOSTFS_I(ino)->mode) == mode)
++              return(0);
++
++      new_mode = HOSTFS_I(ino)->mode | mode;
++      w = (new_mode & FMODE_WRITE) ? 1 : 0;
++      /* A writable descriptor is always opened readable as well */
++      r = (w || (new_mode & FMODE_READ)) ? 1 : 0;
++
++      name = dentry_name(file->f_dentry, 0);
++      if(name == NULL)
++              return(-ENOMEM);
++
++      fd = open_file(name, r, w, append);
++      kfree(name);
++      if(fd < 0)
++              return(fd);
++
++      /* The file may already have been opened, but with the wrong access,
++       * so drop the old descriptor now that its replacement exists.
++       */
++      if(HOSTFS_I(ino)->fd != -1)
++              close_file(&HOSTFS_I(ino)->fd);
++
++      HOSTFS_I(ino)->mode = new_mode;
++      FILE_HOSTFS_I(file)->fd = fd;
++
++      return(0);
++}
++
++/* fsync hook: a no-op that always reports success. */
++int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++      return(0);
++}
++
++/* File operations for non-directory hostfs inodes.  Data access goes
++ * through the generic page-cache helpers; only open and fsync are
++ * hostfs-specific, and there is no release hook.
++ */
++static struct file_operations hostfs_file_fops = {
++      .llseek         = generic_file_llseek,
++      .read           = generic_file_read,
++      .write          = generic_file_write,
++      .mmap           = generic_file_mmap,
++      .open           = hostfs_file_open,
++      .release        = NULL,
++      .fsync          = hostfs_fsync,
++};
++
++/* File operations for hostfs directories: readdir is hostfs-specific,
++ * read is handled by generic_read_dir.
++ */
++static struct file_operations hostfs_dir_fops = {
++      .readdir        = hostfs_readdir,
++      .read           = generic_read_dir,
++};
++
++/* Write one page-cache page back to the host file.
++ *
++ * A page at or beyond the index of the last page of the file is written
++ * only up to i_size within the page; every other page is written in full.
++ * The page is unlocked before returning.
++ *
++ * Returns 0 on success, the negative error from write_file(), or -EIO when
++ * the host wrote fewer bytes than requested, so that a positive byte count
++ * is never handed back as the return value.
++ */
++int hostfs_writepage(struct page *page, struct writeback_control *wbc)
++{
++      struct address_space *mapping = page->mapping;
++      struct inode *inode = mapping->host;
++      char *buffer;
++      unsigned long long base;
++      int count = PAGE_CACHE_SIZE;
++      /* Same width as page->index so large files do not wrap */
++      unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
++      int err;
++
++      if (page->index >= end_index)
++              count = inode->i_size & (PAGE_CACHE_SIZE-1);
++
++      buffer = kmap(page);
++      base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
++
++      err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
++      if(err != count){
++              ClearPageUptodate(page);
++              if(err >= 0)
++                      err = -EIO;
++              goto out;
++      }
++
++      /* write_file advanced base past the bytes it wrote */
++      if (base > inode->i_size)
++              inode->i_size = base;
++
++      if (PageError(page))
++              ClearPageError(page);
++      err = 0;
++
++ out:
++      kunmap(page);
++
++      unlock_page(page);
++      return err;
++}
++
++/* Fill a page-cache page from the host file.
++ *
++ * Reads up to a full page at the page's file offset, zero-fills whatever
++ * the host did not supply, and marks the page up to date.  The page is
++ * unmapped and unlocked on every path.  Returns 0 on success or the
++ * negative error from read_file().
++ */
++int hostfs_readpage(struct file *file, struct page *page)
++{
++      long long offset = (long long) page->index << PAGE_CACHE_SHIFT;
++      char *kaddr = kmap(page);
++      int nread;
++
++      nread = read_file(FILE_HOSTFS_I(file)->fd, &offset, kaddr,
++                        PAGE_CACHE_SIZE);
++      if(nread >= 0){
++              memset(&kaddr[nread], 0, PAGE_CACHE_SIZE - nread);
++
++              flush_dcache_page(page);
++              SetPageUptodate(page);
++              if(PageError(page))
++                      ClearPageError(page);
++              nread = 0;
++      }
++
++      kunmap(page);
++      unlock_page(page);
++      return(nread);
++}
++
++/* Prepare a page for a partial write covering [from, to).
++ *
++ * The parts of the page outside that range are read from the host file:
++ * [0, from) when from is non-zero and [to, PAGE_CACHE_SIZE) when to is
++ * short of the page end.  Returns 0 on success or the negative error from
++ * read_file().
++ */
++int hostfs_prepare_write(struct file *file, struct page *page, 
++                       unsigned int from, unsigned int to)
++{
++      long long page_start = (long long) page->index << PAGE_CACHE_SHIFT;
++      long long pos;
++      char *kaddr = kmap(page);
++      int ret;
++
++      if(from != 0){
++              pos = page_start;
++              ret = read_file(FILE_HOSTFS_I(file)->fd, &pos, kaddr, from);
++              if(ret < 0)
++                      goto out;
++      }
++      if(to != PAGE_CACHE_SIZE){
++              pos = page_start + to;
++              ret = read_file(FILE_HOSTFS_I(file)->fd, &pos, kaddr + to,
++                              PAGE_CACHE_SIZE - to);
++              if(ret < 0)
++                      goto out;
++      }
++      ret = 0;
++ out:
++      kunmap(page);
++      return(ret);
++}
++
++/* Write bytes [from, to) of a page out to the host file and grow i_size
++ * if the write ended past the current end of file.
++ *
++ * The file offset is computed by widening page->index to long long before
++ * shifting, as the other page routines in this file do, so that offsets
++ * past what an unsigned long can hold are not truncated.
++ *
++ * Any positive byte count from write_file() is treated as success.
++ * Returns 0 on success or the negative error from write_file().
++ */
++int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
++               unsigned to)
++{
++      struct address_space *mapping = page->mapping;
++      struct inode *inode = mapping->host;
++      char *buffer;
++      long long start;
++      int err = 0;
++
++      start = ((long long) page->index << PAGE_CACHE_SHIFT) + from;
++      buffer = kmap(page);
++      err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from,
++                       to - from);
++      if(err > 0) err = 0;
++      /* write_file advanced start past the bytes it wrote */
++      if(!err && (start > inode->i_size))
++              inode->i_size = start;
++
++      kunmap(page);
++      return(err);
++}
++
++/* Address-space operations installed by init_inode() on every inode that
++ * is not a symlink.
++ */
++static struct address_space_operations hostfs_aops = {
++      .writepage      = hostfs_writepage,
++      .readpage       = hostfs_readpage,
++/*    .set_page_dirty = __set_page_dirty_nobuffers, */
++      .prepare_write  = hostfs_prepare_write,
++      .commit_write   = hostfs_commit_write
++};
++
++/* Install the inode, file and address-space operations matching the type
++ * of the host file behind "dentry".
++ *
++ * With no dentry the inode is set up as a directory.  Otherwise the type
++ * comes from file_type(); any value that is not one of the OS_TYPE_*
++ * cases handled below gets the regular-file operations.  Device, FIFO and
++ * socket types are additionally passed to init_special_inode().
++ *
++ * Returns 0, or -ENOMEM when the host path cannot be built.
++ */
++static int init_inode(struct inode *inode, struct dentry *dentry)
++{
++      int type = OS_TYPE_DIR, rdev;
++
++      if(dentry != NULL){
++              char *path = dentry_name(dentry, 0);
++
++              if(path == NULL)
++                      return(-ENOMEM);
++              type = file_type(path, &rdev);
++              kfree(path);
++      }
++
++      /* Start from the regular-file operations, then specialise */
++      inode->i_op = &hostfs_iops;
++      inode->i_fop = &hostfs_file_fops;
++      inode->i_mapping->a_ops = &hostfs_aops;
++
++      switch (type) {
++      case OS_TYPE_SYMLINK:
++              inode->i_op = &page_symlink_inode_operations;
++              inode->i_mapping->a_ops = &hostfs_link_aops;
++              break;
++      case OS_TYPE_DIR:
++              inode->i_op = &hostfs_dir_iops;
++              inode->i_fop = &hostfs_dir_fops;
++              break;
++      case OS_TYPE_CHARDEV:
++              init_special_inode(inode, S_IFCHR, rdev);
++              break;
++      case OS_TYPE_BLOCKDEV:
++              init_special_inode(inode, S_IFBLK, rdev);
++              break;
++      case OS_TYPE_FIFO:
++              init_special_inode(inode, S_IFIFO, 0);
++              break;
++      case OS_TYPE_SOCK:
++              init_special_inode(inode, S_IFSOCK, 0);
++              break;
++      }
++      return(0);
++}
++
++/* Create a regular file on the host and instantiate "dentry" with a new
++ * inode for it.
++ *
++ * The host file is created with the permission bits taken from "mode" and
++ * left open read/write; the resulting descriptor is stored on the inode.
++ * If the new file cannot be stat'ed afterwards, that descriptor is closed
++ * again rather than leaked.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the negative
++ * error from init_inode(), file_create() or read_name().
++ */
++int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, 
++                 struct nameidata *nd)
++{
++      struct inode *inode;
++      char *name;
++      int error, fd;
++
++      error = -ENOMEM;
++      inode = iget(dir->i_sb, 0);
++      if(inode == NULL) goto out;
++
++      error = init_inode(inode, dentry);
++      if(error)
++              goto out_put;
++
++      error = -ENOMEM;
++      name = dentry_name(dentry, 0);
++      if(name == NULL)
++              goto out_put;
++
++      fd = file_create(name,
++                       mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR,
++                       mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP,
++                       mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
++      if(fd < 0){
++              error = fd;
++              goto out_free;
++      }
++
++      error = read_name(inode, name);
++      if(error){
++              /* Nothing will ever own this descriptor; close it here */
++              close_file(&fd);
++              goto out_free;
++      }
++      kfree(name);
++
++      HOSTFS_I(inode)->fd = fd;
++      HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE;
++      d_instantiate(dentry, inode);
++      return(0);
++
++ out_free:
++      kfree(name);
++ out_put:
++      iput(inode);
++ out:
++      return(error);
++}
++
++/* Look up "dentry" on the host.
++ *
++ * A new inode is set up and stat'ed from the host path.  If the host
++ * reports -ENOENT the inode is dropped and the dentry is added with a
++ * NULL inode; any other error fails the lookup.  Returns NULL on success
++ * or an ERR_PTR.
++ */
++struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, 
++                            struct nameidata *nd)
++{
++      struct inode *inode;
++      char *path;
++      int err;
++
++      inode = iget(ino->i_sb, 0);
++      if(inode == NULL)
++              return(ERR_PTR(-ENOMEM));
++
++      err = init_inode(inode, dentry);
++      if(err)
++              goto fail;
++
++      path = dentry_name(dentry, 0);
++      if(path == NULL){
++              err = -ENOMEM;
++              goto fail;
++      }
++
++      err = read_name(inode, path);
++      kfree(path);
++      if(err == -ENOENT){
++              iput(inode);
++              inode = NULL;
++      }
++      else if(err)
++              goto fail;
++
++      d_add(dentry, inode);
++      dentry->d_op = &hostfs_dentry_ops;
++      return(NULL);
++
++ fail:
++      iput(inode);
++      return(ERR_PTR(err));
++}
++
++/* Build "<host path of ino>/<dentry name>".
++ *
++ * inode_name() is asked for d_name.len + 1 extra bytes, which is what the
++ * '/' and the name appended here occupy.  Returns a string the caller
++ * must kfree, or NULL when inode_name() fails.
++ */
++static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
++{
++      char *path;
++      int end;
++
++      path = inode_name(ino, dentry->d_name.len + 1);
++      if(path == NULL)
++              return(NULL);
++
++      strcat(path, "/");
++      /* Index of the terminator once the name has been appended */
++      end = strlen(path) + dentry->d_name.len;
++      strncat(path, dentry->d_name.name, dentry->d_name.len);
++      path[end] = '\0';
++      return(path);
++}
++
++/* Create a hard link on the host: the path built from "ino" and "from"
++ * becomes a new name for the host file behind "to".  Returns -ENOMEM if
++ * either path cannot be built, otherwise the result of link_file().
++ */
++int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
++{
++      char *new_path, *old_path;
++      int err = -ENOMEM;
++
++      new_path = inode_dentry_name(ino, from);
++      if(new_path == NULL)
++              return(err);
++
++      old_path = dentry_name(to, 0);
++      if(old_path != NULL){
++              err = link_file(old_path, new_path);
++              kfree(old_path);
++      }
++      kfree(new_path);
++      return(err);
++}
++
++/* Remove the host file named by "dentry" inside directory "ino".
++ *
++ * Unlinking is refused with -EPERM when the "append" flag is set.  That
++ * check is made before the host path is allocated so the refusal does not
++ * leak the path.  Returns -ENOMEM if the path cannot be built, otherwise
++ * the result of unlink_file().
++ */
++int hostfs_unlink(struct inode *ino, struct dentry *dentry)
++{
++      char *file;
++      int err;
++
++      if(append)
++              return(-EPERM);
++
++      if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++
++      err = unlink_file(file);
++      kfree(file);
++      return(err);
++}
++
++/* Create a host symlink at the path of "dentry" inside "ino", pointing at
++ * "to".  Returns -ENOMEM if the path cannot be built, otherwise the
++ * result of make_symlink().
++ */
++int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
++{
++      char *path = inode_dentry_name(ino, dentry);
++      int ret;
++
++      if(path == NULL)
++              return(-ENOMEM);
++
++      ret = make_symlink(path, to);
++      kfree(path);
++      return(ret);
++}
++
++/* Create a host directory at the path of "dentry" inside "ino".  Returns
++ * -ENOMEM if the path cannot be built, otherwise the result of do_mkdir().
++ */
++int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
++{
++      char *path = inode_dentry_name(ino, dentry);
++      int ret;
++
++      if(path == NULL)
++              return(-ENOMEM);
++
++      ret = do_mkdir(path, mode);
++      kfree(path);
++      return(ret);
++}
++
++/* Remove the host directory at the path of "dentry" inside "ino".  Returns
++ * -ENOMEM if the path cannot be built, otherwise the result of do_rmdir().
++ */
++int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
++{
++      char *path = inode_dentry_name(ino, dentry);
++      int ret;
++
++      if(path == NULL)
++              return(-ENOMEM);
++
++      ret = do_rmdir(path);
++      kfree(path);
++      return(ret);
++}
++
++/* Create a special file on the host and instantiate "dentry" with a new
++ * inode for it.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
++ * init_inode(), do_mknod() or read_name().  The inode is released on
++ * every failure after it was obtained.
++ */
++int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
++{
++      struct inode *inode;
++      char *path;
++      int err;
++
++      inode = iget(dir->i_sb, 0);
++      if(inode == NULL)
++              return(-ENOMEM);
++
++      err = init_inode(inode, dentry);
++      if(err)
++              goto put;
++
++      path = dentry_name(dentry, 0);
++      if(path == NULL){
++              err = -ENOMEM;
++              goto put;
++      }
++
++      init_special_inode(inode, mode, dev);
++      err = do_mknod(path, mode, dev);
++      if(!err)
++              err = read_name(inode, path);
++      kfree(path);
++      if(err)
++              goto put;
++
++      d_instantiate(dentry, inode);
++      return(0);
++
++ put:
++      iput(inode);
++      return(err);
++}
++
++/* Rename a host file from the path of "from" inside "from_ino" to the
++ * path of "to" inside "to_ino".  Returns -ENOMEM if either path cannot be
++ * built, otherwise the result of rename_file().
++ */
++int hostfs_rename(struct inode *from_ino, struct dentry *from,
++                struct inode *to_ino, struct dentry *to)
++{
++      char *src, *dst;
++      int err = -ENOMEM;
++
++      src = inode_dentry_name(from_ino, from);
++      if(src == NULL)
++              return(err);
++
++      dst = inode_dentry_name(to_ino, to);
++      if(dst != NULL){
++              err = rename_file(src, dst);
++              kfree(dst);
++      }
++      kfree(src);
++      return(err);
++}
++
++/* truncate hook: not implemented here; it only calls not_implemented(). */
++void hostfs_truncate(struct inode *ino)
++{
++      not_implemented();
++}
++
++/* Permission check: the host must grant the requested access through
++ * access_file() and vfs_permission() must agree.  Returns -ENOMEM if the
++ * host path cannot be built, the access_file() error if the host refuses,
++ * and otherwise the result of vfs_permission().
++ */
++int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
++{
++      int r = (desired & MAY_READ) ? 1 : 0;
++      int w = (desired & MAY_WRITE) ? 1 : 0;
++      int x = (desired & MAY_EXEC) ? 1 : 0;
++      char *path;
++      int err;
++
++      path = inode_name(ino, 0);
++      if(path == NULL)
++              return(-ENOMEM);
++
++      err = access_file(path, r, w, x);
++      kfree(path);
++      if(err)
++              return(err);
++
++      return(vfs_permission(ino, desired));
++}
++
++/* Apply attribute changes to the host file behind "dentry".
++ *
++ * The kernel iattr is translated flag by flag into a hostfs_iattr, which
++ * set_attr() applies to the host file; if that succeeds the same change
++ * is applied to the in-kernel inode with inode_setattr().
++ *
++ * Returns -ENOMEM if the host path cannot be built, the set_attr() error
++ * if the host refuses, and otherwise the result of inode_setattr().
++ */
++int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
++{
++      struct hostfs_iattr attrs;
++      char *name;
++      int err;
++      
++      /* With "append" set, size changes are silently dropped */
++      if(append) 
++              attr->ia_valid &= ~ATTR_SIZE;
++
++      attrs.ia_valid = 0;
++      if(attr->ia_valid & ATTR_MODE){
++              attrs.ia_valid |= HOSTFS_ATTR_MODE;
++              attrs.ia_mode = attr->ia_mode;
++      }
++      if(attr->ia_valid & ATTR_UID){
++              /* On the root device, uid 0 is replaced by getuid() */
++              if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && 
++                 (attr->ia_uid == 0))
++                      attr->ia_uid = getuid();
++              attrs.ia_valid |= HOSTFS_ATTR_UID;
++              attrs.ia_uid = attr->ia_uid;
++      }
++      if(attr->ia_valid & ATTR_GID){
++              /* NOTE(review): gid 0 is replaced by getuid(), not a group
++               * id - confirm whether a getgid() equivalent was intended.
++               */
++              if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && 
++                 (attr->ia_gid == 0))
++                      attr->ia_gid = getuid();
++              attrs.ia_valid |= HOSTFS_ATTR_GID;
++              attrs.ia_gid = attr->ia_gid;
++      }
++      if(attr->ia_valid & ATTR_SIZE){
++              attrs.ia_valid |= HOSTFS_ATTR_SIZE;
++              attrs.ia_size = attr->ia_size;
++      }
++      if(attr->ia_valid & ATTR_ATIME){
++              attrs.ia_valid |= HOSTFS_ATTR_ATIME;
++              attrs.ia_atime = attr->ia_atime;
++      }
++      if(attr->ia_valid & ATTR_MTIME){
++              attrs.ia_valid |= HOSTFS_ATTR_MTIME;
++              attrs.ia_mtime = attr->ia_mtime;
++      }
++      if(attr->ia_valid & ATTR_CTIME){
++              attrs.ia_valid |= HOSTFS_ATTR_CTIME;
++              attrs.ia_ctime = attr->ia_ctime;
++      }
++      if(attr->ia_valid & ATTR_ATIME_SET){
++              attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;
++      }
++      if(attr->ia_valid & ATTR_MTIME_SET){
++              attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
++      }
++      name = dentry_name(dentry, 0);
++      if(name == NULL) return(-ENOMEM);
++      err = set_attr(name, &attrs);
++      kfree(name);
++      if(err)
++              return(err);
++
++      return(inode_setattr(dentry->d_inode, attr));
++}
++
++/* getattr hook: fills "stat" from the in-kernel inode through
++ * generic_fillattr() and always returns 0.
++ */
++int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 
++         struct kstat *stat)
++{
++      generic_fillattr(dentry->d_inode, stat);
++      return(0);
++}
++
++/* Inode operations installed by init_inode() on inodes that are neither
++ * directories nor symlinks.  Same entries as hostfs_dir_iops, minus
++ * .lookup.
++ */
++static struct inode_operations hostfs_iops = {
++      .create         = hostfs_create,
++      .link           = hostfs_link,
++      .unlink         = hostfs_unlink,
++      .symlink        = hostfs_symlink,
++      .mkdir          = hostfs_mkdir,
++      .rmdir          = hostfs_rmdir,
++      .mknod          = hostfs_mknod,
++      .rename         = hostfs_rename,
++      .truncate       = hostfs_truncate,
++      .permission     = hostfs_permission,
++      .setattr        = hostfs_setattr,
++      .getattr        = hostfs_getattr,
++};
++
++/* Inode operations installed by init_inode() on directories; the only
++ * table in this file that provides .lookup.
++ */
++static struct inode_operations hostfs_dir_iops = {
++      .create         = hostfs_create,
++      .lookup         = hostfs_lookup,
++      .link           = hostfs_link,
++      .unlink         = hostfs_unlink,
++      .symlink        = hostfs_symlink,
++      .mkdir          = hostfs_mkdir,
++      .rmdir          = hostfs_rmdir,
++      .mknod          = hostfs_mknod,
++      .rename         = hostfs_rename,
++      .truncate       = hostfs_truncate,
++      .permission     = hostfs_permission,
++      .setattr        = hostfs_setattr,
++      .getattr        = hostfs_getattr,
++};
++
++/* readpage for symlink inodes: read the host link target into the page.
++ *
++ * The page is unmapped and unlocked on every path, including the one
++ * where the host path cannot be allocated.  A target that fills the whole
++ * page is rejected with -E2BIG.
++ *
++ * Returns 0 on success, -ENOMEM, -E2BIG, or the negative error from
++ * do_readlink().
++ */
++int hostfs_link_readpage(struct file *file, struct page *page)
++{
++      char *buffer, *name;
++      int err;
++
++      buffer = kmap(page);
++      name = inode_name(page->mapping->host, 0);
++      if(name == NULL){
++              err = -ENOMEM;
++              goto out;
++      }
++      err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
++      kfree(name);
++      if(err == PAGE_CACHE_SIZE)
++              err = -E2BIG;
++      else if(err > 0){
++              flush_dcache_page(page);
++              SetPageUptodate(page);
++              if (PageError(page)) ClearPageError(page);
++              err = 0;
++      }
++ out:
++      kunmap(page);
++      unlock_page(page);
++      return(err);
++}
++
++/* Address-space operations installed by init_inode() on symlink inodes. */
++static struct address_space_operations hostfs_link_aops = {
++      .readpage       = hostfs_link_readpage,
++};
++
++/* Fill in a freshly allocated hostfs superblock.
++ *
++ * "d" is the mount data naming the host directory to expose; when it is
++ * NULL or empty, root_ino is used instead.  A private copy of that path is
++ * stored on the root inode as its host_filename.
++ *
++ * Ownership on the error paths:
++ *  - once the path copy has been stored on the root inode it is freed by
++ *    hostfs_destroy_inode() when the inode goes away, so the local pointer
++ *    is cleared to keep the final kfree() from freeing it a second time;
++ *  - once d_alloc_root() has succeeded the root dentry owns the inode
++ *    reference, so a later failure drops the dentry instead of calling
++ *    iput() directly.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
++ * init_inode() or read_inode().
++ */
++static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
++{
++      struct inode *root_inode;
++      char *name, *data = d;
++      int err;
++
++      sb->s_blocksize = 1024;
++      sb->s_blocksize_bits = 10;
++      sb->s_magic = HOSTFS_SUPER_MAGIC;
++      sb->s_op = &hostfs_sbops;
++
++      if((data == NULL) || (*data == '\0'))
++              data = root_ino;
++
++      err = -ENOMEM;
++      name = kmalloc(strlen(data) + 1, GFP_KERNEL);
++      if(name == NULL)
++              goto out;
++
++      strcpy(name, data);
++
++      root_inode = iget(sb, 0);
++      if(root_inode == NULL)
++              goto out_free;
++
++      err = init_inode(root_inode, NULL);
++      if(err)
++              goto out_put;
++
++      HOSTFS_I(root_inode)->host_filename = name;
++      /* The inode owns the string from here on */
++      name = NULL;
++
++      err = -ENOMEM;
++      sb->s_root = d_alloc_root(root_inode);
++      if(sb->s_root == NULL)
++              goto out_put;
++
++      err = read_inode(root_inode);
++      if(err){
++              /* No iput here: dropping the root dentry releases the inode */
++              dput(sb->s_root);
++              sb->s_root = NULL;
++              goto out;
++      }
++
++      return(0);
++
++ out_put:
++      iput(root_inode);
++ out_free:
++      kfree(name);
++ out:
++      return(err);
++}
++
++/* get_sb hook: obtains the superblock through get_sb_nodev(), with
++ * hostfs_fill_sb_common() as the fill callback.  "dev_name" is not used.
++ */
++static struct super_block *hostfs_read_sb(struct file_system_type *type,
++                                           int flags, const char *dev_name,
++                                           void *data)
++{
++      return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
++}
++
++/* Filesystem type registered under the name "hostfs". */
++static struct file_system_type hostfs_type = {
++      .owner          = THIS_MODULE,
++      .name           = "hostfs",
++      .get_sb         = hostfs_read_sb,
++      .kill_sb        = kill_anon_super,
++      .fs_flags       = 0,
++};
++
++/* Module init: registers hostfs_type and returns the result of
++ * register_filesystem().
++ */
++static int __init init_hostfs(void)
++{
++      return(register_filesystem(&hostfs_type));
++}
++
++/* Module exit: unregisters hostfs_type. */
++static void __exit exit_hostfs(void)
++{
++      unregister_filesystem(&hostfs_type);
++}
++
++/* Hook the init/exit functions into the module machinery */
++module_init(init_hostfs)
++module_exit(exit_hostfs)
++MODULE_LICENSE("GPL");
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
+--- a/fs/hostfs/hostfs_user.c  Wed Dec 31 19:00:00 1969
++++ b/fs/hostfs/hostfs_user.c  Fri Aug 15 15:10:43 2003
+@@ -0,0 +1,361 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdio.h>
++#include <fcntl.h>
++#include <dirent.h>
++#include <errno.h>
++#include <utime.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <sys/time.h>
++#include <sys/vfs.h>
++#include "hostfs.h"
++#include "kern_util.h"
++#include "user.h"
++
++/* lstat64() "path" and copy out whichever fields the caller asked for.
++ *
++ * Every *_out pointer may be NULL, in which case that field is skipped.
++ * Timestamps are reported with tv_nsec forced to 0.  Returns 0 on success
++ * or -errno from lstat64().
++ */
++int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
++            int *nlink_out, int *uid_out, int *gid_out, 
++            unsigned long long *size_out, struct timespec *atime_out,
++            struct timespec *mtime_out, struct timespec *ctime_out,
++            int *blksize_out, unsigned long long *blocks_out)
++{
++      struct stat64 st;
++
++      if(lstat64(path, &st) < 0)
++              return(-errno);
++
++      /* See the Makefile for why STAT64_INO_FIELD is passed in
++       * by the build
++       */
++      if(inode_out != NULL)
++              *inode_out = st.STAT64_INO_FIELD;
++      if(mode_out != NULL)
++              *mode_out = st.st_mode;
++      if(nlink_out != NULL)
++              *nlink_out = st.st_nlink;
++      if(uid_out != NULL)
++              *uid_out = st.st_uid;
++      if(gid_out != NULL)
++              *gid_out = st.st_gid;
++      if(size_out != NULL)
++              *size_out = st.st_size;
++      if(atime_out != NULL){
++              atime_out->tv_sec = st.st_atime;
++              atime_out->tv_nsec = 0;
++      }
++      if(mtime_out != NULL){
++              mtime_out->tv_sec = st.st_mtime;
++              mtime_out->tv_nsec = 0;
++      }
++      if(ctime_out != NULL){
++              ctime_out->tv_sec = st.st_ctime;
++              ctime_out->tv_nsec = 0;
++      }
++      if(blksize_out != NULL)
++              *blksize_out = st.st_blksize;
++      if(blocks_out != NULL)
++              *blocks_out = st.st_blocks;
++      return(0);
++}
++
++/* Classify the host file at "path" without following a final symlink.
++ *
++ * Returns one of the OS_TYPE_* codes, or -errno if lstat64() fails.  When
++ * "rdev" is non-NULL it receives st_rdev.
++ */
++int file_type(const char *path, int *rdev)
++{
++      struct stat64 st;
++
++      if(lstat64(path, &st) < 0)
++              return(-errno);
++      if(rdev != NULL)
++              *rdev = st.st_rdev;
++
++      switch(st.st_mode & S_IFMT){
++      case S_IFDIR:
++              return(OS_TYPE_DIR);
++      case S_IFLNK:
++              return(OS_TYPE_SYMLINK);
++      case S_IFCHR:
++              return(OS_TYPE_CHARDEV);
++      case S_IFBLK:
++              return(OS_TYPE_BLOCKDEV);
++      case S_IFIFO:
++              return(OS_TYPE_FIFO);
++      case S_IFSOCK:
++              return(OS_TYPE_SOCK);
++      default:
++              return(OS_TYPE_FILE);
++      }
++}
++
++/* Check host access to "path" for the requested combination of read,
++ * write and execute.  Returns 0 if access() allows it, -errno otherwise.
++ */
++int access_file(char *path, int r, int w, int x)
++{
++      int want = 0;
++
++      if(r)
++              want |= R_OK;
++      if(w)
++              want |= W_OK;
++      if(x)
++              want |= X_OK;
++
++      if(access(path, want) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Open the host file "path" for reading and/or writing, optionally with
++ * O_APPEND.  Panics if neither r nor w is set.  Returns the descriptor or
++ * -errno.
++ */
++int open_file(char *path, int r, int w, int append)
++{
++      int flags = 0, fd;
++
++      if(r && w)
++              flags = O_RDWR;
++      else if(r)
++              flags = O_RDONLY;
++      else if(w)
++              flags = O_WRONLY;
++      else panic("Impossible mode in open_file");
++
++      if(append)
++              flags |= O_APPEND;
++
++      fd = open64(path, flags);
++      if(fd < 0)
++              return(-errno);
++      return(fd);
++}
++
++/* Open the host directory "path".  Returns the directory stream, or NULL
++ * on failure.  *err_out is set to errno as it stands after opendir(), so
++ * it is only meaningful when NULL is returned.
++ */
++void *open_dir(char *path, int *err_out)
++{
++      DIR *stream = opendir(path);
++
++      *err_out = errno;
++      return(stream);
++}
++
++/* Read the directory entry at position *pos.
++ *
++ * Seeks the stream to *pos, reads one entry, and on success stores the
++ * name length, the inode number and the position of the next entry.
++ * Returns the entry name, which points into the dirent returned by
++ * readdir(), or NULL when readdir() returns nothing.
++ */
++char *read_dir(void *stream, unsigned long long *pos, 
++             unsigned long long *ino_out, int *len_out)
++{
++      DIR *dir = stream;
++      struct dirent *entry;
++
++      seekdir(dir, *pos);
++      entry = readdir(dir);
++      if(entry == NULL)
++              return(NULL);
++
++      *len_out = strlen(entry->d_name);
++      *ino_out = entry->d_ino;
++      *pos = telldir(dir);
++      return(entry->d_name);
++}
++
++/* pread64() up to "len" bytes at *offset into "buf" and advance *offset by
++ * the number of bytes read.  Returns that number, or -errno.
++ */
++int read_file(int fd, unsigned long long *offset, char *buf, int len)
++{
++      int got = pread64(fd, buf, len, *offset);
++
++      if(got < 0)
++              return(-errno);
++
++      *offset += got;
++      return(got);
++}
++
++/* pwrite64() "len" bytes from "buf" at *offset and advance *offset by the
++ * number of bytes written.  Returns that number, or -errno.
++ */
++int write_file(int fd, unsigned long long *offset, const char *buf, int len)
++{
++      int put = pwrite64(fd, buf, len, *offset);
++
++      if(put < 0)
++              return(-errno);
++
++      *offset += put;
++      return(put);
++}
++
++/* Reposition "fd" with lseek64().  Returns 0 on success or -errno.
++ *
++ * The result is kept in a 64-bit variable: squeezed into an int, a valid
++ * offset of 2GB or more could look negative and be reported as a failure
++ * with a stale errno.
++ */
++int lseek_file(int fd, long long offset, int whence)
++{
++      long long ret;
++
++      ret = lseek64(fd, offset, whence);
++      if(ret < 0) return(-errno);
++      return(0);
++}
++
++/* Close a host descriptor.  "stream" points at the int holding it; the
++ * stored value is not modified.
++ */
++void close_file(void *stream)
++{
++      int *fdp = stream;
++
++      close(*fdp);
++}
++
++/* Close a directory stream obtained from open_dir(). */
++void close_dir(void *stream)
++{
++      DIR *dir = stream;
++
++      closedir(dir);
++}
++
++/* Create (or open, if it already exists) the host file "name" read/write.
++ * Each of the nine flags, when non-zero, turns on the matching user,
++ * group or other permission bit.  Returns the descriptor or -errno.
++ */
++int file_create(char *name, int ur, int uw, int ux, int gr, 
++              int gw, int gx, int or, int ow, int ox)
++{
++      int perms = 0, fd;
++
++      if(ur) perms |= S_IRUSR;
++      if(uw) perms |= S_IWUSR;
++      if(ux) perms |= S_IXUSR;
++      if(gr) perms |= S_IRGRP;
++      if(gw) perms |= S_IWGRP;
++      if(gx) perms |= S_IXGRP;
++      if(or) perms |= S_IROTH;
++      if(ow) perms |= S_IWOTH;
++      if(ox) perms |= S_IXOTH;
++
++      fd = open64(name, O_CREAT | O_RDWR, perms);
++      if(fd < 0)
++              return(-errno);
++      return(fd);
++}
++
++/* Apply the attribute changes flagged in attrs->ia_valid to the host file
++ * "file", in this order: mode, uid, gid, size, then timestamps.
++ *
++ * The first failing step aborts with its -errno (or the stat_file error);
++ * changes already applied are not undone.  Returns 0 on success.
++ */
++int set_attr(const char *file, struct hostfs_iattr *attrs)
++{
++      struct utimbuf buf;
++      int err, ma;
++
++      if(attrs->ia_valid & HOSTFS_ATTR_MODE){
++              if(chmod(file, attrs->ia_mode) != 0) return(-errno);
++      }
++      /* uid and gid are changed separately; -1 leaves the other id alone */
++      if(attrs->ia_valid & HOSTFS_ATTR_UID){
++              if(chown(file, attrs->ia_uid, -1)) return(-errno);
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_GID){
++              if(chown(file, -1, attrs->ia_gid)) return(-errno);
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
++              if(truncate(file, attrs->ia_size)) return(-errno);
++      }
++      ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
++      if((attrs->ia_valid & ma) == ma){
++              /* Both times given explicitly: one utime() call sets both */
++              buf.actime = attrs->ia_atime.tv_sec;
++              buf.modtime = attrs->ia_mtime.tv_sec;
++              if(utime(file, &buf) != 0) return(-errno);
++      }
++      else {
++              struct timespec ts;
++
++              /* utime() always sets both times, so when only one is given
++               * the other is read back from the file first and rewritten
++               * unchanged.
++               */
++              if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
++                      /* ts receives the current mtime */
++                      err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
++                                      NULL, NULL, &ts, NULL, NULL, NULL);
++                      if(err != 0) 
++                              return(err);
++                      buf.actime = attrs->ia_atime.tv_sec;
++                      buf.modtime = ts.tv_sec;
++                      if(utime(file, &buf) != 0) 
++                              return(-errno);
++              }
++              if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
++                      /* ts receives the current atime */
++                      err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
++                                      NULL, &ts, NULL, NULL, NULL, NULL);
++                      if(err != 0) 
++                              return(err);
++                      buf.actime = ts.tv_sec;
++                      buf.modtime = attrs->ia_mtime.tv_sec;
++                      if(utime(file, &buf) != 0) 
++                              return(-errno);
++              }
++      }
++      /* HOSTFS_ATTR_CTIME is deliberately a no-op here */
++      if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
++      /* Report the resulting atime/mtime back to the caller via attrs */
++      if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){
++              err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 
++                              &attrs->ia_atime, &attrs->ia_mtime, NULL, 
++                              NULL, NULL);
++              if(err != 0) return(err);
++      }
++      return(0);
++}
++
++/* Create a host symlink at "from" whose target is "to".  Returns 0 or
++ * -errno.
++ */
++int make_symlink(const char *from, const char *to)
++{
++      if(symlink(to, from) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Remove the host file "file".  Returns 0 or -errno. */
++int unlink_file(const char *file)
++{
++      if(unlink(file) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Create the host directory "file" with permissions "mode".  Returns 0 or
++ * -errno.
++ */
++int do_mkdir(const char *file, int mode)
++{
++      if(mkdir(file, mode) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Remove the host directory "file".  Returns 0 or -errno. */
++int do_rmdir(const char *file)
++{
++      if(rmdir(file) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Create the host special file "file" with mode "mode" and device number
++ * "dev".  Returns 0 or -errno.
++ */
++int do_mknod(const char *file, int mode, int dev)
++{
++      if(mknod(file, mode, dev) != 0)
++              return(-errno);
++      return(0);
++}
++
++/* Create the host hard link "from" referring to the existing file "to".
++ * Returns 0 or -errno.
++ */
++int link_file(const char *to, const char *from)
++{
++      if(link(to, from) != 0)
++              return(-errno);
++      return(0);
++}
++
++/*
++ * Read the target of the symlink "file" into "buf" (capacity "size").
++ * Returns the number of bytes stored, or the negated host errno.
++ * The result is NUL-terminated only when it is shorter than "size"; a
++ * result of exactly "size" bytes is left unterminated.
++ */
++int do_readlink(char *file, char *buf, int size)
++{
++      int len = readlink(file, buf, size);
++
++      if(len < 0)
++              return(-errno);
++
++      /* readlink() does not terminate the string itself */
++      if(len < size)
++              buf[len] = '\0';
++
++      return(len);
++}
++
++/*
++ * Rename the host path "from" to "to".
++ * Returns 0 on success, or the negated host errno on failure.
++ */
++int rename_file(char *from, char *to)
++{
++      if(rename(from, to) < 0)
++              return(-errno);
++
++      return(0);
++}
++
++/*
++ * Query the host filesystem containing "root" with statfs64() and hand
++ * the individual fields back through the *_out pointers.
++ *
++ * At most fsid_size bytes of the filesystem id are copied to fsid_out.
++ * spare_out must have room for six longs.
++ * Returns 0 on success, or the negated host errno on failure (in which
++ * case none of the outputs are written).
++ */
++int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
++            long long *bfree_out, long long *bavail_out, 
++            long long *files_out, long long *ffree_out,
++            void *fsid_out, int fsid_size, long *namelen_out, 
++            long *spare_out)
++{
++      struct statfs64 st;
++      int i;
++
++      if(statfs64(root, &st) < 0)
++              return(-errno);
++
++      *bsize_out = st.f_bsize;
++      *blocks_out = st.f_blocks;
++      *bfree_out = st.f_bfree;
++      *bavail_out = st.f_bavail;
++      *files_out = st.f_files;
++      *ffree_out = st.f_ffree;
++
++      /* never copy more than the caller's fsid buffer can hold */
++      memcpy(fsid_out, &st.f_fsid, 
++             sizeof(st.f_fsid) > fsid_size ? fsid_size : 
++             sizeof(st.f_fsid));
++
++      *namelen_out = st.f_namelen;
++      for(i = 0; i < 6; i++)
++              spare_out[i] = st.f_spare[i];
++
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/fs/hppfs/Makefile b/fs/hppfs/Makefile
+--- a/fs/hppfs/Makefile        Wed Dec 31 19:00:00 1969
++++ b/fs/hppfs/Makefile        Fri Aug 15 15:12:31 2003
+@@ -0,0 +1,19 @@
++# 
++# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++# hppfs.o is composed of the single object hppfs_kern.o
++hppfs-objs := hppfs_kern.o
++
++obj-y = 
++# hppfs.o is added to obj-y or obj-m according to the value of CONFIG_HPPFS
++obj-$(CONFIG_HPPFS) += hppfs.o
++
++# The targets below have no recipes of their own.
++clean:
++
++modules:
++
++fastdep:
++
++dep:
++
++archmrproper: clean
+diff -Naur a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
+--- a/fs/hppfs/hppfs_kern.c    Wed Dec 31 19:00:00 1969
++++ b/fs/hppfs/hppfs_kern.c    Fri Aug 15 15:11:52 2003
+@@ -0,0 +1,811 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <linux/fs.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/kernel.h>
++#include <linux/ctype.h>
++#include <linux/dcache.h>
++#include <linux/statfs.h>
++#include <asm/uaccess.h>
++#include <asm/fcntl.h>
++#include "os.h"
++
++static int init_inode(struct inode *inode, struct dentry *dentry);
++
++/*
++ * One chunk of buffered file contents.  Chunks are chained through "list";
++ * the contents array is sized so that it and the list head together add up
++ * to PAGE_SIZE bytes.
++ */
++struct hppfs_data {
++      struct list_head list;
++      char contents[PAGE_SIZE - sizeof(struct list_head)];
++};
++
++/* Per-open state, stored in file->private_data by the open routines. */
++struct hppfs_private {
++      struct file proc_file;          /* private file opened on the proc dentry */
++      int host_fd;                    /* host descriptor, or -1 when there is none */
++      loff_t len;                     /* byte count filled in by hppfs_get_data() */
++      struct hppfs_data *contents;    /* head of the buffered chunk list, or NULL */
++};
++
++/*
++ * hppfs inode: the VFS inode embedded next to the proc dentry it mirrors.
++ * proc_dentry is NULL until init_inode() sets it.
++ */
++struct hppfs_inode_info {
++        struct dentry *proc_dentry;
++      struct inode vfs_inode;
++};
++
++/*
++ * Map a VFS inode back to the hppfs_inode_info that embeds it.
++ * list_entry() is used here simply to step from the vfs_inode member to
++ * its containing structure; no list is involved.
++ */
++static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
++{
++      return(list_entry(inode, struct hppfs_inode_info, vfs_inode));
++}
++
++#define HPPFS_SUPER_MAGIC 0xb00000ee
++
++static struct super_operations hppfs_sbops;
++
++/*
++ * Return 1 when "dentry" sits directly under the root of an hppfs
++ * superblock and its name consists only of decimal digits, 0 otherwise.
++ */
++static int is_pid(struct dentry *dentry)
++{
++      struct super_block *sb;
++      int i;
++
++      sb = dentry->d_sb;
++      if(sb->s_op != &hppfs_sbops)
++              return(0);
++      if(dentry->d_parent != sb->s_root)
++              return(0);
++
++      i = 0;
++      while(i < dentry->d_name.len){
++              if(!isdigit(dentry->d_name.name[i]))
++                      return(0);
++              i++;
++      }
++
++      return(1);
++}
++
++/*
++ * Build the host-side path for "dentry": the literal prefix "proc"
++ * followed by one "/segment" per path component up to the root, with any
++ * component for which is_pid() is true replaced by the literal "pid".
++ *
++ * "extra" bytes beyond the path and its terminating NUL are allocated so
++ * the caller can append a suffix in place.  Returns a kmalloc'd string
++ * the caller must kfree, or NULL if the allocation fails.
++ */
++static char *dentry_name(struct dentry *dentry, int extra)
++{
++      struct dentry *parent;
++      char *root, *name;
++      const char *seg_name;
++      int len, seg_len;
++
++      /* first pass: total length of all "/segment" pieces */
++      len = 0;
++      parent = dentry;
++      while(parent->d_parent != parent){
++              if(is_pid(parent))
++                      len += strlen("pid") + 1;
++              else len += parent->d_name.len + 1;
++              parent = parent->d_parent;
++      }
++      
++      root = "proc";
++      len += strlen(root);
++      name = kmalloc(len + extra + 1, GFP_KERNEL);
++      if(name == NULL) return(NULL);
++
++      /* second pass: fill the buffer from the end towards the front */
++      name[len] = '\0';
++      parent = dentry;
++      while(parent->d_parent != parent){
++              if(is_pid(parent)){
++                      seg_name = "pid";
++                      seg_len = strlen("pid");
++              }
++              else {
++                      seg_name = parent->d_name.name;
++                      seg_len = parent->d_name.len;
++              }
++
++              len -= seg_len + 1;
++              name[len] = '/';
++              /* exactly seg_len bytes are copied, so no NUL is written here */
++              strncpy(&name[len + 1], seg_name, seg_len);
++              parent = parent->d_parent;
++      }
++      /* the prefix occupies the bytes left at the front of the buffer */
++      strncpy(name, root, strlen(root));
++      return(name);
++}
++
++/* Dentry operations installed by hppfs_lookup(); no callbacks are set. */
++struct dentry_operations hppfs_dentry_ops = {
++};
++
++/*
++ * Check whether the host marks an entry as removed, i.e. whether
++ * "<host path of dentry>[/file]/remove" can be opened for reading.
++ *
++ * dentry - directory (or the entry itself when file is NULL)
++ * file   - optional entry name below dentry, may be NULL
++ *
++ * Returns 1 if the marker could be opened, 0 if not, and -ENOMEM if the
++ * path buffer could not be allocated.
++ */
++static int file_removed(struct dentry *dentry, const char *file)
++{
++      char *host_file;
++      int extra, fd;
++
++      /* room for an optional "/<file>" plus the "/remove" suffix */
++      extra = 0;
++      if(file != NULL) extra += strlen(file) + 1;
++
++      host_file = dentry_name(dentry, extra + strlen("/remove"));
++      if(host_file == NULL){
++              printk("file_removed : allocation failed\n");
++              return(-ENOMEM);
++      }
++
++      if(file != NULL){
++              strcat(host_file, "/");
++              strcat(host_file, file);
++      }
++      strcat(host_file, "/remove");
++
++      fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++      kfree(host_file);
++      /*
++       * Descriptor 0 is a valid result too; testing "fd > 0" would report
++       * the marker as absent and leak the descriptor.
++       */
++      if(fd >= 0){
++              os_close_file(fd);
++              return(1);
++      }
++      return(0);
++}
++
++/*
++ * Copy ownership, timestamps, inode number, mode, link count and size
++ * information from the mirrored proc inode into "ino".  Does nothing when
++ * the inode has no proc dentry attached yet.
++ */
++static void hppfs_read_inode(struct inode *ino)
++{
++      struct inode *proc_ino;
++
++      if(HPPFS_I(ino)->proc_dentry == NULL)
++              return;
++
++      proc_ino = HPPFS_I(ino)->proc_dentry->d_inode;
++      ino->i_uid = proc_ino->i_uid;
++      ino->i_gid = proc_ino->i_gid;
++      ino->i_atime = proc_ino->i_atime;
++      ino->i_mtime = proc_ino->i_mtime;
++      ino->i_ctime = proc_ino->i_ctime;
++      ino->i_ino = proc_ino->i_ino;
++      ino->i_mode = proc_ino->i_mode;
++      ino->i_nlink = proc_ino->i_nlink;
++      ino->i_size = proc_ino->i_size;
++      ino->i_blksize = proc_ino->i_blksize;
++      ino->i_blocks = proc_ino->i_blocks;
++}
++
++/*
++ * Directory lookup.  Fails with -ENOENT when file_removed() reports the
++ * entry as removed; otherwise finds (or creates via the parent proc
++ * inode's own lookup operation) the dentry of the same name under the
++ * parent's proc dentry, and binds a new hppfs inode to it.
++ *
++ * Returns NULL on success after d_add()ing the new inode to "dentry", or
++ * an ERR_PTR on failure.
++ */
++static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, 
++                                  struct nameidata *nd)
++{
++      struct dentry *proc_dentry, *new, *parent;
++      struct inode *inode;
++      int err, deleted;
++
++      deleted = file_removed(dentry, NULL);
++      if(deleted < 0)
++              return(ERR_PTR(deleted));
++      else if(deleted)
++              return(ERR_PTR(-ENOENT));
++
++      err = -ENOMEM;
++      parent = HPPFS_I(ino)->proc_dentry;
++      /* the proc directory's i_sem is held across the lookup below */
++      down(&parent->d_inode->i_sem);
++      proc_dentry = d_lookup(parent, &dentry->d_name);
++      if(proc_dentry == NULL){
++              proc_dentry = d_alloc(parent, &dentry->d_name);
++              if(proc_dentry == NULL){
++                      up(&parent->d_inode->i_sem);
++                      goto out;
++              }
++              new = (*parent->d_inode->i_op->lookup)(parent->d_inode, 
++                                                     proc_dentry, NULL);
++              /* a non-NULL result (possibly an ERR_PTR) replaces our dentry */
++              if(new){
++                      dput(proc_dentry);
++                      proc_dentry = new;
++              }
++      }
++      up(&parent->d_inode->i_sem);
++
++      if(IS_ERR(proc_dentry))
++              return(proc_dentry);
++
++      /* NOTE(review): init_inode() dereferences proc_dentry->d_inode;
++       * this assumes the proc lookup never leaves a negative dentry here
++       * - confirm.
++       */
++      inode = iget(ino->i_sb, 0);
++      if(inode == NULL) 
++              goto out_dput;
++
++      err = init_inode(inode, proc_dentry);
++      if(err) 
++              goto out_put;
++      
++      hppfs_read_inode(inode);
++
++      d_add(dentry, inode);
++      dentry->d_op = &hppfs_dentry_ops;
++      return(NULL);
++
++ out_put:
++      iput(inode);
++ out_dput:
++      dput(proc_dentry);
++ out:
++      return(ERR_PTR(err));
++}
++
++/* Inode operations used by init_inode() for non-directory, non-symlink inodes; no callbacks are set. */
++static struct inode_operations hppfs_file_iops = {
++};
++
++/*
++ * Read from the underlying proc file by calling its own read operation.
++ *
++ * file    - private struct file opened on the proc dentry
++ * buf     - destination; a user pointer when is_user is non-zero,
++ *           otherwise a kernel pointer
++ * count   - maximum number of bytes to read
++ * ppos    - if non-NULL, receives the file position after the read
++ * is_user - non-zero when buf is a user-space address
++ *
++ * The read always advances file->f_pos.  Returns whatever the proc read
++ * operation returns.
++ */
++static ssize_t read_proc(struct file *file, char *buf, ssize_t count, 
++                       loff_t *ppos, int is_user)
++{
++      ssize_t (*read)(struct file *, char *, size_t, loff_t *);
++      mm_segment_t old_fs;
++      ssize_t n;
++
++      read = file->f_dentry->d_inode->i_fop->read;
++
++      /*
++       * For a kernel buffer the address limit has to be widened for the
++       * duration of the call.  Restore the value that was in force on
++       * entry rather than assuming it was USER_DS.
++       */
++      old_fs = get_fs();
++      if(!is_user)
++              set_fs(KERNEL_DS);
++              
++      n = (*read)(file, buf, count, &file->f_pos);
++
++      if(!is_user)
++              set_fs(old_fs);
++
++      if(ppos) *ppos = file->f_pos;
++      return(n);
++}
++
++/*
++ * Copy up to "count" bytes from host descriptor "fd" to the user buffer
++ * "buf", going through a one-page kernel bounce buffer.
++ *
++ * Returns the number of bytes copied (possibly short if the host reports
++ * end of file), -ENOMEM if the bounce buffer cannot be allocated,
++ * -EFAULT if the user copy fails, or the negative value returned by
++ * os_read_file().
++ */
++static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
++{
++      ssize_t n;
++      int cur, err;
++      char *new_buf;
++
++      n = -ENOMEM;
++      new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
++      if(new_buf == NULL){
++              printk("hppfs_read_file : kmalloc failed\n");
++              goto out;
++      }
++      n = 0;
++      while(count > 0){
++              cur = min_t(ssize_t, count, PAGE_SIZE);
++              err = os_read_file(fd, new_buf, cur);
++              if(err < 0){
++                      /* report the failure code, not the remaining count */
++                      printk("hppfs_read : read failed, errno = %d\n",
++                             err);
++                      n = err;
++                      goto out_free;
++              }
++              else if(err == 0)
++                      break;
++
++              if(copy_to_user(buf, new_buf, err)){
++                      n = -EFAULT;
++                      goto out_free;
++              }
++              /* advance the destination so the next chunk is appended
++               * instead of overwriting the one just copied
++               */
++              buf += err;
++              n += err;
++              count -= err;
++      }
++ out_free:
++      kfree(new_buf);
++ out:
++      return(n);
++}
++
++/*
++ * read() for hppfs files.  The data comes from one of three places,
++ * tried in this order:
++ *   - the chunk list buffered at open time (hppfs->contents),
++ *   - the host file behind hppfs->host_fd,
++ *   - the underlying proc file.
++ *
++ * When serving from the chunk list a single call never crosses a chunk
++ * boundary, so it may return fewer bytes than requested.
++ *
++ * Returns the number of bytes read, 0 at end of data, or a negative
++ * error code.  *ppos is advanced only by bytes actually delivered.
++ */
++static ssize_t hppfs_read(struct file *file, char *buf, size_t count, 
++                        loff_t *ppos)
++{
++      struct hppfs_private *hppfs = file->private_data;
++      struct hppfs_data *data;
++      ssize_t n;
++      loff_t off;
++      int err;
++
++      if(hppfs->contents != NULL){
++              if(*ppos >= hppfs->len) return(0);
++
++              /* walk to the chunk holding *ppos; off ends up chunk-relative */
++              data = hppfs->contents;
++              off = *ppos;
++              while(off >= sizeof(data->contents)){
++                      data = list_entry(data->list.next, struct hppfs_data,
++                                        list);
++                      off -= sizeof(data->contents);
++              }
++
++              /* clamp against the end of the data using the absolute
++               * position, not the chunk-relative offset
++               */
++              if(*ppos + count > hppfs->len)
++                      count = hppfs->len - *ppos;
++              /* and never read past the end of this chunk */
++              if(count > sizeof(data->contents) - off)
++                      count = sizeof(data->contents) - off;
++              if(copy_to_user(buf, &data->contents[off], count))
++                      return(-EFAULT);
++              *ppos += count;
++              n = count;
++      }
++      else if(hppfs->host_fd != -1){
++              err = os_seek_file(hppfs->host_fd, *ppos);
++              if(err){
++                      printk("hppfs_read : seek failed, errno = %d\n", err);
++                      return(err);
++              }
++              /* keep the result signed so that an error code is not
++               * mistaken for a byte count when updating *ppos
++               */
++              n = hppfs_read_file(hppfs->host_fd, buf, count);
++              if(n > 0)
++                      *ppos += n;
++      }
++      else n = read_proc(&hppfs->proc_file, buf, count, ppos, 1);
++
++      return(n);
++}
++
++/*
++ * write() for hppfs files: forward the request to the write operation of
++ * the underlying proc file, keeping the two file positions in step.
++ * Returns whatever the proc write operation returns.
++ */
++static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, 
++                         loff_t *ppos)
++{
++      struct hppfs_private *priv = file->private_data;
++      struct file *under = &priv->proc_file;
++      ssize_t (*do_write)(struct file *, const char *, size_t, loff_t *);
++      int ret;
++
++      do_write = under->f_dentry->d_inode->i_fop->write;
++
++      /* run the write at our position, then adopt the resulting position */
++      under->f_pos = file->f_pos;
++      ret = (*do_write)(under, buf, len, &under->f_pos);
++      file->f_pos = under->f_pos;
++
++      return(ret);
++}
++
++/*
++ * Connect to the host socket "<host_file>/rw" or, failing that,
++ * "<host_file>/r".  The suffix is written in place, so host_file must
++ * have room for three more characters plus the NUL.
++ *
++ * *filter_out is set to 1 when the "/rw" socket was tried last (and
++ * succeeded), 0 when the "/r" socket was tried.  Returns the result of
++ * os_connect_socket().
++ */
++static int open_host_sock(char *host_file, int *filter_out)
++{
++      char *tail = host_file + strlen(host_file);
++      int sock;
++
++      strcpy(tail, "/rw");
++      *filter_out = 1;
++      sock = os_connect_socket(host_file);
++      if(sock > 0)
++              return(sock);
++
++      strcpy(tail, "/r");
++      *filter_out = 0;
++      return(os_connect_socket(host_file));
++}
++
++/*
++ * Free a chunk list built by hppfs_get_data(): every chunk linked to
++ * "head", then "head" itself.  A NULL head is accepted and ignored.
++ */
++static void free_contents(struct hppfs_data *head)
++{
++      struct list_head *pos, *tmp;
++
++      if(head == NULL)
++              return;
++
++      /* the safe iterator is needed because each entry is freed in turn */
++      list_for_each_safe(pos, tmp, &head->list)
++              kfree(list_entry(pos, struct hppfs_data, list));
++
++      kfree(head);
++}
++
++/*
++ * Slurp everything the host socket "fd" delivers into a list of chunks.
++ *
++ * When "filter" is set, the contents of the proc file are first written
++ * to the socket and the socket is shut down with os_shutdown_socket()
++ * before reading the reply.  The total number of bytes read is
++ * accumulated in *size_out.
++ * hppfs_file is not used.
++ *
++ * Returns the head of the chunk list, or an ERR_PTR on failure (any
++ * chunks allocated so far are freed).
++ */
++static struct hppfs_data *hppfs_get_data(int fd, int filter, 
++                                       struct file *proc_file, 
++                                       struct file *hppfs_file, 
++                                       loff_t *size_out)
++{
++      struct hppfs_data *data, *new, *head;
++      int n, err;
++
++      err = -ENOMEM;
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if(data == NULL){
++              printk("hppfs_get_data : head allocation failed\n");
++              goto failed;
++      }
++
++      INIT_LIST_HEAD(&data->list);
++
++      head = data;
++      *size_out = 0;
++
++      if(filter){
++              /* the first chunk doubles as the bounce buffer here; the
++               * result of os_write_file() is not checked
++               */
++              while((n = read_proc(proc_file, data->contents,
++                                   sizeof(data->contents), NULL, 0)) > 0)
++                      os_write_file(fd, data->contents, n);
++              err = os_shutdown_socket(fd, 0, 1);
++              if(err){
++                      printk("hppfs_get_data : failed to shut down "
++                             "socket\n");
++                      goto failed_free;
++              }
++      }
++      while(1){
++              n = os_read_file(fd, data->contents, sizeof(data->contents));
++              if(n < 0){
++                      err = n;
++                      printk("hppfs_get_data : read failed, errno = %d\n",
++                             err);
++                      goto failed_free;
++              }
++              else if(n == 0)
++                      break;
++
++              *size_out += n;
++
++              /* a short read is taken to mean there is no more data */
++              if(n < sizeof(data->contents))
++                      break;
++
++              new = kmalloc(sizeof(*data), GFP_KERNEL);
++              if(new == 0){
++                      printk("hppfs_get_data : data allocation failed\n");
++                      err = -ENOMEM;
++                      goto failed_free;
++              }
++      
++              /* link the new chunk right after the current one */
++              INIT_LIST_HEAD(&new->list);
++              list_add(&new->list, &data->list);
++              data = new;
++      }
++      return(head);
++
++ failed_free:
++      free_contents(head);
++ failed:              
++      return(ERR_PTR(err));
++}
++
++/*
++ * Allocate a hppfs_private with host_fd and len set to -1 and contents
++ * set to NULL; members not named in the initializer (proc_file) are
++ * zeroed by the compound literal.  Returns NULL if the allocation fails.
++ */
++static struct hppfs_private *hppfs_data(void)
++{
++      struct hppfs_private *data;
++
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if(data == NULL)
++              return(data);
++
++      *data = ((struct hppfs_private ) { .host_fd             = -1,
++                                         .len                 = -1,
++                                         .contents            = NULL } );
++      return(data);
++}
++
++/*
++ * Translate an f_mode value made up of FMODE_READ / FMODE_WRITE into the
++ * matching O_RDONLY / O_WRONLY / O_RDWR open flag.  Any other value
++ * yields 0.
++ */
++static int file_mode(int fmode)
++{
++      switch(fmode){
++      case FMODE_READ | FMODE_WRITE:
++              return(O_RDWR);
++      case FMODE_READ:
++              return(O_RDONLY);
++      case FMODE_WRITE:
++              return(O_WRONLY);
++      default:
++              return(0);
++      }
++}
++
++/*
++ * open() for hppfs files.  Always opens a private file on the underlying
++ * proc dentry, then looks at the host-side path for this entry:
++ *   - a regular file is opened and its descriptor kept in host_fd, so
++ *     reads come from the host file;
++ *   - a directory is expected to contain a "rw" or "r" socket, whose
++ *     output is buffered in data->contents at open time.
++ * Host-side failures are logged but do not fail the open; reads then
++ * fall through to the proc file.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
++ * open_private_file().
++ */
++static int hppfs_open(struct inode *inode, struct file *file)
++{
++      struct hppfs_private *data;
++      struct dentry *proc_dentry;
++      char *host_file;
++      int err, fd, type, filter;
++
++      err = -ENOMEM;
++      data = hppfs_data();
++      if(data == NULL)
++              goto out;
++
++      /* leave room for the "/rw" suffix appended by open_host_sock() */
++      host_file = dentry_name(file->f_dentry, strlen("/rw"));
++      if(host_file == NULL)
++              goto out_free2;
++
++      proc_dentry = HPPFS_I(inode)->proc_dentry;
++
++      /* XXX This isn't closed anywhere */
++      err = open_private_file(&data->proc_file, proc_dentry, 
++                              file_mode(file->f_mode));
++      if(err)
++              goto out_free1;
++
++      type = os_file_type(host_file);
++      if(type == OS_TYPE_FILE){
++              fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++              if(fd >= 0) 
++                      data->host_fd = fd;
++              else printk("hppfs_open : failed to open '%s', errno = %d\n",
++                          host_file, -fd);
++
++              data->contents = NULL;
++      }
++      else if(type == OS_TYPE_DIR){
++              fd = open_host_sock(host_file, &filter);
++              if(fd > 0){
++                      data->contents = hppfs_get_data(fd, filter, 
++                                                      &data->proc_file, 
++                                                      file, &data->len);
++                      if(!IS_ERR(data->contents))
++                              data->host_fd = fd;
++                      else {
++                              /*
++                               * Do not leave an ERR_PTR behind: hppfs_read()
++                               * only tests contents against NULL and would
++                               * dereference it.  Drop the socket as well,
++                               * since nothing else holds on to it.
++                               */
++                              data->contents = NULL;
++                              os_close_file(fd);
++                      }
++              }
++              else printk("hppfs_open : failed to open a socket in "
++                          "'%s', errno = %d\n", host_file, -fd);
++      }
++      kfree(host_file);
++
++      file->private_data = data;
++      return(0);
++
++ out_free1:
++      kfree(host_file);
++ out_free2:
++      free_contents(data->contents);
++      kfree(data);
++ out:
++      return(err);
++}
++
++/*
++ * open() for hppfs directories: allocate the per-open state and open a
++ * private file on the underlying proc directory.
++ * Returns 0 on success, -ENOMEM if the allocation fails, or the error
++ * from open_private_file().
++ */
++static int hppfs_dir_open(struct inode *inode, struct file *file)
++{
++      struct hppfs_private *priv;
++      int ret;
++
++      priv = hppfs_data();
++      if(priv == NULL)
++              return(-ENOMEM);
++
++      ret = open_private_file(&priv->proc_file, 
++                              HPPFS_I(inode)->proc_dentry, 
++                              file_mode(file->f_mode));
++      if(ret){
++              kfree(priv);
++              return(ret);
++      }
++
++      file->private_data = priv;
++      return(0);
++}
++
++/*
++ * llseek() for hppfs files.  If the underlying proc file has an llseek
++ * operation it is called first and a negative result is returned as is;
++ * afterwards the hppfs file itself is repositioned with
++ * default_llseek(), whose result is returned.
++ */
++static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
++{
++      struct hppfs_private *data = file->private_data;
++      struct file *proc_file = &data->proc_file;
++      loff_t (*llseek)(struct file *, loff_t, int);
++      loff_t ret;
++
++      llseek = proc_file->f_dentry->d_inode->i_fop->llseek;
++      if(llseek != NULL){
++              ret = (*llseek)(proc_file, off, where);
++              if(ret < 0)
++                      return(ret);
++      }
++
++      return(default_llseek(file, off, where));
++}
++
++/* File operations installed by init_inode() on non-directory inodes. */
++static struct file_operations hppfs_file_fops = {
++      .owner          = NULL,
++      .llseek         = hppfs_llseek,
++      .read           = hppfs_read,
++      .write          = hppfs_write,
++      .open           = hppfs_open,
++};
++
++/*
++ * Context handed to hppfs_filldir() by hppfs_readdir(): the caller's
++ * original filldir callback and its argument, plus the dentry of the
++ * directory being read.
++ */
++struct hppfs_dirent {
++      void *vfs_dirent;
++      filldir_t filldir;
++      struct dentry *dentry;
++};
++
++/*
++ * filldir wrapper used by hppfs_readdir(): silently skips any entry for
++ * which file_removed() returns non-zero and forwards everything else to
++ * the caller's filldir callback.
++ * Note that a negative (error) return from file_removed() is non-zero
++ * too, so such an entry is skipped as well.
++ */
++static int hppfs_filldir(void *d, const char *name, int size, 
++                       loff_t offset, ino_t inode, unsigned int type)
++{
++      struct hppfs_dirent *dirent = d;
++
++      if(file_removed(dirent->dentry, name))
++              return(0);
++
++      return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset, 
++                                inode, type));
++}
++
++/*
++ * readdir() for hppfs directories: run the underlying proc directory's
++ * readdir with hppfs_filldir() interposed, keeping the positions of the
++ * hppfs file and the private proc file in step.
++ * Returns whatever the proc readdir returns.
++ */
++static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
++{
++      struct hppfs_private *data = file->private_data;
++      struct file *proc_file = &data->proc_file;
++      int (*readdir)(struct file *, void *, filldir_t);
++      struct hppfs_dirent dirent = ((struct hppfs_dirent)
++                                    { .vfs_dirent     = ent,
++                                      .filldir        = filldir,
++                                      .dentry         = file->f_dentry } );
++      int err;
++
++      readdir = proc_file->f_dentry->d_inode->i_fop->readdir;
++
++      proc_file->f_pos = file->f_pos;
++      err = (*readdir)(proc_file, &dirent, hppfs_filldir);
++      file->f_pos = proc_file->f_pos;
++
++      return(err);
++}
++
++/* fsync() for hppfs directories: does nothing and reports success. */
++static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++      return(0);
++}
++
++/* File operations installed by init_inode() on directory inodes. */
++static struct file_operations hppfs_dir_fops = {
++      .owner          = NULL,
++      .readdir        = hppfs_readdir,
++      .open           = hppfs_dir_open,
++      .fsync          = hppfs_fsync,
++};
++
++/*
++ * statfs() for hppfs: reports zero blocks and zero files and identifies
++ * the filesystem by HPPFS_SUPER_MAGIC.  Other fields of *sf are left
++ * untouched.  Always returns 0.
++ */
++static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
++{
++      sf->f_blocks = 0;
++      sf->f_bfree = 0;
++      sf->f_bavail = 0;
++      sf->f_files = 0;
++      sf->f_ffree = 0;
++      sf->f_type = HPPFS_SUPER_MAGIC;
++      return(0);
++}
++
++/*
++ * alloc_inode super operation: allocate a hppfs_inode_info with no proc
++ * dentry attached, run inode_init_once() on the embedded VFS inode and
++ * return that inode.  Returns NULL if the allocation fails.
++ */
++static struct inode *hppfs_alloc_inode(struct super_block *sb)
++{
++      struct hppfs_inode_info *hi;
++
++      hi = kmalloc(sizeof(*hi), GFP_KERNEL);
++      if(hi == NULL) 
++              return(NULL);
++
++      *hi = ((struct hppfs_inode_info) { .proc_dentry = NULL });
++      inode_init_once(&hi->vfs_inode);
++      return(&hi->vfs_inode);
++}
++
++/* delete_inode super operation: only calls clear_inode() on the inode. */
++void hppfs_delete_inode(struct inode *ino)
++{
++      clear_inode(ino);
++}
++
++/* destroy_inode super operation: free the hppfs_inode_info embedding "inode". */
++static void hppfs_destroy_inode(struct inode *inode)
++{
++      kfree(HPPFS_I(inode));
++}
++
++/*
++ * Super operations for hppfs.  is_pid() also compares a superblock's
++ * s_op against this table to recognise hppfs superblocks.
++ */
++static struct super_operations hppfs_sbops = { 
++      .alloc_inode    = hppfs_alloc_inode,
++      .destroy_inode  = hppfs_destroy_inode,
++      .read_inode     = hppfs_read_inode,
++      .delete_inode   = hppfs_delete_inode,
++      .statfs         = hppfs_statfs,
++};
++
++/*
++ * readlink() for hppfs symlinks: delegate to the readlink operation of
++ * the underlying proc inode.  A private file is opened on the proc
++ * dentry for the duration of the call and closed again afterwards.
++ * Returns the error from open_private_file(), or otherwise whatever the
++ * proc readlink returns.
++ */
++static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen)
++{
++      struct file proc_file;
++      struct dentry *proc_dentry;
++      int (*readlink)(struct dentry *, char *, int);
++      int err, n;
++
++      proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
++      err = open_private_file(&proc_file, proc_dentry, O_RDONLY);
++      if(err) 
++              return(err);
++
++      readlink = proc_dentry->d_inode->i_op->readlink;
++      n = (*readlink)(proc_dentry, buffer, buflen);
++
++      close_private_file(&proc_file);
++      
++      return(n);
++}
++
++/*
++ * follow_link() for hppfs symlinks: delegate to the follow_link
++ * operation of the underlying proc inode.  A private file is opened on
++ * the proc dentry for the duration of the call and closed again
++ * afterwards.
++ * Returns the error from open_private_file(), or otherwise whatever the
++ * proc follow_link returns.
++ */
++static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
++{
++      struct file proc_file;
++      struct dentry *proc_dentry;
++      int (*follow_link)(struct dentry *, struct nameidata *);
++      int err, n;
++
++      proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
++      err = open_private_file(&proc_file, proc_dentry, O_RDONLY);
++      if(err) 
++              return(err);
++
++      follow_link = proc_dentry->d_inode->i_op->follow_link;
++      n = (*follow_link)(proc_dentry, nd);
++
++      close_private_file(&proc_file);
++      
++      return(n);
++}
++
++/* Inode operations installed by init_inode() on directory inodes. */
++static struct inode_operations hppfs_dir_iops = {
++      .lookup         = hppfs_lookup,
++};
++
++/* Inode operations installed by init_inode() on symlink inodes. */
++static struct inode_operations hppfs_link_iops = {
++      .readlink       = hppfs_readlink,
++      .follow_link    = hppfs_follow_link,
++};
++
++/*
++ * Attach the proc dentry "dentry" to the hppfs inode "inode" and choose
++ * the inode and file operation tables from the type of the proc inode
++ * (directory, symlink, or anything else).  Always returns 0.
++ */
++static int init_inode(struct inode *inode, struct dentry *dentry)
++{
++      struct inode *proc_ino = dentry->d_inode;
++
++      if(S_ISDIR(proc_ino->i_mode)){
++              inode->i_op = &hppfs_dir_iops;
++              inode->i_fop = &hppfs_dir_fops;
++      }
++      else {
++              /* symlinks and plain files share the same file operations */
++              if(S_ISLNK(proc_ino->i_mode))
++                      inode->i_op = &hppfs_link_iops;
++              else inode->i_op = &hppfs_file_iops;
++              inode->i_fop = &hppfs_file_fops;
++      }
++
++      HPPFS_I(inode)->proc_dentry = dentry;
++
++      return(0);
++}
++
++/*
++ * Fill in a new hppfs superblock.  The root inode mirrors the root
++ * dentry of the first superblock on the "proc" filesystem type's
++ * fs_supers list, so an instance of proc must already exist.
++ *
++ * Returns 0 on success; -ENOENT if the proc filesystem type or an
++ * instance of it cannot be found (also returned if iget() fails, since
++ * err is not changed before that call); -ENOMEM if the root dentry
++ * cannot be allocated.
++ */
++static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
++{
++      struct inode *root_inode;
++      struct file_system_type *procfs;
++      struct super_block *proc_sb;
++      int err;
++
++      err = -ENOENT;
++      /* NOTE(review): the result of get_fs_type() is never released in
++       * this function - confirm whether a matching put is required.
++       */
++      procfs = get_fs_type("proc");
++      if(procfs == NULL) 
++              goto out;
++
++      if(list_empty(&procfs->fs_supers))
++              goto out;
++
++      proc_sb = list_entry(procfs->fs_supers.next, struct super_block,
++                           s_instances);
++      
++      sb->s_blocksize = 1024;
++      sb->s_blocksize_bits = 10;
++      sb->s_magic = HPPFS_SUPER_MAGIC;
++      sb->s_op = &hppfs_sbops;
++
++      root_inode = iget(sb, 0);
++      if(root_inode == NULL)
++              goto out;
++
++      err = init_inode(root_inode, proc_sb->s_root);
++      if(err)
++              goto out_put;
++
++      err = -ENOMEM;
++      sb->s_root = d_alloc_root(root_inode);
++      if(sb->s_root == NULL)
++              goto out_put;
++
++      /* copy the attributes of the proc root into our root inode */
++      hppfs_read_inode(root_inode);
++
++      return(0);
++
++ out_put:
++      iput(root_inode);
++ out:
++      return(err);
++}
++
++/*
++ * get_sb callback: hands off to get_sb_nodev() with hppfs_fill_super()
++ * as the fill routine.  dev_name is not used.
++ */
++static struct super_block *hppfs_read_super(struct file_system_type *type,
++                                           int flags, const char *dev_name,
++                                           void *data)
++{
++      return(get_sb_nodev(type, flags, data, hppfs_fill_super));
++}
++
++/* Filesystem type registered under the name "hppfs" by init_hppfs(). */
++static struct file_system_type hppfs_type = {
++      .owner          = THIS_MODULE,
++      .name           = "hppfs",
++      .get_sb         = hppfs_read_super,
++      .kill_sb        = kill_anon_super,
++      .fs_flags       = 0,
++};
++
++/* Module init: register the hppfs filesystem type and return the result. */
++static int __init init_hppfs(void)
++{
++      return(register_filesystem(&hppfs_type));
++}
++
++/* Module exit: unregister the hppfs filesystem type. */
++static void __exit exit_hppfs(void)
++{
++      unregister_filesystem(&hppfs_type);
++}
++
++module_init(init_hppfs)
++module_exit(exit_hppfs)
++MODULE_LICENSE("GPL");
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur a/include/asm-um/archparam-i386.h b/include/asm-um/archparam-i386.h
+--- a/include/asm-um/archparam-i386.h  Fri Aug 15 15:07:52 2003
++++ b/include/asm-um/archparam-i386.h  Fri Aug 15 15:13:17 2003
+@@ -56,6 +56,65 @@
+       pr_reg[16] = PT_REGS_SS(regs);          \
+ } while(0);
++#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
++#define VSYSCALL_ENTRY        ((unsigned long) &__kernel_vsyscall)
++extern void *__kernel_vsyscall;
++
++/*
++ * Architecture-neutral AT_ values in 0-17, leave some room
++ * for more of them, start the x86-specific ones at 32.
++ */
++#define AT_SYSINFO            32
++#define AT_SYSINFO_EHDR               33
++
++#define ARCH_DLINFO                                           \
++do {                                                          \
++              NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
++              NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE);    \
++} while (0)
++
++/*
++ * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out
++ * extra segments containing the vsyscall DSO contents.  Dumping its
++ * contents makes post-mortem fully interpretable later without matching up
++ * the same kernel and hardware config to see what PC values meant.
++ * Dumping its extra ELF program headers includes all the other information
++ * a debugger needs to easily find how the vsyscall DSO was being used.
++ */
++#define ELF_CORE_EXTRA_PHDRS          (VSYSCALL_EHDR->e_phnum)
++#define ELF_CORE_WRITE_EXTRA_PHDRS                                          \
++do {                                                                        \
++      const struct elf_phdr *const vsyscall_phdrs =                         \
++              (const struct elf_phdr *) (VSYSCALL_BASE                      \
++                                         + VSYSCALL_EHDR->e_phoff);         \
++      int i;                                                                \
++      Elf32_Off ofs = 0;                                                    \
++      for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {                        \
++              struct elf_phdr phdr = vsyscall_phdrs[i];                     \
++              if (phdr.p_type == PT_LOAD) {                                 \
++                      ofs = phdr.p_offset = offset;                         \
++                      offset += phdr.p_filesz;                              \
++              }                                                             \
++              else                                                          \
++                      phdr.p_offset += ofs;                                 \
++              phdr.p_paddr = 0; /* match other core phdrs */                \
++              DUMP_WRITE(&phdr, sizeof(phdr));                              \
++      }                                                                     \
++} while (0)
++#define ELF_CORE_WRITE_EXTRA_DATA                                           \
++do {                                                                        \
++      const struct elf_phdr *const vsyscall_phdrs =                         \
++              (const struct elf_phdr *) (VSYSCALL_BASE                      \
++                                         + VSYSCALL_EHDR->e_phoff);         \
++      int i;                                                                \
++      for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {                        \
++              if (vsyscall_phdrs[i].p_type == PT_LOAD)                      \
++                      DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr,        \
++                                 vsyscall_phdrs[i].p_filesz);               \
++      }                                                                     \
++} while (0)
++
+ /********* Bits for asm-um/delay.h **********/
+ typedef unsigned long um_udelay_t;
+diff -Naur a/include/asm-um/common.lds.S b/include/asm-um/common.lds.S
+--- a/include/asm-um/common.lds.S      Fri Aug 15 15:04:49 2003
++++ b/include/asm-um/common.lds.S      Fri Aug 15 15:10:46 2003
+@@ -1,3 +1,5 @@
++#include <asm-generic/vmlinux.lds.h>
++
+   .fini      : { *(.fini)    } =0x9090
+   _etext = .;
+   PROVIDE (etext = .);
+@@ -67,6 +69,10 @@
+   }
+   __initcall_end = .;
++  __con_initcall_start = .;
++  .con_initcall.init : { *(.con_initcall.init) }
++  __con_initcall_end = .;
++
+   __uml_initcall_start = .;
+   .uml.initcall.init : { *(.uml.initcall.init) }
+   __uml_initcall_end = .;
+@@ -80,7 +86,33 @@
+   .uml.exitcall : { *(.uml.exitcall.exit) }
+   __uml_exitcall_end = .;
+-  . = ALIGN(4096);
++  . = ALIGN(4);
++  __alt_instructions = .;
++  .altinstructions : { *(.altinstructions) } 
++  __alt_instructions_end = .; 
++  .altinstr_replacement : { *(.altinstr_replacement) } 
++  /* .exit.text is discard at runtime, not link time, to deal with references
++     from .altinstructions and .eh_frame */
++  .exit.text : { *(.exit.text) }
++  .exit.data : { *(.exit.data) }
++ 
++  __preinit_array_start = .;
++  .preinit_array : { *(.preinit_array) }
++  __preinit_array_end = .;
++  __init_array_start = .;
++  .init_array : { *(.init_array) }
++  __init_array_end = .;
++  __fini_array_start = .;
++  .fini_array : { *(.fini_array) }
++  __fini_array_end = .;
++
++   . = ALIGN(4096);
+   __initramfs_start = .;
+   .init.ramfs : { *(.init.ramfs) }
+   __initramfs_end = .;
++
++  /* Sections to be discarded */
++  /DISCARD/ : {
++      *(.exitcall.exit)
++  }
++ 
+diff -Naur a/include/asm-um/cpufeature.h b/include/asm-um/cpufeature.h
+--- a/include/asm-um/cpufeature.h      Wed Dec 31 19:00:00 1969
++++ b/include/asm-um/cpufeature.h      Fri Aug 15 15:10:07 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_CPUFEATURE_H
++#define __UM_CPUFEATURE_H
++
++#include "asm/arch/cpufeature.h"
++
++#endif
+diff -Naur a/include/asm-um/current.h b/include/asm-um/current.h
+--- a/include/asm-um/current.h Fri Aug 15 15:04:11 2003
++++ b/include/asm-um/current.h Fri Aug 15 15:10:19 2003
+@@ -16,8 +16,10 @@
+ #define CURRENT_THREAD(dummy) (((unsigned long) &dummy) & \
+                               (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER))
+-#define current ({ int dummy; \
+-                   ((struct thread_info *) CURRENT_THREAD(dummy))->task; })
++#define current_thread \
++      ({ int dummy; ((struct thread_info *) CURRENT_THREAD(dummy)); })
++
++#define current (current_thread->task)
+ #endif /* __ASSEMBLY__ */
+diff -Naur a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h
+--- a/include/asm-um/fixmap.h  Fri Aug 15 15:08:40 2003
++++ b/include/asm-um/fixmap.h  Fri Aug 15 15:13:36 2003
+@@ -34,6 +34,7 @@
+       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ #endif
++      FIX_VSYSCALL,
+       __end_of_fixed_addresses
+ };
+@@ -63,6 +64,13 @@
+ #define __fix_to_virt(x)      (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+ #define __virt_to_fix(x)      ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
++/*
++ * This is the range that is readable by user mode, and things
++ * acting like user mode such as get_user_pages.
++ */
++#define FIXADDR_USER_START    (__fix_to_virt(FIX_VSYSCALL))
++#define FIXADDR_USER_END      (FIXADDR_USER_START + PAGE_SIZE)
++
+ extern void __this_fixmap_does_not_exist(void);
+ /*
+diff -Naur a/include/asm-um/irq.h b/include/asm-um/irq.h
+--- a/include/asm-um/irq.h     Fri Aug 15 15:09:15 2003
++++ b/include/asm-um/irq.h     Fri Aug 15 15:13:51 2003
+@@ -1,15 +1,6 @@
+ #ifndef __UM_IRQ_H
+ #define __UM_IRQ_H
+-/* The i386 irq.h has a struct task_struct in a prototype without including
+- * sched.h.  This forward declaration kills the resulting warning.
+- */
+-struct task_struct;
+-
+-#include "asm/ptrace.h"
+-
+-#undef NR_IRQS
+-
+ #define TIMER_IRQ             0
+ #define UMN_IRQ                       1
+ #define CONSOLE_IRQ           2
+@@ -28,8 +19,4 @@
+ #define LAST_IRQ XTERM_IRQ
+ #define NR_IRQS (LAST_IRQ + 1)
+-extern int um_request_irq(unsigned int irq, int fd, int type,
+-                        void (*handler)(int, void *, struct pt_regs *),
+-                        unsigned long irqflags,  const char * devname,
+-                        void *dev_id);
+ #endif
+diff -Naur a/include/asm-um/local.h b/include/asm-um/local.h
+--- a/include/asm-um/local.h   Wed Dec 31 19:00:00 1969
++++ b/include/asm-um/local.h   Fri Aug 15 15:12:46 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_LOCAL_H
++#define __UM_LOCAL_H
++
++#include "asm/arch/local.h"
++
++#endif
+diff -Naur a/include/asm-um/module-generic.h b/include/asm-um/module-generic.h
+--- a/include/asm-um/module-generic.h  Wed Dec 31 19:00:00 1969
++++ b/include/asm-um/module-generic.h  Fri Aug 15 15:12:38 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_MODULE_GENERIC_H
++#define __UM_MODULE_GENERIC_H
++
++#include "asm/arch/module.h"
++
++#endif
+diff -Naur a/include/asm-um/module-i386.h b/include/asm-um/module-i386.h
+--- a/include/asm-um/module-i386.h     Wed Dec 31 19:00:00 1969
++++ b/include/asm-um/module-i386.h     Fri Aug 15 15:12:37 2003
+@@ -0,0 +1,13 @@
++#ifndef __UM_MODULE_I386_H
++#define __UM_MODULE_I386_H
++
++/* UML is simple */
++struct mod_arch_specific
++{
++};
++
++#define Elf_Shdr Elf32_Shdr
++#define Elf_Sym Elf32_Sym
++#define Elf_Ehdr Elf32_Ehdr
++
++#endif
+diff -Naur a/include/asm-um/page.h b/include/asm-um/page.h
+--- a/include/asm-um/page.h    Fri Aug 15 15:06:42 2003
++++ b/include/asm-um/page.h    Fri Aug 15 15:12:40 2003
+@@ -4,7 +4,6 @@
+ struct page;
+ #include "asm/arch/page.h"
+-#include "asm/bug.h"
+ #undef __pa
+ #undef __va
+diff -Naur a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
+--- a/include/asm-um/pgtable.h Fri Aug 15 15:09:25 2003
++++ b/include/asm-um/pgtable.h Fri Aug 15 15:14:09 2003
+@@ -79,12 +79,13 @@
+ #define _PAGE_PRESENT 0x001
+ #define _PAGE_NEWPAGE 0x002
+-#define _PAGE_PROTNONE        0x004   /* If not present */
+-#define _PAGE_RW      0x008
+-#define _PAGE_USER    0x010
+-#define _PAGE_ACCESSED        0x020
+-#define _PAGE_DIRTY   0x040
+-#define _PAGE_NEWPROT   0x080
++#define _PAGE_NEWPROT   0x004
++#define _PAGE_FILE    0x008   /* set:pagecache unset:swap */
++#define _PAGE_PROTNONE        0x010   /* If not present */
++#define _PAGE_RW      0x020
++#define _PAGE_USER    0x040
++#define _PAGE_ACCESSED        0x080
++#define _PAGE_DIRTY   0x100
+ #define REGION_MASK   0xf0000000
+ #define REGION_SHIFT  28
+@@ -203,6 +204,16 @@
+ #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot))
+ #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot))
++/*
++ * Bits 0 through 3 are taken
++ */
++#define PTE_FILE_MAX_BITS     28
++
++#define pte_to_pgoff(pte) ((pte).pte_low >> 4)
++
++#define pgoff_to_pte(off) \
++      ((pte_t) { ((off) << 4) + _PAGE_FILE })
++
+ static inline pte_t pte_mknewprot(pte_t pte)
+ {
+       pte_val(pte) |= _PAGE_NEWPROT;
+@@ -236,6 +247,12 @@
+  * The following only work if pte_present() is true.
+  * Undefined behaviour if not..
+  */
++static inline int pte_user(pte_t pte)
++{ 
++      return((pte_val(pte) & _PAGE_USER) && 
++             !(pte_val(pte) & _PAGE_PROTNONE));
++}
++
+ static inline int pte_read(pte_t pte)
+ { 
+       return((pte_val(pte) & _PAGE_USER) && 
+@@ -253,6 +270,14 @@
+              !(pte_val(pte) & _PAGE_PROTNONE));
+ }
++/*
++ * The following only works if pte_present() is not true.
++ */
++static inline int pte_file(pte_t pte)
++{ 
++      return (pte).pte_low & _PAGE_FILE; 
++}
++
+ static inline int pte_dirty(pte_t pte)        { return pte_val(pte) & _PAGE_DIRTY; }
+ static inline int pte_young(pte_t pte)        { return pte_val(pte) & _PAGE_ACCESSED; }
+ static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; }
+@@ -355,14 +380,26 @@
+ #define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \
+                      ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT)))
+-/* to find an entry in a page-table-directory. */
++/*
++ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
++ *
++ * this macro returns the index of the entry in the pgd page which would
++ * control the given virtual address
++ */
+ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
+-/* to find an entry in a page-table-directory */
++/*
++ * pgd_offset() returns a (pgd_t *)
++ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
++ */
+ #define pgd_offset(mm, address) \
+ ((mm)->pgd + ((address) >> PGDIR_SHIFT))
+-/* to find an entry in a kernel page-table-directory */
++
++/*
++ * a shortcut which implies the use of the kernel's pgd, instead
++ * of a process's
++ */
+ #define pgd_offset_k(address) pgd_offset(&init_mm, address)
+ #define pmd_index(address) \
+@@ -374,7 +411,12 @@
+       return (pmd_t *) dir;
+ }
+-/* Find an entry in the third-level page table.. */ 
++/*
++ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
++ *
++ * this macro returns the index of the entry in the pte page which would
++ * control the given virtual address
++ */
+ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+ #define pte_offset_kernel(dir, address) \
+       ((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
+@@ -400,11 +442,11 @@
+ #define update_mmu_cache(vma,address,pte) do ; while (0)
+ /* Encode and de-code a swap entry */
+-#define __swp_type(x)                 (((x).val >> 3) & 0x7f)
+-#define __swp_offset(x)                       ((x).val >> 10)
++#define __swp_type(x)                 (((x).val >> 4) & 0x3f)
++#define __swp_offset(x)                       ((x).val >> 11)
+ #define __swp_entry(type, offset) \
+-      ((swp_entry_t) { ((type) << 3) | ((offset) << 10) })
++      ((swp_entry_t) { ((type) << 4) | ((offset) << 11) })
+ #define __pte_to_swp_entry(pte) \
+       ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
+ #define __swp_entry_to_pte(x)         ((pte_t) { (x).val })
+diff -Naur a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
+--- a/include/asm-um/processor-generic.h       Fri Aug 15 15:04:48 2003
++++ b/include/asm-um/processor-generic.h       Fri Aug 15 15:10:42 2003
+@@ -11,9 +11,7 @@
+ struct task_struct;
+ #include "linux/config.h"
+-#include "linux/signal.h"
+ #include "asm/ptrace.h"
+-#include "asm/siginfo.h"
+ #include "choose-mode.h"
+ struct mm_struct;
+@@ -101,14 +99,19 @@
+ } mm_segment_t;
+ extern struct task_struct *alloc_task_struct(void);
+-extern void free_task_struct(struct task_struct *task);
+ extern void release_thread(struct task_struct *);
+ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
+ extern void dump_thread(struct pt_regs *regs, struct user *u);
++extern void prepare_to_copy(struct task_struct *tsk);
+ extern unsigned long thread_saved_pc(struct task_struct *t);
++static inline void mm_copy_segments(struct mm_struct *from_mm, 
++                                  struct mm_struct *new_mm)
++{
++}
++
+ #define init_stack    (init_thread_union.stack)
+ /*
+diff -Naur a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h
+--- a/include/asm-um/processor-i386.h  Fri Aug 15 15:04:00 2003
++++ b/include/asm-um/processor-i386.h  Fri Aug 15 15:10:18 2003
+@@ -6,8 +6,8 @@
+ #ifndef __UM_PROCESSOR_I386_H
+ #define __UM_PROCESSOR_I386_H
+-extern int cpu_has_xmm;
+-extern int cpu_has_cmov;
++extern int host_has_xmm;
++extern int host_has_cmov;
+ struct arch_thread {
+       unsigned long debugregs[8];
+diff -Naur a/include/asm-um/sections.h b/include/asm-um/sections.h
+--- a/include/asm-um/sections.h        Wed Dec 31 19:00:00 1969
++++ b/include/asm-um/sections.h        Fri Aug 15 15:12:54 2003
+@@ -0,0 +1,7 @@
++#ifndef _UM_SECTIONS_H
++#define _UM_SECTIONS_H
++
++/* nothing to see, move along */
++#include <asm-generic/sections.h>
++
++#endif
+diff -Naur a/include/asm-um/smp.h b/include/asm-um/smp.h
+--- a/include/asm-um/smp.h     Fri Aug 15 15:03:35 2003
++++ b/include/asm-um/smp.h     Fri Aug 15 15:10:04 2003
+@@ -7,9 +7,10 @@
+ #include "linux/config.h"
+ #include "linux/bitops.h"
++#include "linux/threads.h"
+ #include "asm/current.h"
+-#define smp_processor_id() (current->thread_info->cpu)
++#define smp_processor_id() (current_thread->cpu)
+ #define cpu_logical_map(n) (n)
+ #define cpu_number_map(n) (n)
+ #define PROC_CHANGE_PENALTY   15 /* Pick a number, any number */
+@@ -30,6 +31,13 @@
+ {
+ }
++extern inline int any_online_cpu(unsigned int mask)
++{
++        if (mask & cpu_online_map)
++                return __ffs(mask & cpu_online_map);
++
++        return -1;
++}
+ #endif
+ #endif
+diff -Naur a/include/asm-um/system-generic.h b/include/asm-um/system-generic.h
+--- a/include/asm-um/system-generic.h  Fri Aug 15 15:09:22 2003
++++ b/include/asm-um/system-generic.h  Fri Aug 15 15:14:01 2003
+@@ -23,8 +23,10 @@
+ extern void block_signals(void);
+ extern void unblock_signals(void);
+-#define local_save_flags(flags) do { (flags) = get_signals(); } while(0)
+-#define local_irq_restore(flags) do { set_signals(flags); } while(0)
++#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
++                                   (flags) = get_signals(); } while(0)
++#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
++                                    set_signals(flags); } while(0)
+ #define local_irq_save(flags) do { local_save_flags(flags); \
+                                    local_irq_disable(); } while(0)
+diff -Naur a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
+--- a/include/asm-um/thread_info.h     Fri Aug 15 15:05:00 2003
++++ b/include/asm-um/thread_info.h     Fri Aug 15 15:11:11 2003
+@@ -9,6 +9,7 @@
+ #ifndef __ASSEMBLY__
+ #include <asm/processor.h>
++#include <asm/types.h>
+ struct thread_info {
+       struct task_struct      *task;          /* main task structure */
+@@ -43,15 +44,18 @@
+ static inline struct thread_info *current_thread_info(void)
+ {
+       struct thread_info *ti;
+-      __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~16383UL));
++      unsigned long mask = PAGE_SIZE * 
++              (1 << CONFIG_KERNEL_STACK_ORDER) - 1;
++      __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~mask));
+       return ti;
+ }
+ /* thread information allocation */
+-#define THREAD_SIZE (4*PAGE_SIZE)
+-#define alloc_thread_info(tsk) ((struct thread_info *) \
+-      __get_free_pages(GFP_KERNEL,2))
+-#define free_thread_info(ti) free_pages((unsigned long) (ti), 2)
++#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE)
++#define alloc_thread_info(tsk) \
++      ((struct thread_info *) kmalloc(THREAD_SIZE, GFP_KERNEL))
++#define free_thread_info(ti) kfree(ti)
++      
+ #define get_thread_info(ti) get_task_struct((ti)->task)
+ #define put_thread_info(ti) put_task_struct((ti)->task)
+@@ -65,11 +69,13 @@
+ #define TIF_POLLING_NRFLAG      3       /* true if poll_idle() is polling 
+                                        * TIF_NEED_RESCHED 
+                                        */
++#define TIF_RESTART_BLOCK     4
+ #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
+ #define _TIF_SIGPENDING               (1 << TIF_SIGPENDING)
+ #define _TIF_NEED_RESCHED     (1 << TIF_NEED_RESCHED)
+ #define _TIF_POLLING_NRFLAG     (1 << TIF_POLLING_NRFLAG)
++#define _TIF_RESTART_BLOCK    (1 << TIF_RESTART_BLOCK)
+ #endif
+diff -Naur a/include/asm-um/timex.h b/include/asm-um/timex.h
+--- a/include/asm-um/timex.h   Fri Aug 15 15:07:22 2003
++++ b/include/asm-um/timex.h   Fri Aug 15 15:12:48 2003
+@@ -1,8 +1,6 @@
+ #ifndef __UM_TIMEX_H
+ #define __UM_TIMEX_H
+-#include "linux/time.h"
+-
+ typedef unsigned long cycles_t;
+ #define cacheflush_time (0)
+diff -Naur a/include/linux/mm.h b/include/linux/mm.h
+--- a/include/linux/mm.h       Fri Aug 15 15:03:56 2003
++++ b/include/linux/mm.h       Fri Aug 15 15:10:14 2003
+@@ -483,6 +483,9 @@
+       return __set_page_dirty_buffers(page);
+ }
++extern long do_mprotect(struct mm_struct *mm, unsigned long start, 
++                      size_t len, unsigned long prot);
++
+ /*
+  * On a two-level page table, this ends up being trivial. Thus the
+  * inlining and the symmetry break with pte_alloc_map() that does all
+@@ -513,9 +516,10 @@
+ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+-      unsigned long len, unsigned long prot,
+-      unsigned long flag, unsigned long pgoff);
++extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file, 
++                                 unsigned long addr, unsigned long len,
++                                 unsigned long prot, unsigned long flag,
++                                 unsigned long pgoff);
+ static inline unsigned long do_mmap(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+@@ -525,7 +529,8 @@
+       if ((offset + PAGE_ALIGN(len)) < offset)
+               goto out;
+       if (!(offset & ~PAGE_MASK))
+-              ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
++              ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag, 
++                                  offset >> PAGE_SHIFT);
+ out:
+       return ret;
+ }
+diff -Naur a/include/linux/proc_mm.h b/include/linux/proc_mm.h
+--- a/include/linux/proc_mm.h  Wed Dec 31 19:00:00 1969
++++ b/include/linux/proc_mm.h  Fri Aug 15 15:10:02 2003
+@@ -0,0 +1,48 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PROC_MM_H
++#define __PROC_MM_H
++
++#include "linux/sched.h"
++
++#define MM_MMAP 54
++#define MM_MUNMAP 55
++#define MM_MPROTECT 56
++#define MM_COPY_SEGMENTS 57
++
++struct mm_mmap {
++      unsigned long addr;
++      unsigned long len;
++      unsigned long prot;
++      unsigned long flags;
++      unsigned long fd;
++      unsigned long offset;
++};
++
++struct mm_munmap {
++      unsigned long addr;
++      unsigned long len;      
++};
++
++struct mm_mprotect {
++      unsigned long addr;
++      unsigned long len;
++        unsigned int prot;
++};
++
++struct proc_mm_op {
++      int op;
++      union {
++              struct mm_mmap mmap;
++              struct mm_munmap munmap;
++              struct mm_mprotect mprotect;
++              int copy_segments;
++      } u;
++};
++
++extern struct mm_struct *proc_mm_get_mm(int fd);
++
++#endif
+diff -Naur a/mm/Makefile b/mm/Makefile
+--- a/mm/Makefile      Fri Aug 15 15:07:22 2003
++++ b/mm/Makefile      Fri Aug 15 15:12:48 2003
+@@ -12,3 +12,5 @@
+                          slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
+ obj-$(CONFIG_SWAP)    += page_io.o swap_state.o swapfile.o
++obj-$(CONFIG_PROC_MM) += proc_mm.o
++
+diff -Naur a/mm/memory.c b/mm/memory.c
+--- a/mm/memory.c      Fri Aug 15 15:05:37 2003
++++ b/mm/memory.c      Fri Aug 15 15:11:48 2003
+@@ -45,6 +45,7 @@
+ #include <linux/pagemap.h>
+ #include <linux/vcache.h>
+ #include <linux/rmap-locking.h>
++#include <linux/init.h>
+ #include <asm/pgalloc.h>
+ #include <asm/rmap.h>
+@@ -669,6 +670,24 @@
+ }
++static struct vm_area_struct fixmap_vma = {
++      /* Catch users - if there are any valid
++         ones, we can make this be "&init_mm" or
++         something.  */
++      .vm_mm = NULL,
++      .vm_page_prot = PAGE_READONLY,
++      .vm_flags = VM_READ | VM_EXEC,
++};
++
++static int init_fixmap_vma(void)
++{
++      fixmap_vma.vm_start = FIXADDR_START;
++      fixmap_vma.vm_end = FIXADDR_TOP;
++      return(0);
++}
++
++__initcall(init_fixmap_vma);
++
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, int len, int write, int force,
+               struct page **pages, struct vm_area_struct **vmas)
+@@ -689,19 +708,8 @@
+               vma = find_extend_vma(mm, start);
+-#ifdef FIXADDR_USER_START
+-              if (!vma &&
+-                  start >= FIXADDR_USER_START && start < FIXADDR_USER_END) {
+-                      static struct vm_area_struct fixmap_vma = {
+-                              /* Catch users - if there are any valid
+-                                 ones, we can make this be "&init_mm" or
+-                                 something.  */
+-                              .vm_mm = NULL,
+-                              .vm_start = FIXADDR_USER_START,
+-                              .vm_end = FIXADDR_USER_END,
+-                              .vm_page_prot = PAGE_READONLY,
+-                              .vm_flags = VM_READ | VM_EXEC,
+-                      };
++#ifdef FIXADDR_START
++              if (!vma && start >= FIXADDR_START && start < FIXADDR_TOP) {
+                       unsigned long pg = start & PAGE_MASK;
+                       pgd_t *pgd;
+                       pmd_t *pmd;
+diff -Naur a/mm/mmap.c b/mm/mmap.c
+--- a/mm/mmap.c        Fri Aug 15 15:07:18 2003
++++ b/mm/mmap.c        Fri Aug 15 15:12:45 2003
+@@ -457,11 +457,11 @@
+  * The caller must hold down_write(current->mm->mmap_sem).
+  */
+-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+-                      unsigned long len, unsigned long prot,
+-                      unsigned long flags, unsigned long pgoff)
++unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file, 
++                          unsigned long addr, unsigned long len,
++                          unsigned long prot, unsigned long flags,
++                          unsigned long pgoff)
+ {
+-      struct mm_struct * mm = current->mm;
+       struct vm_area_struct * vma, * prev;
+       struct inode *inode;
+       unsigned int vm_flags;
+diff -Naur a/mm/mprotect.c b/mm/mprotect.c
+--- a/mm/mprotect.c    Fri Aug 15 15:05:20 2003
++++ b/mm/mprotect.c    Fri Aug 15 15:11:21 2003
+@@ -222,7 +222,8 @@
+ }
+ asmlinkage long
+-sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, 
++           unsigned long prot)
+ {
+       unsigned long nstart, end, tmp;
+       struct vm_area_struct * vma, * next, * prev;
+@@ -239,9 +240,9 @@
+       if (end == start)
+               return 0;
+-      down_write(&current->mm->mmap_sem);
++      down_write(&mm->mmap_sem);
+-      vma = find_vma_prev(current->mm, start, &prev);
++      vma = find_vma_prev(mm, start, &prev);
+       error = -ENOMEM;
+       if (!vma || vma->vm_start > start)
+               goto out;
+@@ -301,6 +302,11 @@
+               prev->vm_mm->map_count--;
+       }
+ out:
+-      up_write(&current->mm->mmap_sem);
++      up_write(&mm->mmap_sem);
+       return error;
+ }
++
++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++{
++        return(do_mprotect(current->mm, start, len, prot));
++}
+diff -Naur a/mm/proc_mm.c b/mm/proc_mm.c
+--- a/mm/proc_mm.c     Wed Dec 31 19:00:00 1969
++++ b/mm/proc_mm.c     Fri Aug 15 15:11:44 2003
+@@ -0,0 +1,174 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/mm.h"
++#include "linux/init.h"
++#include "linux/proc_fs.h"
++#include "linux/proc_mm.h"
++#include "linux/file.h"
++#include "asm/uaccess.h"
++#include "asm/mmu_context.h"
++
++static struct file_operations proc_mm_fops;
++
++struct mm_struct *proc_mm_get_mm(int fd)
++{
++      struct mm_struct *ret = ERR_PTR(-EBADF);
++      struct file *file;
++      /* Map fd to the mm kept in its private_data; ERR_PTR(-EBADF) if fd is not open. */
++      file = fget(fd);
++      if (!file)
++              goto out;
++      /* Only files using proc_mm_fops carry an mm; anything else is ERR_PTR(-EINVAL). */
++      ret = ERR_PTR(-EINVAL);
++      if(file->f_op != &proc_mm_fops)
++              goto out_fput;
++      /* NOTE(review): no mm reference is taken here, and fput() below drops the file pin. */
++      ret = file->private_data;
++ out_fput:
++      fput(file);
++ out:
++      return(ret);
++}
++
++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, 
++                   unsigned long len, unsigned long prot, 
++                   unsigned long flags, unsigned long fd,
++                   unsigned long pgoff);
++
++static ssize_t write_proc_mm(struct file *file, const char *buffer,
++                           size_t count, loff_t *ppos)
++{
++      struct mm_struct *mm = file->private_data;
++      struct proc_mm_op req = { 0 };  /* zeroed: see the short-write note below */
++      int n, ret;
++      /* Decode one struct proc_mm_op from userspace and apply it to this file's mm. */
++      if(count > sizeof(req))
++              return(-EINVAL);
++      /* count may be < sizeof(req); the uncopied tail stays zero, not stack garbage. */
++      n = copy_from_user(&req, buffer, count);
++      if(n != 0)
++              return(-EFAULT);
++
++      ret = count;
++      switch(req.op){
++      case MM_MMAP: {
++              struct mm_mmap *map = &req.u.mmap;
++              /* presumably a page-aligned return is the mapped address - confirm vs do_mmap2() */
++              ret = do_mmap2(mm, map->addr, map->len, map->prot,
++                             map->flags, map->fd, map->offset >> PAGE_SHIFT);
++              if((ret & ~PAGE_MASK) == 0)
++                      ret = count;
++
++              break;
++      }
++      case MM_MUNMAP: {
++              struct mm_munmap *unmap = &req.u.munmap;
++
++              down_write(&mm->mmap_sem);
++              ret = do_munmap(mm, unmap->addr, unmap->len);
++              up_write(&mm->mmap_sem);
++
++              if(ret == 0)
++                      ret = count;
++              break;
++      }
++      case MM_MPROTECT: {
++              struct mm_mprotect *protect = &req.u.mprotect;
++
++              ret = do_mprotect(mm, protect->addr, protect->len,
++                                protect->prot);
++              if(ret == 0)
++                      ret = count;
++              break;
++      }
++
++      case MM_COPY_SEGMENTS: {
++              struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments);
++
++              if(IS_ERR(from)){
++                      ret = PTR_ERR(from);
++                      break;
++              }
++
++              mm_copy_segments(from, mm);
++              break;
++      }
++      default:
++              ret = -EINVAL;
++              break;
++      }
++
++      return(ret);
++}
++
++static int open_proc_mm(struct inode *inode, struct file *file)
++{
++      struct mm_struct *mm = mm_alloc();
++      int ret;
++      /* Allocate a fresh mm, set up its context and attach it to this open file. */
++      ret = -ENOMEM;
++      if(mm == NULL)
++              goto out_mem;
++
++      ret = init_new_context(current, mm);
++      if(ret)
++              goto out_free;
++      /* Link the new mm next to the opener's mm, under mmlist_lock. */
++      spin_lock(&mmlist_lock);
++      list_add(&mm->mmlist, &current->mm->mmlist);
++      mmlist_nr++;
++      spin_unlock(&mmlist_lock);
++      /* Stash the mm where write_proc_mm() and release_proc_mm() read it back. */
++      file->private_data = mm;
++
++      return(0);
++
++ out_free:
++      mmput(mm);
++ out_mem:
++      return(ret);
++}
++
++static int release_proc_mm(struct inode *inode, struct file *file)
++{
++      struct mm_struct *mm = file->private_data;
++      /* Drop the mm that open_proc_mm() stored in private_data. */
++      mmput(mm);
++      return(0);
++}
++
++static struct file_operations proc_mm_fops = {
++      .open           = open_proc_mm,
++      .release        = release_proc_mm,
++      .write          = write_proc_mm,
++};
++
++static int make_proc_mm(void)
++{
++      struct proc_dir_entry *ent;
++      /* Register the write-only (0222) "mm" entry under proc_root; run via __initcall. */
++      ent = create_proc_entry("mm", 0222, &proc_root);
++      if(ent == NULL){
++              printk("make_proc_mm : Failed to register /proc/mm\n");
++              return(0);      /* failure is only logged; still returns 0 */
++      }
++      ent->proc_fops = &proc_mm_fops;
++
++      return(0);
++}
++
++__initcall(make_proc_mm);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch
new file mode 100644 (file)
index 0000000..a9cc225
--- /dev/null
@@ -0,0 +1,265 @@
+ fs/inode.c         |    1 
+ fs/namei.c         |   66 ++++++++++++++++++++++++++++++++++++++---------------
+ include/linux/fs.h |   11 ++++----
+ 3 files changed, 54 insertions(+), 24 deletions(-)
+
+--- linux-2.4.18/fs/namei.c~vfs-pdirops-2.4.18-chaos   2003-09-01 14:58:03.000000000 +0400
++++ linux-2.4.18-alexey/fs/namei.c     2003-09-01 17:56:10.000000000 +0400
+@@ -101,6 +101,36 @@ void intent_release(struct lookup_intent
+ }
++static void *lock_dir(struct inode *dir, struct qstr *name)
++{
++      unsigned long hash;
++
++      if (!IS_PDIROPS(dir)) {
++              down(&dir->i_sem);
++              return NULL;    /* unlock_dir() ignores the handle on this path */
++      }
++
++      /* The fs supports parallel directory operations, so instead
++       * of taking i_sem we lock only the hash of the requested
++       * name, to keep out concurrent operations on the same name
++       * in this directory. -bzzz */
++
++      /* calculate name hash */
++      hash = full_name_hash(name->name, name->len);
++
++      /* lock this hash. NOTE(review): result is unchecked; confirm GFP_ATOMIC failure handling */
++      return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC);
++}
++
++static void unlock_dir(struct inode *dir, void *lock)
++{
++      if (!IS_PDIROPS(dir)) {
++              up(&dir->i_sem);        /* lock_dir() took i_sem here, not a dynlock */
++              return;
++      }
++      dynlock_unlock(&dir->i_dcache_lock, lock);      /* lock: handle from lock_dir() */
++}
++
+ /* In order to reduce some races, while at the same time doing additional
+  * checking and hopefully speeding things up, we copy filenames to the
+  * kernel data space before using them..
+@@ -302,10 +332,10 @@ static struct dentry *real_lookup(struct
+ {
+       struct dentry * result;
+       struct inode *dir = parent->d_inode;
++      void *lock;
+ again:
+-
+-      down(&dir->i_sem);
++      lock = lock_dir(dir, name);
+       /*
+        * First re-do the cached lookup just in case it was created
+        * while we waited for the directory semaphore..
+@@ -329,7 +359,7 @@ again:
+                       else
+                               result = dentry;
+               }
+-              up(&dir->i_sem);
++              unlock_dir(dir, lock);
+               return result;
+       }
+@@ -337,7 +367,7 @@ again:
+        * Uhhuh! Nasty case: the cache was re-populated while
+        * we waited on the semaphore. Need to revalidate.
+        */
+-      up(&dir->i_sem);
++      unlock_dir(dir, lock);
+       if (result->d_op && result->d_op->d_revalidate) {
+               if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) {
+                       dput(result);
+@@ -1234,13 +1264,13 @@ struct file *filp_open(const char * path
+               goto exit;
+       dir = nd.dentry;
+-      down(&dir->d_inode->i_sem);
++      nd.lock = lock_dir(dir->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
+ do_last:
+       error = PTR_ERR(dentry);
+       if (IS_ERR(dentry)) {
+-              up(&dir->d_inode->i_sem);
++              unlock_dir(dir->d_inode, nd.lock);
+               goto exit;
+       }
+@@ -1249,7 +1279,7 @@ do_last:
+       if (!dentry->d_inode) {
+               error = vfs_create_it(dir->d_inode, dentry,
+                                  mode & ~current->fs->umask, &it);
+-              up(&dir->d_inode->i_sem);
++              unlock_dir(dir->d_inode, nd.lock);
+               dput(nd.dentry);
+               nd.dentry = dentry;
+               if (error)
+@@ -1264,7 +1294,7 @@ do_last:
+       /*
+        * It already exists.
+        */
+-      up(&dir->d_inode->i_sem);
++      unlock_dir(dir->d_inode, nd.lock);
+       error = -EEXIST;
+       if (flag & O_EXCL)
+@@ -1344,7 +1374,7 @@ do_link:
+               goto exit;
+       }
+       dir = nd.dentry;
+-      down(&dir->d_inode->i_sem);
++      nd.lock = lock_dir(dir->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
+       putname(nd.last.name);
+       goto do_last;
+@@ -1357,7 +1387,7 @@ static struct dentry *lookup_create(stru
+ {
+       struct dentry *dentry;
+-      down(&nd->dentry->d_inode->i_sem);
++      nd->lock = lock_dir(nd->dentry->d_inode, &nd->last);
+       dentry = ERR_PTR(-EEXIST);
+       if (nd->last_type != LAST_NORM)
+               goto fail;
+@@ -1446,7 +1476,7 @@ asmlinkage long sys_mknod(const char * f
+               }
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+       path_release(&nd);
+ out:
+@@ -1509,7 +1539,7 @@ asmlinkage long sys_mkdir(const char * p
+                                         mode & ~current->fs->umask);
+                       dput(dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+               path_release(&nd);
+ out:
+@@ -1619,14 +1649,14 @@ asmlinkage long sys_rmdir(const char * p
+               if (error != -EOPNOTSUPP)
+                       goto exit1;
+       }
+-      down(&nd.dentry->d_inode->i_sem);
++      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+               error = vfs_rmdir(nd.dentry->d_inode, dentry);
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ exit1:
+       path_release(&nd);
+ exit:
+@@ -1685,7 +1715,7 @@ asmlinkage long sys_unlink(const char * 
+               if (error != -EOPNOTSUPP)
+                       goto exit1;
+       }
+-      down(&nd.dentry->d_inode->i_sem);
++      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+@@ -1696,7 +1726,7 @@ asmlinkage long sys_unlink(const char * 
+       exit2:
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ exit1:
+       path_release(&nd);
+ exit:
+@@ -1766,7 +1796,7 @@ asmlinkage long sys_symlink(const char *
+                       error = vfs_symlink(nd.dentry->d_inode, dentry, from);
+                       dput(dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+       out2:
+               path_release(&nd);
+       out:
+@@ -1858,7 +1888,7 @@ asmlinkage long sys_link(const char * ol
+                       error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+                       dput(new_dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+ out_release:
+               path_release(&nd);
+ out:
+--- linux-2.4.18/include/linux/fs.h~vfs-pdirops-2.4.18-chaos   2003-09-01 14:58:03.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/fs.h     2003-09-01 16:36:16.000000000 +0400
+@@ -21,6 +21,7 @@
+ #include <linux/cache.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
++#include <linux/dynlocks.h>
+ #include <asm/atomic.h>
+ #include <asm/bitops.h>
+@@ -136,6 +137,7 @@ extern int leases_enable, dir_notify_ena
+ #define S_IMMUTABLE   16      /* Immutable file */
+ #define S_DEAD                32      /* removed, but still open directory */
+ #define S_NOQUOTA     64      /* Inode is not counted to quota */
++#define S_PDIROPS     256     /* Parallel directory operations */
+ /*
+  * Note that nosuid etc flags are inode-specific: setting some file-system
+@@ -162,6 +164,7 @@ extern int leases_enable, dir_notify_ena
+ #define IS_IMMUTABLE(inode)   ((inode)->i_flags & S_IMMUTABLE)
+ #define IS_NOATIME(inode)     (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+ #define IS_NODIRATIME(inode)  __IS_FLG(inode, MS_NODIRATIME)
++#define IS_PDIROPS(inode)     __IS_FLG(inode, S_PDIROPS)
+ #define IS_DEADDIR(inode)     ((inode)->i_flags & S_DEAD)
+@@ -490,6 +493,7 @@ struct inode {
+       atomic_t                i_writecount;
+       unsigned int            i_attr_flags;
+       __u32                   i_generation;
++      struct dynlock          i_dcache_lock;  /* for parallel directory ops */
+       union {
+               struct minix_inode_info         minix_i;
+               struct ext2_inode_info          ext2_i;
+@@ -713,6 +717,7 @@ struct nameidata {
+       unsigned int flags;
+       int last_type;
+       struct lookup_intent *intent;
++      void *lock;
+ };
+ #define DQUOT_USR_ENABLED     0x01            /* User diskquotas enabled */
+@@ -1610,12 +1615,6 @@ static inline struct dentry *get_parent(
+       return dget(dentry->d_parent);
+ }
+-static inline void unlock_dir(struct dentry *dir)
+-{
+-      up(&dir->d_inode->i_sem);
+-      dput(dir);
+-}
+-
+ /*
+  * Whee.. Deadlock country. Happily there are only two VFS
+  * operations that does this..
+--- linux-2.4.18/fs/inode.c~vfs-pdirops-2.4.18-chaos   2003-09-01 14:58:03.000000000 +0400
++++ linux-2.4.18-alexey/fs/inode.c     2003-09-01 16:36:16.000000000 +0400
+@@ -119,6 +119,7 @@ static struct inode *alloc_inode(struct 
+               mapping->host = inode;
+               mapping->gfp_mask = GFP_HIGHUSER;
+               inode->i_mapping = mapping;
++              dynlock_init(&inode->i_dcache_lock);
+       }
+       return inode;
+ }
+
+_
diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..c9228a8
--- /dev/null
@@ -0,0 +1,269 @@
+ fs/inode.c         |    1 
+ fs/namei.c         |   66 ++++++++++++++++++++++++++++++++++++++---------------
+ include/linux/fs.h |   11 ++++----
+ 3 files changed, 54 insertions(+), 24 deletions(-)
+
+Index: linux-2.4.20-rh/fs/namei.c
+===================================================================
+--- linux-2.4.20-rh.orig/fs/namei.c    2003-09-04 20:58:33.000000000 +0800
++++ linux-2.4.20-rh/fs/namei.c 2003-09-04 21:21:20.000000000 +0800
+@@ -101,6 +101,36 @@
+ }
++static void *lock_dir(struct inode *dir, struct qstr *name)
++{
++      unsigned long hash;
++      
++      if (!IS_PDIROPS(dir)) {
++              down(&dir->i_sem);      /* no pdirops: take the directory semaphore */
++              return 0;               /* no dynlock handle in this mode */
++      }
++
++      /* The fs understands parallel directory operations, so
++       * instead of i_sem we acquire a lock on the hash of the
++       * requested filename; this keeps operations on the same
++       * name from running at the same time. -bzzz */
++
++      /* calculate the hash of the requested name */
++      hash = full_name_hash(name->name, name->len);
++
++      /* lock this hash; the returned handle must be given to unlock_dir() */
++      return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC);
++}
++
++static void unlock_dir(struct inode *dir, void *lock)
++{
++      if (!IS_PDIROPS(dir)) {
++              up(&dir->i_sem);        /* counterpart of down() in lock_dir(); 'lock' is not used here */
++              return;
++      }
++      dynlock_unlock(&dir->i_dcache_lock, lock);      /* 'lock' is the handle returned by lock_dir() */
++}
++
+ /* In order to reduce some races, while at the same time doing additional
+  * checking and hopefully speeding things up, we copy filenames to the
+  * kernel data space before using them..
+@@ -302,10 +332,10 @@
+ {
+       struct dentry * result;
+       struct inode *dir = parent->d_inode;
++      void *lock;
+ again:
+-
+-      down(&dir->i_sem);
++      lock = lock_dir(dir, name);
+       /*
+        * First re-do the cached lookup just in case it was created
+        * while we waited for the directory semaphore..
+@@ -329,7 +359,7 @@
+                       else
+                               result = dentry;
+               }
+-              up(&dir->i_sem);
++              unlock_dir(dir, lock);
+               return result;
+       }
+@@ -337,7 +367,7 @@
+        * Uhhuh! Nasty case: the cache was re-populated while
+        * we waited on the semaphore. Need to revalidate.
+        */
+-      up(&dir->i_sem);
++      unlock_dir(dir, lock);
+       if (result->d_op && result->d_op->d_revalidate) {
+               if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) {
+                       dput(result);
+@@ -1180,13 +1210,13 @@
+               goto exit;
+       dir = nd->dentry;
+-      down(&dir->d_inode->i_sem);
++      nd->lock = lock_dir(dir->d_inode, &nd->last);
+       dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+ do_last:
+       error = PTR_ERR(dentry);
+       if (IS_ERR(dentry)) {
+-              up(&dir->d_inode->i_sem);
++              unlock_dir(dir->d_inode, nd->lock);
+               goto exit;
+       }
+@@ -1195,7 +1225,7 @@
+       if (!dentry->d_inode) {
+               error = vfs_create_it(dir->d_inode, dentry,
+                                  mode & ~current->fs->umask, it);
+-              up(&dir->d_inode->i_sem);
++              unlock_dir(dir->d_inode, nd->lock);             
+               dput(nd->dentry);
+               nd->dentry = dentry;
+               if (error)
+@@ -1209,7 +1239,7 @@
+       /*
+        * It already exists.
+        */
+-      up(&dir->d_inode->i_sem);
++      unlock_dir(dir->d_inode, nd->lock);
+       error = -EEXIST;
+       if (flag & O_EXCL)
+@@ -1362,7 +1392,7 @@
+               goto exit;
+       }
+       dir = nd->dentry;
+-      down(&dir->d_inode->i_sem);
++      nd->lock = lock_dir(dir->d_inode, &nd->last);
+       dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+       putname(nd->last.name);
+       goto do_last;
+@@ -1380,7 +1410,7 @@
+ {
+       struct dentry *dentry;
+-      down(&nd->dentry->d_inode->i_sem);
++      nd->lock = lock_dir(nd->dentry->d_inode, &nd->last);
+       dentry = ERR_PTR(-EEXIST);
+       if (nd->last_type != LAST_NORM)
+               goto fail;
+@@ -1469,7 +1499,7 @@
+               }
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+       path_release(&nd);
+ out:
+@@ -1532,7 +1562,7 @@
+                                         mode & ~current->fs->umask);
+                       dput(dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+ out2:
+               path_release(&nd);
+ out:
+@@ -1642,14 +1672,14 @@
+               if (error != -EOPNOTSUPP)
+                       goto exit1;
+       }
+-      down(&nd.dentry->d_inode->i_sem);
++      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+               error = vfs_rmdir(nd.dentry->d_inode, dentry);
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ exit1:
+       path_release(&nd);
+ exit:
+@@ -1708,7 +1738,7 @@
+               if (error != -EOPNOTSUPP)
+                       goto exit1;
+       }
+-      down(&nd.dentry->d_inode->i_sem);
++      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
+       dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+@@ -1719,7 +1749,7 @@
+       exit2:
+               dput(dentry);
+       }
+-      up(&nd.dentry->d_inode->i_sem);
++      unlock_dir(nd.dentry->d_inode, nd.lock);
+ exit1:
+       path_release(&nd);
+ exit:
+@@ -1789,7 +1819,7 @@
+                       error = vfs_symlink(nd.dentry->d_inode, dentry, from);
+                       dput(dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+       out2:
+               path_release(&nd);
+       out:
+@@ -1881,7 +1911,7 @@
+                       error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+                       dput(new_dentry);
+               }
+-              up(&nd.dentry->d_inode->i_sem);
++              unlock_dir(nd.dentry->d_inode, nd.lock);
+ out_release:
+               path_release(&nd);
+ out:
+Index: linux-2.4.20-rh/include/linux/fs.h
+===================================================================
+--- linux-2.4.20-rh.orig/include/linux/fs.h    2003-09-04 20:59:14.000000000 +0800
++++ linux-2.4.20-rh/include/linux/fs.h 2003-09-04 21:03:46.000000000 +0800
+@@ -21,6 +21,7 @@
+ #include <linux/cache.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
++#include <linux/dynlocks.h>
+ #include <asm/atomic.h>
+ #include <asm/bitops.h>
+@@ -136,6 +137,7 @@
+ #define S_IMMUTABLE   16      /* Immutable file */
+ #define S_DEAD                32      /* removed, but still open directory */
+ #define S_NOQUOTA     64      /* Inode is not counted to quota */
++#define S_PDIROPS     256     /* Parallel directory operations */
+ /*
+  * Note that nosuid etc flags are inode-specific: setting some file-system
+@@ -162,6 +164,7 @@
+ #define IS_IMMUTABLE(inode)   ((inode)->i_flags & S_IMMUTABLE)
+ #define IS_NOATIME(inode)     (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+ #define IS_NODIRATIME(inode)  __IS_FLG(inode, MS_NODIRATIME)
++#define IS_PDIROPS(inode)     __IS_FLG(inode, S_PDIROPS)
+ #define IS_DEADDIR(inode)     ((inode)->i_flags & S_DEAD)
+@@ -489,6 +492,7 @@
+       atomic_t                i_writecount;
+       unsigned int            i_attr_flags;
+       __u32                   i_generation;
++      struct dynlock          i_dcache_lock;  /* for parallel directory ops */
+       union {
+               struct minix_inode_info         minix_i;
+               struct ext2_inode_info          ext2_i;
+@@ -708,6 +712,7 @@
+       unsigned int flags;
+       int last_type;
+       struct lookup_intent *intent;
++      void *lock;
+ };
+ /*
+@@ -1621,12 +1626,6 @@
+       return dget(dentry->d_parent);
+ }
+-static inline void unlock_dir(struct dentry *dir)
+-{
+-      up(&dir->d_inode->i_sem);
+-      dput(dir);
+-}
+-
+ /*
+  * Whee.. Deadlock country. Happily there are only two VFS
+  * operations that does this..
+Index: linux-2.4.20-rh/fs/inode.c
+===================================================================
+--- linux-2.4.20-rh.orig/fs/inode.c    2003-09-04 20:58:35.000000000 +0800
++++ linux-2.4.20-rh/fs/inode.c 2003-09-04 21:03:46.000000000 +0800
+@@ -121,6 +121,7 @@
+               mapping->host = inode;
+               mapping->gfp_mask = GFP_HIGHUSER;
+               inode->i_mapping = mapping;
++              dynlock_init(&inode->i_dcache_lock);
+       }
+       return inode;
+ }
diff --git a/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc b/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..b626dcf
--- /dev/null
@@ -0,0 +1,3 @@
+include/linux/dynlocks.h
+lib/dynlocks.c
+lib/Makefile
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-5.pc b/lustre/kernel_patches/pc/ext-2.4-patch-5.pc
new file mode 100644 (file)
index 0000000..7191405
--- /dev/null
@@ -0,0 +1 @@
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc
new file mode 100644 (file)
index 0000000..bd89204
--- /dev/null
@@ -0,0 +1,20 @@
+fs/ext3/balloc.c
+fs/ext3/balloc.c.orig
+fs/ext3/dir.c
+fs/ext3/dir.c.orig
+fs/ext3/ialloc.c
+fs/ext3/ialloc.c.orig
+fs/ext3/inode.c
+fs/ext3/inode.c.orig
+fs/ext3/ioctl.c
+fs/ext3/ioctl.c.orig
+fs/ext3/namei.c
+fs/ext3/namei.c.orig
+fs/ext3/super.c
+fs/ext3/super.c.orig
+fs/ext3/symlink.c
+fs/ext3/symlink.c.orig
+include/linux/ext3_fs.h
+include/linux/ext3_fs.h.orig
+include/linux/ext3_jbd.h
+include/linux/ext3_jbd.h.orig
diff --git a/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..9b16759
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/namei.c
diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc
new file mode 100644 (file)
index 0000000..42243c8
--- /dev/null
@@ -0,0 +1,6 @@
+fs/ext3/file.c
+fs/ext3/file.c.orig
+fs/ext3/inode.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..f408025
--- /dev/null
@@ -0,0 +1,8 @@
+fs/ext3/extents.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/Makefile
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_i.h
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..56c1739
--- /dev/null
@@ -0,0 +1,19 @@
+fs/ext3/ialloc.c
+fs/ext3/namei.c
+include/asm-alpha/fcntl.h
+include/asm-arm/fcntl.h
+include/asm-cris/fcntl.h
+include/asm-i386/fcntl.h
+include/asm-ia64/fcntl.h
+include/asm-m68k/fcntl.h
+include/asm-mips64/fcntl.h
+include/asm-mips/fcntl.h
+include/asm-parisc/fcntl.h
+include/asm-ppc/fcntl.h
+include/asm-s390/fcntl.h
+include/asm-s390x/fcntl.h
+include/asm-sh/fcntl.h
+include/asm-sparc64/fcntl.h
+include/asm-sparc/fcntl.h
+include/linux/ext3_fs.h
+fs/ext3/inode.c
diff --git a/lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc b/lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc
new file mode 100644 (file)
index 0000000..231df0e
--- /dev/null
@@ -0,0 +1,2 @@
+fs/ext3/inode.c
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc b/lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc
new file mode 100644 (file)
index 0000000..08795de
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..2ad2584
--- /dev/null
@@ -0,0 +1,6 @@
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_i.h
+fs/ext3/inode.c
+fs/ext3/ialloc.c
diff --git a/lustre/kernel_patches/pc/iopen-2.4.18-2.pc b/lustre/kernel_patches/pc/iopen-2.4.18-2.pc
new file mode 100644 (file)
index 0000000..308490e
--- /dev/null
@@ -0,0 +1,8 @@
+Documentation/filesystems/ext2.txt
+fs/ext3/inode.c
+fs/ext3/iopen.c
+fs/ext3/iopen.h
+fs/ext3/Makefile
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/iopen-2.6.0.pc b/lustre/kernel_patches/pc/iopen-2.6.0.pc
new file mode 100644 (file)
index 0000000..308490e
--- /dev/null
@@ -0,0 +1,8 @@
+Documentation/filesystems/ext2.txt
+fs/ext3/inode.c
+fs/ext3/iopen.c
+fs/ext3/iopen.h
+fs/ext3/Makefile
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc
new file mode 100644 (file)
index 0000000..1078cb4
--- /dev/null
@@ -0,0 +1,11 @@
+fs/ext3/ext3-exports.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/Makefile
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/xattr.c
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/xattr.h
diff --git a/lustre/kernel_patches/pc/removepage-2.4.20.pc b/lustre/kernel_patches/pc/removepage-2.4.20.pc
new file mode 100644 (file)
index 0000000..c659e15
--- /dev/null
@@ -0,0 +1,2 @@
+include/linux/fs.h
+mm/filemap.c
diff --git a/lustre/kernel_patches/pc/removepage-2.6.0.pc b/lustre/kernel_patches/pc/removepage-2.6.0.pc
new file mode 100644 (file)
index 0000000..c659e15
--- /dev/null
@@ -0,0 +1,2 @@
+include/linux/fs.h
+mm/filemap.c
diff --git a/lustre/kernel_patches/pc/uml-2.6.0-fix.pc b/lustre/kernel_patches/pc/uml-2.6.0-fix.pc
new file mode 100644 (file)
index 0000000..980e3ee
--- /dev/null
@@ -0,0 +1 @@
+include/asm-um/unistd.h
diff --git a/lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc b/lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc
new file mode 100644 (file)
index 0000000..9a32c9a
--- /dev/null
@@ -0,0 +1,113 @@
+arch/um/config.release
+arch/um/defconfig
+arch/um/drivers/chan_kern.c
+arch/um/drivers/chan_user.c
+arch/um/drivers/cow.h
+arch/um/drivers/cow_kern.c
+arch/um/drivers/cow_sys.h
+arch/um/drivers/cow_user.c
+arch/um/drivers/hostaudio_kern.c
+arch/um/drivers/line.c
+arch/um/drivers/Makefile
+arch/um/drivers/mconsole_kern.c
+arch/um/drivers/mconsole_user.c
+arch/um/drivers/mmapper_kern.c
+arch/um/drivers/net_kern.c
+arch/um/drivers/port_kern.c
+arch/um/drivers/ssl.c
+arch/um/drivers/stdio_console.c
+arch/um/drivers/ubd_kern.c
+arch/um/drivers/ubd_user.c
+arch/um/drivers/xterm.c
+arch/um/drivers/xterm_kern.c
+arch/um/dyn.lds.S
+arch/um/include/irq_kern.h
+arch/um/include/kern_util.h
+arch/um/include/line.h
+arch/um/include/mconsole.h
+arch/um/include/mem.h
+arch/um/include/mem_user.h
+arch/um/include/os.h
+arch/um/include/sysdep-i386/sigcontext.h
+arch/um/include/ubd_user.h
+arch/um/include/user.h
+arch/um/include/user_util.h
+arch/um/Kconfig
+arch/um/Kconfig_block
+arch/um/Kconfig_net
+arch/um/kernel/config.c.in
+arch/um/kernel/exec_kern.c
+arch/um/kernel/init_task.c
+arch/um/kernel/irq.c
+arch/um/kernel/Makefile
+arch/um/kernel/mem.c
+arch/um/kernel/mem_user.c
+arch/um/kernel/process.c
+arch/um/kernel/process_kern.c
+arch/um/kernel/ptrace.c
+arch/um/kernel/sigio_kern.c
+arch/um/kernel/signal_kern.c
+arch/um/kernel/skas/include/mode.h
+arch/um/kernel/skas/include/uaccess.h
+arch/um/kernel/skas/Makefile
+arch/um/kernel/skas/process.c
+arch/um/kernel/skas/process_kern.c
+arch/um/kernel/skas/util/mk_ptregs.c
+arch/um/kernel/smp.c
+arch/um/kernel/syscall_kern.c
+arch/um/kernel/sys_call_table.c
+arch/um/kernel/sysrq.c
+arch/um/kernel/time.c
+arch/um/kernel/time_kern.c
+arch/um/kernel/trap_kern.c
+arch/um/kernel/trap_user.c
+arch/um/kernel/tt/include/uaccess.h
+arch/um/kernel/tt/process_kern.c
+arch/um/kernel/tt/ptproxy/proxy.c
+arch/um/kernel/tt/tracer.c
+arch/um/kernel/tt/uaccess_user.c
+arch/um/kernel/tty_log.c
+arch/um/kernel/um_arch.c
+arch/um/kernel/umid.c
+arch/um/kernel/user_util.c
+arch/um/Makefile
+arch/um/Makefile-i386
+arch/um/Makefile-skas
+arch/um/os-Linux/drivers/tuntap_user.c
+arch/um/os-Linux/file.c
+arch/um/sys-i386/bugs.c
+arch/um/sys-i386/Makefile
+arch/um/uml.lds.S
+arch/um/util/mk_constants_kern.c
+fs/hostfs/hostfs.h
+fs/hostfs/hostfs_kern.c
+fs/hostfs/hostfs_user.c
+fs/hostfs/Makefile
+fs/hppfs/hppfs_kern.c
+fs/hppfs/Makefile
+fs/Makefile
+include/asm-um/archparam-i386.h
+include/asm-um/common.lds.S
+include/asm-um/cpufeature.h
+include/asm-um/current.h
+include/asm-um/fixmap.h
+include/asm-um/irq.h
+include/asm-um/local.h
+include/asm-um/module-generic.h
+include/asm-um/module-i386.h
+include/asm-um/page.h
+include/asm-um/pgtable.h
+include/asm-um/processor-generic.h
+include/asm-um/processor-i386.h
+include/asm-um/sections.h
+include/asm-um/smp.h
+include/asm-um/system-generic.h
+include/asm-um/thread_info.h
+include/asm-um/timex.h
+include/linux/mm.h
+include/linux/proc_mm.h
+mm/Makefile
+mm/memory.c
+mm/mmap.c
+mm/mprotect.c
+mm/proc_mm.c
diff --git a/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc
new file mode 100644 (file)
index 0000000..f244b84
--- /dev/null
@@ -0,0 +1,3 @@
+fs/namei.c
+include/linux/fs.h
+fs/inode.c
index 8a36dc0..5ecead5 100644 (file)
@@ -23,6 +23,6 @@ iopen-2.4.18.patch
 jbd-dont-account-blocks-twice.patch
 jbd-commit-tricks.patch
 ext3-o_direct-1-2.4.18-chaos.patch
-ext3-no-write-super.patch
-jbd-ctx_switch.patch
-jbd-get_write_access.patch
+ext3-no-write-super-chaos.patch
+ext3-extents-2.4.18-chaos.patch
+ext3-extents-oflag-2.4.18-chaos.patch
diff --git a/lustre/kernel_patches/series/chaos-2.4.18-pdirops b/lustre/kernel_patches/series/chaos-2.4.18-pdirops
new file mode 100644 (file)
index 0000000..d4545e2
--- /dev/null
@@ -0,0 +1,35 @@
+dev_read_only.patch
+exports.patch
+kmem_cache_validate.patch
+lustre_version.patch
+vfs_intent-2.4.18-18-chaos65.patch
+invalidate_show.patch
+iod-rmap-exports.patch
+export-truncate.patch
+ext3-compat-2.4.18-chaos.patch
+ext-2.4-patch-1.patch
+ext-2.4-patch-2.patch
+ext-2.4-patch-3.patch
+ext-2.4-patch-4.patch
+ext-2.4-patch-5.patch
+linux-2.4.18ea-0.8.26-2.patch
+ext3-2.4-ino_t.patch
+ext3-2.4.18-ino_sb_macro-2.patch
+ext3-orphan_lock.patch
+ext3-delete_thread-2.4.18-2.patch
+extN-misc-fixup.patch
+extN-noread.patch
+extN-wantedi.patch
+ext3-san-2.4.20.patch
+extN-2.4.18-ino_sb_fixup.patch
+ext3-map_inode_page_2.4.18.patch
+ext3-error-export.patch
+iopen-2.4.18-2.patch
+jbd-dont-account-blocks-twice.patch
+jbd-commit-tricks.patch
+ext3-o_direct-1-2.4.18-chaos.patch
+ext3-no-write-super-chaos.patch
+dynamic-locks-2.4.18-chaos.patch
+vfs-pdirops-2.4.18-chaos.patch
+ext3-pdirops-2.4.18-chaos.patch
+add_page_private.patch
diff --git a/lustre/kernel_patches/series/uml_2.6.0_test3 b/lustre/kernel_patches/series/uml_2.6.0_test3
new file mode 100644 (file)
index 0000000..7b89a36
--- /dev/null
@@ -0,0 +1,14 @@
+uml-patch-2.6.0-test3-1.patch
+lustre_build.patch
+lustre_version.patch
+vfs_intent_2.6.0-test1.patch
+vfs_nointent_2.6.0-test1.patch
+vfs_races_2.5.72_rev1.patch
+vfs_mntcwd_2.5.72_rev1.patch
+ext3-san-jdike-2.5.73.patch
+iopen-2.6.0.patch
+export-truncate-2.5.63.patch
+qla2xxx-v8.00.00b1-2.5.73.patch
+uml-2.6.0-fix.patch
+ext3-map_inode_page-2.6.0.patch
+removepage-2.6.0.patch
index 88af047..5d4c927 100644 (file)
@@ -470,6 +470,7 @@ static int llu_file_release(struct inode *inode)
         if (!fd) /* no process opened the file after an mcreate */
                 RETURN(rc = 0);
 
+#if 0
         /* we might not be able to get a valid handle on this file
          * again so we really want to flush our write cache.. */
         if (S_ISREG(inode->i_mode) && lsm) {
@@ -481,11 +482,12 @@ static int llu_file_release(struct inode *inode)
                 memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
                 oa.o_valid |= OBD_MD_FLHANDLE;
 
-                rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                rc = obd_close(ll_s2obdexp(sbi), &oa, lsm, NULL);
                 if (rc)
                         CERROR("inode %lu object close failed: rc = "
                                "%d\n", lli->lli_st_ino, rc);
        }
+#endif
 
         rc2 = llu_mdc_close(&sbi->ll_mdc_conn, inode);
         if (rc2 && !rc)
index ce2e23b..977dbca 100644 (file)
@@ -20,7 +20,7 @@ struct llu_sb_info
 {
         struct obd_uuid         ll_sb_uuid;
         struct lustre_handle    ll_mdc_conn;
-        struct lustre_handle    ll_osc_conn;
+        struct obd_export      ll_osc_exp;
         obd_id                  ll_rootino;
         int                     ll_flags;
         struct list_head        ll_conn_chain;
index 847b1d0..c5df187 100644 (file)
@@ -74,7 +74,7 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
         down(&lli->lli_getattr_sem);
 
         if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) {
-                rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
+                rc = ll_inode_getattr(inode, lsm);
                 if (rc == 0) {
                         set_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
                 } else {
index 0939352..a51be12 100644 (file)
@@ -170,8 +170,7 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
         dst->o_valid |= (valid & ~OBD_MD_FLID);
 }
 
-int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
-                      char *ostdata)
+static int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm)
 {
         struct llu_sb_info *sbi = llu_i2sbi(inode);
         struct obdo oa;
@@ -187,11 +186,6 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
                 OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
 
-        if (ostdata != NULL) {
-                memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
-                oa.o_valid |= OBD_MD_FLHANDLE;
-        }
-
         rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
         if (rc)
                 RETURN(rc);
@@ -327,7 +321,7 @@ static int llu_iop_lookup(struct pnode *pnode,
         llu_update_inode(*inop, body, lic.lic_lsm);
 
         if (llu_i2info(*inop)->lli_smd) {
-                rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd, NULL);
+                rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd);
                 if (rc)
                         _sysio_i_gone(*inop);
         }
diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c
new file mode 100644 (file)
index 0000000..2e63dc7
--- /dev/null
@@ -0,0 +1,550 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+# include <linux/obd_class.h>
+#endif
+
+#include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lprocfs_status.h>
+#include "mdc_internal.h"
+
+/* Report which of the disposition bits in @flag are set on intent @it.
+ * Returns the masked value, so the result is non-zero iff at least one
+ * requested bit is recorded in it->d.lustre.it_disposition. */
+int it_disposition(struct lookup_intent *it, int flag)
+{
+        int recorded = it->d.lustre.it_disposition;
+
+        return recorded & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+/* Record the disposition bits in @flag on intent @it; bits that are
+ * already set in it->d.lustre.it_disposition are left as they are. */
+void it_set_disposition(struct lookup_intent *it, int flag)
+{
+        it->d.lustre.it_disposition = it->d.lustre.it_disposition | flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+/* Fill in @data from the caller's context, fids, name and create mode.
+ *
+ * @data, @ctxt and @f1 are mandatory (LASSERTed).  @f2 may be NULL, in
+ * which case data->fid2 is zeroed.  @name is stored by pointer, it is
+ * not copied. */
+static void mdc_fid2mdc_op_data(struct mdc_op_data *data, struct ll_uctxt *ctxt,
+                                struct ll_fid *f1, struct ll_fid *f2,
+                                const char *name, int namelen, int mode)
+{
+        LASSERT(data);
+        LASSERT(ctxt);
+        LASSERT(f1);
+
+        data->ctxt = *ctxt;
+        data->fid1 = *f1;
+
+        /* the second fid is optional */
+        if (f2 == NULL)
+                memset(&data->fid2, 0, sizeof(data->fid2));
+        else
+                data->fid2 = *f2;
+
+        data->create_mode = mode;
+        data->namelen = namelen;
+        data->name = name;
+}
+
+/* Choose the lock mode to request for intent @it.
+ *
+ * Returns LCK_PW when IT_CREAT is set in it->it_op, and LCK_PR when any of
+ * IT_READDIR, IT_GETATTR, IT_OPEN or IT_LOOKUP is set.  Any other intent
+ * triggers LBUG(); -EINVAL is returned if execution continues past it. */
+static int it_to_lock_mode(struct lookup_intent *it)
+{
+        /* CREAT needs to be tested before open (both could be set) */
+        if (it->it_op & IT_CREAT)
+                return LCK_PW;
+        if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP))
+                return LCK_PR;
+
+        LBUG();
+        /* There is no ENTRY in this function, so leave with a plain return
+         * like the branches above instead of the RETURN() tracing macro. */
+        return -EINVAL;
+}
+
+/* Return the intent status that applies to @phase.
+ *
+ * The disposition bits DISP_OPEN_OPEN, DISP_OPEN_CREATE, DISP_LOOKUP_EXECD
+ * and DISP_IT_EXECD are checked in that fixed order.  The first one found
+ * set on @it decides the result: it->d.lustre.it_status if it equals
+ * @phase, otherwise 0.  If none of them is set, the intent's disposition
+ * and status are logged, LBUG() is raised and 0 is returned. */
+int it_open_error(int phase, struct lookup_intent *it)
+{
+        static const int disp_order[] = {
+                DISP_OPEN_OPEN,
+                DISP_OPEN_CREATE,
+                DISP_LOOKUP_EXECD,
+                DISP_IT_EXECD,
+        };
+        unsigned int i;
+
+        for (i = 0; i < sizeof(disp_order) / sizeof(disp_order[0]); i++) {
+                if (!it_disposition(it, disp_order[i]))
+                        continue;
+
+                /* only the first disposition found set is considered */
+                if (phase == disp_order[i])
+                        return it->d.lustre.it_status;
+                return 0;
+        }
+
+        CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+               it->d.lustre.it_status);
+        LBUG();
+        return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock */
+void mdc_set_lock_data(__u64 *l, void *data)
+{
+        struct ldlm_lock *lock;
+        struct lustre_handle *lockh = (struct lustre_handle *)l;
+        ENTRY;
+
+        if (!*l) {
+                EXIT;
+                return;
+        }
+
+        lock = ldlm_handle2lock(lockh);
+
+        LASSERT(lock != NULL);
+        l_lock(&lock->l_resource->lr_namespace->ns_lock);
+#if !defined(LIBLUSTRE)
+        if (lock->l_data && lock->l_data != data) {
+                struct inode *new_inode = data;
+                struct inode *old_inode = lock->l_data;
+                unsigned long state = old_inode->i_state & I_FREEING;
+                CERROR("Found existing inode %p/%lu/%u state %lu in lock: "
+                       "setting data to %p/%lu/%u\n", old_inode,
+                       old_inode->i_ino, old_inode->i_generation, state,
+                       new_inode, new_inode->i_ino, new_inode->i_generation);
+                LASSERT(state);
+        }
+#endif
+        lock->l_data = data;
+        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        LDLM_LOCK_PUT(lock);
+
+        EXIT;
+}
+EXPORT_SYMBOL(mdc_set_lock_data);
+
+/* Hand @it and @data to ldlm_change_cbdata() for the resource whose name
+ * is built from @fid (id, then generation; remaining name slots zero), in
+ * the namespace of the obd device behind @exp.  Always returns 0. */
+int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid,
+                      ldlm_iterator_t it, void *data)
+{
+        struct obd_device *obd;
+        struct ldlm_res_id res_id;
+        ENTRY;
+
+        memset(&res_id, 0, sizeof(res_id));
+        res_id.name[0] = fid->id;
+        res_id.name[1] = fid->generation;
+
+        obd = class_exp2obd(exp);
+        ldlm_change_cbdata(obd->obd_namespace, &res_id, it, data);
+
+        EXIT;
+        return 0;
+}
+
+
+
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type. */
+int mdc_enqueue(struct obd_export *exp,
+                int lock_type,
+                struct lookup_intent *it,
+                int lock_mode,
+                struct mdc_op_data *data,
+                struct lustre_handle *lockh,
+                char *tgt,
+                int tgtlen,
+                ldlm_completion_callback cb_completion,
+                ldlm_blocking_callback cb_blocking,
+                void *cb_data)
+{
+        struct ptlrpc_request *req;
+        struct obd_device *obddev = class_exp2obd(exp);
+        struct ldlm_res_id res_id =
+                { .name = {data->fid1.id, data->fid1.generation} };
+        int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
+        int rc, flags = LDLM_FL_HAS_INTENT;
+        int repsize[4] = {sizeof(struct ldlm_reply),
+                          sizeof(struct mds_body),
+                          obddev->u.cli.cl_max_mds_easize,
+                          obddev->u.cli.cl_max_mds_cookiesize};
+        struct ldlm_reply *dlm_rep;
+        struct ldlm_intent *lit;
+        struct ldlm_request *lockreq;
+        void *eadata;
+        unsigned long irqflags;
+        int   reply_buffers = 0;
+        ENTRY;
+
+//        LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu",
+//                          ldlm_it2str(it->it_op), it_name, it_inode->i_ino);
+
+        if (it->it_op & IT_OPEN) {
+                it->it_create_mode |= S_IFREG;
+                it->it_create_mode &= ~current->fs->umask;
+
+                size[2] = sizeof(struct mds_rec_create);
+                size[3] = data->namelen + 1;
+                size[4] = obddev->u.cli.cl_max_mds_easize;
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 5,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                spin_lock_irqsave (&req->rq_lock, irqflags);
+                req->rq_replay = 1;
+                spin_unlock_irqrestore (&req->rq_lock, irqflags);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
+
+                /* pack the intended request */
+                mdc_open_pack(req, 2, data, it->it_create_mode, 0, 
+                              LTIME_S(CURRENT_TIME),
+                              it->it_flags, tgt, tgtlen);
+                /* get ready for the reply */
+                reply_buffers = 3;
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op & IT_UNLINK) {
+                size[2] = sizeof(struct mds_rec_unlink);
+                size[3] = data->namelen + 1;
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
+
+                /* pack the intended request */
+                mdc_unlink_pack(req, 2, data);
+                /* get ready for the reply */
+                reply_buffers = 4;
+                req->rq_replen = lustre_msg_size(4, repsize);
+        } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+                int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
+                size[2] = sizeof(struct mds_body);
+                size[3] = data->namelen + 1;
+
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
+
+                /* pack the intended request */
+                mdc_getattr_pack(req, valid, 2, it->it_flags, data);
+                /* get ready for the reply */
+                reply_buffers = 3;
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op == IT_READDIR) {
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* get ready for the reply */
+                reply_buffers = 1;
+                req->rq_replen = lustre_msg_size(1, repsize);
+        }  else {
+                LBUG();
+                RETURN(-EINVAL);
+        }
+
+        mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+        rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, NULL, res_id,
+                              lock_type, NULL, 0, lock_mode, &flags,
+                              cb_completion, cb_blocking, cb_data, lockh);
+        mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+        /* Similarly, if we're going to replay this request, we don't want to
+         * actually get a lock, just perform the intent. */
+        if (req->rq_transno || req->rq_replay) {
+                lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq));
+                lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
+        }
+
+        /* This can go when we're sure that this can never happen */
+        LASSERT(rc != -ENOENT);
+        if (rc == ELDLM_LOCK_ABORTED) {
+                lock_mode = 0;
+                memset(lockh, 0, sizeof(*lockh));
+                rc = 0;
+        } else if (rc != 0) {
+                CERROR("ldlm_cli_enqueue: %d\n", rc);
+                LASSERT (rc < 0);
+                ptlrpc_req_finished(req);
+                RETURN(rc);
+        } else { /* rc = 0 */
+                struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+                LASSERT(lock);
+
+                /* If the server gave us back a different lock mode, we should
+                 * fix up our variables. */
+                if (lock->l_req_mode != lock_mode) {
+                        ldlm_lock_addref(lockh, lock->l_req_mode);
+                        ldlm_lock_decref(lockh, lock_mode);
+                        lock_mode = lock->l_req_mode;
+                }
+
+                LDLM_LOCK_PUT(lock);
+        }
+
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
+        LASSERT(dlm_rep != NULL);           /* checked by ldlm_cli_enqueue() */
+        LASSERT_REPSWABBED(req, 0);         /* swabbed by ldlm_cli_enqueue() */
+
+        it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1;
+        it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2;
+        it->d.lustre.it_lock_mode = lock_mode;
+        it->d.lustre.it_data = req;
+
+        /* We know what to expect, so we do any byte flipping required here */
+        LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1);
+        if (reply_buffers >= 3) {
+                struct mds_body *body;
+
+                body = lustre_swab_repbuf(req, 1, sizeof (*body),
+                                           lustre_swab_mds_body);
+                if (body == NULL) {
+                        CERROR ("Can't swab mds_body\n");
+                        RETURN (-EPROTO);
+                }
+
+                if ((body->valid & OBD_MD_FLEASIZE) != 0) {
+                        void *replayea;
+                        /* The eadata is opaque; just check that it is
+                         * there.  Eventually, obd_unpackmd() will check
+                         * the contents */
+                        eadata = lustre_swab_repbuf(req, 2, body->eadatasize,
+                                                    NULL);
+                        if (eadata == NULL) {
+                                CERROR ("Missing/short eadata\n");
+                                RETURN (-EPROTO);
+                        }
+                        if (it->it_op & IT_OPEN) {
+                                replayea = lustre_msg_buf(req->rq_reqmsg, 4, 
+                                                          obddev->u.cli.cl_max_mds_easize);
+                                LASSERT(replayea);
+                                memcpy(replayea, eadata, body->eadatasize);
+                        }
+                }
+        }
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(mdc_enqueue);
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want the lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * otherwise if DISP_OPEN_CREATE is set, then it_status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ *
+ * Parameters:
+ *   exp, uctxt   - export and user context the enqueue is issued with
+ *   pfid         - fid of the parent directory
+ *   name, len    - child name and its length in bytes
+ *   cfid         - fid of the child when revalidating, otherwise NULL
+ *   it           - lookup intent; its d.lustre fields are filled in here
+ *   flags        - not referenced by this function
+ *   reqp         - set to the request saved in it->d.lustre.it_data once
+ *                  the enqueue step is behind us
+ *   cb_blocking  - blocking callback handed to mdc_enqueue()
+ *
+ * Return value:
+ *   - revalidation (cfid set and the intent is exactly IT_LOOKUP or
+ *     IT_GETATTR): the result of ldlm_lock_match(); when it is non-zero
+ *     the matched handle and mode are stored in the intent.
+ *   - otherwise: a negative value from mdc_enqueue(), the intent status,
+ *     an it_open_error() result, -ESTALE if the fid in the reply differs
+ *     from cfid, or 0.
+ */
+int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt,
+                    struct ll_fid *pfid, const char *name, int len,
+                    struct ll_fid *cfid, struct lookup_intent *it, int flags,
+                    struct ptlrpc_request **reqp,
+                    ldlm_blocking_callback cb_blocking)
+{
+        struct lustre_handle lockh;
+        struct lustre_handle old_lock;
+        struct ptlrpc_request *request;
+        struct mds_body *mds_body;
+        struct ldlm_lock *lock;
+        int rc = 0;
+        ENTRY;
+        LASSERT(it);
+
+        /* The enqueue below is skipped when DISP_ENQ_COMPLETE is already
+         * set.  Start from a zeroed handle (the same value mdc_enqueue()
+         * stores for an aborted lock) so that the ldlm_handle2lock() call
+         * at the end of this function never reads uninitialized stack. */
+        memset(&lockh, 0, sizeof(lockh));
+
+        /* "%.*s": len is the number of bytes of name to print, not a
+         * minimum field width. */
+        CDEBUG(D_DLMTRACE, "name: %.*s in %lu, intent: %s\n", len, name,
+               (unsigned long) pfid->id, ldlm_it2str(it->it_op));
+
+        if (cfid && (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)) {
+                /* We could just return 1 immediately, but since we should only
+                 * be called in revalidate_it if we already have a lock, let's
+                 * verify that. */
+                struct ldlm_res_id res_id ={.name = {cfid->id,
+                                                     cfid->generation}};
+                struct lustre_handle match_lockh;
+                int mode, match_flags = LDLM_FL_BLOCK_GRANTED;
+
+                /* Try a PR lock first, then fall back to PW. */
+                mode = LCK_PR;
+                rc = ldlm_lock_match(exp->exp_obd->obd_namespace, match_flags,
+                                     &res_id, LDLM_PLAIN, NULL, 0, LCK_PR,
+                                     &match_lockh);
+                if (!rc) {
+                        mode = LCK_PW;
+                        rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                             match_flags, &res_id, LDLM_PLAIN,
+                                             NULL, 0, LCK_PW, &match_lockh);
+                }
+                if (rc) {
+                        memcpy(&it->d.lustre.it_lock_handle, &match_lockh,
+                               sizeof(match_lockh));
+                        it->d.lustre.it_lock_mode = mode;
+                }
+                RETURN(rc);
+        }
+
+        /* This function may be called twice, we only once want to
+           execute the request associated with the intent. If it was
+           done already, we skip past this and use the results. */
+        if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+                struct mdc_op_data op_data;
+                mdc_fid2mdc_op_data(&op_data, uctxt, pfid, cfid, name, len, 0);
+
+                rc = mdc_enqueue(exp, LDLM_PLAIN, it, it_to_lock_mode(it),
+                                 &op_data, &lockh, NULL, 0, ldlm_completion_ast,
+                                 cb_blocking, NULL);
+                if (rc < 0)
+                        RETURN(rc);
+                memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+        }
+        request = *reqp = it->d.lustre.it_data;
+        LASSERT(request != NULL);
+
+        if (!it_disposition(it, DISP_IT_EXECD)) {
+                /* The server failed before it even started executing the
+                 * intent, i.e. because it couldn't unpack the request. */
+                LASSERT(it->d.lustre.it_status != 0);
+                RETURN(it->d.lustre.it_status);
+        }
+        rc = it_open_error(DISP_IT_EXECD, it);
+        if (rc)
+                RETURN(rc);
+
+        mds_body = lustre_msg_buf(request->rq_repmsg, 1, sizeof(*mds_body));
+        LASSERT(mds_body != NULL);           /* mdc_enqueue checked */
+        LASSERT_REPSWABBED(request, 1); /* mdc_enqueue swabbed */
+
+        /* If we were revalidating a fid/name pair, mark the intent in
+         * case we fail and get called again from lookup */
+        if (cfid != NULL) {
+                it_set_disposition(it, DISP_ENQ_COMPLETE);
+                /* Also: did we find the same inode? */
+                if (memcmp(cfid, &mds_body->fid1, sizeof(*cfid))) {
+                        ptlrpc_request_addref(request);
+                        RETURN(-ESTALE);
+                }
+        }
+
+        /* If we're doing an IT_OPEN which did not result in an actual
+         * successful open, then we need to remove the bit which saves
+         * this request for unconditional replay. */
+        if (it->it_op & IT_OPEN) {
+                if (!it_disposition(it, DISP_OPEN_OPEN) ||
+                    it->d.lustre.it_status != 0) {
+                        unsigned long irqflags;
+
+                        spin_lock_irqsave(&request->rq_lock, irqflags);
+                        request->rq_replay = 0;
+                        spin_unlock_irqrestore(&request->rq_lock, irqflags);
+                }
+        }
+
+        rc = it_open_error(DISP_LOOKUP_EXECD, it);
+        if (rc)
+                RETURN(rc);
+
+        /* keep requests around for the multiple phases of the call
+         * this shows the DISP_XX must guarantee we make it into the call
+         */
+        if (it_disposition(it, DISP_OPEN_CREATE) &&
+            !it_open_error(DISP_OPEN_CREATE, it))
+                ptlrpc_request_addref(request);
+        if (it_disposition(it, DISP_OPEN_OPEN) &&
+            !it_open_error(DISP_OPEN_OPEN, it))
+                ptlrpc_request_addref(request);
+
+        if (it->it_op & IT_CREAT) {
+                /* XXX this belongs in ll_create_iit */
+        } else if (it->it_op == IT_OPEN) {
+                LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+        } else {
+                LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP));
+        }
+
+        /* If we already have a matching lock, then cancel the new
+         * one.  We have to set the data here instead of in
+         * mdc_enqueue, because we need to use the child's inode as
+         * the l_data to match, and that's not available until
+         * intent_finish has performed the iget().) */
+        lock = ldlm_handle2lock(&lockh);
+        if (lock) {
+                LDLM_DEBUG(lock, "matching against this");
+                LDLM_LOCK_PUT(lock);
+                memcpy(&old_lock, &lockh, sizeof(lockh));
+                if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+                                    LDLM_PLAIN, NULL, 0, LCK_NL, &old_lock)) {
+                        ldlm_lock_decref_and_cancel(&lockh,
+                                                    it->d.lustre.it_lock_mode);
+                        memcpy(&lockh, &old_lock, sizeof(old_lock));
+                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                               sizeof(lockh));
+                }
+        }
+        CDEBUG(D_DENTRY, "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+               len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status,
+               it->d.lustre.it_disposition, rc);
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(mdc_intent_lock);
diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c
new file mode 100644 (file)
index 0000000..a109ef6
--- /dev/null
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  linux/fs/obdfilter/filter_io.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *   Author: Peter Braam <braam@clusterfs.com>
+ *   Author: Andreas Dilger <adilger@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/pagemap.h> // XXX kill me soon
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/iobuf.h>
+
+#include <linux/obd_class.h>
+#include <linux/lustre_fsfilt.h>
+#include "filter_internal.h"
+
+
+/* Stamp the inode with the current time and mark it dirty.
+ *
+ * The mtime is always brought up to date; the ctime only when
+ * @ctime_too is non-zero (a pure data change should touch the mtime
+ * but not the ctime, unlike update_inode_times() in
+ * generic_file_write()).  When every timestamp we were asked to update
+ * already equals the current time, the inode is left untouched and is
+ * not marked dirty. */
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+        time_t stamp = CURRENT_TIME;
+        int mtime_stale = (inode->i_mtime != stamp);
+        int ctime_stale = (ctime_too && inode->i_ctime != stamp);
+
+        if (!mtime_stale && !ctime_stale)
+                return;
+
+        inode->i_mtime = stamp;
+        if (ctime_too)
+                inode->i_ctime = stamp;
+        mark_inode_dirty_sync(inode);
+}
+
+int ext3_map_inode_page(struct inode *inode, struct page *page,
+                        unsigned long *blocks, int *created, int create);
+
+/* Map every page of @iobuf to disk blocks of @inode and write the pages
+ * out with brw_kiovec().
+ *
+ * @rw only decides whether ext3_map_inode_page() is asked to create
+ * missing blocks (rw == OBD_BRW_WRITE); the transfer itself is always
+ * issued as a WRITE.
+ *
+ * The block mapping is done under inode->i_sem with the kiobuf pages
+ * locked; the semaphore is dropped before the I/O is submitted.
+ *
+ * Returns 0 when the full iobuf was written, -EINVAL if the request
+ * covers more than KIO_MAX_SECTORS blocks, -ENOMEM, a negative error
+ * from lock_kiovec()/ext3_map_inode_page()/brw_kiovec(), or -EIO when
+ * brw_kiovec() transferred fewer bytes than requested. */
+int filter_direct_io(int rw, struct inode *inode, struct kiobuf *iobuf)
+{
+        struct page *page;
+        unsigned long *b;
+        int rc, i, create = (rw == OBD_BRW_WRITE), blocks_per_page, *created;
+        int *cr, cleanup_phase, expected;
+        ENTRY;
+
+        blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
+                RETURN(-EINVAL);
+
+        OBD_ALLOC(created, sizeof(*created) * iobuf->nr_pages*blocks_per_page);
+        if (created == NULL)
+                RETURN(-ENOMEM);
+        cleanup_phase = 1;
+
+        rc = lock_kiovec(1, &iobuf, 1);
+        if (rc < 0)
+                GOTO(cleanup, rc);
+        cleanup_phase = 2;
+
+        down(&inode->i_sem);
+        cleanup_phase = 3;
+        /* b and cr walk iobuf->blocks and created[] one page's worth of
+         * blocks at a time. */
+        for (i = 0, cr = created, b = iobuf->blocks; i < iobuf->nr_pages; i++){
+                page = iobuf->maplist[i];
+
+                rc = ext3_map_inode_page(inode, page, b, cr, create);
+                if (rc)
+                        GOTO(cleanup, rc);
+
+                b += blocks_per_page;
+                cr += blocks_per_page;
+        }
+        up(&inode->i_sem);
+        cleanup_phase = 2;
+
+        rc = brw_kiovec(WRITE, 1, &iobuf, inode->i_dev, iobuf->blocks,
+                        1 << inode->i_blkbits);
+        CDEBUG(D_INFO, "tried to write %d pages, rc = %d\n",
+               iobuf->nr_pages, rc);
+        expected = (1 << inode->i_blkbits) * iobuf->nr_pages * blocks_per_page;
+        if (rc != expected) {
+                CERROR("short write?  expected %d, wrote %d\n",
+                       expected, rc);
+                /* A partial transfer must not be reported as success:
+                 * the caller would commit the transaction and update
+                 * i_size for data that never reached the disk. */
+                if (rc >= 0)
+                        rc = -EIO;
+        } else {
+                rc = 0;
+        }
+
+        EXIT;
+cleanup:
+        switch(cleanup_phase) {
+                case 3:
+                        up(&inode->i_sem);
+                        /* fallthrough */
+                case 2:
+                        unlock_kiovec(1, &iobuf);
+                        /* fallthrough */
+                case 1:
+                        OBD_FREE(created, sizeof(*created) *
+                                          iobuf->nr_pages*blocks_per_page);
+                        break;
+                default:
+                        CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
+                        LBUG();
+                        break;
+        }
+        return rc;
+}
+
+/* Commit a bulk write for one object (2.4 kernels, kiobuf based).
+ *
+ * The pages described by @res (obj->ioo_bufcnt of them) are collected
+ * into a kiobuf and written with filter_direct_io() inside a
+ * transaction started by fsfilt_brw_start().  On success the inode
+ * times are updated and, if the write reached past the current i_size,
+ * the size is extended with fsfilt_setattr().  The transaction is then
+ * finished and committed whatever the outcome of the I/O was.
+ *
+ * On every exit path the pages are released with __free_page() (after
+ * being flipped into the page cache when rc == 0) and the reference on
+ * res->dentry is dropped.
+ *
+ * Only a single object is supported (objcount == 1 is asserted) and
+ * @oti must not be NULL.
+ *
+ * Returns 0 on success or a negative error code. */
+int filter_commitrw_write(struct obd_export *exp, int objcount,
+                                 struct obd_ioobj *obj, int niocount,
+                                 struct niobuf_local *res,
+                                 struct obd_trans_info *oti)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_run_ctxt saved;
+        struct niobuf_local *lnb;
+        struct fsfilt_objinfo fso;
+        struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, };
+        struct kiobuf *iobuf;
+        struct inode *inode = NULL;
+        int rc = 0, i, cleanup_phase = 0, err;
+        unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
+        ENTRY;
+        LASSERT(oti != NULL);
+        LASSERT(objcount == 1);
+        LASSERT(current->journal_info == NULL);
+
+        rc = alloc_kiovec(1, &iobuf);
+        if (rc)
+                GOTO(cleanup, rc);
+        cleanup_phase = 1;
+
+#if (LINUX_VERSION_CODE == KERNEL_VERSION(2,4,18))
+        iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
+#endif
+        rc = expand_kiobuf(iobuf, obj->ioo_bufcnt);
+        if (rc)
+                GOTO(cleanup, rc);
+
+        iobuf->offset = 0;
+        iobuf->length = PAGE_SIZE * obj->ioo_bufcnt;
+        iobuf->nr_pages = obj->ioo_bufcnt;
+
+        fso.fso_dentry = res->dentry;
+        fso.fso_bufcnt = obj->ioo_bufcnt;
+        inode = res->dentry->d_inode;
+
+        /* Hand the pages to the kiobuf and track the highest byte
+         * written, which becomes the candidate new i_size. */
+        for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                loff_t this_size;
+                iobuf->maplist[i] = lnb->page;
+                /* We expect these pages to be in offset order, but we'll
+                 * be forgiving */
+                this_size = lnb->offset + lnb->len;
+                if (this_size > iattr.ia_size)
+                        iattr.ia_size = this_size;
+        }
+
+        push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+        cleanup_phase = 2;
+
+        oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, oti);
+        if (IS_ERR(oti->oti_handle)) {
+                rc = PTR_ERR(oti->oti_handle);
+                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                       "error starting transaction: rc = %d\n", rc);
+                oti->oti_handle = NULL;
+                GOTO(cleanup, rc);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
+
+        rc = filter_direct_io(OBD_BRW_WRITE, inode, iobuf);
+        if (rc == 0) {
+                down(&inode->i_sem);
+                inode_update_time(inode, 1);
+                if (iattr.ia_size > inode->i_size) {
+                        CDEBUG(D_INFO, "setting i_size to "LPU64"\n",
+                               iattr.ia_size);
+                        /* Do not ignore a failed size update: the data
+                         * past the old i_size would be unreachable while
+                         * the write is reported as successful. */
+                        rc = fsfilt_setattr(obd, res->dentry, oti->oti_handle,
+                                            &iattr, 0);
+                }
+                up(&inode->i_sem);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
+
+        /* The transaction is finished and committed even when the I/O
+         * failed; a commit error overrides the earlier rc. */
+        rc = filter_finish_transno(exp, oti, rc);
+        err = fsfilt_commit(obd, inode, oti->oti_handle, obd_sync_filter);
+        if (err)
+                rc = err;
+        if (obd_sync_filter)
+                LASSERT(oti->oti_transno <= obd->obd_last_committed);
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
+
+cleanup:
+        switch (cleanup_phase) {
+        case 2:
+                pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+                LASSERT(current->journal_info == NULL);
+                /* fallthrough */
+        case 1:
+                free_kiovec(1, &iobuf);
+                /* fallthrough */
+        case 0:
+                for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                        /* flip_.. gets a ref, while free_page only frees
+                         * when it decrefs to 0 */
+                        if (rc == 0)
+                                flip_into_page_cache(inode, lnb->page);
+                        __free_page(lnb->page);
+                }
+                f_dput(res->dentry);
+        }
+
+        RETURN(rc);
+}
+
+#endif
+
diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c
new file mode 100644 (file)
index 0000000..ec9957a
--- /dev/null
@@ -0,0 +1,228 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  linux/fs/obdfilter/filter_io.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *   Author: Peter Braam <braam@clusterfs.com>
+ *   Author: Andreas Dilger <adilger@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/pagemap.h> // XXX kill me soon
+#include <linux/version.h>
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/obd_class.h>
+#include <linux/lustre_fsfilt.h>
+#include "filter_internal.h"
+
+int ext3_map_inode_page(struct inode *inode, struct page *page,
+                        unsigned long *blocks, int *created, int create);
+
+/* Upper bound on blocks per page: the smallest block size is 512 bytes */
+#define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512)
+
+/* Bookkeeping shared between the submitter of a set of write bios and
+ * dio_complete_routine(). */
+struct dio_request {
+        atomic_t numreqs;       /* bios submitted and not yet completed */
+        struct bio *bio_list;   /* completed bios, chained via bi_private */
+        wait_queue_head_t wait; /* submitter sleeps here until numreqs == 0 */
+        int created[MAX_BLOCKS_PER_PAGE];          /* per-page scratch for
+                                                    * ext3_map_inode_page() */
+        unsigned long blocks[MAX_BLOCKS_PER_PAGE]; /* block numbers of the
+                                                    * page being mapped */
+        spinlock_t lock;        /* protects bio_list */
+};
+
+/* bi_end_io callback for the bios submitted by filter_commitrw_write().
+ *
+ * Once the bio has completed in full it is pushed onto
+ * dreq->bio_list (reusing bi_private as the link, under dreq->lock)
+ * and the count of outstanding requests is dropped; whoever completes
+ * the last bio wakes up the submitter sleeping on dreq->wait.
+ *
+ * Returns 1 while the bio still has bytes outstanding, 0 once it has
+ * been queued.
+ *
+ * NOTE(review): @error is not recorded anywhere, so an I/O error is
+ * not visible to the submitter. */
+static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
+{
+        struct dio_request *dreq = bio->bi_private;
+        unsigned long flags;
+
+        /* The block layer may call us for a partial completion; only
+         * act on the final call, otherwise the bio would be queued and
+         * numreqs decremented more than once. */
+        if (bio->bi_size)
+                return 1;
+
+        spin_lock_irqsave(&dreq->lock, flags);
+        bio->bi_private = dreq->bio_list;
+        dreq->bio_list = bio;
+        spin_unlock_irqrestore(&dreq->lock, flags);
+        if (atomic_dec_and_test(&dreq->numreqs))
+                wake_up(&dreq->wait);
+
+        return 0;
+}
+
+/* Tell whether @sector immediately follows the last sector already
+ * covered by @bio, i.e. whether more data can be appended to it.
+ * A NULL @bio can never be extended.  Returns 1 or 0. */
+static int can_be_merged(struct bio *bio, sector_t sector)
+{
+        int nr_sectors;
+
+        if (bio == NULL)
+                return 0;
+
+        /* bi_size is in bytes; shift by 9 to get 512-byte sectors */
+        nr_sectors = bio->bi_size >> 9;
+        if (bio->bi_sector + nr_sectors == sector)
+                return 1;
+        return 0;
+}
+
+/* Commit a bulk write for one object (2.6 kernels, bio based).
+ *
+ * Inside a transaction started by fsfilt_brw_start(), every page in
+ * @res (obj->ioo_bufcnt of them) is mapped to disk blocks with
+ * ext3_map_inode_page() and written with bios; blocks that follow the
+ * end of the current bio are appended to it, otherwise the current bio
+ * is submitted and a new one is started.  The function then sleeps
+ * until dio_complete_routine() has seen every submitted bio, releases
+ * the bios, updates the inode times and (if the write reached past the
+ * current i_size) the size, and finishes and commits the transaction.
+ *
+ * If mapping a page or allocating a bio fails, no further I/O is
+ * submitted, but the bios already in flight are still waited for and
+ * the transaction is still finished and committed with the error.
+ *
+ * On every exit path the pages are released with __free_page() (after
+ * being flipped into the page cache when rc == 0) and the reference on
+ * res->dentry is dropped.
+ *
+ * Only a single object is supported (objcount == 1 is asserted) and
+ * @oti must not be NULL.
+ *
+ * Returns 0 on success or a negative error code. */
+int filter_commitrw_write(struct obd_export *exp, int objcount,
+                                 struct obd_ioobj *obj, int niocount,
+                                 struct niobuf_local *res,
+                                 struct obd_trans_info *oti)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_run_ctxt saved;
+        struct niobuf_local *lnb;
+        struct fsfilt_objinfo fso;
+        struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, };
+        struct inode *inode = NULL;
+        int rc = 0, i, k, cleanup_phase = 0, err;
+        unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
+        int blocks_per_page;
+        struct dio_request *dreq;
+        struct bio *bio = NULL;
+        ENTRY;
+        LASSERT(oti != NULL);
+        LASSERT(objcount == 1);
+        LASSERT(current->journal_info == NULL);
+
+        /* The inode must be looked up before its block size is read. */
+        inode = res->dentry->d_inode;
+        blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        LASSERT(blocks_per_page <= MAX_BLOCKS_PER_PAGE);
+
+        OBD_ALLOC(dreq, sizeof(*dreq));
+        if (dreq == NULL)
+                /* go through cleanup so the pages and the dentry are
+                 * released, as on every other failure */
+                GOTO(cleanup, rc = -ENOMEM);
+        dreq->bio_list = NULL;
+        init_waitqueue_head(&dreq->wait);
+        atomic_set(&dreq->numreqs, 0);
+        spin_lock_init(&dreq->lock);
+
+        cleanup_phase = 1;
+        fso.fso_dentry = res->dentry;
+        fso.fso_bufcnt = obj->ioo_bufcnt;
+
+        push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+        cleanup_phase = 2;
+
+        oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, oti);
+        if (IS_ERR(oti->oti_handle)) {
+                rc = PTR_ERR(oti->oti_handle);
+                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                       "error starting transaction: rc = %d\n", rc);
+                oti->oti_handle = NULL;
+                GOTO(cleanup, rc);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
+
+        for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                loff_t this_size;
+                sector_t sector;
+                int offs;
+
+                /* get block number for next page */
+                rc = ext3_map_inode_page(inode, lnb->page, dreq->blocks,
+                                         dreq->created, 1);
+                if (rc)
+                        break;
+
+                /* NOTE(review): a freshly allocated bio always adds the
+                 * page at offset 0 with the full lnb->len, whatever k
+                 * is; this looks correct only for blocks_per_page == 1
+                 * -- confirm before using smaller block sizes. */
+                for (k = 0; k < blocks_per_page; k++) {
+                        sector = dreq->blocks[k] * (inode->i_sb->s_blocksize >> 9);
+                        offs = k * inode->i_sb->s_blocksize;
+
+                        if (!bio || !can_be_merged(bio, sector) ||
+                            !bio_add_page(bio, lnb->page, lnb->len, offs)) {
+                                if (bio) {
+                                        atomic_inc(&dreq->numreqs);
+                                        submit_bio(WRITE, bio);
+                                        bio = NULL;
+                                }
+                                /* allocate new bio */
+                                bio = bio_alloc(GFP_NOIO, obj->ioo_bufcnt);
+                                if (bio == NULL) {
+                                        rc = -ENOMEM;
+                                        break;
+                                }
+                                bio->bi_bdev = inode->i_sb->s_bdev;
+                                bio->bi_sector = sector;
+                                bio->bi_end_io = dio_complete_routine;
+                                bio->bi_private = dreq;
+
+                                if (!bio_add_page(bio, lnb->page, lnb->len, 0))
+                                        LBUG();
+                        }
+                }
+                if (rc)
+                        break;
+
+                /* We expect these pages to be in offset order, but we'll
+                 * be forgiving */
+                this_size = lnb->offset + lnb->len;
+                if (this_size > iattr.ia_size)
+                        iattr.ia_size = this_size;
+        }
+        if (bio) {
+                if (rc == 0) {
+                        atomic_inc(&dreq->numreqs);
+                        submit_bio(WRITE, bio);
+                } else {
+                        /* never submitted, so the completion routine
+                         * will not queue it for us: drop it here */
+                        bio_put(bio);
+                }
+                bio = NULL;
+        }
+
+        /* time to wait for I/O completion; this must happen on the
+         * error path too, because in-flight bios still point at dreq */
+        wait_event(dreq->wait, atomic_read(&dreq->numreqs) == 0);
+
+        /* free all bios */
+        while (dreq->bio_list) {
+                bio = dreq->bio_list;
+                dreq->bio_list = bio->bi_private;
+                bio_put(bio);
+        }
+
+        if (rc == 0) {
+                down(&inode->i_sem);
+                inode_update_time(inode, 1);
+                if (iattr.ia_size > inode->i_size) {
+                        CDEBUG(D_INFO, "setting i_size to "LPU64"\n",
+                               iattr.ia_size);
+                        /* Do not ignore a failed size update: the data
+                         * past the old i_size would be unreachable while
+                         * the write is reported as successful. */
+                        rc = fsfilt_setattr(obd, res->dentry, oti->oti_handle,
+                                            &iattr, 0);
+                }
+                up(&inode->i_sem);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
+
+        /* The transaction is finished and committed even when the I/O
+         * failed; a commit error overrides the earlier rc. */
+        rc = filter_finish_transno(exp, oti, rc);
+        err = fsfilt_commit(obd, inode, oti->oti_handle, obd_sync_filter);
+        if (err)
+                rc = err;
+        if (obd_sync_filter)
+                LASSERT(oti->oti_transno <= obd->obd_last_committed);
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
+
+cleanup:
+        switch (cleanup_phase) {
+        case 2:
+                pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+                LASSERT(current->journal_info == NULL);
+                /* fallthrough */
+        case 1:
+                OBD_FREE(dreq, sizeof(*dreq));
+                /* fallthrough */
+        case 0:
+                for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                        /* flip_.. gets a ref, while free_page only frees
+                         * when it decrefs to 0 */
+                        if (rc == 0)
+                                flip_into_page_cache(inode, lnb->page);
+                        __free_page(lnb->page);
+                }
+                f_dput(res->dentry);
+        }
+
+        RETURN(rc);
+}
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c
new file mode 100644 (file)
index 0000000..3fb9d08
--- /dev/null
@@ -0,0 +1,343 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ *   Author Peter Braam <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  For testing and management it is treated as an obd_device,
+ *  although it does not export a full OBD method table (the
+ *  requests are coming in over the wire, so object target modules
+ *  do not have a full method table).
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_OSC
+
+#ifdef __KERNEL__
+# include <linux/version.h>
+# include <linux/module.h>
+# include <linux/mm.h>
+# include <linux/highmem.h>
+# include <linux/lustre_dlm.h>
+# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#  include <linux/workqueue.h>
+#  include <linux/smp_lock.h>
+# else
+#  include <linux/locks.h>
+# endif
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <linux/kp30.h>
+#include <linux/lustre_mds.h> /* for mds_objid */
+#include <linux/obd_ost.h>
+#include <linux/lustre_commit_confd.h>
+#include <linux/obd_lov.h>
+
+#ifndef  __CYGWIN__
+# include <linux/ctype.h>
+# include <linux/init.h>
+#else
+# include <ctype.h>
+#endif
+
+#include <linux/lustre_ha.h>
+#include <linux/obd_support.h> /* for OBD_FAIL_CHECK */
+#include <linux/lustre_lite.h> /* for ll_i2info */
+#include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */
+#include <linux/lprocfs_status.h>
+#include "osc_internal.h"
+
+/*
+ * State of the object pre-creation daemon.  This file defines a single
+ * static instance (osc_created) that every osc_creator set up by
+ * oscc_init() points at.
+ */
+struct osc_created {
+        wait_queue_head_t osccd_waitq;       /* the daemon sleeps on this */
+        wait_queue_head_t osccd_ctl_waitq;   /* insmod rmmod sleep on this */
+        /* held while creators are queued/unqueued and while the KICKED and
+         * STOPPING flags are updated */
+        spinlock_t osccd_lock;
+        /* OSCCD_* bits */
+        int osccd_flags;
+        /* only ever assigned NULL (on daemon exit) in this file */
+        struct task_struct *osccd_thread;
+        /* creators (linked through oscc_list) waiting for the daemon */
+        struct list_head osccd_queue_list_head;
+        /* creators the daemon is currently issuing a create for */
+        struct list_head osccd_work_list_head;
+};
+
+
+/*
+ * Bit values for osc_created.osccd_flags:
+ *   STOPPING   - set by osccd_cleanup() to ask the daemon to exit
+ *   STOPPED    - set by the daemon just before it returns
+ *   RUNNING    - set by the daemon once it has started
+ *   KICKED     - set by oscc_precreate() when a creator has been queued
+ *   PRECREATED - not referenced anywhere in this file
+ */
+#define OSCCD_STOPPING          0x1
+#define OSCCD_STOPPED           0x2
+#define OSCCD_RUNNING           0x4
+#define OSCCD_KICKED            0x8
+#define OSCCD_PRECREATED         0x10
+
+
+/* The daemon state; initialised by osccd_setup(), referenced by oscc_init() */
+static struct osc_created osc_created;
+
+/*
+ * Report whether the creator still has enough precreated ids in hand.
+ *
+ * Returns 1 when (oscc_last_id - oscc_next_id), taken as a signed 64-bit
+ * value, is at least @count, and 0 otherwise.  The two ids are sampled
+ * under oscc_lock.
+ */
+static int oscc_has_objects(struct osc_creator *oscc, int count)
+{
+        __s64 spare;
+
+        spin_lock(&oscc->oscc_lock);
+        spare = (__s64)(oscc->oscc_last_id - oscc->oscc_next_id);
+        spin_unlock(&oscc->oscc_lock);
+
+        return spare >= count;
+}
+
+/*
+ * Queue @oscc on the pre-creation daemon when it is running low on ids,
+ * and optionally wait for ids to become available.
+ *
+ * Nothing is done (and 0 is returned) while oscc_has_objects() reports at
+ * least oscc_kick_barrier ids.  Otherwise the creator is added to the
+ * daemon's queue, unless it is already on a list, and the daemon is woken.
+ *
+ * With @wait == 0 the function then returns 0 immediately.  With @wait
+ * set it sleeps until oscc_has_objects(oscc, 1) holds and returns either
+ * the non-zero l_wait_event() result or the creator's oscc_status.
+ */
+static int oscc_precreate(struct osc_creator *oscc, struct osc_created *osccd,
+                          int wait)
+{
+        struct l_wait_info lwi = { 0 };
+        int rc;
+        ENTRY;
+
+        if (oscc_has_objects(oscc, oscc->oscc_kick_barrier))
+                RETURN(0);
+
+        /* daemon lock first, then the creator lock */
+        spin_lock(&osccd->osccd_lock);
+        spin_lock(&oscc->oscc_lock);
+        if (list_empty(&oscc->oscc_list)) {
+                /* not on any list yet: hand it over and kick the daemon */
+                list_add(&oscc->oscc_list, &osccd->osccd_queue_list_head);
+                osccd->osccd_flags |= OSCCD_KICKED;
+                wake_up(&osccd->osccd_waitq);
+        }
+        spin_unlock(&oscc->oscc_lock);
+        spin_unlock(&osccd->osccd_lock);
+
+        if (!wait)
+                RETURN(0);
+
+        /* an MDS using this call may time out on this. This is a
+         *  recovery style wait.
+         */
+        rc = l_wait_event(oscc->oscc_waitq, oscc_has_objects(oscc, 1), &lwi);
+        if (rc)
+                RETURN(rc);
+
+        spin_lock(&oscc->oscc_lock);
+        rc = oscc->oscc_status;
+        spin_unlock(&oscc->oscc_lock);
+        RETURN(rc);
+}
+
+/*
+ * Hand out the next precreated object id of this export's creator, or, in
+ * the special orphan-deletion case, forward the request to the OST.
+ *
+ * @exp: export whose osc_creator is used
+ * @oa:  in the normal case overwritten with a copy of the creator's
+ *       oscc_oa and the allocated id; in the orphan case sent as the
+ *       request
+ * @ea:  in/out stripe md; when *ea is NULL a new one is allocated and
+ *       returned through it once an id has been assigned
+ * @oti: not used by this function
+ *
+ * Orphan deletion (o_valid == OBD_MD_FLFLAGS, o_flags == OBD_FL_DELORPHAN)
+ * asks osc_real_create() to delete from oscc_next_id upwards and resets
+ * oscc_last_id to oscc_next_id - 1.
+ *
+ * Returns 0 on success, otherwise the error from obd_alloc_memmd(),
+ * osc_real_create() or oscc_precreate().
+ */
+int osc_create(struct obd_export *exp, struct obdo *oa,
+               struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+        struct lov_stripe_md *lsm;
+        struct osc_creator *oscc = &exp->u.eu_osc_data.oed_oscc;
+        struct osc_created *osccd = oscc->oscc_osccd;
+        int try_again = 1, rc = 0;
+        ENTRY;
+
+        LASSERT(oa);
+        LASSERT(ea);
+
+        /* this is the special case where create removes orphans.  It is
+         * handled before any lsm is allocated: this path never uses the
+         * local lsm, so allocating it first leaked it on both returns. */
+        if (oa->o_valid == OBD_MD_FLFLAGS &&
+            oa->o_flags == OBD_FL_DELORPHAN) {
+                /* delete from next_id on up */
+                oa->o_valid |= OBD_MD_FLID;
+                oa->o_id = oscc->oscc_next_id;
+                if (oa->o_id == 0)
+                        RETURN(0);
+                rc = osc_real_create(oscc->oscc_exp, oa, ea, NULL);
+
+                spin_lock(&osccd->osccd_lock);
+                spin_lock(&oscc->oscc_lock);
+                oscc->oscc_status = rc;
+                oscc->oscc_last_id = oscc->oscc_next_id - 1;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                RETURN(rc);
+        }
+
+        lsm = *ea;
+        if (lsm == NULL) {
+                rc = obd_alloc_memmd(exp, &lsm);
+                if (rc < 0)
+                        RETURN(rc);
+        }
+
+        while (try_again) {
+                spin_lock(&oscc->oscc_lock);
+                if (oscc->oscc_last_id >= oscc->oscc_next_id) {
+                        /* an id is available: take it */
+                        memcpy(oa, &oscc->oscc_oa, sizeof(*oa));
+                        oa->o_id = oscc->oscc_next_id;
+                        lsm->lsm_object_id = oscc->oscc_next_id;
+                        *ea = lsm;
+                        oscc->oscc_next_id++;
+                        try_again = 0;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                /* waits only when no id could be taken above; otherwise
+                 * it just tops up the pool if it is running low */
+                rc = oscc_precreate(oscc, osccd, try_again);
+        }
+
+        if (rc == 0)
+                CDEBUG(D_INFO, "returning objid "LPU64"\n", lsm->lsm_object_id);
+        else if (*ea == NULL)
+                obd_free_memmd(exp, &lsm);
+        RETURN(rc);
+}
+
+/*
+ * Drain the daemon's queue list: for each queued creator, move it to the
+ * work list, issue a create for ids up to oscc_last_id + oscc_grow_count,
+ * record the result in the creator and wake anything sleeping on its
+ * oscc_waitq.
+ *
+ * Both spinlocks are dropped around osc_real_create().
+ */
+void osccd_do_create(struct osc_created *osccd)
+{
+        struct list_head *tmp;
+
+ next:
+        spin_lock(&osccd->osccd_lock);
+        /* The body never iterates twice: after handling the first entry the
+         * lock has been dropped, so the scan restarts from "next". */
+        list_for_each (tmp, &osccd->osccd_queue_list_head) {
+                int rc;
+                struct osc_creator *oscc = list_entry(tmp, struct osc_creator,
+                                                      oscc_list);
+                list_del_init(&oscc->oscc_list);
+                list_add(&oscc->oscc_list, &osccd->osccd_work_list_head);
+                spin_lock(&oscc->oscc_lock);
+               oscc->oscc_oa.o_id = oscc->oscc_last_id + oscc->oscc_grow_count;
+               oscc->oscc_oa.o_valid |= OBD_MD_FLID;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                rc = osc_real_create(oscc->oscc_exp, &oscc->oscc_oa,
+                                     &oscc->oscc_ea, NULL);
+
+                /* This is not used and leaked, so might as well free
+                 * it now.*/
+                if (rc == 0 && oscc->oscc_ea != NULL) 
+                        obd_free_memmd(oscc->oscc_exp, &oscc->oscc_ea);
+
+                spin_lock(&osccd->osccd_lock);
+                spin_lock(&oscc->oscc_lock);
+                /* take it off the work list so it can be queued again */
+                list_del_init(&oscc->oscc_list);
+                oscc->oscc_status = rc;
+                /* NOTE(review): last_id is advanced even when rc != 0;
+                 * waiters in oscc_precreate() read oscc_status afterwards
+                 * -- confirm this is the intended failure handling */
+                oscc->oscc_last_id = oscc->oscc_oa.o_id;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                CDEBUG(D_INFO, "preallocated through id "LPU64" (last used "
+                       LPU64")\n", oscc->oscc_last_id, oscc->oscc_next_id);
+                wake_up(&oscc->oscc_waitq);
+                goto next;
+        }
+        spin_unlock(&osccd->osccd_lock);
+}
+
+/*
+ * Thread body of the object pre-creation daemon.
+ *
+ * @arg: the struct osc_created describing this daemon
+ *
+ * Daemonizes as "lustre_created" with all signals blocked, announces
+ * OSCCD_RUNNING on osccd_ctl_waitq, then sleeps on osccd_waitq until it is
+ * kicked (runs osccd_do_create()) or told to stop.  On stop it announces
+ * OSCCD_STOPPED on osccd_ctl_waitq and returns 0.
+ */
+static int osccd_main(void *arg)
+{
+        struct osc_created *osccd = (struct osc_created *)arg;
+        unsigned long flags;
+        ENTRY;
+
+        lock_kernel();
+        kportal_daemonize("lustre_created");
+
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+
+        unlock_kernel();
+
+        /* Record that the thread is running */
+        osccd->osccd_flags = OSCCD_RUNNING;
+        wake_up(&osccd->osccd_ctl_waitq);
+
+        /* And now, loop forever on requests */
+        while (1) {
+                struct l_wait_info lwi = { 0 };
+                l_wait_event(osccd->osccd_waitq,
+                             osccd->osccd_flags & (OSCCD_STOPPING|OSCCD_KICKED),
+                             &lwi);
+
+                spin_lock(&osccd->osccd_lock);
+                if (osccd->osccd_flags & OSCCD_STOPPING) {
+                        /* no EXIT here: RETURN() below already traces the
+                         * function exit */
+                        spin_unlock(&osccd->osccd_lock);
+                        break;
+                }
+                /* clear the kick before working so a kick that arrives
+                 * during osccd_do_create() is not lost */
+                osccd->osccd_flags &= ~OSCCD_KICKED;
+                spin_unlock(&osccd->osccd_lock);
+                osccd_do_create(osccd);
+        }
+
+        /* log before signalling OSCCD_STOPPED: osccd_cleanup() may proceed
+         * as soon as the flag is visible */
+        CDEBUG(D_NET, "object creation daemon exiting %d\n", current->pid);
+        osccd->osccd_thread = NULL;
+        osccd->osccd_flags = OSCCD_STOPPED;
+        wake_up(&osccd->osccd_ctl_waitq);
+        RETURN(0);
+}
+
+/*
+ * Reset and initialise the object creator embedded in the export that
+ * @exph refers to.  Does nothing when the handle does not resolve to an
+ * export.
+ *
+ * The creator starts with next_id 2 and last_id 1, i.e. with no ids
+ * available until the daemon has precreated some.
+ */
+void oscc_init(struct lustre_handle *exph)
+{
+        struct obd_export *exp = class_conn2export(exph);
+        struct osc_creator *oscc;
+
+        if (exp == NULL)
+                return;
+
+        memset(&exp->exp_osc_data, 0, sizeof(exp->exp_osc_data));
+        oscc = &exp->exp_osc_data.oed_oscc;
+
+        /* synchronisation and list linkage */
+        spin_lock_init(&oscc->oscc_lock);
+        init_waitqueue_head(&oscc->oscc_waitq);
+        INIT_LIST_HEAD(&oscc->oscc_list);
+
+        /* back pointers */
+        oscc->oscc_exp = exp;
+        oscc->oscc_osccd = &osc_created;
+
+        /* precreation tunables */
+        oscc->oscc_kick_barrier = 50;
+        oscc->oscc_grow_count = 100;
+        oscc->oscc_initial_create_count = 100;
+
+        oscc->oscc_next_id = 2;
+        oscc->oscc_last_id = 1;
+        /* XXX the export handle should give the oscc the last object */
+        /* oscc->oscc_last_id = exph->....; */
+}
+
+/*
+ * Initialise the daemon state and start the pre-creation thread.
+ *
+ * Blocks until the new thread has set OSCCD_RUNNING.  Returns 0 on
+ * success or the negative value returned by kernel_thread() when the
+ * thread could not be started.
+ */
+int osccd_setup(void)
+{
+        struct l_wait_info lwi = { 0 };
+        struct osc_created *daemon = &osc_created;
+        int pid;
+        ENTRY;
+
+        spin_lock_init(&daemon->osccd_lock);
+        init_waitqueue_head(&daemon->osccd_waitq);
+        init_waitqueue_head(&daemon->osccd_ctl_waitq);
+        INIT_LIST_HEAD(&daemon->osccd_work_list_head);
+        INIT_LIST_HEAD(&daemon->osccd_queue_list_head);
+
+        pid = kernel_thread(osccd_main, daemon,
+                            CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (pid < 0) {
+                CERROR("cannot start thread\n");
+                RETURN(pid);
+        }
+
+        /* the thread announces itself on the control waitq */
+        l_wait_event(daemon->osccd_ctl_waitq,
+                     daemon->osccd_flags & OSCCD_RUNNING, &lwi);
+        RETURN(0);
+}
+
+/*
+ * Ask the pre-creation daemon to stop and wait until it has set
+ * OSCCD_STOPPED.  Always returns 0.
+ */
+int osccd_cleanup(void)
+{
+        struct l_wait_info lwi = { 0 };
+        struct osc_created *daemon = &osc_created;
+        ENTRY;
+
+        /* replace the flags with the stop request, then wake the thread */
+        spin_lock(&daemon->osccd_lock);
+        daemon->osccd_flags = OSCCD_STOPPING;
+        spin_unlock(&daemon->osccd_lock);
+        wake_up(&daemon->osccd_waitq);
+
+        /* the thread acknowledges on the control waitq */
+        l_wait_event(daemon->osccd_ctl_waitq,
+                     daemon->osccd_flags & OSCCD_STOPPED, &lwi);
+        RETURN(0);
+}
index a84a29c..db70ea7 100644 (file)
@@ -96,7 +96,6 @@ enum {
 };
 
 int ptlrpc_expire_one_request(struct ptlrpc_request *req);
-int ptlrpc_check_set(struct ptlrpc_request_set *set);
 
 void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
 #endif /* PTLRPC_INTERNAL_H */
index d33670f..094de0b 100644 (file)
@@ -120,9 +120,7 @@ static void __exit ptlrpc_exit(void)
 {
         ptlrpc_exit_portals();
         ptlrpc_cleanup_connection();
-#ifdef ENABLE_ORPHANS
         llog_cleanup_commit_master(0);
-#endif
 }
 
 /* connection.c */
@@ -170,8 +168,13 @@ EXPORT_SYMBOL(ptlrpc_next_xid);
 
 EXPORT_SYMBOL(ptlrpc_prep_set);
 EXPORT_SYMBOL(ptlrpc_set_add_req);
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
 EXPORT_SYMBOL(ptlrpc_set_destroy);
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+EXPORT_SYMBOL(ptlrpc_check_set);
 EXPORT_SYMBOL(ptlrpc_set_wait);
+EXPORT_SYMBOL(ptlrpc_expired_set);
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
 
 /* service.c */
 EXPORT_SYMBOL(ptlrpc_init_svc);
@@ -192,6 +195,7 @@ EXPORT_SYMBOL(lustre_swab_obd_statfs);
 EXPORT_SYMBOL(lustre_swab_obd_ioobj);
 EXPORT_SYMBOL(lustre_swab_niobuf_remote);
 EXPORT_SYMBOL(lustre_swab_ost_body);
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
 EXPORT_SYMBOL(lustre_swab_ll_fid);
 EXPORT_SYMBOL(lustre_swab_mds_status_req);
 EXPORT_SYMBOL(lustre_swab_mds_fileh_body);
diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh
new file mode 100755 (executable)
index 0000000..eabee0a
--- /dev/null
@@ -0,0 +1,90 @@
+#!/bin/sh
+
+set -e
+
+# Skip these tests
+# 3 - bug 1852
+ALWAYS_EXCEPT="3"
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest}
+PATH=$LUSTRE/utils:$LUSTRE/tests:$PATH
+
+RLUSTRE=${RLUSTRE:-$LUSTRE}
+RPWD=${RPWD:-$PWD}
+
+XMLCONFIG="`basename $0 .sh`.xml"
+
+. $LUSTRE/tests/test-framework.sh
+
+CHECKSTAT="${CHECKSTAT:-checkstat} -v"
+
+# XXX I wish all this stuff was in some default-config.sh somewhere
+MOUNT=${MOUNT:-/mnt/lustre}
+DIR=${DIR:-$MOUNT}
+MDSDEV=${MDSDEV:-/tmp/mds-`hostname`}
+MDSSIZE=${MDSSIZE:-100000}
+OSTDEV=${OSTDEV:-/tmp/ost-`hostname`}
+OSTSIZE=${OSTSIZE:-100000}
+UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
+FSTYPE=${FSTYPE:-ext3}
+TIMEOUT=${TIMEOUT:-5}
+
+STRIPE_BYTES=65536
+STRIPES_PER_OBJ=1
+
+
+# Regenerate $XMLCONFIG from scratch with lmc: one mds, one failover ost
+# behind lov1, and one client mount point at $MOUNT.
+gen_config() {
+    rm -f $XMLCONFIG
+    add_facet mds
+    add_facet ost
+    add_facet client --lustre_upcall $UPCALL
+    do_lmc --add mds --node mds_facet --mds mds1 --dev $MDSDEV --size $MDSSIZE
+    do_lmc --add lov --mds mds1 --lov lov1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
+    do_lmc --add ost --lov lov1 --failover --node ost_facet --ost ost1 --dev $OSTDEV --size $OSTSIZE
+    do_lmc --add mtpt --node client_facet --path $MOUNT --mds mds1 --ost lov1
+}
+
+
+build_test_filter
+
+gen_config
+start mds --reformat $MDSLCONFARGS
+start ost --reformat $OSTLCONFARGS
+start client --gdb $CLIENTLCONFARGS
+
+mkdir -p $DIR
+
+# Fail over the ost right after a replay barrier, with no operations in
+# between.
+test_0() {
+    replay_barrier ost
+    fail ost
+}
+run_test 0 "empty replay"
+
+# Create one file after the barrier, fail over the ost, then check that the
+# file is still there.  $tfile is set by run_one.
+test_1() {
+    replay_barrier ost
+    touch $DIR/$tfile
+    fail ost
+    $CHECKSTAT -t file $DIR/$tfile || return 1
+}
+run_test 1 "touch"
+
+# Write ten tagged files after the barrier, fail over the ost, then verify
+# that each file still contains its tag.
+test_2() {
+    replay_barrier ost
+    for i in `seq 10`; do
+        echo "tag-$i" > $DIR/$tfile-$i
+    done 
+    fail ost
+    for i in `seq 10`; do
+      grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
+    done 
+}
+run_test 2 "|x| 10 open(O_CREAT)s"
+
+exit 0
+
+equals_msg test complete, cleaning up
+stop client ${FORCE:=--force} $CLIENTLCONFARGS
+stop ost ${FORCE}
+stop mds ${FORCE} $MDSLCONFARGS --dump cleanup.log
+
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
new file mode 100644 (file)
index 0000000..3ba7402
--- /dev/null
@@ -0,0 +1,126 @@
+#!/bin/sh
+
+set -e
+
+# Export the default test environment.  TESTSUITE and XMLCONFIG are derived
+# from the name of the running script; the rest default relative to
+# $LUSTRE, which must already be set by the caller.
+init_test_env() {
+    export TESTSUITE=`basename $0 .sh`
+    export XMLCONFIG="${TESTSUITE}.xml"
+    export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest}
+    export PATH=$LUSTRE/utils:$LUSTRE/tests:$PATH
+
+    export RLUSTRE=${RLUSTRE:-$LUSTRE}
+    export RPWD=${RPWD:-$PWD}
+    export CHECKSTAT="${CHECKSTAT:-checkstat} -v"
+}
+
+# start <facet> [lconf args...]
+# Run lconf for node <facet>_facet against $XMLCONFIG.  "facet" is a global
+# (not local) variable, and the extra arguments are passed unquoted, so
+# they are word-split again.
+start() {
+    facet=$1
+    shift
+    lconf --node ${facet}_facet $@ $XMLCONFIG
+}
+
+# stop <facet> [lconf args...]
+# Run "lconf --cleanup" for node <facet>_facet against $XMLCONFIG.  As in
+# start, "facet" is global and the extra arguments are word-split again.
+stop() {
+    facet=$1
+    shift
+    lconf --node ${facet}_facet $@ --cleanup $XMLCONFIG
+}
+
+# replay_barrier <dev>
+# Flush with sync and df, then issue "readonly" and "notransno" to lctl
+# device %<dev>1 and write a "REPLAY BARRIER" marker.
+# NOTE(review): presumably this keeps later operations from being committed
+# so they must be replayed after a failover -- confirm against lctl.
+replay_barrier() {
+    local dev=$1
+    sync
+    df $MOUNT
+    lctl --device %${dev}1 readonly
+    lctl --device %${dev}1 notransno
+    lctl mark "REPLAY BARRIER"
+}
+
+# fail <facet>
+# Force-stop the facet in failover mode, start it again (both with
+# --nomod), then run df on $MOUNT.
+fail() {
+    local facet=$1
+    stop $facet --force --failover --nomod
+    start $facet --nomod
+    df $MOUNT
+}
+
+# do_lmc [lmc args...]
+# Run lmc in merge mode (-m) against $XMLCONFIG.
+do_lmc() {
+    lmc -m ${XMLCONFIG} $@
+}
+
+# add_facet <facet> [node args...]
+# Add node <facet>_facet (with --timeout $TIMEOUT) and a tcp network entry
+# for it with nid localhost.
+add_facet() {
+    local facet=$1
+    shift
+    do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT
+    do_lmc --add net --node ${facet}_facet --nid localhost --nettype tcp
+}
+
+# error <message...>
+# Print a failure banner and exit the whole script with status 1.
+# TESTSUITE is only set by init_test_env; the prefix is empty otherwise.
+error() {
+    echo "${TESTSUITE}: **** FAIL:" $@
+    exit 1
+}
+
+# Define ONLY_<t>=true for each word in $ONLY and EXCEPT_<t>=true for each
+# word in $EXCEPT and $ALWAYS_EXCEPT; run_test reads these variables.
+build_test_filter() {
+        for O in $ONLY; do
+            eval ONLY_${O}=true
+        done
+        for E in $EXCEPT $ALWAYS_EXCEPT; do
+            eval EXCEPT_${E}=true
+        done
+}
+
+# Helper for basetest: echoes its arguments unquoted, so they are split on
+# whatever IFS the caller has set for this call.
+_basetest() {
+    echo $*
+}
+
+# basetest <testname>
+# Calls _basetest with IFS temporarily set to the lowercase letters.
+# NOTE(review): intended to reduce a name such as "12a" to its numeric
+# base "12"; relies on the shell applying the temporary IFS to the
+# function call -- confirm on the target shell.
+basetest() {
+    IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
+}
+
+# run_test <testnum> <description>
+# Run test_<testnum> through run_one unless it is filtered out.
+#  - When $ONLY is non-empty, only tests whose ONLY_<testnum> or
+#    ONLY_<base> variable is set are run; others print "." and are skipped.
+#  - Otherwise tests whose EXCEPT_<testnum> or EXCEPT_<base> variable is
+#    set are skipped.
+# Uses ${!name} indirect expansion, which is a bash feature.
+run_test() {
+        export base=`basetest $1`
+        if [ ! -z "$ONLY" ]; then
+                 testname=ONLY_$1
+                 if [ ${!testname}x != x ]; then
+                     run_one $1 "$2"
+                     return $?
+                 fi
+                 testname=ONLY_$base
+                 if [ ${!testname}x != x ]; then
+                     run_one $1 "$2"
+                     return $?
+                 fi
+                 echo -n "."
+                 return 0
+        fi
+        testname=EXCEPT_$1
+        if [ ${!testname}x != x ]; then
+                 echo "skipping excluded test $1"
+                 return 0
+        fi
+        testname=EXCEPT_$base
+        if [ ${!testname}x != x ]; then
+                 echo "skipping excluded test $1 (base $base)"
+                 return 0
+        fi
+        run_one $1 "$2"
+
+        return $?
+}
+
+EQUALS="======================================================================"
+equals_msg() {
+   msg="$@"
+
+   local suffixlen=$((65 - ${#msg}))
+   printf '===== %s %.*s\n' "$msg" $suffixlen $EQUALS
+}
+
+# run_one <testnum> <description>
+# Print a banner and call test_<testnum>; a failing test ends the script
+# through error.  Sets the globals tfile and tdir (derived from $base, as
+# set by run_test) for the test body to use.
+run_one() {
+    testnum=$1
+    message=$2
+    tfile=f$base
+    tdir=d$base
+
+    # Pretty tests run faster.
+    equals_msg $testnum: $message
+
+    test_${testnum} || error "test_$testnum failed with $?"
+}