From 147e8cdf3794e8b9c60bbf11b945575d6185290d Mon Sep 17 00:00:00 2001 From: phil Date: Tue, 9 Sep 2003 03:55:05 +0000 Subject: [PATCH] merge b_llpmd into b_devel. the major highlights: - new I/O backend - new client page cache and llite/lov/osc plumbing - pre-creation of OST objects - most of the OBD protocol now revolves around exports, not obd_devices --- lustre/include/linux/lustre_compat25.h | 35 +- .../kernel_configs/uml_2.6.0_test3.config | 325 + .../patches/dynamic-locks-2.4.18-chaos.patch | 212 + .../patches/dynamic-locks-2.4.20-rh.patch | 217 + .../kernel_patches/patches/ext-2.4-patch-5.patch | 15 + .../patches/ext3-2.4.18-ino_sb_macro-2.patch | 1461 ++++ .../patches/ext3-compat-2.4.18-chaos.patch | 19 + .../patches/ext3-delete_thread-2.4.18-2.patch | 478 ++ .../patches/ext3-extents-2.4.18-chaos.patch | 1831 ++++ .../patches/ext3-extents-oflag-2.4.18-chaos.patch | 291 + .../patches/ext3-map_inode_page-2.6.0.patch | 76 + .../patches/ext3-no-write-super-chaos.patch | 15 + .../patches/ext3-o_direct-1.2.4.20-rh.patch | 197 + .../patches/ext3-pdirops-2.4.18-chaos.patch | 1238 +++ lustre/kernel_patches/patches/iopen-2.4.18-2.patch | 422 + lustre/kernel_patches/patches/iopen-2.6.0.patch | 403 + .../patches/linux-2.4.18ea-0.8.26-2.patch | 1775 ++++ .../kernel_patches/patches/removepage-2.4.20.patch | 28 + .../kernel_patches/patches/removepage-2.6.0.patch | 28 + lustre/kernel_patches/patches/uml-2.6.0-fix.patch | 19 + .../patches/uml-patch-2.6.0-test3-1.patch | 8716 ++++++++++++++++++++ .../patches/vfs-pdirops-2.4.18-chaos.patch | 265 + .../patches/vfs-pdirops-2.4.20-rh.patch | 269 + .../pc/dynamic-locks-2.4.18-chaos.pc | 3 + lustre/kernel_patches/pc/ext-2.4-patch-5.pc | 1 + .../pc/ext3-2.4.18-ino_sb_macro-2.pc | 20 + .../kernel_patches/pc/ext3-compat-2.4.18-chaos.pc | 1 + .../pc/ext3-delete_thread-2.4.18-2.pc | 6 + .../kernel_patches/pc/ext3-extents-2.4.18-chaos.pc | 8 + .../pc/ext3-extents-oflag-2.4.18-chaos.pc | 19 + 
.../kernel_patches/pc/ext3-map_inode_page-2.6.0.pc | 2 + .../kernel_patches/pc/ext3-no-write-super-chaos.pc | 1 + .../kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc | 6 + lustre/kernel_patches/pc/iopen-2.4.18-2.pc | 8 + lustre/kernel_patches/pc/iopen-2.6.0.pc | 8 + .../kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc | 11 + lustre/kernel_patches/pc/removepage-2.4.20.pc | 2 + lustre/kernel_patches/pc/removepage-2.6.0.pc | 2 + lustre/kernel_patches/pc/uml-2.6.0-fix.pc | 1 + .../kernel_patches/pc/uml-patch-2.6.0-test3-1.pc | 113 + .../kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc | 3 + lustre/kernel_patches/series/chaos-2.4.18 | 6 +- lustre/kernel_patches/series/chaos-2.4.18-pdirops | 35 + lustre/kernel_patches/series/uml_2.6.0_test3 | 14 + lustre/liblustre/file.c | 4 +- lustre/liblustre/llite_lib.h | 2 +- lustre/liblustre/rw.c | 2 +- lustre/liblustre/super.c | 10 +- lustre/mdc/mdc_locks.c | 550 ++ lustre/obdfilter/filter_io_24.c | 237 + lustre/obdfilter/filter_io_26.c | 228 + lustre/osc/osc_create.c | 343 + lustre/ptlrpc/ptlrpc_internal.h | 1 - lustre/ptlrpc/ptlrpc_module.c | 8 +- lustre/tests/replay-ost-single.sh | 90 + lustre/tests/test-framework.sh | 126 + 56 files changed, 20171 insertions(+), 35 deletions(-) create mode 100644 lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config create mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-5.patch create mode 100644 lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch create mode 100644 lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch create mode 100644 
lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch create mode 100644 lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.18-2.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6.0.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch create mode 100644 lustre/kernel_patches/patches/removepage-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/removepage-2.6.0.patch create mode 100644 lustre/kernel_patches/patches/uml-2.6.0-fix.patch create mode 100644 lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch create mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-5.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc create mode 100644 lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc create mode 100644 lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc create mode 100644 lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/pc/iopen-2.4.18-2.pc create mode 100644 lustre/kernel_patches/pc/iopen-2.6.0.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc create mode 100644 lustre/kernel_patches/pc/removepage-2.4.20.pc create mode 100644 
lustre/kernel_patches/pc/removepage-2.6.0.pc create mode 100644 lustre/kernel_patches/pc/uml-2.6.0-fix.pc create mode 100644 lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc create mode 100644 lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc create mode 100644 lustre/kernel_patches/series/chaos-2.4.18-pdirops create mode 100644 lustre/kernel_patches/series/uml_2.6.0_test3 create mode 100644 lustre/mdc/mdc_locks.c create mode 100644 lustre/obdfilter/filter_io_24.c create mode 100644 lustre/obdfilter/filter_io_26.c create mode 100644 lustre/osc/osc_create.c create mode 100755 lustre/tests/replay-ost-single.sh create mode 100644 lustre/tests/test-framework.sh diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 96e52c4..1f26364 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -32,8 +32,11 @@ #include #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) -# define PGCACHE_WRLOCK(mapping) write_lock(&mapping->page_lock) -# define PGCACHE_WRUNLOCK(mapping) write_unlock(&mapping->page_lock) + +/* XXX our code should be using the 2.6 calls, not the other way around */ +#define TryLockPage(page) TestSetPageLocked(page) +#define filemap_fdatasync(mapping) filemap_fdatawrite(mapping) +#define Page_Uptodate(page) PageUptodate(page) #define KDEVT_INIT(val) { .value = val } @@ -46,12 +49,22 @@ #define ll_vfs_create(a,b,c,d) vfs_create(a,b,c,d) +#define ll_dev_t dev_t + +#include + #else /* 2.4.. */ #define ll_vfs_create(a,b,c,d) vfs_create(a,b,c) #define ll_permission(a,b,c) permission(a,b) -# define PGCACHE_WRLOCK(mapping) spin_lock(&pagecache_lock) -# define PGCACHE_WRUNLOCK(mapping) spin_unlock(&pagecache_lock) + +#define ll_dev_t int + +static inline void clear_page_dirty(struct page *page) +{ + if (PageDirty(page)) + ClearPageDirty(page); +} /* 2.5 uses hlists for some things, like the d_hash. we'll treat them * as 2.5 and let macros drop back.. 
*/ @@ -94,20 +107,6 @@ static inline void __d_drop(struct dentry *dentry) #endif /* end of 2.4 compat macros */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) -# define filemap_fdatasync(mapping) filemap_fdatawrite(mapping) -#endif - - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) -# define TryLockPage(page) TestSetPageLocked(page) -#endif - - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) -# define Page_Uptodate(page) PageUptodate(page) -#endif - #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) #define rb_node_s rb_node #define rb_root_s rb_root diff --git a/lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config b/lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config new file mode 100644 index 0000000..f933188 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config @@ -0,0 +1,325 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_USERMODE=y +CONFIG_MMU=y +CONFIG_UID16=y +CONFIG_RWSEM_GENERIC_SPINLOCK=y + +# +# UML-specific options +# +CONFIG_MODE_TT=y +# CONFIG_MODE_SKAS is not set +CONFIG_NET=y +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=y +CONFIG_HOSTFS=y +# CONFIG_HPPFS is not set +CONFIG_MCONSOLE=y +CONFIG_MAGIC_SYSRQ=y +# CONFIG_HOST_2G_2G is not set +# CONFIG_UML_SMP is not set +# CONFIG_SMP is not set +CONFIG_NEST_LEVEL=0 +CONFIG_KERNEL_HALF_GIGS=1 +# CONFIG_HIGHMEM is not set +# CONFIG_PROC_MM is not set +CONFIG_KERNEL_STACK_ORDER=2 + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_EMBEDDED is not set +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y + +# +# Loadable module support +# +# CONFIG_MODULES is not set + +# +# Generic Driver Options +# +# CONFIG_FW_LOADER is not set + +# +# Character Devices +# +CONFIG_STDIO_CONSOLE=y +CONFIG_SSL=y +CONFIG_FD_CHAN=y +# CONFIG_NULL_CHAN is not set 
+CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_ZERO_CHAN="fd:0,fd:1" +CONFIG_CON_CHAN="xterm" +CONFIG_SSL_CHAN="pty" +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=256 +# CONFIG_WATCHDOG is not set +# CONFIG_UML_SOUND is not set +# CONFIG_SOUND is not set +# CONFIG_HOSTAUDIO is not set + +# +# Block Devices +# +CONFIG_BLK_DEV_UBD=y +# CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_COW_COMMON=y +CONFIG_BLK_DEV_LOOP=y +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_MMAPPER is not set +CONFIG_NETDEVICES=y + +# +# UML Network Devices +# +CONFIG_UML_NET=y +CONFIG_UML_NET_ETHERTAP=y +CONFIG_UML_NET_TUNTAP=y +CONFIG_UML_NET_SLIP=y +CONFIG_UML_NET_DAEMON=y +CONFIG_UML_NET_MCAST=y +# CONFIG_UML_NET_PCAP is not set +# CONFIG_UML_NET_SLIRP is not set + +# +# Networking support +# + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +# CONFIG_NETLINK_DEV is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +# CONFIG_IP_MULTICAST is not set +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_IPV6 is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_NETFILTER is not set +# CONFIG_XFRM_USER is not set + +# +# SCTP Configuration (EXPERIMENTAL) +# +CONFIG_IPV6_SCTP__=y +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_LLC is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set + +# +# Network testing +# +# 
CONFIG_NET_PKTGEN is not set +CONFIG_DUMMY=y +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +CONFIG_TUN=y +# CONFIG_ETHERTAP is not set + +# +# Ethernet (10 or 100Mbit) +# +# CONFIG_NET_ETHERNET is not set + +# +# Ethernet (1000 Mbit) +# + +# +# Ethernet (10000 Mbit) +# +CONFIG_PPP=y +# CONFIG_PPP_MULTILINK is not set +# CONFIG_PPP_FILTER is not set +# CONFIG_PPP_ASYNC is not set +# CONFIG_PPP_SYNC_TTY is not set +# CONFIG_PPP_DEFLATE is not set +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_SLIP=y +# CONFIG_SLIP_COMPRESSED is not set +# CONFIG_SLIP_SMART is not set +# CONFIG_SLIP_MODE_SLIP6 is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Token Ring devices (depends on LLC=y) +# +# CONFIG_SHAPER is not set + +# +# Wan interfaces +# +# CONFIG_WAN is not set + +# +# File systems +# +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +# CONFIG_EXT3_FS_POSIX_ACL is not set +# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +# CONFIG_XFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_ROMFS_FS is not set +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set + +# +# CD-ROM/DVD Filesystems +# +# CONFIG_ISO9660_FS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_FAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_DEVFS_FS=y +CONFIG_DEVFS_MOUNT=y +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_DEVPTS_FS_XATTR is not set +CONFIG_TMPFS=y +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_CRAMFS is not set +# CONFIG_VXFS_FS is not 
set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set + +# +# Network File Systems +# +# CONFIG_NFS_FS is not set +# CONFIG_NFSD is not set +# CONFIG_EXPORTFS is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y + +# +# Security options +# +# CONFIG_SECURITY is not set + +# +# Cryptographic options +# +# CONFIG_CRYPTO is not set + +# +# Library routines +# +# CONFIG_CRC32 is not set + +# +# SCSI support +# +# CONFIG_SCSI is not set + +# +# Multi-device support (RAID and LVM) +# +# CONFIG_MD is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Kernel hacking +# +CONFIG_DEBUG_SLAB=y +# CONFIG_DEBUG_SPINLOCK is not set +CONFIG_DEBUG_INFO=y +CONFIG_FRAME_POINTER=y +CONFIG_PT_PROXY=y +# CONFIG_GPROF is not set +# CONFIG_GCOV is not set diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch b/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch new file mode 100644 index 0000000..a1cef3e --- /dev/null +++ b/lustre/kernel_patches/patches/dynamic-locks-2.4.18-chaos.patch @@ -0,0 +1,212 @@ + include/linux/dynlocks.h | 33 ++++++++++ + lib/Makefile | 4 - + lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 187 insertions(+), 2 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/include/linux/dynlocks.h 2003-09-01 16:33:25.000000000 +0400 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++struct dynlock_member { ++ struct list_head dl_list; ++ unsigned long dl_value; /* lock value */ ++ int dl_refcount; /* number of users */ ++ int dl_readers; ++ int dl_writers; ++ int dl_pid; /* holder of the lock 
*/ ++ wait_queue_head_t dl_wait; ++}; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++struct dynlock { ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++void dynlock_init(struct dynlock *dl); ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp); ++void dynlock_unlock(struct dynlock *dl, void *lock); ++ ++ ++#endif ++ +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/lib/dynlocks.c 2003-09-01 16:36:00.000000000 +0400 +@@ -0,0 +1,152 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++} ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp) ++{ ++ struct dynlock_member *nhl = NULL; ++ struct dynlock_member *hl; ++ struct list_head *cur; ++ ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ list_for_each(cur, &dl->dl_list) { ++ hl = list_entry(cur, struct dynlock_member, dl_list); ++ if (hl->dl_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated ++ * lock we didn't find and just created ++ * so, we drop our lock ++ */ ++ kfree(nhl); ++ nhl = NULL; ++ } ++ hl->dl_refcount++; ++ goto found; ++ } ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we already have allocated lock. 
use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dl_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated lock yet. allocate it */ ++ nhl = kmalloc(sizeof(struct dynlock_member), gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dl_refcount = 1; ++ nhl->dl_value = value; ++ nhl->dl_readers = 0; ++ nhl->dl_writers = 0; ++ init_waitqueue_head(&nhl->dl_wait); ++ ++ /* while lock is being allocated, someone else may allocate it ++ * and put onto to list. check this situation ++ */ ++ goto repeat; ++ ++found: ++ if (rw) { ++ /* exclusive lock: user don't want to share lock at all ++ * NOTE: one process may take the same lock several times ++ * this functionaly is useful for rename operations */ ++ while ((hl->dl_writers && hl->dl_pid != current->pid) || ++ hl->dl_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, ++ hl->dl_writers == 0 && hl->dl_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_writers++; ++ } else { ++ /* shared lock: user do not want to share lock with writer */ ++ while (hl->dl_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, hl->dl_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_readers++; ++ } ++ hl->dl_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * user have to specify lockspace (dl) and pointer to lock structure ++ * returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, void *lock) ++{ ++ struct dynlock_member *hl = lock; ++ int wakeup = 0; ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dl_writers) { ++ hl->dl_writers--; ++ if (hl->dl_writers == 0) ++ wakeup = 1; ++ } else { ++ hl->dl_readers--; ++ if (hl->dl_readers == 0) ++ wakeup = 1; ++ } ++ if (wakeup) { ++ hl->dl_pid = 0; ++ wake_up(&hl->dl_wait); ++ } ++ if (--(hl->dl_refcount) == 0) ++ list_del(&hl->dl_list); ++ spin_unlock(&dl->dl_list_lock); ++ 
if (hl->dl_refcount == 0) ++ kfree(hl); ++} ++ ++EXPORT_SYMBOL(dynlock_init); ++EXPORT_SYMBOL(dynlock_lock); ++EXPORT_SYMBOL(dynlock_unlock); ++ +--- linux-2.4.18/lib/Makefile~dynamic-locks-2.4.18-chaos 2003-08-29 11:57:40.000000000 +0400 ++++ linux-2.4.18-alexey/lib/Makefile 2003-09-01 16:35:23.000000000 +0400 +@@ -8,9 +8,9 @@ + + L_TARGET := lib.a + +-export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o ++export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o dynlocks.o + +-obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o ++obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o dynlocks.o + + obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o + obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o + +_ diff --git a/lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch b/lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch new file mode 100644 index 0000000..59f0a3e --- /dev/null +++ b/lustre/kernel_patches/patches/dynamic-locks-2.4.20-rh.patch @@ -0,0 +1,217 @@ + include/linux/dynlocks.h | 33 ++++++++++ + lib/Makefile | 4 - + lib/dynlocks.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 187 insertions(+), 2 deletions(-) + +Index: linux-2.4.20-rh/include/linux/dynlocks.h +=================================================================== +--- linux-2.4.20-rh.orig/include/linux/dynlocks.h 2003-09-04 18:25:49.000000000 +0800 ++++ linux-2.4.20-rh/include/linux/dynlocks.h 2003-09-04 18:25:49.000000000 +0800 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++struct dynlock_member { ++ struct list_head dl_list; ++ unsigned long dl_value; /* lock value */ ++ int dl_refcount; /* number of users */ ++ int dl_readers; ++ int dl_writers; ++ int dl_pid; /* holder of the lock */ ++ wait_queue_head_t dl_wait; ++}; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - 
lock to protect this list ++ */ ++struct dynlock { ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++void dynlock_init(struct dynlock *dl); ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp); ++void dynlock_unlock(struct dynlock *dl, void *lock); ++ ++ ++#endif ++ +Index: linux-2.4.20-rh/lib/dynlocks.c +=================================================================== +--- linux-2.4.20-rh.orig/lib/dynlocks.c 2003-09-04 18:25:49.000000000 +0800 ++++ linux-2.4.20-rh/lib/dynlocks.c 2003-09-04 18:25:49.000000000 +0800 +@@ -0,0 +1,152 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++} ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++void *dynlock_lock(struct dynlock *dl, unsigned long value, int rw, int gfp) ++{ ++ struct dynlock_member *nhl = NULL; ++ struct dynlock_member *hl; ++ struct list_head *cur; ++ ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ list_for_each(cur, &dl->dl_list) { ++ hl = list_entry(cur, struct dynlock_member, dl_list); ++ if (hl->dl_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated ++ * lock we didn't find and just created ++ * so, we drop our lock ++ */ ++ kfree(nhl); ++ nhl = NULL; ++ } ++ hl->dl_refcount++; ++ goto found; ++ } ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we already have allocated lock. 
use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dl_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated lock yet. allocate it */ ++ nhl = kmalloc(sizeof(struct dynlock_member), gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dl_refcount = 1; ++ nhl->dl_value = value; ++ nhl->dl_readers = 0; ++ nhl->dl_writers = 0; ++ init_waitqueue_head(&nhl->dl_wait); ++ ++ /* while lock is being allocated, someone else may allocate it ++ * and put onto to list. check this situation ++ */ ++ goto repeat; ++ ++found: ++ if (rw) { ++ /* exclusive lock: user don't want to share lock at all ++ * NOTE: one process may take the same lock several times ++ * this functionaly is useful for rename operations */ ++ while ((hl->dl_writers && hl->dl_pid != current->pid) || ++ hl->dl_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, ++ hl->dl_writers == 0 && hl->dl_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_writers++; ++ } else { ++ /* shared lock: user do not want to share lock with writer */ ++ while (hl->dl_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dl_wait, hl->dl_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dl_readers++; ++ } ++ hl->dl_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * user have to specify lockspace (dl) and pointer to lock structure ++ * returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, void *lock) ++{ ++ struct dynlock_member *hl = lock; ++ int wakeup = 0; ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dl_writers) { ++ hl->dl_writers--; ++ if (hl->dl_writers == 0) ++ wakeup = 1; ++ } else { ++ hl->dl_readers--; ++ if (hl->dl_readers == 0) ++ wakeup = 1; ++ } ++ if (wakeup) { ++ hl->dl_pid = 0; ++ wake_up(&hl->dl_wait); ++ } ++ if (--(hl->dl_refcount) == 0) ++ list_del(&hl->dl_list); ++ spin_unlock(&dl->dl_list_lock); ++ 
if (hl->dl_refcount == 0) ++ kfree(hl); ++} ++ ++EXPORT_SYMBOL(dynlock_init); ++EXPORT_SYMBOL(dynlock_lock); ++EXPORT_SYMBOL(dynlock_unlock); ++ +Index: linux-2.4.20-rh/lib/Makefile +=================================================================== +--- linux-2.4.20-rh.orig/lib/Makefile 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-rh/lib/Makefile 2003-09-04 18:27:26.000000000 +0800 +@@ -8,10 +8,10 @@ + + L_TARGET := lib.a + +-export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o ++export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o rbtree.o dynlocks.o + + obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \ +- bust_spinlocks.o rbtree.o dump_stack.o ++ bust_spinlocks.o rbtree.o dump_stack.o dynlocks.o + + obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o + obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-5.patch b/lustre/kernel_patches/patches/ext-2.4-patch-5.patch new file mode 100644 index 0000000..a65f6ed --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-5.patch @@ -0,0 +1,15 @@ + include/linux/ext3_fs.h | 1 + + 1 files changed, 1 insertion(+) + +--- linux-2.4.18/include/linux/ext3_fs.h~ext-2.4-patch-5 2003-08-29 16:53:18.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-01 11:50:37.000000000 +0400 +@@ -344,6 +344,7 @@ struct ext3_inode { + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + +_ diff --git a/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch new file mode 100644 index 0000000..8343e54 --- /dev/null 
+++ b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch @@ -0,0 +1,1461 @@ +--- ./fs/ext3/balloc.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/balloc.c Tue May 7 15:35:59 2002 +@@ -46,18 +46,18 @@ struct ext3_group_desc * ext3_get_group_ + unsigned long desc; + struct ext3_group_desc * gdp; + +- if (block_group >= sb->u.ext3_sb.s_groups_count) { ++ if (block_group >= EXT3_SB(sb)->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", +- block_group, sb->u.ext3_sb.s_groups_count); ++ block_group, EXT3_SB(sb)->s_groups_count); + + return NULL; + } + + group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); + desc = block_group % EXT3_DESC_PER_BLOCK(sb); +- if (!sb->u.ext3_sb.s_group_desc[group_desc]) { ++ if (!EXT3_SB(sb)->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", +@@ -66,9 +66,9 @@ struct ext3_group_desc * ext3_get_group_ + } + + gdp = (struct ext3_group_desc *) +- sb->u.ext3_sb.s_group_desc[group_desc]->b_data; ++ EXT3_SB(sb)->s_group_desc[group_desc]->b_data; + if (bh) +- *bh = sb->u.ext3_sb.s_group_desc[group_desc]; ++ *bh = EXT3_SB(sb)->s_group_desc[group_desc]; + return gdp + desc; + } + +@@ -104,8 +104,8 @@ static int read_block_bitmap (struct sup + * this group. The IO will be retried next time. 
+ */ + error_out: +- sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; +- sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; ++ EXT3_SB(sb)->s_block_bitmap_number[bitmap_nr] = block_group; ++ EXT3_SB(sb)->s_block_bitmap[bitmap_nr] = bh; + return retval; + } + +@@ -128,16 +128,17 @@ static int __load_block_bitmap (struct s + int i, j, retval = 0; + unsigned long block_bitmap_number; + struct buffer_head * block_bitmap; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + +- if (block_group >= sb->u.ext3_sb.s_groups_count) ++ if (block_group >= sbi->s_groups_count) + ext3_panic (sb, "load_block_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", +- block_group, sb->u.ext3_sb.s_groups_count); ++ block_group, EXT3_SB(sb)->s_groups_count); + +- if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { +- if (sb->u.ext3_sb.s_block_bitmap[block_group]) { +- if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == ++ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { ++ if (sbi->s_block_bitmap[block_group]) { ++ if (sbi->s_block_bitmap_number[block_group] == + block_group) + return block_group; + ext3_error (sb, "__load_block_bitmap", +@@ -149,21 +150,20 @@ static int __load_block_bitmap (struct s + return block_group; + } + +- for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && +- sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) ++ for (i = 0; i < sbi->s_loaded_block_bitmaps && ++ sbi->s_block_bitmap_number[i] != block_group; i++) + ; +- if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && +- sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { +- block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; +- block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; ++ if (i < sbi->s_loaded_block_bitmaps && ++ sbi->s_block_bitmap_number[i] == block_group) { ++ block_bitmap_number = sbi->s_block_bitmap_number[i]; ++ block_bitmap = sbi->s_block_bitmap[i]; + for (j = i; j > 0; j--) { +- sb->u.ext3_sb.s_block_bitmap_number[j] = +- 
sb->u.ext3_sb.s_block_bitmap_number[j - 1]; +- sb->u.ext3_sb.s_block_bitmap[j] = +- sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ sbi->s_block_bitmap_number[j] = ++ sbi->s_block_bitmap_number[j - 1]; ++ sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1]; + } +- sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; +- sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; ++ sbi->s_block_bitmap_number[0] = block_bitmap_number; ++ sbi->s_block_bitmap[0] = block_bitmap; + + /* + * There's still one special case here --- if block_bitmap == 0 +@@ -173,17 +173,14 @@ static int __load_block_bitmap (struct s + if (!block_bitmap) + retval = read_block_bitmap (sb, block_group, 0); + } else { +- if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED) +- sb->u.ext3_sb.s_loaded_block_bitmaps++; ++ if (sbi->s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED) ++ sbi->s_loaded_block_bitmaps++; + else +- brelse (sb->u.ext3_sb.s_block_bitmap +- [EXT3_MAX_GROUP_LOADED - 1]); +- for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; +- j > 0; j--) { +- sb->u.ext3_sb.s_block_bitmap_number[j] = +- sb->u.ext3_sb.s_block_bitmap_number[j - 1]; +- sb->u.ext3_sb.s_block_bitmap[j] = +- sb->u.ext3_sb.s_block_bitmap[j - 1]; ++ brelse(sbi->s_block_bitmap[EXT3_MAX_GROUP_LOADED - 1]); ++ for (j = sbi->s_loaded_block_bitmaps - 1; j > 0; j--) { ++ sbi->s_block_bitmap_number[j] = ++ sbi->s_block_bitmap_number[j - 1]; ++ sbi->s_block_bitmap[j] = sbi->s_block_bitmap[j - 1]; + } + retval = read_block_bitmap (sb, block_group, 0); + } +@@ -206,24 +203,25 @@ static int __load_block_bitmap (struct s + static inline int load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + int slot; +- ++ + /* + * Do the lookup for the slot. First of all, check if we're asking + * for the same slot as last time, and did we succeed that last time? 
+ */ +- if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && +- sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && +- sb->u.ext3_sb.s_block_bitmap[0]) { ++ if (sbi->s_loaded_block_bitmaps > 0 && ++ sbi->s_block_bitmap_number[0] == block_group && ++ sbi->s_block_bitmap[0]) { + return 0; + } + /* + * Or can we do a fast lookup based on a loaded group on a filesystem + * small enough to be mapped directly into the superblock? + */ +- else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && +- sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group +- && sb->u.ext3_sb.s_block_bitmap[block_group]) { ++ else if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED && ++ sbi->s_block_bitmap_number[block_group] == block_group ++ && sbi->s_block_bitmap[block_group]) { + slot = block_group; + } + /* +@@ -243,7 +241,7 @@ static inline int load_block_bitmap (str + * If it's a valid slot, we may still have cached a previous IO error, + * in which case the bh in the superblock cache will be zero. 
+ */ +- if (!sb->u.ext3_sb.s_block_bitmap[slot]) ++ if (!sbi->s_block_bitmap[slot]) + return -EIO; + + /* +@@ -275,7 +273,7 @@ void ext3_free_blocks (handle_t *handle, + return; + } + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + (block + count) > le32_to_cpu(es->s_blocks_count)) { +@@ -304,7 +302,7 @@ do_more: + if (bitmap_nr < 0) + goto error_return; + +- bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bitmap_bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; +@@ -330,8 +328,8 @@ do_more: + if (err) + goto error_return; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto error_return; + +@@ -341,7 +339,7 @@ + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), +- sb->u.ext2_sb.s_itb_per_group)) { ++ EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, __FUNCTION__, + "Freeing block in system zone - block = %lu", + block); +@@ -410,8 +407,8 @@ do_more: + if (!err) err = ret; + + /* And the superblock */ +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); +- ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock"); ++ ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!err) err = ret; + + if (overflow && !err) { +@@ -564,12 +560,12 @@ int ext3_new_block (handle_t *handle, st + } + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (le32_to_cpu(es->s_free_blocks_count) <= + le32_to_cpu(es->s_r_blocks_count) && +- ((sb->u.ext3_sb.s_resuid != 
current->fsuid) && +- (sb->u.ext3_sb.s_resgid == 0 || +- !in_group_p (sb->u.ext3_sb.s_resgid)) && ++ ((EXT3_SB(sb)->s_resuid != current->fsuid) && ++ (EXT3_SB(sb)->s_resgid == 0 || ++ !in_group_p (EXT3_SB(sb)->s_resgid)) && + !capable(CAP_SYS_RESOURCE))) + goto out; + +@@ -598,7 +595,7 @@ int ext3_new_block (handle_t *handle, st + if (bitmap_nr < 0) + goto io_error; + +- bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + + ext3_debug ("goal is at %d:%d.\n", i, j); + +@@ -621,9 +618,9 @@ int ext3_new_block (handle_t *handle, st + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ +- for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { ++ for (k = 0; k < EXT3_SB(sb)->s_groups_count; k++) { + i++; +- if (i >= sb->u.ext3_sb.s_groups_count) ++ if (i >= EXT3_SB(sb)->s_groups_count) + i = 0; + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) { +@@ -635,7 +632,7 @@ int ext3_new_block (handle_t *handle, st + if (bitmap_nr < 0) + goto io_error; + +- bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + j = find_next_usable_block(-1, bh, + EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) +@@ -674,8 +671,8 @@ got_block: + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto out; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (fatal) goto out; + + tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) +@@ -796,7 +804,7 @@ got_block: + if (!fatal) fatal = err; + + BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!fatal) fatal = err; + + sb->s_dirt = 1; +@@ -829,11 +837,11 @@ 
unsigned long ext3_count_free_blocks (st + int i; + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -842,7 +850,7 @@ unsigned long ext3_count_free_blocks (st + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], ++ x = ext3_count_free (EXT3_SB(sb)->s_block_bitmap[bitmap_nr], + sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); +@@ -853,7 +861,7 @@ unsigned long ext3_count_free_blocks (st + unlock_super (sb); + return bitmap_count; + #else +- return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); ++ return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count); + #endif + } + +@@ -862,7 +870,7 @@ static inline int block_in_use (unsigned + unsigned char * map) + { + return ext3_test_bit ((block - +- le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % ++ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb), map); + } + +@@ -930,11 +938,11 @@ void ext3_check_blocks_bitmap (struct su + struct ext3_group_desc * gdp; + int i; + +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -968,7 +976,7 @@ void ext3_check_blocks_bitmap (struct su + "Inode bitmap for group %d is marked free", + i); + +- for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) ++ for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++) + if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", +--- 
./fs/ext3/dir.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/dir.c Tue May 7 14:54:13 2002 +@@ -52,7 +52,7 @@ int ext3_check_dir_entry (const char * f + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > +- le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) +--- ./fs/ext3/ialloc.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/ialloc.c Tue May 7 15:39:26 2002 +@@ -73,8 +73,8 @@ static int read_inode_bitmap (struct sup + * this group. The IO will be retried next time. + */ + error_out: +- sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; +- sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; ++ EXT3_SB(sb)->s_inode_bitmap_number[bitmap_nr] = block_group; ++ EXT3_SB(sb)->s_inode_bitmap[bitmap_nr] = bh; + return retval; + } + +@@ -225,7 +225,7 @@ void ext3_free_inode (handle_t *handle, + clear_inode (inode); + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); +@@ -237,7 +237,7 @@ void ext3_free_inode (handle_t *handle, + if (bitmap_nr < 0) + goto error_return; + +- bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; + + BUFFER_TRACE(bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh); +@@ -255,8 +255,8 @@ void ext3_free_inode (handle_t *handle, + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); +- fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access"); ++ fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (fatal) 
goto error_return; + + if (gdp) { +@@ -271,9 +271,9 @@ void ext3_free_inode (handle_t *handle, + if (!fatal) fatal = err; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, + "call ext3_journal_dirty_metadata"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +@@ -305,6 +305,8 @@ struct inode * ext3_new_inode (handle_t + int i, j, avefreei; + struct inode * inode; + int bitmap_nr; ++ struct ext3_inode_info *ei; ++ struct ext3_sb_info *sbi; + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; +@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); +- init_rwsem(&inode->u.ext3_i.truncate_sem); ++ sbi = EXT3_SB(sb); ++ ei = EXT3_I(inode); ++ init_rwsem(&ei->truncate_sem); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; +@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t + + if (S_ISDIR(mode)) { + avefreei = le32_to_cpu(es->s_free_inodes_count) / +- sb->u.ext3_sb.s_groups_count; ++ sbi->s_groups_count; + if (!gdp) { +- for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { ++ for (j = 0; j < sbi->s_groups_count; j++) { + struct buffer_head *temp_buffer; + tmp = ext3_get_group_desc (sb, j, &temp_buffer); + if (tmp && +@@ -350,7 +354,7 @@ repeat: + /* + * Try to place the inode in its parent directory + */ +- i = dir->u.ext3_i.i_block_group; ++ i = EXT3_I(dir)->i_block_group; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) + gdp = tmp; +@@ -360,10 +364,10 @@ repeat: + * Use a quadratic hash to find a group with a + * free inode + */ +- for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { ++ for (j = 1; j < sbi->s_groups_count; 
j <<= 1) { + i += j; +- if (i >= sb->u.ext3_sb.s_groups_count) +- i -= sb->u.ext3_sb.s_groups_count; ++ if (i >= sbi->s_groups_count) ++ i -= sbi->s_groups_count; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { +@@ -376,9 +380,9 @@ repeat: + /* + * That failed: try linear search for a free inode + */ +- i = dir->u.ext3_i.i_block_group + 1; +- for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) { +- if (++i >= sb->u.ext3_sb.s_groups_count) ++ i = EXT3_I(dir)->i_block_group + 1; ++ for (j = 2; j < sbi->s_groups_count; j++) { ++ if (++i >= sbi->s_groups_count) + i = 0; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && +@@ -399,11 +403,11 @@ repeat: + if (bitmap_nr < 0) + goto fail; + +- bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; ++ bh = sbi->s_inode_bitmap[bitmap_nr]; + + if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, +- EXT3_INODES_PER_GROUP(sb))) < +- EXT3_INODES_PER_GROUP(sb)) { ++ sbi->s_inodes_per_group)) < ++ sbi->s_inodes_per_group) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) goto fail; +@@ -457,13 +461,13 @@ repeat: + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) goto fail; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbi->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; + if (err) goto fail; + +@@ -483,31 +487,31 @@ repeat: + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; + 
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; +- inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + if (S_ISLNK(mode)) +- inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); ++ ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + #ifdef EXT3_FRAGMENTS +- inode->u.ext3_i.i_faddr = 0; +- inode->u.ext3_i.i_frag_no = 0; +- inode->u.ext3_i.i_frag_size = 0; ++ ei->i_faddr = 0; ++ ei->i_frag_no = 0; ++ ei->i_frag_size = 0; + #endif +- inode->u.ext3_i.i_file_acl = 0; +- inode->u.ext3_i.i_dir_acl = 0; +- inode->u.ext3_i.i_dtime = 0; +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ ei->i_file_acl = 0; ++ ei->i_dir_acl = 0; ++ ei->i_dtime = 0; ++ INIT_LIST_HEAD(&ei->i_orphan); + #ifdef EXT3_PREALLOCATE +- inode->u.ext3_i.i_prealloc_count = 0; ++ ei->i_prealloc_count = 0; + #endif +- inode->u.ext3_i.i_block_group = i; ++ ei->i_block_group = i; + +- if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) ++ if (ei->i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (IS_SYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); +- inode->i_generation = sb->u.ext3_sb.s_next_generation++; ++ inode->i_generation = sbi->s_next_generation++; + +- inode->u.ext3_i.i_state = EXT3_STATE_NEW; ++ ei->i_state = EXT3_STATE_NEW; + err = ext3_mark_inode_dirty(handle, inode); + if (err) goto fail; + +@@ -585,19 +589,19 @@ struct inode *ext3_orphan_get (struct su + + unsigned long ext3_count_free_inodes (struct super_block * sb) + { ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_super_block *es = sbi->s_es; + #ifdef EXT3FS_DEBUG +- struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < sbi->s_groups_count; i++) { + gdp = 
ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -606,8 +610,8 @@ unsigned long ext3_count_free_inodes (st + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], +- EXT3_INODES_PER_GROUP(sb) / 8); ++ x = ext3_count_free(sbi->s_inode_bitmap[bitmap_nr], ++ sbi->s_inodes_per_group / 8); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; +@@ -617,7 +621,7 @@ unsigned long ext3_count_free_inodes (st + unlock_super (sb); + return desc_count; + #else +- return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); ++ return le32_to_cpu(es->s_free_inodes_count); + #endif + } + +@@ -626,16 +630,18 @@ unsigned long ext3_count_free_inodes (st + void ext3_check_inodes_bitmap (struct super_block * sb) + { + struct ext3_super_block * es; ++ struct ext3_sb_info *sbi; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + +- es = sb->u.ext3_sb.s_es; ++ sbi = EXT3_SB(sb); ++ es = sbi->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; +- for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { ++ for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; +@@ -644,7 +650,7 @@ void ext3_check_inodes_bitmap (struct su + if (bitmap_nr < 0) + continue; + +- x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], ++ x = ext3_count_free (sbi->s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + if (le16_to_cpu(gdp->bg_free_inodes_count) != x) + ext3_error (sb, "ext3_check_inodes_bitmap", +--- ./fs/ext3/inode.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/inode.c Tue May 7 15:41:23 2002 +@@ -196,7 +196,7 @@ void ext3_delete_inode (struct inode * i + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); +- inode->u.ext3_i.i_dtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = CURRENT_TIME; + + /* + * 
One subtle ordering requirement: if anything has gone wrong +@@ -220,13 +220,14 @@ no_delete: + void ext3_discard_prealloc (struct inode * inode) + { + #ifdef EXT3_PREALLOCATE ++ struct ext3_inode_info *ei = EXT3_I(inode); + lock_kernel(); + /* Writer: ->i_prealloc* */ +- if (inode->u.ext3_i.i_prealloc_count) { +- unsigned short total = inode->u.ext3_i.i_prealloc_count; +- unsigned long block = inode->u.ext3_i.i_prealloc_block; +- inode->u.ext3_i.i_prealloc_count = 0; +- inode->u.ext3_i.i_prealloc_block = 0; ++ if (ei->i_prealloc_count) { ++ unsigned short total = ei->i_prealloc_count; ++ unsigned long block = ei->i_prealloc_block; ++ ei->i_prealloc_count = 0; ++ ei->i_prealloc_block = 0; + /* Writer: end */ + ext3_free_blocks (inode, block, total); + } +@@ -243,13 +244,15 @@ static int ext3_alloc_block (handle_t *h + unsigned long result; + + #ifdef EXT3_PREALLOCATE ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ + /* Writer: ->i_prealloc* */ +- if (inode->u.ext3_i.i_prealloc_count && +- (goal == inode->u.ext3_i.i_prealloc_block || +- goal + 1 == inode->u.ext3_i.i_prealloc_block)) ++ if (ei->i_prealloc_count && ++ (goal == ei->i_prealloc_block || ++ goal + 1 == ei->i_prealloc_block)) + { +- result = inode->u.ext3_i.i_prealloc_block++; +- inode->u.ext3_i.i_prealloc_count--; ++ result = ei->i_prealloc_block++; ++ ei->i_prealloc_count--; + /* Writer: end */ + ext3_debug ("preallocation hit (%lu/%lu).\n", + ++alloc_hits, ++alloc_attempts); +@@ -259,8 +262,8 @@ static int ext3_alloc_block (handle_t *h + alloc_hits, ++alloc_attempts); + if (S_ISREG(inode->i_mode)) + result = ext3_new_block (inode, goal, +- &inode->u.ext3_i.i_prealloc_count, +- &inode->u.ext3_i.i_prealloc_block, err); ++ &ei->i_prealloc_count, ++ &ei->i_prealloc_block, err); + else + result = ext3_new_block (inode, goal, 0, 0, err); + /* +@@ -394,7 +397,7 @@ static Indirect *ext3_get_branch(struct + + *err = 0; + /* i_data is not going away, no lock needed */ +- add_chain (chain, NULL, 
inode->u.ext3_i.i_data + *offsets); ++ add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { +@@ -437,7 +440,8 @@ no_block: + + static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) + { +- u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data; + u32 *p; + + /* Try to find previous block */ +@@ -453,9 +456,8 @@ static inline unsigned long ext3_find_ne + * It is going to be refered from inode itself? OK, just put it into + * the same cylinder group then. + */ +- return (inode->u.ext3_i.i_block_group * +- EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + +- le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); ++ return (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + } + + /** +@@ -474,14 +477,15 @@ + static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], + Indirect *partial, unsigned long *goal) + { ++ struct ext3_inode_info *ei = EXT3_I(inode); + /* Writer: ->i_next_alloc* */ +- if (block == inode->u.ext3_i.i_next_alloc_block + 1) { +- inode->u.ext3_i.i_next_alloc_block++; +- inode->u.ext3_i.i_next_alloc_goal++; ++ if (block == ei->i_next_alloc_block + 1) { ++ ei->i_next_alloc_block++; ++ ei->i_next_alloc_goal++; + } + #ifdef SEARCH_FROM_ZERO +- inode->u.ext3_i.i_next_alloc_block = 0; +- inode->u.ext3_i.i_next_alloc_goal = 0; ++ ei->i_next_alloc_block = 0; ++ ei->i_next_alloc_goal = 0; + #endif + /* Writer: end */ + /* Reader: pointers, ->i_next_alloc* */ +@@ -490,8 +493,8 @@ static int ext3_find_goal(struct inode * + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. 
+ */ +- if (block == inode->u.ext3_i.i_next_alloc_block) +- *goal = inode->u.ext3_i.i_next_alloc_goal; ++ if (block == ei->i_next_alloc_block) ++ *goal = ei->i_next_alloc_goal; + if (!*goal) + *goal = ext3_find_near(inode, partial); + #ifdef SEARCH_FROM_ZERO +@@ -619,6 +621,7 @@ + { + int i; + int err = 0; ++ struct ext3_inode_info *ei = EXT3_I(inode); + + /* + * If we're splicing into a [td]indirect block (as opposed to the +@@ -641,11 +644,11 @@ static int ext3_splice_branch(handle_t * + /* That's it */ + + *where->p = where->key; +- inode->u.ext3_i.i_next_alloc_block = block; +- inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); ++ ei->i_next_alloc_block = block; ++ ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key); + #ifdef SEARCH_FROM_ZERO +- inode->u.ext3_i.i_next_alloc_block = 0; +- inode->u.ext3_i.i_next_alloc_goal = 0; ++ ei->i_next_alloc_block = 0; ++ ei->i_next_alloc_goal = 0; + #endif + /* Writer: end */ + +@@ -729,6 +732,7 @@ + unsigned long goal; + int left; + int depth = ext3_block_to_path(inode, iblock, offsets); ++ struct ext3_inode_info *ei = EXT3_I(inode); + loff_t new_size; + + J_ASSERT(handle != NULL || create == 0); +@@ -782,7 +785,7 @@ out: + /* + * Block out ext3_truncate while we alter the tree + */ +- down_read(&inode->u.ext3_i.truncate_sem); ++ down_read(&ei->truncate_sem); + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + +@@ -794,7 +797,7 @@ out: + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); +- up_read(&inode->u.ext3_i.truncate_sem); ++ up_read(&ei->truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) +@@ -807,8 +810,8 @@ out: + * truncate is in progress. It is racy between multiple parallel + * instances of get_block, but we have the BKL. 
+ */ +- if (new_size > inode->u.ext3_i.i_disksize) +- inode->u.ext3_i.i_disksize = new_size; ++ if (new_size > ei->i_disksize) ++ ei->i_disksize = new_size; + + bh_result->b_state |= (1UL << BH_New); + goto got_it; +@@ -921,7 +924,7 @@ struct buffer_head *ext3_bread(handle_t + struct buffer_head *tmp_bh; + + for (i = 1; +- inode->u.ext3_i.i_prealloc_count && ++ EXT3_I(inode)->i_prealloc_count && + i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; + i++) { + /* +@@ -1131,8 +1134,8 @@ static int ext3_commit_write(struct file + kunmap(page); + } + } +- if (inode->i_size > inode->u.ext3_i.i_disksize) { +- inode->u.ext3_i.i_disksize = inode->i_size; ++ if (inode->i_size > EXT3_I(inode)->i_disksize) { ++ EXT3_I(inode)->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; +@@ -1832,7 +1835,8 @@ static void ext3_free_branches(handle_t + void ext3_truncate(struct inode * inode) + { + handle_t *handle; +- u32 *i_data = inode->u.ext3_i.i_data; ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ u32 *i_data = EXT3_I(inode)->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; +@@ -1884,13 +1887,13 @@ void ext3_truncate(struct inode * inode) + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ +- inode->u.ext3_i.i_disksize = inode->i_size; ++ ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. 
+ */ +- down_write(&inode->u.ext3_i.truncate_sem); ++ down_write(&ei->truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], +@@ -1954,7 +1957,7 @@ do_indirects: + case EXT3_TIND_BLOCK: + ; + } +- up_write(&inode->u.ext3_i.truncate_sem); ++ up_write(&ei->truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + +@@ -1983,6 +1986,8 @@ out_stop: + + int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) + { ++ struct super_block *sb = inode->i_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + struct buffer_head *bh = 0; + unsigned long block; + unsigned long block_group; +@@ -1997,23 +2010,19 @@ int ext3_get_inode_loc (struct inode *in + inode->i_ino != EXT3_JOURNAL_INO && +- inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || +- inode->i_ino > le32_to_cpu( +- inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", +- "bad inode number: %lu", inode->i_ino); ++ inode->i_ino < EXT3_FIRST_INO(sb)) || ++ inode->i_ino > le32_to_cpu(sbi->s_es->s_inodes_count)) { ++ ext3_error (sb, __FUNCTION__, "bad inode #%lu", inode->i_ino); + goto bad_inode; + } +- block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); +- if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", +- "group >= groups count"); ++ block_group = (inode->i_ino - 1) / sbi->s_inodes_per_group; ++ if (block_group >= sbi->s_groups_count) { ++ ext3_error(sb, __FUNCTION__, "group >= groups count"); + goto bad_inode; + } +- group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); +- desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); +- bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; ++ group_desc = block_group >> sbi->s_desc_per_block_bits; ++ desc = block_group & (sbi->s_desc_per_block - 1); ++ bh = sbi->s_group_desc[group_desc]; + if (!bh) { +- ext3_error (inode->i_sb, 
"ext3_get_inode_loc", +- "Descriptor not loaded"); ++ ext3_error(sb, __FUNCTION__, "Descriptor not loaded"); + goto bad_inode; + } + +@@ -2021,17 +2022,17 @@ int ext3_get_inode_loc (struct inode *in + /* + * Figure out the offset within the block group inode table + */ +- offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * +- EXT3_INODE_SIZE(inode->i_sb); ++ offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group) * ++ sbi->s_inode_size; + block = le32_to_cpu(gdp[desc].bg_inode_table) + +- (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); +- if (!(bh = sb_bread(inode->i_sb, block))) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", ++ (offset >> EXT3_BLOCK_SIZE_BITS(sb)); ++ if (!(bh = sb_bread(sb, block))) { ++ ext3_error (sb, __FUNCTION__, + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } +- offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); ++ offset &= (EXT3_BLOCK_SIZE(sb) - 1); + + iloc->bh = bh; + iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); +@@ -2047,6 +2048,7 @@ void ext3_read_inode(struct inode * inod + { + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh; + int block; + +@@ -2054,7 +2056,7 @@ void ext3_read_inode(struct inode * inod + goto bad_inode; + bh = iloc.bh; + raw_inode = iloc.raw_inode; +- init_rwsem(&inode->u.ext3_i.truncate_sem); ++ init_rwsem(&ei->truncate_sem); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); +@@ -2067,7 +2069,7 @@ void ext3_read_inode(struct inode * inod + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); +- inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); ++ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to 
check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses +@@ -2075,7 +2077,7 @@ void ext3_read_inode(struct inode * inod + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || +- !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { ++ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; +@@ -2090,33 +2092,33 @@ void ext3_read_inode(struct inode * inod + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; +- inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); ++ ei->i_flags = le32_to_cpu(raw_inode->i_flags); + #ifdef EXT3_FRAGMENTS +- inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); +- inode->u.ext3_i.i_frag_no = raw_inode->i_frag; +- inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; ++ ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); ++ ei->i_frag_no = raw_inode->i_frag; ++ ei->i_frag_size = raw_inode->i_fsize; + #endif +- inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); ++ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { +- inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); ++ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + #ifdef EXT3_PREALLOCATE +- inode->u.ext3_i.i_prealloc_count = 0; ++ ei->i_prealloc_count = 0; + #endif +- inode->u.ext3_i.i_block_group = iloc.block_group; ++ ei->i_block_group = iloc.block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! 
+ */ + for (block = 0; block < EXT3_N_BLOCKS; block++) +- inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ ei->i_data[block] = iloc.raw_inode->i_block[block]; ++ INIT_LIST_HEAD(&ei->i_orphan); + + brelse (iloc.bh); + +@@ -2143,17 +2145,17 @@ void ext3_read_inode(struct inode * inod + /* inode->i_attr_flags = 0; unused */ +- if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { ++ if (ei->i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ + inode->i_flags |= S_SYNC; + } +- if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { ++ if (ei->i_flags & EXT3_APPEND_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ + inode->i_flags |= S_APPEND; + } +- if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) { ++ if (ei->i_flags & EXT3_IMMUTABLE_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ + inode->i_flags |= S_IMMUTABLE; + } +- if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { ++ if (ei->i_flags & EXT3_NOATIME_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ + inode->i_flags |= S_NOATIME; + } +@@ -2175,6 +2177,7 @@ static int ext3_do_update_inode(handle_t + struct ext3_iloc *iloc) + { + struct ext3_inode *raw_inode = iloc->raw_inode; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + +@@ -2192,7 +2195,7 @@ static int ext3_do_update_inode(handle_t + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ +- if(!inode->u.ext3_i.i_dtime) { ++ if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = +@@ -2210,34 +2213,33 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); +- raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); ++ raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); +- raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); +- raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); ++ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); ++ raw_inode->i_flags = cpu_to_le32(ei->i_flags); + #ifdef EXT3_FRAGMENTS +- raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); +- raw_inode->i_frag = inode->u.ext3_i.i_frag_no; +- raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; ++ raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); ++ raw_inode->i_frag = ei->i_frag_no; ++ raw_inode->i_fsize = ei->i_frag_size; + #else + /* If we are not tracking these fields in the in-memory inode, + * then preserve them on disk, but still initialise them to zero + * for new inodes. 
*/ +- if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { ++ if (ei->i_state & EXT3_STATE_NEW) { + raw_inode->i_faddr = 0; + raw_inode->i_frag = 0; + raw_inode->i_fsize = 0; + } + #endif +- raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); ++ raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { +- raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); ++ raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { +- raw_inode->i_size_high = +- cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); +- if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { ++ raw_inode->i_size_high = cpu_to_le32(ei->i_disksize >> 32); ++ if (ei->i_disksize > MAX_NON_LFS) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || +@@ -2247,7 +2249,7 @@ static int ext3_do_update_inode(handle_t + * created, add a flag to the superblock. + */ + err = ext3_journal_get_write_access(handle, +- sb->u.ext3_sb.s_sbh); ++ EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); +@@ -2256,7 +2258,7 @@ static int ext3_do_update_inode(handle_t + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, +- sb->u.ext3_sb.s_sbh); ++ EXT3_SB(sb)->s_sbh); + } + } + } +@@ -2265,13 +2267,13 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_block[0] = + cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); + else for (block = 0; block < EXT3_N_BLOCKS; block++) +- raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; ++ raw_inode->i_block[block] = ei->i_data[block]; + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; +- EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; ++ ei->i_state &= ~EXT3_STATE_NEW; + + out_brelse: + brelse (bh); +@@ -2379,7 +2381,7 @@ int ext3_setattr(struct dentry *dentry, + } + + error = ext3_orphan_add(handle, inode); +- inode->u.ext3_i.i_disksize = 
attr->ia_size; ++ EXT3_I(inode)->i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; +@@ -2622,9 +2624,9 @@ int ext3_change_inode_journal_flag(struc + */ + + if (val) +- inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; ++ EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else +- inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; ++ EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + + journal_unlock_updates(journal); + +--- ./fs/ext3/ioctl.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/ioctl.c Tue May 7 15:20:52 2002 +@@ -18,13 +18,14 @@ + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) + { ++ struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: +- flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; ++ flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; +@@ -42,7 +42,7 @@ int ext3_ioctl (struct inode * inode, st + if (get_user(flags, (int *) arg)) + return -EFAULT; + +- oldflags = inode->u.ext3_i.i_flags; ++ oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; +@@ -79,7 +79,7 @@ int ext3_ioctl (struct inode * inode, st + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; +- inode->u.ext3_i.i_flags = flags; ++ ei->i_flags = flags; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +@@ -155,12 +155,12 @@ flags_err: + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); +- add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); +- if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { ++ add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); ++ if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } +- 
remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); ++ remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + return ret; + } + #endif +--- ./fs/ext3/namei.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/namei.c Tue May 7 16:05:51 2002 +@@ -1430,8 +1430,8 @@ int ext3_orphan_add(handle_t *handle, st + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); +- err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + +@@ -1442,7 +1442,7 @@ int ext3_orphan_add(handle_t *handle, st + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); +- err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +@@ -1520,8 +1520,7 @@ int ext3_orphan_del(handle_t *handle, st + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; +- struct inode *i_prev = +- list_entry(prev, struct inode, u.ext3_i.i_orphan); ++ struct inode *i_prev = orphan_list_entry(prev); + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); +--- ./fs/ext3/super.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/super.c Tue May 7 16:05:44 2002 +@@ -121,7 +121,7 @@ static int ext3_error_behaviour(struct s + /* If no overrides were specified on the mount, then fall back + * to the default behaviour set in the filesystem's superblock + * on disk. 
*/ +- switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) { ++ switch (le16_to_cpu(EXT3_SB(sb)->s_es->s_errors)) { + case EXT3_ERRORS_PANIC: + return EXT3_ERRORS_PANIC; + case EXT3_ERRORS_RO: +@@ -269,9 +269,9 @@ void ext3_abort (struct super_block * sb + return; + + printk (KERN_CRIT "Remounting filesystem read-only\n"); +- sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; +- sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; ++ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + journal_abort(EXT3_SB(sb)->s_journal, -EIO); + } + +@@ -377,8 +377,6 @@ static int ext3_blkdev_remove(struct ext3 + return ret; + } + +-#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) +- + static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) + { + struct list_head *l; +@@ -818,7 +818,7 @@ static void ext3_orphan_cleanup (struct + sb->s_flags &= ~MS_RDONLY; + } + +- if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { ++ if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); +@@ -1463,12 +1463,14 @@ static void ext3_commit_super (struct su + struct ext3_super_block * es, + int sync) + { ++ struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; ++ + es->s_wtime = cpu_to_le32(CURRENT_TIME); +- BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); +- mark_buffer_dirty(sb->u.ext3_sb.s_sbh); ++ BUFFER_TRACE(sbh, "marking dirty"); ++ mark_buffer_dirty(sbh); + if (sync) { +- ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); +- wait_on_buffer(sb->u.ext3_sb.s_sbh); ++ ll_rw_block(WRITE, 1, &sbh); ++ wait_on_buffer(sbh); + } + } + +@@ -1519,7 +1521,7 @@ static void ext3_clear_journal_err(struc + ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + +- sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= 
cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + +--- ./fs/ext3/symlink.c.orig Fri Apr 12 10:27:49 2002 ++++ ./fs/ext3/symlink.c Tue May 7 15:25:39 2002 +@@ -23,13 +23,13 @@ + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { +- char *s = (char *)dentry->d_inode->u.ext3_i.i_data; +- return vfs_readlink(dentry, buffer, buflen, s); ++ struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); ++ return vfs_readlink(dentry, buffer, buflen, (char *)ei->i_data); + } + + static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) + { +- char *s = (char *)dentry->d_inode->u.ext3_i.i_data; +- return vfs_follow_link(nd, s); ++ struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); ++ return vfs_follow_link(nd, (char*)ei->i_data); + } + +--- ./include/linux/ext3_fs.h.orig Tue Apr 16 14:27:25 2002 ++++ ./include/linux/ext3_fs.h Tue May 7 16:47:36 2002 +@@ -84,22 +84,25 @@ + #define EXT3_MIN_BLOCK_SIZE 1024 + #define EXT3_MAX_BLOCK_SIZE 4096 + #define EXT3_MIN_BLOCK_LOG_SIZE 10 ++ + #ifdef __KERNEL__ +-# define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +-#else +-# define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) +-#endif +-#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +-#ifdef __KERNEL__ +-# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +-#else +-# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +-#endif +-#ifdef __KERNEL__ +-#define EXT3_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_addr_per_block_bits) +-#define EXT3_INODE_SIZE(s) ((s)->u.ext3_sb.s_inode_size) +-#define EXT3_FIRST_INO(s) ((s)->u.ext3_sb.s_first_ino) ++#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) ++#define EXT3_I(inode) (&((inode)->u.ext3_i)) ++ ++#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) ++#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) ++#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) ++#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) ++#define EXT3_FIRST_INO(s) 
(EXT3_SB(s)->s_first_ino) + #else ++ ++/* Assume that user mode programs are passing in an ext3fs superblock, not ++ * a kernel struct super_block. This will allow us to call the feature-test ++ * macros from user land. */ ++#define EXT3_SB(sb) (sb) ++ ++#define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) ++#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) + #define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +@@ -108,6 +110,7 @@ + EXT3_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) + #endif ++#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + + /* + * Macro-instructions used to manage fragments +@@ -116,8 +120,8 @@ + #define EXT3_MAX_FRAG_SIZE 4096 + #define EXT3_MIN_FRAG_LOG_SIZE 10 + #ifdef __KERNEL__ +-# define EXT3_FRAG_SIZE(s) ((s)->u.ext3_sb.s_frag_size) +-# define EXT3_FRAGS_PER_BLOCK(s) ((s)->u.ext3_sb.s_frags_per_block) ++# define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) ++# define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + #else + # define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) +@@ -163,15 +167,13 @@ + /* + * Macro-instructions used to manage group descriptors + */ ++# define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) ++# define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) + #ifdef __KERNEL__ +-# define EXT3_BLOCKS_PER_GROUP(s) ((s)->u.ext3_sb.s_blocks_per_group) +-# define EXT3_DESC_PER_BLOCK(s) ((s)->u.ext3_sb.s_desc_per_block) +-# define EXT3_INODES_PER_GROUP(s) ((s)->u.ext3_sb.s_inodes_per_group) +-# define EXT3_DESC_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_desc_per_block_bits) ++# define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) ++# define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + #else +-# define EXT3_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) + # 
define EXT3_DESC_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc)) +-# define EXT3_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) + #endif + + /* +@@ -344,7 +347,7 @@ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt +-#define test_opt(sb, opt) ((sb)->u.ext3_sb.s_mount_opt & \ ++#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) + #else + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +@@ -441,17 +443,11 @@ + /*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ + }; + +-#ifdef __KERNEL__ +-#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) +-#define EXT3_I(inode) (&((inode)->u.ext3_i)) +-#else +-/* Assume that user mode programs are passing in an ext3fs superblock, not +- * a kernel struct super_block. This will allow us to call the feature-test +- * macros from user land. */ +-#define EXT3_SB(sb) (sb) +-#endif +- +-#define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime ++#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime ++static inline struct inode *orphan_list_entry(struct list_head *l) ++{ ++ return list_entry(l, struct inode, u.ext3_i.i_orphan); ++} + + /* + * Codes for operating systems +--- ./include/linux/ext3_jbd.h.orig Tue May 7 14:44:08 2002 ++++ ./include/linux/ext3_jbd.h Tue May 7 14:44:43 2002 +@@ -291,7 +291,7 @@ + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; +- if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL) ++ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; + } diff --git a/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch new file mode 100644 index 0000000..7cd3384 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch @@ -0,0 +1,19 @@ + fs/ext3/namei.c | 2 +- + 1 files changed, 1 insertion(+), 1 deletion(-) + +diff -puN 
fs/ext3/namei.c~ext3-compat-2.4.18-chaos fs/ext3/namei.c +--- linux-2.4.18/fs/ext3/namei.c~ext3-compat-2.4.18-chaos 2003-08-28 20:14:27.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-08-28 20:14:27.000000000 +0400 +@@ -830,9 +830,9 @@ static int ext3_rmdir (struct inode * di + * recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); +- ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + + +_ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch new file mode 100644 index 0000000..a173981 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch @@ -0,0 +1,478 @@ + +Create a service thread to handle delete and truncate of inodes, to avoid +long latency while truncating very large files. + + + fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/super.c | 231 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 5 + include/linux/ext3_fs_sb.h | 10 + + 4 files changed, 362 insertions(+) + +--- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c Wed Jul 2 23:49:40 2003 +@@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ 
spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next == 0); /* thread zeroes the list head on exit; list_empty() would return early or never */ ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_delete; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) ++ goto out_delete; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_delete; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ sbi->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (new_inode && is_bad_inode(new_inode)) { /* iget() may return NULL; handled just below */ ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug("delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ goto out_delete; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&oei->i_orphan)); ++ ++ nei = EXT3_I(new_inode); ++ /* Ugh. 
We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). ++ */ ++ nei->i_orphan = oei->i_orphan; ++ nei->i_orphan.next->prev = &nei->i_orphan; ++ nei->i_orphan.prev->next = &nei->i_orphan; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_delete: ++ ext3_delete_inode(old_inode); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -403,6 +617,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,7 +666,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. 
Take it */ +@@ -511,6 +730,14 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif ++ + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +@@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +--- linux/fs/ext3/file.c.orig Fri Jan 17 10:57:31 2003 ++++ linux/fs/ext3/file.c Mon Jun 30 13:28:52 2003 +@@ -121,7 +121,11 @@ struct file_operations ext3_file_operati + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + }; + +--- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18 Wed Jul 2 23:13:58 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c Wed Jul 2 23:50:29 2003 +@@ -2004,6 +2004,118 @@ out_stop: + ext3_journal_stop(handle, inode); + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. 
This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. 
++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. 
+--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:20 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h Wed Jul 2 23:19:09 2003 +@@ -190,6 +190,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -317,6 +318,7 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, +--- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 ++++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h Wed Jul 2 23:19:09 2003 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 32 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -74,6 +76,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; 
++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch new file mode 100644 index 0000000..d0c315b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch @@ -0,0 +1,1831 @@ + fs/ext3/Makefile | 3 + fs/ext3/extents.c | 1573 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 26 + fs/ext3/super.c | 9 + include/linux/ext3_fs.h | 18 + include/linux/ext3_fs_i.h | 4 + include/linux/ext3_fs_sb.h | 10 + 8 files changed, 1641 insertions(+), 6 deletions(-) + +diff -puN /dev/null fs/ext3/extents.c +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c 2003-08-25 21:11:58.000000000 +0400 +@@ -0,0 +1,1573 @@ ++/* ++ * ++ * linux/fs/ext3/extents.c ++ * ++ * Extents support for EXT3 ++ * ++ * 07/08/2003 Alex Tomas ++ * ++ * TODO: ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - error handling ++ * - we could leak allocated block in some error cases ++ * - quick search for index/leaf in ext3_ext_find_extent() ++ * - tree reduction ++ * - cache last found extent ++ * - arch-independent ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if EXT_DEBUG defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(inode,fmt,a...) 
\ ++do { \ ++ if (test_opt((inode)->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(inode,fmt,a...) ++#endif ++ ++#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++}; ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < 
(__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_get_write_access(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_dirty_metadata(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static inline int ext3_ext_space_block(struct inode *inode) ++{ ++ int size; ++ ++ size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode_idx(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct 
ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int k, l = path->p_depth; ++ ++ ext_debug(inode, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(inode, " %d->%d", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ } else if (path->p_ext) { ++ ext_debug(inode, " %d:%d:%d", ++ path->p_ext->e_block, ++ path->p_ext->e_start, ++ path->p_ext->e_num); ++ } else ++ ext_debug(inode, " []"); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh = path[depth].p_hdr; ++ struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); ++ int i; ++ ++ for (i = 0; i < eh->e_num; i++, ex++) { ++ ext_debug(inode, "%d:%d:%d ", ++ ex->e_block, ex->e_start, ex->e_num); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ depth = path->p_depth; ++ /* try to find previous block */ ++ if (path[depth].p_ext) ++ return path[depth].p_ext->e_start + ++ path[depth].p_ext->e_num - 1; ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour; ++} ++ ++static struct ext3_ext_path * ++ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ struct ext3_extent_header *eh = (void *) ei->i_data; ++ struct ext3_extent_idx *ix; ++ struct buffer_head *bh; ++ struct ext3_extent *ex; ++ int depth, i, k, ppos = 0; ++ ++ eh = (struct ext3_extent_header *) ei->i_data; ++ ++ /* initialize capacity of leaf in inode for first time */ ++ if (eh->e_max == 0) ++ eh->e_max = ext3_ext_space_inode(inode); ++ i = depth = ei->i_depth; ++ EXT_ASSERT(i == 0 || eh->e_num > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(inode, "depth %d: num %d, max %d\n", ++ ppos, eh->e_num, eh->e_max); ++ ix = EXT_FIRST_INDEX(eh); ++ if (eh->e_num) ++ path[ppos].p_idx = ix; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ ext_debug(inode, "index: %d -> %d\n", ++ ix->e_block, ix->e_leaf); ++ if (block < ix->e_block) ++ break; ++ path[ppos].p_idx = ix; ++ } ++ path[ppos].p_block = path[ppos].p_idx->e_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = (struct ext3_extent_header *) bh->b_data; ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = 
eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ex = EXT_FIRST_EXTENT(eh); ++ if (eh->e_num) ++ path[ppos].p_ext = ex; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ if (block < ex->e_block) ++ break; ++ path[ppos].p_ext = ex; ++ } ++ ++ ext3_ext_show_path(inode, path); ++ ++ return path; ++} ++ ++static void ext3_ext_check_boundary(struct inode *inode, ++ struct ext3_ext_path *curp, ++ void *addr, int len) ++{ ++ void *end; ++ ++ if (!len) ++ return; ++ if (curp->p_bh) ++ end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; ++ else ++ end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); ++ if (((unsigned long) addr) + len > (unsigned long) end) { ++ printk("overflow! 0x%p > 0x%p\n", addr + len, end); ++ BUG(); ++ } ++ if ((unsigned long) addr < (unsigned long) curp->p_hdr) { ++ printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr); ++ BUG(); ++ } ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *curp, int logical, ++ int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->e_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->e_block) { ++ /* insert after */ ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 
0 : len; ++ ext_debug(inode, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->e_block = logical; ++ ix->e_leaf = ptr; ++ curp->p_hdr->e_num++; ++ ++ err = ext3_ext_dirty(handle, inode, curp); ++ ext3_std_error(inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].e_block; ++ ext_debug(inode, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->e_block; ++ ext_debug(inode, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. 
next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(inode, "allocate %d blocks for indexes and leaf\n", ++ depth - at); ++ ablocks[0] = newext->e_start++; ++ newext->e_num--; ++ for (a = 1; a < depth - at; a++) { ++ newblock = ext3_new_block(handle, inode, newext->e_start, ++ 0, 0, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 0; ++ neh->e_max = ext3_ext_space_block(inode); ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->e_num == ++ path[depth].p_hdr->e_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(inode, "move %d:%d:%d in new leaf\n", ++ path[depth].p_ext->e_block, ++ path[depth].p_ext->e_start, ++ path[depth].p_ext->e_num); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->e_num++; ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ goto cleanup; ++ path[depth].p_hdr->e_num -= m; ++ if ((err = ext3_ext_dirty(handle, inode, 
path))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(inode, ++ "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 1; ++ neh->e_max = ext3_ext_space_block(inode); ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->e_block = border; ++ fidx->e_leaf = oldblock; ++ ++ ext_debug(inode, ++ "int.index at %d (block %u): %d -> %d\n", ++ i, (unsigned) newblock, ++ (int) border, ++ (int) oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= ++ EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(inode, "%d: move %d:%d in new index\n", ++ i, path[i].p_idx->e_block, ++ path[i].p_idx->e_leaf); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->e_num++; ++ m++; ++ } ++ ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle,inode,path+i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->e_num -= m; ++ err = ext3_ext_dirty(handle, inode, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, inode, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ 
unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, inode, ablocks[i], 1); ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct buffer_head *bh; ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ int len, err = 0; ++ long newblock; ++ ++ /* ++ * use already allocated by the called block for new root block ++ */ ++ newblock = newext->e_start++; ++ newext->e_num--; ++ ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ len = sizeof(struct ext3_extent_header) + ++ sizeof(struct ext3_extent) * curp->p_hdr->e_max; ++ EXT_ASSERT(len >= 0 && len < 4096); ++ memmove(bh->b_data, curp->p_hdr, len); ++ ++ /* set size of new block */ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_max = ext3_ext_space_block(inode); ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ goto out; ++ ++ curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); ++ curp->p_hdr->e_num = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ curp->p_idx->e_block = 
EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; ++ curp->p_idx->e_leaf = newblock; ++ ++ neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); ++ ++ EXT3_I(inode)->i_depth++; ++ err = ext3_ext_dirty(handle, inode, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_ext_path *curp; ++ int i = depth, err = 0; ++ long newblock = newext->e_start; ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, inode, path, newext, i); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, inode, path, newext); ++ } ++ ++ if (!err) { ++ /* refill path */ ++ ext3_ext_drop_refs(inode, path); ++ path = ext3_ext_find_extent(inode, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * probably we've used some blocks from extent ++ * let's allocate new block for it ++ */ ++ if (newext->e_num == 0 && !err) { ++ newext->e_start = ++ ext3_new_block(handle, inode, newblock, ++ 0, 0, &err); ++ newext->e_num = 1; ++ } ++ } ++ ++ return err; ++} ++ ++/* ++ * returns next allocated block or 0xffffffff ++ * NOTE: it consider block number from index entry as ++ * allocated 
block. thus, index entries have to be consistent
++ * with leafs
++ */
++static inline unsigned ext3_ext_next_allocated_block(struct inode *inode,
++						struct ext3_ext_path *path)
++{
++	int leaf, level;
++
++	EXT_ASSERT(path != NULL);
++	leaf = path->p_depth;
++
++	/* depth-0 tree that holds no extent: nothing allocated after us */
++	if (leaf == 0 && path->p_ext == NULL)
++		return 0xffffffff;
++
++	/* leaf level: the extent following the found one, if there is one */
++	if (path[leaf].p_ext != EXT_LAST_EXTENT(path[leaf].p_hdr))
++		return path[leaf].p_ext[1].e_block;
++
++	/* FIXME: what if index isn't full ?! */
++	/* index levels, bottom-up: first entry to the right of the path */
++	for (level = leaf - 1; level >= 0; level--) {
++		if (path[level].p_idx == EXT_LAST_INDEX(path[level].p_hdr))
++			continue;
++		return path[level].p_idx[1].e_block;
++	}
++
++	return 0xffffffff;
++}
++
++/*
++ * returns the logical block that starts the leaf to the right of the
++ * one 'path' points to, or 0xffffffff when there is no such leaf
++ */
++static unsigned ext3_ext_next_leaf_block(struct inode *inode,
++					struct ext3_ext_path *path)
++{
++	int level;
++
++	EXT_ASSERT(path != NULL);
++
++	/*
++	 * start one level above the leaf and climb towards the root.
++	 * for a zero-depth tree (no leaf blocks at all) the start level
++	 * is -1, so the loop body never runs
++	 */
++	for (level = path->p_depth - 1; level >= 0; level--) {
++		if (path[level].p_idx == EXT_LAST_INDEX(path[level].p_hdr))
++			continue;
++		return path[level].p_idx[1].e_block;
++	}
++
++	return 0xffffffff;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++/*
++ * 'path' must describe the leaf (path[i_depth]) whose first extent has
++ * just been changed. the key of the parent index entry is rewritten to
++ * the new first logical block, and the change is propagated upwards for
++ * as long as the rewritten entry is itself first in its index block.
++ * returns 0 or the error from ext3_ext_get_access()/ext3_ext_dirty()
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
++				struct ext3_ext_path *path)
++{
++	int depth = EXT3_I(inode)->i_depth;
++	struct ext3_extent_header *eh;
++	struct ext3_extent *ex;
++	long border;
++	int k, err = 0;
++
++	eh = path[depth].p_hdr;
++	ex = path[depth].p_ext;
++
++	EXT_ASSERT(ex);
++	EXT_ASSERT(eh);
++
++	if (depth == 0) {
++		/* there is no tree at all */
++		return 0;
++	}
++
++	if (ex != EXT_FIRST_EXTENT(eh)) {
++		/* we correct tree if first leaf got modified only */
++		return 0;
++	}
++
++	k = depth - 1;
++	border = path[depth].p_ext->e_block;
++	if ((err = ext3_ext_get_access(handle, inode, path + k)))
++		return err;
++	path[k].p_idx->e_block = border;
++	if ((err = ext3_ext_dirty(handle, inode, path + k)))
++		return err;
++
++	while (k--) {
++		/*
++		 * change all left-side indexes: level k has to follow only
++		 * if the entry just rewritten one level below (k + 1) is
++		 * the first entry of its block. testing level k itself
++		 * would rewrite keys (the root one in particular) that
++		 * still describe other, untouched subtrees
++		 */
++		if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr))
++			break;
++		if ((err = ext3_ext_get_access(handle, inode, path + k)))
++			break;
++		path[k].p_idx->e_block = border;
++		if ((err = ext3_ext_dirty(handle, inode, path + k)))
++			break;
++	}
++
++	return err;
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ *
++ * 'path' is the lookup result for newext->e_block. 'newext' may be
++ * modified: ext3_ext_create_new_leaf() can consume its block(s) for
++ * tree metadata, so the caller has to re-read newext->e_start.
++ * returns 0 on success or a negative errno
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
++				struct ext3_ext_path *path,
++				struct ext3_extent *newext)
++{
++	int depth, len;
++	struct ext3_extent_header * eh;
++	struct ext3_extent *ex;
++	struct ext3_extent *nearex; /* nearest extent */
++	struct ext3_ext_path *npath = NULL;
++	int fix_indexes = 0; /* new extent became first in its leaf */
++	int err, err2;
++
++	depth = EXT3_I(inode)->i_depth;
++	if ((ex = path[depth].p_ext)) {
++		/* try to insert block into found extent and return */
++		if (ex->e_block + ex->e_num == newext->e_block &&
++			ex->e_start + ex->e_num == newext->e_start) {
++#ifdef AGRESSIVE_TEST
++			if (ex->e_num >= 2)
++				goto repeat;
++#endif
++			if ((err = ext3_ext_get_access(handle, inode,
++							path + depth)))
++				return err;
++			ext_debug(inode, "append %d block to %d:%d (from %d)\n",
++					newext->e_num, ex->e_block, ex->e_num,
++					ex->e_start);
++			ex->e_num += newext->e_num;
++			err = ext3_ext_dirty(handle, inode, path + depth);
++			return err;
++		}
++	}
++
++repeat:
++	/* depth may have grown in ext3_ext_create_new_leaf(), re-read it */
++	depth = EXT3_I(inode)->i_depth;
++	eh = path[depth].p_hdr;
++	if (eh->e_num == eh->e_max) {
++		/* probably next leaf has space for us? */
++		int next = ext3_ext_next_leaf_block(inode, path);
++		if (next != 0xffffffff) {
++			ext_debug(inode, "next leaf block - %d\n", next);
++			EXT_ASSERT(!npath);
++			npath = ext3_ext_find_extent(inode, next, NULL);
++			if (IS_ERR(npath))
++				return PTR_ERR(npath);
++			EXT_ASSERT(npath->p_depth == path->p_depth);
++			eh = npath[depth].p_hdr;
++			if (eh->e_num < eh->e_max) {
++				ext_debug(inode,
++					"next leaf has free ext(%d)\n",
++					eh->e_num);
++				path = npath;
++				goto repeat;
++			}
++			ext_debug(inode, "next leaf hasno free space(%d,%d)\n",
++				eh->e_num, eh->e_max);
++		}
++		/*
++		 * there is no free space in found leaf
++		 * we're gonna add new leaf in the tree
++		 *
++		 * NOTE(review): ext3_ext_create_new_leaf() refills 'path'
++		 * via ext3_ext_find_extent(), which kfree()s the array it
++		 * was given when a block read fails. the array may thus
++		 * already be gone when an error comes back - confirm
++		 * before touching it on the error path
++		 */
++		err = ext3_ext_create_new_leaf(handle, inode, path, newext);
++		if (err)
++			goto cleanup;
++		goto repeat;
++	}
++
++	nearex = path[depth].p_ext;
++
++	if ((err = ext3_ext_get_access(handle, inode, path + depth)))
++		goto cleanup;
++
++	if (!nearex) {
++		/* there is no extent in this leaf, create first one */
++		ext_debug(inode, "first extent in the leaf: %d:%d:%d\n",
++				newext->e_block, newext->e_start,
++				newext->e_num);
++		eh->e_num++;
++		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++
++	} else if (newext->e_block > nearex->e_block) {
++		/* open a slot right after nearex */
++		EXT_ASSERT(newext->e_block != nearex->e_block);
++		len = EXT_MAX_EXTENT(eh) - nearex;
++		len = (len - 1) * sizeof(struct ext3_extent);
++		len = len < 0 ? 0 : len;
++		ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, "
++				"move %d from 0x%p to 0x%p\n",
++				newext->e_block, newext->e_start, newext->e_num,
++				nearex, len, nearex + 1, nearex + 2);
++		ext3_ext_check_boundary(inode, path + depth, nearex + 2, len);
++		memmove(nearex + 2, nearex + 1, len);
++		path[depth].p_ext = nearex + 1;
++		eh->e_num++;
++	} else {
++		/* open a slot in front of nearex */
++		EXT_ASSERT(newext->e_block != nearex->e_block);
++		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++		len = len < 0 ? 0 : len;
++		ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, "
++				"move %d from 0x%p to 0x%p\n",
++				newext->e_block, newext->e_start, newext->e_num,
++				nearex, len, nearex, nearex + 1);
++		/* same sanity check the "after" branch does */
++		ext3_ext_check_boundary(inode, path + depth, nearex + 1, len);
++		memmove(nearex + 1, nearex, len);
++		path[depth].p_ext = nearex;
++		eh->e_num++;
++		fix_indexes = 1;
++	}
++
++	/*
++	 * fill the slot before correcting the indexes:
++	 * ext3_ext_correct_indexes() takes the new border from
++	 * path[depth].p_ext->e_block, which until now still holds the
++	 * copy of the extent that was shifted to the right
++	 */
++	nearex = path[depth].p_ext;
++	nearex->e_block = newext->e_block;
++	nearex->e_start = newext->e_start;
++	nearex->e_num = newext->e_num;
++
++	/* time to correct all indexes above */
++	if (fix_indexes)
++		err = ext3_ext_correct_indexes(handle, inode, path);
++
++	/*
++	 * the leaf has been changed in memory, so it is dirtied in any
++	 * case; an earlier error must not be hidden by this call though
++	 */
++	err2 = ext3_ext_dirty(handle, inode, path + depth);
++	if (!err)
++		err = err2;
++
++cleanup:
++	if (npath) {
++		ext3_ext_drop_refs(inode, npath);
++		kfree(npath);
++	}
++
++	return err;
++}
++
++/*
++ * maps logical block 'iblock' of 'inode' to a physical block, storing
++ * device and block number in 'bh_result'. if the block isn't covered by
++ * an extent and 'create' is set, a new block is allocated and inserted
++ * into the tree (BH_New gets set then); with 'create' unset 0 is
++ * returned and 'bh_result' is left without a block number.
++ * 'extend_disksize' is not used by this routine.
++ * the whole operation runs under i_ext_sem.
++ * returns 0 or a negative errno
++ */
++int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock,
++			struct buffer_head *bh_result, int create,
++			int extend_disksize)
++{
++	struct ext3_ext_path *path;
++	int depth;
++	struct ext3_extent newex;
++	struct ext3_extent *ex;
++	int goal, newblock, err = 0;
++
++	ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n",
++			(int) iblock, (unsigned) inode->i_ino, bh_result);
++	bh_result->b_state &= ~(1UL << BH_New);
++
++	down(&EXT3_I(inode)->i_ext_sem);
++
++	/* tree depth changes under i_ext_sem only, so sample it locked */
++	depth = EXT3_I(inode)->i_depth;
++
++	/* find extent for this block */
++	path = ext3_ext_find_extent(inode, iblock, NULL);
++	if (IS_ERR(path)) {
++		err = PTR_ERR(path);
++		/* nothing to release: an ERR_PTR is not a path array */
++		path = NULL;
++		goto out2;
++	}
++
++	if ((ex = path[depth].p_ext)) {
++		/* if found extent covers block, simply return it */
++		if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
++			newblock = iblock - ex->e_block + ex->e_start;
++			ext_debug(inode, "%d fit into %d:%d -> %d\n",
++					(int) iblock, ex->e_block, ex->e_num,
++					newblock);
++			goto out;
++		}
++	}
++
++	/*
++	 * we couldn't try to create block if create flag is zero
++	 */
++	if (!create)
++		goto out2;
++
++	/* allocate new block */
++	goal = ext3_ext_find_goal(inode, path);
++	newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++	if (!newblock)
++		goto out2;
++	ext_debug(inode, "allocate new block: goal %d, found %d\n",
++			goal, newblock);
++
++	/* try to insert new extent into found leaf and return */
++	newex.e_block = iblock;
++	newex.e_start = newblock;
++	newex.e_num = 1;
++	err = ext3_ext_insert_extent(handle, inode, path, &newex);
++	if (err)
++		goto out2;
++
++	/* previous routine could use block we allocated */
++	newblock = newex.e_start;
++	bh_result->b_state |= (1UL << BH_New);
++
++out:
++	ext3_ext_show_leaf(inode, path);
++	bh_result->b_dev = inode->i_dev;
++	bh_result->b_blocknr = newblock;
++out2:
++	if (path) {
++		ext3_ext_drop_refs(inode, path);
++		kfree(path);
++	}
++	up(&EXT3_I(inode)->i_ext_sem);
++
++	return err;
++}
++
++/*
++ * returns 1 if current index have to be freed (even partial)
++ *
++ * path->p_block is reused here as a scratch field: it remembers the
++ * number of entries seen at the previous call for this level
++ */
++static int ext3_ext_more_to_truncate(struct inode *inode,
++				struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	/* walked off the left edge of the index block: level is done */
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * if truncate on deeper level happened it wasn't partial
++	 * so we have to consider current index for truncation
++	 */
++	if (path->p_hdr->e_num == path->p_block)
++		return 0;
++
++	/*
++	 * put actual number of indexes to know is this number got
++	 * changed at the next iteration
++	 */
++	path->p_block = path->p_hdr->e_num;
++
++	return 1;
++}
++
++/*
++ * routine removes index from the index block
++ * it's used in truncate case only.
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_remove_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->e_num); ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); ++ ++ ext_debug(inode, "index is empty, remove it, free block %d\n", ++ path->p_idx->e_leaf); ++ return err; ++} ++ ++/* ++ * returns 1 if current extent needs to be freed (even partial) ++ * instead, returns 0 ++ */ ++int ext3_ext_more_leaves_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ struct ext3_extent *ex = path->p_ext; ++ int last_block; ++ ++ EXT_ASSERT(ex); ++ ++ /* is there leave in the current leaf? 
*/ ++ if (ex < EXT_FIRST_EXTENT(path->p_hdr)) ++ return 0; ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ if (last_block >= ex->e_block + ex->e_num) ++ return 0; ++ ++ /* seems it extent have to be freed */ ++ return 1; ++} ++ ++handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++/* ++ * this routine calculate max number of blocks to be modified ++ * while freeing extent and is intended to be used in truncate path ++ */ ++static int ext3_ext_calc_credits(struct inode *inode, ++ struct ext3_ext_path *path, ++ int num) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ int needed; ++ ++ /* ++ * extent couldn't cross group, so we will modify ++ * single bitmap block and single group descriptor ++ */ ++ needed = 2; ++ ++ /* ++ * if this is last extent in a leaf, then we have to ++ * free leaf block and remove pointer from index above. ++ * that pointer could be last in index block, so we'll ++ * have to remove it too. 
this way we could modify/free ++ * the whole path + root index (inode stored) will be ++ * modified ++ */ ++ if (!path || (num == path->p_ext->e_num && ++ path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) ++ needed += (depth * EXT3_ALLOC_NEEDED) + 1; ++ ++ return needed; ++} ++ ++/* ++ * core of the truncate procedure: ++ * - calculated what part of each extent in the requested leaf ++ * need to be freed ++ * - frees and forgets these blocks ++ * ++ * TODO: we could optimize and free several extents during ++ * single journal_restart()-journal_restart() cycle ++ */ ++static int ext3_ext_truncate_leaf(handle_t *handle, ++ struct inode *inode, ++ struct ext3_ext_path *path, ++ int depth) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ int last_block; ++ int i, err = 0, sf, num; ++ ++ ext_debug(inode, "level %d - leaf\n", depth); ++ if (!path->p_hdr) ++ path->p_hdr = ++ (struct ext3_extent_header *) path->p_bh->b_data; ++ ++ EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ path->p_ext = EXT_LAST_EXTENT(path->p_hdr); ++ while (ext3_ext_more_leaves_to_truncate(inode, path)) { ++ ++ /* what part of extent have to be freed? */ ++ sf = last_block > path->p_ext->e_block ? 
++ last_block : path->p_ext->e_block; ++ ++ /* number of blocks from extent to be freed */ ++ num = path->p_ext->e_block + path->p_ext->e_num - sf; ++ ++ /* calc physical first physical block to be freed */ ++ sf = path->p_ext->e_start + (sf - path->p_ext->e_block); ++ ++ i = ext3_ext_calc_credits(inode, path, num); ++ handle = ext3_ext_journal_restart(handle, i); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", ++ path->p_ext->e_block, path->p_ext->e_start, ++ path->p_ext->e_num, sf, num); ++ for (i = 0; i < num; i++) { ++ struct buffer_head *bh = ++ sb_get_hash_table(inode->i_sb, sf + i); ++ ext3_forget(handle, 0, inode, bh, sf + i); ++ } ++ ext3_free_blocks(handle, inode, sf, num); ++ ++ /* collect extents usage stats */ ++ spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ EXT3_SB(inode->i_sb)->s_ext_extents++; ++ EXT3_SB(inode->i_sb)->s_ext_blocks += num; ++ spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ ++ /* reduce extent */ ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_ext->e_num -= num; ++ if (path->p_ext->e_num == 0) ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ ++ path->p_ext--; ++ } ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (path->p_hdr->e_num == 0 && depth > 0) ++ err = ext3_ext_remove_index(handle, inode, path); ++ ++ return err; ++} ++ ++static void ext3_ext_collect_stats(struct inode *inode) ++{ ++ int depth; ++ ++ /* skip inodes with old good bitmap */ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return; ++ ++ /* collect on full truncate only */ ++ if (inode->i_size) ++ return; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) ++ EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; ++ if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) ++ EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; ++ 
EXT3_SB(inode->i_sb)->s_ext_sum += depth; ++ EXT3_SB(inode->i_sb)->s_ext_count++; ++ ++} ++ ++void ext3_ext_truncate(struct inode * inode) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct ext3_ext_path *path; ++ struct page * page; ++ handle_t *handle; ++ int i, depth, err = 0; ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ext3_ext_collect_stats(inode); ++ ++ /* ++ * We have to lock the EOF page here, because lock_page() nests ++ * outside journal_start(). ++ */ ++ if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { ++ /* Block boundary? Nothing to do */ ++ page = NULL; ++ } else { ++ page = grab_cache_page(mapping, ++ inode->i_size >> PAGE_CACHE_SHIFT); ++ if (!page) { ++ up(&EXT3_I(inode)->i_ext_sem); ++ return; ++ } ++ } ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ i = ext3_ext_calc_credits(inode, NULL, 0); ++ handle = ext3_journal_start(inode, i); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ up(&EXT3_I(inode)->i_ext_sem); ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, mapping, inode->i_size, page, ++ inode->i_sb->s_blocksize); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scanning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ i = 0; ++ depth = EXT3_I(inode)->i_depth; ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (!path) { /* kmalloc() reports failure as NULL */ ++ ext3_error(inode->i_sb, "ext3_ext_truncate", ++ "Can't allocate path array"); ++ goto out_stop; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path)
* (depth + 1)); ++ ++ path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_truncate_leaf(handle, inode, ++ path + i, i); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ path[i].p_hdr = ++ (struct ext3_extent_header *) path[i].p_bh->b_data; ++ ext_debug(inode, "initialize header\n"); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); ++ path[i].p_block = path[i].p_hdr->e_num + 1; ++ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->e_num); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_truncate(inode, path + i)) { ++ /* go to the next level */ ++ ext_debug(inode, "move to level %d (block %d)\n", i+1, ++ path[i].p_idx->e_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(inode->i_sb, ++ path[i].p_idx->e_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->e_num == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncate_leaf() ++ */ ++ err = ext3_ext_remove_index(handle, inode, ++ path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(inode, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->e_num == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct i_depth ++ */ ++ EXT3_I(inode)->i_depth = 0; ++ path->p_hdr->e_max = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ kfree(path); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. 
++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->i_ext_sem); ++ ext3_journal_stop(handle, inode); ++} ++ ++/* ++ * this routine calculates max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ int depth = ei->i_depth + 1; ++ int needed; ++ ++ /* ++ * the worst case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ /* caller wants to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of quota file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) ++ printk("EXT3-fs: file extents enabled\n"); ++ spin_lock_init(&EXT3_SB(sb)->s_ext_lock); ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ /* show collected stats */ ++ if (sbi->s_ext_count && sbi->s_ext_extents) ++ printk("EXT3-fs: min depth - %d, max depth - %d, " ++ "ave. depth - %d, ave. 
blocks/extent - %d\n", ++ sbi->s_ext_mindepth, ++ sbi->s_ext_maxdepth, ++ sbi->s_ext_sum / sbi->s_ext_count, ++ sbi->s_ext_blocks / sbi->s_ext_extents); ++} ++ +diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c +--- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-08-25 21:12:14.000000000 +0400 +@@ -571,6 +571,10 @@ repeat: + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = i; ++ if (test_opt(sb, EXTENTS)) ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ei->i_depth = 0; ++ sema_init(&ei->i_ext_sem, 1); + + if (ei->i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c +--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-08-25 20:09:59.000000000 +0400 +@@ -842,6 +842,15 @@ changed: + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, extend_disksize); ++} ++ + /* + * The BKL is not held on entry here.
+ */ +@@ -855,7 +864,7 @@ static int ext3_get_block(struct inode * + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 1); + return ret; + } +@@ -882,7 +891,7 @@ ext3_direct_io_get_block(struct inode *i + } + } + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + if (ret == 0) + bh_result->b_size = (1 << inode->i_blkbits); +@@ -904,7 +913,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1520,7 +1529,7 @@ ext3_block_truncate_page_prepare(struct + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, ++int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from, + struct page *page, unsigned blocksize) + { +@@ -2040,6 +2049,9 @@ void ext3_truncate(struct inode * inode) + */ + ei->i_disksize = inode->i_size; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode); ++ + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. +@@ -2436,6 +2448,8 @@ void ext3_read_inode(struct inode * inod + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = iloc.block_group; ++ ei->i_depth = raw_inode->osd2.linux2.l_i_depth; ++ sema_init(&ei->i_ext_sem, 1); + + /* + * NOTE! 
The in-memory inode i_data array is in little-endian order +@@ -2556,6 +2570,7 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_fsize = 0; + } + #endif ++ raw_inode->osd2.linux2.l_i_depth = ei->i_depth; + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); +@@ -2759,6 +2774,9 @@ int ext3_writepage_trans_blocks(struct i + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile +--- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-08-25 20:09:59.000000000 +0400 +@@ -12,7 +12,8 @@ O_TARGET := ext3.o + export-objs := ext3-exports.o + + obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o ++ ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o \ ++ extents.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-08-25 20:09:59.000000000 +0400 +@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_ext_release(sb); + ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -741,6 +742,12 @@ static int parse_options (char * options + else + #endif + ++ if (!strcmp (this_char, "extents")) ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ else ++ if (!strcmp (this_char, "extdebug")) ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ else + if (!strcmp (this_char, "bsddf")) + 
clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1711,6 +1718,8 @@ static int ext3_create_journal(struct su + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + ++ ext3_ext_init(sb); ++ + return 0; + } + +diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h +--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-08-25 21:12:14.000000000 +0400 +@@ -183,6 +183,7 @@ struct ext3_group_desc + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ +@@ -243,7 +244,7 @@ struct ext3_inode { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ +- __u16 i_pad1; ++ __u16 l_i_depth; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; +@@ -324,6 +325,8 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -663,6 +666,12 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++extern int ext3_block_truncate_page(handle_t 
*handle, ++ struct address_space *mapping, loff_t from, ++ struct page *page, unsigned blocksize); ++extern int ext3_forget(handle_t *handle, int is_metadata, ++ struct inode *inode, struct buffer_head *bh, ++ int blocknr); + #ifdef EXT3_DELETE_THREAD + extern void ext3_truncate_thread(struct inode *inode); + #endif +@@ -722,6 +731,13 @@ extern struct inode_operations ext3_dir_ + /* symlink.c */ + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); + + #endif /* __KERNEL__ */ + +diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h +--- linux-2.4.18-chaos/include/linux/ext3_fs_i.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_i.h 2003-08-25 20:09:59.000000000 +0400 +@@ -73,6 +73,10 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. 
+ */ + struct rw_semaphore truncate_sem; ++ ++ /* extents-related data */ ++ struct semaphore i_ext_sem; ++ __u16 i_depth; + }; + + #endif /* _LINUX_EXT3_FS_I */ +diff -puN include/linux/ext3_fs_sb.h~ext3-extents include/linux/ext3_fs_sb.h +--- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h 2003-08-25 20:09:59.000000000 +0400 +@@ -84,6 +84,16 @@ struct ext3_sb_info { + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; + #endif ++ ++ /* extents */ ++ int s_ext_debug; ++ int s_ext_mindepth; ++ int s_ext_maxdepth; ++ int s_ext_sum; ++ int s_ext_count; ++ spinlock_t s_ext_lock; ++ int s_ext_extents; ++ int s_ext_blocks; + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch new file mode 100644 index 0000000..c12e397 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-oflag-2.4.18-chaos.patch @@ -0,0 +1,291 @@ + fs/ext3/ialloc.c | 5 +++-- + fs/ext3/inode.c | 2 +- + fs/ext3/namei.c | 38 ++++++++++++++++++++++++++++++++++---- + include/asm-alpha/fcntl.h | 1 + + include/asm-arm/fcntl.h | 1 + + include/asm-cris/fcntl.h | 1 + + include/asm-i386/fcntl.h | 1 + + include/asm-ia64/fcntl.h | 1 + + include/asm-m68k/fcntl.h | 1 + + include/asm-mips/fcntl.h | 1 + + include/asm-mips64/fcntl.h | 1 + + include/asm-parisc/fcntl.h | 1 + + include/asm-ppc/fcntl.h | 1 + + include/asm-s390/fcntl.h | 1 + + include/asm-s390x/fcntl.h | 1 + + include/asm-sh/fcntl.h | 1 + + include/asm-sparc/fcntl.h | 1 + + include/asm-sparc64/fcntl.h | 1 + + include/linux/ext3_fs.h | 2 +- + 19 files changed, 54 insertions(+), 8 deletions(-) + +--- linux-2.4.18/fs/ext3/ialloc.c~ext3-extents-oflag-2.4.18-chaos 2003-09-08 23:12:48.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-08 23:12:56.000000000 +0400 +@@ 
-331,7 +331,8 @@ int ext3_itable_block_used(struct super_ + */ + struct inode * ext3_new_inode (handle_t *handle, + const struct inode * dir, int mode, +- unsigned long goal) ++ unsigned long goal, ++ struct lookup_intent *it) + { + struct super_block * sb; + struct buffer_head * bh; +@@ -573,7 +574,7 @@ repeat: + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = i; +- if (test_opt(sb, EXTENTS)) ++ if (test_opt(sb, EXTENTS) && it && (it->it_flags & O_EXTENTS)) + EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; + ei->i_depth = 0; + sema_init(&ei->i_ext_sem, 1); +--- linux-2.4.18/fs/ext3/namei.c~ext3-extents-oflag-2.4.18-chaos 2003-09-08 23:12:28.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-08 23:12:56.000000000 +0400 +@@ -1225,7 +1225,36 @@ static int ext3_create (struct inode * d + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode, +- (unsigned long)dentry->d_fsdata); ++ (unsigned long)dentry->d_fsdata, NULL); ++ err = PTR_ERR(inode); ++ if (!IS_ERR(inode)) { ++ inode->i_op = &ext3_file_inode_operations; ++ inode->i_fop = &ext3_file_operations; ++ inode->i_mapping->a_ops = &ext3_aops; ++ err = ext3_add_nondir(handle, dentry, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ext3_journal_stop(handle, dir); ++ return err; ++} ++ ++static int ext3_create_it (struct inode * dir, struct dentry * dentry, int mode, ++ struct lookup_intent *it) ++{ ++ handle_t *handle; ++ struct inode * inode; ++ int err; ++ ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext3_new_inode (handle, dir, mode, ++ (unsigned long)dentry->d_fsdata, it); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; +@@ -1254,7 +1283,7 @@ static int ext3_mknod (struct inode * di + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode, +- (unsigned 
long)dentry->d_fsdata); ++ (unsigned long)dentry->d_fsdata, NULL); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); +@@ -1285,7 +1314,7 @@ static int ext3_mkdir(struct inode * dir + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFDIR | mode, +- (unsigned long)dentry->d_fsdata); ++ (unsigned long)dentry->d_fsdata, NULL); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1678,7 +1707,7 @@ static int ext3_symlink (struct inode * + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO, +- (unsigned long)dentry->d_fsdata); ++ (unsigned long)dentry->d_fsdata, NULL); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1882,6 +1911,7 @@ end_rename: + * directories can handle most operations... + */ + struct inode_operations ext3_dir_inode_operations = { ++ create_it: ext3_create_it, /* BKL held */ + create: ext3_create, /* BKL held */ + lookup: ext3_lookup, /* BKL held */ + link: ext3_link, /* BKL held */ +--- linux-2.4.18/include/asm-alpha/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:07.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-alpha/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -22,6 +22,7 @@ + #define O_LARGEFILE 0400000 /* will be set by the kernel on every open */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ + #define O_DIRECT 02000000 /* direct disk access - should check with OSF/1 */ ++#define O_EXTENTS 04000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-arm/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:07.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-arm/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -21,6 +21,7 @@ + #define O_DIRECT 0200000 /* direct disk access hint - currently ignored */ + #define O_LARGEFILE 0400000 + #define O_ATOMICLOOKUP 01000000 ++#define O_EXTENTS 02000000 /* 
create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-cris/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2001-02-09 03:32:44.000000000 +0300 ++++ linux-2.4.18-alexey/include/asm-cris/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -22,6 +22,7 @@ + #define O_LARGEFILE 0100000 + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ ++#define O_EXTENTS 01000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get f_flags */ +--- linux-2.4.18/include/asm-i386/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:09.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-i386/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -21,6 +21,7 @@ + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-ia64/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:09.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-ia64/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -29,6 +29,7 @@ + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-m68k/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2000-11-28 05:00:49.000000000 +0300 ++++ linux-2.4.18-alexey/include/asm-m68k/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -20,6 +20,7 @@ + #define O_NOFOLLOW 0100000 /* don't follow links */ + #define O_DIRECT 0200000 /* direct disk 
access hint - currently ignored */ + #define O_LARGEFILE 0400000 ++#define O_EXTENTS 01000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-mips64/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:15.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-mips64/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -27,6 +27,7 @@ + #define O_DIRECTORY 0x10000 /* must be a directory */ + #define O_NOFOLLOW 0x20000 /* don't follow links */ + #define O_ATOMICLOOKUP 0x40000 ++#define O_EXTENTS 0x80000 /* create file with extents if possible */ + + #define O_NDELAY O_NONBLOCK + +--- linux-2.4.18/include/asm-mips/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:14.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-mips/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -27,6 +27,7 @@ + #define O_DIRECTORY 0x10000 /* must be a directory */ + #define O_NOFOLLOW 0x20000 /* don't follow links */ + #define O_ATOMICLOOKUP 0x40000 ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define O_NDELAY O_NONBLOCK + +--- linux-2.4.18/include/asm-parisc/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2000-12-05 23:29:39.000000000 +0300 ++++ linux-2.4.18-alexey/include/asm-parisc/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -19,6 +19,7 @@ + #define O_NOCTTY 00400000 /* not fcntl */ + #define O_DSYNC 01000000 /* HPUX only */ + #define O_RSYNC 02000000 /* HPUX only */ ++#define O_EXTENTS 04000000 /* create file with extents if possible */ + + #define FASYNC 00020000 /* fcntl, for BSD compatibility */ + #define O_DIRECT 00040000 /* direct disk access hint - currently ignored */ +--- linux-2.4.18/include/asm-ppc/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:15.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-ppc/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -24,6 +24,7 @@ + #define O_LARGEFILE 0200000 + #define O_DIRECT 0400000 /* direct disk 
access hint */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-s390/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:15.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-s390/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -28,6 +28,7 @@ + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-s390x/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:15.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-s390x/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -28,6 +28,7 @@ + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ + #define O_ATOMICLOOKUP 01000000 /* do atomic file lookup */ ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-sh/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:15.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-sh/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -21,6 +21,7 @@ + #define O_DIRECTORY 0200000 /* must be a directory */ + #define O_NOFOLLOW 0400000 /* don't follow links */ + #define O_ATOMICLOOKUP 01000000 ++#define O_EXTENTS 02000000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/asm-sparc64/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:16.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-sparc64/fcntl.h 2003-09-08 23:12:56.000000000
+0400 +@@ -22,6 +22,7 @@ + #define O_LARGEFILE 0x40000 + #define O_ATOMICLOOKUP 0x80000 /* do atomic file lookup */ + #define O_DIRECT 0x100000 /* direct disk access hint */ ++#define O_EXTENTS 0x200000 /* create file with extents if possible */ + + + #define F_DUPFD 0 /* dup */ +--- linux-2.4.18/include/asm-sparc/fcntl.h~ext3-extents-oflag-2.4.18-chaos 2003-07-28 17:52:16.000000000 +0400 ++++ linux-2.4.18-alexey/include/asm-sparc/fcntl.h 2003-09-08 23:12:56.000000000 +0400 +@@ -22,6 +22,7 @@ + #define O_LARGEFILE 0x40000 + #define O_ATOMICLOOKUP 0x80000 /* do atomic file lookup */ + #define O_DIRECT 0x100000 /* direct disk access hint */ ++#define O_EXTENTS 0x200000 /* create file with extents if possible */ + + #define F_DUPFD 0 /* dup */ + #define F_GETFD 1 /* get close_on_exec */ +--- linux-2.4.18/include/linux/ext3_fs.h~ext3-extents-oflag-2.4.18-chaos 2003-09-08 23:12:48.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-08 23:12:56.000000000 +0400 +@@ -641,7 +641,7 @@ extern int ext3_sync_file (struct file * + + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, +- unsigned long); ++ unsigned long, struct lookup_intent *); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext3_count_free_inodes (struct super_block *); +--- linux-2.4.18/fs/ext3/inode.c~ext3-extents-oflag-2.4.18-chaos 2003-09-08 23:12:48.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-08 23:13:15.000000000 +0400 +@@ -2204,7 +2204,7 @@ void ext3_truncate_thread(struct inode * + if (IS_ERR(handle)) + goto out_truncate; + +- new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0); ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0, 0); + if (IS_ERR(new_inode)) { + ext3_debug("truncate inode %lu directly (no new inodes)\n", + old_inode->i_ino); + +_ diff --git 
a/lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch b/lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch new file mode 100644 index 0000000..4695c4f --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-map_inode_page-2.6.0.patch @@ -0,0 +1,76 @@ + fs/ext3/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/super.c | 3 +++ + 2 files changed, 55 insertions(+) + +--- linux-2.6.0-test3/fs/ext3/inode.c~ext3-map_inode_page-2.6.0 2003-09-02 14:48:43.000000000 +0400 ++++ linux-2.6.0-test3-alexey/fs/ext3/inode.c 2003-09-08 17:50:16.000000000 +0400 +@@ -3129,3 +3129,55 @@ int ext3_prep_san_write(struct inode *in + ret = ret2; + return ret; + } ++ ++int ext3_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create) ++{ ++ unsigned int blocksize, blocks_per_page; ++ unsigned long iblock; ++ struct buffer_head dummy; ++ void *handle; ++ int i, rc = 0, failed = 0, needed_blocks; ++ ++ blocksize = inode->i_sb->s_blocksize; ++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ iblock = page->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); ++ ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ blocks[i] = ext3_bmap(inode->i_mapping, iblock); ++ if (blocks[i] == 0) { ++ failed++; ++ created[i] = -1; ++ } else { ++ created[i] = 0; ++ } ++ } ++ ++ if (failed == 0 || create == 0) ++ return 0; ++ ++ needed_blocks = ext3_writepage_trans_blocks(inode) * failed; ++ handle = ext3_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ iblock = page->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ if (blocks[i] != 0) ++ continue; ++ ++ rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ if (rc) { ++ printk(KERN_INFO "ext3_map_inode_page: error reading " ++ "block %ld\n", iblock); ++ goto out; ++ } ++ blocks[i] = dummy.b_blocknr; ++ created[i] = 1; ++ } ++
++ out: ++ ext3_journal_stop(handle); ++ return rc; ++} ++ +--- linux-2.6.0-test3/fs/ext3/super.c~ext3-map_inode_page-2.6.0 2003-09-02 14:48:43.000000000 +0400 ++++ linux-2.6.0-test3-alexey/fs/ext3/super.c 2003-09-08 17:48:33.000000000 +0400 +@@ -2094,6 +2094,9 @@ static void __exit exit_ext3_fs(void) + int ext3_prep_san_write(struct inode *inode, long *blocks, + int nblocks, loff_t newsize); + EXPORT_SYMBOL(ext3_prep_san_write); ++int ext3_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create); ++EXPORT_SYMBOL(ext3_map_inode_page); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + +_ diff --git a/lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch b/lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch new file mode 100644 index 0000000..37a5d7a --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch @@ -0,0 +1,15 @@ + fs/ext3/super.c | 1 - + 1 files changed, 1 deletion(-) + +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-no-write-super-chaos 2003-08-24 21:34:53.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-08-24 21:40:47.000000000 +0400 +@@ -1818,7 +1818,6 @@ void ext3_write_super (struct super_bloc + if (down_trylock(&sb->s_lock) == 0) + BUG(); + sb->s_dirt = 0; +- log_start_commit(EXT3_SB(sb)->s_journal, NULL); + } + + static int ext3_sync_fs(struct super_block *sb) + +_ diff --git a/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch new file mode 100644 index 0000000..f0b7d7e --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch @@ -0,0 +1,197 @@ + +Index: linux-2.4.20-rh/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20-rh.orig/fs/ext3/inode.c 2003-09-04 
18:01:41.000000000 +0800 ++++ linux-2.4.20-rh/fs/ext3/inode.c 2003-09-04 18:18:54.000000000 +0800 +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -743,9 +744,9 @@ + * The BKL may not be held on entry here. Be sure to take it early. + */ + +-static int ext3_get_block_handle(handle_t *handle, struct inode *inode, +- long iblock, +- struct buffer_head *bh_result, int create) ++static int ++ext3_get_block_handle(handle_t *handle, struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create, int extend_disksize) + { + int err = -EIO; + int offsets[4]; +@@ -825,15 +826,18 @@ + if (err) + goto cleanup; + +- new_size = inode->i_size; +- /* +- * This is not racy against ext3_truncate's modification of i_disksize +- * because VM/VFS ensures that the file cannot be extended while +- * truncate is in progress. It is racy between multiple parallel +- * instances of get_block, but we have the BKL. +- */ +- if (new_size > inode->u.ext3_i.i_disksize) +- inode->u.ext3_i.i_disksize = new_size; ++ if (extend_disksize) { ++ /* ++ * This is not racy against ext3_truncate's modification of ++ * i_disksize because VM/VFS ensures that the file cannot be ++ * extended while truncate is in progress. It is racy between ++ * multiple parallel instances of get_block, but we have BKL. 
++ */ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ new_size = inode->i_size; ++ if (new_size > ei->i_disksize) ++ ei->i_disksize = new_size; ++ } + + bh_result->b_state |= (1UL << BH_New); + goto got_it; +@@ -861,7 +865,38 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ++ ret = ext3_get_block_handle(handle, inode, iblock, ++ bh_result, create, 1); ++ return ret; ++} ++ ++#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) ++ ++static int ++ext3_direct_io_get_block(struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ handle_t *handle = journal_current_handle(); ++ int ret = 0; ++ ++ lock_kernel(); ++ if (handle && handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) { ++ /* ++ * Getting low on buffer credits... ++ */ ++ if (!ext3_journal_extend(handle, DIO_CREDITS)) { ++ /* ++ * Couldn't extend the transaction. Start a new one ++ */ ++ ret = ext3_journal_restart(handle, DIO_CREDITS); ++ } ++ } ++ if (ret == 0) ++ ret = ext3_get_block_handle(handle, inode, iblock, ++ bh_result, create, 0); ++ if (ret == 0) ++ bh_result->b_size = (1 << inode->i_blkbits); ++ unlock_kernel(); + return ret; + } + +@@ -879,7 +914,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); ++ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1387,6 +1422,67 @@ + return journal_try_to_free_buffers(journal, page, wait); + } + ++static int ++ext3_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, ++ unsigned long blocknr, int blocksize) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ handle_t *handle = NULL; ++ int ret; ++ int orphan = 0; ++ loff_t offset = blocknr << inode->i_blkbits; /* ugh */ 
++ ssize_t count = iobuf->length; /* ditto */ ++ ++ if (rw == WRITE) { ++ loff_t final_size = offset + count; ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, DIO_CREDITS); ++ unlock_kernel(); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ goto out; ++ } ++ if (final_size > inode->i_size) { ++ lock_kernel(); ++ ret = ext3_orphan_add(handle, inode); ++ unlock_kernel(); ++ if (ret) ++ goto out_stop; ++ orphan = 1; ++ ei->i_disksize = inode->i_size; ++ } ++ } ++ ++ ret = generic_direct_IO(rw, inode, iobuf, blocknr, ++ blocksize, ext3_direct_io_get_block); ++ ++out_stop: ++ if (handle) { ++ int err; ++ ++ lock_kernel(); ++ if (orphan) ++ ext3_orphan_del(handle, inode); ++ if (orphan && ret > 0) { ++ loff_t end = offset + ret; ++ if (end > inode->i_size) { ++ ei->i_disksize = end; ++ inode->i_size = end; ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (!ret) ++ ret = err; ++ } ++ } ++ err = ext3_journal_stop(handle, inode); ++ if (ret == 0) ++ ret = err; ++ unlock_kernel(); ++ } ++out: ++ return ret; ++ ++} + + struct address_space_operations ext3_aops = { + readpage: ext3_readpage, /* BKL not held. Don't need */ +@@ -1397,6 +1493,7 @@ + bmap: ext3_bmap, /* BKL held */ + flushpage: ext3_flushpage, /* BKL not held. Don't need */ + releasepage: ext3_releasepage, /* BKL not held. Don't need */ ++ direct_IO: ext3_direct_IO, /* BKL not held. 
Don't need */ + }; + + /* +@@ -2970,7 +3067,7 @@ + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { + ret = ext3_get_block_handle(handle, inode, blocks[i], +- &bh_tmp, 1); ++ &bh_tmp, 1, 1); + if (ret) + break; + +@@ -3030,7 +3127,7 @@ + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1); ++ rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch new file mode 100644 index 0000000..f8f514b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.4.18-chaos.patch @@ -0,0 +1,1238 @@ + fs/ext3/ialloc.c | 3 + fs/ext3/inode.c | 3 + fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 14 + + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 500 insertions(+), 109 deletions(-) + +--- linux-2.4.18/fs/ext3/namei.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-02 11:46:15.000000000 +0400 +@@ -52,6 +52,9 @@ static struct buffer_head *ext3_append(h + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -59,6 +62,8 @@ static struct buffer_head *ext3_append(h + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -135,6 +140,8 @@ struct dx_frame + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -143,6 +150,30 @@ struct 
dx_map_entry + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -154,7 +185,7 @@ static void dx_set_count (struct dx_entr + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -166,15 +197,18 @@ static void dx_sort_map(struct dx_map_en + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, int *err, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct 
inode *inode); ++static inline void *ext3_lock_htree(struct inode *, unsigned long, int); ++static inline void ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -307,6 +341,94 @@ struct stats dx_show_entries(struct dx_h + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ ++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? 
*/ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -316,19 +438,20 @@ struct stats dx_show_entries(struct dx_h + * back to userspace. + */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -344,8 +467,8 @@ dx_probe(struct dentry *dentry, struct i + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -357,7 +480,19 @@ dx_probe(struct dentry *dentry, struct i + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace 
(printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -365,56 +500,46 @@ dx_probe(struct dentry *dentry, struct i + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 
0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from begining if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ bh = frame->bh; ++ indirect++; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -428,8 +553,7 @@ static void dx_release (struct dx_frame + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ if (frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -471,8 +595,10 @@ static int ext3_htree_next_block(struct + * nodes need to be read. 
+ */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -498,13 +624,17 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, err))) + return -1; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -544,7 +674,7 @@ int ext3_htree_fill_tree(struct file *di + dir = dir_file->f_dentry->d_inode; + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + +@@ -626,7 +756,8 @@ static int dx_make_map (struct ext3_dir_ + count++; + } + /* XXX: do we need to check rec_len == 0 case? 
-Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -659,7 +790,8 @@ static void dx_sort_map (struct dx_map_e + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -671,6 +803,7 @@ static void dx_insert_block(struct dx_fr + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -753,7 +886,8 @@ static int inline search_dirblock(struct + + + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -769,6 +903,7 @@ static struct buffer_head * ext3_find_en + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -777,9 +912,10 @@ static struct buffer_head * ext3_find_en + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -788,8 +924,14 @@ static struct buffer_head * ext3_find_en + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -861,12 +1003,17 @@ cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -881,11 +1028,22 @@ static struct buffer_head * ext3_dx_find + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get splitted ++ * so, we need another leaf. 
check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -919,6 +1077,8 @@ static struct buffer_head * ext3_dx_find + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -931,6 +1091,7 @@ static struct dentry *ext3_lookup(struct + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + struct dentry *alternate = NULL; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); +@@ -938,10 +1099,11 @@ static struct dentry *ext3_lookup(struct + if (ext3_check_for_iopen(dir, dentry)) + return NULL; + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -984,7 +1146,8 @@ dx_move_dirents(char *from, char *to, st + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; +@@ -997,7 +1160,8 @@ dx_move_dirents(char *from, char *to, st + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1019,7 +1183,8 @@ 
static struct ext3_dir_entry_2* dx_pack_ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void **target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1066,23 +1231,30 @@ static struct ext3_dir_entry_2 *do_split + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1156,7 +1328,8 @@ static int add_dirent_to_buf(handle_t *h + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1214,7 +1387,8 @@ static int make_indexed_dir(handle_t *ha + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; +- ++ void *lock, *new_lock = NULL; ++ + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1225,7 +1399,6 @@ static int make_indexed_dir(handle_t *ha + } + root = (struct dx_root *) bh->b_data; + +- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); +@@ -1233,6 +1406,8 @@ static int make_indexed_dir(handle_t *ha + } + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + de = (struct ext3_dir_entry_2 *) &root->info; + len = ((char *) root) + blocksize - (char *) de; +@@ -1261,13 +1436,25 @@ static int make_indexed_dir(handle_t *ha + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, 
frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1296,11 +1483,13 @@ static int ext3_add_entry (handle_t *han + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1311,36 +1500,53 @@ static int ext3_add_entry (handle_t *han + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); +- if (retval != -ENOSPC) ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ 
return retval; ++ } + #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(rlen = blocksize); + nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -1352,15 +1558,28 @@ static int ext3_dx_add_entry(handle_t *h + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; +- +- frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; ++ ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ /* we're going to chage leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get splitted ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); + goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1373,6 +1592,35 @@ static int ext3_dx_add_entry(handle_t *h + goto cleanup; + } + ++ /* our leaf has no enough space. hence, we have to ++ * split it. 
so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? */ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ /* FIXME: error handling here */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1384,7 +1632,8 @@ static int ext3_dx_add_entry(handle_t *h + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1395,6 +1644,7 @@ static int ext3_dx_add_entry(handle_t *h + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1406,27 +1656,73 @@ static int ext3_dx_add_entry(handle_t *h + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); ++ void *ri_lock; ++ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = 
dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; +- ++ ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's important that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1435,38 +1731,61 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; ++ + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root.
it's already initialized */ ++ /* we may unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index split */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = 0; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1899,6 +2218,7 @@ static int ext3_rmdir (struct inode * di + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1906,7 +2226,7 @@ static int ext3_rmdir (struct inode * di + } + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -1917,14 +2237,19 @@ static int ext3_rmdir (struct inode * di + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -1957,6 +2282,7 @@ static int ext3_unlink(struct inode * di + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + 
if (IS_ERR(handle)) { +@@ -1967,7 +2293,7 @@ static int ext3_unlink(struct inode * di + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_unlink; + +@@ -1975,8 +2301,10 @@ static int ext3_unlink(struct inode * di + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -1985,6 +2313,7 @@ static int ext3_unlink(struct inode * di + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2106,6 +2435,7 @@ static int ext3_rename (struct inode * o + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2118,7 +2448,10 @@ static int ext3_rename (struct inode * o + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process +@@ -2131,7 +2464,7 @@ static int ext3_rename (struct inode * o + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2194,7 +2527,7 @@ static int ext3_rename (struct inode * o + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2237,6 +2570,14 @@ static int ext3_rename (struct inode * o + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2245,6 +2586,29 @@ end_rename: + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static inline void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static inline void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
+ */ + struct inode_operations ext3_dir_inode_operations = { +--- linux-2.4.18/fs/ext3/super.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/super.c 2003-09-02 12:46:29.000000000 +0400 +@@ -786,6 +786,8 @@ static int parse_options (char * options + return 0; + } + } ++ else if (!strcmp (this_char, "pdirops")) ++ set_opt (sbi->s_mount_opt, PDIROPS); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -812,6 +814,9 @@ static int parse_options (char * options + if (want_numeric(value, "sb", sb_block)) + return 0; + } ++ else if (!strcmp (this_char, "pdirops")) { ++ set_opt (sbi->s_mount_opt, PDIROPS); ++ } + #ifdef CONFIG_JBD_DEBUG + else if (!strcmp (this_char, "ro-after")) { + unsigned long v; +@@ -969,6 +974,10 @@ static int ext3_setup_super(struct super + ext3_check_inodes_bitmap (sb); + } + #endif ++#ifdef S_PDIROPS ++ if (test_opt (sb, PDIROPS)) ++ sb->s_flags |= S_PDIROPS; ++#endif + setup_ro_after(sb); + return res; + } +@@ -1463,6 +1472,11 @@ struct super_block * ext3_read_super (st + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ if (test_opt(sb, PDIROPS)) { ++ printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); ++ sb->s_flags |= S_PDIROPS; ++ } ++ + return sb; + + failed_mount3: +--- linux-2.4.18/include/linux/ext3_fs.h~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:06.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-02 11:46:15.000000000 +0400 +@@ -310,6 +310,7 @@ struct ext3_inode { + /* + * Mount flags + */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ +--- linux-2.4.18/include/linux/ext3_fs_i.h~ext3-pdirops-2.4.18-chaos 2003-08-29 11:57:30.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_fs_i.h 2003-09-02 11:46:15.000000000 +0400 +@@ -17,6 +17,7 @@ + #define _LINUX_EXT3_FS_I + + #include ++#include + + /* + * second extended file system inode data in memory +@@ -73,6 +74,11 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. 
+ */ + struct rw_semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ +--- linux-2.4.18/fs/ext3/inode.c~ext3-pdirops-2.4.18-chaos 2003-09-01 16:33:25.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-02 11:46:15.000000000 +0400 +@@ -2454,6 +2454,9 @@ void ext3_read_inode(struct inode * inod + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; +--- linux-2.4.18/fs/ext3/ialloc.c~ext3-pdirops-2.4.18-chaos 2003-09-01 14:58:05.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-02 11:46:15.000000000 +0400 +@@ -601,6 +601,9 @@ repeat: + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + return inode; + + fail: + +_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.18-2.patch b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch new file mode 100644 index 0000000..3d9a864 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch @@ -0,0 +1,422 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 4 + fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 13 + + fs/ext3/namei.c | 12 + + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 318 insertions(+), 1 deletion(-) + +--- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 
12:17:30.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt 2003-07-09 17:13:02.000000000 -0600 +@@ -35,6 +35,22 @@ resgid=n The group ID which may use th + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +--- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18 2003-07-09 17:12:12.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile 2003-07-09 17:13:15.000000000 -0600 +@@ -11,7 +11,7 @@ O_TARGET := ext3.o + + export-objs := super.o inode.o xattr.o ext3-exports.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + +--- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18 2003-07-09 17:11:19.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c 2003-07-09 17:13:02.000000000 -0600 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +--- 
/dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c 2003-07-09 17:13:02.000000000 -0600 +@@ -0,0 +1,259 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ spin_unlock(&dcache_lock); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = 
dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ /* verify this dentry is really new */ ++ assert(!de->d_inode); ++ assert(list_empty(&de->d_subdirs)); ++ assert(list_empty(&de->d_alias)); ++ ++ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) { ++ spin_unlock(&dcache_lock); ++ return NULL; ++ } ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del(&goal->d_hash); ++ list_add(&goal->d_hash, &de->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ spin_unlock(&dcache_lock); ++ ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. 
++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h 2003-07-09 17:13:02.000000000 -0600 +@@ -0,0 +1,13 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); +--- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c 2003-07-09 17:13:02.000000000 -0600 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. 
+@@ -703,16 +704,21 @@ cleanup_and_exit: + return NULL; + } + #endif ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -723,6 +729,12 @@ static struct dentry *ext3_lookup(struct + if (!inode) + return ERR_PTR(-EACCES); + } ++ ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ iput(inode); ++ return alternate; ++ } ++ + d_add(dentry, inode); + return NULL; + } +--- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/ext3/super.c 2003-07-09 17:13:02.000000000 -0600 +@@ -831,6 +831,17 @@ static int parse_options (char * options + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +--- linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h 2003-07-09 17:13:02.000000000 -0600 +@@ -321,6 +321,8 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define 
EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + +_ diff --git a/lustre/kernel_patches/patches/iopen-2.6.0.patch b/lustre/kernel_patches/patches/iopen-2.6.0.patch new file mode 100644 index 0000000..af67758 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.6.0.patch @@ -0,0 +1,403 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 3 + fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 15 ++ + fs/ext3/namei.c | 13 ++ + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 300 insertions(+), 1 deletion(-) + +--- linux-2.6.0-test1/Documentation/filesystems/ext2.txt~iopen-2.6.0 2002-11-11 06:28:06.000000000 +0300 ++++ linux-2.6.0-test1-alexey/Documentation/filesystems/ext2.txt 2003-08-24 13:02:02.000000000 +0400 +@@ -35,6 +35,22 @@ resgid=n The group ID which may use th + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+ + +--- linux-2.6.0-test1/fs/ext3/inode.c~iopen-2.6.0 2003-08-24 13:00:36.000000000 +0400 ++++ linux-2.6.0-test1-alexey/fs/ext3/inode.c 2003-08-24 13:02:02.000000000 +0400 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -2477,6 +2478,8 @@ void ext3_read_inode(struct inode * inod + ei->i_acl = EXT3_ACL_NOT_CACHED; + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif ++ if (ext3_iopen_get_inode(inode)) ++ return; + if (ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; + bh = iloc.bh; +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.6.0-test1-alexey/fs/ext3/iopen.c 2003-08-24 13:02:02.000000000 +0400 +@@ -0,0 +1,239 @@ ++ ++ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) ++{ ++ struct inode * inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ spin_unlock(&dcache_lock); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry * dentry, struct dentry * target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ 
old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ /* verify this dentry is really new */ ++ assert(!de->d_inode); ++ assert(list_empty(&de->d_subdirs)); ++ assert(list_empty(&de->d_alias)); ++ ++ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) { ++ spin_unlock(&dcache_lock); ++ return NULL; ++ } ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ hlist_add_before(&goal->d_hash, &de->d_hash); ++ hlist_del(&goal->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ ++ spin_unlock(&dcache_lock); ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. 
++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry) ++{ ++ struct inode * inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode * inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.6.0-test1-alexey/fs/ext3/iopen.h 2003-08-24 13:02:02.000000000 +0400 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode * inode); ++ ++ +--- linux-2.6.0-test1/fs/ext3/Makefile~iopen-2.6.0 2003-08-24 12:58:32.000000000 +0400 ++++ linux-2.6.0-test1-alexey/fs/ext3/Makefile 2003-08-24 13:02:40.000000000 +0400 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ iopen.o ioctl.o namei.o super.o symlink.o hash.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +--- linux-2.6.0-test1/fs/ext3/namei.c~iopen-2.6.0 2003-07-24 15:52:30.000000000 +0400 ++++ linux-2.6.0-test1-alexey/fs/ext3/namei.c 2003-08-24 13:02:02.000000000 +0400 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -970,15 +971,21 @@ errout: + } + #endif + ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -991,6 +998,12 @@ static struct dentry *ext3_lookup(struct + } + if (inode) + return d_splice_alias(inode, dentry); ++ ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ iput(inode); ++ return alternate; ++ } ++ + d_add(dentry, inode); + return NULL; + } +--- linux-2.6.0-test1/fs/ext3/super.c~iopen-2.6.0 2003-08-24 13:00:36.000000000 +0400 ++++ linux-2.6.0-test1-alexey/fs/ext3/super.c 2003-08-24 13:02:02.000000000 +0400 +@@ -755,6 +755,17 @@ static int parse_options (char * 
options + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +--- linux-2.6.0-test1/include/linux/ext3_fs.h~iopen-2.6.0 2003-08-24 12:58:57.000000000 +0400 ++++ linux-2.6.0-test1-alexey/include/linux/ext3_fs.h 2003-08-24 13:02:02.000000000 +0400 +@@ -324,6 +324,8 @@ struct ext3_inode { + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ ++#define EXT3_MOUNT_IOPEN 0x10000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x20000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + +_ diff --git a/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch new file mode 100644 index 0000000..c7d06a8 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26-2.patch @@ -0,0 +1,1775 @@ + fs/ext3/Makefile | 4 + fs/ext3/ext3-exports.c | 13 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 29 - + fs/ext3/namei.c | 8 + fs/ext3/super.c | 23 + fs/ext3/xattr.c | 1242 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 46 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 155 +++++ + include/linux/xattr.h | 15 + 11 files changed, 1494 insertions(+), 51 deletions(-) 
+ +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/fs/ext3/ext3-exports.c 2003-09-01 14:55:39.000000000 +0400 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); +--- linux-2.4.18/fs/ext3/ialloc.c~linux-2.4.18ea-0.8.26-2 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/ialloc.c 2003-09-01 14:55:39.000000000 +0400 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); ++ ext3_xattr_drop_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +--- linux-2.4.18/fs/ext3/inode.c~linux-2.4.18ea-0.8.26-2 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/inode.c 2003-09-01 14:55:39.000000000 +0400 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = EXT3_I(inode)->i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. 
+ */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1877,6 +1887,8 @@ void ext3_truncate(struct inode * inode) + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -2038,8 +2050,6 @@ int ext3_get_inode_loc (struct inode *in + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2166,10 +2176,7 @@ void ext3_read_inode(struct inode * inod + + brelse (iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2177,7 +2184,7 @@ void ext3_read_inode(struct inode * inod + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; +--- linux-2.4.18/fs/ext3/Makefile~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/Makefile 2003-09-01 14:55:50.000000000 +0400 +@@ -9,10 +9,10 @@ 
+ + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +--- linux-2.4.18/fs/ext3/namei.c~linux-2.4.18ea-0.8.26-2 2003-09-01 11:50:59.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-09-01 14:55:39.000000000 +0400 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1524,6 +1525,7 @@ static int ext3_add_nondir(handle_t *han + d_instantiate(dentry, inode); + return 0; + } ++ ext3_xattr_drop_inode(handle, inode); + ext3_dec_count(handle, inode); + iput(inode); + return err; +@@ -1612,7 +1614,7 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1620,7 +1622,6 @@ static int ext3_mkdir(struct inode * dir + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? 
*/ +@@ -1647,9 +1648,6 @@ static int ext3_mkdir(struct inode * dir + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +--- linux-2.4.18/fs/ext3/super.c~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/fs/ext3/super.c 2003-09-01 14:55:39.000000000 +0400 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -1743,18 +1745,27 @@ int ext3_statfs (struct super_block * sb + + static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); + +-static int __init init_ext3_fs(void) ++static void exit_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + +-static void __exit exit_ext3_fs(void) ++static int __init init_ext3_fs(void) + { +- unregister_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (!error) ++ error = init_ext3_xattr_user(); ++ if (!error) ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_fs(); ++ return error; + } + +-EXPORT_SYMBOL(ext3_force_commit); +-EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/fs/ext3/xattr.c 2003-09-01 
14:55:39.000000000 +0400 +@@ -0,0 +1,1242 @@ ++/* ++ * linux/fs/ext3/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Ext3 code with a lot of help from Eric Jarman . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD ++ * byte boundaries. The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT3_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. ++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext3_xattr_sem semaphore. 
This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++/* These symbols may be needed by a module. */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT3_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) 
++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) do {} while(0) ++# define ext3_xattr_rehash(header, entry) do {} while(0) ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline void ++ext3_xattr_lock(void) ++{ ++ down(&ext3_xattr_sem); ++} ++ ++static inline void ++ext3_xattr_unlock(void) ++{ ++ up(&ext3_xattr_sem); ++} ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext3_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext3_xattr_free_block(handle_t *handle, struct inode * inode, ++ unsigned long block) ++{ ++ ext3_free_blocks(handle, inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext3_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext3_xattr_free_block(handle, inode, block) \ ++ ext3_free_blocks(handle, inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; ++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ write_lock(&ext3_handler_lock); ++ if (!ext3_xattr_handlers[name_index-1]) { ++ ext3_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext3_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { /* same range check as ext3_xattr_register(); '||' was always true */ ++ write_lock(&ext3_handler_lock); ++ ext3_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext3_handler_lock); ++ } ++} ++ 
++static inline const char * ++strcmp_prefix(const char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static inline struct ext3_xattr_handler * ++ext3_xattr_resolve_name(const char **name) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext3_handler_lock); ++ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) { ++ if (ext3_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext3_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext3_xattr_handlers[i]; ++ *name = n; ++ break; ++ } ++ } ++ } ++ read_unlock(&ext3_handler_lock); ++ return handler; ++} ++ ++static inline struct ext3_xattr_handler * ++ext3_xattr_handler(int name_index) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ read_lock(&ext3_handler_lock); ++ handler = ext3_xattr_handlers[name_index-1]; ++ read_unlock(&ext3_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext3_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_setxattr(struct dentry *dentry, const char *name, ++ void *value, size_t size, int flags) ++{ ++ struct ext3_xattr_handler *handler; ++ struct 
inode *inode = dentry->d_inode; ++ ++ if (size == 0) ++ value = ""; /* empty EA, do not remove */ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find 
named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) { ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len) + 1; ++ } ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) { ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ *buf++ = '\0'; ++ } ++ } ++ error = size; ++ ++cleanup: ++ 
brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. 
++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ ext3_xattr_lock(); ++ ++ if (EXT3_I(inode)->i_file_acl) { ++ /* The inode already has an extended attribute block. */ ++ int block = EXT3_I(inode)->i_file_acl; ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/
++			last = ENTRY(header+1);
++			while (!IS_LAST_ENTRY(last)) {
++				int o = le16_to_cpu(last->e_value_offs);
++				if (!last->e_value_block && o < offs)
++					last->e_value_offs =
++						cpu_to_le16(o + size);
++				last = EXT3_XATTR_NEXT(last);
++			}
++		}
++		if (value == NULL) {
++			/* Remove this attribute. */
++			if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++				/* This block is now empty. */
++				error = ext3_xattr_set2(handle, inode, bh,NULL);
++				goto cleanup;
++			} else {
++				/* Remove the old name. */
++				int size = EXT3_XATTR_LEN(name_len);
++				last = ENTRY((char *)last - size);
++				memmove(here, (char*)here + size,
++					(char*)last - (char*)here);
++				memset(last, 0, size);
++			}
++		}
++	}
++
++	if (value != NULL) {
++		/* Insert the new value. */
++		here->e_value_size = cpu_to_le32(value_len);
++		if (value_len) {
++			size_t size = EXT3_XATTR_SIZE(value_len);
++			char *val = (char *)header + min_offs - size;
++			here->e_value_offs =
++				cpu_to_le16((char *)val - (char *)header);
++			memset(val + size - EXT3_XATTR_PAD, 0,
++			       EXT3_XATTR_PAD); /* Clear the pad bytes. */
++			memcpy(val, value, value_len);
++		}
++	}
++	ext3_xattr_rehash(header, here);
++
++	error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++	brelse(bh);
++	if (!(bh && header == HDR(bh)))
++		kfree(header);
++	ext3_xattr_unlock();
++
++	return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ *
++ * old_bh is the inode's current attribute block (NULL if it has none);
++ * header is the new block contents (NULL if the inode should end up
++ * without an attribute block).  A block for header is picked in this
++ * order: an identical block returned by ext3_xattr_cache_find() (quota
++ * is charged and its refcount is incremented), old_bh itself when header
++ * is old_bh's own data, or a newly allocated block that header is copied
++ * into.  i_file_acl is then set to the chosen block number (or 0), and
++ * old_bh, if it is no longer the block in use, is freed when its
++ * refcount is 1 or has its refcount decremented otherwise.
++ *
++ * Returns 0 or a negative error number.  new_bh is released here unless
++ * it is old_bh; old_bh itself is left for the caller to release.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++		struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++	struct super_block *sb = inode->i_sb;
++	struct buffer_head *new_bh = NULL;
++	int error;
++
++	if (header) {
++		new_bh = ext3_xattr_cache_find(inode, header);
++		if (new_bh) {
++			/*
++			 * We found an identical block in the cache.
++			 * The old block will be released after updating
++			 * the inode.
++			 */
++			ea_bdebug(old_bh, "reusing block %ld",
++				new_bh->b_blocknr);
++
++			error = -EDQUOT;
++			if (ext3_xattr_quota_alloc(inode, 1))
++				goto cleanup;
++
++			error = ext3_journal_get_write_access(handle, new_bh);
++			if (error)
++				goto cleanup;
++			HDR(new_bh)->h_refcount = cpu_to_le32(
++				le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++			ea_bdebug(new_bh, "refcount now=%d",
++				le32_to_cpu(HDR(new_bh)->h_refcount));
++		} else if (old_bh && header == HDR(old_bh)) {
++			/* Keep this block. */
++			new_bh = old_bh;
++			(void)ext3_xattr_cache_insert(new_bh);
++		} else {
++			/* We need to allocate a new block */
++			int force = EXT3_I(inode)->i_file_acl != 0;
++			int block = ext3_xattr_new_block(handle, inode,
++							 &error, force);
++			if (error)
++				goto cleanup;
++			ea_idebug(inode, "creating block %d", block);
++
++			new_bh = sb_getblk(sb, block);
++			if (!new_bh) {
++getblk_failed:			ext3_xattr_free_block(handle, inode, block);
++				error = -EIO;
++				goto cleanup;
++			}
++			lock_buffer(new_bh);
++			error = ext3_journal_get_create_access(handle, new_bh);
++			if (error) {
++				unlock_buffer(new_bh);
++				goto getblk_failed;
++			}
++			memcpy(new_bh->b_data, header, new_bh->b_size);
++			mark_buffer_uptodate(new_bh, 1);
++			unlock_buffer(new_bh);
++			(void)ext3_xattr_cache_insert(new_bh);
++			ext3_xattr_update_super_block(handle, sb);
++		}
++		error = ext3_journal_dirty_metadata(handle, new_bh);
++		if (error)
++			goto cleanup;
++	}
++
++	/* Update the inode. */
++	EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++	inode->i_ctime = CURRENT_TIME;
++	ext3_mark_inode_dirty(handle, inode);
++	if (IS_SYNC(inode))
++		handle->h_sync = 1;
++
++	error = 0;
++	if (old_bh && old_bh != new_bh) {
++		/*
++		 * If there was an old block, and we are not still using it,
++		 * we now release the old block.
++		 */
++		unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++		error = ext3_journal_get_write_access(handle, old_bh);
++		if (error)
++			goto cleanup;
++		if (refcount == 1) {
++			/* Free the old block. */
++			ea_bdebug(old_bh, "freeing");
++			ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++			/* ext3_forget() calls bforget() for us, but we
++			   let our caller release old_bh, so we need to
++			   duplicate the handle before. */
++			get_bh(old_bh);
++			ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++		} else {
++			/* Decrement the refcount only. */
++			refcount--;
++			HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++			ext3_xattr_quota_free(inode);
++			ext3_journal_dirty_metadata(handle, old_bh);
++			ea_bdebug(old_bh, "refcount now=%d", refcount);
++		}
++	}
++
++cleanup:
++	if (old_bh != new_bh)
++		brelse(new_bh);
++
++	return error;
++}
++
++/*
++ * ext3_xattr_drop_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ *
++ * Does nothing if the inode has no attribute block.  A block whose
++ * refcount is 1 is removed from the cache and freed; otherwise only its
++ * refcount is decremented.  On a read error or a bad h_magic/h_blocks
++ * value the problem is reported through ext3_error() and i_file_acl is
++ * left unchanged.
++ *
++ * NOTE(review): the return value of ext3_journal_get_write_access() is
++ * ignored below, so the buffer is modified even if that call fails --
++ * confirm whether this is intentional.
++ */
++void
++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode)
++{
++	struct buffer_head *bh;
++	unsigned int block = EXT3_I(inode)->i_file_acl;
++
++	if (!block)
++		return;
++	ext3_xattr_lock();
++
++	bh = sb_bread(inode->i_sb, block);
++	if (!bh) {
++		ext3_error(inode->i_sb, "ext3_xattr_drop_inode",
++			"inode %ld: block %d read error", inode->i_ino, block);
++		goto cleanup;
++	}
++	ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++	if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
++		ext3_error(inode->i_sb, "ext3_xattr_drop_inode",
++			"inode %ld: bad block %d", inode->i_ino, block);
++		goto cleanup;
++	}
++	ext3_journal_get_write_access(handle, bh);
++	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++		ext3_xattr_cache_remove(bh);
++		ext3_xattr_free_block(handle, inode, block);
++		ext3_forget(handle, 1, inode, bh, block);
++		bh = NULL;	/* bh was handed to ext3_forget(); skip brelse() at cleanup */
++	} else {
++		HDR(bh)->h_refcount = cpu_to_le32(
++			le32_to_cpu(HDR(bh)->h_refcount) - 1);
++		ext3_journal_dirty_metadata(handle, bh);
++		if (IS_SYNC(inode))
++			handle->h_sync = 1;
++		ext3_xattr_quota_free(inode);
++	}
++	EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++	brelse(bh);
++	ext3_xattr_unlock();
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ *
++ * With CONFIG_EXT3_FS_XATTR_SHARING it calls mb_cache_shrink() on
++ * ext3_xattr_cache for sb->s_dev; without it the function is empty.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++	mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * The entry is keyed by bh->b_dev / bh->b_blocknr and indexed by the
++ * block's h_hash.  -EBUSY from mb_cache_entry_insert() is logged as
++ * "already in cache" and reported as success; on any insert failure the
++ * freshly allocated entry is freed again.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++	struct mb_cache_entry *ce;
++	int error;
++
++	ce = mb_cache_entry_alloc(ext3_xattr_cache);
++	if (!ce)
++		return -ENOMEM;
++	error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++	if (error) {
++		mb_cache_entry_free(ce);
++		if (error == -EBUSY) {
++			ea_bdebug(bh, "already in cache (%d cache entries)",
++				atomic_read(&ext3_xattr_cache->c_entry_count));
++			error = 0;
++		}
++	} else {
++		ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++			  atomic_read(&ext3_xattr_cache->c_entry_count));
++		mb_cache_entry_release(ce);
++	}
++	return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++	       struct ext3_xattr_header *header2)
++{
++	struct ext3_xattr_entry *entry1, *entry2;
++
++	/* Walk both entry tables in lock step, starting right after the header. */
++	entry1 = ENTRY(header1+1);
++	entry2 = ENTRY(header2+1);
++	while (!IS_LAST_ENTRY(entry1)) {
++		if (IS_LAST_ENTRY(entry2))
++			return 1;
++		/*
++		 * e_hash is computed from the name bytes and the value only,
++		 * so e_name_index has to be compared explicitly; otherwise
++		 * two blocks whose attributes differ only in name index
++		 * would be treated as identical and shared.
++		 */
++		if (entry1->e_hash != entry2->e_hash ||
++		    entry1->e_name_index != entry2->e_name_index ||
++		    entry1->e_name_len != entry2->e_name_len ||
++		    entry1->e_value_size != entry2->e_value_size ||
++		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++			return 1;
++		/* Values stored in a separate block are not handled here. */
++		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++			return -EIO;
++		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++			   le32_to_cpu(entry1->e_value_size)))
++			return 1;
++
++		entry1 = EXT3_XATTR_NEXT(entry1);
++		entry2 = EXT3_XATTR_NEXT(entry2);
++	}
++	/* header1 is exhausted; header2 must end at the same point. */
++	if (!IS_LAST_ENTRY(entry2))
++		return 1;
++	return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++	__u32 hash = le32_to_cpu(header->h_hash);
++	struct mb_cache_entry *ce;
++
++	if (!header->h_hash)
++		return NULL;  /* never share */
++	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++	/* Try every cached block on this device that has the same block hash. */
++	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++	while (ce) {
++		struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++		if (!bh) {
++			ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++				"inode %ld: block %ld read error",
++				inode->i_ino, ce->e_block);
++		} else if (le32_to_cpu(HDR(bh)->h_refcount) >=
++			   EXT3_XATTR_REFCOUNT_MAX) {
++			/*
++			 * ext3_xattr_set2() adds one reference to the block
++			 * returned from here, so a block that is already at
++			 * EXT3_XATTR_REFCOUNT_MAX must be skipped as well;
++			 * testing with '>' let the count reach MAX + 1.
++			 */
++			ea_idebug(inode, "block %ld refcount %d>=%d",ce->e_block,
++				  le32_to_cpu(HDR(bh)->h_refcount),
++				  EXT3_XATTR_REFCOUNT_MAX);
++		} else if (!ext3_xattr_cmp(header, HDR(bh))) {
++			/* Identical block: hand bh (still referenced) to the caller. */
++			ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++			mb_cache_entry_release(ce);
++			return bh;
++		}
++		brelse(bh);
++		ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++	}
++	return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ *
++ * The entry is looked up by bh->b_dev / bh->b_blocknr; a block that has
++ * no cache entry is only logged.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++	struct mb_cache_entry *ce;
++
++	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++	if (ce) {
++		ea_bdebug(bh, "removing (%d cache entries remaining)",
++			  atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++		mb_cache_entry_free(ce);
++	} else
++		ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++					 struct ext3_xattr_entry *entry)
++{
++	__u32 hash = 0;
++	char *name = entry->e_name;	/* NOTE(review): plain char -- where char
++					 * is signed, name bytes >= 0x80 are
++					 * sign-extended before the XOR below;
++					 * confirm this is intended */
++	int n;
++
++	for (n=0; n < entry->e_name_len; n++) {	/* rotate left by NAME_HASH_SHIFT, mix in one name byte */
++		hash = (hash << NAME_HASH_SHIFT) ^
++		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++		       *name++;
++	}
++
++	if (entry->e_value_block == 0 && entry->e_value_size != 0) {	/* only values stored in this block */
++		__u32 *value = (__u32 *)((char *)header +
++			le16_to_cpu(entry->e_value_offs));
++		for (n = (le32_to_cpu(entry->e_value_size) +
++		     EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {	/* n = value size rounded up, in
++										 * units of 1 << EXT3_XATTR_PAD_BITS
++										 * bytes; one __u32 is read per step */
++			hash = (hash << VALUE_HASH_SHIFT) ^
++			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++			       le32_to_cpu(*value++);
++		}
++	}
++	entry->e_hash = cpu_to_le32(hash);	/* kept little-endian inside the entry */
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ *
++ * First refreshes e_hash of the given entry, then folds the e_hash of
++ * every entry in the block into header->h_hash (rotate left by
++ * BLOCK_HASH_SHIFT, XOR).  If any entry has e_hash == 0 the block hash
++ * is set to 0 as well.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++			      struct ext3_xattr_entry *entry)
++{
++	struct ext3_xattr_entry *here;
++	__u32 hash = 0;
++
++	ext3_xattr_hash_entry(header, entry);
++	here = ENTRY(header+1);
++	while (!IS_LAST_ENTRY(here)) {
++		if (!here->e_hash) {
++			/* Block is not shared if an entry's hash value == 0 */
++			hash = 0;
++			break;
++		}
++		hash = (hash << BLOCK_HASH_SHIFT) ^
++		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++		       le32_to_cpu(here->e_hash);
++		here = EXT3_XATTR_NEXT(here);
++	}
++	header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext3_xattr(void)
++{
++	ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++		sizeof(struct mb_cache_entry) +
++		sizeof(struct mb_cache_entry_index), 1, 61);
++	if (!ext3_xattr_cache)	/* mb_cache_create() returned NULL */
++		return -ENOMEM;
++
++	return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++	if (ext3_xattr_cache)
++		mb_cache_destroy(ext3_xattr_cache);
++	ext3_xattr_cache = NULL;	/* a later call then skips mb_cache_destroy() */
++}
++
++#else  /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++	return 0;	/* CONFIG_EXT3_FS_XATTR_SHARING is off in this branch: no block cache to create */
++}
++
++void
++exit_ext3_xattr(void)	/* no-op counterpart: nothing was created in this branch */
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
+--- linux-2.4.18/include/linux/ext3_fs.h~linux-2.4.18ea-0.8.26-2 2003-09-01 11:51:00.000000000 +0400
++++ linux-2.4.18-alexey/include/linux/ext3_fs.h 2003-09-01 14:55:39.000000000 +0400
+@@ -63,8 +63,6 @@
+  */
+ #define EXT3_BAD_INO 1 /* Bad blocks inode */
+ #define EXT3_ROOT_INO 2 /* Root inode */
+-#define EXT3_ACL_IDX_INO 3 /* ACL inode */
+-#define EXT3_ACL_DATA_INO 4 /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+ #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+ 
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header /* Header of Access Control Lists */
+-{
+-	__u32 aclh_size;
+-	__u32 aclh_file_count;
+-	__u32 aclh_acle_count;
+-	__u32 aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+-	__u32 acle_size;
+-	__u16 acle_perms; /* Access permissions */
+-	__u16 acle_type; /* Type of entry */
+-	__u16 acle_tag; /* User or group identity */
+-	__u16 acle_pad1;
+-	__u32 acle_next; /* Pointer on next entry for the */
+-	/* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+@@ -521,7 +496,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+ 
+-#define EXT3_FEATURE_COMPAT_SUPP 0
++#define EXT3_FEATURE_COMPAT_SUPP
EXT3_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -623,6 +598,24 @@ struct dx_hash_info + #define HASH_NB_ALWAYS 1 + + ++/* Defined for extended attributes */ ++#define CONFIG_EXT3_FS_XATTR y ++#ifndef ENOATTR ++#define ENOATTR ENODATA /* No such attribute */ ++#endif ++#ifndef ENOTSUP ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++#endif ++#ifndef XATTR_NAME_MAX ++#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */ ++#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */ ++#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */ ++#endif ++#ifndef XATTR_CREATE ++#define XATTR_CREATE 1 /* set value, fail if attr already exists */ ++#define XATTR_REPLACE 2 /* set value, fail if attr does not exist */ ++#endif ++ + /* + * Describe an inode's exact location on disk and in memory + */ +@@ -704,6 +697,7 @@ extern void ext3_check_inodes_bitmap (st + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +--- linux-2.4.18/include/linux/ext3_jbd.h~linux-2.4.18ea-0.8.26-2 2003-08-29 16:53:17.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/ext3_jbd.h 2003-09-01 14:55:39.000000000 +0400 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group and summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. 
This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota). The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+ 
+-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++				 EXT3_XATTR_TRANS_BLOCKS - 2)
+ 
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+ 
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-alexey/include/linux/ext3_xattr.h 2003-09-01 14:55:39.000000000 +0400
+@@ -0,0 +1,155 @@
++/*
++  File: linux/ext3_xattr.h
++
++  On-disk format of extended attributes for the ext3 filesystem.
++
++  (C) 2001 Andreas Gruenbacher,
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC		0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX		1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX			10
++#define EXT3_XATTR_INDEX_USER			1
++
++struct ext3_xattr_header {
++	__u32	h_magic;	/* magic number for identification */
++	__u32	h_refcount;	/* reference count */
++	__u32	h_blocks;	/* number of disk blocks used */
++	__u32	h_hash;		/* hash value of all attributes */
++	__u32	h_reserved[4];	/* zero right now */
++};
++
++struct ext3_xattr_entry {
++	__u8	e_name_len;	/* length of name */
++	__u8	e_name_index;	/* attribute name index */
++	__u16	e_value_offs;	/* offset in disk block of value */
++	__u32	e_value_block;	/* disk block attribute is stored on (n/i) */
++	__u32	e_value_size;	/* size of attribute value */
++	__u32	e_hash;		/* hash value of name and value */
++	char	e_name[0];	/* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS		2
++#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)	/* alignment unit in bytes */
++#define EXT3_XATTR_ROUND		(EXT3_XATTR_PAD-1)	/* mask used to round up to PAD */
++#define EXT3_XATTR_LEN(name_len) /* size of an entry with this name length, PAD-rounded */ \
++	(((name_len) + EXT3_XATTR_ROUND + \
++	sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) /* entry that follows `entry' in the block */ \
++	( (struct ext3_xattr_entry *)( \
++	  (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) /* value size rounded up to PAD */ \
++	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++	char *prefix;
++	size_t (*list)(char *list, struct inode *inode, const char *name,
++		       int name_len);
++	int (*get)(struct inode *inode, const char *name, void *buffer,
++		   size_t size);
++	int (*set)(struct inode *inode, const char *name, void *buffer,
++		   size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, void *, size_t, int);
++
++extern void ext3_xattr_drop_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr		NULL
++#  define ext3_getxattr		NULL
++#  define ext3_listxattr	NULL
++#  define ext3_removexattr	NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++	       void *buffer, size_t size)	/* same parameters as the extern prototype */
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t size)	/* same parameters as the extern prototype */
++{
++	return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++	       const char *name, void *value, size_t size, int flags)
++{
++	return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_drop_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT3_FS_XATTR */ ++ ++# ifdef CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-alexey/include/linux/xattr.h 2003-09-01 14:55:39.000000000 +0400 +@@ -0,0 +1,15 @@ ++/* ++ File: linux/xattr.h ++ ++ Extended attributes handling. ++ ++ Copyright (C) 2001 by Andreas Gruenbacher ++ Copyright (C) 2001 SGI - Silicon Graphics, Inc ++*/ ++#ifndef _LINUX_XATTR_H ++#define _LINUX_XATTR_H ++ ++#define XATTR_CREATE 1 /* set value, fail if attr already exists */ ++#define XATTR_REPLACE 2 /* set value, fail if attr does not exist */ ++ ++#endif /* _LINUX_XATTR_H */ + +_ diff --git a/lustre/kernel_patches/patches/removepage-2.4.20.patch b/lustre/kernel_patches/patches/removepage-2.4.20.patch new file mode 100644 index 0000000..cc721e1 --- /dev/null +++ b/lustre/kernel_patches/patches/removepage-2.4.20.patch @@ -0,0 +1,28 @@ + include/linux/fs.h | 1 + + mm/filemap.c | 3 +++ + 2 files changed, 4 insertions(+) + +--- linux-2.4.20-b_llpmd-l24/include/linux/fs.h~removepage-2.4.20 2003-09-05 11:45:42.000000000 -0700 ++++ linux-2.4.20-b_llpmd-l24-zab/include/linux/fs.h 2003-09-05 11:46:25.000000000 -0700 +@@ -402,6 +402,7 @@ struct address_space_operations { + int (*releasepage) (struct page *, int); + #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ + int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); ++ void (*removepage)(struct page *); /* called when 
page gets removed from the inode */ + }; + + struct address_space { +--- linux-2.4.20-b_llpmd-l24/mm/filemap.c~removepage-2.4.20 2003-09-05 11:45:42.000000000 -0700 ++++ linux-2.4.20-b_llpmd-l24-zab/mm/filemap.c 2003-09-05 11:46:25.000000000 -0700 +@@ -95,6 +95,9 @@ static inline void remove_page_from_inod + { + struct address_space * mapping = page->mapping; + ++ if (mapping->a_ops->removepage) ++ mapping->a_ops->removepage(page); ++ + mapping->nrpages--; + list_del(&page->list); + page->mapping = NULL; + +_ diff --git a/lustre/kernel_patches/patches/removepage-2.6.0.patch b/lustre/kernel_patches/patches/removepage-2.6.0.patch new file mode 100644 index 0000000..268ca97 --- /dev/null +++ b/lustre/kernel_patches/patches/removepage-2.6.0.patch @@ -0,0 +1,28 @@ + include/linux/fs.h | 1 + + mm/filemap.c | 3 +++ + 2 files changed, 4 insertions(+) + +--- linux-2.6.0-test3-l25/include/linux/fs.h~removepage-2.6.0 2003-09-05 15:31:52.000000000 -0700 ++++ linux-2.6.0-test3-l25-zab/include/linux/fs.h 2003-09-08 10:47:30.000000000 -0700 +@@ -311,6 +311,7 @@ struct address_space_operations { + int (*releasepage) (struct page *, int); + int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); ++ void (*removepage)(struct page *); /* called when page gets removed from the inode */ + }; + + struct backing_dev_info; +--- linux-2.6.0-test3-l25/mm/filemap.c~removepage-2.6.0 2003-08-08 21:34:39.000000000 -0700 ++++ linux-2.6.0-test3-l25-zab/mm/filemap.c 2003-09-08 10:48:10.000000000 -0700 +@@ -81,6 +81,9 @@ void __remove_from_page_cache(struct pag + { + struct address_space *mapping = page->mapping; + ++ if (mapping->a_ops->removepage) ++ mapping->a_ops->removepage(page); ++ + radix_tree_delete(&mapping->page_tree, page->index); + list_del(&page->list); + page->mapping = NULL; + +_ diff --git a/lustre/kernel_patches/patches/uml-2.6.0-fix.patch b/lustre/kernel_patches/patches/uml-2.6.0-fix.patch new file mode 100644 index 0000000..2910f97 
--- /dev/null +++ b/lustre/kernel_patches/patches/uml-2.6.0-fix.patch @@ -0,0 +1,19 @@ + include/asm-um/unistd.h | 2 ++ + 1 files changed, 2 insertions(+) + +diff -puN include/asm-um/unistd.h~uml-2.6.0-fix include/asm-um/unistd.h +--- linux-2.6.0-test3/include/asm-um/unistd.h~uml-2.6.0-fix 2003-09-04 18:39:45.000000000 +0400 ++++ linux-2.6.0-test3-alexey/include/asm-um/unistd.h 2003-09-04 18:39:59.000000000 +0400 +@@ -6,8 +6,10 @@ + #ifndef _UM_UNISTD_H_ + #define _UM_UNISTD_H_ + ++#ifdef __KERNEL__ + #include "linux/resource.h" + #include "asm/uaccess.h" ++#endif + + extern long sys_open(const char *filename, int flags, int mode); + extern long sys_dup(unsigned int fildes); + +_ diff --git a/lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch b/lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch new file mode 100644 index 0000000..8ea5a43 --- /dev/null +++ b/lustre/kernel_patches/patches/uml-patch-2.6.0-test3-1.patch @@ -0,0 +1,8716 @@ +diff -Naur a/arch/um/Kconfig b/arch/um/Kconfig +--- a/arch/um/Kconfig Fri Aug 15 15:05:57 2003 ++++ b/arch/um/Kconfig Fri Aug 15 15:11:53 2003 +@@ -61,6 +61,20 @@ + + config NET + bool "Networking support" ++ help ++ Unless you really know what you are doing, you should say Y here. ++ The reason is that some programs need kernel networking support even ++ when running on a stand-alone machine that isn't connected to any ++ other computer. If you are upgrading from an older kernel, you ++ should consider updating your networking tools too because changes ++ in the kernel and the tools often go hand in hand. The tools are ++ contained in the package net-tools, the location and version number ++ of which are given in Documentation/Changes. ++ ++ For a general introduction to Linux networking, it is highly ++ recommended to read the NET-HOWTO, available from ++ . ++ + + source "fs/Kconfig.binfmt" + +@@ -85,6 +99,19 @@ + If you'd like to be able to work with files stored on the host, + say Y or M here; otherwise say N. 
+ ++config HPPFS ++ tristate "HoneyPot ProcFS" ++ help ++ hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc ++ entries to be overridden, removed, or fabricated from the host. ++ Its purpose is to allow a UML to appear to be a physical machine ++ by removing or changing anything in /proc which gives away the ++ identity of a UML. ++ ++ See http://user-mode-linux.sf.net/hppfs.html for more information. ++ ++ You only need this if you are setting up a UML honeypot. Otherwise, ++ it is safe to say 'N' here. + + config MCONSOLE + bool "Management console" +@@ -105,6 +132,16 @@ + config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on MCONSOLE ++ help ++ If you say Y here, you will have some control over the system even ++ if the system crashes for example during kernel debugging (e.g., you ++ will be able to flush the buffer cache to disk, reboot the system ++ immediately or dump some status information). This is accomplished ++ by pressing various keys while holding SysRq (Alt+PrintScreen). It ++ also works on a serial console (on PC hardware at least), if you ++ send a BREAK and then within 5 seconds a command keypress. The ++ keys are documented in Documentation/sysrq.txt. Don't say Y ++ unless you really know what this hack does. + + config HOST_2G_2G + bool "2G/2G host address space split" +@@ -159,6 +196,9 @@ + config HIGHMEM + bool "Highmem support" + ++config PROC_MM ++ bool "/proc/mm support" ++ + config KERNEL_STACK_ORDER + int "Kernel stack size order" + default 2 +@@ -239,6 +279,10 @@ + config PT_PROXY + bool "Enable ptrace proxy" + depends on XTERM_CHAN && DEBUG_INFO ++ help ++ This option enables a debugging interface which allows gdb to debug ++ the kernel without needing to actually attach to kernel threads. ++ If you want to do kernel debugging, say Y here; otherwise say N. 
+ + config GPROF + bool "Enable gprof support" +diff -Naur a/arch/um/Kconfig_block b/arch/um/Kconfig_block +--- a/arch/um/Kconfig_block Fri Aug 15 15:07:32 2003 ++++ b/arch/um/Kconfig_block Fri Aug 15 15:12:56 2003 +@@ -29,6 +29,20 @@ + wise choice too. In all other cases (for example, if you're just + playing around with User-Mode Linux) you can choose N. + ++# Turn this back on when the driver actually works ++# ++#config BLK_DEV_COW ++# tristate "COW block device" ++# help ++# This is a layered driver which sits above two other block devices. ++# One is read-only, and the other is a read-write layer which stores ++# all changes. This provides the illusion that the read-only layer ++# can be mounted read-write and changed. ++ ++config BLK_DEV_COW_COMMON ++ bool ++ default BLK_DEV_COW || BLK_DEV_UBD ++ + config BLK_DEV_LOOP + tristate "Loopback device support" + +diff -Naur a/arch/um/Kconfig_net b/arch/um/Kconfig_net +--- a/arch/um/Kconfig_net Fri Aug 15 15:06:52 2003 ++++ b/arch/um/Kconfig_net Fri Aug 15 15:12:43 2003 +@@ -1,5 +1,5 @@ + +-menu "Network Devices" ++menu "UML Network Devices" + depends on NET + + # UML virtual driver +@@ -176,73 +176,5 @@ + + Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" + +- +-# Below are hardware-independent drivers mirrored from +-# drivers/net/Config.in. It would be nice if Linux +-# had HW independent drivers separated from the other +-# but it does not. 
Until then each non-ISA/PCI arch +-# needs to provide it's own menu of network drivers +-config DUMMY +- tristate "Dummy net driver support" +- +-config BONDING +- tristate "Bonding driver support" +- +-config EQUALIZER +- tristate "EQL (serial line load balancing) support" +- +-config TUN +- tristate "Universal TUN/TAP device driver support" +- +-config ETHERTAP +- tristate "Ethertap network tap (OBSOLETE)" +- depends on EXPERIMENTAL && NETLINK +- +-config PPP +- tristate "PPP (point-to-point protocol) support" +- +-config PPP_MULTILINK +- bool "PPP multilink support (EXPERIMENTAL)" +- depends on PPP && EXPERIMENTAL +- +-config PPP_FILTER +- bool "PPP filtering" +- depends on PPP && FILTER +- +-config PPP_ASYNC +- tristate "PPP support for async serial ports" +- depends on PPP +- +-config PPP_SYNC_TTY +- tristate "PPP support for sync tty ports" +- depends on PPP +- +-config PPP_DEFLATE +- tristate "PPP Deflate compression" +- depends on PPP +- +-config PPP_BSDCOMP +- tristate "PPP BSD-Compress compression" +- depends on PPP +- +-config PPPOE +- tristate "PPP over Ethernet (EXPERIMENTAL)" +- depends on PPP && EXPERIMENTAL +- +-config SLIP +- tristate "SLIP (serial line) support" +- +-config SLIP_COMPRESSED +- bool "CSLIP compressed headers" +- depends on SLIP=y +- +-config SLIP_SMART +- bool "Keepalive and linefill" +- depends on SLIP=y +- +-config SLIP_MODE_SLIP6 +- bool "Six bit SLIP encapsulation" +- depends on SLIP=y +- + endmenu + +diff -Naur a/arch/um/Makefile b/arch/um/Makefile +--- a/arch/um/Makefile Fri Aug 15 15:07:18 2003 ++++ b/arch/um/Makefile Fri Aug 15 15:12:45 2003 +@@ -24,15 +24,17 @@ + # Have to precede the include because the included Makefiles reference them. 
+ SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \ + include/asm-um/sigcontext.h include/asm-um/processor.h \ +- include/asm-um/ptrace.h include/asm-um/arch-signal.h ++ include/asm-um/ptrace.h include/asm-um/arch-signal.h \ ++ include/asm-um/module.h + + ARCH_SYMLINKS = include/asm-um/arch $(ARCH_DIR)/include/sysdep $(ARCH_DIR)/os \ + $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h + + GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h + +-include $(ARCH_DIR)/Makefile-$(SUBARCH) +-include $(ARCH_DIR)/Makefile-os-$(OS) ++.PHONY: sys_prepare ++sys_prepare: ++ @: + + MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt + MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas +@@ -41,6 +43,9 @@ + include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y)) + endif + ++include $(ARCH_DIR)/Makefile-$(SUBARCH) ++include $(ARCH_DIR)/Makefile-os-$(OS) ++ + EXTRAVERSION := $(EXTRAVERSION)-1um + + ARCH_INCLUDE = -I$(ARCH_DIR)/include +@@ -52,14 +57,14 @@ + + CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ + -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \ +- $(MODE_INCLUDE) ++ -Dsigprocmask=kernel_sigprocmask $(MODE_INCLUDE) + + LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc + + SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000) + + ifeq ($(CONFIG_MODE_SKAS), y) +-$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++$(SYS_HEADERS) : $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h + endif + + include/linux/version.h: arch/$(ARCH)/Makefile +@@ -116,6 +121,7 @@ + + USER_CFLAGS := $(patsubst -I%,,$(CFLAGS)) + USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS)) ++USER_CFLAGS := $(patsubst -Dsigprocmask=kernel_sigprocmask,,$(USER_CFLAGS)) + USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \ + $(MODE_INCLUDE) + +@@ -123,9 +129,10 @@ + USER_CFLAGS += -D_GNU_SOURCE + + CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/uml.lds.s \ +- $(ARCH_DIR)/dyn_link.ld.s 
$(GEN_HEADERS) ++ $(ARCH_DIR)/dyn_link.ld.s $(ARCH_DIR)/include/uml-config.h \ ++ $(GEN_HEADERS) + +-$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c ++$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c sys_prepare + $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< + + archmrproper: +@@ -161,19 +168,23 @@ + $(ARCH_DIR)/os: + cd $(ARCH_DIR) && ln -sf os-$(OS) os + +-$(ARCH_DIR)/include/uml-config.h : ++$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h + sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@ + ++filechk_$(ARCH_DIR)/include/task.h := $(ARCH_DIR)/util/mk_task ++ + $(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task +- $< > $@ ++ $(call filechk,$@) ++ ++filechk_$(ARCH_DIR)/include/kern_constants.h := $(ARCH_DIR)/util/mk_constants + + $(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants +- $< > $@ ++ $(call filechk,$@) + +-$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h \ +- $(ARCH_DIR)/util FORCE ; ++$(ARCH_DIR)/util/mk_task $(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util \ ++ sys_prepare FORCE ; + + $(ARCH_DIR)/util: FORCE +- @$(call descend,$@,) ++ $(MAKE) -f scripts/Makefile.build obj=$@ + +-export SUBARCH USER_CFLAGS OS ++export SUBARCH USER_CFLAGS OS +diff -Naur a/arch/um/Makefile-i386 b/arch/um/Makefile-i386 +--- a/arch/um/Makefile-i386 Fri Aug 15 15:07:46 2003 ++++ b/arch/um/Makefile-i386 Fri Aug 15 15:13:14 2003 +@@ -16,22 +16,28 @@ + + SYS_HEADERS = $(SYS_DIR)/sc.h $(SYS_DIR)/thread.h + ++sys_prepare: $(SYS_DIR)/sc.h ++ + prepare: $(SYS_HEADERS) + ++filechk_$(SYS_DIR)/sc.h := $(SYS_UTIL_DIR)/mk_sc ++ + $(SYS_DIR)/sc.h: $(SYS_UTIL_DIR)/mk_sc +- $< > $@ ++ $(call filechk,$@) ++ ++filechk_$(SYS_DIR)/thread.h := $(SYS_UTIL_DIR)/mk_thread + + $(SYS_DIR)/thread.h: $(SYS_UTIL_DIR)/mk_thread +- $< > $@ ++ $(call filechk,$@) + +-$(SYS_UTIL_DIR)/mk_sc: FORCE ; +- @$(call descend,$(SYS_UTIL_DIR),$@) ++$(SYS_UTIL_DIR)/mk_sc: scripts/fixdep include/config/MARKER FORCE ; ++ +@$(call 
descend,$(SYS_UTIL_DIR),$@) + +-$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) FORCE ; +- @$(call descend,$(SYS_UTIL_DIR),$@) ++$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) sys_prepare FORCE ; ++ +@$(call descend,$(SYS_UTIL_DIR),$@) + + $(SYS_UTIL_DIR): include/asm FORCE +- @$(call descend,$@,) ++ +@$(call descend,$@,) + + sysclean : + rm -f $(SYS_HEADERS) +diff -Naur a/arch/um/Makefile-skas b/arch/um/Makefile-skas +--- a/arch/um/Makefile-skas Fri Aug 15 15:05:43 2003 ++++ b/arch/um/Makefile-skas Fri Aug 15 15:11:52 2003 +@@ -14,7 +14,7 @@ + LINK_SKAS = -Wl,-rpath,/lib + LD_SCRIPT_SKAS = dyn.lds.s + +-GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++GEN_HEADERS += $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h + +-$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h : +- $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h ++$(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h : ++ $(call descend,$(ARCH_DIR)/kernel/skas,$@) +diff -Naur a/arch/um/config.release b/arch/um/config.release +--- a/arch/um/config.release Fri Aug 15 15:09:05 2003 ++++ b/arch/um/config.release Fri Aug 15 15:13:48 2003 +@@ -228,7 +228,6 @@ + CONFIG_EXT2_FS=y + CONFIG_SYSV_FS=m + CONFIG_UDF_FS=m +-# CONFIG_UDF_RW is not set + CONFIG_UFS_FS=m + # CONFIG_UFS_FS_WRITE is not set + +diff -Naur a/arch/um/defconfig b/arch/um/defconfig +--- a/arch/um/defconfig Fri Aug 15 15:07:30 2003 ++++ b/arch/um/defconfig Fri Aug 15 15:12:54 2003 +@@ -6,7 +6,6 @@ + CONFIG_SWAP=y + CONFIG_UID16=y + CONFIG_RWSEM_GENERIC_SPINLOCK=y +-CONFIG_CONFIG_LOG_BUF_SHIFT=14 + + # + # Code maturity level options +@@ -116,7 +115,6 @@ + CONFIG_PACKET_MMAP=y + # CONFIG_NETLINK_DEV is not set + # CONFIG_NETFILTER is not set +-# CONFIG_FILTER is not set + CONFIG_UNIX=y + # CONFIG_NET_KEY is not set + CONFIG_INET=y +@@ -385,7 +383,6 @@ + # + # Disk-On-Chip Device Drivers + # +-# CONFIG_MTD_DOC1000 is not set + # CONFIG_MTD_DOC2000 is not set + # CONFIG_MTD_DOC2001 is not set + +diff -Naur 
a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile +--- a/arch/um/drivers/Makefile Fri Aug 15 15:06:42 2003 ++++ b/arch/um/drivers/Makefile Fri Aug 15 15:12:40 2003 +@@ -1,5 +1,5 @@ + # +-# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++# Copyright (C) 2000, 2002, 2003 Jeff Dike (jdike@karaya.com) + # Licensed under the GPL + # + +@@ -39,6 +39,8 @@ + obj-$(CONFIG_TTY_CHAN) += tty.o + obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o + obj-$(CONFIG_UML_WATCHDOG) += harddog.o ++obj-$(CONFIG_BLK_DEV_COW) += cow_kern.o ++obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o + + obj-y += stdio_console.o $(CHAN_OBJS) + +@@ -46,7 +48,7 @@ + + USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \ + null.o pty.o tty.o xterm.o +-USER_OBJS := $(foreach file,$(USER_OBJS),arch/um/drivers/$(file)) ++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +diff -Naur a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c +--- a/arch/um/drivers/chan_kern.c Fri Aug 15 15:09:13 2003 ++++ b/arch/um/drivers/chan_kern.c Fri Aug 15 15:13:51 2003 +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include + #include + #include "chan_kern.h" +diff -Naur a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c +--- a/arch/um/drivers/chan_user.c Fri Aug 15 15:03:46 2003 ++++ b/arch/um/drivers/chan_user.c Fri Aug 15 15:10:09 2003 +@@ -188,8 +188,8 @@ + if(!isatty(fd)) return; + + pid = tcgetpgrp(fd); +- if(!CHOOSE_MODE(is_tracer_winch(pid, fd, device_data), 0) && +- (pid == -1)){ ++ if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, ++ device_data) && (pid == -1)){ + thread = winch_tramp(fd, device_data, &thread_fd); + if(fd != -1){ + register_winch_irq(thread_fd, fd, thread, device_data); +diff -Naur a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h +--- a/arch/um/drivers/cow.h Wed Dec 31 19:00:00 1969 ++++ b/arch/um/drivers/cow.h Fri Aug 15 15:10:34 
2003 +@@ -0,0 +1,40 @@ ++#ifndef __COW_H__ ++#define __COW_H__ ++ ++#include ++ ++#if __BYTE_ORDER == __BIG_ENDIAN ++# define ntohll(x) (x) ++# define htonll(x) (x) ++#elif __BYTE_ORDER == __LITTLE_ENDIAN ++# define ntohll(x) bswap_64(x) ++# define htonll(x) bswap_64(x) ++#else ++#error "__BYTE_ORDER not defined" ++#endif ++ ++extern int init_cow_file(int fd, char *cow_file, char *backing_file, ++ int sectorsize, int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, int *data_offset_out); ++ ++extern int file_reader(__u64 offset, char *buf, int len, void *arg); ++extern int read_cow_header(int (*reader)(__u64, char *, int, void *), ++ void *arg, __u32 *magic_out, ++ char **backing_file_out, time_t *mtime_out, ++ __u64 *size_out, int *sectorsize_out, ++ int *bitmap_offset_out); ++ ++extern int write_cow_header(char *cow_file, int fd, char *backing_file, ++ int sectorsize, long long *size); ++ ++extern void cow_sizes(__u64 size, int sectorsize, int bitmap_offset, ++ unsigned long *bitmap_len_out, int *data_offset_out); ++ ++#endif ++ ++/* ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/arch/um/drivers/cow_kern.c b/arch/um/drivers/cow_kern.c +--- a/arch/um/drivers/cow_kern.c Wed Dec 31 19:00:00 1969 ++++ b/arch/um/drivers/cow_kern.c Fri Aug 15 15:13:51 2003 +@@ -0,0 +1,628 @@ ++#define COW_MAJOR 60 ++#define MAJOR_NR COW_MAJOR ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "2_5compat.h" ++#include "cow.h" ++#include "ubd_user.h" ++ ++#define COW_SHIFT 4 ++ ++struct cow { ++ int count; ++ char *cow_path; ++ dev_t cow_dev; ++ struct block_device *cow_bdev; ++ char *backing_path; ++ dev_t backing_dev; ++ struct block_device *backing_bdev; ++ int sectorsize; ++ unsigned long *bitmap; ++ unsigned long bitmap_len; ++ int bitmap_offset; ++ int data_offset; ++ devfs_handle_t 
devfs; ++ struct semaphore sem; ++ struct semaphore io_sem; ++ atomic_t working; ++ spinlock_t io_lock; ++ struct buffer_head *bh; ++ struct buffer_head *bhtail; ++ void *end_io; ++}; ++ ++#define DEFAULT_COW { \ ++ .count = 0, \ ++ .cow_path = NULL, \ ++ .cow_dev = 0, \ ++ .backing_path = NULL, \ ++ .backing_dev = 0, \ ++ .bitmap = NULL, \ ++ .bitmap_len = 0, \ ++ .bitmap_offset = 0, \ ++ .data_offset = 0, \ ++ .devfs = NULL, \ ++ .working = ATOMIC_INIT(0), \ ++ .io_lock = SPIN_LOCK_UNLOCKED, \ ++} ++ ++#define MAX_DEV (8) ++#define MAX_MINOR (MAX_DEV << COW_SHIFT) ++ ++struct cow cow_dev[MAX_DEV] = { [ 0 ... MAX_DEV - 1 ] = DEFAULT_COW }; ++ ++/* Not modified by this driver */ ++static int blk_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = BLOCK_SIZE }; ++static int hardsect_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 512 }; ++ ++/* Protected by cow_lock */ ++static int sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 0 }; ++ ++static struct hd_struct cow_part[MAX_MINOR] = ++ { [ 0 ... 
MAX_MINOR - 1 ] = { 0, 0, 0 } }; ++ ++/* Protected by io_request_lock */ ++static request_queue_t *cow_queue; ++ ++static int cow_open(struct inode *inode, struct file *filp); ++static int cow_release(struct inode * inode, struct file * file); ++static int cow_ioctl(struct inode * inode, struct file * file, ++ unsigned int cmd, unsigned long arg); ++static int cow_revalidate(kdev_t rdev); ++ ++static struct block_device_operations cow_blops = { ++ .open = cow_open, ++ .release = cow_release, ++ .ioctl = cow_ioctl, ++ .revalidate = cow_revalidate, ++}; ++ ++/* Initialized in an initcall, and unchanged thereafter */ ++devfs_handle_t cow_dir_handle; ++ ++#define INIT_GENDISK(maj, name, parts, shift, bsizes, max, blops) \ ++{ \ ++ .major = maj, \ ++ .major_name = name, \ ++ .minor_shift = shift, \ ++ .max_p = 1 << shift, \ ++ .part = parts, \ ++ .sizes = bsizes, \ ++ .nr_real = max, \ ++ .real_devices = NULL, \ ++ .next = NULL, \ ++ .fops = blops, \ ++ .de_arr = NULL, \ ++ .flags = 0 \ ++} ++ ++static spinlock_t cow_lock = SPIN_LOCK_UNLOCKED; ++ ++static struct gendisk cow_gendisk = INIT_GENDISK(MAJOR_NR, "cow", cow_part, ++ COW_SHIFT, sizes, MAX_DEV, ++ &cow_blops); ++ ++static int cow_add(int n) ++{ ++ struct cow *dev = &cow_dev[n]; ++ char name[sizeof("nnnnnn\0")]; ++ int err = -ENODEV; ++ ++ if(dev->cow_path == NULL) ++ goto out; ++ ++ sprintf(name, "%d", n); ++ dev->devfs = devfs_register(cow_dir_handle, name, DEVFS_FL_REMOVABLE, ++ MAJOR_NR, n << COW_SHIFT, S_IFBLK | ++ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, ++ &cow_blops, NULL); ++ ++ init_MUTEX_LOCKED(&dev->sem); ++ init_MUTEX(&dev->io_sem); ++ ++ return(0); ++ ++out: ++ return(err); ++} ++ ++/* ++* Add buffer_head to back of pending list ++*/ ++static void cow_add_bh(struct cow *cow, struct buffer_head *bh) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&cow->io_lock, flags); ++ if(cow->bhtail != NULL){ ++ cow->bhtail->b_reqnext = bh; ++ cow->bhtail = bh; ++ } ++ else { ++ cow->bh = bh; ++ cow->bhtail = 
bh; ++ } ++ spin_unlock_irqrestore(&cow->io_lock, flags); ++} ++ ++/* ++* Grab first pending buffer ++*/ ++static struct buffer_head *cow_get_bh(struct cow *cow) ++{ ++ struct buffer_head *bh; ++ ++ spin_lock_irq(&cow->io_lock); ++ bh = cow->bh; ++ if(bh != NULL){ ++ if(bh == cow->bhtail) ++ cow->bhtail = NULL; ++ cow->bh = bh->b_reqnext; ++ bh->b_reqnext = NULL; ++ } ++ spin_unlock_irq(&cow->io_lock); ++ ++ return(bh); ++} ++ ++static void cow_handle_bh(struct cow *cow, struct buffer_head *bh, ++ struct buffer_head **cow_bh, int ncow_bh) ++{ ++ int i; ++ ++ if(ncow_bh > 0) ++ ll_rw_block(WRITE, ncow_bh, cow_bh); ++ ++ for(i = 0; i < ncow_bh ; i++){ ++ wait_on_buffer(cow_bh[i]); ++ brelse(cow_bh[i]); ++ } ++ ++ ll_rw_block(WRITE, 1, &bh); ++ brelse(bh); ++} ++ ++static struct buffer_head *cow_new_bh(struct cow *dev, int sector) ++{ ++ struct buffer_head *bh; ++ ++ sector = (dev->bitmap_offset + sector / 8) / dev->sectorsize; ++ bh = getblk(dev->cow_dev, sector, dev->sectorsize); ++ memcpy(bh->b_data, dev->bitmap + sector / (8 * sizeof(dev->bitmap[0])), ++ dev->sectorsize); ++ return(bh); ++} ++ ++/* Copied from loop.c, needed to avoid deadlocking in make_request. 
*/ ++ ++static int cow_thread(void *data) ++{ ++ struct cow *dev = data; ++ struct buffer_head *bh; ++ ++ daemonize(); ++ exit_files(current); ++ ++ sprintf(current->comm, "cow%d", dev - cow_dev); ++ ++ spin_lock_irq(&current->sigmask_lock); ++ sigfillset(&current->blocked); ++ flush_signals(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ ++ atomic_inc(&dev->working); ++ ++ current->policy = SCHED_OTHER; ++ current->nice = -20; ++ ++ current->flags |= PF_NOIO; ++ ++ /* ++ * up sem, we are running ++ */ ++ up(&dev->sem); ++ ++ for(;;){ ++ int start, len, nbh, i, update_bitmap = 0; ++ struct buffer_head *cow_bh[2]; ++ ++ down_interruptible(&dev->io_sem); ++ /* ++ * could be upped because of tear-down, not because of ++ * pending work ++ */ ++ if(!atomic_read(&dev->working)) ++ break; ++ ++ bh = cow_get_bh(dev); ++ if(bh == NULL){ ++ printk(KERN_ERR "cow: missing bh\n"); ++ continue; ++ } ++ ++ start = bh->b_blocknr * bh->b_size / dev->sectorsize; ++ len = bh->b_size / dev->sectorsize; ++ for(i = 0; i < len ; i++){ ++ if(ubd_test_bit(start + i, ++ (unsigned char *) dev->bitmap)) ++ continue; ++ ++ update_bitmap = 1; ++ ubd_set_bit(start + i, (unsigned char *) dev->bitmap); ++ } ++ ++ cow_bh[0] = NULL; ++ cow_bh[1] = NULL; ++ nbh = 0; ++ if(update_bitmap){ ++ cow_bh[0] = cow_new_bh(dev, start); ++ nbh++; ++ if(start / dev->sectorsize != ++ (start + len) / dev->sectorsize){ ++ cow_bh[1] = cow_new_bh(dev, start + len); ++ nbh++; ++ } ++ } ++ ++ bh->b_dev = dev->cow_dev; ++ bh->b_blocknr += dev->data_offset / dev->sectorsize; ++ ++ cow_handle_bh(dev, bh, cow_bh, nbh); ++ ++ /* ++ * upped both for pending work and tear-down, dev->working ++ * will hit zero then ++ */ ++ if(atomic_dec_and_test(&dev->working)) ++ break; ++ } ++ ++ up(&dev->sem); ++ return(0); ++} ++ ++static int cow_make_request(request_queue_t *q, int rw, struct buffer_head *bh) ++{ ++ struct cow *dev; ++ int n, minor; ++ ++ minor = MINOR(bh->b_rdev); ++ n = minor >> COW_SHIFT; ++ dev = &cow_dev[n]; ++ ++ dev->end_io
= NULL; ++ if(ubd_test_bit(bh->b_rsector, (unsigned char *) dev->bitmap)){ ++ bh->b_rdev = dev->cow_dev; ++ bh->b_rsector += dev->data_offset / dev->sectorsize; ++ } ++ else if(rw == WRITE){ ++ bh->b_dev = dev->cow_dev; ++ bh->b_blocknr += dev->data_offset / dev->sectorsize; ++ ++ cow_add_bh(dev, bh); ++ up(&dev->io_sem); ++ return(0); ++ } ++ else { ++ bh->b_rdev = dev->backing_dev; ++ } ++ ++ return(1); ++} ++ ++int cow_init(void) ++{ ++ int i; ++ ++ cow_dir_handle = devfs_mk_dir (NULL, "cow", NULL); ++ if (devfs_register_blkdev(MAJOR_NR, "cow", &cow_blops)) { ++ printk(KERN_ERR "cow: unable to get major %d\n", MAJOR_NR); ++ return -1; ++ } ++ read_ahead[MAJOR_NR] = 8; /* 8 sector (4kB) read-ahead */ ++ blksize_size[MAJOR_NR] = blk_sizes; ++ blk_size[MAJOR_NR] = sizes; ++ INIT_HARDSECT(hardsect_size, MAJOR_NR, hardsect_sizes); ++ ++ cow_queue = BLK_DEFAULT_QUEUE(MAJOR_NR); ++ blk_init_queue(cow_queue, NULL); ++ INIT_ELV(cow_queue, &cow_queue->elevator); ++ blk_queue_make_request(cow_queue, cow_make_request); ++ ++ add_gendisk(&cow_gendisk); ++ ++ for(i=0;i 0){ ++ n = (left > blocksize) ? blocksize : left; ++ ++ bh = bread(dev, block, (n < 512) ? 
512 : n); ++ if(bh == NULL) ++ return(-EIO); ++ ++ n -= offset; ++ memcpy(&buf[cur], bh->b_data + offset, n); ++ block++; ++ left -= n; ++ cur += n; ++ offset = 0; ++ brelse(bh); ++ } ++ ++ return(count); ++} ++ ++static int cow_open(struct inode *inode, struct file *filp) ++{ ++ int (*dev_ioctl)(struct inode *, struct file *, unsigned int, ++ unsigned long); ++ mm_segment_t fs; ++ struct cow *dev; ++ __u64 size; ++ __u32 magic; ++ time_t mtime; ++ char *backing_file; ++ int n, offset, err = 0; ++ ++ n = DEVICE_NR(inode->i_rdev); ++ if(n >= MAX_DEV) ++ return(-ENODEV); ++ dev = &cow_dev[n]; ++ offset = n << COW_SHIFT; ++ ++ spin_lock(&cow_lock); ++ ++ if(dev->count == 0){ ++ dev->cow_dev = name_to_kdev_t(dev->cow_path); ++ if(dev->cow_dev == 0){ ++ printk(KERN_ERR "cow_open - name_to_kdev_t(\"%s\") " ++ "failed\n", dev->cow_path); ++ err = -ENODEV; ++ } ++ ++ dev->backing_dev = name_to_kdev_t(dev->backing_path); ++ if(dev->backing_dev == 0){ ++ printk(KERN_ERR "cow_open - name_to_kdev_t(\"%s\") " ++ "failed\n", dev->backing_path); ++ err = -ENODEV; ++ } ++ ++ if(err) ++ goto out; ++ ++ dev->cow_bdev = bdget(dev->cow_dev); ++ if(dev->cow_bdev == NULL){ ++ printk(KERN_ERR "cow_open - bdget(\"%s\") failed\n", ++ dev->cow_path); ++ err = -ENOMEM; ++ } ++ dev->backing_bdev = bdget(dev->backing_dev); ++ if(dev->backing_bdev == NULL){ ++ printk(KERN_ERR "cow_open - bdget(\"%s\") failed\n", ++ dev->backing_path); ++ err = -ENOMEM; ++ } ++ ++ if(err) ++ goto out; ++ ++ err = blkdev_get(dev->cow_bdev, FMODE_READ|FMODE_WRITE, 0, ++ BDEV_RAW); ++ if(err){ ++ printk("cow_open - blkdev_get of COW device failed, " ++ "error = %d\n", err); ++ goto out; ++ } ++ ++ err = blkdev_get(dev->backing_bdev, FMODE_READ, 0, BDEV_RAW); ++ if(err){ ++ printk("cow_open - blkdev_get of backing device " ++ "failed, error = %d\n", err); ++ goto out; ++ } ++ ++ err = read_cow_header(reader, &dev->cow_dev, &magic, ++ &backing_file, &mtime, &size, ++ &dev->sectorsize, &dev->bitmap_offset); ++ 
if(err){ ++ printk(KERN_ERR "cow_open - read_cow_header failed, " ++ "err = %d\n", err); ++ goto out; ++ } ++ ++ cow_sizes(size, dev->sectorsize, dev->bitmap_offset, ++ &dev->bitmap_len, &dev->data_offset); ++ dev->bitmap = (void *) vmalloc(dev->bitmap_len); ++ if(dev->bitmap == NULL){ ++ err = -ENOMEM; ++ printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); ++ goto out; ++ } ++ flush_tlb_kernel_vm(); ++ ++ err = reader(dev->bitmap_offset, (char *) dev->bitmap, ++ dev->bitmap_len, &dev->cow_dev); ++ if(err < 0){ ++ printk(KERN_ERR "Failed to read COW bitmap\n"); ++ vfree(dev->bitmap); ++ goto out; ++ } ++ ++ dev_ioctl = dev->backing_bdev->bd_op->ioctl; ++ fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = (*dev_ioctl)(inode, filp, BLKGETSIZE, ++ (unsigned long) &sizes[offset]); ++ set_fs(fs); ++ if(err){ ++ printk(KERN_ERR "cow_open - BLKGETSIZE failed, " ++ "error = %d\n", err); ++ goto out; ++ } ++ ++ kernel_thread(cow_thread, dev, ++ CLONE_FS | CLONE_FILES | CLONE_SIGHAND); ++ down(&dev->sem); ++ } ++ dev->count++; ++out: ++ spin_unlock(&cow_lock); ++ return(err); ++} ++ ++static int cow_release(struct inode * inode, struct file * file) ++{ ++ struct cow *dev; ++ int n, err; ++ ++ n = DEVICE_NR(inode->i_rdev); ++ if(n >= MAX_DEV) ++ return(-ENODEV); ++ dev = &cow_dev[n]; ++ ++ spin_lock(&cow_lock); ++ ++ if(--dev->count > 0) ++ goto out; ++ ++ err = blkdev_put(dev->cow_bdev, BDEV_RAW); ++ if(err) ++ printk("cow_release - blkdev_put of cow device failed, " ++ "error = %d\n", err); ++ bdput(dev->cow_bdev); ++ dev->cow_bdev = 0; ++ ++ err = blkdev_put(dev->backing_bdev, BDEV_RAW); ++ if(err) ++ printk("cow_release - blkdev_put of backing device failed, " ++ "error = %d\n", err); ++ bdput(dev->backing_bdev); ++ dev->backing_bdev = 0; ++ ++out: ++ spin_unlock(&cow_lock); ++ return(0); ++} ++ ++static int cow_ioctl(struct inode * inode, struct file * file, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct cow *dev; ++ int (*dev_ioctl)(struct inode *, struct file *, 
unsigned int, ++ unsigned long); ++ int n; ++ ++ n = DEVICE_NR(inode->i_rdev); ++ if(n >= MAX_DEV) ++ return(-ENODEV); ++ dev = &cow_dev[n]; ++ ++ dev_ioctl = dev->backing_bdev->bd_op->ioctl; ++ return((*dev_ioctl)(inode, file, cmd, arg)); ++} ++ ++static int cow_revalidate(kdev_t rdev) ++{ ++ printk(KERN_ERR "Need to implement cow_revalidate\n"); ++ return(0); ++} ++ ++static int parse_unit(char **ptr) ++{ ++ char *str = *ptr, *end; ++ int n = -1; ++ ++ if(isdigit(*str)) { ++ n = simple_strtoul(str, &end, 0); ++ if(end == str) ++ return(-1); ++ *ptr = end; ++ } ++ else if (('a' <= *str) && (*str <= 'h')) { ++ n = *str - 'a'; ++ str++; ++ *ptr = str; ++ } ++ return(n); ++} ++ ++static int cow_setup(char *str) ++{ ++ struct cow *dev; ++ char *cow_name, *backing_name; ++ int unit; ++ ++ unit = parse_unit(&str); ++ if(unit < 0){ ++ printk(KERN_ERR "cow_setup - Couldn't parse unit number\n"); ++ return(1); ++ } ++ ++ if(*str != '='){ ++ printk(KERN_ERR "cow_setup - Missing '=' after unit " ++ "number\n"); ++ return(1); ++ } ++ str++; ++ ++ cow_name = str; ++ backing_name = strchr(str, ','); ++ if(backing_name == NULL){ ++ printk(KERN_ERR "cow_setup - missing backing device name\n"); ++ return(0); ++ } ++ *backing_name = '\0'; ++ backing_name++; ++ ++ spin_lock(&cow_lock); ++ ++ dev = &cow_dev[unit]; ++ dev->cow_path = cow_name; ++ dev->backing_path = backing_name; ++ ++ spin_unlock(&cow_lock); ++ return(0); ++} ++ ++__setup("cow", cow_setup); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/arch/um/drivers/cow_sys.h b/arch/um/drivers/cow_sys.h +--- a/arch/um/drivers/cow_sys.h Wed Dec 31 19:00:00 1969 ++++ b/arch/um/drivers/cow_sys.h Fri Aug 15 15:12:37 2003 +@@ -0,0 +1,48 @@ ++#ifndef __COW_SYS_H__ ++#define __COW_SYS_H__ ++ ++#include "kern_util.h" ++#include "user_util.h" ++#include "os.h" ++#include "user.h" ++ ++static inline void *cow_malloc(int size) ++{ ++ return(um_kmalloc(size)); ++} ++ ++static inline void cow_free(void *ptr) ++{ ++ kfree(ptr); ++} ++ ++#define cow_printf printk ++ ++static inline char *cow_strdup(char *str) ++{ ++ return(uml_strdup(str)); ++} ++ ++static inline int cow_seek_file(int fd, __u64 offset) ++{ ++ return(os_seek_file(fd, offset)); ++} ++ ++static inline int cow_file_size(char *file, __u64 *size_out) ++{ ++ return(os_file_size(file, size_out)); ++} ++ ++static inline int cow_write_file(int fd, char *buf, int size) ++{ ++ return(os_write_file(fd, buf, size)); ++} ++ ++#endif ++ ++/* ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/arch/um/drivers/cow_user.c b/arch/um/drivers/cow_user.c +--- a/arch/um/drivers/cow_user.c Wed Dec 31 19:00:00 1969 ++++ b/arch/um/drivers/cow_user.c Fri Aug 15 15:12:34 2003 +@@ -0,0 +1,296 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cow.h" ++#include "cow_sys.h" ++ ++#define PATH_LEN_V1 256 ++ ++struct cow_header_v1 { ++ int magic; ++ int version; ++ char backing_file[PATH_LEN_V1]; ++ time_t mtime; ++ __u64 size; ++ int sectorsize; ++}; ++ ++#define PATH_LEN_V2 MAXPATHLEN ++ ++struct cow_header_v2 { ++ unsigned long magic; ++ unsigned long version; ++ char backing_file[PATH_LEN_V2]; ++ time_t mtime; ++ __u64 size; ++ int sectorsize; ++}; ++ ++union 
cow_header { ++ struct cow_header_v1 v1; ++ struct cow_header_v2 v2; ++}; ++ ++#define COW_MAGIC 0x4f4f4f4d /* MOOO */ ++#define COW_VERSION 2 ++ ++void cow_sizes(__u64 size, int sectorsize, int bitmap_offset, ++ unsigned long *bitmap_len_out, int *data_offset_out) ++{ ++ *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize); ++ ++ *data_offset_out = bitmap_offset + *bitmap_len_out; ++ *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize; ++ *data_offset_out *= sectorsize; ++} ++ ++static int absolutize(char *to, int size, char *from) ++{ ++ char save_cwd[256], *slash; ++ int remaining; ++ ++ if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) { ++ cow_printf("absolutize : unable to get cwd - errno = %d\n", ++ errno); ++ return(-1); ++ } ++ slash = strrchr(from, '/'); ++ if(slash != NULL){ ++ *slash = '\0'; ++ if(chdir(from)){ ++ *slash = '/'; ++ cow_printf("absolutize : Can't cd to '%s' - " ++ "errno = %d\n", from, errno); ++ return(-1); ++ } ++ *slash = '/'; ++ if(getcwd(to, size) == NULL){ ++ cow_printf("absolutize : unable to get cwd of '%s' - " ++ "errno = %d\n", from, errno); ++ return(-1); ++ } ++ remaining = size - strlen(to); ++ if(strlen(slash) + 1 > remaining){ ++ cow_printf("absolutize : unable to fit '%s' into %d " ++ "chars\n", from, size); ++ return(-1); ++ } ++ strcat(to, slash); ++ } ++ else { ++ if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){ ++ cow_printf("absolutize : unable to fit '%s' into %d " ++ "chars\n", from, size); ++ return(-1); ++ } ++ strcpy(to, save_cwd); ++ strcat(to, "/"); ++ strcat(to, from); ++ } ++ chdir(save_cwd); ++ return(0); ++} ++ ++int write_cow_header(char *cow_file, int fd, char *backing_file, ++ int sectorsize, long long *size) ++{ ++ struct cow_header_v2 *header; ++ struct stat64 buf; ++ int err; ++ ++ err = cow_seek_file(fd, 0); ++ if(err != 0){ ++ cow_printf("write_cow_header - lseek failed, errno = %d\n", ++ errno); ++ return(-errno); ++ } ++ ++ err = -ENOMEM; ++ header = 
cow_malloc(sizeof(*header)); ++ if(header == NULL){ ++ cow_printf("Failed to allocate COW V2 header\n"); ++ goto out; ++ } ++ header->magic = htonl(COW_MAGIC); ++ header->version = htonl(COW_VERSION); ++ ++ err = -EINVAL; ++ if(strlen(backing_file) > sizeof(header->backing_file) - 1){ ++ cow_printf("Backing file name \"%s\" is too long - names are " ++ "limited to %d characters\n", backing_file, ++ sizeof(header->backing_file) - 1); ++ goto out_free; ++ } ++ ++ if(absolutize(header->backing_file, sizeof(header->backing_file), ++ backing_file)) ++ goto out_free; ++ ++ err = stat64(header->backing_file, &buf); ++ if(err < 0){ ++ cow_printf("Stat of backing file '%s' failed, errno = %d\n", ++ header->backing_file, errno); ++ err = -errno; ++ goto out_free; ++ } ++ ++ err = cow_file_size(header->backing_file, size); ++ if(err){ ++ cow_printf("Couldn't get size of backing file '%s', " ++ "errno = %d\n", header->backing_file, -*size); ++ goto out_free; ++ } ++ ++ header->mtime = htonl(buf.st_mtime); ++ header->size = htonll(*size); ++ header->sectorsize = htonl(sectorsize); ++ ++ err = write(fd, header, sizeof(*header)); ++ if(err != sizeof(*header)){ ++ cow_printf("Write of header to new COW file '%s' failed, " ++ "errno = %d\n", cow_file, errno); ++ goto out_free; ++ } ++ err = 0; ++ out_free: ++ cow_free(header); ++ out: ++ return(err); ++} ++ ++int file_reader(__u64 offset, char *buf, int len, void *arg) ++{ ++ int fd = *((int *) arg); ++ ++ return(pread(fd, buf, len, offset)); ++} ++ ++int read_cow_header(int (*reader)(__u64, char *, int, void *), void *arg, ++ __u32 *magic_out, char **backing_file_out, ++ time_t *mtime_out, __u64 *size_out, ++ int *sectorsize_out, int *bitmap_offset_out) ++{ ++ union cow_header *header; ++ char *file; ++ int err, n; ++ unsigned long version, magic; ++ ++ header = cow_malloc(sizeof(*header)); ++ if(header == NULL){ ++ cow_printf("read_cow_header - Failed to allocate header\n"); ++ return(-ENOMEM); ++ } ++ err = -EINVAL; ++ n = 
(*reader)(0, (char *) header, sizeof(*header), arg); ++ if(n < offsetof(typeof(header->v1), backing_file)){ ++ cow_printf("read_cow_header - short header\n"); ++ goto out; ++ } ++ ++ magic = header->v1.magic; ++ if(magic == COW_MAGIC) { ++ version = header->v1.version; ++ } ++ else if(magic == ntohl(COW_MAGIC)){ ++ version = ntohl(header->v1.version); ++ } ++ /* No error printed because the non-COW case comes through here */ ++ else goto out; ++ ++ *magic_out = COW_MAGIC; ++ ++ if(version == 1){ ++ if(n < sizeof(header->v1)){ ++ cow_printf("read_cow_header - failed to read V1 " ++ "header\n"); ++ goto out; ++ } ++ *mtime_out = header->v1.mtime; ++ *size_out = header->v1.size; ++ *sectorsize_out = header->v1.sectorsize; ++ *bitmap_offset_out = sizeof(header->v1); ++ file = header->v1.backing_file; ++ } ++ else if(version == 2){ ++ if(n < sizeof(header->v2)){ ++ cow_printf("read_cow_header - failed to read V2 " ++ "header\n"); ++ goto out; ++ } ++ *mtime_out = ntohl(header->v2.mtime); ++ *size_out = ntohll(header->v2.size); ++ *sectorsize_out = ntohl(header->v2.sectorsize); ++ *bitmap_offset_out = sizeof(header->v2); ++ file = header->v2.backing_file; ++ } ++ else { ++ cow_printf("read_cow_header - invalid COW version\n"); ++ goto out; ++ } ++ err = -ENOMEM; ++ *backing_file_out = cow_strdup(file); ++ if(*backing_file_out == NULL){ ++ cow_printf("read_cow_header - failed to allocate backing " ++ "file\n"); ++ goto out; ++ } ++ err = 0; ++ out: ++ cow_free(header); ++ return(err); ++} ++ ++int init_cow_file(int fd, char *cow_file, char *backing_file, int sectorsize, ++ int *bitmap_offset_out, unsigned long *bitmap_len_out, ++ int *data_offset_out) ++{ ++ __u64 size, offset; ++ char zero = 0; ++ int err; ++ ++ err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size); ++ if(err) ++ goto out; ++ ++ cow_sizes(size, sectorsize, sizeof(struct cow_header_v2), ++ bitmap_len_out, data_offset_out); ++ *bitmap_offset_out = sizeof(struct cow_header_v2); ++ ++ offset 
= *data_offset_out + size - sizeof(zero); ++ err = cow_seek_file(fd, offset); ++ if(err != 0){ ++ cow_printf("cow bitmap lseek failed : errno = %d\n", errno); ++ goto out; ++ } ++ ++ /* does not really matter how much we write it is just to set EOF ++ * this also sets the entire COW bitmap ++ * to zero without having to allocate it ++ */ ++ err = cow_write_file(fd, &zero, sizeof(zero)); ++ if(err != sizeof(zero)){ ++ err = -EINVAL; ++ cow_printf("Write of bitmap to new COW file '%s' failed, " ++ "errno = %d\n", cow_file, errno); ++ goto out; ++ } ++ ++ return(0); ++ ++ out: ++ return(err); ++} ++ ++/* ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c +--- a/arch/um/drivers/hostaudio_kern.c Fri Aug 15 15:09:05 2003 ++++ b/arch/um/drivers/hostaudio_kern.c Fri Aug 15 15:13:48 2003 +@@ -11,6 +11,7 @@ + #include "linux/fs.h" + #include "linux/sound.h" + #include "linux/soundcard.h" ++#include "asm/uaccess.h" + #include "kern_util.h" + #include "init.h" + #include "hostaudio.h" +@@ -22,7 +23,7 @@ + #ifndef MODULE + static int set_dsp(char *name, int *add) + { +- dsp = uml_strdup(name); ++ dsp = name; + return(0); + } + +@@ -34,7 +35,7 @@ + + static int set_mixer(char *name, int *add) + { +- mixer = uml_strdup(name); ++ mixer = name; + return(0); + } + +@@ -51,23 +52,55 @@ + loff_t *ppos) + { + struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; + + #ifdef DEBUG + printk("hostaudio: read called, count = %d\n", count); + #endif + +- return(hostaudio_read_user(state, buffer, count, ppos)); ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = hostaudio_read_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ if(copy_to_user(buffer, kbuf, err)) ++ err = -EFAULT; ++ ++ out: ++ kfree(kbuf); ++ return(err); + } + + static 
ssize_t hostaudio_write(struct file *file, const char *buffer, + size_t count, loff_t *ppos) + { + struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; + + #ifdef DEBUG + printk("hostaudio: write called, count = %d\n", count); + #endif +- return(hostaudio_write_user(state, buffer, count, ppos)); ++ ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = -EFAULT; ++ if(copy_from_user(kbuf, buffer, count)) ++ goto out; ++ ++ err = hostaudio_write_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ out: ++ kfree(kbuf); ++ return(err); + } + + static unsigned int hostaudio_poll(struct file *file, +@@ -86,12 +119,43 @@ + unsigned int cmd, unsigned long arg) + { + struct hostaudio_state *state = file->private_data; ++ unsigned long data = 0; ++ int err; + + #ifdef DEBUG + printk("hostaudio: ioctl called, cmd = %u\n", cmd); + #endif ++ switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(get_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } ++ ++ err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data); ++ ++ switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(put_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } + +- return(hostaudio_ioctl_user(state, cmd, arg)); ++ return(err); + } + + static int hostaudio_open(struct inode *inode, struct file *file) +@@ -225,7 +289,8 @@ + + static int __init hostaudio_init_module(void) + { +- printk(KERN_INFO "UML Audio Relay\n"); ++ printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n", ++ dsp, mixer); + + module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1); + if(module_data.dev_audio < 
0){ +diff -Naur a/arch/um/drivers/line.c b/arch/um/drivers/line.c +--- a/arch/um/drivers/line.c Fri Aug 15 15:08:24 2003 ++++ b/arch/um/drivers/line.c Fri Aug 15 15:13:28 2003 +@@ -6,8 +6,8 @@ + #include "linux/sched.h" + #include "linux/slab.h" + #include "linux/list.h" ++#include "linux/interrupt.h" + #include "linux/devfs_fs_kernel.h" +-#include "asm/irq.h" + #include "asm/uaccess.h" + #include "chan_kern.h" + #include "irq_user.h" +@@ -16,16 +16,18 @@ + #include "user_util.h" + #include "kern_util.h" + #include "os.h" ++#include "irq_kern.h" + + #define LINE_BUFSIZE 4096 + +-void line_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct line *dev = data; + + if(dev->count > 0) + chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq, + dev); ++ return IRQ_HANDLED; + } + + void line_timer_cb(void *arg) +@@ -136,20 +138,22 @@ + return(len); + } + +-void line_write_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t line_write_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct line *dev = data; + struct tty_struct *tty = dev->tty; + int err; + + err = flush_buffer(dev); +- if(err == 0) return; ++ if(err == 0) ++ return(IRQ_NONE); + else if(err < 0){ + dev->head = dev->buffer; + dev->tail = dev->buffer; + } + +- if(tty == NULL) return; ++ if(tty == NULL) ++ return(IRQ_NONE); + + if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && + (tty->ldisc.write_wakeup != NULL)) +@@ -161,9 +165,9 @@ + * writes. 
+ */ + +- if (waitqueue_active(&tty->write_wait)) ++ if(waitqueue_active(&tty->write_wait)) + wake_up_interruptible(&tty->write_wait); +- ++ return(IRQ_HANDLED); + } + + int line_write_room(struct tty_struct *tty) +@@ -369,7 +373,7 @@ + + dev = simple_strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ +- *error_out = "line_setup failed to parse device number"; ++ *error_out = "line_get_config failed to parse device number"; + return(0); + } + +@@ -379,15 +383,15 @@ + } + + line = &lines[dev]; ++ + down(&line->sem); +- + if(!line->valid) + CONFIG_CHUNK(str, size, n, "none", 1); + else if(line->count == 0) + CONFIG_CHUNK(str, size, n, line->init_str, 1); + else n = chan_config_string(&line->chan_list, str, size, error_out); +- + up(&line->sem); ++ + return(n); + } + +@@ -412,7 +416,8 @@ + return NULL; + + driver->driver_name = line_driver->name; +- driver->name = line_driver->devfs_name; ++ driver->name = line_driver->device_name; ++ driver->devfs_name = line_driver->devfs_name; + driver->major = line_driver->major; + driver->minor_start = line_driver->minor_start; + driver->type = line_driver->type; +@@ -432,7 +437,7 @@ + + for(i = 0; i < nlines; i++){ + if(!lines[i].valid) +- tty_unregister_devfs(driver, i); ++ tty_unregister_device(driver, i); + } + + mconsole_register_dev(&line_driver->mc); +@@ -465,24 +470,25 @@ + struct line *line; + }; + +-void winch_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct winch *winch = data; + struct tty_struct *tty; + int err; + char c; + +- err = generic_read(winch->fd, &c, NULL); +- if(err < 0){ +- if(err != -EAGAIN){ +- printk("winch_interrupt : read failed, errno = %d\n", +- -err); +- printk("fd %d is losing SIGWINCH support\n", +- winch->tty_fd); +- free_irq(irq, data); +- return; ++ if(winch->fd != -1){ ++ err = generic_read(winch->fd, &c, NULL); ++ if(err < 0){ ++ if(err != -EAGAIN){ ++ printk("winch_interrupt : read 
failed, " ++ "errno = %d\n", -err); ++ printk("fd %d is losing SIGWINCH support\n", ++ winch->tty_fd); ++ return(IRQ_HANDLED); ++ } ++ goto out; + } +- goto out; + } + tty = winch->line->tty; + if(tty != NULL){ +@@ -492,7 +498,9 @@ + kill_pg(tty->pgrp, SIGWINCH, 1); + } + out: +- reactivate_fd(winch->fd, WINCH_IRQ); ++ if(winch->fd != -1) ++ reactivate_fd(winch->fd, WINCH_IRQ); ++ return(IRQ_HANDLED); + } + + DECLARE_MUTEX(winch_handler_sem); +@@ -529,7 +537,10 @@ + + list_for_each(ele, &winch_handlers){ + winch = list_entry(ele, struct winch, list); +- close(winch->fd); ++ if(winch->fd != -1){ ++ deactivate_fd(winch->fd, WINCH_IRQ); ++ close(winch->fd); ++ } + if(winch->pid != -1) + os_kill_process(winch->pid, 1); + } +diff -Naur a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c +--- a/arch/um/drivers/mconsole_kern.c Fri Aug 15 15:03:47 2003 ++++ b/arch/um/drivers/mconsole_kern.c Fri Aug 15 15:10:11 2003 +@@ -27,6 +27,7 @@ + #include "init.h" + #include "os.h" + #include "umid.h" ++#include "irq_kern.h" + + static int do_unlink_socket(struct notifier_block *notifier, + unsigned long what, void *data) +@@ -67,7 +68,7 @@ + + DECLARE_WORK(mconsole_work, mc_work_proc, NULL); + +-void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++irqreturn_t mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) + { + int fd; + struct mconsole_entry *new; +@@ -88,6 +89,7 @@ + } + if(!list_empty(&mc_requests)) schedule_work(&mconsole_work); + reactivate_fd(fd, MCONSOLE_IRQ); ++ return(IRQ_HANDLED); + } + + void mconsole_version(struct mc_request *req) +@@ -100,20 +102,34 @@ + mconsole_reply(req, version, 0, 0); + } + ++void mconsole_log(struct mc_request *req) ++{ ++ int len; ++ char *ptr = req->request.data; ++ ++ ptr += strlen("log"); ++ while(isspace(*ptr)) ptr++; ++ ++ len = ptr - req->request.data; ++ printk("%.*s", len, ptr); ++ mconsole_reply(req, "", 0, 0); ++} ++ + #define UML_MCONSOLE_HELPTEXT \ +-"Commands: +- version - Get 
kernel version +- help - Print this message +- halt - Halt UML +- reboot - Reboot UML +- config = - Add a new device to UML; +- same syntax as command line +- config - Query the configuration of a device +- remove - Remove a device from UML +- sysrq - Performs the SysRq action controlled by the letter +- cad - invoke the Ctl-Alt-Del handler +- stop - pause the UML; it will do nothing until it receives a 'go' +- go - continue the UML after a 'stop' ++"Commands: \n\ ++ version - Get kernel version \n\ ++ help - Print this message \n\ ++ halt - Halt UML \n\ ++ reboot - Reboot UML \n\ ++ config = - Add a new device to UML; \n\ ++ same syntax as command line \n\ ++ config - Query the configuration of a device \n\ ++ remove - Remove a device from UML \n\ ++ sysrq - Performs the SysRq action controlled by the letter \n\ ++ cad - invoke the Ctl-Alt-Del handler \n\ ++ stop - pause the UML; it will do nothing until it receives a 'go' \n\ ++ go - continue the UML after a 'stop' \n\ ++ log - make UML enter into the kernel log\n\ + " + + void mconsole_help(struct mc_request *req) +@@ -302,7 +318,7 @@ + if(umid_file_name("mconsole", file, sizeof(file))) return(-1); + snprintf(mconsole_socket_name, sizeof(file), "%s", file); + +- sock = create_unix_socket(file, sizeof(file)); ++ sock = create_unix_socket(file, sizeof(file), 1); + if (sock < 0){ + printk("Failed to initialize management console\n"); + return(1); +diff -Naur a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c +--- a/arch/um/drivers/mconsole_user.c Fri Aug 15 15:04:47 2003 ++++ b/arch/um/drivers/mconsole_user.c Fri Aug 15 15:10:35 2003 +@@ -28,6 +28,7 @@ + { "cad", mconsole_cad, 1 }, + { "stop", mconsole_stop, 0 }, + { "go", mconsole_go, 1 }, ++ { "log", mconsole_log, 1 }, + }; + + /* Initialized in mconsole_init, which is an initcall */ +@@ -139,6 +140,7 @@ + memcpy(reply.data, str, len); + reply.data[len] = '\0'; + total -= len; ++ str += len; + reply.len = len + 1; + + len = sizeof(reply) + 
reply.len - sizeof(reply.data); +diff -Naur a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c +--- a/arch/um/drivers/mmapper_kern.c Fri Aug 15 15:04:33 2003 ++++ b/arch/um/drivers/mmapper_kern.c Fri Aug 15 15:10:32 2003 +@@ -120,7 +120,10 @@ + printk(KERN_INFO "Mapper v0.1\n"); + + v_buf = (char *) find_iomem("mmapper", &mmapper_size); +- if(mmapper_size == 0) return(0); ++ if(mmapper_size == 0){ ++ printk(KERN_ERR "mmapper_init - find_iomem failed\n"); ++ return(0); ++ } + + p_buf = __pa(v_buf); + +diff -Naur a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c +--- a/arch/um/drivers/net_kern.c Fri Aug 15 15:05:49 2003 ++++ b/arch/um/drivers/net_kern.c Fri Aug 15 15:11:52 2003 +@@ -26,6 +26,7 @@ + #include "mconsole_kern.h" + #include "init.h" + #include "irq_user.h" ++#include "irq_kern.h" + + static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED; + LIST_HEAD(opened); +@@ -61,14 +62,14 @@ + return pkt_len; + } + +-void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++irqreturn_t uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) + { + struct net_device *dev = dev_id; + struct uml_net_private *lp = dev->priv; + int err; + + if(!netif_running(dev)) +- return; ++ return(IRQ_NONE); + + spin_lock(&lp->lock); + while((err = uml_net_rx(dev)) > 0) ; +@@ -83,6 +84,7 @@ + + out: + spin_unlock(&lp->lock); ++ return(IRQ_HANDLED); + } + + static int uml_net_open(struct net_device *dev) +@@ -292,7 +294,7 @@ + struct uml_net *device; + struct net_device *dev; + struct uml_net_private *lp; +- int err, size; ++ int save, err, size; + + size = transport->private_size + sizeof(struct uml_net_private) + + sizeof(((struct uml_net_private *) 0)->user); +@@ -362,21 +364,29 @@ + return 1; + lp = dev->priv; + +- INIT_LIST_HEAD(&lp->list); +- spin_lock_init(&lp->lock); +- lp->dev = dev; +- lp->fd = -1; +- lp->mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0 }; +- lp->have_mac = device->have_mac; +- lp->protocol = transport->kern->protocol; +- lp->open = 
transport->user->open; +- lp->close = transport->user->close; +- lp->remove = transport->user->remove; +- lp->read = transport->kern->read; +- lp->write = transport->kern->write; +- lp->add_address = transport->user->add_address; +- lp->delete_address = transport->user->delete_address; +- lp->set_mtu = transport->user->set_mtu; ++ /* lp.user is the first four bytes of the transport data, which ++ * has already been initialized. This structure assignment will ++ * overwrite that, so we make sure that .user gets overwritten with ++ * what it already has. ++ */ ++ save = lp->user[0]; ++ *lp = ((struct uml_net_private) ++ { .list = LIST_HEAD_INIT(lp->list), ++ .lock = SPIN_LOCK_UNLOCKED, ++ .dev = dev, ++ .fd = -1, ++ .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, ++ .have_mac = device->have_mac, ++ .protocol = transport->kern->protocol, ++ .open = transport->user->open, ++ .close = transport->user->close, ++ .remove = transport->user->remove, ++ .read = transport->kern->read, ++ .write = transport->kern->write, ++ .add_address = transport->user->add_address, ++ .delete_address = transport->user->delete_address, ++ .set_mtu = transport->user->set_mtu, ++ .user = { save } }); + + init_timer(&lp->tl); + lp->tl.function = uml_net_user_timer_expire; +diff -Naur a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c +--- a/arch/um/drivers/port_kern.c Fri Aug 15 15:04:01 2003 ++++ b/arch/um/drivers/port_kern.c Fri Aug 15 15:10:18 2003 +@@ -6,6 +6,7 @@ + #include "linux/list.h" + #include "linux/sched.h" + #include "linux/slab.h" ++#include "linux/interrupt.h" + #include "linux/irq.h" + #include "linux/spinlock.h" + #include "linux/errno.h" +@@ -14,6 +15,7 @@ + #include "kern_util.h" + #include "kern.h" + #include "irq_user.h" ++#include "irq_kern.h" + #include "port.h" + #include "init.h" + #include "os.h" +@@ -44,7 +46,7 @@ + struct port_list *port; + }; + +-static void pipe_interrupt(int irq, void *data, struct pt_regs *regs) ++static irqreturn_t pipe_interrupt(int irq, 
void *data, struct pt_regs *regs) + { + struct connection *conn = data; + int fd; +@@ -52,7 +54,7 @@ + fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); + if(fd < 0){ + if(fd == -EAGAIN) +- return; ++ return(IRQ_NONE); + + printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", + -fd); +@@ -65,6 +67,7 @@ + list_add(&conn->list, &conn->port->connections); + + up(&conn->port->sem); ++ return(IRQ_HANDLED); + } + + static int port_accept(struct port_list *port) +@@ -138,12 +141,13 @@ + + DECLARE_WORK(port_work, port_work_proc, NULL); + +-static void port_interrupt(int irq, void *data, struct pt_regs *regs) ++static irqreturn_t port_interrupt(int irq, void *data, struct pt_regs *regs) + { + struct port_list *port = data; + + port->has_connection = 1; + schedule_work(&port_work); ++ return(IRQ_HANDLED); + } + + void *port_data(int port_num) +diff -Naur a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c +--- a/arch/um/drivers/ssl.c Fri Aug 15 15:06:09 2003 ++++ b/arch/um/drivers/ssl.c Fri Aug 15 15:12:30 2003 +@@ -53,8 +53,9 @@ + + static struct line_driver driver = { + .name = "UML serial line", +- .devfs_name = "tts/%d", +- .major = TTYAUX_MAJOR, ++ .device_name = "ttS", ++ .devfs_name = "tts/", ++ .major = TTY_MAJOR, + .minor_start = 64, + .type = TTY_DRIVER_TYPE_SERIAL, + .subtype = 0, +diff -Naur a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c +--- a/arch/um/drivers/stdio_console.c Fri Aug 15 15:04:51 2003 ++++ b/arch/um/drivers/stdio_console.c Fri Aug 15 15:10:56 2003 +@@ -83,7 +83,8 @@ + + static struct line_driver driver = { + .name = "UML console", +- .devfs_name = "vc/%d", ++ .device_name = "tty", ++ .devfs_name = "vc/", + .major = TTY_MAJOR, + .minor_start = 0, + .type = TTY_DRIVER_TYPE_CONSOLE, +@@ -159,6 +160,15 @@ + + static int con_init_done = 0; + ++static struct tty_operations console_ops = { ++ .open = con_open, ++ .close = con_close, ++ .write = con_write, ++ .chars_in_buffer = chars_in_buffer, ++ .set_termios = set_termios, ++ 
.write_room = line_write_room, ++}; ++ + int stdio_init(void) + { + char *new_title; +@@ -166,7 +176,8 @@ + printk(KERN_INFO "Initializing stdio console driver\n"); + + console_driver = line_register_devfs(&console_lines, &driver, +- &console_ops, vts, sizeof(vts)/sizeof(vts[0])); ++ &console_ops, vts, ++ sizeof(vts)/sizeof(vts[0])); + + lines_init(vts, sizeof(vts)/sizeof(vts[0])); + +@@ -188,15 +199,6 @@ + if(con_init_done) up(&vts[console->index].sem); + } + +-static struct tty_operations console_ops = { +- .open = con_open, +- .close = con_close, +- .write = con_write, +- .chars_in_buffer = chars_in_buffer, +- .set_termios = set_termios, +- .write_room = line_write_room, +-}; +- + static struct tty_driver *console_device(struct console *c, int *index) + { + *index = c->index; +@@ -212,12 +214,14 @@ + console_device, console_setup, + CON_PRINTBUFFER); + +-static void __init stdio_console_init(void) ++static int __init stdio_console_init(void) + { + INIT_LIST_HEAD(&vts[0].chan_list); + list_add(&init_console_chan.list, &vts[0].chan_list); + register_console(&stdiocons); ++ return(0); + } ++ + console_initcall(stdio_console_init); + + static int console_chan_setup(char *str) +diff -Naur a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c +--- a/arch/um/drivers/ubd_kern.c Fri Aug 15 15:05:56 2003 ++++ b/arch/um/drivers/ubd_kern.c Fri Aug 15 15:11:53 2003 +@@ -8,6 +8,13 @@ + * old style ubd by setting UBD_SHIFT to 0 + * 2002-09-27...2002-10-18 massive tinkering for 2.5 + * partitions have changed in 2.5 ++ * 2003-01-29 more tinkering for 2.5.59-1 ++ * This should now address the sysfs problems and has ++ * the symlink for devfs to allow for booting with ++ * the common /dev/ubd/discX/... names rather than ++ * only /dev/ubdN/discN this version also has lots of ++ * clean ups preparing for ubd-many. 
++ * James McMechan + */ + + #define MAJOR_NR UBD_MAJOR +@@ -40,6 +47,7 @@ + #include "mconsole_kern.h" + #include "init.h" + #include "irq_user.h" ++#include "irq_kern.h" + #include "ubd_user.h" + #include "2_5compat.h" + #include "os.h" +@@ -70,7 +78,7 @@ + static request_queue_t *ubd_queue; + + /* Protected by ubd_lock */ +-static int fake_major = 0; ++static int fake_major = MAJOR_NR; + + static struct gendisk *ubd_gendisk[MAX_DEV]; + static struct gendisk *fake_gendisk[MAX_DEV]; +@@ -99,12 +107,12 @@ + + struct ubd { + char *file; +- int is_dir; + int count; + int fd; + __u64 size; + struct openflags boot_openflags; + struct openflags openflags; ++ int no_cow; + struct cow cow; + }; + +@@ -118,12 +126,12 @@ + + #define DEFAULT_UBD { \ + .file = NULL, \ +- .is_dir = 0, \ + .count = 0, \ + .fd = -1, \ + .size = -1, \ + .boot_openflags = OPEN_FLAGS, \ + .openflags = OPEN_FLAGS, \ ++ .no_cow = 0, \ + .cow = DEFAULT_COW, \ + } + +@@ -131,8 +139,10 @@ + + static int ubd0_init(void) + { +- if(ubd_dev[0].file == NULL) +- ubd_dev[0].file = "root_fs"; ++ struct ubd *dev = &ubd_dev[0]; ++ ++ if(dev->file == NULL) ++ dev->file = "root_fs"; + return(0); + } + +@@ -199,19 +209,39 @@ + " Create ide0 entries that map onto ubd devices.\n\n" + ); + ++static int parse_unit(char **ptr) ++{ ++ char *str = *ptr, *end; ++ int n = -1; ++ ++ if(isdigit(*str)) { ++ n = simple_strtoul(str, &end, 0); ++ if(end == str) ++ return(-1); ++ *ptr = end; ++ } ++ else if (('a' <= *str) && (*str <= 'h')) { ++ n = *str - 'a'; ++ str++; ++ *ptr = str; ++ } ++ return(n); ++} ++ + static int ubd_setup_common(char *str, int *index_out) + { ++ struct ubd *dev; + struct openflags flags = global_openflags; + char *backing_file; + int n, err; + + if(index_out) *index_out = -1; +- n = *str++; ++ n = *str; + if(n == '='){ +- static int fake_major_allowed = 1; + char *end; + int major; + ++ str++; + if(!strcmp(str, "sync")){ + global_openflags.s = 1; + return(0); +@@ -223,20 +253,14 @@ + return(1); + } + +- 
if(!fake_major_allowed){ +- printk(KERN_ERR "Can't assign a fake major twice\n"); +- return(1); +- } +- + err = 1; + spin_lock(&ubd_lock); +- if(!fake_major_allowed){ ++ if(fake_major != MAJOR_NR){ + printk(KERN_ERR "Can't assign a fake major twice\n"); + goto out1; + } + + fake_major = major; +- fake_major_allowed = 0; + + printk(KERN_INFO "Setting extra ubd major number to %d\n", + major); +@@ -246,25 +270,23 @@ + return(err); + } + +- if(n < '0'){ +- printk(KERN_ERR "ubd_setup : index out of range\n"); } +- +- if((n >= '0') && (n <= '9')) n -= '0'; +- else if((n >= 'a') && (n <= 'z')) n -= 'a'; +- else { +- printk(KERN_ERR "ubd_setup : device syntax invalid\n"); ++ n = parse_unit(&str); ++ if(n < 0){ ++ printk(KERN_ERR "ubd_setup : couldn't parse unit number " ++ "'%s'\n", str); + return(1); + } + if(n >= MAX_DEV){ +- printk(KERN_ERR "ubd_setup : index out of range " +- "(%d devices)\n", MAX_DEV); ++ printk(KERN_ERR "ubd_setup : index %d out of range " ++ "(%d devices)\n", n, MAX_DEV); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + +- if(ubd_dev[n].file != NULL){ ++ dev = &ubd_dev[n]; ++ if(dev->file != NULL){ + printk(KERN_ERR "ubd_setup : device already configured\n"); + goto out2; + } +@@ -279,6 +301,11 @@ + flags.s = 1; + str++; + } ++ if (*str == 'd'){ ++ dev->no_cow = 1; ++ str++; ++ } ++ + if(*str++ != '='){ + printk(KERN_ERR "ubd_setup : Expected '='\n"); + goto out2; +@@ -287,14 +314,17 @@ + err = 0; + backing_file = strchr(str, ','); + if(backing_file){ +- *backing_file = '\0'; +- backing_file++; ++ if(dev->no_cow) ++ printk(KERN_ERR "Can't specify both 'd' and a " ++ "cow file\n"); ++ else { ++ *backing_file = '\0'; ++ backing_file++; ++ } + } +- ubd_dev[n].file = str; +- if(ubd_is_dir(ubd_dev[n].file)) +- ubd_dev[n].is_dir = 1; +- ubd_dev[n].cow.file = backing_file; +- ubd_dev[n].boot_openflags = flags; ++ dev->file = str; ++ dev->cow.file = backing_file; ++ dev->boot_openflags = flags; + out2: + spin_unlock(&ubd_lock); + return(err); +@@ 
-324,8 +354,7 @@ + static int fakehd_set = 0; + static int fakehd(char *str) + { +- printk(KERN_INFO +- "fakehd : Changing ubd name to \"hd\".\n"); ++ printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n"); + fakehd_set = 1; + return 1; + } +@@ -394,9 +423,10 @@ + do_ubd_request(ubd_queue); + } + +-static void ubd_intr(int irq, void *dev, struct pt_regs *unused) ++static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) + { + ubd_handler(); ++ return(IRQ_HANDLED); + } + + /* Only changed by ubd_init, which is an initcall. */ +@@ -432,16 +462,18 @@ + static int ubd_open_dev(struct ubd *dev) + { + struct openflags flags; +- int err, n, create_cow, *create_ptr; ++ char **back_ptr; ++ int err, create_cow, *create_ptr; + ++ dev->openflags = dev->boot_openflags; + create_cow = 0; + create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL; +- dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file, ++ back_ptr = dev->no_cow ? NULL : &dev->cow.file; ++ dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr, + &dev->cow.bitmap_offset, &dev->cow.bitmap_len, + &dev->cow.data_offset, create_ptr); + + if((dev->fd == -ENOENT) && create_cow){ +- n = dev - ubd_dev; + dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->openflags, 1 << 9, + &dev->cow.bitmap_offset, +@@ -458,7 +490,10 @@ + if(dev->cow.file != NULL){ + err = -ENOMEM; + dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); +- if(dev->cow.bitmap == NULL) goto error; ++ if(dev->cow.bitmap == NULL){ ++ printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); ++ goto error; ++ } + flush_tlb_kernel_vm(); + + err = read_cow_bitmap(dev->fd, dev->cow.bitmap, +@@ -484,17 +519,31 @@ + + { + struct gendisk *disk; ++ char from[sizeof("ubd/nnnnn\0")], to[sizeof("discnnnnn/disc\0")]; ++ int err; + + disk = alloc_disk(1 << UBD_SHIFT); +- if (!disk) +- return -ENOMEM; ++ if(disk == NULL) ++ return(-ENOMEM); + + disk->major = major; + disk->first_minor = unit << UBD_SHIFT; + disk->fops = 
&ubd_blops; + set_capacity(disk, size / 512); +- sprintf(disk->disk_name, "ubd"); +- sprintf(disk->devfs_name, "ubd/disc%d", unit); ++ if(major == MAJOR_NR){ ++ sprintf(disk->disk_name, "ubd%d", unit); ++ sprintf(disk->devfs_name, "ubd/disc%d", unit); ++ sprintf(from, "ubd/%d", unit); ++ sprintf(to, "disc%d/disc", unit); ++ err = devfs_mk_symlink(from, to); ++ if(err) ++ printk("ubd_new_disk failed to make link from %s to " ++ "%s, error = %d\n", from, to, err); ++ } ++ else { ++ sprintf(disk->disk_name, "ubd_fake%d", unit); ++ sprintf(disk->devfs_name, "ubd_fake/disc%d", unit); ++ } + + disk->private_data = &ubd_dev[unit]; + disk->queue = ubd_queue; +@@ -509,10 +558,7 @@ + struct ubd *dev = &ubd_dev[n]; + int err; + +- if(dev->is_dir) +- return(-EISDIR); +- +- if (!dev->file) ++ if(dev->file == NULL) + return(-ENODEV); + + if (ubd_open_dev(dev)) +@@ -526,7 +572,7 @@ + if(err) + return(err); + +- if(fake_major) ++ if(fake_major != MAJOR_NR) + ubd_new_disk(fake_major, dev->size, n, + &fake_gendisk[n]); + +@@ -564,42 +610,42 @@ + return(err); + } + +-static int ubd_get_config(char *dev, char *str, int size, char **error_out) ++static int ubd_get_config(char *name, char *str, int size, char **error_out) + { +- struct ubd *ubd; ++ struct ubd *dev; + char *end; +- int major, n = 0; ++ int n, len = 0; + +- major = simple_strtoul(dev, &end, 0); +- if((*end != '\0') || (end == dev)){ +- *error_out = "ubd_get_config : didn't parse major number"; ++ n = simple_strtoul(name, &end, 0); ++ if((*end != '\0') || (end == name)){ ++ *error_out = "ubd_get_config : didn't parse device number"; + return(-1); + } + +- if((major >= MAX_DEV) || (major < 0)){ +- *error_out = "ubd_get_config : major number out of range"; ++ if((n >= MAX_DEV) || (n < 0)){ ++ *error_out = "ubd_get_config : device number out of range"; + return(-1); + } + +- ubd = &ubd_dev[major]; ++ dev = &ubd_dev[n]; + spin_lock(&ubd_lock); + +- if(ubd->file == NULL){ +- CONFIG_CHUNK(str, size, n, "", 1); ++ if(dev->file == 
NULL){ ++ CONFIG_CHUNK(str, size, len, "", 1); + goto out; + } + +- CONFIG_CHUNK(str, size, n, ubd->file, 0); ++ CONFIG_CHUNK(str, size, len, dev->file, 0); + +- if(ubd->cow.file != NULL){ +- CONFIG_CHUNK(str, size, n, ",", 0); +- CONFIG_CHUNK(str, size, n, ubd->cow.file, 1); ++ if(dev->cow.file != NULL){ ++ CONFIG_CHUNK(str, size, len, ",", 0); ++ CONFIG_CHUNK(str, size, len, dev->cow.file, 1); + } +- else CONFIG_CHUNK(str, size, n, "", 1); ++ else CONFIG_CHUNK(str, size, len, "", 1); + + out: + spin_unlock(&ubd_lock); +- return(n); ++ return(len); + } + + static int ubd_remove(char *str) +@@ -607,11 +653,9 @@ + struct ubd *dev; + int n, err = -ENODEV; + +- if(!isdigit(*str)) +- return(err); /* it should be a number 0-7/a-h */ ++ n = parse_unit(&str); + +- n = *str - '0'; +- if(n >= MAX_DEV) ++ if((n < 0) || (n >= MAX_DEV)) + return(err); + + dev = &ubd_dev[n]; +@@ -672,7 +716,7 @@ + + elevator_init(ubd_queue, &elevator_noop); + +- if (fake_major != 0) { ++ if (fake_major != MAJOR_NR) { + char name[sizeof("ubd_nnn\0")]; + + snprintf(name, sizeof(name), "ubd_%d", fake_major); +@@ -717,15 +761,9 @@ + { + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ubd *dev = disk->private_data; +- int err = -EISDIR; +- +- if(dev->is_dir == 1) +- goto out; ++ int err = 0; + +- err = 0; + if(dev->count == 0){ +- dev->openflags = dev->boot_openflags; +- + err = ubd_open_dev(dev); + if(err){ + printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", +@@ -799,15 +837,6 @@ + + if(req->rq_status == RQ_INACTIVE) return(1); + +- if(dev->is_dir){ +- strcpy(req->buffer, "HOSTFS:"); +- strcat(req->buffer, dev->file); +- spin_lock(&ubd_io_lock); +- end_request(req, 1); +- spin_unlock(&ubd_io_lock); +- return(1); +- } +- + if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ + printk("Write attempted on readonly ubd device %s\n", + disk->disk_name); +diff -Naur a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c +--- a/arch/um/drivers/ubd_user.c Fri Aug 15 15:04:51 2003 ++++ 
b/arch/um/drivers/ubd_user.c Fri Aug 15 15:10:54 2003 +@@ -24,142 +24,24 @@ + #include "user.h" + #include "ubd_user.h" + #include "os.h" ++#include "cow.h" + + #include + #include +-#if __BYTE_ORDER == __BIG_ENDIAN +-# define ntohll(x) (x) +-# define htonll(x) (x) +-#elif __BYTE_ORDER == __LITTLE_ENDIAN +-# define ntohll(x) bswap_64(x) +-# define htonll(x) bswap_64(x) +-#else +-#error "__BYTE_ORDER not defined" +-#endif +- +-#define PATH_LEN_V1 256 +- +-struct cow_header_v1 { +- int magic; +- int version; +- char backing_file[PATH_LEN_V1]; +- time_t mtime; +- __u64 size; +- int sectorsize; +-}; +- +-#define PATH_LEN_V2 MAXPATHLEN +- +-struct cow_header_v2 { +- unsigned long magic; +- unsigned long version; +- char backing_file[PATH_LEN_V2]; +- time_t mtime; +- __u64 size; +- int sectorsize; +-}; +- +-union cow_header { +- struct cow_header_v1 v1; +- struct cow_header_v2 v2; +-}; +- +-#define COW_MAGIC 0x4f4f4f4d /* MOOO */ +-#define COW_VERSION 2 +- +-static void sizes(__u64 size, int sectorsize, int bitmap_offset, +- unsigned long *bitmap_len_out, int *data_offset_out) +-{ +- *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize); +- +- *data_offset_out = bitmap_offset + *bitmap_len_out; +- *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize; +- *data_offset_out *= sectorsize; +-} +- +-static int read_cow_header(int fd, int *magic_out, char **backing_file_out, +- time_t *mtime_out, __u64 *size_out, +- int *sectorsize_out, int *bitmap_offset_out) +-{ +- union cow_header *header; +- char *file; +- int err, n; +- unsigned long version, magic; +- +- header = um_kmalloc(sizeof(*header)); +- if(header == NULL){ +- printk("read_cow_header - Failed to allocate header\n"); +- return(-ENOMEM); +- } +- err = -EINVAL; +- n = read(fd, header, sizeof(*header)); +- if(n < offsetof(typeof(header->v1), backing_file)){ +- printk("read_cow_header - short header\n"); +- goto out; +- } +- +- magic = header->v1.magic; +- if(magic == COW_MAGIC) { +- version = 
header->v1.version; +- } +- else if(magic == ntohl(COW_MAGIC)){ +- version = ntohl(header->v1.version); +- } +- else goto out; +- +- *magic_out = COW_MAGIC; +- +- if(version == 1){ +- if(n < sizeof(header->v1)){ +- printk("read_cow_header - failed to read V1 header\n"); +- goto out; +- } +- *mtime_out = header->v1.mtime; +- *size_out = header->v1.size; +- *sectorsize_out = header->v1.sectorsize; +- *bitmap_offset_out = sizeof(header->v1); +- file = header->v1.backing_file; +- } +- else if(version == 2){ +- if(n < sizeof(header->v2)){ +- printk("read_cow_header - failed to read V2 header\n"); +- goto out; +- } +- *mtime_out = ntohl(header->v2.mtime); +- *size_out = ntohll(header->v2.size); +- *sectorsize_out = ntohl(header->v2.sectorsize); +- *bitmap_offset_out = sizeof(header->v2); +- file = header->v2.backing_file; +- } +- else { +- printk("read_cow_header - invalid COW version\n"); +- goto out; +- } +- err = -ENOMEM; +- *backing_file_out = uml_strdup(file); +- if(*backing_file_out == NULL){ +- printk("read_cow_header - failed to allocate backing file\n"); +- goto out; +- } +- err = 0; +- out: +- kfree(header); +- return(err); +-} + + static int same_backing_files(char *from_cmdline, char *from_cow, char *cow) + { +- struct stat buf1, buf2; ++ struct stat64 buf1, buf2; + + if(from_cmdline == NULL) return(1); + if(!strcmp(from_cmdline, from_cow)) return(1); + +- if(stat(from_cmdline, &buf1) < 0){ ++ if(stat64(from_cmdline, &buf1) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cmdline, + errno); + return(1); + } +- if(stat(from_cow, &buf2) < 0){ ++ if(stat64(from_cow, &buf2) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cow, errno); + return(1); + } +@@ -178,6 +60,7 @@ + long long actual; + int err; + ++ printk("%ld", htonll(size)); + if(stat64(file, &buf) < 0){ + printk("Failed to stat backing file \"%s\", errno = %d\n", + file, errno); +@@ -215,118 +98,6 @@ + return(0); + } + +-static int absolutize(char *to, int size, char *from) +-{ +- char 
save_cwd[256], *slash; +- int remaining; +- +- if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) { +- printk("absolutize : unable to get cwd - errno = %d\n", errno); +- return(-1); +- } +- slash = strrchr(from, '/'); +- if(slash != NULL){ +- *slash = '\0'; +- if(chdir(from)){ +- *slash = '/'; +- printk("absolutize : Can't cd to '%s' - errno = %d\n", +- from, errno); +- return(-1); +- } +- *slash = '/'; +- if(getcwd(to, size) == NULL){ +- printk("absolutize : unable to get cwd of '%s' - " +- "errno = %d\n", from, errno); +- return(-1); +- } +- remaining = size - strlen(to); +- if(strlen(slash) + 1 > remaining){ +- printk("absolutize : unable to fit '%s' into %d " +- "chars\n", from, size); +- return(-1); +- } +- strcat(to, slash); +- } +- else { +- if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){ +- printk("absolutize : unable to fit '%s' into %d " +- "chars\n", from, size); +- return(-1); +- } +- strcpy(to, save_cwd); +- strcat(to, "/"); +- strcat(to, from); +- } +- chdir(save_cwd); +- return(0); +-} +- +-static int write_cow_header(char *cow_file, int fd, char *backing_file, +- int sectorsize, long long *size) +-{ +- struct cow_header_v2 *header; +- struct stat64 buf; +- int err; +- +- err = os_seek_file(fd, 0); +- if(err != 0){ +- printk("write_cow_header - lseek failed, errno = %d\n", errno); +- return(-errno); +- } +- +- err = -ENOMEM; +- header = um_kmalloc(sizeof(*header)); +- if(header == NULL){ +- printk("Failed to allocate COW V2 header\n"); +- goto out; +- } +- header->magic = htonl(COW_MAGIC); +- header->version = htonl(COW_VERSION); +- +- err = -EINVAL; +- if(strlen(backing_file) > sizeof(header->backing_file) - 1){ +- printk("Backing file name \"%s\" is too long - names are " +- "limited to %d characters\n", backing_file, +- sizeof(header->backing_file) - 1); +- goto out_free; +- } +- +- if(absolutize(header->backing_file, sizeof(header->backing_file), +- backing_file)) +- goto out_free; +- +- err = stat64(header->backing_file, &buf); +- if(err < 0){ 
+- printk("Stat of backing file '%s' failed, errno = %d\n", +- header->backing_file, errno); +- err = -errno; +- goto out_free; +- } +- +- err = os_file_size(header->backing_file, size); +- if(err){ +- printk("Couldn't get size of backing file '%s', errno = %d\n", +- header->backing_file, -*size); +- goto out_free; +- } +- +- header->mtime = htonl(buf.st_mtime); +- header->size = htonll(*size); +- header->sectorsize = htonl(sectorsize); +- +- err = write(fd, header, sizeof(*header)); +- if(err != sizeof(*header)){ +- printk("Write of header to new COW file '%s' failed, " +- "errno = %d\n", cow_file, errno); +- goto out_free; +- } +- err = 0; +- out_free: +- kfree(header); +- out: +- return(err); +-} +- + int open_ubd_file(char *file, struct openflags *openflags, + char **backing_file_out, int *bitmap_offset_out, + unsigned long *bitmap_len_out, int *data_offset_out, +@@ -346,10 +117,17 @@ + if((fd = os_open_file(file, *openflags, mode)) < 0) + return(fd); + } ++ ++ err = os_lock_file(fd, openflags->w); ++ if(err){ ++ printk("Failed to lock '%s', errno = %d\n", file, -err); ++ goto error; ++ } ++ + if(backing_file_out == NULL) return(fd); + +- err = read_cow_header(fd, &magic, &backing_file, &mtime, &size, +- §orsize, bitmap_offset_out); ++ err = read_cow_header(file_reader, &fd, &magic, &backing_file, &mtime, ++ &size, §orsize, bitmap_offset_out); + if(err && (*backing_file_out != NULL)){ + printk("Failed to read COW header from COW file \"%s\", " + "errno = %d\n", file, err); +@@ -376,12 +154,12 @@ + if(err) goto error; + } + +- sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, +- data_offset_out); ++ cow_sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, ++ data_offset_out); + + return(fd); + error: +- close(fd); ++ os_close_file(fd); + return(err); + } + +@@ -389,10 +167,7 @@ + int sectorsize, int *bitmap_offset_out, + unsigned long *bitmap_len_out, int *data_offset_out) + { +- __u64 blocks; +- long zero; +- int err, fd, i; +- long long size; 
++ int err, fd; + + flags.c = 1; + fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); +@@ -403,29 +178,12 @@ + goto out; + } + +- err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size); +- if(err) goto out_close; +- +- blocks = (size + sectorsize - 1) / sectorsize; +- blocks = (blocks + sizeof(long) * 8 - 1) / (sizeof(long) * 8); +- zero = 0; +- for(i = 0; i < blocks; i++){ +- err = write(fd, &zero, sizeof(zero)); +- if(err != sizeof(zero)){ +- printk("Write of bitmap to new COW file '%s' failed, " +- "errno = %d\n", cow_file, errno); +- goto out_close; +- } +- } +- +- sizes(size, sectorsize, sizeof(struct cow_header_v2), +- bitmap_len_out, data_offset_out); +- *bitmap_offset_out = sizeof(struct cow_header_v2); +- +- return(fd); +- +- out_close: +- close(fd); ++ err = init_cow_file(fd, cow_file, backing_file, sectorsize, ++ bitmap_offset_out, bitmap_len_out, ++ data_offset_out); ++ if(!err) ++ return(fd); ++ os_close_file(fd); + out: + return(err); + } +@@ -448,14 +206,6 @@ + else return(n); + } + +-int ubd_is_dir(char *file) +-{ +- struct stat64 buf; +- +- if(stat64(file, &buf) < 0) return(0); +- return(S_ISDIR(buf.st_mode)); +-} +- + void do_io(struct io_thread_req *req) + { + char *buf; +diff -Naur a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c +--- a/arch/um/drivers/xterm.c Fri Aug 15 15:04:00 2003 ++++ b/arch/um/drivers/xterm.c Fri Aug 15 15:10:18 2003 +@@ -108,7 +108,7 @@ + } + close(fd); + +- fd = create_unix_socket(file, sizeof(file)); ++ fd = create_unix_socket(file, sizeof(file), 1); + if(fd < 0){ + printk("xterm_open : create_unix_socket failed, errno = %d\n", + -fd); +diff -Naur a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c +--- a/arch/um/drivers/xterm_kern.c Fri Aug 15 15:07:37 2003 ++++ b/arch/um/drivers/xterm_kern.c Fri Aug 15 15:13:03 2003 +@@ -5,9 +5,12 @@ + + #include "linux/errno.h" + #include "linux/slab.h" ++#include "linux/signal.h" ++#include "linux/interrupt.h" + #include "asm/semaphore.h" 
+ #include "asm/irq.h" + #include "irq_user.h" ++#include "irq_kern.h" + #include "kern_util.h" + #include "os.h" + #include "xterm.h" +@@ -19,17 +22,18 @@ + int new_fd; + }; + +-static void xterm_interrupt(int irq, void *data, struct pt_regs *regs) ++static irqreturn_t xterm_interrupt(int irq, void *data, struct pt_regs *regs) + { + struct xterm_wait *xterm = data; + int fd; + + fd = os_rcv_fd(xterm->fd, &xterm->pid); + if(fd == -EAGAIN) +- return; ++ return(IRQ_NONE); + + xterm->new_fd = fd; + up(&xterm->sem); ++ return(IRQ_HANDLED); + } + + int xterm_fd(int socket, int *pid_out) +diff -Naur a/arch/um/dyn.lds.S b/arch/um/dyn.lds.S +--- a/arch/um/dyn.lds.S Fri Aug 15 15:06:20 2003 ++++ b/arch/um/dyn.lds.S Fri Aug 15 15:12:31 2003 +@@ -15,7 +15,11 @@ + . = ALIGN(4096); /* Init code and data */ + _stext = .; + __init_begin = .; +- .text.init : { *(.text.init) } ++ .init.text : { ++ _sinittext = .; ++ *(.init.text) ++ _einittext = .; ++ } + + . = ALIGN(4096); + +@@ -67,7 +71,7 @@ + + #include "asm/common.lds.S" + +- .data.init : { *(.data.init) } ++ init.data : { *(.init.data) } + + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but +diff -Naur a/arch/um/include/irq_kern.h b/arch/um/include/irq_kern.h +--- a/arch/um/include/irq_kern.h Wed Dec 31 19:00:00 1969 ++++ b/arch/um/include/irq_kern.h Fri Aug 15 15:11:53 2003 +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __IRQ_KERN_H__ ++#define __IRQ_KERN_H__ ++ ++#include "linux/interrupt.h" ++ ++extern int um_request_irq(unsigned int irq, int fd, int type, ++ irqreturn_t (*handler)(int, void *, ++ struct pt_regs *), ++ unsigned long irqflags, const char * devname, ++ void *dev_id); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h +--- a/arch/um/include/kern_util.h Fri Aug 15 15:05:04 2003 ++++ b/arch/um/include/kern_util.h Fri Aug 15 15:11:18 2003 +@@ -63,10 +63,9 @@ + extern void *syscall_sp(void *t); + extern void syscall_trace(void); + extern int hz(void); +-extern void idle_timer(void); ++extern void uml_idle_timer(void); + extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs); + extern int external_pid(void *t); +-extern int pid_to_processor_id(int pid); + extern void boot_timer_handler(int sig); + extern void interrupt_end(void); + extern void initial_thread_cb(void (*proc)(void *), void *arg); +@@ -90,9 +89,7 @@ + extern char *uml_strdup(char *string); + extern void unprotect_kernel_mem(void); + extern void protect_kernel_mem(void); +-extern void set_kmem_end(unsigned long); + extern void uml_cleanup(void); +-extern int pid_to_processor_id(int pid); + extern void set_current(void *t); + extern void lock_signalled_task(void *t); + extern void IPI_handler(int cpu); +@@ -101,7 +98,9 @@ + extern int clear_user_proc(void *buf, int size); + extern int copy_to_user_proc(void *to, void *from, int size); + extern int copy_from_user_proc(void *to, void *from, int size); ++extern int strlen_user_proc(char *str); + extern void bus_handler(int sig, union uml_pt_regs *regs); ++extern void winch(int sig, union uml_pt_regs *regs); + extern long execute_syscall(void *r); + extern int smp_sigio_handler(void); + extern void *get_current(void); +diff -Naur a/arch/um/include/line.h b/arch/um/include/line.h +--- a/arch/um/include/line.h Fri Aug 15 15:07:40 2003 ++++ b/arch/um/include/line.h Fri Aug 15 15:13:11 2003 +@@ 
-9,12 +9,14 @@ + #include "linux/list.h" + #include "linux/workqueue.h" + #include "linux/tty.h" ++#include "linux/interrupt.h" + #include "asm/semaphore.h" + #include "chan_user.h" + #include "mconsole_kern.h" + + struct line_driver { + char *name; ++ char *device_name; + char *devfs_name; + short major; + short minor_start; +@@ -67,8 +69,9 @@ + + #define LINES_INIT(n) { num : n } + +-extern void line_interrupt(int irq, void *data, struct pt_regs *unused); +-extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused); ++extern irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused); ++extern irqreturn_t line_write_interrupt(int irq, void *data, ++ struct pt_regs *unused); + extern void line_close(struct line *lines, struct tty_struct *tty); + extern int line_open(struct line *lines, struct tty_struct *tty, + struct chan_opts *opts); +diff -Naur a/arch/um/include/mconsole.h b/arch/um/include/mconsole.h +--- a/arch/um/include/mconsole.h Fri Aug 15 15:05:26 2003 ++++ b/arch/um/include/mconsole.h Fri Aug 15 15:11:43 2003 +@@ -77,6 +77,7 @@ + extern void mconsole_cad(struct mc_request *req); + extern void mconsole_stop(struct mc_request *req); + extern void mconsole_go(struct mc_request *req); ++extern void mconsole_log(struct mc_request *req); + + extern int mconsole_get_request(int fd, struct mc_request *req); + extern int mconsole_notify(char *sock_name, int type, const void *data, +diff -Naur a/arch/um/include/mem.h b/arch/um/include/mem.h +--- a/arch/um/include/mem.h Fri Aug 15 15:09:22 2003 ++++ b/arch/um/include/mem.h Fri Aug 15 15:14:01 2003 +@@ -13,7 +13,6 @@ + }; + + extern void set_usable_vm(unsigned long start, unsigned long end); +-extern void set_kmem_end(unsigned long new); + + #endif + +diff -Naur a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h +--- a/arch/um/include/mem_user.h Fri Aug 15 15:07:31 2003 ++++ b/arch/um/include/mem_user.h Fri Aug 15 15:12:54 2003 +@@ -51,9 +51,6 @@ + + extern int 
init_mem_user(void); + extern int create_mem_file(unsigned long len); +-extern void setup_range(int fd, char *driver, unsigned long start, +- unsigned long pfn, unsigned long total, int need_vm, +- struct mem_region *region, void *reserved); + extern void setup_memory(void *entry); + extern unsigned long find_iomem(char *driver, unsigned long *len_out); + extern int init_maps(struct mem_region *region); +diff -Naur a/arch/um/include/os.h b/arch/um/include/os.h +--- a/arch/um/include/os.h Fri Aug 15 15:04:50 2003 ++++ b/arch/um/include/os.h Fri Aug 15 15:10:48 2003 +@@ -103,10 +103,11 @@ + extern int os_shutdown_socket(int fd, int r, int w); + extern void os_close_file(int fd); + extern int os_rcv_fd(int fd, int *helper_pid_out); +-extern int create_unix_socket(char *file, int len); ++extern int create_unix_socket(char *file, int len, int close_on_exec); + extern int os_connect_socket(char *name); + extern int os_file_type(char *file); + extern int os_file_mode(char *file, struct openflags *mode_out); ++extern int os_lock_file(int fd, int excl); + + extern unsigned long os_process_pc(int pid); + extern int os_process_parent(int pid); +@@ -120,6 +121,7 @@ + extern int os_protect_memory(void *addr, unsigned long len, + int r, int w, int x); + extern int os_unmap_memory(void *addr, int len); ++extern void os_flush_stdout(void); + + #endif + +diff -Naur a/arch/um/include/sysdep-i386/sigcontext.h b/arch/um/include/sysdep-i386/sigcontext.h +--- a/arch/um/include/sysdep-i386/sigcontext.h Fri Aug 15 15:07:37 2003 ++++ b/arch/um/include/sysdep-i386/sigcontext.h Fri Aug 15 15:13:03 2003 +@@ -28,8 +28,8 @@ + */ + #define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0) + +-/* These are General Protection and Page Fault */ +-#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14)) ++/* This is Page Fault */ ++#define SEGV_IS_FIXABLE(trap) (trap == 14) + + #define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc))) + +diff -Naur a/arch/um/include/ubd_user.h 
b/arch/um/include/ubd_user.h +--- a/arch/um/include/ubd_user.h Fri Aug 15 15:06:34 2003 ++++ b/arch/um/include/ubd_user.h Fri Aug 15 15:12:37 2003 +@@ -39,7 +39,6 @@ + extern int write_ubd_fs(int fd, char *buffer, int len); + extern int start_io_thread(unsigned long sp, int *fds_out); + extern void do_io(struct io_thread_req *req); +-extern int ubd_is_dir(char *file); + + static inline int ubd_test_bit(__u64 bit, unsigned char *data) + { +diff -Naur a/arch/um/include/user.h b/arch/um/include/user.h +--- a/arch/um/include/user.h Fri Aug 15 15:03:58 2003 ++++ b/arch/um/include/user.h Fri Aug 15 15:10:14 2003 +@@ -14,7 +14,7 @@ + extern void kfree(void *ptr); + extern int in_aton(char *str); + extern int open_gdb_chan(void); +- ++extern int strlcpy(char *, const char *, int); + #endif + + /* +diff -Naur a/arch/um/include/user_util.h b/arch/um/include/user_util.h +--- a/arch/um/include/user_util.h Fri Aug 15 15:04:33 2003 ++++ b/arch/um/include/user_util.h Fri Aug 15 15:10:32 2003 +@@ -59,7 +59,6 @@ + extern void *add_signal_handler(int sig, void (*handler)(int)); + extern int start_fork_tramp(void *arg, unsigned long temp_stack, + int clone_flags, int (*tramp)(void *)); +-extern int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags); + extern int linux_main(int argc, char **argv); + extern void set_cmdline(char *cmd); + extern void input_cb(void (*proc)(void *), void *arg, int arg_len); +@@ -90,7 +89,8 @@ + extern int arch_fixup(unsigned long address, void *sc_ptr); + extern void forward_pending_sigio(int target); + extern int can_do_skas(void); +- ++extern void arch_init_thread(void); ++ + #endif + + /* +diff -Naur a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile +--- a/arch/um/kernel/Makefile Fri Aug 15 15:07:32 2003 ++++ b/arch/um/kernel/Makefile Fri Aug 15 15:12:57 2003 +@@ -19,6 +19,8 @@ + obj-$(CONFIG_MODE_TT) += tt/ + obj-$(CONFIG_MODE_SKAS) += skas/ + ++clean-files := config.c ++ + user-objs-$(CONFIG_TTY_LOG) += tty_log.o + + USER_OBJS := 
$(filter %_user.o,$(obj-y)) $(user-objs-y) config.o helper.o \ +@@ -43,17 +45,13 @@ + $(obj)/frame.o: $(src)/frame.c + $(CC) $(CFLAGS_$(notdir $@)) -c -o $@ $< + +-QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; while() { $$_ =~ s/CONFIG/$$config/; print $$_ }' ++QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while() { $$_ =~ s/CONFIG/$$config/; print $$_ }' + + $(obj)/config.c : $(src)/config.c.in $(TOPDIR)/.config + $(PERL) -e $(QUOTE) < $(src)/config.c.in > $@ + + $(obj)/config.o : $(obj)/config.c + +-clean: +- rm -f config.c +- for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done +- + modules: + + fastdep: +diff -Naur a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in +--- a/arch/um/kernel/config.c.in Fri Aug 15 15:07:37 2003 ++++ b/arch/um/kernel/config.c.in Fri Aug 15 15:13:03 2003 +@@ -7,9 +7,7 @@ + #include + #include "init.h" + +-static __initdata char *config = " +-CONFIG +-"; ++static __initdata char *config = "CONFIG"; + + static int __init print_config(char *line, int *add) + { +diff -Naur a/arch/um/kernel/exec_kern.c b/arch/um/kernel/exec_kern.c +--- a/arch/um/kernel/exec_kern.c Fri Aug 15 15:04:54 2003 ++++ b/arch/um/kernel/exec_kern.c Fri Aug 15 15:11:03 2003 +@@ -32,10 +32,15 @@ + CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp); + } + ++extern void log_exec(char **argv, void *tty); ++ + static int execve1(char *file, char **argv, char **env) + { + int error; + ++#ifdef CONFIG_TTY_LOG ++ log_exec(argv, current->tty); ++#endif + error = do_execve(file, argv, env, ¤t->thread.regs); + if (error == 0){ + current->ptrace &= ~PT_DTRACE; +diff -Naur a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c +--- a/arch/um/kernel/init_task.c Fri Aug 15 15:09:24 2003 ++++ b/arch/um/kernel/init_task.c Fri Aug 15 15:14:04 2003 +@@ -17,6 +17,7 @@ + struct mm_struct init_mm = INIT_MM(init_mm); + static struct files_struct init_files = INIT_FILES; + static 
struct signal_struct init_signals = INIT_SIGNALS(init_signals); ++static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); + + /* + * Initial task structure. +@@ -38,26 +39,12 @@ + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +-struct task_struct *alloc_task_struct(void) +-{ +- return((struct task_struct *) +- __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER)); +-} +- + void unprotect_stack(unsigned long stack) + { + protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE, + 1, 1, 0, 1); + } + +-void free_task_struct(struct task_struct *task) +-{ +- /* free_pages decrements the page counter and only actually frees +- * the pages if they are now not accessed by anything. +- */ +- free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER); +-} +- + /* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically +diff -Naur a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c +--- a/arch/um/kernel/irq.c Fri Aug 15 15:07:53 2003 ++++ b/arch/um/kernel/irq.c Fri Aug 15 15:13:18 2003 +@@ -28,6 +28,7 @@ + #include "user_util.h" + #include "kern_util.h" + #include "irq_user.h" ++#include "irq_kern.h" + + static void register_irq_proc (unsigned int irq); + +@@ -82,65 +83,52 @@ + end_none + }; + +-/* Not changed */ +-volatile unsigned long irq_err_count; +- + /* + * Generic, controller-independent functions: + */ + +-int get_irq_list(char *buf) ++int show_interrupts(struct seq_file *p, void *v) + { + int i, j; +- unsigned long flags; + struct irqaction * action; +- char *p = buf; ++ unsigned long flags; + +- p += sprintf(p, " "); +- for (j=0; jtypename); +- p += sprintf(p, " %s", action->name); ++ seq_printf(p, " %14s", irq_desc[i].handler->typename); ++ seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) +- p += sprintf(p, ", %s", action->name); +- *p++ = '\n'; +- end: ++ seq_printf(p, 
", %s", action->name); ++ ++ seq_putc(p, '\n'); ++skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } +- p += sprintf(p, "\n"); +-#ifdef notdef +-#ifdef CONFIG_SMP +- p += sprintf(p, "LOC: "); +- for (j = 0; j < num_online_cpus(); j++) +- p += sprintf(p, "%10u ", +- apic_timer_irqs[cpu_logical_map(j)]); +- p += sprintf(p, "\n"); +-#endif +-#endif +- p += sprintf(p, "ERR: %10lu\n", irq_err_count); +- return p - buf; +-} +- ++ seq_printf(p, "NMI: "); ++ for (j = 0; j < NR_CPUS; j++) ++ if (cpu_online(j)) ++ seq_printf(p, "%10u ", nmi_count(j)); ++ seq_putc(p, '\n'); + +-int show_interrupts(struct seq_file *p, void *v) +-{ +- return(0); ++ return 0; + } + + /* +@@ -281,13 +269,12 @@ + * 0 return value means that this irq is already being + * handled by some other CPU. (or is disabled) + */ +- int cpu = smp_processor_id(); + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + irq_enter(); +- kstat_cpu(cpu).irqs[irq]++; ++ kstat_this_cpu.irqs[irq]++; + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* +@@ -384,7 +371,7 @@ + */ + + int request_irq(unsigned int irq, +- void (*handler)(int, void *, struct pt_regs *), ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char * devname, + void *dev_id) +@@ -430,15 +417,19 @@ + } + + int um_request_irq(unsigned int irq, int fd, int type, +- void (*handler)(int, void *, struct pt_regs *), ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, + void *dev_id) + { +- int retval; ++ int err; + +- retval = request_irq(irq, handler, irqflags, devname, dev_id); +- if(retval) return(retval); +- return(activate_fd(irq, fd, type, dev_id)); ++ err = request_irq(irq, handler, irqflags, devname, dev_id); ++ if(err) ++ return(err); ++ ++ if(fd != -1) ++ err = activate_fd(irq, fd, type, dev_id); ++ return(err); + } + + /* this was setup_x86_irq but it seems pretty generic */ +diff -Naur 
a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c +--- a/arch/um/kernel/mem.c Fri Aug 15 15:05:20 2003 ++++ b/arch/um/kernel/mem.c Fri Aug 15 15:11:21 2003 +@@ -119,11 +119,6 @@ + return(kmem_top); + } + +-void set_kmem_end(unsigned long new) +-{ +- kmem_top = new; +-} +- + #ifdef CONFIG_HIGHMEM + /* Changed during early boot */ + pte_t *kmap_pte; +@@ -218,7 +213,7 @@ + if(regions[i] == NULL) break; + } + if(i == NREGIONS){ +- printk("setup_range : no free regions\n"); ++ printk("setup_one_range : no free regions\n"); + i = -1; + goto out; + } +@@ -227,7 +222,9 @@ + fd = create_mem_file(len); + + if(region == NULL){ +- region = alloc_bootmem_low_pages(sizeof(*region)); ++ if(kmalloc_ok) ++ region = kmalloc(sizeof(*region), GFP_KERNEL); ++ else region = alloc_bootmem_low_pages(sizeof(*region)); + if(region == NULL) + panic("Failed to allocating mem_region"); + } +@@ -528,9 +525,9 @@ + return(NREGIONS); + } + +-void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn, +- unsigned long len, int need_vm, struct mem_region *region, +- void *reserved) ++static void setup_range(int fd, char *driver, unsigned long start, ++ unsigned long pfn, unsigned long len, int need_vm, ++ struct mem_region *region, void *reserved) + { + int i, cur; + +diff -Naur a/arch/um/kernel/mem_user.c b/arch/um/kernel/mem_user.c +--- a/arch/um/kernel/mem_user.c Fri Aug 15 15:06:25 2003 ++++ b/arch/um/kernel/mem_user.c Fri Aug 15 15:12:36 2003 +@@ -111,6 +111,11 @@ + offset = 0; + } + ++ if(offset >= region->len){ ++ printf("%d bytes of physical memory is insufficient\n", ++ region->len); ++ exit(1); ++ } + loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, region->fd, offset); + if(loc != start){ +@@ -122,26 +127,26 @@ + + static int __init parse_iomem(char *str, int *add) + { +- struct stat buf; ++ struct stat64 buf; + char *file, *driver; + int fd; + + driver = str; + file = strchr(str,','); + if(file == NULL){ +- printk("parse_iomem : 
failed to parse iomem\n"); ++ printf("parse_iomem : failed to parse iomem\n"); + return(1); + } + *file = '\0'; + file++; + fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0); + if(fd < 0){ +- printk("parse_iomem - Couldn't open io file, errno = %d\n", ++ printf("parse_iomem - Couldn't open io file, errno = %d\n", + errno); + return(1); + } +- if(fstat(fd, &buf) < 0) { +- printk("parse_iomem - cannot fstat file, errno = %d\n", errno); ++ if(fstat64(fd, &buf) < 0) { ++ printf("parse_iomem - cannot fstat file, errno = %d\n", errno); + return(1); + } + add_iomem(driver, fd, buf.st_size); +diff -Naur a/arch/um/kernel/process.c b/arch/um/kernel/process.c +--- a/arch/um/kernel/process.c Fri Aug 15 15:08:15 2003 ++++ b/arch/um/kernel/process.c Fri Aug 15 15:13:26 2003 +@@ -72,7 +72,6 @@ + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGUSR2, (__sighandler_t) sig_handler, + SA_NOMASK | flags, -1); +- (void) CHOOSE_MODE(signal(SIGCHLD, SIG_IGN), (void *) 0); + signal(SIGHUP, SIG_IGN); + + init_irq_signals(altstack); +@@ -127,7 +126,8 @@ + if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", + errno); + if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) +- panic("outer trampoline didn't exit with SIGKILL"); ++ panic("outer trampoline didn't exit with SIGKILL, " ++ "status = %d", status); + + return(arg.pid); + } +diff -Naur a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c +--- a/arch/um/kernel/process_kern.c Fri Aug 15 15:06:24 2003 ++++ b/arch/um/kernel/process_kern.c Fri Aug 15 15:12:35 2003 +@@ -52,17 +52,12 @@ + + struct task_struct *get_task(int pid, int require) + { +- struct task_struct *task, *ret; ++ struct task_struct *ret; + +- ret = NULL; + read_lock(&tasklist_lock); +- for_each_process(task){ +- if(task->pid == pid){ +- ret = task; +- break; +- } +- } ++ ret = find_task_by_pid(pid); + read_unlock(&tasklist_lock); ++ + if(require && (ret == NULL)) panic("get_task couldn't find a task\n"); + return(ret); + } 
+@@ -103,13 +98,14 @@ + + int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + { +- struct task_struct *p; ++ int pid; + + current->thread.request.u.thread.proc = fn; + current->thread.request.u.thread.arg = arg; +- p = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL); +- if(IS_ERR(p)) panic("do_fork failed in kernel_thread"); +- return(p->pid); ++ pid = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL); ++ if(pid < 0) ++ panic("do_fork failed in kernel_thread, errno = %d", pid); ++ return(pid); + } + + void switch_mm(struct mm_struct *prev, struct mm_struct *next, +@@ -157,6 +153,10 @@ + return(current); + } + ++void prepare_to_copy(struct task_struct *tsk) ++{ ++} ++ + int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + unsigned long stack_top, struct task_struct * p, + struct pt_regs *regs) +@@ -190,7 +190,7 @@ + + void default_idle(void) + { +- idle_timer(); ++ uml_idle_timer(); + + atomic_inc(&init_mm.mm_count); + current->mm = &init_mm; +@@ -363,6 +363,11 @@ + return(clear_user(buf, size)); + } + ++int strlen_user_proc(char *str) ++{ ++ return(strlen_user(str)); ++} ++ + int smp_sigio_handler(void) + { + #ifdef CONFIG_SMP +diff -Naur a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c +--- a/arch/um/kernel/ptrace.c Fri Aug 15 15:04:36 2003 ++++ b/arch/um/kernel/ptrace.c Fri Aug 15 15:10:33 2003 +@@ -311,11 +311,8 @@ + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ +- current->exit_code = SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) +- ? 0x80 : 0); +- current->state = TASK_STOPPED; +- notify_parent(current, SIGCHLD); +- schedule(); ++ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ++ ? 
0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do +diff -Naur a/arch/um/kernel/sigio_kern.c b/arch/um/kernel/sigio_kern.c +--- a/arch/um/kernel/sigio_kern.c Fri Aug 15 15:04:52 2003 ++++ b/arch/um/kernel/sigio_kern.c Fri Aug 15 15:10:59 2003 +@@ -6,18 +6,21 @@ + #include "linux/kernel.h" + #include "linux/list.h" + #include "linux/slab.h" +-#include "asm/irq.h" ++#include "linux/signal.h" ++#include "linux/interrupt.h" + #include "init.h" + #include "sigio.h" + #include "irq_user.h" ++#include "irq_kern.h" + + /* Protected by sigio_lock() called from write_sigio_workaround */ + static int sigio_irq_fd = -1; + +-void sigio_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t sigio_interrupt(int irq, void *data, struct pt_regs *unused) + { + read_sigio_fd(sigio_irq_fd); + reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); ++ return(IRQ_HANDLED); + } + + int write_sigio_irq(int fd) +diff -Naur a/arch/um/kernel/signal_kern.c b/arch/um/kernel/signal_kern.c +--- a/arch/um/kernel/signal_kern.c Fri Aug 15 15:06:38 2003 ++++ b/arch/um/kernel/signal_kern.c Fri Aug 15 15:12:40 2003 +@@ -36,7 +36,7 @@ + if(sig == SIGSEGV){ + struct k_sigaction *ka; + +- ka = ¤t->sig->action[SIGSEGV - 1]; ++ ka = ¤t->sighand->action[SIGSEGV - 1]; + ka->sa.sa_handler = SIG_DFL; + } + force_sig(SIGSEGV, current); +@@ -142,7 +142,7 @@ + return(0); + + /* Whee! Actually deliver the signal. 
*/ +- ka = ¤t->sig->action[sig -1 ]; ++ ka = ¤t->sighand->action[sig -1 ]; + err = handle_signal(regs, sig, ka, &info, oldset, error); + if(!err) return(1); + +@@ -201,7 +201,7 @@ + } + } + +-int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize) ++int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) + { + sigset_t saveset, newset; + +@@ -227,6 +227,42 @@ + } + } + ++int sys_sigaction(int sig, const struct old_sigaction __user *act, ++ struct old_sigaction __user *oact) ++{ ++ struct k_sigaction new_ka, old_ka; ++ int ret; ++ ++ if (act) { ++ old_sigset_t mask; ++ if (verify_area(VERIFY_READ, act, sizeof(*act)) || ++ __get_user(new_ka.sa.sa_handler, &act->sa_handler) || ++ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) ++ return -EFAULT; ++ __get_user(new_ka.sa.sa_flags, &act->sa_flags); ++ __get_user(mask, &act->sa_mask); ++ siginitset(&new_ka.sa.sa_mask, mask); ++ } ++ ++ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); ++ ++ if (!ret && oact) { ++ if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || ++ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || ++ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) ++ return -EFAULT; ++ __put_user(old_ka.sa.sa_flags, &oact->sa_flags); ++ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); ++ } ++ ++ return ret; ++} ++ ++int sys_sigaltstack(const stack_t *uss, stack_t *uoss) ++{ ++ return(do_sigaltstack(uss, uoss, PT_REGS_SP(¤t->thread.regs))); ++} ++ + static int copy_sc_from_user(struct pt_regs *to, void *from, + struct arch_frame_data *arch) + { +@@ -239,8 +275,8 @@ + + int sys_sigreturn(struct pt_regs regs) + { +- void *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); +- void *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); ++ void __user *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); ++ void __user *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); + int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); + + spin_lock_irq(¤t->sighand->siglock); +@@ -257,7 +293,8 @@ + 
+ int sys_rt_sigreturn(struct pt_regs regs) + { +- struct ucontext *uc = sp_to_uc(PT_REGS_SP(¤t->thread.regs)); ++ unsigned long sp = PT_REGS_SP(¤t->thread.regs); ++ struct ucontext __user *uc = sp_to_uc(sp); + void *fp; + int sig_size = _NSIG_WORDS * sizeof(unsigned long); + +diff -Naur a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile +--- a/arch/um/kernel/skas/Makefile Fri Aug 15 15:05:00 2003 ++++ b/arch/um/kernel/skas/Makefile Fri Aug 15 15:11:08 2003 +@@ -7,18 +7,22 @@ + process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o \ + sys-$(SUBARCH)/ + ++host-progs := util/mk_ptregs ++clean-files := include/skas_ptregs.h ++ + USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o + USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + +-include/skas_ptregs.h : util/mk_ptregs +- util/mk_ptregs > $@ +- +-util/mk_ptregs : +- $(MAKE) -C util ++$(TOPDIR)/arch/um/include/skas_ptregs.h : $(src)/util/mk_ptregs ++ @echo -n ' Generating $@' ++ @$< > $@.tmp ++ @if [ -r $@ ] && cmp -s $@ $@.tmp; then \ ++ echo ' (unchanged)'; \ ++ rm -f $@.tmp; \ ++ else \ ++ echo ' (updated)'; \ ++ mv -f $@.tmp $@; \ ++ fi + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +- +-clean : +- $(MAKE) -C util clean +- $(RM) -f include/skas_ptregs.h +diff -Naur a/arch/um/kernel/skas/include/mode.h b/arch/um/kernel/skas/include/mode.h +--- a/arch/um/kernel/skas/include/mode.h Fri Aug 15 15:06:34 2003 ++++ b/arch/um/kernel/skas/include/mode.h Fri Aug 15 15:12:37 2003 +@@ -20,6 +20,7 @@ + extern void halt_skas(void); + extern void reboot_skas(void); + extern void kill_off_processes_skas(void); ++extern int is_skas_winch(int pid, int fd, void *data); + + #endif + +diff -Naur a/arch/um/kernel/skas/include/uaccess.h b/arch/um/kernel/skas/include/uaccess.h +--- a/arch/um/kernel/skas/include/uaccess.h Fri Aug 15 15:05:28 2003 ++++ b/arch/um/kernel/skas/include/uaccess.h Fri Aug 15 15:11:44 2003 +@@ -19,7 +19,7 @@ + #define 
access_ok_skas(type, addr, size) \ + ((segment_eq(get_fs(), KERNEL_DS)) || \ + (((unsigned long) (addr) < TASK_SIZE) && \ +- ((unsigned long) (addr) + (size) < TASK_SIZE))) ++ ((unsigned long) (addr) + (size) <= TASK_SIZE))) + + static inline int verify_area_skas(int type, const void * addr, + unsigned long size) +diff -Naur a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c +--- a/arch/um/kernel/skas/process.c Fri Aug 15 15:08:54 2003 ++++ b/arch/um/kernel/skas/process.c Fri Aug 15 15:13:46 2003 +@@ -4,6 +4,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -24,6 +25,16 @@ + #include "os.h" + #include "proc_mm.h" + #include "skas_ptrace.h" ++#include "chan_user.h" ++ ++int is_skas_winch(int pid, int fd, void *data) ++{ ++ if(pid != getpid()) ++ return(0); ++ ++ register_winch_irq(-1, fd, -1, data); ++ return(1); ++} + + unsigned long exec_regs[FRAME_SIZE]; + unsigned long exec_fp_regs[HOST_FP_SIZE]; +@@ -72,8 +83,6 @@ + handle_syscall(regs); + } + +-int userspace_pid; +- + static int userspace_tramp(void *arg) + { + init_new_thread_signals(0); +@@ -83,6 +92,8 @@ + return(0); + } + ++int userspace_pid; ++ + void start_userspace(void) + { + void *stack; +@@ -149,6 +160,7 @@ + case SIGILL: + case SIGBUS: + case SIGFPE: ++ case SIGWINCH: + user_signal(WSTOPSIG(status), regs); + break; + default: +@@ -328,7 +340,8 @@ + int new_mm(int from) + { + struct proc_mm_op copy; +- int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0); ++ int n, fd = os_open_file("/proc/mm", ++ of_cloexec(of_write(OPENFLAGS())), 0); + + if(fd < 0) + return(-errno); +@@ -342,6 +355,7 @@ + printk("new_mm : /proc/mm copy_segments failed, " + "errno = %d\n", errno); + } ++ + return(fd); + } + +diff -Naur a/arch/um/kernel/skas/process_kern.c b/arch/um/kernel/skas/process_kern.c +--- a/arch/um/kernel/skas/process_kern.c Fri Aug 15 15:04:51 2003 ++++ b/arch/um/kernel/skas/process_kern.c Fri Aug 15 15:10:56 2003 +@@ -61,9 +61,8 @@ + 
thread_wait(&current->thread.mode.skas.switch_buf, + current->thread.mode.skas.fork_buf); + +-#ifdef CONFIG_SMP +- schedule_tail(NULL); +-#endif ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + + n = run_kernel_thread(fn, arg, &current->thread.exec_buf); +@@ -93,9 +92,8 @@ + current->thread.mode.skas.fork_buf); + + force_flush_all(); +-#ifdef CONFIG_SMP +- schedule_tail(current->thread.prev_sched); +-#endif ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + unblock_signals(); + +@@ -164,7 +162,7 @@ + capture_signal_stack(); + + init_new_thread_signals(1); +- idle_timer(); ++ uml_idle_timer(); + + init_task.thread.request.u.thread.proc = start_kernel_proc; + init_task.thread.request.u.thread.arg = NULL; +diff -Naur a/arch/um/kernel/skas/util/mk_ptregs.c b/arch/um/kernel/skas/util/mk_ptregs.c +--- a/arch/um/kernel/skas/util/mk_ptregs.c Fri Aug 15 15:05:20 2003 ++++ b/arch/um/kernel/skas/util/mk_ptregs.c Fri Aug 15 15:11:21 2003 +@@ -1,3 +1,4 @@ ++#include + #include + #include + +diff -Naur a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c +--- a/arch/um/kernel/smp.c Fri Aug 15 15:04:50 2003 ++++ b/arch/um/kernel/smp.c Fri Aug 15 15:10:52 2003 +@@ -140,8 +140,10 @@ + + current->thread.request.u.thread.proc = idle_proc; + current->thread.request.u.thread.arg = (void *) cpu; +- new_task = do_fork(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, NULL); +- if(IS_ERR(new_task)) panic("do_fork failed in idle_thread"); ++ new_task = copy_process(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, ++ NULL); ++ if(IS_ERR(new_task)) ++ panic("copy_process failed in idle_thread"); + + cpu_tasks[cpu] = ((struct cpu_task) + { .pid = new_task->thread.mode.tt.extern_pid, +@@ -150,6 +152,7 @@ + CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, + sizeof(c)), + ({ panic("skas mode doesn't support SMP"); })); ++ wake_up_forked_process(new_task); +
return(new_task); + } + +@@ -254,15 +257,19 @@ + atomic_inc(&scf_finished); + } + +-int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, +- int wait) ++int smp_call_function_on_cpu(void (*_func)(void *info), void *_info, int wait, ++ unsigned long mask) + { +- int cpus = num_online_cpus() - 1; +- int i; +- +- if (!cpus) +- return 0; ++ int i, cpu, num_cpus; + ++ cpu = get_cpu(); ++ mask &= ~(1UL << cpu); ++ num_cpus = hweight32(mask); ++ if(num_cpus == 0){ ++ put_cpu_no_resched(); ++ return(0); ++ } ++ + spin_lock_bh(&call_lock); + atomic_set(&scf_started, 0); + atomic_set(&scf_finished, 0); +@@ -270,19 +277,25 @@ + info = _info; + + for (i=0;ithread_info->cpu) && +- test_bit(i, &cpu_online_map)) ++ if(cpu_online(i) && ((1UL << i) & mask)) + write(cpu_data[i].ipi_pipe[1], "C", 1); + +- while (atomic_read(&scf_started) != cpus) ++ while(atomic_read(&scf_started) != num_cpus) + barrier(); + +- if (wait) +- while (atomic_read(&scf_finished) != cpus) ++ if(wait) ++ while(atomic_read(&scf_finished) != num_cpus) + barrier(); + + spin_unlock_bh(&call_lock); +- return 0; ++ put_cpu_no_resched(); ++ return(0); ++} ++ ++int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, ++ int wait) ++{ ++ return(smp_call_function_on_cpu(_func, _info, wait, cpu_online_map)); + } + + #endif +diff -Naur a/arch/um/kernel/sys_call_table.c b/arch/um/kernel/sys_call_table.c +--- a/arch/um/kernel/sys_call_table.c Fri Aug 15 15:07:57 2003 ++++ b/arch/um/kernel/sys_call_table.c Fri Aug 15 15:13:24 2003 +@@ -219,6 +219,18 @@ + extern syscall_handler_t sys_gettid; + extern syscall_handler_t sys_readahead; + extern syscall_handler_t sys_tkill; ++extern syscall_handler_t sys_setxattr; ++extern syscall_handler_t sys_lsetxattr; ++extern syscall_handler_t sys_fsetxattr; ++extern syscall_handler_t sys_getxattr; ++extern syscall_handler_t sys_lgetxattr; ++extern syscall_handler_t sys_fgetxattr; ++extern syscall_handler_t sys_listxattr; ++extern 
syscall_handler_t sys_llistxattr; ++extern syscall_handler_t sys_flistxattr; ++extern syscall_handler_t sys_removexattr; ++extern syscall_handler_t sys_lremovexattr; ++extern syscall_handler_t sys_fremovexattr; + extern syscall_handler_t sys_sendfile64; + extern syscall_handler_t sys_futex; + extern syscall_handler_t sys_sched_setaffinity; +@@ -235,6 +247,19 @@ + extern syscall_handler_t sys_epoll_wait; + extern syscall_handler_t sys_remap_file_pages; + extern syscall_handler_t sys_set_tid_address; ++extern syscall_handler_t sys_timer_create; ++extern syscall_handler_t sys_timer_settime; ++extern syscall_handler_t sys_timer_gettime; ++extern syscall_handler_t sys_timer_getoverrun; ++extern syscall_handler_t sys_timer_delete; ++extern syscall_handler_t sys_clock_settime; ++extern syscall_handler_t sys_clock_gettime; ++extern syscall_handler_t sys_clock_getres; ++extern syscall_handler_t sys_clock_nanosleep; ++extern syscall_handler_t sys_statfs64; ++extern syscall_handler_t sys_fstatfs64; ++extern syscall_handler_t sys_tgkill; ++extern syscall_handler_t sys_utimes; + + #ifdef CONFIG_NFSD + #define NFSSERVCTL sys_nfsservctl +@@ -459,18 +484,18 @@ + [ __NR_getdents64 ] = sys_getdents64, + [ __NR_gettid ] = sys_gettid, + [ __NR_readahead ] = sys_readahead, +- [ __NR_setxattr ] = sys_ni_syscall, +- [ __NR_lsetxattr ] = sys_ni_syscall, +- [ __NR_fsetxattr ] = sys_ni_syscall, +- [ __NR_getxattr ] = sys_ni_syscall, +- [ __NR_lgetxattr ] = sys_ni_syscall, +- [ __NR_fgetxattr ] = sys_ni_syscall, +- [ __NR_listxattr ] = sys_ni_syscall, +- [ __NR_llistxattr ] = sys_ni_syscall, +- [ __NR_flistxattr ] = sys_ni_syscall, +- [ __NR_removexattr ] = sys_ni_syscall, +- [ __NR_lremovexattr ] = sys_ni_syscall, +- [ __NR_fremovexattr ] = sys_ni_syscall, ++ [ __NR_setxattr ] = sys_setxattr, ++ [ __NR_lsetxattr ] = sys_lsetxattr, ++ [ __NR_fsetxattr ] = sys_fsetxattr, ++ [ __NR_getxattr ] = sys_getxattr, ++ [ __NR_lgetxattr ] = sys_lgetxattr, ++ [ __NR_fgetxattr ] = sys_fgetxattr, ++ [ 
__NR_listxattr ] = sys_listxattr, ++ [ __NR_llistxattr ] = sys_llistxattr, ++ [ __NR_flistxattr ] = sys_flistxattr, ++ [ __NR_removexattr ] = sys_removexattr, ++ [ __NR_lremovexattr ] = sys_lremovexattr, ++ [ __NR_fremovexattr ] = sys_fremovexattr, + [ __NR_tkill ] = sys_tkill, + [ __NR_sendfile64 ] = sys_sendfile64, + [ __NR_futex ] = sys_futex, +@@ -488,6 +513,19 @@ + [ __NR_epoll_wait ] = sys_epoll_wait, + [ __NR_remap_file_pages ] = sys_remap_file_pages, + [ __NR_set_tid_address ] = sys_set_tid_address, ++ [ __NR_timer_create ] = sys_timer_create, ++ [ __NR_timer_settime ] = sys_timer_settime, ++ [ __NR_timer_gettime ] = sys_timer_gettime, ++ [ __NR_timer_getoverrun ] = sys_timer_getoverrun, ++ [ __NR_timer_delete ] = sys_timer_delete, ++ [ __NR_clock_settime ] = sys_clock_settime, ++ [ __NR_clock_gettime ] = sys_clock_gettime, ++ [ __NR_clock_getres ] = sys_clock_getres, ++ [ __NR_clock_nanosleep ] = sys_clock_nanosleep, ++ [ __NR_statfs64 ] = sys_statfs64, ++ [ __NR_fstatfs64 ] = sys_fstatfs64, ++ [ __NR_tgkill ] = sys_tgkill, ++ [ __NR_utimes ] = sys_utimes, + + ARCH_SYSCALLS + [ LAST_SYSCALL + 1 ... NR_syscalls ] = +diff -Naur a/arch/um/kernel/syscall_kern.c b/arch/um/kernel/syscall_kern.c +--- a/arch/um/kernel/syscall_kern.c Fri Aug 15 15:07:37 2003 ++++ b/arch/um/kernel/syscall_kern.c Fri Aug 15 15:13:03 2003 +@@ -35,39 +35,40 @@ + + long sys_fork(void) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL); ++ ret = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL); + current->thread.forking = 0; +- return(IS_ERR(p) ? 
PTR_ERR(p) : p->pid); ++ return(ret); + } + +-long sys_clone(unsigned long clone_flags, unsigned long newsp) ++long sys_clone(unsigned long clone_flags, unsigned long newsp, ++ int *parent_tid, int *child_tid) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(clone_flags, newsp, NULL, 0, NULL, NULL); ++ ret = do_fork(clone_flags, newsp, NULL, 0, parent_tid, child_tid); + current->thread.forking = 0; +- return(IS_ERR(p) ? PTR_ERR(p) : p->pid); ++ return(ret); + } + + long sys_vfork(void) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, NULL); ++ ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, ++ NULL); + current->thread.forking = 0; +- return(IS_ERR(p) ? PTR_ERR(p) : p->pid); ++ return(ret); + } + + /* common code for old and new mmaps */ +-static inline long do_mmap2( +- unsigned long addr, unsigned long len, +- unsigned long prot, unsigned long flags, +- unsigned long fd, unsigned long pgoff) ++long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, unsigned long fd, ++ unsigned long pgoff) + { + int error = -EBADF; + struct file * file = NULL; +@@ -79,9 +80,9 @@ + goto out; + } + +- down_write(&current->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); +- up_write(&current->mm->mmap_sem); ++ down_write(&mm->mmap_sem); ++ error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff); ++ up_write(&mm->mmap_sem); + + if (file) + fput(file); +@@ -93,7 +94,7 @@ + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) + { +- return do_mmap2(addr, len, prot, flags, fd, pgoff); ++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff); + } + + /* +@@ -120,7 +121,8 @@ + if (offset & ~PAGE_MASK) + goto out; + +- err = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); ++ err = do_mmap2(current->mm,
addr, len, prot, flags, fd, ++ offset >> PAGE_SHIFT); + out: + return err; + } +@@ -141,37 +143,6 @@ + return error; + } + +-int sys_sigaction(int sig, const struct old_sigaction *act, +- struct old_sigaction *oact) +-{ +- struct k_sigaction new_ka, old_ka; +- int ret; +- +- if (act) { +- old_sigset_t mask; +- if (verify_area(VERIFY_READ, act, sizeof(*act)) || +- __get_user(new_ka.sa.sa_handler, &act->sa_handler) || +- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) +- return -EFAULT; +- __get_user(new_ka.sa.sa_flags, &act->sa_flags); +- __get_user(mask, &act->sa_mask); +- siginitset(&new_ka.sa.sa_mask, mask); +- } +- +- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); +- +- if (!ret && oact) { +- if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || +- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || +- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) +- return -EFAULT; +- __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); +- } +- +- return ret; +-} +- + /* + * sys_ipc() is the de-multiplexer for the SysV IPC calls.. 
+ * +@@ -253,7 +224,7 @@ + return sys_shmctl (first, second, + (struct shmid_ds *) ptr); + default: +- return -EINVAL; ++ return -ENOSYS; + } + } + +@@ -302,11 +273,6 @@ + return error; + } + +-int sys_sigaltstack(const stack_t *uss, stack_t *uoss) +-{ +- return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs))); +-} +- + long execute_syscall(void *r) + { + return(CHOOSE_MODE_PROC(execute_syscall_tt, execute_syscall_skas, r)); +diff -Naur a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c +--- a/arch/um/kernel/sysrq.c Fri Aug 15 15:05:01 2003 ++++ b/arch/um/kernel/sysrq.c Fri Aug 15 15:11:13 2003 +@@ -11,6 +11,14 @@ + #include "sysrq.h" + #include "user_util.h" + ++void show_stack(struct task_struct *task, unsigned long *sp) ++{ ++ if(task) ++ show_trace_task(task); ++ else ++ show_trace(sp); ++} ++ + void show_trace(unsigned long * stack) + { + int i; +diff -Naur a/arch/um/kernel/time.c b/arch/um/kernel/time.c +--- a/arch/um/kernel/time.c Fri Aug 15 15:04:49 2003 ++++ b/arch/um/kernel/time.c Fri Aug 15 15:10:46 2003 +@@ -15,12 +15,16 @@ + #include "process.h" + #include "signal_user.h" + #include "time_user.h" ++#include "kern_constants.h" + + extern struct timeval xtime; + ++struct timeval local_offset = { 0, 0 }; ++ + void timer(void) + { + gettimeofday(&xtime, NULL); ++ timeradd(&xtime, &local_offset, &xtime); + } + + void set_interval(int timer_type) +@@ -65,7 +69,7 @@ + errno); + } + +-void idle_timer(void) ++void uml_idle_timer(void) + { + if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR) + panic("Couldn't unset SIGVTALRM handler"); +@@ -82,8 +86,6 @@ + set_interval(ITIMER_VIRTUAL); + } + +-struct timeval local_offset = { 0, 0 }; +- + void do_gettimeofday(struct timeval *tv) + { + unsigned long flags; +@@ -100,7 +102,7 @@ + unsigned long flags; + struct timeval tv_in; + +- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) ++ if ((unsigned long) tv->tv_nsec >= UM_NSEC_PER_SEC) + return -EINVAL; + + tv_in.tv_sec = tv->tv_sec; +@@ -110,6 +112,8 @@ +
gettimeofday(&now, NULL); + timersub(&tv_in, &now, &local_offset); + time_unlock(flags); ++ ++ return(0); + } + + void idle_sleep(int secs) +diff -Naur a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c +--- a/arch/um/kernel/time_kern.c Fri Aug 15 15:07:19 2003 ++++ b/arch/um/kernel/time_kern.c Fri Aug 15 15:12:46 2003 +@@ -55,12 +55,13 @@ + do_timer(&regs); + } + +-void um_timer(int irq, void *dev, struct pt_regs *regs) ++irqreturn_t um_timer(int irq, void *dev, struct pt_regs *regs) + { + do_timer(regs); +- write_seqlock(&xtime_lock); ++ write_seqlock_irq(&xtime_lock); + timer(); +- write_sequnlock(&xtime_lock); ++ write_sequnlock_irq(&xtime_lock); ++ return(IRQ_HANDLED); + } + + long um_time(int * tloc) +@@ -78,12 +79,12 @@ + long um_stime(int * tptr) + { + int value; +- struct timeval new; ++ struct timespec new; + + if (get_user(value, tptr)) + return -EFAULT; + new.tv_sec = value; +- new.tv_usec = 0; ++ new.tv_nsec = 0; + do_settimeofday(&new); + return 0; + } +@@ -122,7 +123,9 @@ + void timer_handler(int sig, union uml_pt_regs *regs) + { + #ifdef CONFIG_SMP ++ local_irq_disable(); + update_process_times(user_context(UPT_SP(regs))); ++ local_irq_enable(); + #endif + if(current->thread_info->cpu == 0) + timer_irq(regs); +diff -Naur a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c +--- a/arch/um/kernel/trap_kern.c Fri Aug 15 15:04:01 2003 ++++ b/arch/um/kernel/trap_kern.c Fri Aug 15 15:10:18 2003 +@@ -16,6 +16,7 @@ + #include "asm/tlbflush.h" + #include "asm/a.out.h" + #include "asm/current.h" ++#include "asm/irq.h" + #include "user_util.h" + #include "kern_util.h" + #include "kern.h" +@@ -180,6 +181,11 @@ + else relay_signal(sig, regs); + } + ++void winch(int sig, union uml_pt_regs *regs) ++{ ++ do_IRQ(WINCH_IRQ, regs); ++} ++ + void trap_init(void) + { + } +diff -Naur a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c +--- a/arch/um/kernel/trap_user.c Fri Aug 15 15:05:45 2003 ++++ b/arch/um/kernel/trap_user.c Fri Aug 15 15:11:52 2003 +@@
-82,6 +82,8 @@ + .is_irq = 0 }, + [ SIGILL ] { .handler = relay_signal, + .is_irq = 0 }, ++ [ SIGWINCH ] { .handler = winch, ++ .is_irq = 1 }, + [ SIGBUS ] { .handler = bus_handler, + .is_irq = 0 }, + [ SIGSEGV] { .handler = segv_handler, +diff -Naur a/arch/um/kernel/tt/include/uaccess.h b/arch/um/kernel/tt/include/uaccess.h +--- a/arch/um/kernel/tt/include/uaccess.h Fri Aug 15 15:07:25 2003 ++++ b/arch/um/kernel/tt/include/uaccess.h Fri Aug 15 15:12:52 2003 +@@ -46,18 +46,20 @@ + + static inline int copy_from_user_tt(void *to, const void *from, int n) + { +- return(access_ok_tt(VERIFY_READ, from, n) ? +- __do_copy_from_user(to, from, n, +- &current->thread.fault_addr, +- &current->thread.fault_catcher) : n); ++ if(!access_ok_tt(VERIFY_READ, from, n)) ++ return(n); ++ ++ return(__do_copy_from_user(to, from, n, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); + } + + static inline int copy_to_user_tt(void *to, const void *from, int n) + { +- return(access_ok_tt(VERIFY_WRITE, to, n) ? +- __do_copy_to_user(to, from, n, +- &current->thread.fault_addr, +- &current->thread.fault_catcher) : n); ++ if(!access_ok_tt(VERIFY_WRITE, to, n)) ++ return(n); ++ ++ return(__do_copy_to_user(to, from, n, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); + } + + extern int __do_strncpy_from_user(char *dst, const char *src, size_t n, +@@ -67,7 +69,9 @@ + { + int n; + +- if(!access_ok_tt(VERIFY_READ, src, 1)) return(-EFAULT); ++ if(!access_ok_tt(VERIFY_READ, src, 1)) ++ return(-EFAULT); ++ + n = __do_strncpy_from_user(dst, src, count, + &current->thread.fault_addr, + &current->thread.fault_catcher); +@@ -87,10 +91,11 @@ + + static inline int clear_user_tt(void *mem, int len) + { +- return(access_ok_tt(VERIFY_WRITE, mem, len) ?
+- __do_clear_user(mem, len, +- &current->thread.fault_addr, +- &current->thread.fault_catcher) : len); ++ if(!access_ok_tt(VERIFY_WRITE, mem, len)) ++ return(len); ++ ++ return(__do_clear_user(mem, len, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); + } + + extern int __do_strnlen_user(const char *str, unsigned long n, +diff -Naur a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c +--- a/arch/um/kernel/tt/process_kern.c Fri Aug 15 15:07:55 2003 ++++ b/arch/um/kernel/tt/process_kern.c Fri Aug 15 15:13:23 2003 +@@ -104,7 +104,10 @@ + + void release_thread_tt(struct task_struct *task) + { +- os_kill_process(task->thread.mode.tt.extern_pid, 0); ++ int pid = task->thread.mode.tt.extern_pid; ++ ++ if(os_getpid() != pid) ++ os_kill_process(pid, 0); + } + + void exit_thread_tt(void) +@@ -125,27 +128,27 @@ + UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); + suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); + +- block_signals(); ++ force_flush_all(); ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ + init_new_thread_signals(1); +-#ifdef CONFIG_SMP +- schedule_tail(current->thread.prev_sched); +-#endif + enable_timer(); + free_page(current->thread.temp_stack); + set_cmdline("(kernel thread)"); +- force_flush_all(); + +- current->thread.prev_sched = NULL; + change_sig(SIGUSR1, 1); + change_sig(SIGVTALRM, 1); + change_sig(SIGPROF, 1); +- unblock_signals(); ++ local_irq_enable(); + if(!run_kernel_thread(fn, arg, &current->thread.exec_buf)) + do_exit(0); + } + + static int new_thread_proc(void *stack) + { ++ local_irq_disable(); + init_new_thread_stack(stack, new_thread_handler); + os_usr1_process(os_getpid()); + return(0); +@@ -165,35 +168,32 @@ + UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); + suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); + +-#ifdef CONFIG_SMP +- schedule_tail(NULL); +-#endif ++ force_flush_all(); ++ if(current->thread.prev_sched != NULL)
++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ + enable_timer(); + change_sig(SIGVTALRM, 1); + local_irq_enable(); +- force_flush_all(); + if(current->mm != current->parent->mm) + protect_memory(uml_reserved, high_physmem - uml_reserved, 1, + 1, 0, 1); + task_protections((unsigned long) current->thread_info); + +- current->thread.prev_sched = NULL; +- + free_page(current->thread.temp_stack); ++ local_irq_disable(); + change_sig(SIGUSR1, 0); + set_user_mode(current); + } + +-static int sigusr1 = SIGUSR1; +- + int fork_tramp(void *stack) + { +- int sig = sigusr1; +- + local_irq_disable(); ++ arch_init_thread(); + init_new_thread_stack(stack, finish_fork_handler); + +- kill(os_getpid(), sig); ++ os_usr1_process(os_getpid()); + return(0); + } + +diff -Naur a/arch/um/kernel/tt/ptproxy/proxy.c b/arch/um/kernel/tt/ptproxy/proxy.c +--- a/arch/um/kernel/tt/ptproxy/proxy.c Fri Aug 15 15:07:01 2003 ++++ b/arch/um/kernel/tt/ptproxy/proxy.c Fri Aug 15 15:12:44 2003 +@@ -293,10 +293,10 @@ + } + + char gdb_init_string[] = +-"att 1 +-b panic +-b stop +-handle SIGWINCH nostop noprint pass ++"att 1 \n\ ++b panic \n\ ++b stop \n\ ++handle SIGWINCH nostop noprint pass \n\ + "; + + int start_debugger(char *prog, int startup, int stop, int *fd_out) +diff -Naur a/arch/um/kernel/tt/tracer.c b/arch/um/kernel/tt/tracer.c +--- a/arch/um/kernel/tt/tracer.c Fri Aug 15 15:03:51 2003 ++++ b/arch/um/kernel/tt/tracer.c Fri Aug 15 15:10:12 2003 +@@ -39,7 +39,7 @@ + return(0); + + register_winch_irq(tracer_winch[0], fd, -1, data); +- return(0); ++ return(1); + } + + static void tracer_winch_handler(int sig) +@@ -401,7 +401,7 @@ + + if(!strcmp(line, "go")) debug_stop = 0; + else if(!strcmp(line, "parent")) debug_parent = 1; +- else printk("Unknown debug option : '%s'\n", line); ++ else printf("Unknown debug option : '%s'\n", line); + + line = next; + } +diff -Naur a/arch/um/kernel/tt/uaccess_user.c b/arch/um/kernel/tt/uaccess_user.c +--- 
a/arch/um/kernel/tt/uaccess_user.c Fri Aug 15 15:05:00 2003 ++++ b/arch/um/kernel/tt/uaccess_user.c Fri Aug 15 15:11:10 2003 +@@ -8,15 +8,20 @@ + #include + #include "user_util.h" + #include "uml_uaccess.h" ++#include "task.h" ++#include "kern_util.h" + + int __do_copy_from_user(void *to, const void *from, int n, + void **fault_addr, void **fault_catcher) + { ++ struct tt_regs save = TASK_REGS(get_current())->tt; + unsigned long fault; + int faulted; + + fault = __do_user_copy(to, from, n, fault_addr, fault_catcher, + __do_copy, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ + if(!faulted) return(0); + else return(n - (fault - (unsigned long) from)); + } +@@ -29,11 +34,14 @@ + int __do_strncpy_from_user(char *dst, const char *src, unsigned long count, + void **fault_addr, void **fault_catcher) + { ++ struct tt_regs save = TASK_REGS(get_current())->tt; + unsigned long fault; + int faulted; + + fault = __do_user_copy(dst, src, count, fault_addr, fault_catcher, + __do_strncpy, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ + if(!faulted) return(strlen(dst)); + else return(-1); + } +@@ -46,11 +54,14 @@ + int __do_clear_user(void *mem, unsigned long len, + void **fault_addr, void **fault_catcher) + { ++ struct tt_regs save = TASK_REGS(get_current())->tt; + unsigned long fault; + int faulted; + + fault = __do_user_copy(mem, NULL, len, fault_addr, fault_catcher, + __do_clear, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ + if(!faulted) return(0); + else return(len - (fault - (unsigned long) mem)); + } +@@ -58,6 +69,7 @@ + int __do_strnlen_user(const char *str, unsigned long n, + void **fault_addr, void **fault_catcher) + { ++ struct tt_regs save = TASK_REGS(get_current())->tt; + int ret; + unsigned long *faddrp = (unsigned long *)fault_addr; + jmp_buf jbuf; +@@ -71,6 +83,8 @@ + } + *fault_addr = NULL; + *fault_catcher = NULL; ++ ++ TASK_REGS(get_current())->tt = save; + return ret; + } + +diff -Naur a/arch/um/kernel/tty_log.c 
b/arch/um/kernel/tty_log.c +--- a/arch/um/kernel/tty_log.c Fri Aug 15 15:07:04 2003 ++++ b/arch/um/kernel/tty_log.c Fri Aug 15 15:12:44 2003 +@@ -13,6 +13,7 @@ + #include + #include "init.h" + #include "user.h" ++#include "kern_util.h" + #include "os.h" + + #define TTY_LOG_DIR "./" +@@ -24,29 +25,40 @@ + #define TTY_LOG_OPEN 1 + #define TTY_LOG_CLOSE 2 + #define TTY_LOG_WRITE 3 ++#define TTY_LOG_EXEC 4 ++ ++#define TTY_READ 1 ++#define TTY_WRITE 2 + + struct tty_log_buf { + int what; + unsigned long tty; + int len; ++ int direction; ++ unsigned long sec; ++ unsigned long usec; + }; + +-int open_tty_log(void *tty) ++int open_tty_log(void *tty, void *current_tty) + { + struct timeval tv; + struct tty_log_buf data; + char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")]; + int fd; + ++ gettimeofday(&tv, NULL); + if(tty_log_fd != -1){ +- data = ((struct tty_log_buf) { what : TTY_LOG_OPEN, +- tty : (unsigned long) tty, +- len : 0 }); ++ data = ((struct tty_log_buf) { .what = TTY_LOG_OPEN, ++ .tty = (unsigned long) tty, ++ .len = sizeof(current_tty), ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); + write(tty_log_fd, &data, sizeof(data)); ++ write(tty_log_fd, &current_tty, data.len); + return(tty_log_fd); + } + +- gettimeofday(&tv, NULL); + sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, + (unsigned int) tv.tv_usec); + +@@ -62,30 +74,114 @@ + void close_tty_log(int fd, void *tty) + { + struct tty_log_buf data; ++ struct timeval tv; + + if(tty_log_fd != -1){ +- data = ((struct tty_log_buf) { what : TTY_LOG_CLOSE, +- tty : (unsigned long) tty, +- len : 0 }); ++ gettimeofday(&tv, NULL); ++ data = ((struct tty_log_buf) { .what = TTY_LOG_CLOSE, ++ .tty = (unsigned long) tty, ++ .len = 0, ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); + write(tty_log_fd, &data, sizeof(data)); + return; + } + close(fd); + } + +-int write_tty_log(int fd, char *buf, int len, void *tty) ++static int log_chunk(int fd, const char *buf,
int len) + { ++ int total = 0, try, missed, n; ++ char chunk[64]; ++ ++ while(len > 0){ ++ try = (len > sizeof(chunk)) ? sizeof(chunk) : len; ++ missed = copy_from_user_proc(chunk, (char *) buf, try); ++ try -= missed; ++ n = write(fd, chunk, try); ++ if(n != try) ++ return(-errno); ++ if(missed != 0) ++ return(-EFAULT); ++ ++ len -= try; ++ total += try; ++ buf += try; ++ } ++ ++ return(total); ++} ++ ++int write_tty_log(int fd, const char *buf, int len, void *tty, int is_read) ++{ ++ struct timeval tv; + struct tty_log_buf data; ++ int direction; + + if(fd == tty_log_fd){ +- data = ((struct tty_log_buf) { what : TTY_LOG_WRITE, +- tty : (unsigned long) tty, +- len : len }); ++ gettimeofday(&tv, NULL); ++ direction = is_read ? TTY_READ : TTY_WRITE; ++ data = ((struct tty_log_buf) { .what = TTY_LOG_WRITE, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = direction, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); + write(tty_log_fd, &data, sizeof(data)); + } +- return(write(fd, buf, len)); ++ ++ return(log_chunk(fd, buf, len)); + } + ++void log_exec(char **argv, void *tty) ++{ ++ struct timeval tv; ++ struct tty_log_buf data; ++ char **ptr,*arg; ++ int len; ++ ++ if(tty_log_fd == -1) return; ++ ++ gettimeofday(&tv, NULL); ++ ++ len = 0; ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ len += strlen_user_proc(arg); ++ } ++ ++ data = ((struct tty_log_buf) { .what = TTY_LOG_EXEC, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ log_chunk(tty_log_fd, arg, strlen_user_proc(arg)); ++ } ++} ++ ++extern void register_tty_logger(int (*opener)(void *, void *), ++ int (*writer)(int, const char *, int, ++ void *, int), ++ void (*closer)(int, void *)); ++ ++static int 
register_logger(void) ++{ ++ register_tty_logger(open_tty_log, write_tty_log, close_tty_log); ++ return(0); ++} ++ ++__uml_initcall(register_logger); ++ + static int __init set_tty_log_dir(char *name, int *add) + { + tty_log_dir = name; +@@ -104,7 +200,7 @@ + + tty_log_fd = strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ +- printk("set_tty_log_fd - strtoul failed on '%s'\n", name); ++ printf("set_tty_log_fd - strtoul failed on '%s'\n", name); + tty_log_fd = -1; + } + return 0; +diff -Naur a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c +--- a/arch/um/kernel/um_arch.c Fri Aug 15 15:07:48 2003 ++++ b/arch/um/kernel/um_arch.c Fri Aug 15 15:13:14 2003 +@@ -38,13 +38,18 @@ + #include "mode_kern.h" + #include "mode.h" + +-#define DEFAULT_COMMAND_LINE "root=6200" ++#define DEFAULT_COMMAND_LINE "root=ubd0" + + struct cpuinfo_um boot_cpu_data = { + .loops_per_jiffy = 0, + .ipi_pipe = { -1, -1 } + }; + ++/* Placeholder to make UML link until the vsyscall stuff is actually ++ * implemented ++ */ ++void *__kernel_vsyscall; ++ + unsigned long thread_saved_pc(struct task_struct *task) + { + return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas, +@@ -61,10 +66,14 @@ + return 0; + #endif + +- seq_printf(m, "bogomips\t: %lu.%02lu\n", ++ seq_printf(m, "processor\t: %d\n", index); ++ seq_printf(m, "vendor_id\t: User Mode Linux\n"); ++ seq_printf(m, "model name\t: UML\n"); ++ seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas")); ++ seq_printf(m, "host\t\t: %s\n", host_info); ++ seq_printf(m, "bogomips\t: %lu.%02lu\n\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); +- seq_printf(m, "host\t\t: %s\n", host_info); + + return(0); + } +@@ -134,12 +143,12 @@ + if(umid != NULL){ + snprintf(argv1_begin, + (argv1_end - argv1_begin) * sizeof(*ptr), +- "(%s)", umid); ++ "(%s) ", umid); + ptr = &argv1_begin[strlen(argv1_begin)]; + } + else ptr = argv1_begin; + +- snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), " [%s]", cmd); ++ 
snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd); + memset(argv1_begin + strlen(argv1_begin), '\0', + argv1_end - argv1_begin - strlen(argv1_begin)); + #endif +@@ -179,7 +188,7 @@ + static int __init uml_ncpus_setup(char *line, int *add) + { + if (!sscanf(line, "%d", &ncpus)) { +- printk("Couldn't parse [%s]\n", line); ++ printf("Couldn't parse [%s]\n", line); + return -1; + } + +@@ -210,7 +219,7 @@ + + static int __init mode_tt_setup(char *line, int *add) + { +- printk("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n"); ++ printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n"); + return(0); + } + +@@ -221,7 +230,7 @@ + + static int __init mode_tt_setup(char *line, int *add) + { +- printk("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n"); ++ printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n"); + return(0); + } + +@@ -369,6 +378,7 @@ + 2 * PAGE_SIZE; + + task_protections((unsigned long) &init_thread_info); ++ os_flush_stdout(); + + return(CHOOSE_MODE(start_uml_tt(), start_uml_skas())); + } +diff -Naur a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c +--- a/arch/um/kernel/umid.c Fri Aug 15 15:08:44 2003 ++++ b/arch/um/kernel/umid.c Fri Aug 15 15:13:39 2003 +@@ -33,18 +33,19 @@ + static int umid_is_random = 1; + static int umid_inited = 0; + +-static int make_umid(void); ++static int make_umid(int (*printer)(const char *fmt, ...)); + +-static int __init set_umid(char *name, int is_random) ++static int __init set_umid(char *name, int is_random, ++ int (*printer)(const char *fmt, ...)) + { + if(umid_inited){ +- printk("Unique machine name can't be set twice\n"); ++ (*printer)("Unique machine name can't be set twice\n"); + return(-1); + } + + if(strlen(name) > UMID_LEN - 1) +- printk("Unique machine name is being truncated to %s " +- "characters\n", UMID_LEN); ++ (*printer)("Unique machine name is being truncated to %s " ++ "characters\n", UMID_LEN); + strlcpy(umid, name, sizeof(umid)); + + umid_is_random = is_random; +@@ -54,7 +55,7 @@ + + static 
int __init set_umid_arg(char *name, int *add) + { +- return(set_umid(name, 0)); ++ return(set_umid(name, 0, printf)); + } + + __uml_setup("umid=", set_umid_arg, +@@ -67,7 +68,7 @@ + { + int n; + +- if(!umid_inited && make_umid()) return(-1); ++ if(!umid_inited && make_umid(printk)) return(-1); + + n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1; + if(n > len){ +@@ -92,14 +93,14 @@ + fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))), + 0644); + if(fd < 0){ +- printk("Open of machine pid file \"%s\" failed - " ++ printf("Open of machine pid file \"%s\" failed - " + "errno = %d\n", file, -fd); + return 0; + } + + sprintf(pid, "%d\n", os_getpid()); + if(write(fd, pid, strlen(pid)) != strlen(pid)) +- printk("Write of pid file failed - errno = %d\n", errno); ++ printf("Write of pid file failed - errno = %d\n", errno); + close(fd); + return 0; + } +@@ -197,7 +198,7 @@ + if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){ + uml_dir = malloc(strlen(name) + 1); + if(uml_dir == NULL){ +- printk("Failed to malloc uml_dir - error = %d\n", ++ printf("Failed to malloc uml_dir - error = %d\n", + errno); + uml_dir = name; + return(0); +@@ -217,7 +218,7 @@ + char *home = getenv("HOME"); + + if(home == NULL){ +- printk("make_uml_dir : no value in environment for " ++ printf("make_uml_dir : no value in environment for " + "$HOME\n"); + exit(1); + } +@@ -239,25 +240,25 @@ + strcpy(uml_dir, dir); + + if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){ +- printk("Failed to mkdir %s - errno = %i\n", uml_dir, errno); ++ printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno); + return(-1); + } + return 0; + } + +-static int __init make_umid(void) ++static int __init make_umid(int (*printer)(const char *fmt, ...)) + { + int fd, err; + char tmp[strlen(uml_dir) + UMID_LEN + 1]; + + strlcpy(tmp, uml_dir, sizeof(tmp)); + +- if(*umid == 0){ ++ if(!umid_inited){ + strcat(tmp, "XXXXXX"); + fd = mkstemp(tmp); + if(fd < 0){ +- printk("make_umid - mkstemp failed, 
errno = %d\n", +- errno); ++ (*printer)("make_umid - mkstemp failed, errno = %d\n", ++ errno); + return(1); + } + +@@ -267,7 +268,7 @@ + * for directories. + */ + unlink(tmp); +- set_umid(&tmp[strlen(uml_dir)], 1); ++ set_umid(&tmp[strlen(uml_dir)], 1, printer); + } + + sprintf(tmp, "%s%s", uml_dir, umid); +@@ -275,14 +276,14 @@ + if((err = mkdir(tmp, 0777)) < 0){ + if(errno == EEXIST){ + if(not_dead_yet(tmp)){ +- printk("umid '%s' is in use\n", umid); ++ (*printer)("umid '%s' is in use\n", umid); + return(-1); + } + err = mkdir(tmp, 0777); + } + } + if(err < 0){ +- printk("Failed to create %s - errno = %d\n", umid, errno); ++ (*printer)("Failed to create %s - errno = %d\n", umid, errno); + return(-1); + } + +@@ -295,7 +296,13 @@ + ); + + __uml_postsetup(make_uml_dir); +-__uml_postsetup(make_umid); ++ ++static int __init make_umid_setup(void) ++{ ++ return(make_umid(printf)); ++} ++ ++__uml_postsetup(make_umid_setup); + __uml_postsetup(create_pid_file); + + /* +diff -Naur a/arch/um/kernel/user_util.c b/arch/um/kernel/user_util.c +--- a/arch/um/kernel/user_util.c Fri Aug 15 15:04:48 2003 ++++ b/arch/um/kernel/user_util.c Fri Aug 15 15:10:41 2003 +@@ -119,17 +119,6 @@ + } + } + +-int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags) +-{ +- int pid; +- +- pid = clone(fn, sp, flags, arg); +- if(pid < 0) return(-1); +- wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL); +- ptrace(PTRACE_CONT, pid, 0, 0); +- return(pid); +-} +- + int raw(int fd, int complain) + { + struct termios tt; +diff -Naur a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c +--- a/arch/um/os-Linux/drivers/tuntap_user.c Fri Aug 15 15:09:23 2003 ++++ b/arch/um/os-Linux/drivers/tuntap_user.c Fri Aug 15 15:14:02 2003 +@@ -142,7 +142,7 @@ + return(-errno); + } + memset(&ifr, 0, sizeof(ifr)); +- ifr.ifr_flags = IFF_TAP; ++ ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); + if(ioctl(pri->fd, TUNSETIFF, (void *) 
&ifr) < 0){ + printk("TUNSETIFF failed, errno = %d", errno); +diff -Naur a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c +--- a/arch/um/os-Linux/file.c Fri Aug 15 15:09:15 2003 ++++ b/arch/um/os-Linux/file.c Fri Aug 15 15:13:54 2003 +@@ -315,7 +315,7 @@ + return(new); + } + +-int create_unix_socket(char *file, int len) ++int create_unix_socket(char *file, int len, int close_on_exec) + { + struct sockaddr_un addr; + int sock, err; +@@ -327,6 +327,10 @@ + return(-errno); + } + ++ if(close_on_exec && fcntl(sock, F_SETFD, 1) < 0) ++ printk("create_unix_socket : Setting FD_CLOEXEC failed, " ++ "errno = %d", errno); ++ + addr.sun_family = AF_UNIX; + + /* XXX Be more careful about overflow */ +@@ -342,6 +346,37 @@ + return(sock); + } + ++void os_flush_stdout(void) ++{ ++ fflush(stdout); ++} ++ ++int os_lock_file(int fd, int excl) ++{ ++ int type = excl ? F_WRLCK : F_RDLCK; ++ struct flock lock = ((struct flock) { .l_type = type, ++ .l_whence = SEEK_SET, ++ .l_start = 0, ++ .l_len = 0 } ); ++ int err, save; ++ ++ err = fcntl(fd, F_SETLK, &lock); ++ if(!err) ++ goto out; ++ ++ save = -errno; ++ err = fcntl(fd, F_GETLK, &lock); ++ if(err){ ++ err = -errno; ++ goto out; ++ } ++ ++ printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid); ++ err = save; ++ out: ++ return(err); ++} ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +diff -Naur a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile +--- a/arch/um/sys-i386/Makefile Fri Aug 15 15:04:47 2003 ++++ b/arch/um/sys-i386/Makefile Fri Aug 15 15:10:35 2003 +@@ -1,7 +1,8 @@ +-obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o module.o \ +- ptrace.o ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o ++obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \ ++ ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o + + obj-$(CONFIG_HIGHMEM) += highmem.o ++obj-$(CONFIG_MODULES) += module.o + + USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o + USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +@@ -9,6 +10,8 @@ + SYMLINKS = semaphore.c highmem.c module.c + SYMLINKS := $(foreach f,$(SYMLINKS),$(src)/$f) + ++clean-files := $(SYMLINKS) ++ + semaphore.c-dir = kernel + highmem.c-dir = mm + module.c-dir = kernel +@@ -24,8 +27,7 @@ + $(SYMLINKS): + $(call make_link,$@) + +-clean: +- $(MAKE) -C util clean ++subdir- := util + + fastdep: + +diff -Naur a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c +--- a/arch/um/sys-i386/bugs.c Fri Aug 15 15:07:41 2003 ++++ b/arch/um/sys-i386/bugs.c Fri Aug 15 15:13:14 2003 +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include "kern_util.h" + #include "user.h" + #include "sysdep/ptrace.h" +@@ -16,8 +17,8 @@ + #define MAXTOKEN 64 + + /* Set during early boot */ +-int cpu_has_cmov = 1; +-int cpu_has_xmm = 0; ++int host_has_cmov = 1; ++int host_has_xmm = 0; + + static char token(int fd, char *buf, int len, char stop) + { +@@ -104,6 +105,25 @@ + return(1); + } + ++static void disable_lcall(void) ++{ ++ struct modify_ldt_ldt_s ldt; ++ int err; ++ ++ bzero(&ldt, sizeof(ldt)); ++ ldt.entry_number = 7; ++ ldt.base_addr = 0; ++ ldt.limit = 0; ++ err = modify_ldt(1, &ldt, sizeof(ldt)); ++ if(err) ++ printk("Failed to disable lcall7 - errno = %d\n", errno); ++} ++ ++void 
arch_init_thread(void) ++{ ++ disable_lcall(); ++} ++ + void arch_check_bugs(void) + { + int have_it; +@@ -113,8 +133,8 @@ + "checks\n"); + return; + } +- if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it; +- if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it; ++ if(check_cpu_feature("cmov", &have_it)) host_has_cmov = have_it; ++ if(check_cpu_feature("xmm", &have_it)) host_has_xmm = have_it; + } + + int arch_handle_signal(int sig, union uml_pt_regs *regs) +@@ -130,18 +150,18 @@ + if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40)) + return(0); + +- if(cpu_has_cmov == 0) ++ if(host_has_cmov == 0) + panic("SIGILL caused by cmov, which this processor doesn't " + "implement, boot a filesystem compiled for older " + "processors"); +- else if(cpu_has_cmov == 1) ++ else if(host_has_cmov == 1) + panic("SIGILL caused by cmov, which this processor claims to " + "implement"); +- else if(cpu_has_cmov == -1) ++ else if(host_has_cmov == -1) + panic("SIGILL caused by cmov, couldn't tell if this processor " + "implements it, boot a filesystem compiled for older " + "processors"); +- else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov); ++ else panic("Bad value for host_has_cmov (%d)", host_has_cmov); + return(0); + } + +diff -Naur a/arch/um/uml.lds.S b/arch/um/uml.lds.S +--- a/arch/um/uml.lds.S Fri Aug 15 15:05:37 2003 ++++ b/arch/um/uml.lds.S Fri Aug 15 15:11:48 2003 +@@ -26,7 +26,11 @@ + . = ALIGN(4096); /* Init code and data */ + _stext = .; + __init_begin = .; +- .text.init : { *(.text.init) } ++ .init.text : { ++ _sinittext = .; ++ *(.init.text) ++ _einittext = .; ++ } + . = ALIGN(4096); + .text : + { +@@ -38,7 +42,7 @@ + + #include "asm/common.lds.S" + +- .data.init : { *(.data.init) } ++ init.data : { *(init.data) } + .data : + { + . 
= ALIGN(KERNEL_STACK_SIZE); /* init_task */ +diff -Naur a/arch/um/util/mk_constants_kern.c b/arch/um/util/mk_constants_kern.c +--- a/arch/um/util/mk_constants_kern.c Fri Aug 15 15:04:15 2003 ++++ b/arch/um/util/mk_constants_kern.c Fri Aug 15 15:10:27 2003 +@@ -1,5 +1,6 @@ + #include "linux/kernel.h" + #include "linux/stringify.h" ++#include "linux/time.h" + #include "asm/page.h" + + extern void print_head(void); +@@ -11,6 +12,7 @@ + { + print_head(); + print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE); ++ + print_constant_str("UM_KERN_EMERG", KERN_EMERG); + print_constant_str("UM_KERN_ALERT", KERN_ALERT); + print_constant_str("UM_KERN_CRIT", KERN_CRIT); +@@ -19,6 +21,8 @@ + print_constant_str("UM_KERN_NOTICE", KERN_NOTICE); + print_constant_str("UM_KERN_INFO", KERN_INFO); + print_constant_str("UM_KERN_DEBUG", KERN_DEBUG); ++ ++ print_constant_int("UM_NSEC_PER_SEC", NSEC_PER_SEC); + print_tail(); + return(0); + } +diff -Naur a/fs/Makefile b/fs/Makefile +--- a/fs/Makefile Fri Aug 15 15:06:45 2003 ++++ b/fs/Makefile Fri Aug 15 15:12:41 2003 +@@ -91,3 +91,5 @@ + obj-$(CONFIG_XFS_FS) += xfs/ + obj-$(CONFIG_AFS_FS) += afs/ + obj-$(CONFIG_BEFS_FS) += befs/ ++obj-$(CONFIG_HOSTFS) += hostfs/ ++obj-$(CONFIG_HPPFS) += hppfs/ +diff -Naur a/fs/hostfs/Makefile b/fs/hostfs/Makefile +--- a/fs/hostfs/Makefile Wed Dec 31 19:00:00 1969 ++++ b/fs/hostfs/Makefile Fri Aug 15 15:10:07 2003 +@@ -0,0 +1,36 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino ++# to __st_ino. It stayed in the same place, so as long as the correct name ++# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa. 
++ ++STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \ ++ echo __)st_ino ++ ++hostfs-objs := hostfs_kern.o hostfs_user.o ++ ++obj-y = ++obj-$(CONFIG_HOSTFS) += hostfs.o ++ ++SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs)) ++ ++USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(SINGLE_OBJS)) ++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) ++ ++USER_CFLAGS += -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD) ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< ++ ++clean: ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: clean +diff -Naur a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h +--- a/fs/hostfs/hostfs.h Wed Dec 31 19:00:00 1969 ++++ b/fs/hostfs/hostfs.h Fri Aug 15 15:10:06 2003 +@@ -0,0 +1,79 @@ ++#ifndef __UM_FS_HOSTFS ++#define __UM_FS_HOSTFS ++ ++#include "os.h" ++ ++/* These are exactly the same definitions as in fs.h, but the names are ++ * changed so that this file can be included in both kernel and user files. 
++ */ ++ ++#define HOSTFS_ATTR_MODE 1 ++#define HOSTFS_ATTR_UID 2 ++#define HOSTFS_ATTR_GID 4 ++#define HOSTFS_ATTR_SIZE 8 ++#define HOSTFS_ATTR_ATIME 16 ++#define HOSTFS_ATTR_MTIME 32 ++#define HOSTFS_ATTR_CTIME 64 ++#define HOSTFS_ATTR_ATIME_SET 128 ++#define HOSTFS_ATTR_MTIME_SET 256 ++#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ ++#define HOSTFS_ATTR_ATTR_FLAG 1024 ++ ++struct hostfs_iattr { ++ unsigned int ia_valid; ++ mode_t ia_mode; ++ uid_t ia_uid; ++ gid_t ia_gid; ++ loff_t ia_size; ++ struct timespec ia_atime; ++ struct timespec ia_mtime; ++ struct timespec ia_ctime; ++ unsigned int ia_attr_flags; ++}; ++ ++extern int stat_file(const char *path, unsigned long long *inode_out, ++ int *mode_out, int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, struct timespec *atime_out, ++ struct timespec *mtime_out, struct timespec *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out); ++extern int access_file(char *path, int r, int w, int x); ++extern int open_file(char *path, int r, int w, int append); ++extern int file_type(const char *path, int *rdev); ++extern void *open_dir(char *path, int *err_out); ++extern char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out); ++extern void close_file(void *stream); ++extern void close_dir(void *stream); ++extern int read_file(int fd, unsigned long long *offset, char *buf, int len); ++extern int write_file(int fd, unsigned long long *offset, const char *buf, ++ int len); ++extern int lseek_file(int fd, long long offset, int whence); ++extern int file_create(char *name, int ur, int uw, int ux, int gr, ++ int gw, int gx, int or, int ow, int ox); ++extern int set_attr(const char *file, struct hostfs_iattr *attrs); ++extern int make_symlink(const char *from, const char *to); ++extern int unlink_file(const char *file); ++extern int do_mkdir(const char *file, int mode); ++extern int do_rmdir(const char *file); ++extern int 
do_mknod(const char *file, int mode, int dev); ++extern int link_file(const char *from, const char *to); ++extern int do_readlink(char *file, char *buf, int size); ++extern int rename_file(char *from, char *to); ++extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c +--- a/fs/hostfs/hostfs_kern.c Wed Dec 31 19:00:00 1969 ++++ b/fs/hostfs/hostfs_kern.c Fri Aug 15 15:10:12 2003 +@@ -0,0 +1,1010 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ * ++ * Ported the filesystem routines to 2.5. 
++ * 2003-02-10 Petr Baudis ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "hostfs.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "user_util.h" ++#include "2_5compat.h" ++#include "init.h" ++ ++struct hostfs_inode_info { ++ char *host_filename; ++ int fd; ++ int mode; ++ struct inode vfs_inode; ++}; ++ ++static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) ++{ ++ return(list_entry(inode, struct hostfs_inode_info, vfs_inode)); ++} ++ ++#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode) ++ ++int hostfs_d_delete(struct dentry *dentry) ++{ ++ return(1); ++} ++ ++struct dentry_operations hostfs_dentry_ops = { ++ .d_delete = hostfs_d_delete, ++}; ++ ++/* Changed in hostfs_args before the kernel starts running */ ++static char *root_ino = "/"; ++static int append = 0; ++ ++#define HOSTFS_SUPER_MAGIC 0x00c0ffee ++ ++static struct inode_operations hostfs_iops; ++static struct inode_operations hostfs_dir_iops; ++static struct address_space_operations hostfs_link_aops; ++ ++static int __init hostfs_args(char *options, int *add) ++{ ++ char *ptr; ++ ++ ptr = strchr(options, ','); ++ if(ptr != NULL) ++ *ptr++ = '\0'; ++ if(*options != '\0') ++ root_ino = options; ++ ++ options = ptr; ++ while(options){ ++ ptr = strchr(options, ','); ++ if(ptr != NULL) ++ *ptr++ = '\0'; ++ if(*options != '\0'){ ++ if(!strcmp(options, "append")) ++ append = 1; ++ else printf("hostfs_args - unsupported option - %s\n", ++ options); ++ } ++ options = ptr; ++ } ++ return(0); ++} ++ ++__uml_setup("hostfs=", hostfs_args, ++"hostfs=,,...\n" ++" This is used to set hostfs parameters. The root directory argument\n" ++" is used to confine all hostfs mounts to within the specified directory\n" ++" tree on the host. 
If this isn't specified, then a user inside UML can\n" ++" mount anything on the host that's accessible to the user that's running\n" ++" it.\n" ++" The only flag currently supported is 'append', which specifies that all\n" ++" files opened by hostfs will be opened in append mode.\n\n" ++); ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ int len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = HOSTFS_I(parent->d_inode)->host_filename; ++ len += strlen(root); ++ name = kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len -= parent->d_name.len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], parent->d_name.name, ++ parent->d_name.len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++static char *inode_name(struct inode *ino, int extra) ++{ ++ struct dentry *dentry; ++ ++ dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); ++ return(dentry_name(dentry, extra)); ++} ++ ++static int read_name(struct inode *ino, char *name) ++{ ++ /* The non-int inode fields are copied into ints by stat_file and ++ * then copied into the inode because passing the actual pointers ++ * in and having them treated as int * breaks on big-endian machines ++ */ ++ int err; ++ int i_mode, i_nlink, i_blksize; ++ unsigned long long i_size; ++ unsigned long long i_ino; ++ unsigned long long i_blocks; ++ ++ err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid, ++ &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime, ++ &ino->i_ctime, &i_blksize, &i_blocks); ++ if(err) ++ return(err); ++ ++ ino->i_ino = i_ino; ++ ino->i_mode = i_mode; ++ ino->i_nlink = i_nlink; ++ ino->i_size = i_size; ++ ino->i_blksize = i_blksize; ++ ino->i_blocks = i_blocks; ++ 
if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid())) ++ ino->i_uid = 0; ++ return(0); ++} ++ ++static char *follow_link(char *link) ++{ ++ int len, n; ++ char *name, *resolved, *end; ++ ++ len = 64; ++ while(1){ ++ n = -ENOMEM; ++ name = kmalloc(len, GFP_KERNEL); ++ if(name == NULL) ++ goto out; ++ ++ n = do_readlink(link, name, len); ++ if(n < len) ++ break; ++ len *= 2; ++ kfree(name); ++ } ++ if(n < 0) ++ goto out_free; ++ ++ if(*name == '/') ++ return(name); ++ ++ end = strrchr(link, '/'); ++ if(end == NULL) ++ return(name); ++ ++ *(end + 1) = '\0'; ++ len = strlen(link) + strlen(name) + 1; ++ ++ resolved = kmalloc(len, GFP_KERNEL); ++ if(resolved == NULL){ ++ n = -ENOMEM; ++ goto out_free; ++ } ++ ++ sprintf(resolved, "%s%s", link, name); ++ kfree(name); ++ kfree(link); ++ return(resolved); ++ ++ out_free: ++ kfree(name); ++ out: ++ return(ERR_PTR(n)); ++} ++ ++static int read_inode(struct inode *ino) ++{ ++ char *name; ++ int err = 0; ++ ++ /* Unfortunately, we are called from iget() when we don't have a dentry ++ * allocated yet. 
++ */ ++ if(list_empty(&ino->i_dentry)) ++ goto out; ++ ++ err = -ENOMEM; ++ name = inode_name(ino, 0); ++ if(name == NULL) ++ goto out; ++ ++ if(file_type(name, NULL) == OS_TYPE_SYMLINK){ ++ name = follow_link(name); ++ if(IS_ERR(name)){ ++ err = PTR_ERR(name); ++ goto out; ++ } ++ } ++ ++ err = read_name(ino, name); ++ kfree(name); ++ out: ++ return(err); ++} ++ ++int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) ++{ ++ /* do_statfs uses struct statfs64 internally, but the linux kernel ++ * struct statfs still has 32-bit versions for most of these fields, ++ * so we convert them here ++ */ ++ int err; ++ long long f_blocks; ++ long long f_bfree; ++ long long f_bavail; ++ long long f_files; ++ long long f_ffree; ++ ++ err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename, ++ &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, ++ &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), ++ &sf->f_namelen, sf->f_spare); ++ if(err) return(err); ++ sf->f_blocks = f_blocks; ++ sf->f_bfree = f_bfree; ++ sf->f_bavail = f_bavail; ++ sf->f_files = f_files; ++ sf->f_ffree = f_ffree; ++ sf->f_type = HOSTFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct inode *hostfs_alloc_inode(struct super_block *sb) ++{ ++ struct hostfs_inode_info *hi; ++ ++ hi = kmalloc(sizeof(*hi), GFP_KERNEL); ++ if(hi == NULL) ++ return(NULL); ++ ++ *hi = ((struct hostfs_inode_info) { .host_filename = NULL, ++ .fd = -1, ++ .mode = 0 }); ++ inode_init_once(&hi->vfs_inode); ++ return(&hi->vfs_inode); ++} ++ ++static void hostfs_destroy_inode(struct inode *inode) ++{ ++ if(HOSTFS_I(inode)->host_filename) ++ kfree(HOSTFS_I(inode)->host_filename); ++ ++ if(HOSTFS_I(inode)->fd != -1) ++ close_file(&HOSTFS_I(inode)->fd); ++ ++ kfree(HOSTFS_I(inode)); ++} ++ ++static void hostfs_read_inode(struct inode *inode) ++{ ++ read_inode(inode); ++} ++ ++static struct super_operations hostfs_sbops = { ++ .alloc_inode = hostfs_alloc_inode, ++ .destroy_inode = hostfs_destroy_inode, ++ .read_inode = 
hostfs_read_inode, ++ .statfs = hostfs_statfs, ++}; ++ ++int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) ++{ ++ void *dir; ++ char *name; ++ unsigned long long next, ino; ++ int error, len; ++ ++ name = dentry_name(file->f_dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ dir = open_dir(name, &error); ++ kfree(name); ++ if(dir == NULL) return(-error); ++ next = file->f_pos; ++ while((name = read_dir(dir, &next, &ino, &len)) != NULL){ ++ error = (*filldir)(ent, name, len, file->f_pos, ++ ino, DT_UNKNOWN); ++ if(error) break; ++ file->f_pos = next; ++ } ++ close_dir(dir); ++ return(0); ++} ++ ++int hostfs_file_open(struct inode *ino, struct file *file) ++{ ++ char *name; ++ int mode = 0, r = 0, w = 0, fd; ++ ++ mode = file->f_mode & (FMODE_READ | FMODE_WRITE); ++ if((mode & HOSTFS_I(ino)->mode) == mode) ++ return(0); ++ ++ /* The file may already have been opened, but with the wrong access, ++ * so this resets things and reopens the file with the new access. ++ */ ++ if(HOSTFS_I(ino)->fd != -1){ ++ close_file(&HOSTFS_I(ino)->fd); ++ HOSTFS_I(ino)->fd = -1; ++ } ++ ++ HOSTFS_I(ino)->mode |= mode; ++ if(HOSTFS_I(ino)->mode & FMODE_READ) ++ r = 1; ++ if(HOSTFS_I(ino)->mode & FMODE_WRITE) ++ w = 1; ++ if(w) ++ r = 1; ++ ++ name = dentry_name(file->f_dentry, 0); ++ if(name == NULL) ++ return(-ENOMEM); ++ ++ fd = open_file(name, r, w, append); ++ kfree(name); ++ if(fd < 0) return(fd); ++ FILE_HOSTFS_I(file)->fd = fd; ++ ++ return(0); ++} ++ ++int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) ++{ ++ return(0); ++} ++ ++static struct file_operations hostfs_file_fops = { ++ .llseek = generic_file_llseek, ++ .read = generic_file_read, ++ .write = generic_file_write, ++ .mmap = generic_file_mmap, ++ .open = hostfs_file_open, ++ .release = NULL, ++ .fsync = hostfs_fsync, ++}; ++ ++static struct file_operations hostfs_dir_fops = { ++ .readdir = hostfs_readdir, ++ .read = generic_read_dir, ++}; ++ ++int hostfs_writepage(struct page 
*page, struct writeback_control *wbc) ++{ ++ struct address_space *mapping = page->mapping; ++ struct inode *inode = mapping->host; ++ char *buffer; ++ unsigned long long base; ++ int count = PAGE_CACHE_SIZE; ++ int end_index = inode->i_size >> PAGE_CACHE_SHIFT; ++ int err; ++ ++ if (page->index >= end_index) ++ count = inode->i_size & (PAGE_CACHE_SIZE-1); ++ ++ buffer = kmap(page); ++ base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT; ++ ++ err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); ++ if(err != count){ ++ ClearPageUptodate(page); ++ goto out; ++ } ++ ++ if (base > inode->i_size) ++ inode->i_size = base; ++ ++ if (PageError(page)) ++ ClearPageError(page); ++ err = 0; ++ ++ out: ++ kunmap(page); ++ ++ unlock_page(page); ++ return err; ++} ++ ++int hostfs_readpage(struct file *file, struct page *page) ++{ ++ char *buffer; ++ long long start; ++ int err = 0; ++ ++ start = (long long) page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, ++ PAGE_CACHE_SIZE); ++ if(err < 0) goto out; ++ ++ memset(&buffer[err], 0, PAGE_CACHE_SIZE - err); ++ ++ flush_dcache_page(page); ++ SetPageUptodate(page); ++ if (PageError(page)) ClearPageError(page); ++ err = 0; ++ out: ++ kunmap(page); ++ unlock_page(page); ++ return(err); ++} ++ ++int hostfs_prepare_write(struct file *file, struct page *page, ++ unsigned int from, unsigned int to) ++{ ++ char *buffer; ++ long long start, tmp; ++ int err; ++ ++ start = (long long) page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ if(from != 0){ ++ tmp = start; ++ err = read_file(FILE_HOSTFS_I(file)->fd, &tmp, buffer, ++ from); ++ if(err < 0) goto out; ++ } ++ if(to != PAGE_CACHE_SIZE){ ++ start += to; ++ err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer + to, ++ PAGE_CACHE_SIZE - to); ++ if(err < 0) goto out; ++ } ++ err = 0; ++ out: ++ kunmap(page); ++ return(err); ++} ++ ++int hostfs_commit_write(struct file *file, struct page *page, 
unsigned from, ++ unsigned to) ++{ ++ struct address_space *mapping = page->mapping; ++ struct inode *inode = mapping->host; ++ char *buffer; ++ long long start; ++ int err = 0; ++ ++ start = (long long) (page->index << PAGE_CACHE_SHIFT) + from; ++ buffer = kmap(page); ++ err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from, ++ to - from); ++ if(err > 0) err = 0; ++ if(!err && (start > inode->i_size)) ++ inode->i_size = start; ++ ++ kunmap(page); ++ return(err); ++} ++ ++static struct address_space_operations hostfs_aops = { ++ .writepage = hostfs_writepage, ++ .readpage = hostfs_readpage, ++/* .set_page_dirty = __set_page_dirty_nobuffers, */ ++ .prepare_write = hostfs_prepare_write, ++ .commit_write = hostfs_commit_write ++}; ++ ++static int init_inode(struct inode *inode, struct dentry *dentry) ++{ ++ char *name; ++ int type, err = -ENOMEM, rdev; ++ ++ if(dentry){ ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out; ++ type = file_type(name, &rdev); ++ kfree(name); ++ } ++ else type = OS_TYPE_DIR; ++ ++ err = 0; ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_op = &page_symlink_inode_operations; ++ else if(type == OS_TYPE_DIR) ++ inode->i_op = &hostfs_dir_iops; ++ else inode->i_op = &hostfs_iops; ++ ++ if(type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops; ++ else inode->i_fop = &hostfs_file_fops; ++ ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_mapping->a_ops = &hostfs_link_aops; ++ else inode->i_mapping->a_ops = &hostfs_aops; ++ ++ switch (type) { ++ case OS_TYPE_CHARDEV: ++ init_special_inode(inode, S_IFCHR, rdev); ++ break; ++ case OS_TYPE_BLOCKDEV: ++ init_special_inode(inode, S_IFBLK, rdev); ++ break; ++ case OS_TYPE_FIFO: ++ init_special_inode(inode, S_IFIFO, 0); ++ break; ++ case OS_TYPE_SOCK: ++ init_special_inode(inode, S_IFSOCK, 0); ++ break; ++ } ++ out: ++ return(err); ++} ++ ++int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ char *name; ++ int error, fd; 
++ ++ error = -ENOMEM; ++ inode = iget(dir->i_sb, 0); ++ if(inode == NULL) goto out; ++ ++ error = init_inode(inode, dentry); ++ if(error) ++ goto out_put; ++ ++ error = -ENOMEM; ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out_put; ++ ++ fd = file_create(name, ++ mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, ++ mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, ++ mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); ++ if(fd < 0) ++ error = fd; ++ else error = read_name(inode, name); ++ ++ kfree(name); ++ if(error) ++ goto out_put; ++ ++ HOSTFS_I(inode)->fd = fd; ++ HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; ++ d_instantiate(dentry, inode); ++ return(0); ++ ++ out_free: ++ kfree(name); ++ out_put: ++ iput(inode); ++ out: ++ return(error); ++} ++ ++struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ char *name; ++ int err; ++ ++ err = -ENOMEM; ++ inode = iget(ino->i_sb, 0); ++ if(inode == NULL) ++ goto out; ++ ++ err = init_inode(inode, dentry); ++ if(err) ++ goto out_put; ++ ++ err = -ENOMEM; ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out_put; ++ ++ err = read_name(inode, name); ++ kfree(name); ++ if(err == -ENOENT){ ++ iput(inode); ++ inode = NULL; ++ } ++ else if(err) ++ goto out_put; ++ ++ d_add(dentry, inode); ++ dentry->d_op = &hostfs_dentry_ops; ++ return(NULL); ++ ++ out_put: ++ iput(inode); ++ out: ++ return(ERR_PTR(err)); ++} ++ ++static char *inode_dentry_name(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int len; ++ ++ file = inode_name(ino, dentry->d_name.len + 1); ++ if(file == NULL) return(NULL); ++ strcat(file, "/"); ++ len = strlen(file); ++ strncat(file, dentry->d_name.name, dentry->d_name.len); ++ file[len + dentry->d_name.len] = '\0'; ++ return(file); ++} ++ ++int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) ++{ ++ char *from_name, *to_name; ++ int err; ++ ++ if((from_name = 
inode_dentry_name(ino, from)) == NULL) ++ return(-ENOMEM); ++ to_name = dentry_name(to, 0); ++ if(to_name == NULL){ ++ kfree(from_name); ++ return(-ENOMEM); ++ } ++ err = link_file(to_name, from_name); ++ kfree(from_name); ++ kfree(to_name); ++ return(err); ++} ++ ++int hostfs_unlink(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ if(append) ++ return(-EPERM); ++ ++ err = unlink_file(file); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = make_symlink(file, to); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = do_mkdir(file, mode); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_rmdir(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = do_rmdir(file); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) ++{ ++ struct inode *inode; ++ char *name; ++ int err = -ENOMEM; ++ ++ inode = iget(dir->i_sb, 0); ++ if(inode == NULL) ++ goto out; ++ ++ err = init_inode(inode, dentry); ++ if(err) ++ goto out_put; ++ ++ err = -ENOMEM; ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out_put; ++ ++ init_special_inode(inode, mode, dev); ++ err = do_mknod(name, mode, dev); ++ if(err) ++ goto out_free; ++ ++ err = read_name(inode, name); ++ kfree(name); ++ if(err) ++ goto out_put; ++ ++ d_instantiate(dentry, inode); ++ return(0); ++ ++ out_free: ++ kfree(name); ++ out_put: ++ iput(inode); ++ out: ++ return(err); ++} ++ ++int 
hostfs_rename(struct inode *from_ino, struct dentry *from, ++ struct inode *to_ino, struct dentry *to) ++{ ++ char *from_name, *to_name; ++ int err; ++ ++ if((from_name = inode_dentry_name(from_ino, from)) == NULL) ++ return(-ENOMEM); ++ if((to_name = inode_dentry_name(to_ino, to)) == NULL){ ++ kfree(from_name); ++ return(-ENOMEM); ++ } ++ err = rename_file(from_name, to_name); ++ kfree(from_name); ++ kfree(to_name); ++ return(err); ++} ++ ++void hostfs_truncate(struct inode *ino) ++{ ++ not_implemented(); ++} ++ ++int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) ++{ ++ char *name; ++ int r = 0, w = 0, x = 0, err; ++ ++ if(desired & MAY_READ) r = 1; ++ if(desired & MAY_WRITE) w = 1; ++ if(desired & MAY_EXEC) x = 1; ++ name = inode_name(ino, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = access_file(name, r, w, x); ++ kfree(name); ++ if(!err) err = vfs_permission(ino, desired); ++ return(err); ++} ++ ++int hostfs_setattr(struct dentry *dentry, struct iattr *attr) ++{ ++ struct hostfs_iattr attrs; ++ char *name; ++ int err; ++ ++ if(append) ++ attr->ia_valid &= ~ATTR_SIZE; ++ ++ attrs.ia_valid = 0; ++ if(attr->ia_valid & ATTR_MODE){ ++ attrs.ia_valid |= HOSTFS_ATTR_MODE; ++ attrs.ia_mode = attr->ia_mode; ++ } ++ if(attr->ia_valid & ATTR_UID){ ++ if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && ++ (attr->ia_uid == 0)) ++ attr->ia_uid = getuid(); ++ attrs.ia_valid |= HOSTFS_ATTR_UID; ++ attrs.ia_uid = attr->ia_uid; ++ } ++ if(attr->ia_valid & ATTR_GID){ ++ if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && ++ (attr->ia_gid == 0)) ++ attr->ia_gid = getuid(); ++ attrs.ia_valid |= HOSTFS_ATTR_GID; ++ attrs.ia_gid = attr->ia_gid; ++ } ++ if(attr->ia_valid & ATTR_SIZE){ ++ attrs.ia_valid |= HOSTFS_ATTR_SIZE; ++ attrs.ia_size = attr->ia_size; ++ } ++ if(attr->ia_valid & ATTR_ATIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME; ++ attrs.ia_atime = attr->ia_atime; ++ } ++ if(attr->ia_valid & ATTR_MTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME; ++ 
attrs.ia_mtime = attr->ia_mtime; ++ } ++ if(attr->ia_valid & ATTR_CTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_CTIME; ++ attrs.ia_ctime = attr->ia_ctime; ++ } ++ if(attr->ia_valid & ATTR_ATIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; ++ } ++ if(attr->ia_valid & ATTR_MTIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; ++ } ++ name = dentry_name(dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = set_attr(name, &attrs); ++ kfree(name); ++ if(err) ++ return(err); ++ ++ return(inode_setattr(dentry->d_inode, attr)); ++} ++ ++int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ generic_fillattr(dentry->d_inode, stat); ++ return(0); ++} ++ ++static struct inode_operations hostfs_iops = { ++ .create = hostfs_create, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++static struct inode_operations hostfs_dir_iops = { ++ .create = hostfs_create, ++ .lookup = hostfs_lookup, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++int hostfs_link_readpage(struct file *file, struct page *page) ++{ ++ char *buffer, *name; ++ long long start; ++ int err; ++ ++ start = page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ name = inode_name(page->mapping->host, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = do_readlink(name, buffer, PAGE_CACHE_SIZE); ++ kfree(name); ++ if(err == PAGE_CACHE_SIZE) ++ err = -E2BIG; ++ else if(err > 0){ ++ flush_dcache_page(page); ++ 
SetPageUptodate(page); ++ if (PageError(page)) ClearPageError(page); ++ err = 0; ++ } ++ kunmap(page); ++ unlock_page(page); ++ return(err); ++} ++ ++static struct address_space_operations hostfs_link_aops = { ++ .readpage = hostfs_link_readpage, ++}; ++ ++static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) ++{ ++ struct inode *root_inode; ++ char *name, *data = d; ++ int err; ++ ++ sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HOSTFS_SUPER_MAGIC; ++ sb->s_op = &hostfs_sbops; ++ ++ if((data == NULL) || (*data == '\0')) ++ data = root_ino; ++ ++ err = -ENOMEM; ++ name = kmalloc(strlen(data) + 1, GFP_KERNEL); ++ if(name == NULL) ++ goto out; ++ ++ strcpy(name, data); ++ ++ root_inode = iget(sb, 0); ++ if(root_inode == NULL) ++ goto out_free; ++ ++ err = init_inode(root_inode, NULL); ++ if(err) ++ goto out_put; ++ ++ HOSTFS_I(root_inode)->host_filename = name; ++ ++ err = -ENOMEM; ++ sb->s_root = d_alloc_root(root_inode); ++ if(sb->s_root == NULL) ++ goto out_put; ++ ++ err = read_inode(root_inode); ++ if(err) ++ goto out_put; ++ ++ return(0); ++ ++ out_put: ++ iput(root_inode); ++ out_free: ++ kfree(name); ++ out: ++ return(err); ++} ++ ++static struct super_block *hostfs_read_sb(struct file_system_type *type, ++ int flags, const char *dev_name, ++ void *data) ++{ ++ return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common)); ++} ++ ++static struct file_system_type hostfs_type = { ++ .owner = THIS_MODULE, ++ .name = "hostfs", ++ .get_sb = hostfs_read_sb, ++ .kill_sb = kill_anon_super, ++ .fs_flags = 0, ++}; ++ ++static int __init init_hostfs(void) ++{ ++ return(register_filesystem(&hostfs_type)); ++} ++ ++static void __exit exit_hostfs(void) ++{ ++ unregister_filesystem(&hostfs_type); ++} ++ ++module_init(init_hostfs) ++module_exit(exit_hostfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c +--- a/fs/hostfs/hostfs_user.c Wed Dec 31 19:00:00 1969 ++++ b/fs/hostfs/hostfs_user.c Fri Aug 15 15:10:43 2003 +@@ -0,0 +1,361 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "hostfs.h" ++#include "kern_util.h" ++#include "user.h" ++ ++int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, ++ int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, struct timespec *atime_out, ++ struct timespec *mtime_out, struct timespec *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) < 0) ++ return(-errno); ++ ++ /* See the Makefile for why STAT64_INO_FIELD is passed in ++ * by the build ++ */ ++ if(inode_out != NULL) *inode_out = buf.STAT64_INO_FIELD; ++ if(mode_out != NULL) *mode_out = buf.st_mode; ++ if(nlink_out != NULL) *nlink_out = buf.st_nlink; ++ if(uid_out != NULL) *uid_out = buf.st_uid; ++ if(gid_out != NULL) *gid_out = buf.st_gid; ++ if(size_out != NULL) *size_out = buf.st_size; ++ if(atime_out != NULL) { ++ atime_out->tv_sec = buf.st_atime; ++ atime_out->tv_nsec = 0; ++ } ++ if(mtime_out != NULL) { ++ mtime_out->tv_sec = buf.st_mtime; ++ mtime_out->tv_nsec = 0; ++ } ++ if(ctime_out != NULL) { ++ ctime_out->tv_sec = buf.st_ctime; ++ ctime_out->tv_nsec = 0; ++ } ++ if(blksize_out != NULL) *blksize_out = buf.st_blksize; ++ if(blocks_out != NULL) *blocks_out = buf.st_blocks; ++ return(0); ++} ++ ++int file_type(const 
char *path, int *rdev) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) < 0) ++ return(-errno); ++ if(rdev != NULL) ++ *rdev = buf.st_rdev; ++ ++ if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR); ++ else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK); ++ else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV); ++ else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV); ++ else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO); ++ else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK); ++ else return(OS_TYPE_FILE); ++} ++ ++int access_file(char *path, int r, int w, int x) ++{ ++ int mode = 0; ++ ++ if(r) mode = R_OK; ++ if(w) mode |= W_OK; ++ if(x) mode |= X_OK; ++ if(access(path, mode) != 0) return(-errno); ++ else return(0); ++} ++ ++int open_file(char *path, int r, int w, int append) ++{ ++ int mode = 0, fd; ++ ++ if(r && !w) ++ mode = O_RDONLY; ++ else if(!r && w) ++ mode = O_WRONLY; ++ else if(r && w) ++ mode = O_RDWR; ++ else panic("Impossible mode in open_file"); ++ ++ if(append) ++ mode |= O_APPEND; ++ fd = open64(path, mode); ++ if(fd < 0) return(-errno); ++ else return(fd); ++} ++ ++void *open_dir(char *path, int *err_out) ++{ ++ DIR *dir; ++ ++ dir = opendir(path); ++ *err_out = errno; ++ if(dir == NULL) return(NULL); ++ return(dir); ++} ++ ++char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out) ++{ ++ DIR *dir = stream; ++ struct dirent *ent; ++ ++ seekdir(dir, *pos); ++ ent = readdir(dir); ++ if(ent == NULL) return(NULL); ++ *len_out = strlen(ent->d_name); ++ *ino_out = ent->d_ino; ++ *pos = telldir(dir); ++ return(ent->d_name); ++} ++ ++int read_file(int fd, unsigned long long *offset, char *buf, int len) ++{ ++ int n; ++ ++ n = pread64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ return(n); ++} ++ ++int write_file(int fd, unsigned long long *offset, const char *buf, int len) ++{ ++ int n; ++ ++ n = pwrite64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ 
return(n);
++}
++
++/*
++ * Reposition fd with lseek64().
++ *
++ * Returns 0 on success or -errno on failure.  The result is kept in a
++ * long long: lseek64() returns a 64-bit offset, and storing it in an
++ * int truncated positions of 2 GiB and above, which could then look
++ * negative and be reported as a failure.
++ */
++int lseek_file(int fd, long long offset, int whence)
++{
++	long long ret;
++
++	ret = lseek64(fd, offset, whence);
++	if(ret < 0) return(-errno);
++	return(0);
++}
++
++/* Close the descriptor stored in the int that stream points to. */
++void close_file(void *stream)
++{
++	close(*((int *) stream));
++}
++
++/* Close a directory stream with closedir(). */
++void close_dir(void *stream)
++{
++	closedir(stream);
++}
++
++/*
++ * Create (or open) name read-write.  The nine flags select the user,
++ * group and other read/write/execute permission bits of the new file.
++ *
++ * Returns the open descriptor, or -errno on failure.
++ */
++int file_create(char *name, int ur, int uw, int ux, int gr,
++		int gw, int gx, int or, int ow, int ox)
++{
++	int mode, fd;
++
++	mode = 0;
++	mode |= ur ? S_IRUSR : 0;
++	mode |= uw ? S_IWUSR : 0;
++	mode |= ux ? S_IXUSR : 0;
++	mode |= gr ? S_IRGRP : 0;
++	mode |= gw ? S_IWGRP : 0;
++	mode |= gx ? S_IXGRP : 0;
++	mode |= or ? S_IROTH : 0;
++	mode |= ow ? S_IWOTH : 0;
++	mode |= ox ? S_IXOTH : 0;
++	fd = open64(name, O_CREAT | O_RDWR, mode);
++	if(fd < 0)
++		return(-errno);
++	return(fd);
++}
++
++int set_attr(const char *file, struct hostfs_iattr *attrs)
++{
++	struct utimbuf buf;
++	int err, ma;
++
++	if(attrs->ia_valid & HOSTFS_ATTR_MODE){
++		if(chmod(file, attrs->ia_mode) != 0) return(-errno);
++	}
++	if(attrs->ia_valid & HOSTFS_ATTR_UID){
++		if(chown(file, attrs->ia_uid, -1)) return(-errno);
++	}
++	if(attrs->ia_valid & HOSTFS_ATTR_GID){
++		if(chown(file, -1, attrs->ia_gid)) return(-errno);
++	}
++	if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
++		if(truncate(file, attrs->ia_size)) return(-errno);
++	}
++	ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
++	if((attrs->ia_valid & ma) == ma){
++		buf.actime = attrs->ia_atime.tv_sec;
++		buf.modtime = attrs->ia_mtime.tv_sec;
++		if(utime(file, &buf) != 0) return(-errno);
++	}
++	else {
++		struct timespec ts;
++
++		if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
++			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
++					NULL, NULL, &ts, NULL, NULL, NULL);
++			if(err != 0)
++				return(err);
++			buf.actime = attrs->ia_atime.tv_sec;
++			buf.modtime = ts.tv_sec;
++			if(utime(file, &buf) != 0)
++				return(-errno);
++		}
++		if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
++			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
++					
NULL, &ts, NULL, NULL, NULL, NULL); ++ if(err != 0) ++ return(err); ++ buf.actime = ts.tv_sec; ++ buf.modtime = attrs->ia_mtime.tv_sec; ++ if(utime(file, &buf) != 0) ++ return(-errno); ++ } ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; ++ if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, ++ &attrs->ia_atime, &attrs->ia_mtime, NULL, ++ NULL, NULL); ++ if(err != 0) return(err); ++ } ++ return(0); ++} ++ ++int make_symlink(const char *from, const char *to) ++{ ++ int err; ++ ++ err = symlink(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int unlink_file(const char *file) ++{ ++ int err; ++ ++ err = unlink(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mkdir(const char *file, int mode) ++{ ++ int err; ++ ++ err = mkdir(file, mode); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_rmdir(const char *file) ++{ ++ int err; ++ ++ err = rmdir(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mknod(const char *file, int mode, int dev) ++{ ++ int err; ++ ++ err = mknod(file, mode, dev); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int link_file(const char *to, const char *from) ++{ ++ int err; ++ ++ err = link(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_readlink(char *file, char *buf, int size) ++{ ++ int n; ++ ++ n = readlink(file, buf, size); ++ if(n < 0) ++ return(-errno); ++ if(n < size) ++ buf[n] = '\0'; ++ return(n); ++} ++ ++int rename_file(char *from, char *to) ++{ ++ int err; ++ ++ err = rename(from, to); ++ if(err < 0) return(-errno); ++ return(0); ++} ++ ++int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out) ++{ ++ struct statfs64 buf; ++ int err; ++ ++ err = statfs64(root, &buf); ++ if(err < 0) return(-errno); ++ *bsize_out = 
buf.f_bsize; ++ *blocks_out = buf.f_blocks; ++ *bfree_out = buf.f_bfree; ++ *bavail_out = buf.f_bavail; ++ *files_out = buf.f_files; ++ *ffree_out = buf.f_ffree; ++ memcpy(fsid_out, &buf.f_fsid, ++ sizeof(buf.f_fsid) > fsid_size ? fsid_size : ++ sizeof(buf.f_fsid)); ++ *namelen_out = buf.f_namelen; ++ spare_out[0] = buf.f_spare[0]; ++ spare_out[1] = buf.f_spare[1]; ++ spare_out[2] = buf.f_spare[2]; ++ spare_out[3] = buf.f_spare[3]; ++ spare_out[4] = buf.f_spare[4]; ++ spare_out[5] = buf.f_spare[5]; ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/fs/hppfs/Makefile b/fs/hppfs/Makefile +--- a/fs/hppfs/Makefile Wed Dec 31 19:00:00 1969 ++++ b/fs/hppfs/Makefile Fri Aug 15 15:12:31 2003 +@@ -0,0 +1,19 @@ ++# ++# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++hppfs-objs := hppfs_kern.o ++ ++obj-y = ++obj-$(CONFIG_HPPFS) += hppfs.o ++ ++clean: ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: clean +diff -Naur a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c +--- a/fs/hppfs/hppfs_kern.c Wed Dec 31 19:00:00 1969 ++++ b/fs/hppfs/hppfs_kern.c Fri Aug 15 15:11:52 2003 +@@ -0,0 +1,811 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "os.h" ++ ++static int init_inode(struct inode *inode, struct dentry *dentry); ++ ++struct hppfs_data { ++ struct list_head list; ++ char contents[PAGE_SIZE - sizeof(struct list_head)]; ++}; ++ ++struct hppfs_private { ++ struct file proc_file; ++ int 
host_fd; ++ loff_t len; ++ struct hppfs_data *contents; ++}; ++ ++struct hppfs_inode_info { ++ struct dentry *proc_dentry; ++ struct inode vfs_inode; ++}; ++ ++static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) ++{ ++ return(list_entry(inode, struct hppfs_inode_info, vfs_inode)); ++} ++ ++#define HPPFS_SUPER_MAGIC 0xb00000ee ++ ++static struct super_operations hppfs_sbops; ++ ++static int is_pid(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ int i; ++ ++ sb = dentry->d_sb; ++ if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root)) ++ return(0); ++ ++ for(i = 0; i < dentry->d_name.len; i++){ ++ if(!isdigit(dentry->d_name.name[i])) ++ return(0); ++ } ++ return(1); ++} ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ const char *seg_name; ++ int len, seg_len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)) ++ len += strlen("pid") + 1; ++ else len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = "proc"; ++ len += strlen(root); ++ name = kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)){ ++ seg_name = "pid"; ++ seg_len = strlen("pid"); ++ } ++ else { ++ seg_name = parent->d_name.name; ++ seg_len = parent->d_name.len; ++ } ++ ++ len -= seg_len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], seg_name, seg_len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++struct dentry_operations hppfs_dentry_ops = { ++}; ++ ++static int file_removed(struct dentry *dentry, const char *file) ++{ ++ char *host_file; ++ int extra, fd; ++ ++ extra = 0; ++ if(file != NULL) extra += strlen(file) + 1; ++ ++ host_file = dentry_name(dentry, extra + strlen("/remove")); ++ if(host_file == NULL){ ++ printk("file_removed : allocation 
failed\n"); ++ return(-ENOMEM); ++ } ++ ++ if(file != NULL){ ++ strcat(host_file, "/"); ++ strcat(host_file, file); ++ } ++ strcat(host_file, "/remove"); ++ ++ fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); ++ kfree(host_file); ++ if(fd > 0){ ++ os_close_file(fd); ++ return(1); ++ } ++ return(0); ++} ++ ++static void hppfs_read_inode(struct inode *ino) ++{ ++ struct inode *proc_ino; ++ ++ if(HPPFS_I(ino)->proc_dentry == NULL) ++ return; ++ ++ proc_ino = HPPFS_I(ino)->proc_dentry->d_inode; ++ ino->i_uid = proc_ino->i_uid; ++ ino->i_gid = proc_ino->i_gid; ++ ino->i_atime = proc_ino->i_atime; ++ ino->i_mtime = proc_ino->i_mtime; ++ ino->i_ctime = proc_ino->i_ctime; ++ ino->i_ino = proc_ino->i_ino; ++ ino->i_mode = proc_ino->i_mode; ++ ino->i_nlink = proc_ino->i_nlink; ++ ino->i_size = proc_ino->i_size; ++ ino->i_blksize = proc_ino->i_blksize; ++ ino->i_blocks = proc_ino->i_blocks; ++} ++ ++static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *proc_dentry, *new, *parent; ++ struct inode *inode; ++ int err, deleted; ++ ++ deleted = file_removed(dentry, NULL); ++ if(deleted < 0) ++ return(ERR_PTR(deleted)); ++ else if(deleted) ++ return(ERR_PTR(-ENOENT)); ++ ++ err = -ENOMEM; ++ parent = HPPFS_I(ino)->proc_dentry; ++ down(&parent->d_inode->i_sem); ++ proc_dentry = d_lookup(parent, &dentry->d_name); ++ if(proc_dentry == NULL){ ++ proc_dentry = d_alloc(parent, &dentry->d_name); ++ if(proc_dentry == NULL){ ++ up(&parent->d_inode->i_sem); ++ goto out; ++ } ++ new = (*parent->d_inode->i_op->lookup)(parent->d_inode, ++ proc_dentry, NULL); ++ if(new){ ++ dput(proc_dentry); ++ proc_dentry = new; ++ } ++ } ++ up(&parent->d_inode->i_sem); ++ ++ if(IS_ERR(proc_dentry)) ++ return(proc_dentry); ++ ++ inode = iget(ino->i_sb, 0); ++ if(inode == NULL) ++ goto out_dput; ++ ++ err = init_inode(inode, proc_dentry); ++ if(err) ++ goto out_put; ++ ++ hppfs_read_inode(inode); ++ ++ d_add(dentry, inode); ++ 
dentry->d_op = &hppfs_dentry_ops;
++	return(NULL);
++
++ out_put:
++	iput(inode);
++ out_dput:
++	dput(proc_dentry);
++ out:
++	return(ERR_PTR(err));
++}
++
++static struct inode_operations hppfs_file_iops = {
++};
++
++/*
++ * Call the read method of the file's inode, using and updating
++ * file->f_pos.  When is_user is zero, buf is a kernel buffer and the
++ * address limit is switched to KERNEL_DS around the call.
++ *
++ * Returns what the read method returns; *ppos, when non-NULL, receives
++ * the resulting file position.
++ */
++static ssize_t read_proc(struct file *file, char *buf, ssize_t count,
++			 loff_t *ppos, int is_user)
++{
++	ssize_t (*read)(struct file *, char *, size_t, loff_t *);
++	ssize_t n;
++
++	read = file->f_dentry->d_inode->i_fop->read;
++
++	if(!is_user)
++		set_fs(KERNEL_DS);
++
++	n = (*read)(file, buf, count, &file->f_pos);
++
++	if(!is_user)
++		set_fs(USER_DS);
++
++	if(ppos) *ppos = file->f_pos;
++	return(n);
++}
++
++/*
++ * Copy up to count bytes from host descriptor fd to the user buffer
++ * buf, one page-sized bounce buffer at a time.
++ *
++ * Returns the number of bytes copied, -ENOMEM if the bounce buffer
++ * cannot be allocated, -EFAULT if the copy to user space fails, or the
++ * negative value returned by os_read_file().
++ */
++static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
++{
++	ssize_t n;
++	int cur, err;
++	char *new_buf;
++
++	n = -ENOMEM;
++	new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
++	if(new_buf == NULL){
++		printk("hppfs_read_file : kmalloc failed\n");
++		goto out;
++	}
++	n = 0;
++	while(count > 0){
++		cur = min_t(ssize_t, count, PAGE_SIZE);
++		err = os_read_file(fd, new_buf, cur);
++		if(err < 0){
++			/* Report the failure code, not the bytes left. */
++			printk("hppfs_read : read failed, errno = %d\n",
++			       err);
++			n = err;
++			goto out_free;
++		}
++		else if(err == 0)
++			break;
++
++		if(copy_to_user(buf, new_buf, err)){
++			n = -EFAULT;
++			goto out_free;
++		}
++		/*
++		 * Advance the destination; without this every chunk
++		 * landed on top of the previous one.
++		 */
++		buf += err;
++		n += err;
++		count -= err;
++	}
++ out_free:
++	kfree(new_buf);
++ out:
++	return(n);
++}
++
++static ssize_t hppfs_read(struct file *file, char *buf, size_t count,
++			  loff_t *ppos)
++{
++	struct hppfs_private *hppfs = file->private_data;
++	struct hppfs_data *data;
++	loff_t off;
++	int err;
++
++	if(hppfs->contents != NULL){
++		size_t avail;
++
++		if(*ppos >= hppfs->len) return(0);
++
++		/* Walk to the chunk holding *ppos; off becomes chunk-relative. */
++		data = hppfs->contents;
++		off = *ppos;
++		while(off >= sizeof(data->contents)){
++			data = list_entry(data->list.next, struct hppfs_data,
++					  list);
++			off -= sizeof(data->contents);
++		}
++
++		/*
++		 * Clamp to the end of the data using the file position
++		 * (off is only chunk-relative here) and to the end of this
++		 * chunk, so the copy never runs past data->contents.  The
++		 * caller gets a short read and continues from *ppos.
++		 */
++		if(count > hppfs->len - *ppos)
++			count = hppfs->len - *ppos;
++		avail = sizeof(data->contents) - off;
++		if(count > avail)
++			count = avail;
++		if(copy_to_user(buf, &data->contents[off], count))
++			return(-EFAULT);
++		*ppos += count;
++	}
++	else if(hppfs->host_fd != -1){
++		err =
os_seek_file(hppfs->host_fd, *ppos); ++ if(err){ ++ printk("hppfs_read : seek failed, errno = %d\n", err); ++ return(err); ++ } ++ count = hppfs_read_file(hppfs->host_fd, buf, count); ++ if(count > 0) ++ *ppos += count; ++ } ++ else count = read_proc(&hppfs->proc_file, buf, count, ppos, 1); ++ ++ return(count); ++} ++ ++static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, ++ loff_t *ppos) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ ssize_t (*write)(struct file *, const char *, size_t, loff_t *); ++ int err; ++ ++ write = proc_file->f_dentry->d_inode->i_fop->write; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*write)(proc_file, buf, len, &proc_file->f_pos); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int open_host_sock(char *host_file, int *filter_out) ++{ ++ char *end; ++ int fd; ++ ++ end = &host_file[strlen(host_file)]; ++ strcpy(end, "/rw"); ++ *filter_out = 1; ++ fd = os_connect_socket(host_file); ++ if(fd > 0) ++ return(fd); ++ ++ strcpy(end, "/r"); ++ *filter_out = 0; ++ fd = os_connect_socket(host_file); ++ return(fd); ++} ++ ++static void free_contents(struct hppfs_data *head) ++{ ++ struct hppfs_data *data; ++ struct list_head *ele, *next; ++ ++ if(head == NULL) return; ++ ++ list_for_each_safe(ele, next, &head->list){ ++ data = list_entry(ele, struct hppfs_data, list); ++ kfree(data); ++ } ++ kfree(head); ++} ++ ++static struct hppfs_data *hppfs_get_data(int fd, int filter, ++ struct file *proc_file, ++ struct file *hppfs_file, ++ loff_t *size_out) ++{ ++ struct hppfs_data *data, *new, *head; ++ int n, err; ++ ++ err = -ENOMEM; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(data == NULL){ ++ printk("hppfs_get_data : head allocation failed\n"); ++ goto failed; ++ } ++ ++ INIT_LIST_HEAD(&data->list); ++ ++ head = data; ++ *size_out = 0; ++ ++ if(filter){ ++ while((n = read_proc(proc_file, data->contents, ++ sizeof(data->contents), NULL, 0)) > 
0)
++			os_write_file(fd, data->contents, n);
++		err = os_shutdown_socket(fd, 0, 1);
++		if(err){
++			printk("hppfs_get_data : failed to shut down "
++			       "socket\n");
++			goto failed_free;
++		}
++	}
++	while(1){
++		n = os_read_file(fd, data->contents, sizeof(data->contents));
++		if(n < 0){
++			err = n;
++			printk("hppfs_get_data : read failed, errno = %d\n",
++			       err);
++			goto failed_free;
++		}
++		else if(n == 0)
++			break;
++
++		*size_out += n;
++
++		if(n < sizeof(data->contents))
++			break;
++
++		new = kmalloc(sizeof(*data), GFP_KERNEL);
++		if(new == 0){
++			printk("hppfs_get_data : data allocation failed\n");
++			err = -ENOMEM;
++			goto failed_free;
++		}
++
++		INIT_LIST_HEAD(&new->list);
++		list_add(&new->list, &data->list);
++		data = new;
++	}
++	return(head);
++
++ failed_free:
++	free_contents(head);
++ failed:
++	return(ERR_PTR(err));
++}
++
++/*
++ * Allocate a hppfs_private with host_fd = -1, len = -1 and no cached
++ * contents.  Returns NULL if the allocation fails.
++ */
++static struct hppfs_private *hppfs_data(void)
++{
++	struct hppfs_private *data;
++
++	data = kmalloc(sizeof(*data), GFP_KERNEL);
++	if(data == NULL)
++		return(data);
++
++	*data = ((struct hppfs_private ) { .host_fd 	= -1,
++					    .len 	= -1,
++					    .contents 	= NULL } );
++	return(data);
++}
++
++/* Map an FMODE_READ/FMODE_WRITE combination to O_RDONLY/O_WRONLY/O_RDWR. */
++static int file_mode(int fmode)
++{
++	if(fmode == (FMODE_READ | FMODE_WRITE))
++		return(O_RDWR);
++	if(fmode == FMODE_READ)
++		return(O_RDONLY);
++	if(fmode == FMODE_WRITE)
++		return(O_WRONLY);
++	return(0);
++}
++
++/*
++ * Open a hppfs file.  A private file is always opened on the
++ * underlying proc dentry.  If the host path is a regular file it is
++ * opened and its descriptor kept in host_fd; if it is a directory, a
++ * socket inside it is connected and the data read from it is cached in
++ * data->contents.  Failure of either host-side step is not fatal:
++ * host_fd stays -1 and contents stays NULL.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the error
++ * from open_private_file().
++ */
++static int hppfs_open(struct inode *inode, struct file *file)
++{
++	struct hppfs_private *data;
++	struct dentry *proc_dentry;
++	char *host_file;
++	int err, fd, type, filter;
++
++	err = -ENOMEM;
++	data = hppfs_data();
++	if(data == NULL)
++		goto out;
++
++	host_file = dentry_name(file->f_dentry, strlen("/rw"));
++	if(host_file == NULL)
++		goto out_free2;
++
++	proc_dentry = HPPFS_I(inode)->proc_dentry;
++
++	/* XXX This isn't closed anywhere */
++	err = open_private_file(&data->proc_file, proc_dentry,
++				file_mode(file->f_mode));
++	if(err)
++		goto out_free1;
++
++	type = os_file_type(host_file);
++	if(type == OS_TYPE_FILE){
++		fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++		if(fd >= 0)
++			data->host_fd = fd;
++		else printk("hppfs_open : failed to open '%s', errno = %d\n",
++			    host_file, -fd);
++
++		data->contents = NULL;
++	}
++	else if(type == OS_TYPE_DIR){
++		fd = open_host_sock(host_file, &filter);
++		if(fd > 0){
++			data->contents = hppfs_get_data(fd, filter,
++							&data->proc_file,
++							file, &data->len);
++			if(!IS_ERR(data->contents))
++				data->host_fd = fd;
++			else {
++				/*
++				 * Do not leave the ERR_PTR in contents,
++				 * where hppfs_read() would treat it as
++				 * cached data, and do not leak the
++				 * socket.
++				 */
++				data->contents = NULL;
++				os_close_file(fd);
++			}
++		}
++		else printk("hppfs_open : failed to open a socket in "
++			    "'%s', errno = %d\n", host_file, -fd);
++	}
++	kfree(host_file);
++
++	file->private_data = data;
++	return(0);
++
++ out_free1:
++	kfree(host_file);
++ out_free2:
++	free_contents(data->contents);
++	kfree(data);
++ out:
++	return(err);
++}
++
++/*
++ * Open a hppfs directory: only the private file on the underlying proc
++ * dentry is opened.  Returns 0, -ENOMEM, or open_private_file()'s error.
++ */
++static int hppfs_dir_open(struct inode *inode, struct file *file)
++{
++	struct hppfs_private *data;
++	struct dentry *proc_dentry;
++	int err;
++
++	err = -ENOMEM;
++	data = hppfs_data();
++	if(data == NULL)
++		goto out;
++
++	proc_dentry = HPPFS_I(inode)->proc_dentry;
++	err = open_private_file(&data->proc_file, proc_dentry,
++				file_mode(file->f_mode));
++	if(err)
++		goto out_free;
++
++	file->private_data = data;
++	return(0);
++
++ out_free:
++	kfree(data);
++ out:
++	return(err);
++}
++
++/*
++ * Seek: forward to the proc file's llseek method when it has one, then
++ * apply default_llseek() to the hppfs file itself.  An error from the
++ * proc file's llseek is returned as is.
++ */
++static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
++{
++	struct hppfs_private *data = file->private_data;
++	struct file *proc_file = &data->proc_file;
++	loff_t (*llseek)(struct file *, loff_t, int);
++	loff_t ret;
++
++	llseek = proc_file->f_dentry->d_inode->i_fop->llseek;
++	if(llseek != NULL){
++		ret = (*llseek)(proc_file, off, where);
++		if(ret < 0)
++			return(ret);
++	}
++
++	return(default_llseek(file, off, where));
++}
++
++static struct file_operations hppfs_file_fops = {
++	.owner		= NULL,
++	.llseek		= hppfs_llseek,
++	.read		= hppfs_read,
++	.write		= hppfs_write,
++	.open		= hppfs_open,
++};
++
++/* Cookie handed to hppfs_filldir(): the caller's filldir and its argument. */
++struct hppfs_dirent {
++	void *vfs_dirent;
++	filldir_t filldir;
++	struct dentry *dentry;
++};
++
++static int
hppfs_filldir(void *d, const char *name, int size, ++ loff_t offset, ino_t inode, unsigned int type) ++{ ++ struct hppfs_dirent *dirent = d; ++ ++ if(file_removed(dirent->dentry, name)) ++ return(0); ++ ++ return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset, ++ inode, type)); ++} ++ ++static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ int (*readdir)(struct file *, void *, filldir_t); ++ struct hppfs_dirent dirent = ((struct hppfs_dirent) ++ { .vfs_dirent = ent, ++ .filldir = filldir, ++ .dentry = file->f_dentry } ); ++ int err; ++ ++ readdir = proc_file->f_dentry->d_inode->i_fop->readdir; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*readdir)(proc_file, &dirent, hppfs_filldir); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) ++{ ++ return(0); ++} ++ ++static struct file_operations hppfs_dir_fops = { ++ .owner = NULL, ++ .readdir = hppfs_readdir, ++ .open = hppfs_dir_open, ++ .fsync = hppfs_fsync, ++}; ++ ++static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf) ++{ ++ sf->f_blocks = 0; ++ sf->f_bfree = 0; ++ sf->f_bavail = 0; ++ sf->f_files = 0; ++ sf->f_ffree = 0; ++ sf->f_type = HPPFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct inode *hppfs_alloc_inode(struct super_block *sb) ++{ ++ struct hppfs_inode_info *hi; ++ ++ hi = kmalloc(sizeof(*hi), GFP_KERNEL); ++ if(hi == NULL) ++ return(NULL); ++ ++ *hi = ((struct hppfs_inode_info) { .proc_dentry = NULL }); ++ inode_init_once(&hi->vfs_inode); ++ return(&hi->vfs_inode); ++} ++ ++void hppfs_delete_inode(struct inode *ino) ++{ ++ clear_inode(ino); ++} ++ ++static void hppfs_destroy_inode(struct inode *inode) ++{ ++ kfree(HPPFS_I(inode)); ++} ++ ++static struct super_operations hppfs_sbops = { ++ .alloc_inode = hppfs_alloc_inode, ++ .destroy_inode = hppfs_destroy_inode, 
++ .read_inode = hppfs_read_inode, ++ .delete_inode = hppfs_delete_inode, ++ .statfs = hppfs_statfs, ++}; ++ ++static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*readlink)(struct dentry *, char *, int); ++ int err, n; ++ ++ proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; ++ err = open_private_file(&proc_file, proc_dentry, O_RDONLY); ++ if(err) ++ return(err); ++ ++ readlink = proc_dentry->d_inode->i_op->readlink; ++ n = (*readlink)(proc_dentry, buffer, buflen); ++ ++ close_private_file(&proc_file); ++ ++ return(n); ++} ++ ++static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*follow_link)(struct dentry *, struct nameidata *); ++ int err, n; ++ ++ proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; ++ err = open_private_file(&proc_file, proc_dentry, O_RDONLY); ++ if(err) ++ return(err); ++ ++ follow_link = proc_dentry->d_inode->i_op->follow_link; ++ n = (*follow_link)(proc_dentry, nd); ++ ++ close_private_file(&proc_file); ++ ++ return(n); ++} ++ ++static struct inode_operations hppfs_dir_iops = { ++ .lookup = hppfs_lookup, ++}; ++ ++static struct inode_operations hppfs_link_iops = { ++ .readlink = hppfs_readlink, ++ .follow_link = hppfs_follow_link, ++}; ++ ++static int init_inode(struct inode *inode, struct dentry *dentry) ++{ ++ if(S_ISDIR(dentry->d_inode->i_mode)){ ++ inode->i_op = &hppfs_dir_iops; ++ inode->i_fop = &hppfs_dir_fops; ++ } ++ else if(S_ISLNK(dentry->d_inode->i_mode)){ ++ inode->i_op = &hppfs_link_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ else { ++ inode->i_op = &hppfs_file_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ ++ HPPFS_I(inode)->proc_dentry = dentry; ++ ++ return(0); ++} ++ ++static int hppfs_fill_super(struct super_block *sb, void *d, int silent) ++{ ++ struct inode *root_inode; ++ struct file_system_type *procfs; ++ struct super_block 
*proc_sb; ++ int err; ++ ++ err = -ENOENT; ++ procfs = get_fs_type("proc"); ++ if(procfs == NULL) ++ goto out; ++ ++ if(list_empty(&procfs->fs_supers)) ++ goto out; ++ ++ proc_sb = list_entry(procfs->fs_supers.next, struct super_block, ++ s_instances); ++ ++ sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HPPFS_SUPER_MAGIC; ++ sb->s_op = &hppfs_sbops; ++ ++ root_inode = iget(sb, 0); ++ if(root_inode == NULL) ++ goto out; ++ ++ err = init_inode(root_inode, proc_sb->s_root); ++ if(err) ++ goto out_put; ++ ++ err = -ENOMEM; ++ sb->s_root = d_alloc_root(root_inode); ++ if(sb->s_root == NULL) ++ goto out_put; ++ ++ hppfs_read_inode(root_inode); ++ ++ return(0); ++ ++ out_put: ++ iput(root_inode); ++ out: ++ return(err); ++} ++ ++static struct super_block *hppfs_read_super(struct file_system_type *type, ++ int flags, const char *dev_name, ++ void *data) ++{ ++ return(get_sb_nodev(type, flags, data, hppfs_fill_super)); ++} ++ ++static struct file_system_type hppfs_type = { ++ .owner = THIS_MODULE, ++ .name = "hppfs", ++ .get_sb = hppfs_read_super, ++ .kill_sb = kill_anon_super, ++ .fs_flags = 0, ++}; ++ ++static int __init init_hppfs(void) ++{ ++ return(register_filesystem(&hppfs_type)); ++} ++ ++static void __exit exit_hppfs(void) ++{ ++ unregister_filesystem(&hppfs_type); ++} ++ ++module_init(init_hppfs) ++module_exit(exit_hppfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur a/include/asm-um/archparam-i386.h b/include/asm-um/archparam-i386.h +--- a/include/asm-um/archparam-i386.h Fri Aug 15 15:07:52 2003 ++++ b/include/asm-um/archparam-i386.h Fri Aug 15 15:13:17 2003 +@@ -56,6 +56,65 @@ + pr_reg[16] = PT_REGS_SS(regs); \ + } while(0); + ++#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) ++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) ++#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) ++extern void *__kernel_vsyscall; ++ ++/* ++ * Architecture-neutral AT_ values in 0-17, leave some room ++ * for more of them, start the x86-specific ones at 32. ++ */ ++#define AT_SYSINFO 32 ++#define AT_SYSINFO_EHDR 33 ++ ++#define ARCH_DLINFO \ ++do { \ ++ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++} while (0) ++ ++/* ++ * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out ++ * extra segments containing the vsyscall DSO contents. Dumping its ++ * contents makes post-mortem fully interpretable later without matching up ++ * the same kernel and hardware config to see what PC values meant. ++ * Dumping its extra ELF program headers includes all the other information ++ * a debugger needs to easily find how the vsyscall DSO was being used. 
++ */ ++#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) ++#define ELF_CORE_WRITE_EXTRA_PHDRS \ ++do { \ ++ const struct elf_phdr *const vsyscall_phdrs = \ ++ (const struct elf_phdr *) (VSYSCALL_BASE \ ++ + VSYSCALL_EHDR->e_phoff); \ ++ int i; \ ++ Elf32_Off ofs = 0; \ ++ for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ struct elf_phdr phdr = vsyscall_phdrs[i]; \ ++ if (phdr.p_type == PT_LOAD) { \ ++ ofs = phdr.p_offset = offset; \ ++ offset += phdr.p_filesz; \ ++ } \ ++ else \ ++ phdr.p_offset += ofs; \ ++ phdr.p_paddr = 0; /* match other core phdrs */ \ ++ DUMP_WRITE(&phdr, sizeof(phdr)); \ ++ } \ ++} while (0) ++#define ELF_CORE_WRITE_EXTRA_DATA \ ++do { \ ++ const struct elf_phdr *const vsyscall_phdrs = \ ++ (const struct elf_phdr *) (VSYSCALL_BASE \ ++ + VSYSCALL_EHDR->e_phoff); \ ++ int i; \ ++ for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ if (vsyscall_phdrs[i].p_type == PT_LOAD) \ ++ DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ ++ vsyscall_phdrs[i].p_filesz); \ ++ } \ ++} while (0) ++ + /********* Bits for asm-um/delay.h **********/ + + typedef unsigned long um_udelay_t; +diff -Naur a/include/asm-um/common.lds.S b/include/asm-um/common.lds.S +--- a/include/asm-um/common.lds.S Fri Aug 15 15:04:49 2003 ++++ b/include/asm-um/common.lds.S Fri Aug 15 15:10:46 2003 +@@ -1,3 +1,5 @@ ++#include ++ + .fini : { *(.fini) } =0x9090 + _etext = .; + PROVIDE (etext = .); +@@ -67,6 +69,10 @@ + } + __initcall_end = .; + ++ __con_initcall_start = .; ++ .con_initcall.init : { *(.con_initcall.init) } ++ __con_initcall_end = .; ++ + __uml_initcall_start = .; + .uml.initcall.init : { *(.uml.initcall.init) } + __uml_initcall_end = .; +@@ -80,7 +86,33 @@ + .uml.exitcall : { *(.uml.exitcall.exit) } + __uml_exitcall_end = .; + +- . = ALIGN(4096); ++ . 
= ALIGN(4); ++ __alt_instructions = .; ++ .altinstructions : { *(.altinstructions) } ++ __alt_instructions_end = .; ++ .altinstr_replacement : { *(.altinstr_replacement) } ++ /* .exit.text is discard at runtime, not link time, to deal with references ++ from .altinstructions and .eh_frame */ ++ .exit.text : { *(.exit.text) } ++ .exit.data : { *(.exit.data) } ++ ++ __preinit_array_start = .; ++ .preinit_array : { *(.preinit_array) } ++ __preinit_array_end = .; ++ __init_array_start = .; ++ .init_array : { *(.init_array) } ++ __init_array_end = .; ++ __fini_array_start = .; ++ .fini_array : { *(.fini_array) } ++ __fini_array_end = .; ++ ++ . = ALIGN(4096); + __initramfs_start = .; + .init.ramfs : { *(.init.ramfs) } + __initramfs_end = .; ++ ++ /* Sections to be discarded */ ++ /DISCARD/ : { ++ *(.exitcall.exit) ++ } ++ +diff -Naur a/include/asm-um/cpufeature.h b/include/asm-um/cpufeature.h +--- a/include/asm-um/cpufeature.h Wed Dec 31 19:00:00 1969 ++++ b/include/asm-um/cpufeature.h Fri Aug 15 15:10:07 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_CPUFEATURE_H ++#define __UM_CPUFEATURE_H ++ ++#include "asm/arch/cpufeature.h" ++ ++#endif +diff -Naur a/include/asm-um/current.h b/include/asm-um/current.h +--- a/include/asm-um/current.h Fri Aug 15 15:04:11 2003 ++++ b/include/asm-um/current.h Fri Aug 15 15:10:19 2003 +@@ -16,8 +16,10 @@ + #define CURRENT_THREAD(dummy) (((unsigned long) &dummy) & \ + (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER)) + +-#define current ({ int dummy; \ +- ((struct thread_info *) CURRENT_THREAD(dummy))->task; }) ++#define current_thread \ ++ ({ int dummy; ((struct thread_info *) CURRENT_THREAD(dummy)); }) ++ ++#define current (current_thread->task) + + #endif /* __ASSEMBLY__ */ + +diff -Naur a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h +--- a/include/asm-um/fixmap.h Fri Aug 15 15:08:40 2003 ++++ b/include/asm-um/fixmap.h Fri Aug 15 15:13:36 2003 +@@ -34,6 +34,7 @@ + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = 
FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + #endif ++ FIX_VSYSCALL, + __end_of_fixed_addresses + }; + +@@ -63,6 +64,13 @@ + #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) + #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + ++/* ++ * This is the range that is readable by user mode, and things ++ * acting like user mode such as get_user_pages. ++ */ ++#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) ++#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) ++ + extern void __this_fixmap_does_not_exist(void); + + /* +diff -Naur a/include/asm-um/irq.h b/include/asm-um/irq.h +--- a/include/asm-um/irq.h Fri Aug 15 15:09:15 2003 ++++ b/include/asm-um/irq.h Fri Aug 15 15:13:51 2003 +@@ -1,15 +1,6 @@ + #ifndef __UM_IRQ_H + #define __UM_IRQ_H + +-/* The i386 irq.h has a struct task_struct in a prototype without including +- * sched.h. This forward declaration kills the resulting warning. +- */ +-struct task_struct; +- +-#include "asm/ptrace.h" +- +-#undef NR_IRQS +- + #define TIMER_IRQ 0 + #define UMN_IRQ 1 + #define CONSOLE_IRQ 2 +@@ -28,8 +19,4 @@ + #define LAST_IRQ XTERM_IRQ + #define NR_IRQS (LAST_IRQ + 1) + +-extern int um_request_irq(unsigned int irq, int fd, int type, +- void (*handler)(int, void *, struct pt_regs *), +- unsigned long irqflags, const char * devname, +- void *dev_id); + #endif +diff -Naur a/include/asm-um/local.h b/include/asm-um/local.h +--- a/include/asm-um/local.h Wed Dec 31 19:00:00 1969 ++++ b/include/asm-um/local.h Fri Aug 15 15:12:46 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_LOCAL_H ++#define __UM_LOCAL_H ++ ++#include "asm/arch/local.h" ++ ++#endif +diff -Naur a/include/asm-um/module-generic.h b/include/asm-um/module-generic.h +--- a/include/asm-um/module-generic.h Wed Dec 31 19:00:00 1969 ++++ b/include/asm-um/module-generic.h Fri Aug 15 15:12:38 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_MODULE_GENERIC_H ++#define __UM_MODULE_GENERIC_H ++ ++#include "asm/arch/module.h" ++ ++#endif +diff -Naur 
a/include/asm-um/module-i386.h b/include/asm-um/module-i386.h +--- a/include/asm-um/module-i386.h Wed Dec 31 19:00:00 1969 ++++ b/include/asm-um/module-i386.h Fri Aug 15 15:12:37 2003 +@@ -0,0 +1,13 @@ ++#ifndef __UM_MODULE_I386_H ++#define __UM_MODULE_I386_H ++ ++/* UML is simple */ ++struct mod_arch_specific ++{ ++}; ++ ++#define Elf_Shdr Elf32_Shdr ++#define Elf_Sym Elf32_Sym ++#define Elf_Ehdr Elf32_Ehdr ++ ++#endif +diff -Naur a/include/asm-um/page.h b/include/asm-um/page.h +--- a/include/asm-um/page.h Fri Aug 15 15:06:42 2003 ++++ b/include/asm-um/page.h Fri Aug 15 15:12:40 2003 +@@ -4,7 +4,6 @@ + struct page; + + #include "asm/arch/page.h" +-#include "asm/bug.h" + + #undef __pa + #undef __va +diff -Naur a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h +--- a/include/asm-um/pgtable.h Fri Aug 15 15:09:25 2003 ++++ b/include/asm-um/pgtable.h Fri Aug 15 15:14:09 2003 +@@ -79,12 +79,13 @@ + + #define _PAGE_PRESENT 0x001 + #define _PAGE_NEWPAGE 0x002 +-#define _PAGE_PROTNONE 0x004 /* If not present */ +-#define _PAGE_RW 0x008 +-#define _PAGE_USER 0x010 +-#define _PAGE_ACCESSED 0x020 +-#define _PAGE_DIRTY 0x040 +-#define _PAGE_NEWPROT 0x080 ++#define _PAGE_NEWPROT 0x004 ++#define _PAGE_FILE 0x008 /* set:pagecache unset:swap */ ++#define _PAGE_PROTNONE 0x010 /* If not present */ ++#define _PAGE_RW 0x020 ++#define _PAGE_USER 0x040 ++#define _PAGE_ACCESSED 0x080 ++#define _PAGE_DIRTY 0x100 + + #define REGION_MASK 0xf0000000 + #define REGION_SHIFT 28 +@@ -203,6 +204,16 @@ + #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) + #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) + ++/* ++ * Bits 0 through 3 are taken ++ */ ++#define PTE_FILE_MAX_BITS 28 ++ ++#define pte_to_pgoff(pte) ((pte).pte_low >> 4) ++ ++#define pgoff_to_pte(off) \ ++ ((pte_t) { ((off) << 4) + _PAGE_FILE }) ++ + static inline pte_t pte_mknewprot(pte_t pte) + { + pte_val(pte) |= _PAGE_NEWPROT; +@@ -236,6 +247,12 @@ + * The following only work if pte_present() 
is true. + * Undefined behaviour if not.. + */ ++static inline int pte_user(pte_t pte) ++{ ++ return((pte_val(pte) & _PAGE_USER) && ++ !(pte_val(pte) & _PAGE_PROTNONE)); ++} ++ + static inline int pte_read(pte_t pte) + { + return((pte_val(pte) & _PAGE_USER) && +@@ -253,6 +270,14 @@ + !(pte_val(pte) & _PAGE_PROTNONE)); + } + ++/* ++ * The following only works if pte_present() is not true. ++ */ ++static inline int pte_file(pte_t pte) ++{ ++ return (pte).pte_low & _PAGE_FILE; ++} ++ + static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } + static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } + static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } +@@ -355,14 +380,26 @@ + #define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \ + ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT))) + +-/* to find an entry in a page-table-directory. */ ++/* ++ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] ++ * ++ * this macro returns the index of the entry in the pgd page which would ++ * control the given virtual address ++ */ + #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) + +-/* to find an entry in a page-table-directory */ ++/* ++ * pgd_offset() returns a (pgd_t *) ++ * pgd_index() is used get the offset into the pgd page's array of pgd_t's; ++ */ + #define pgd_offset(mm, address) \ + ((mm)->pgd + ((address) >> PGDIR_SHIFT)) + +-/* to find an entry in a kernel page-table-directory */ ++ ++/* ++ * a shortcut which implies the use of the kernel's pgd, instead ++ * of a process's ++ */ + #define pgd_offset_k(address) pgd_offset(&init_mm, address) + + #define pmd_index(address) \ +@@ -374,7 +411,12 @@ + return (pmd_t *) dir; + } + +-/* Find an entry in the third-level page table.. 
*/ ++/* ++ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] ++ * ++ * this macro returns the index of the entry in the pte page which would ++ * control the given virtual address ++ */ + #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + #define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) +@@ -400,11 +442,11 @@ + #define update_mmu_cache(vma,address,pte) do ; while (0) + + /* Encode and de-code a swap entry */ +-#define __swp_type(x) (((x).val >> 3) & 0x7f) +-#define __swp_offset(x) ((x).val >> 10) ++#define __swp_type(x) (((x).val >> 4) & 0x3f) ++#define __swp_offset(x) ((x).val >> 11) + + #define __swp_entry(type, offset) \ +- ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) ++ ((swp_entry_t) { ((type) << 4) | ((offset) << 11) }) + #define __pte_to_swp_entry(pte) \ + ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) + #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +diff -Naur a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h +--- a/include/asm-um/processor-generic.h Fri Aug 15 15:04:48 2003 ++++ b/include/asm-um/processor-generic.h Fri Aug 15 15:10:42 2003 +@@ -11,9 +11,7 @@ + struct task_struct; + + #include "linux/config.h" +-#include "linux/signal.h" + #include "asm/ptrace.h" +-#include "asm/siginfo.h" + #include "choose-mode.h" + + struct mm_struct; +@@ -101,14 +99,19 @@ + } mm_segment_t; + + extern struct task_struct *alloc_task_struct(void); +-extern void free_task_struct(struct task_struct *task); + + extern void release_thread(struct task_struct *); + extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + extern void dump_thread(struct pt_regs *regs, struct user *u); ++extern void prepare_to_copy(struct task_struct *tsk); + + extern unsigned long thread_saved_pc(struct task_struct *t); + ++static inline void mm_copy_segments(struct mm_struct *from_mm, ++ struct mm_struct *new_mm) ++{ ++} ++ + #define 
init_stack (init_thread_union.stack) + + /* +diff -Naur a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h +--- a/include/asm-um/processor-i386.h Fri Aug 15 15:04:00 2003 ++++ b/include/asm-um/processor-i386.h Fri Aug 15 15:10:18 2003 +@@ -6,8 +6,8 @@ + #ifndef __UM_PROCESSOR_I386_H + #define __UM_PROCESSOR_I386_H + +-extern int cpu_has_xmm; +-extern int cpu_has_cmov; ++extern int host_has_xmm; ++extern int host_has_cmov; + + struct arch_thread { + unsigned long debugregs[8]; +diff -Naur a/include/asm-um/sections.h b/include/asm-um/sections.h +--- a/include/asm-um/sections.h Wed Dec 31 19:00:00 1969 ++++ b/include/asm-um/sections.h Fri Aug 15 15:12:54 2003 +@@ -0,0 +1,7 @@ ++#ifndef _UM_SECTIONS_H ++#define _UM_SECTIONS_H ++ ++/* nothing to see, move along */ ++#include ++ ++#endif +diff -Naur a/include/asm-um/smp.h b/include/asm-um/smp.h +--- a/include/asm-um/smp.h Fri Aug 15 15:03:35 2003 ++++ b/include/asm-um/smp.h Fri Aug 15 15:10:04 2003 +@@ -7,9 +7,10 @@ + + #include "linux/config.h" + #include "linux/bitops.h" ++#include "linux/threads.h" + #include "asm/current.h" + +-#define smp_processor_id() (current->thread_info->cpu) ++#define smp_processor_id() (current_thread->cpu) + #define cpu_logical_map(n) (n) + #define cpu_number_map(n) (n) + #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ +@@ -30,6 +31,13 @@ + { + } + ++extern inline int any_online_cpu(unsigned int mask) ++{ ++ if (mask & cpu_online_map) ++ return __ffs(mask & cpu_online_map); ++ ++ return -1; ++} + #endif + + #endif +diff -Naur a/include/asm-um/system-generic.h b/include/asm-um/system-generic.h +--- a/include/asm-um/system-generic.h Fri Aug 15 15:09:22 2003 ++++ b/include/asm-um/system-generic.h Fri Aug 15 15:14:01 2003 +@@ -23,8 +23,10 @@ + extern void block_signals(void); + extern void unblock_signals(void); + +-#define local_save_flags(flags) do { (flags) = get_signals(); } while(0) +-#define local_irq_restore(flags) do { set_signals(flags); } while(0) 
++#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ ++ (flags) = get_signals(); } while(0) ++#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ ++ set_signals(flags); } while(0) + + #define local_irq_save(flags) do { local_save_flags(flags); \ + local_irq_disable(); } while(0) +diff -Naur a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h +--- a/include/asm-um/thread_info.h Fri Aug 15 15:05:00 2003 ++++ b/include/asm-um/thread_info.h Fri Aug 15 15:11:11 2003 +@@ -9,6 +9,7 @@ + #ifndef __ASSEMBLY__ + + #include ++#include + + struct thread_info { + struct task_struct *task; /* main task structure */ +@@ -43,15 +44,18 @@ + static inline struct thread_info *current_thread_info(void) + { + struct thread_info *ti; +- __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~16383UL)); ++ unsigned long mask = PAGE_SIZE * ++ (1 << CONFIG_KERNEL_STACK_ORDER) - 1; ++ __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~mask)); + return ti; + } + + /* thread information allocation */ +-#define THREAD_SIZE (4*PAGE_SIZE) +-#define alloc_thread_info(tsk) ((struct thread_info *) \ +- __get_free_pages(GFP_KERNEL,2)) +-#define free_thread_info(ti) free_pages((unsigned long) (ti), 2) ++#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) ++#define alloc_thread_info(tsk) \ ++ ((struct thread_info *) kmalloc(THREAD_SIZE, GFP_KERNEL)) ++#define free_thread_info(ti) kfree(ti) ++ + #define get_thread_info(ti) get_task_struct((ti)->task) + #define put_thread_info(ti) put_task_struct((ti)->task) + +@@ -65,11 +69,13 @@ + #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling + * TIF_NEED_RESCHED + */ ++#define TIF_RESTART_BLOCK 4 + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) ++#define _TIF_RESTART_BLOCK (1 << TIF_RESTART_BLOCK) + + #endif + +diff -Naur 
a/include/asm-um/timex.h b/include/asm-um/timex.h +--- a/include/asm-um/timex.h Fri Aug 15 15:07:22 2003 ++++ b/include/asm-um/timex.h Fri Aug 15 15:12:48 2003 +@@ -1,8 +1,6 @@ + #ifndef __UM_TIMEX_H + #define __UM_TIMEX_H + +-#include "linux/time.h" +- + typedef unsigned long cycles_t; + + #define cacheflush_time (0) +diff -Naur a/include/linux/mm.h b/include/linux/mm.h +--- a/include/linux/mm.h Fri Aug 15 15:03:56 2003 ++++ b/include/linux/mm.h Fri Aug 15 15:10:14 2003 +@@ -483,6 +483,9 @@ + return __set_page_dirty_buffers(page); + } + ++extern long do_mprotect(struct mm_struct *mm, unsigned long start, ++ size_t len, unsigned long prot); ++ + /* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all +@@ -513,9 +516,10 @@ + + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flag, unsigned long pgoff); ++extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flag, ++ unsigned long pgoff); + + static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +@@ -525,7 +529,8 @@ + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) +- ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); ++ ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag, ++ offset >> PAGE_SHIFT); + out: + return ret; + } +diff -Naur a/include/linux/proc_mm.h b/include/linux/proc_mm.h +--- a/include/linux/proc_mm.h Wed Dec 31 19:00:00 1969 ++++ b/include/linux/proc_mm.h Fri Aug 15 15:10:02 2003 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ 
++#ifndef __PROC_MM_H ++#define __PROC_MM_H ++ ++#include "linux/sched.h" ++ ++#define MM_MMAP 54 ++#define MM_MUNMAP 55 ++#define MM_MPROTECT 56 ++#define MM_COPY_SEGMENTS 57 ++ ++struct mm_mmap { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++struct mm_munmap { ++ unsigned long addr; ++ unsigned long len; ++}; ++ ++struct mm_mprotect { ++ unsigned long addr; ++ unsigned long len; ++ unsigned int prot; ++}; ++ ++struct proc_mm_op { ++ int op; ++ union { ++ struct mm_mmap mmap; ++ struct mm_munmap munmap; ++ struct mm_mprotect mprotect; ++ int copy_segments; ++ } u; ++}; ++ ++extern struct mm_struct *proc_mm_get_mm(int fd); ++ ++#endif +diff -Naur a/mm/Makefile b/mm/Makefile +--- a/mm/Makefile Fri Aug 15 15:07:22 2003 ++++ b/mm/Makefile Fri Aug 15 15:12:48 2003 +@@ -12,3 +12,5 @@ + slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) + + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o ++obj-$(CONFIG_PROC_MM) += proc_mm.o ++ +diff -Naur a/mm/memory.c b/mm/memory.c +--- a/mm/memory.c Fri Aug 15 15:05:37 2003 ++++ b/mm/memory.c Fri Aug 15 15:11:48 2003 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -669,6 +670,24 @@ + } + + ++static struct vm_area_struct fixmap_vma = { ++ /* Catch users - if there are any valid ++ ones, we can make this be "&init_mm" or ++ something. 
*/ ++ .vm_mm = NULL, ++ .vm_page_prot = PAGE_READONLY, ++ .vm_flags = VM_READ | VM_EXEC, ++}; ++ ++static int init_fixmap_vma(void) ++{ ++ fixmap_vma.vm_start = FIXADDR_START; ++ fixmap_vma.vm_end = FIXADDR_TOP; ++ return(0); ++} ++ ++__initcall(init_fixmap_vma); ++ + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +@@ -689,19 +708,8 @@ + + vma = find_extend_vma(mm, start); + +-#ifdef FIXADDR_USER_START +- if (!vma && +- start >= FIXADDR_USER_START && start < FIXADDR_USER_END) { +- static struct vm_area_struct fixmap_vma = { +- /* Catch users - if there are any valid +- ones, we can make this be "&init_mm" or +- something. */ +- .vm_mm = NULL, +- .vm_start = FIXADDR_USER_START, +- .vm_end = FIXADDR_USER_END, +- .vm_page_prot = PAGE_READONLY, +- .vm_flags = VM_READ | VM_EXEC, +- }; ++#ifdef FIXADDR_START ++ if (!vma && start >= FIXADDR_START && start < FIXADDR_TOP) { + unsigned long pg = start & PAGE_MASK; + pgd_t *pgd; + pmd_t *pmd; +diff -Naur a/mm/mmap.c b/mm/mmap.c +--- a/mm/mmap.c Fri Aug 15 15:07:18 2003 ++++ b/mm/mmap.c Fri Aug 15 15:12:45 2003 +@@ -457,11 +457,11 @@ + * The caller must hold down_write(current->mm->mmap_sem). 
+ */ + +-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flags, unsigned long pgoff) ++unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long pgoff) + { +- struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; +diff -Naur a/mm/mprotect.c b/mm/mprotect.c +--- a/mm/mprotect.c Fri Aug 15 15:05:20 2003 ++++ b/mm/mprotect.c Fri Aug 15 15:11:21 2003 +@@ -222,7 +222,8 @@ + } + + asmlinkage long +-sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, ++ unsigned long prot) + { + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; +@@ -239,9 +240,9 @@ + if (end == start) + return 0; + +- down_write(&current->mm->mmap_sem); ++ down_write(&mm->mmap_sem); + +- vma = find_vma_prev(current->mm, start, &prev); ++ vma = find_vma_prev(mm, start, &prev); + error = -ENOMEM; + if (!vma || vma->vm_start > start) + goto out; +@@ -301,6 +302,11 @@ + prev->vm_mm->map_count--; + } + out: +- up_write(&current->mm->mmap_sem); ++ up_write(&mm->mmap_sem); + return error; + } ++ ++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++{ ++ return(do_mprotect(current->mm, start, len, prot)); ++} +diff -Naur a/mm/proc_mm.c b/mm/proc_mm.c +--- a/mm/proc_mm.c Wed Dec 31 19:00:00 1969 ++++ b/mm/proc_mm.c Fri Aug 15 15:11:44 2003 +@@ -0,0 +1,174 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/mm.h" ++#include "linux/init.h" ++#include "linux/proc_fs.h" ++#include "linux/proc_mm.h" ++#include "linux/file.h" ++#include "asm/uaccess.h" ++#include "asm/mmu_context.h" ++ ++static struct file_operations proc_mm_fops; ++ ++struct mm_struct 
*proc_mm_get_mm(int fd) ++{ ++ struct mm_struct *ret = ERR_PTR(-EBADF); ++ struct file *file; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ ret = ERR_PTR(-EINVAL); ++ if(file->f_op != &proc_mm_fops) ++ goto out_fput; ++ ++ ret = file->private_data; ++ out_fput: ++ fput(file); ++ out: ++ return(ret); ++} ++ ++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, ++ unsigned long len, unsigned long prot, ++ unsigned long flags, unsigned long fd, ++ unsigned long pgoff); ++ ++static ssize_t write_proc_mm(struct file *file, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct mm_struct *mm = file->private_data; ++ struct proc_mm_op req; ++ int n, ret; ++ ++ if(count > sizeof(req)) ++ return(-EINVAL); ++ ++ n = copy_from_user(&req, buffer, count); ++ if(n != 0) ++ return(-EFAULT); ++ ++ ret = count; ++ switch(req.op){ ++ case MM_MMAP: { ++ struct mm_mmap *map = &req.u.mmap; ++ ++ ret = do_mmap2(mm, map->addr, map->len, map->prot, ++ map->flags, map->fd, map->offset >> PAGE_SHIFT); ++ if((ret & ~PAGE_MASK) == 0) ++ ret = count; ++ ++ break; ++ } ++ case MM_MUNMAP: { ++ struct mm_munmap *unmap = &req.u.munmap; ++ ++ down_write(&mm->mmap_sem); ++ ret = do_munmap(mm, unmap->addr, unmap->len); ++ up_write(&mm->mmap_sem); ++ ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ case MM_MPROTECT: { ++ struct mm_mprotect *protect = &req.u.mprotect; ++ ++ ret = do_mprotect(mm, protect->addr, protect->len, ++ protect->prot); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ ++ case MM_COPY_SEGMENTS: { ++ struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments); ++ ++ if(IS_ERR(from)){ ++ ret = PTR_ERR(from); ++ break; ++ } ++ ++ mm_copy_segments(from, mm); ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return(ret); ++} ++ ++static int open_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = mm_alloc(); ++ int ret; ++ ++ ret = -ENOMEM; ++ if(mm == NULL) ++ goto out_mem; ++ ++ ret = 
init_new_context(current, mm); ++ if(ret) ++ goto out_free; ++ ++ spin_lock(&mmlist_lock); ++ list_add(&mm->mmlist, &current->mm->mmlist); ++ mmlist_nr++; ++ spin_unlock(&mmlist_lock); ++ ++ file->private_data = mm; ++ ++ return(0); ++ ++ out_free: ++ mmput(mm); ++ out_mem: ++ return(ret); ++} ++ ++static int release_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = file->private_data; ++ ++ mmput(mm); ++ return(0); ++} ++ ++static struct file_operations proc_mm_fops = { ++ .open = open_proc_mm, ++ .release = release_proc_mm, ++ .write = write_proc_mm, ++}; ++ ++static int make_proc_mm(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = create_proc_entry("mm", 0222, &proc_root); ++ if(ent == NULL){ ++ printk("make_proc_mm : Failed to register /proc/mm\n"); ++ return(0); ++ } ++ ent->proc_fops = &proc_mm_fops; ++ ++ return(0); ++} ++ ++__initcall(make_proc_mm); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch new file mode 100644 index 0000000..a9cc225 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch @@ -0,0 +1,265 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +--- linux-2.4.18/fs/namei.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/fs/namei.c 2003-09-01 17:56:10.000000000 +0400 +@@ -101,6 +101,36 @@ void intent_release(struct lookup_intent + + } + ++static void *lock_dir(struct inode *dir, struct qstr *name) ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 0; ++ } ++ ++ /* OK. fs understands parallel directory operations. ++ * so, we try to acquire lock for hash of requested ++ * filename in order to prevent any operations with ++ * same name in same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++ ++static void unlock_dir(struct inode *dir, void *lock) ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. 
+@@ -302,10 +332,10 @@ static struct dentry *real_lookup(struct + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ void *lock; + + again: +- +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -329,7 +359,7 @@ again: + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -337,7 +367,7 @@ again: + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + dput(result); +@@ -1234,13 +1264,13 @@ struct file *filp_open(const char * path + goto exit; + + dir = nd.dentry; +- down(&dir->d_inode->i_sem); ++ nd.lock = lock_dir(dir->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + goto exit; + } + +@@ -1249,7 +1279,7 @@ do_last: + if (!dentry->d_inode) { + error = vfs_create_it(dir->d_inode, dentry, + mode & ~current->fs->umask, &it); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + dput(nd.dentry); + nd.dentry = dentry; + if (error) +@@ -1264,7 +1294,7 @@ do_last: + /* + * It already exists. 
+ */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd.lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1344,7 +1374,7 @@ do_link: + goto exit; + } + dir = nd.dentry; +- down(&dir->d_inode->i_sem); ++ nd.lock = lock_dir(dir->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + putname(nd.last.name); + goto do_last; +@@ -1357,7 +1387,7 @@ static struct dentry *lookup_create(stru + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1446,7 +1476,7 @@ asmlinkage long sys_mknod(const char * f + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1509,7 +1539,7 @@ asmlinkage long sys_mkdir(const char * p + mode & ~current->fs->umask); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1619,14 +1649,14 @@ asmlinkage long sys_rmdir(const char * p + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1685,7 +1715,7 @@ asmlinkage long sys_unlink(const char * + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1696,7 +1726,7 @@ asmlinkage long sys_unlink(const char * + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ 
unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1766,7 +1796,7 @@ asmlinkage long sys_symlink(const char * + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1858,7 +1888,7 @@ asmlinkage long sys_link(const char * ol + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +--- linux-2.4.18/include/linux/fs.h~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/include/linux/fs.h 2003-09-01 16:36:16.000000000 +0400 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -136,6 +137,7 @@ extern int leases_enable, dir_notify_ena + #define S_IMMUTABLE 16 /* Immutable file */ + #define S_DEAD 32 /* removed, but still open directory */ + #define S_NOQUOTA 64 /* Inode is not counted to quota */ ++#define S_PDIROPS 256 /* Parallel directory operations */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -162,6 +164,7 @@ extern int leases_enable, dir_notify_ena + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) + #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +@@ -490,6 +493,7 @@ struct inode { + atomic_t i_writecount; + unsigned int i_attr_flags; + __u32 i_generation; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + struct minix_inode_info minix_i; + struct ext2_inode_info ext2_i; +@@ -713,6 +717,7 @@ struct nameidata { + unsigned int flags; + int last_type; + struct lookup_intent 
*intent; ++ void *lock; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -1610,12 +1615,6 @@ static inline struct dentry *get_parent( + return dget(dentry->d_parent); + } + +-static inline void unlock_dir(struct dentry *dir) +-{ +- up(&dir->d_inode->i_sem); +- dput(dir); +-} +- + /* + * Whee.. Deadlock country. Happily there are only two VFS + * operations that does this.. +--- linux-2.4.18/fs/inode.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 ++++ linux-2.4.18-alexey/fs/inode.c 2003-09-01 16:36:16.000000000 +0400 +@@ -119,6 +119,7 @@ static struct inode *alloc_inode(struct + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + inode->i_mapping = mapping; ++ dynlock_init(&inode->i_dcache_lock); + } + return inode; + } + +_ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch new file mode 100644 index 0000000..c9228a8 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.4.20-rh.patch @@ -0,0 +1,269 @@ + fs/inode.c | 1 + fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + include/linux/fs.h | 11 ++++---- + 3 files changed, 54 insertions(+), 24 deletions(-) + +Index: linux-2.4.20-rh/fs/namei.c +=================================================================== +--- linux-2.4.20-rh.orig/fs/namei.c 2003-09-04 20:58:33.000000000 +0800 ++++ linux-2.4.20-rh/fs/namei.c 2003-09-04 21:21:20.000000000 +0800 +@@ -101,6 +101,36 @@ + + } + ++static void *lock_dir(struct inode *dir, struct qstr *name) ++{ ++ unsigned long hash; ++ ++ if (!IS_PDIROPS(dir)) { ++ down(&dir->i_sem); ++ return 0; ++ } ++ ++ /* OK. fs understands parallel directory operations. 
++ * so, we try to acquire a lock for the hash of the requested ++ * filename in order to prevent any operations on the ++ * same name at the same time -bzzz */ ++ ++ /* calculate name hash */ ++ hash = full_name_hash(name->name, name->len); ++ ++ /* lock this hash */ ++ return dynlock_lock(&dir->i_dcache_lock, hash, 1, GFP_ATOMIC); ++} ++ ++static void unlock_dir(struct inode *dir, void *lock) ++{ ++ if (!IS_PDIROPS(dir)) { ++ up(&dir->i_sem); ++ return; ++ } ++ dynlock_unlock(&dir->i_dcache_lock, lock); ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -302,10 +332,10 @@ + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ void *lock; + + again: +- +- down(&dir->i_sem); ++ lock = lock_dir(dir, name); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -329,7 +359,7 @@ + else + result = dentry; + } +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + return result; + } + +@@ -337,7 +367,7 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. 
+ */ +- up(&dir->i_sem); ++ unlock_dir(dir, lock); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + dput(result); +@@ -1180,13 +1210,13 @@ + goto exit; + + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + goto exit; + } + +@@ -1195,7 +1225,7 @@ + if (!dentry->d_inode) { + error = vfs_create_it(dir->d_inode, dentry, + mode & ~current->fs->umask, it); +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + dput(nd->dentry); + nd->dentry = dentry; + if (error) +@@ -1209,7 +1239,7 @@ + /* + * It already exists. + */ +- up(&dir->d_inode->i_sem); ++ unlock_dir(dir->d_inode, nd->lock); + + error = -EEXIST; + if (flag & O_EXCL) +@@ -1362,7 +1392,7 @@ + goto exit; + } + dir = nd->dentry; +- down(&dir->d_inode->i_sem); ++ nd->lock = lock_dir(dir->d_inode, &nd->last); + dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; +@@ -1380,7 +1410,7 @@ + { + struct dentry *dentry; + +- down(&nd->dentry->d_inode->i_sem); ++ nd->lock = lock_dir(nd->dentry->d_inode, &nd->last); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +@@ -1469,7 +1499,7 @@ + } + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1532,7 +1562,7 @@ + mode & ~current->fs->umask); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1642,14 +1672,14 @@ + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = 
PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1708,7 +1738,7 @@ + if (error != -EOPNOTSUPP) + goto exit1; + } +- down(&nd.dentry->d_inode->i_sem); ++ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); + dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1719,7 +1749,7 @@ + exit2: + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + exit1: + path_release(&nd); + exit: +@@ -1789,7 +1819,7 @@ + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out2: + path_release(&nd); + out: +@@ -1881,7 +1911,7 @@ + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } +- up(&nd.dentry->d_inode->i_sem); ++ unlock_dir(nd.dentry->d_inode, nd.lock); + out_release: + path_release(&nd); + out: +Index: linux-2.4.20-rh/include/linux/fs.h +=================================================================== +--- linux-2.4.20-rh.orig/include/linux/fs.h 2003-09-04 20:59:14.000000000 +0800 ++++ linux-2.4.20-rh/include/linux/fs.h 2003-09-04 21:03:46.000000000 +0800 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -136,6 +137,7 @@ + #define S_IMMUTABLE 16 /* Immutable file */ + #define S_DEAD 32 /* removed, but still open directory */ + #define S_NOQUOTA 64 /* Inode is not counted to quota */ ++#define S_PDIROPS 256 /* Parallel directory operations */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -162,6 +164,7 @@ + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) + #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) + #define 
IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) ++#define IS_PDIROPS(inode) __IS_FLG(inode, S_PDIROPS) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +@@ -489,6 +492,7 @@ + atomic_t i_writecount; + unsigned int i_attr_flags; + __u32 i_generation; ++ struct dynlock i_dcache_lock; /* for parallel directory ops */ + union { + struct minix_inode_info minix_i; + struct ext2_inode_info ext2_i; +@@ -708,6 +712,7 @@ + unsigned int flags; + int last_type; + struct lookup_intent *intent; ++ void *lock; + }; + + /* +@@ -1621,12 +1626,6 @@ + return dget(dentry->d_parent); + } + +-static inline void unlock_dir(struct dentry *dir) +-{ +- up(&dir->d_inode->i_sem); +- dput(dir); +-} +- + /* + * Whee.. Deadlock country. Happily there are only two VFS + * operations that does this.. +Index: linux-2.4.20-rh/fs/inode.c +=================================================================== +--- linux-2.4.20-rh.orig/fs/inode.c 2003-09-04 20:58:35.000000000 +0800 ++++ linux-2.4.20-rh/fs/inode.c 2003-09-04 21:03:46.000000000 +0800 +@@ -121,6 +121,7 @@ + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + inode->i_mapping = mapping; ++ dynlock_init(&inode->i_dcache_lock); + } + return inode; + } diff --git a/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc b/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc new file mode 100644 index 0000000..b626dcf --- /dev/null +++ b/lustre/kernel_patches/pc/dynamic-locks-2.4.18-chaos.pc @@ -0,0 +1,3 @@ +include/linux/dynlocks.h +lib/dynlocks.c +lib/Makefile diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-5.pc b/lustre/kernel_patches/pc/ext-2.4-patch-5.pc new file mode 100644 index 0000000..7191405 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-5.pc @@ -0,0 +1 @@ +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc new file mode 100644 index 0000000..bd89204 --- /dev/null +++ 
b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc @@ -0,0 +1,20 @@ +fs/ext3/balloc.c +fs/ext3/balloc.c.orig +fs/ext3/dir.c +fs/ext3/dir.c.orig +fs/ext3/ialloc.c +fs/ext3/ialloc.c.orig +fs/ext3/inode.c +fs/ext3/inode.c.orig +fs/ext3/ioctl.c +fs/ext3/ioctl.c.orig +fs/ext3/namei.c +fs/ext3/namei.c.orig +fs/ext3/super.c +fs/ext3/super.c.orig +fs/ext3/symlink.c +fs/ext3/symlink.c.orig +include/linux/ext3_fs.h +include/linux/ext3_fs.h.orig +include/linux/ext3_jbd.h +include/linux/ext3_jbd.h.orig diff --git a/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc new file mode 100644 index 0000000..9b16759 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc @@ -0,0 +1 @@ +fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc new file mode 100644 index 0000000..42243c8 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc @@ -0,0 +1,6 @@ +fs/ext3/file.c +fs/ext3/file.c.orig +fs/ext3/inode.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc new file mode 100644 index 0000000..f408025 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos.pc @@ -0,0 +1,8 @@ +fs/ext3/extents.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc new file mode 100644 index 0000000..56c1739 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-extents-oflag-2.4.18-chaos.pc @@ -0,0 +1,19 @@ +fs/ext3/ialloc.c +fs/ext3/namei.c +include/asm-alpha/fcntl.h +include/asm-arm/fcntl.h +include/asm-cris/fcntl.h +include/asm-i386/fcntl.h 
+include/asm-ia64/fcntl.h +include/asm-m68k/fcntl.h +include/asm-mips64/fcntl.h +include/asm-mips/fcntl.h +include/asm-parisc/fcntl.h +include/asm-ppc/fcntl.h +include/asm-s390/fcntl.h +include/asm-s390x/fcntl.h +include/asm-sh/fcntl.h +include/asm-sparc64/fcntl.h +include/asm-sparc/fcntl.h +include/linux/ext3_fs.h +fs/ext3/inode.c diff --git a/lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc b/lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc new file mode 100644 index 0000000..231df0e --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-map_inode_page-2.6.0.pc @@ -0,0 +1,2 @@ +fs/ext3/inode.c +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc b/lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc new file mode 100644 index 0000000..08795de --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-no-write-super-chaos.pc @@ -0,0 +1 @@ +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc new file mode 100644 index 0000000..2ad2584 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-pdirops-2.4.18-chaos.pc @@ -0,0 +1,6 @@ +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +fs/ext3/inode.c +fs/ext3/ialloc.c diff --git a/lustre/kernel_patches/pc/iopen-2.4.18-2.pc b/lustre/kernel_patches/pc/iopen-2.4.18-2.pc new file mode 100644 index 0000000..308490e --- /dev/null +++ b/lustre/kernel_patches/pc/iopen-2.4.18-2.pc @@ -0,0 +1,8 @@ +Documentation/filesystems/ext2.txt +fs/ext3/inode.c +fs/ext3/iopen.c +fs/ext3/iopen.h +fs/ext3/Makefile +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/iopen-2.6.0.pc b/lustre/kernel_patches/pc/iopen-2.6.0.pc new file mode 100644 index 0000000..308490e --- /dev/null +++ b/lustre/kernel_patches/pc/iopen-2.6.0.pc @@ -0,0 +1,8 @@ +Documentation/filesystems/ext2.txt +fs/ext3/inode.c +fs/ext3/iopen.c +fs/ext3/iopen.h +fs/ext3/Makefile +fs/ext3/namei.c 
+fs/ext3/super.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc new file mode 100644 index 0000000..1078cb4 --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26-2.pc @@ -0,0 +1,11 @@ +fs/ext3/ext3-exports.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/xattr.c +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/xattr.h diff --git a/lustre/kernel_patches/pc/removepage-2.4.20.pc b/lustre/kernel_patches/pc/removepage-2.4.20.pc new file mode 100644 index 0000000..c659e15 --- /dev/null +++ b/lustre/kernel_patches/pc/removepage-2.4.20.pc @@ -0,0 +1,2 @@ +include/linux/fs.h +mm/filemap.c diff --git a/lustre/kernel_patches/pc/removepage-2.6.0.pc b/lustre/kernel_patches/pc/removepage-2.6.0.pc new file mode 100644 index 0000000..c659e15 --- /dev/null +++ b/lustre/kernel_patches/pc/removepage-2.6.0.pc @@ -0,0 +1,2 @@ +include/linux/fs.h +mm/filemap.c diff --git a/lustre/kernel_patches/pc/uml-2.6.0-fix.pc b/lustre/kernel_patches/pc/uml-2.6.0-fix.pc new file mode 100644 index 0000000..980e3ee --- /dev/null +++ b/lustre/kernel_patches/pc/uml-2.6.0-fix.pc @@ -0,0 +1 @@ +include/asm-um/unistd.h diff --git a/lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc b/lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc new file mode 100644 index 0000000..9a32c9a --- /dev/null +++ b/lustre/kernel_patches/pc/uml-patch-2.6.0-test3-1.pc @@ -0,0 +1,113 @@ +arch/um/config.release +arch/um/defconfig +arch/um/drivers/chan_kern.c +arch/um/drivers/chan_user.c +arch/um/drivers/cow.h +arch/um/drivers/cow_kern.c +arch/um/drivers/cow_sys.h +arch/um/drivers/cow_user.c +arch/um/drivers/hostaudio_kern.c +arch/um/drivers/line.c +arch/um/drivers/Makefile +arch/um/drivers/mconsole_kern.c +arch/um/drivers/mconsole_user.c +arch/um/drivers/mmapper_kern.c +arch/um/drivers/net_kern.c +arch/um/drivers/port_kern.c 
+arch/um/drivers/ssl.c +arch/um/drivers/stdio_console.c +arch/um/drivers/ubd_kern.c +arch/um/drivers/ubd_user.c +arch/um/drivers/xterm.c +arch/um/drivers/xterm_kern.c +arch/um/dyn.lds.S +arch/um/include/irq_kern.h +arch/um/include/kern_util.h +arch/um/include/line.h +arch/um/include/mconsole.h +arch/um/include/mem.h +arch/um/include/mem_user.h +arch/um/include/os.h +arch/um/include/sysdep-i386/sigcontext.h +arch/um/include/ubd_user.h +arch/um/include/user.h +arch/um/include/user_util.h +arch/um/Kconfig +arch/um/Kconfig_block +arch/um/Kconfig_net +arch/um/kernel/config.c.in +arch/um/kernel/exec_kern.c +arch/um/kernel/init_task.c +arch/um/kernel/irq.c +arch/um/kernel/Makefile +arch/um/kernel/mem.c +arch/um/kernel/mem_user.c +arch/um/kernel/process.c +arch/um/kernel/process_kern.c +arch/um/kernel/ptrace.c +arch/um/kernel/sigio_kern.c +arch/um/kernel/signal_kern.c +arch/um/kernel/skas/include/mode.h +arch/um/kernel/skas/include/uaccess.h +arch/um/kernel/skas/Makefile +arch/um/kernel/skas/process.c +arch/um/kernel/skas/process_kern.c +arch/um/kernel/skas/util/mk_ptregs.c +arch/um/kernel/smp.c +arch/um/kernel/syscall_kern.c +arch/um/kernel/sys_call_table.c +arch/um/kernel/sysrq.c +arch/um/kernel/time.c +arch/um/kernel/time_kern.c +arch/um/kernel/trap_kern.c +arch/um/kernel/trap_user.c +arch/um/kernel/tt/include/uaccess.h +arch/um/kernel/tt/process_kern.c +arch/um/kernel/tt/ptproxy/proxy.c +arch/um/kernel/tt/tracer.c +arch/um/kernel/tt/uaccess_user.c +arch/um/kernel/tty_log.c +arch/um/kernel/um_arch.c +arch/um/kernel/umid.c +arch/um/kernel/user_util.c +arch/um/Makefile +arch/um/Makefile-i386 +arch/um/Makefile-skas +arch/um/os-Linux/drivers/tuntap_user.c +arch/um/os-Linux/file.c +arch/um/sys-i386/bugs.c +arch/um/sys-i386/Makefile +arch/um/uml.lds.S +arch/um/util/mk_constants_kern.c +fs/hostfs/hostfs.h +fs/hostfs/hostfs_kern.c +fs/hostfs/hostfs_user.c +fs/hostfs/Makefile +fs/hppfs/hppfs_kern.c +fs/hppfs/Makefile +fs/Makefile +include/asm-um/archparam-i386.h 
+include/asm-um/common.lds.S +include/asm-um/cpufeature.h +include/asm-um/current.h +include/asm-um/fixmap.h +include/asm-um/irq.h +include/asm-um/local.h +include/asm-um/module-generic.h +include/asm-um/module-i386.h +include/asm-um/page.h +include/asm-um/pgtable.h +include/asm-um/processor-generic.h +include/asm-um/processor-i386.h +include/asm-um/sections.h +include/asm-um/smp.h +include/asm-um/system-generic.h +include/asm-um/thread_info.h +include/asm-um/timex.h +include/linux/mm.h +include/linux/proc_mm.h +mm/Makefile +mm/memory.c +mm/mmap.c +mm/mprotect.c +mm/proc_mm.c diff --git a/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc b/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc new file mode 100644 index 0000000..f244b84 --- /dev/null +++ b/lustre/kernel_patches/pc/vfs-pdirops-2.4.18-chaos.pc @@ -0,0 +1,3 @@ +fs/namei.c +include/linux/fs.h +fs/inode.c diff --git a/lustre/kernel_patches/series/chaos-2.4.18 b/lustre/kernel_patches/series/chaos-2.4.18 index 8a36dc0..5ecead5 100644 --- a/lustre/kernel_patches/series/chaos-2.4.18 +++ b/lustre/kernel_patches/series/chaos-2.4.18 @@ -23,6 +23,6 @@ iopen-2.4.18.patch jbd-dont-account-blocks-twice.patch jbd-commit-tricks.patch ext3-o_direct-1-2.4.18-chaos.patch -ext3-no-write-super.patch -jbd-ctx_switch.patch -jbd-get_write_access.patch +ext3-no-write-super-chaos.patch +ext3-extents-2.4.18-chaos.patch +ext3-extents-oflag-2.4.18-chaos.patch diff --git a/lustre/kernel_patches/series/chaos-2.4.18-pdirops b/lustre/kernel_patches/series/chaos-2.4.18-pdirops new file mode 100644 index 0000000..d4545e2 --- /dev/null +++ b/lustre/kernel_patches/series/chaos-2.4.18-pdirops @@ -0,0 +1,35 @@ +dev_read_only.patch +exports.patch +kmem_cache_validate.patch +lustre_version.patch +vfs_intent-2.4.18-18-chaos65.patch +invalidate_show.patch +iod-rmap-exports.patch +export-truncate.patch +ext3-compat-2.4.18-chaos.patch +ext-2.4-patch-1.patch +ext-2.4-patch-2.patch +ext-2.4-patch-3.patch +ext-2.4-patch-4.patch 
+ext-2.4-patch-5.patch +linux-2.4.18ea-0.8.26-2.patch +ext3-2.4-ino_t.patch +ext3-2.4.18-ino_sb_macro-2.patch +ext3-orphan_lock.patch +ext3-delete_thread-2.4.18-2.patch +extN-misc-fixup.patch +extN-noread.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +extN-2.4.18-ino_sb_fixup.patch +ext3-map_inode_page_2.4.18.patch +ext3-error-export.patch +iopen-2.4.18-2.patch +jbd-dont-account-blocks-twice.patch +jbd-commit-tricks.patch +ext3-o_direct-1-2.4.18-chaos.patch +ext3-no-write-super-chaos.patch +dynamic-locks-2.4.18-chaos.patch +vfs-pdirops-2.4.18-chaos.patch +ext3-pdirops-2.4.18-chaos.patch +add_page_private.patch diff --git a/lustre/kernel_patches/series/uml_2.6.0_test3 b/lustre/kernel_patches/series/uml_2.6.0_test3 new file mode 100644 index 0000000..7b89a36 --- /dev/null +++ b/lustre/kernel_patches/series/uml_2.6.0_test3 @@ -0,0 +1,14 @@ +uml-patch-2.6.0-test3-1.patch +lustre_build.patch +lustre_version.patch +vfs_intent_2.6.0-test1.patch +vfs_nointent_2.6.0-test1.patch +vfs_races_2.5.72_rev1.patch +vfs_mntcwd_2.5.72_rev1.patch +ext3-san-jdike-2.5.73.patch +iopen-2.6.0.patch +export-truncate-2.5.63.patch +qla2xxx-v8.00.00b1-2.5.73.patch +uml-2.6.0-fix.patch +ext3-map_inode_page-2.6.0.patch +removepage-2.6.0.patch diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 88af047..5d4c927 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -470,6 +470,7 @@ static int llu_file_release(struct inode *inode) if (!fd) /* no process opened the file after an mcreate */ RETURN(rc = 0); +#if 0 /* we might not be able to get a valid handle on this file * again so we really want to flush our write cache.. 
*/ if (S_ISREG(inode->i_mode) && lsm) { @@ -481,11 +482,12 @@ static int llu_file_release(struct inode *inode) memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE); oa.o_valid |= OBD_MD_FLHANDLE; - rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); + rc = obd_close(ll_s2obdexp(sbi), &oa, lsm, NULL); if (rc) CERROR("inode %lu object close failed: rc = " "%d\n", lli->lli_st_ino, rc); } +#endif rc2 = llu_mdc_close(&sbi->ll_mdc_conn, inode); if (rc2 && !rc) diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index ce2e23b..977dbca 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -20,7 +20,7 @@ struct llu_sb_info { struct obd_uuid ll_sb_uuid; struct lustre_handle ll_mdc_conn; - struct lustre_handle ll_osc_conn; + struct obd_export ll_osc_exp; obd_id ll_rootino; int ll_flags; struct list_head ll_conn_chain; diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 847b1d0..c5df187 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -74,7 +74,7 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, down(&lli->lli_getattr_sem); if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) { - rc = ll_inode_getattr(inode, lsm, fd ? 
&fd->fd_ost_och : NULL); + rc = ll_inode_getattr(inode, lsm); if (rc == 0) { set_bit(LLI_F_DID_GETATTR, &lli->lli_flags); } else { diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 0939352..a51be12 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -170,8 +170,7 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) dst->o_valid |= (valid & ~OBD_MD_FLID); } -int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, - char *ostdata) +static int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) { struct llu_sb_info *sbi = llu_i2sbi(inode); struct obdo oa; @@ -187,11 +186,6 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; - if (ostdata != NULL) { - memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE); - oa.o_valid |= OBD_MD_FLHANDLE; - } - rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); if (rc) RETURN(rc); @@ -327,7 +321,7 @@ static int llu_iop_lookup(struct pnode *pnode, llu_update_inode(*inop, body, lic.lic_lsm); if (llu_i2info(*inop)->lli_smd) { - rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd, NULL); + rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd); if (rc) _sysio_i_gone(*inop); } diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c new file mode 100644 index 0000000..2e63dc7 --- /dev/null +++ b/lustre/mdc/mdc_locks.c @@ -0,0 +1,550 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MDC + +#ifdef __KERNEL__ +# include +# include +# include +# include +#else +# include +# include +#endif + +#include +#include +#include +#include +#include "mdc_internal.h" + +int it_disposition(struct lookup_intent *it, int flag) +{ + return it->d.lustre.it_disposition & flag; +} +EXPORT_SYMBOL(it_disposition); + +void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition |= flag; +} +EXPORT_SYMBOL(it_set_disposition); + +static void mdc_fid2mdc_op_data(struct mdc_op_data *data, + struct ll_uctxt *ctxt, + struct ll_fid *f1, + struct ll_fid *f2, + const char *name, + int namelen, + int mode) +{ + LASSERT(data); + LASSERT(ctxt); + LASSERT(f1); + + data->ctxt = *ctxt; + data->fid1 = *f1; + if (f2) + data->fid2 = *f2; + else + memset(&data->fid2, 0, sizeof(data->fid2)); + data->name = name; + data->namelen = namelen; + data->create_mode = mode; +} + +static int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_PW; + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP)) + return LCK_PR; + + LBUG(); + RETURN(-EINVAL); +} + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase == DISP_OPEN_OPEN) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase == DISP_OPEN_CREATE) + return 
it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase == DISP_LOOKUP_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase == DISP_IT_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, + it->d.lustre.it_status); + LBUG(); + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +void mdc_set_lock_data(__u64 *l, void *data) +{ + struct ldlm_lock *lock; + struct lustre_handle *lockh = (struct lustre_handle *)l; + ENTRY; + + if (!*l) { + EXIT; + return; + } + + lock = ldlm_handle2lock(lockh); + + LASSERT(lock != NULL); + l_lock(&lock->l_resource->lr_namespace->ns_lock); +#if !defined(LIBLUSTRE) + if (lock->l_data && lock->l_data != data) { + struct inode *new_inode = data; + struct inode *old_inode = lock->l_data; + unsigned long state = old_inode->i_state & I_FREEING; + CERROR("Found existing inode %p/%lu/%u state %lu in lock: " + "setting data to %p/%lu/%u\n", old_inode, + old_inode->i_ino, old_inode->i_generation, state, + new_inode, new_inode->i_ino, new_inode->i_generation); + LASSERT(state); + } +#endif + lock->l_data = data; + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + LDLM_LOCK_PUT(lock); + + EXIT; +} +EXPORT_SYMBOL(mdc_set_lock_data); + +int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct ldlm_res_id res_id = { .name = {0} }; + ENTRY; + + res_id.name[0] = fid->id; + res_id.name[1] = fid->generation; + + ldlm_change_cbdata(class_exp2obd(exp)->obd_namespace, &res_id, it, + data); + EXIT; + return 0; +} + + + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. 
*/ +int mdc_enqueue(struct obd_export *exp, + int lock_type, + struct lookup_intent *it, + int lock_mode, + struct mdc_op_data *data, + struct lustre_handle *lockh, + char *tgt, + int tgtlen, + ldlm_completion_callback cb_completion, + ldlm_blocking_callback cb_blocking, + void *cb_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id = + { .name = {data->fid1.id, data->fid1.generation} }; + int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; + int rc, flags = LDLM_FL_HAS_INTENT; + int repsize[4] = {sizeof(struct ldlm_reply), + sizeof(struct mds_body), + obddev->u.cli.cl_max_mds_easize, + obddev->u.cli.cl_max_mds_cookiesize}; + struct ldlm_reply *dlm_rep; + struct ldlm_intent *lit; + struct ldlm_request *lockreq; + void *eadata; + unsigned long irqflags; + int reply_buffers = 0; + ENTRY; + +// LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu", +// ldlm_it2str(it->it_op), it_name, it_inode->i_ino); + + if (it->it_op & IT_OPEN) { + it->it_create_mode |= S_IFREG; + it->it_create_mode &= ~current->fs->umask; + + size[2] = sizeof(struct mds_rec_create); + size[3] = data->namelen + 1; + size[4] = obddev->u.cli.cl_max_mds_easize; + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 5, + size, NULL); + if (!req) + RETURN(-ENOMEM); + + spin_lock_irqsave (&req->rq_lock, irqflags); + req->rq_replay = 1; + spin_unlock_irqrestore (&req->rq_lock, irqflags); + + /* pack the intent */ + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, 2, data, it->it_create_mode, 0, + LTIME_S(CURRENT_TIME), + it->it_flags, tgt, tgtlen); + /* get ready for the reply */ + reply_buffers = 3; + req->rq_replen = lustre_msg_size(3, repsize); + } else if (it->it_op & IT_UNLINK) { + size[2] = sizeof(struct mds_rec_unlink); + size[3] = data->namelen + 1; + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4, + 
size, NULL); + if (!req) + RETURN(-ENOMEM); + + /* pack the intent */ + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, 2, data); + /* get ready for the reply */ + reply_buffers = 4; + req->rq_replen = lustre_msg_size(4, repsize); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE; + size[2] = sizeof(struct mds_body); + size[3] = data->namelen + 1; + + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4, + size, NULL); + if (!req) + RETURN(-ENOMEM); + + /* pack the intent */ + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, 2, it->it_flags, data); + /* get ready for the reply */ + reply_buffers = 3; + req->rq_replen = lustre_msg_size(3, repsize); + } else if (it->it_op == IT_READDIR) { + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1, + size, NULL); + if (!req) + RETURN(-ENOMEM); + + /* get ready for the reply */ + reply_buffers = 1; + req->rq_replen = lustre_msg_size(1, repsize); + } else { + LBUG(); + RETURN(-EINVAL); + } + + mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, NULL, res_id, + lock_type, NULL, 0, lock_mode, &flags, + cb_completion, cb_blocking, cb_data, lockh); + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. 
*/ + if (req->rq_transno || req->rq_replay) { + lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq)); + lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; + } + + /* This can go when we're sure that this can never happen */ + LASSERT(rc != -ENOENT); + if (rc == ELDLM_LOCK_ABORTED) { + lock_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else if (rc != 0) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + LASSERT (rc < 0); + ptlrpc_req_finished(req); + RETURN(rc); + } else { /* rc = 0 */ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + LASSERT(lock); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != lock_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, lock_mode); + lock_mode = lock->l_req_mode; + } + + LDLM_LOCK_PUT(lock); + } + + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep)); + LASSERT(dlm_rep != NULL); /* checked by ldlm_cli_enqueue() */ + LASSERT_REPSWABBED(req, 0); /* swabbed by ldlm_cli_enqueue() */ + + it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1; + it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2; + it->d.lustre.it_lock_mode = lock_mode; + it->d.lustre.it_data = req; + + /* We know what to expect, so we do any byte flipping required here */ + LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1); + if (reply_buffers >= 3) { + struct mds_body *body; + + body = lustre_swab_repbuf(req, 1, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't swab mds_body\n"); + RETURN (-EPROTO); + } + + if ((body->valid & OBD_MD_FLEASIZE) != 0) { + void *replayea; + /* The eadata is opaque; just check that it is + * there. 
Eventually, obd_unpackmd() will check + * the contents */ + eadata = lustre_swab_repbuf(req, 2, body->eadatasize, + NULL); + if (eadata == NULL) { + CERROR ("Missing/short eadata\n"); + RETURN (-EPROTO); + } + if (it->it_op & IT_OPEN) { + replayea = lustre_msg_buf(req->rq_reqmsg, 4, + obddev->u.cli.cl_max_mds_easize); + LASSERT(replayea); + memcpy(replayea, eadata, body->eadatasize); + } + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(mdc_enqueue); + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. + * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what d.lustre.it_status refers to. + * + * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the + * child lookup. 
+ */
+/* Parameters as used below:
+ *   exp         - export whose obd namespace is searched / enqueued against
+ *   uctxt, pfid - caller context and parent fid packed into the request
+ *   name, len   - child name as a (pointer, length) pair
+ *   cfid        - child fid when revalidating, NULL otherwise
+ *   it          - intent; carries disposition, status, lock handle/mode and
+ *                 the request between calls
+ *   flags       - not used by this function
+ *   reqp        - out: the request attached to the intent
+ *   cb_blocking - blocking AST handed to mdc_enqueue()
+ *
+ * Returns the ldlm_lock_match() result on the revalidate fast path, a
+ * negative errno on failure, 0 otherwise. */
+int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt,
+                    struct ll_fid *pfid, const char *name, int len,
+                    struct ll_fid *cfid, struct lookup_intent *it, int flags,
+                    struct ptlrpc_request **reqp,
+                    ldlm_blocking_callback cb_blocking)
+{
+        struct lustre_handle lockh;
+        struct ptlrpc_request *request;
+        int rc = 0;
+        struct mds_body *mds_body;
+        struct lustre_handle old_lock;
+        struct ldlm_lock *lock;
+        ENTRY;
+        LASSERT(it);
+
+        /* When the enqueue below is skipped (DISP_ENQ_COMPLETE already set)
+         * nothing else fills in lockh, yet it is handed to
+         * ldlm_handle2lock() further down.  Start from an all-zero handle,
+         * the same "no lock" value mdc_enqueue() uses for aborted locks. */
+        memset(&lockh, 0, sizeof(lockh));
+
+        /* name/len is a counted string: use a precision (%.*s) so that at
+         * most len bytes are read, not a minimum field width (%*s). */
+        CDEBUG(D_DLMTRACE, "name: %.*s in %lu, intent: %s\n", len, name,
+               (unsigned long) pfid->id, ldlm_it2str(it->it_op));
+
+        if (cfid && (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)) {
+                /* We could just return 1 immediately, but since we should only
+                 * be called in revalidate_it if we already have a lock, let's
+                 * verify that. */
+                struct ldlm_res_id res_id ={.name = {cfid->id,
+                                                     cfid->generation}};
+                int mode, match_flags = LDLM_FL_BLOCK_GRANTED;
+
+                /* Try a PR lock first, then fall back to PW. */
+                mode = LCK_PR;
+                rc = ldlm_lock_match(exp->exp_obd->obd_namespace, match_flags,
+                                     &res_id, LDLM_PLAIN, NULL, 0, LCK_PR,
+                                     &lockh);
+                if (!rc) {
+                        mode = LCK_PW;
+                        rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                             match_flags, &res_id, LDLM_PLAIN,
+                                             NULL, 0, LCK_PW, &lockh);
+                }
+                if (rc) {
+                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                               sizeof(lockh));
+                        it->d.lustre.it_lock_mode = mode;
+                }
+                RETURN(rc);
+        }
+
+        /* This function may be called twice, but we want to execute the
+         * request associated with the intent only once.  If it was done
+         * already, we skip past this and use the results. */
+        if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+                struct mdc_op_data op_data;
+                mdc_fid2mdc_op_data(&op_data, uctxt, pfid, cfid, name, len, 0);
+
+                rc = mdc_enqueue(exp, LDLM_PLAIN, it, it_to_lock_mode(it),
+                                 &op_data, &lockh, NULL, 0, ldlm_completion_ast,
+                                 cb_blocking, NULL);
+                if (rc < 0)
+                        RETURN(rc);
+                memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+        }
+        request = *reqp = it->d.lustre.it_data;
+        LASSERT(request != NULL);
+
+        if (!it_disposition(it, DISP_IT_EXECD)) {
+                /* The server failed before it even started executing the
+                 * intent, i.e. because it couldn't unpack the request. */
+                LASSERT(it->d.lustre.it_status != 0);
+                RETURN(it->d.lustre.it_status);
+        }
+        rc = it_open_error(DISP_IT_EXECD, it);
+        if (rc)
+                RETURN(rc);
+
+        mds_body = lustre_msg_buf(request->rq_repmsg, 1, sizeof(*mds_body));
+        LASSERT(mds_body != NULL);           /* mdc_enqueue checked */
+        LASSERT_REPSWABBED(request, 1);      /* mdc_enqueue swabbed */
+
+        /* If we were revalidating a fid/name pair, mark the intent in
+         * case we fail and get called again from lookup */
+        if (cfid != NULL) {
+                it_set_disposition(it, DISP_ENQ_COMPLETE);
+                /* Also: did we find the same inode? */
+                if (memcmp(cfid, &mds_body->fid1, sizeof(*cfid))) {
+                        ptlrpc_request_addref(request);
+                        RETURN(-ESTALE);
+                }
+        }
+
+        /* If we're doing an IT_OPEN which did not result in an actual
+         * successful open, then we need to remove the bit which saves
+         * this request for unconditional replay. */
+        if (it->it_op & IT_OPEN) {
+                if (!it_disposition(it, DISP_OPEN_OPEN) ||
+                    it->d.lustre.it_status != 0) {
+                        unsigned long irqflags;
+
+                        spin_lock_irqsave(&request->rq_lock, irqflags);
+                        request->rq_replay = 0;
+                        spin_unlock_irqrestore(&request->rq_lock, irqflags);
+                }
+        }
+
+        rc = it_open_error(DISP_LOOKUP_EXECD, it);
+        if (rc)
+                RETURN(rc);
+
+        /* keep requests around for the multiple phases of the call
+         * this shows the DISP_XX must guarantee we make it into the call
+         */
+        if (it_disposition(it, DISP_OPEN_CREATE) &&
+            !it_open_error(DISP_OPEN_CREATE, it))
+                ptlrpc_request_addref(request);
+        if (it_disposition(it, DISP_OPEN_OPEN) &&
+            !it_open_error(DISP_OPEN_OPEN, it))
+                ptlrpc_request_addref(request);
+
+        if (it->it_op & IT_CREAT) {
+                /* XXX this belongs in ll_create_iit */
+        } else if (it->it_op == IT_OPEN) {
+                LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+        } else {
+                LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP));
+        }
+
+        /* If we already have a matching lock, then cancel the new
+         * one.  We have to set the data here instead of in
+         * mdc_enqueue, because we need to use the child's inode as
+         * the l_data to match, and that's not available until
+         * intent_finish has performed the iget(). */
+        lock = ldlm_handle2lock(&lockh);
+        if (lock) {
+                LDLM_DEBUG(lock, "matching against this");
+                LDLM_LOCK_PUT(lock);
+                memcpy(&old_lock, &lockh, sizeof(lockh));
+                if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+                                    LDLM_PLAIN, NULL, 0, LCK_NL, &old_lock)) {
+                        ldlm_lock_decref_and_cancel(&lockh,
+                                                    it->d.lustre.it_lock_mode);
+                        memcpy(&lockh, &old_lock, sizeof(old_lock));
+                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                               sizeof(lockh));
+                }
+        }
+        CDEBUG(D_DENTRY, "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+               len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status,
+               it->d.lustre.it_disposition, rc);
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(mdc_intent_lock);
diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c
new file mode 100644
index 0000000..a109ef6
--- /dev/null
+++ b/lustre/obdfilter/filter_io_24.c
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * linux/fs/obdfilter/filter_io.c
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Author: Peter Braam
+ * Author: Andreas Dilger
+ * Author: Phil Schwan
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* NOTE(review): the operands of the bare #include lines below are missing
+ * from this copy of the patch; restore them from the upstream tree. */
+#include
+#include
+#include // XXX kill me soon
+#include
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include
+
+#include
+#include
+#include "filter_internal.h"
+
+
+/* We should only change the file mtime (and not the ctime, like
+ * update_inode_times() in generic_file_write()) when we only change data.
+ *
+ * Sets i_mtime (and i_ctime when ctime_too is non-zero) to CURRENT_TIME and
+ * marks the inode dirty; does nothing when the stamps are already current. */
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+        time_t now = CURRENT_TIME;
+        if (inode->i_mtime == now && (!ctime_too || inode->i_ctime == now))
+                return;
+        inode->i_mtime = now;
+        if (ctime_too)
+                inode->i_ctime = now;
+        mark_inode_dirty_sync(inode);
+}
+
+int ext3_map_inode_page(struct inode *inode, struct page *page,
+                        unsigned long *blocks, int *created, int create);
+
+/* Map every page of iobuf to disk blocks of inode and push the kiobuf to
+ * the device with brw_kiovec().
+ *
+ * rw is only used to derive the "create" flag passed to
+ * ext3_map_inode_page(); the transfer itself is always issued as WRITE.
+ * NOTE(review): the only caller visible here passes OBD_BRW_WRITE --
+ * confirm before reusing this for reads.
+ *
+ * Returns 0 when brw_kiovec() transferred anything (rc > 0 is folded to 0,
+ * so a short write is logged but still reported as success), otherwise a
+ * negative errno.  cleanup_phase records how much has to be unwound:
+ * 1 = "created" array allocated, 2 = kiovec locked, 3 = i_sem held. */
+int filter_direct_io(int rw, struct inode *inode, struct kiobuf *iobuf)
+{
+        struct page *page;
+        unsigned long *b = iobuf->blocks;
+        int rc, i, create = (rw == OBD_BRW_WRITE), blocks_per_page, *created;
+        int *cr, cleanup_phase;
+        ENTRY;
+
+        blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
+                RETURN(-EINVAL);
+
+        /* one "created" slot per block; filled in by the mapping calls but
+         * not read again in this function */
+        OBD_ALLOC(created, sizeof(*created) * iobuf->nr_pages*blocks_per_page);
+        if (created == NULL)
+                RETURN(-ENOMEM);
+        cleanup_phase = 1;
+
+        rc = lock_kiovec(1, &iobuf, 1);
+        if (rc < 0)
+                GOTO(cleanup, rc);
+        cleanup_phase = 2;
+
+        /* block mapping is done under i_sem; the I/O itself is not */
+        down(&inode->i_sem);
+        cleanup_phase = 3;
+        for (i = 0, cr = created, b = iobuf->blocks; i < iobuf->nr_pages; i++){
+                page = iobuf->maplist[i];
+
+                rc = ext3_map_inode_page(inode, page, b, cr, create);
+                if (rc)
+                        GOTO(cleanup, rc);
+
+                b += blocks_per_page;
+                cr += blocks_per_page;
+        }
+        up(&inode->i_sem);
+        cleanup_phase = 2;
+
+        rc = brw_kiovec(WRITE, 1, &iobuf, inode->i_dev, iobuf->blocks,
+                        1 << inode->i_blkbits);
+        CDEBUG(D_INFO, "tried to write %d pages, rc = %d\n",
+               iobuf->nr_pages, rc);
+        if (rc != (1 << inode->i_blkbits) * iobuf->nr_pages * blocks_per_page)
+                CERROR("short write? expected %d, wrote %d\n",
+                       (1 << inode->i_blkbits) * iobuf->nr_pages *
+                       blocks_per_page, rc);
+        if (rc > 0)
+                rc = 0;
+
+        EXIT;
+cleanup:
+        /* cases fall through on purpose: later phases undo earlier ones */
+        switch(cleanup_phase) {
+        case 3:
+                up(&inode->i_sem);
+        case 2:
+                unlock_kiovec(1, &iobuf);
+        case 1:
+                OBD_FREE(created, sizeof(*created) *
+                         iobuf->nr_pages*blocks_per_page);
+                break;
+        default:
+                CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
+                LBUG();
+                break;
+        }
+        return rc;
+}
+
+/* Commit a bulk write (2.4 kernels): collect the pages of res[] in a
+ * kiobuf, start a filesystem transaction, write the pages with
+ * filter_direct_io(), update the inode times and size, then finish the
+ * transno and commit.
+ *
+ * Exactly one object is supported (objcount == 1 is asserted).  On every
+ * exit through "cleanup" the pages in res[] are released and the dentry
+ * is dput; on success the pages are first flipped into the page cache.
+ *
+ * Returns 0 or a negative errno. */
+int filter_commitrw_write(struct obd_export *exp, int objcount,
+                          struct obd_ioobj *obj, int niocount,
+                          struct niobuf_local *res,
+                          struct obd_trans_info *oti)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_run_ctxt saved;
+        struct niobuf_local *lnb;
+        struct fsfilt_objinfo fso;
+        struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, };
+        struct kiobuf *iobuf;
+        struct inode *inode = NULL;
+        int rc = 0, i, cleanup_phase = 0, err;
+        unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
+        ENTRY;
+        LASSERT(oti != NULL);
+        LASSERT(objcount == 1);
+        LASSERT(current->journal_info == NULL);
+
+        rc = alloc_kiovec(1, &iobuf);
+        if (rc)
+                GOTO(cleanup, rc);
+        cleanup_phase = 1;
+
+#if (LINUX_VERSION_CODE == KERNEL_VERSION(2,4,18))
+        iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
+#endif
+        rc = expand_kiobuf(iobuf, obj->ioo_bufcnt);
+        if (rc)
+                GOTO(cleanup, rc);
+
+        iobuf->offset = 0;
+        iobuf->length = PAGE_SIZE * obj->ioo_bufcnt;
+        iobuf->nr_pages = obj->ioo_bufcnt;
+
+        cleanup_phase = 1;
+        fso.fso_dentry = res->dentry;
+        fso.fso_bufcnt = obj->ioo_bufcnt;
+        inode = res->dentry->d_inode;
+
+        /* hand the pages to the kiobuf and track the largest end offset,
+         * which becomes the candidate new i_size */
+        for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                loff_t this_size;
+                iobuf->maplist[i] = lnb->page;
+                /* We expect these pages to be in offset order, but we'll
+                 * be forgiving */
+                this_size = lnb->offset + lnb->len;
+                if (this_size > iattr.ia_size)
+                        iattr.ia_size = this_size;
+        }
+
+        push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+        cleanup_phase = 2;
+
+        oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, oti);
+        if (IS_ERR(oti->oti_handle)) {
+                rc = PTR_ERR(oti->oti_handle);
+                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                       "error starting transaction: rc = %d\n", rc);
+                oti->oti_handle = NULL;
+                GOTO(cleanup, rc);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
+
+        rc = filter_direct_io(OBD_BRW_WRITE, inode, iobuf);
+        if (rc == 0) {
+                down(&inode->i_sem);
+                inode_update_time(inode, 1);
+                if (iattr.ia_size > inode->i_size) {
+                        CDEBUG(D_INFO, "setting i_size to "LPU64"\n",
+                               iattr.ia_size);
+                        /* NOTE(review): the return value of
+                         * fsfilt_setattr() is not checked */
+                        fsfilt_setattr(obd, res->dentry, oti->oti_handle,
+                                       &iattr, 0);
+                }
+                up(&inode->i_sem);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
+
+        /* the transaction is finished and committed even when the write
+         * failed; a commit error replaces rc */
+        rc = filter_finish_transno(exp, oti, rc);
+        err = fsfilt_commit(obd, inode, oti->oti_handle, obd_sync_filter);
+        if (err)
+                rc = err;
+        if (obd_sync_filter)
+                LASSERT(oti->oti_transno <= obd->obd_last_committed);
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
+
+cleanup:
+        /* cases fall through on purpose */
+        switch (cleanup_phase) {
+        case 2:
+                pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+                LASSERT(current->journal_info == NULL);
+        case 1:
+                free_kiovec(1, &iobuf);
+        case 0:
+                for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                        /* flip_.. gets a ref, while free_page only frees
+                         * when it decrefs to 0 */
+                        if (rc == 0)
+                                flip_into_page_cache(inode, lnb->page);
+                        __free_page(lnb->page);
+                }
+                f_dput(res->dentry);
+        }
+
+        RETURN(rc);
+}
+
+#endif
+
diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c
new file mode 100644
index 0000000..ec9957a
--- /dev/null
+++ b/lustre/obdfilter/filter_io_26.c
@@ -0,0 +1,228 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * linux/fs/obdfilter/filter_io.c
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Author: Peter Braam
+ * Author: Andreas Dilger
+ * Author: Phil Schwan
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* NOTE(review): the operands of the bare #include lines below are missing
+ * from this copy of the patch; restore them from the upstream tree. */
+#include
+#include
+#include // XXX kill me soon
+#include
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include
+#include
+#include "filter_internal.h"
+
+int ext3_map_inode_page(struct inode *inode, struct page *page,
+                        unsigned long *blocks, int *created, int create);
+
+/* 512byte block min */
+#define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512)
+
+/* Book-keeping shared between the submitter and the bio completion
+ * handler for one filter_commitrw_write() call. */
+struct dio_request {
+        atomic_t numreqs;       /* number of reqs being processed */
+        struct bio *bio_list;   /* list of completed bios */
+        wait_queue_head_t wait;
+        int created[MAX_BLOCKS_PER_PAGE];
+        unsigned long blocks[MAX_BLOCKS_PER_PAGE];
+        spinlock_t lock;
+        int error;              /* first error reported by a bio, or 0 */
+};
+
+/* bi_end_io handler: chain the finished bio onto dreq->bio_list (re-using
+ * bi_private as the link), remember the first I/O error and wake the
+ * submitter once the last outstanding bio has completed. */
+static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
+{
+        struct dio_request *dreq = bio->bi_private;
+        unsigned long flags;
+
+        /* The block layer may call us for partial completions; only act
+         * once the whole bio is done, otherwise the bio would be queued
+         * and numreqs decremented more than once. */
+        if (bio->bi_size)
+                return 1;
+
+        spin_lock_irqsave(&dreq->lock, flags);
+        if (error && dreq->error == 0)
+                dreq->error = error;
+        bio->bi_private = dreq->bio_list;
+        dreq->bio_list = bio;
+        spin_unlock_irqrestore(&dreq->lock, flags);
+        if (atomic_dec_and_test(&dreq->numreqs))
+                wake_up(&dreq->wait);
+
+        return 0;
+}
+
+/* Return 1 when sector directly follows the last sector already covered
+ * by bio, 0 otherwise (including bio == NULL). */
+static int can_be_merged(struct bio *bio, sector_t sector)
+{
+        int size;
+
+        if (!bio)
+                return 0;
+
+        size = bio->bi_size >> 9;
+        return bio->bi_sector + size == sector ? 1 : 0;
+}
+
+/* Commit a bulk write (2.6 kernels): map each page of res[] to disk
+ * blocks, submit the pages as bios, wait for all of them, update the
+ * inode times and size, then finish the transno and commit.
+ *
+ * Exactly one object is supported (objcount == 1 is asserted).  On every
+ * exit through "cleanup" the pages in res[] are released and the dentry
+ * is dput; on success the pages are first flipped into the page cache.
+ *
+ * Returns 0 or a negative errno. */
+int filter_commitrw_write(struct obd_export *exp, int objcount,
+                          struct obd_ioobj *obj, int niocount,
+                          struct niobuf_local *res,
+                          struct obd_trans_info *oti)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_run_ctxt saved;
+        struct niobuf_local *lnb;
+        struct fsfilt_objinfo fso;
+        struct iattr iattr = { .ia_valid = ATTR_SIZE, .ia_size = 0, };
+        struct inode *inode = NULL;
+        int rc = 0, i, k, cleanup_phase = 0, err;
+        unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
+        int blocks_per_page;
+        struct dio_request *dreq;
+        struct bio *bio = NULL;
+        ENTRY;
+        LASSERT(oti != NULL);
+        LASSERT(objcount == 1);
+        LASSERT(current->journal_info == NULL);
+
+        /* inode must be looked up before i_blkbits is read; computing
+         * blocks_per_page first dereferenced a NULL inode. */
+        inode = res->dentry->d_inode;
+        blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        LASSERT(blocks_per_page <= MAX_BLOCKS_PER_PAGE);
+
+        OBD_ALLOC(dreq, sizeof(*dreq));
+        if (dreq == NULL)
+                /* go through cleanup so the pages and the dentry are
+                 * released, as the 2.4 variant does on early failure */
+                GOTO(cleanup, rc = -ENOMEM);
+        dreq->bio_list = NULL;
+        dreq->error = 0;
+        init_waitqueue_head(&dreq->wait);
+        atomic_set(&dreq->numreqs, 0);
+        spin_lock_init(&dreq->lock);
+
+        cleanup_phase = 1;
+        fso.fso_dentry = res->dentry;
+        fso.fso_bufcnt = obj->ioo_bufcnt;
+
+        push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+        cleanup_phase = 2;
+
+        oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, oti);
+        if (IS_ERR(oti->oti_handle)) {
+                rc = PTR_ERR(oti->oti_handle);
+                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
+                       "error starting transaction: rc = %d\n", rc);
+                oti->oti_handle = NULL;
+                GOTO(cleanup, rc);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
+
+        /* From here on the transaction is open and bios may be in flight,
+         * so errors break out of the loop instead of jumping to cleanup:
+         * we still have to wait for submitted bios (they reference dreq)
+         * and close the transaction. */
+        for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                loff_t this_size;
+                sector_t sector;
+                int offs;
+
+                /* get block number for next page */
+                rc = ext3_map_inode_page(inode, lnb->page, dreq->blocks,
+                                         dreq->created, 1);
+                if (rc)
+                        break;
+
+                /* NOTE(review): both bio_add_page() calls below add
+                 * lnb->len bytes for every block of the page, and the
+                 * call on a fresh bio uses offset 0 instead of offs.
+                 * That looks right only for blocks_per_page == 1 --
+                 * confirm before relying on smaller block sizes. */
+                for (k = 0; k < blocks_per_page; k++) {
+                        sector = dreq->blocks[k] *
+                                 (inode->i_sb->s_blocksize >> 9);
+                        offs = k * inode->i_sb->s_blocksize;
+
+                        if (!bio || !can_be_merged(bio, sector) ||
+                            !bio_add_page(bio, lnb->page, lnb->len, offs)) {
+                                if (bio) {
+                                        atomic_inc(&dreq->numreqs);
+                                        submit_bio(WRITE, bio);
+                                        bio = NULL;
+                                }
+                                /* allocate new bio */
+                                bio = bio_alloc(GFP_NOIO, obj->ioo_bufcnt);
+                                if (bio == NULL) {
+                                        rc = -ENOMEM;
+                                        break;
+                                }
+                                bio->bi_bdev = inode->i_sb->s_bdev;
+                                bio->bi_sector = sector;
+                                bio->bi_end_io = dio_complete_routine;
+                                bio->bi_private = dreq;
+
+                                if (!bio_add_page(bio, lnb->page, lnb->len, 0))
+                                        LBUG();
+                        }
+                }
+                if (rc)
+                        break;
+
+                /* We expect these pages to be in offset order, but we'll
+                 * be forgiving */
+                this_size = lnb->offset + lnb->len;
+                if (this_size > iattr.ia_size)
+                        iattr.ia_size = this_size;
+        }
+        if (bio) {
+                if (rc == 0) {
+                        atomic_inc(&dreq->numreqs);
+                        submit_bio(WRITE, bio);
+                } else {
+                        /* never submitted: just drop our reference */
+                        bio_put(bio);
+                }
+        }
+
+        /* time to wait for I/O completion */
+        wait_event(dreq->wait, atomic_read(&dreq->numreqs) == 0);
+
+        /* free all bios */
+        while (dreq->bio_list) {
+                bio = dreq->bio_list;
+                dreq->bio_list = bio->bi_private;
+                bio_put(bio);
+        }
+
+        /* surface an I/O error reported by the completion handler */
+        if (rc == 0)
+                rc = dreq->error;
+
+        if (rc == 0) {
+                down(&inode->i_sem);
+                inode_update_time(inode, 1);
+                if (iattr.ia_size > inode->i_size) {
+                        CDEBUG(D_INFO, "setting i_size to "LPU64"\n",
+                               iattr.ia_size);
+                        fsfilt_setattr(obd, res->dentry, oti->oti_handle,
+                                       &iattr, 0);
+                }
+                up(&inode->i_sem);
+        }
+
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
+
+        rc = filter_finish_transno(exp, oti, rc);
+        err = fsfilt_commit(obd, inode, oti->oti_handle, obd_sync_filter);
+        if (err)
+                rc = err;
+        if (obd_sync_filter)
+                LASSERT(oti->oti_transno <= obd->obd_last_committed);
+        if (time_after(jiffies, now + 15 * HZ))
+                CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
+
+cleanup:
+        /* cases fall through on purpose */
+        switch (cleanup_phase) {
+        case 2:
+                pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
+                LASSERT(current->journal_info == NULL);
+        case 1:
+                OBD_FREE(dreq, sizeof(*dreq));
+        case 0:
+                for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
+                        /* flip_.. gets a ref, while free_page only frees
+                         * when it decrefs to 0 */
+                        if (rc == 0)
+                                flip_into_page_cache(inode, lnb->page);
+                        __free_page(lnb->page);
+                }
+                f_dput(res->dentry);
+        }
+
+        RETURN(rc);
+}
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c
new file mode 100644
index 0000000..3fb9d08
--- /dev/null
+++ b/lustre/osc/osc_create.c
@@ -0,0 +1,343 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ * Author Peter Braam
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * For testing and management it is treated as an obd_device,
+ * although it does not export a full OBD method table (the
+ * requests are coming in over the wire, so object target modules
+ * do not have a full method table.)
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* NOTE(review): the operands of the bare #include lines below are missing
+ * from this copy of the patch; restore them from the upstream tree. */
+#ifdef __KERNEL__
+# include
+# include
+# include
+# include
+# include
+# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+# include
+# include
+# else
+# include
+# endif
+#else /* __KERNEL__ */
+# include
+#endif
+
+#include
+#include /* for mds_objid */
+#include
+#include
+#include
+
+#ifndef __CYGWIN__
+# include
+# include
+#else
+# include
+#endif
+
+#include
+#include /* for OBD_FAIL_CHECK */
+#include /* for ll_i2info */
+#include /* for PTL_MD_MAX_IOV */
+#include
+#include "osc_internal.h"
+
+/* State of the single object pre-creation daemon thread. */
+struct osc_created {
+        wait_queue_head_t osccd_waitq;      /* the daemon sleeps on this */
+        wait_queue_head_t osccd_ctl_waitq;  /* insmod rmmod sleep on this */
+        spinlock_t osccd_lock;
+        int osccd_flags;
+        struct task_struct *osccd_thread;
+        struct list_head osccd_queue_list_head;
+        struct list_head osccd_work_list_head;
+};
+
+
+#define OSCCD_STOPPING          0x1
+#define OSCCD_STOPPED           0x2
+#define OSCCD_RUNNING           0x4
+#define OSCCD_KICKED            0x8
+#define OSCCD_PRECREATED        0x10
+
+
+static struct osc_created osc_created;
+
+/* Return non-zero when at least count pre-created object ids lie between
+ * oscc_next_id and oscc_last_id. */
+static int oscc_has_objects(struct osc_creator *oscc, int count)
+{
+        int rc;
+        spin_lock(&oscc->oscc_lock);
+        rc = ((__s64)(oscc->oscc_last_id - oscc->oscc_next_id) >= count);
+        spin_unlock(&oscc->oscc_lock);
+        return rc;
+}
+
+/* Queue oscc on the daemon's work queue unless it still has more than
+ * oscc_kick_barrier objects, and optionally wait until at least one
+ * object is available.
+ *
+ * Returns 0 when nothing had to be done or wait == 0, the l_wait_event()
+ * error if the wait failed, otherwise the creator's oscc_status. */
+static int oscc_precreate(struct osc_creator *oscc, struct osc_created *osccd,
+                          int wait)
+{
+        int rc = 0;
+        struct l_wait_info lwi = { 0 };
+        ENTRY;
+
+        if (oscc_has_objects(oscc, oscc->oscc_kick_barrier))
+                RETURN(0);
+
+        /* lock order: osccd_lock, then oscc_lock */
+        spin_lock(&osccd->osccd_lock);
+        spin_lock(&oscc->oscc_lock);
+        if (list_empty(&oscc->oscc_list)) {
+                list_add(&oscc->oscc_list, &osccd->osccd_queue_list_head);
+                osccd->osccd_flags |= OSCCD_KICKED;
+                wake_up(&osccd->osccd_waitq);
+        }
+        spin_unlock(&oscc->oscc_lock);
+        spin_unlock(&osccd->osccd_lock);
+
+        /* an MDS using this call may time out on this. This is a
+         * recovery style wait.
+         */
+        if (wait)
+                rc = l_wait_event(oscc->oscc_waitq, oscc_has_objects(oscc, 1),
+                                  &lwi);
+        if (rc || !wait)
+                RETURN(rc);
+
+        spin_lock(&oscc->oscc_lock);
+        rc = oscc->oscc_status;
+        spin_unlock(&oscc->oscc_lock);
+        RETURN(rc);
+}
+
+/* Hand out one pre-created object id to the caller.
+ *
+ * oa  - in/out object attributes; overwritten from the creator's template
+ *       and given the new o_id.  When it carries only OBD_FL_DELORPHAN the
+ *       call instead asks the OST to delete orphans from oscc_next_id up.
+ * ea  - in/out stripe md; allocated here when *ea is NULL.
+ * oti - not used by this function.
+ *
+ * Returns 0 or a negative errno. */
+int osc_create(struct obd_export *exp, struct obdo *oa,
+               struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+        struct lov_stripe_md *lsm;
+        struct osc_creator *oscc = &exp->u.eu_osc_data.oed_oscc;
+        struct osc_created *osccd = oscc->oscc_osccd;
+        int try_again = 1, rc = 0;
+        ENTRY;
+
+        LASSERT(oa);
+        LASSERT(ea);
+
+        /* this is the special case where create removes orphans.  It is
+         * handled before any stripe md is allocated: this path never uses
+         * lsm, so allocating it first leaked it on both returns below. */
+        if (oa->o_valid == OBD_MD_FLFLAGS &&
+            oa->o_flags == OBD_FL_DELORPHAN) {
+                /* delete from next_id on up */
+                oa->o_valid |= OBD_MD_FLID;
+                oa->o_id = oscc->oscc_next_id;
+                if (oa->o_id == 0)
+                        RETURN(0);
+                rc = osc_real_create(oscc->oscc_exp, oa, ea, NULL);
+
+                spin_lock(&osccd->osccd_lock);
+                spin_lock(&oscc->oscc_lock);
+                oscc->oscc_status = rc;
+                oscc->oscc_last_id = oscc->oscc_next_id - 1;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                RETURN(rc);
+        }
+
+        lsm = *ea;
+        if (lsm == NULL) {
+                rc = obd_alloc_memmd(exp, &lsm);
+                if (rc < 0)
+                        RETURN(rc);
+        }
+
+        while (try_again) {
+                spin_lock(&oscc->oscc_lock);
+                if (oscc->oscc_last_id >= oscc->oscc_next_id) {
+                        memcpy(oa, &oscc->oscc_oa, sizeof(*oa));
+                        oa->o_id = oscc->oscc_next_id;
+                        lsm->lsm_object_id = oscc->oscc_next_id;
+                        *ea = lsm;
+                        oscc->oscc_next_id++;
+                        try_again = 0;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                /* kicks the daemon when running low; only blocks while we
+                 * still have no id (try_again != 0) */
+                rc = oscc_precreate(oscc, osccd, try_again);
+        }
+
+        if (rc == 0)
+                CDEBUG(D_INFO, "returning objid "LPU64"\n", lsm->lsm_object_id);
+        else if (*ea == NULL)
+                obd_free_memmd(exp, &lsm);
+        RETURN(rc);
+}
+
+/* Drain the daemon's queue: for every queued creator ask the OST to
+ * create objects up to oscc_last_id + oscc_grow_count, record the result
+ * and wake anybody waiting for objects.  osccd_lock is dropped around the
+ * RPC. */
+void osccd_do_create(struct osc_created *osccd)
+{
+        spin_lock(&osccd->osccd_lock);
+        while (!list_empty(&osccd->osccd_queue_list_head)) {
+                struct osc_creator *oscc;
+                int rc;
+
+                oscc = list_entry(osccd->osccd_queue_list_head.next,
+                                  struct osc_creator, oscc_list);
+                list_del_init(&oscc->oscc_list);
+                list_add(&oscc->oscc_list, &osccd->osccd_work_list_head);
+                spin_lock(&oscc->oscc_lock);
+                oscc->oscc_oa.o_id = oscc->oscc_last_id + oscc->oscc_grow_count;
+                oscc->oscc_oa.o_valid |= OBD_MD_FLID;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                rc = osc_real_create(oscc->oscc_exp, &oscc->oscc_oa,
+                                     &oscc->oscc_ea, NULL);
+
+                /* This is not used and leaked, so might as well free
+                 * it now.*/
+                if (rc == 0 && oscc->oscc_ea != NULL)
+                        obd_free_memmd(oscc->oscc_exp, &oscc->oscc_ea);
+
+                /* NOTE(review): oscc_last_id is advanced even when rc != 0,
+                 * so ids that were never created can be handed out; the
+                 * waiters in oscc_precreate() only test the id range, so
+                 * changing this needs a matching change there. */
+                spin_lock(&osccd->osccd_lock);
+                spin_lock(&oscc->oscc_lock);
+                list_del_init(&oscc->oscc_list);
+                oscc->oscc_status = rc;
+                oscc->oscc_last_id = oscc->oscc_oa.o_id;
+                spin_unlock(&oscc->oscc_lock);
+                spin_unlock(&osccd->osccd_lock);
+
+                CDEBUG(D_INFO, "preallocated through id "LPU64" (last used "
+                       LPU64")\n", oscc->oscc_last_id, oscc->oscc_next_id);
+                wake_up(&oscc->oscc_waitq);
+
+                /* re-take the lock before looking at the queue again */
+                spin_lock(&osccd->osccd_lock);
+        }
+        spin_unlock(&osccd->osccd_lock);
+}
+
+/* Body of the pre-creation daemon thread: announce OSCCD_RUNNING, then
+ * service kicks until OSCCD_STOPPING is set, and finally announce
+ * OSCCD_STOPPED. */
+static int osccd_main(void *arg)
+{
+        struct osc_created *osccd = (struct osc_created *)arg;
+        unsigned long flags;
+        ENTRY;
+
+        lock_kernel();
+        kportal_daemonize("lustre_created");
+
+        /* block all signals for this thread */
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+
+        unlock_kernel();
+
+        /* Record that the thread is running */
+        osccd->osccd_flags = OSCCD_RUNNING;
+        wake_up(&osccd->osccd_ctl_waitq);
+
+        /* And now, loop forever on requests */
+        while (1) {
+                struct l_wait_info lwi = { 0 };
+                l_wait_event(osccd->osccd_waitq,
+                             osccd->osccd_flags & (OSCCD_STOPPING|OSCCD_KICKED),
+                             &lwi);
+
+                spin_lock(&osccd->osccd_lock);
+                if (osccd->osccd_flags & OSCCD_STOPPING) {
+                        spin_unlock(&osccd->osccd_lock);
+                        EXIT;
+                        break;
+                }
+                osccd->osccd_flags &= ~OSCCD_KICKED;
+                spin_unlock(&osccd->osccd_lock);
+                osccd_do_create(osccd);
+        }
+
+        osccd->osccd_thread = NULL;
+        osccd->osccd_flags = OSCCD_STOPPED;
+        wake_up(&osccd->osccd_ctl_waitq);
+        CDEBUG(D_NET, "object precreation daemon exiting %d\n", current->pid);
+        RETURN(0);
+}
+
+/* Initialise the per-export creator state for the export behind exph;
+ * silently does nothing when the handle does not resolve to an export.
+ * NOTE(review): nothing here releases whatever class_conn2export()
+ * returned -- confirm whether it takes a reference that must be put. */
+void oscc_init(struct lustre_handle *exph)
+{
+        struct obd_export *exp = class_conn2export(exph);
+        struct osc_export_data *oed;
+
+        if (exp == NULL)
+                return;
+
+        oed = &exp->exp_osc_data;
+        memset(oed, 0, sizeof(*oed));
+        INIT_LIST_HEAD(&oed->oed_oscc.oscc_list);
+        init_waitqueue_head(&oed->oed_oscc.oscc_waitq);
+        spin_lock_init(&oed->oed_oscc.oscc_lock);
+        oed->oed_oscc.oscc_exp = exp;
+        oed->oed_oscc.oscc_osccd = &osc_created;
+        oed->oed_oscc.oscc_kick_barrier = 50;
+        oed->oed_oscc.oscc_grow_count = 100;
+        oed->oed_oscc.oscc_initial_create_count = 100;
+
+        /* last_id < next_id: no pre-created objects are available yet */
+        oed->oed_oscc.oscc_next_id = 2;
+        oed->oed_oscc.oscc_last_id = 1;
+        /* XXX the export handle should give the oscc the last object */
+        /* oed->oed_oscc.oscc_last_id = exph->....; */
+}
+
+/* Start the pre-creation daemon and wait until it reports
+ * OSCCD_RUNNING.  Returns 0 or the kernel_thread() error. */
+int osccd_setup(void)
+{
+        struct osc_created *osccd = &osc_created;
+        int rc;
+        struct l_wait_info lwi = { 0 };
+        ENTRY;
+
+        INIT_LIST_HEAD(&osccd->osccd_queue_list_head);
+        INIT_LIST_HEAD(&osccd->osccd_work_list_head);
+        init_waitqueue_head(&osccd->osccd_ctl_waitq);
+        init_waitqueue_head(&osccd->osccd_waitq);
+        spin_lock_init(&osccd->osccd_lock);
+        rc = kernel_thread(osccd_main, osccd,
+                           CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (rc < 0) {
+                CERROR("cannot start thread\n");
+                RETURN(rc);
+        }
+        l_wait_event(osccd->osccd_ctl_waitq, osccd->osccd_flags & OSCCD_RUNNING,
+                     &lwi);
+        RETURN(0);
+}
+
+/* Ask the pre-creation daemon to stop and wait until it reports
+ * OSCCD_STOPPED.  Always returns 0. */
+int osccd_cleanup(void)
+{
+        struct osc_created *osccd = &osc_created;
+        struct l_wait_info lwi = { 0 };
+        ENTRY;
+
+        spin_lock(&osccd->osccd_lock);
+        osccd->osccd_flags = OSCCD_STOPPING;
+        spin_unlock(&osccd->osccd_lock);
+
+        wake_up(&osccd->osccd_waitq);
+        l_wait_event(osccd->osccd_ctl_waitq,
+                     osccd->osccd_flags & OSCCD_STOPPED, &lwi);
+        RETURN(0);
+}
diff --git
a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index a84a29c..db70ea7 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -96,7 +96,6 @@ enum { }; int ptlrpc_expire_one_request(struct ptlrpc_request *req); -int ptlrpc_check_set(struct ptlrpc_request_set *set); void ptlrpc_pinger_sending_on_import(struct obd_import *imp); #endif /* PTLRPC_INTERNAL_H */ diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index d33670f..094de0b 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -120,9 +120,7 @@ static void __exit ptlrpc_exit(void) { ptlrpc_exit_portals(); ptlrpc_cleanup_connection(); -#ifdef ENABLE_ORPHANS llog_cleanup_commit_master(0); -#endif } /* connection.c */ @@ -170,8 +168,13 @@ EXPORT_SYMBOL(ptlrpc_next_xid); EXPORT_SYMBOL(ptlrpc_prep_set); EXPORT_SYMBOL(ptlrpc_set_add_req); +EXPORT_SYMBOL(ptlrpc_set_add_new_req); EXPORT_SYMBOL(ptlrpc_set_destroy); +EXPORT_SYMBOL(ptlrpc_set_next_timeout); +EXPORT_SYMBOL(ptlrpc_check_set); EXPORT_SYMBOL(ptlrpc_set_wait); +EXPORT_SYMBOL(ptlrpc_expired_set); +EXPORT_SYMBOL(ptlrpc_interrupted_set); /* service.c */ EXPORT_SYMBOL(ptlrpc_init_svc); @@ -192,6 +195,7 @@ EXPORT_SYMBOL(lustre_swab_obd_statfs); EXPORT_SYMBOL(lustre_swab_obd_ioobj); EXPORT_SYMBOL(lustre_swab_niobuf_remote); EXPORT_SYMBOL(lustre_swab_ost_body); +EXPORT_SYMBOL(lustre_swab_ost_last_id); EXPORT_SYMBOL(lustre_swab_ll_fid); EXPORT_SYMBOL(lustre_swab_mds_status_req); EXPORT_SYMBOL(lustre_swab_mds_fileh_body); diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh new file mode 100755 index 0000000..eabee0a --- /dev/null +++ b/lustre/tests/replay-ost-single.sh @@ -0,0 +1,90 @@ +#!/bin/sh + +set -e + +# Skip these tests +# 3 - bug 1852 +ALWAYS_EXCEPT="3" + +LUSTRE=${LUSTRE:-`dirname $0`/..} +LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} +PATH=$LUSTRE/utils:$LUSTRE/tests:$PATH + +RLUSTRE=${RLUSTRE:-$LUSTRE} +RPWD=${RPWD:-$PWD} + 
+XMLCONFIG="`basename $0 .sh`.xml"
+
+. $LUSTRE/tests/test-framework.sh
+
+CHECKSTAT="${CHECKSTAT:-checkstat} -v"
+
+# XXX I wish all this stuff was in some default-config.sh somewhere
+MOUNT=${MOUNT:-/mnt/lustre}
+DIR=${DIR:-$MOUNT}
+MDSDEV=${MDSDEV:-/tmp/mds-`hostname`}
+MDSSIZE=${MDSSIZE:-100000}
+OSTDEV=${OSTDEV:-/tmp/ost-`hostname`}
+OSTSIZE=${OSTSIZE:-100000}
+UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
+FSTYPE=${FSTYPE:-ext3}
+TIMEOUT=${TIMEOUT:-5}
+
+STRIPE_BYTES=65536
+STRIPES_PER_OBJ=1
+
+
+gen_config() {
+    rm -f $XMLCONFIG
+    add_facet mds
+    add_facet ost
+    add_facet client --lustre_upcall $UPCALL
+    do_lmc --add mds --node mds_facet --mds mds1 --dev $MDSDEV --size $MDSSIZE
+    do_lmc --add lov --mds mds1 --lov lov1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
+    do_lmc --add ost --lov lov1 --failover --node ost_facet --ost ost1 --dev $OSTDEV --size $OSTSIZE
+    do_lmc --add mtpt --node client_facet --path $MOUNT --mds mds1 --ost lov1
+}
+
+
+build_test_filter
+
+gen_config
+start mds --reformat $MDSLCONFARGS
+start ost --reformat $OSTLCONFARGS
+start client --gdb $CLIENTLCONFARGS
+
+mkdir -p $DIR
+
+test_0() {
+    replay_barrier ost
+    fail ost
+}
+run_test 0 "empty replay"
+
+test_1() {
+    replay_barrier ost
+    touch $DIR/$tfile
+    fail ost
+    $CHECKSTAT -t file $DIR/$tfile || return 1
+}
+run_test 1 "touch"
+
+test_2() {
+    replay_barrier ost
+    for i in `seq 10`; do
+        echo "tag-$i" > $DIR/$tfile-$i
+    done
+    fail ost
+    for i in `seq 10`; do
+        grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
+    done
+}
+run_test 2 "|x| 10 open(O_CREAT)s"
+
+# NOTE(review): this exit makes the cleanup below unreachable, so the
+# client, OST and MDS are left running -- confirm that is intended.
+exit 0
+
+equals_msg test complete, cleaning up
+stop client ${FORCE:=--force} $CLIENTLCONFARGS
+stop ost ${FORCE}
+stop mds ${FORCE} $MDSLCONFARGS --dump cleanup.log
+
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
new file mode 100644
index 0000000..3ba7402
--- /dev/null
+++ b/lustre/tests/test-framework.sh
@@ -0,0 +1,126 @@
+#!/bin/sh
+
+# Shared helpers for the replay test scripts.  This file is sourced by
+# scripts that declare #!/bin/sh, so it avoids bash-only expansions.
+
+set -e
+
+# Export the default environment (suite name, config file, paths).
+init_test_env() {
+    export TESTSUITE=`basename $0 .sh`
+    export XMLCONFIG="${TESTSUITE}.xml"
+    export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest}
+    export PATH=$LUSTRE/utils:$LUSTRE/tests:$PATH
+
+    export RLUSTRE=${RLUSTRE:-$LUSTRE}
+    export RPWD=${RPWD:-$PWD}
+    export CHECKSTAT="${CHECKSTAT:-checkstat} -v"
+}
+
+# start FACET [lconf args...]: bring up the node FACET_facet.
+start() {
+    facet=$1
+    shift
+    lconf --node ${facet}_facet "$@" $XMLCONFIG
+}
+
+# stop FACET [lconf args...]: clean up the node FACET_facet.
+stop() {
+    facet=$1
+    shift
+    lconf --node ${facet}_facet "$@" --cleanup $XMLCONFIG
+}
+
+# replay_barrier DEV: sync, then make device %DEV1 read-only and stop it
+# assigning transnos, and drop a marker into the debug log.
+replay_barrier() {
+    local dev=$1
+    sync
+    df $MOUNT
+    lctl --device %${dev}1 readonly
+    lctl --device %${dev}1 notransno
+    lctl mark "REPLAY BARRIER"
+}
+
+# fail FACET: force-stop FACET in failover mode and start it again.
+fail() {
+    local facet=$1
+    stop $facet --force --failover --nomod
+    start $facet --nomod
+    df $MOUNT
+}
+
+# do_lmc [lmc args...]: run lmc against the suite's XML config.
+do_lmc() {
+    lmc -m ${XMLCONFIG} "$@"
+}
+
+# add_facet FACET [lmc node args...]: add the node and its tcp network.
+add_facet() {
+    local facet=$1
+    shift
+    do_lmc --add node --node ${facet}_facet "$@" --timeout $TIMEOUT
+    do_lmc --add net --node ${facet}_facet --nid localhost --nettype tcp
+}
+
+# error MESSAGE...: report a test failure and exit 1.
+error() {
+    echo "${TESTSUITE}: **** FAIL:" "$@"
+    exit 1
+}
+
+# Turn the ONLY / EXCEPT / ALWAYS_EXCEPT lists into ONLY_<n>=true and
+# EXCEPT_<n>=true variables for run_test to look up.
+build_test_filter() {
+    for O in $ONLY; do
+        eval ONLY_${O}=true
+    done
+    for E in $EXCEPT $ALWAYS_EXCEPT; do
+        eval EXCEPT_${E}=true
+    done
+}
+
+_basetest() {
+    echo $*
+}
+
+# basetest NAME: print NAME with lower-case letters treated as field
+# separators, e.g. "24a" -> "24".
+basetest() {
+    IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
+}
+
+# _varval NAME: print the value of the variable called NAME (empty when
+# unset).  Portable stand-in for bash's ${!NAME} indirect expansion.
+_varval() {
+    eval "printf '%s' \"\${$1:-}\""
+}
+
+# run_test NUM DESCRIPTION: run test_NUM unless it is filtered out by
+# ONLY / EXCEPT; a skipped ONLY test prints a single dot.
+run_test() {
+    export base=`basetest $1`
+    if [ -n "$ONLY" ]; then
+        if [ -n "`_varval ONLY_$1`" ] || [ -n "`_varval ONLY_$base`" ]; then
+            run_one $1 "$2"
+            return $?
+        fi
+        printf '.'
+        return 0
+    fi
+    if [ -n "`_varval EXCEPT_$1`" ]; then
+        echo "skipping excluded test $1"
+        return 0
+    fi
+    if [ -n "`_varval EXCEPT_$base`" ]; then
+        echo "skipping excluded test $1 (base $base)"
+        return 0
+    fi
+    run_one $1 "$2"
+
+    return $?
+}
+
+EQUALS="======================================================================"
+# equals_msg WORDS...: print a banner line padded with '=' characters.
+equals_msg() {
+    msg="$*"
+
+    local suffixlen=$((65 - ${#msg}))
+    # a negative precision would print the whole $EQUALS string
+    [ $suffixlen -lt 0 ] && suffixlen=0
+    printf '===== %s %.*s\n' "$msg" $suffixlen $EQUALS
+}
+
+# run_one NUM DESCRIPTION: set tfile/tdir, print the banner, run test_NUM.
+run_one() {
+    testnum=$1
+    message=$2
+    tfile=f$base
+    tdir=d$base
+
+    # Pretty tests run faster.
+    equals_msg $testnum: $message
+
+    test_${testnum} || error "test_$testnum failed with $?"
+}
-- 
1.8.3.1