From cbd3a230274e1197d378725ee9e2aed6e35d2cb6 Mon Sep 17 00:00:00 2001 From: adilger Date: Thu, 29 Apr 2004 08:54:18 +0000 Subject: [PATCH] Land b_smallfix onto HEAD (20040428_2142) b=3284, b=3285, b=3043, b=3236, revert liblustre group changes --- lnet/include/linux/kp30.h | 9 +- lustre/ChangeLog | 7 +- lustre/include/liblustre.h | 18 +- lustre/include/linux/lustre_fsfilt.h | 4 +- lustre/include/linux/lustre_lib.h | 5 +- lustre/include/linux/lustre_net.h | 3 + .../configurable-x86-stack-2.4.21-sles8sp3.patch | 330 +++ .../patches/dev_read_only-suse-2.4.19.patch | 76 + .../patches/exports_2.4.19-suse.patch | 36 +- .../patches/ext-2.4-patch-1-suse-2.4.19.patch | 2551 ++++++++++++++++++++ .../patches/ext3-orphan_lock-2.4.19-suse.patch | 30 +- .../patches/ext3-trusted_ea-suse-2.4.19.patch | 179 ++ .../patches/ext3-use-after-free-suse.patch | 14 +- .../patches/invalidate_show-2.4.21-sles8sp3.patch | 134 + .../kernel_patches/patches/iopen-2.4.19-suse.patch | 58 +- .../patches/iopen-2.4.21-sles8sp3.patch | 449 ++++ .../kernel_text_address-2.4.21-sles8sp3.patch | 115 + .../patches/linux-2.4.19-pre1-xattr-0.8.54.patch | 8 +- .../patches/linux-2.4.19-xattr-0.8.54-suse.patch | 33 +- .../patches/linux-2.4.20-xattr-0.8.54-chaos.patch | 8 +- .../patches/linux-2.4.20-xattr-0.8.54-hp.patch | 8 +- .../patches/linux-2.4.20-xattr-0.8.54.patch | 8 +- .../patches/linux-2.4.21-xattr-0.8.54-chaos.patch | 4 +- .../patches/linux-2.4.21-xattr-0.8.54-suse.patch | 8 +- .../patches/linux-2.4.22-xattr-0.8.54.patch | 8 +- .../patches/linux-2.4.24-xattr-0.8.54.patch | 8 +- lustre/kernel_patches/patches/lustre_version.patch | 2 +- .../patches/mkdep-revert-rh-2.4.patch | 50 + .../patches/tcp-zero-copy-2.4.21-sles8sp3.patch | 458 ++++ .../patches/vfs_intent-2.4.18-18-chaos65.patch | 391 ++- .../patches/vfs_intent-2.4.19-pre1.patch | 414 ++-- .../patches/vfs_intent-2.4.19-suse.patch | 410 ++-- .../patches/vfs_intent-2.4.20-hp.patch | 406 ++-- .../patches/vfs_intent-2.4.20-rh.patch | 76 +- 
.../patches/vfs_intent-2.4.20-vanilla.patch | 379 ++- .../patches/vfs_intent-2.4.21-chaos.patch | 15 +- .../patches/vfs_intent-2.4.21-sles8sp3.patch | 1862 ++++++++++++++ .../patches/vfs_intent-2.4.21-suse2.patch | 17 +- .../patches/vfs_intent-2.4.22-rh.patch | 41 +- .../patches/vfs_nointent-2.6-suse.patch | 6 +- .../patches/xattr-0.8.54-2.4.22-rh.patch | 8 +- lustre/kernel_patches/series/chaos-2.4.18 | 1 + lustre/kernel_patches/series/chaos-2.4.18-pdirops | 1 + lustre/kernel_patches/series/rh-2.4.20 | 1 + lustre/kernel_patches/series/suse-2.4.19 | 7 +- lustre/kernel_patches/series/suse-sles8sp3-2.4.21 | 31 + lustre/kernel_patches/which_patch | 1 + lustre/ldlm/ldlm_resource.c | 4 +- lustre/liblustre/genlib.sh | 2 +- lustre/liblustre/llite_lib.c | 116 +- lustre/liblustre/namei.c | 10 +- lustre/liblustre/super.c | 10 +- lustre/liblustre/tests/Makefile.am | 4 +- lustre/liblustre/tests/echo_test.c | 5 - lustre/llite/dir.c | 1 - lustre/llite/file.c | 51 +- lustre/llite/llite_internal.h | 28 +- lustre/llite/llite_lib.c | 5 +- lustre/llite/lproc_llite.c | 61 +- lustre/llite/rw.c | 306 ++- lustre/mdc/mdc_locks.c | 3 + lustre/mdc/mdc_request.c | 8 +- lustre/mds/mds_open.c | 1 + lustre/obdclass/class_obd.c | 2 +- lustre/obdclass/genops.c | 2 +- lustre/obdecho/echo_client.c | 1 - lustre/obdfilter/filter.c | 36 +- lustre/obdfilter/filter_io.c | 2 +- lustre/osc/lproc_osc.c | 7 +- lustre/osc/osc_create.c | 55 +- lustre/osc/osc_request.c | 2 +- lustre/portals/include/linux/kp30.h | 9 +- lustre/ptlrpc/client.c | 2 +- lustre/ptlrpc/pinger.c | 3 +- lustre/ptlrpc/recover.c | 3 +- lustre/ptlrpc/service.c | 3 +- lustre/scripts/version_tag.pl.in | 2 +- lustre/tests/acceptance-small.sh | 4 +- lustre/tests/cfg/insanity-local.sh | 4 +- lustre/tests/insanity.sh | 4 +- lustre/tests/llmountcleanup.sh | 4 +- lustre/tests/multiop.c | 8 - lustre/tests/oos.sh | 4 +- lustre/tests/oos2.sh | 4 +- lustre/tests/recovery-small.sh | 4 +- lustre/tests/replay-dual.sh | 4 +- 
lustre/tests/replay-ost-single.sh | 2 +- lustre/tests/replay-single.sh | 16 +- lustre/tests/run-llog.sh | 2 +- lustre/tests/runas.c | 48 +- lustre/tests/runregression-brw.sh | 2 +- lustre/tests/runregression-mds.sh | 8 +- lustre/tests/runregression-net.sh | 2 +- lustre/tests/runtests | 10 +- lustre/tests/sanity.sh | 13 +- lustre/tests/sanityN.sh | 4 +- lustre/tests/test-framework.sh | 2 +- 97 files changed, 7914 insertions(+), 1667 deletions(-) create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only-suse-2.4.19.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch create mode 100644 lustre/kernel_patches/patches/ext3-trusted_ea-suse-2.4.19.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/patches/kernel_text_address-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch create mode 100644 lustre/kernel_patches/series/suse-sles8sp3-2.4.21 diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index 181594f..8a56b55 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -689,27 +689,30 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # endif #endif +#ifndef LP_POISON +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) +#endif + #if defined(__x86_64__) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" -# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #elif 
(BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" -# define LP_POISON ((void *)0x5a5a5a5a) #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" -# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #endif #ifndef LPU64 # error "No word size defined" diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 3693d4c..b2cf2ca 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -5,7 +5,9 @@ tbd Cluster File Systems, Inc. - deal with strange write() on x86-64 (3043) - don't dereference NULL peer_ni in ldlm_handle_ast_error (3258) - clear page->private before handing to FS (3119) - - drop scimac NAL + - tune the read pipeline (3236) + * miscellania + - drop scimac NAL (unmaintained) tbd Cluster File Systems, Inc. * version 1.2.2 @@ -30,6 +32,9 @@ tbd Cluster File Systems, Inc. - initialize RPC timeout timer earlier for 2.6 (3219) - don't dereference NULL reply buffer if mdc_close was never sent (2410) - print nal/nid for unknown nid (3258) + - additional checks for oscc recovery before doing precreate (3284) + - fix ll_extent_lock() error return code for 64-bit systems (3043) + - don't crash in mdc_close for bad permissions on open (3285) * miscellania - allow default OST striping configuration per directory (1414) - increase maximum number of MDS request buffers for large systems diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 30d9574..af80f44 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -116,6 +116,9 @@ static inline void *kmalloc(int size, int prot) #define PTR_ERR(a) ((long)(a)) #define ERR_PTR(a) ((void*)((long)(a))) +#define capable(foo) 1 +#define CAP_SYS_ADMIN 1 + typedef struct { void *cwd; }mm_segment_t; @@ -575,23 +578,12 @@ struct task_struct { int pid; int fsuid; int fsgid; - int max_groups; - int ngroups; - gid_t 
*groups; __u32 cap_effective; - - struct fs_struct __fs; }; extern struct task_struct *current; -int in_group_p(gid_t gid); -static inline int capable(int cap) -{ - if (current->cap_effective & (1 << cap)) - return 1; - else - return 0; -} + +#define in_group_p(a) 0 /* FIXME */ #define set_current_state(foo) do { current->state = foo; } while (0) diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 40e9914..72f3817 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -100,7 +100,7 @@ static inline void *fsfilt_start_log(struct obd_device *obd, unsigned long now = jiffies; void *parent_handle = oti ? oti->oti_handle : NULL; void *handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs); - CDEBUG(D_HA, "started handle %p (%p)\n", handle, parent_handle); + CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle); if (oti != NULL) { if (parent_handle == NULL) { @@ -162,7 +162,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, { unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); - CDEBUG(D_HA, "committing handle %p\n", handle); + CDEBUG(D_INFO, "committing handle %p\n", handle); if (time_after(jiffies, now + 15 * HZ)) CERROR("long journal start time %lus\n", (jiffies - now) / HZ); diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index ebdfdf6..0bb5f0b 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -40,18 +40,19 @@ #include #include +#define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +#define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) + #ifndef LPU64 /* x86_64 has 64bit longs and defines u64 as long long */ #if BITS_PER_LONG > 32 && !defined(__x86_64__) #define LPU64 "%lu" #define LPD64 "%ld" #define LPX64 "%#lx" -#define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #else #define LPU64 "%Lu" #define LPD64 "%Ld" #define LPX64 "%#Lx" 
-#define LP_POISON ((void *)0x5a5a5a5a) #endif #endif diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 84062e2..3c75a8b 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -79,6 +79,9 @@ # define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE / PAGE_SIZE) #endif +#if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +#error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +#endif /* Size over which to OBD_VMALLOC() rather than OBD_ALLOC() service request * buffers */ diff --git a/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..bc0a1b7 --- /dev/null +++ b/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-sles8sp3.patch @@ -0,0 +1,330 @@ +Index: linux-2.4.21/arch/i386/kernel/entry.S +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/entry.S 2004-04-24 02:39:01.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/entry.S 2004-04-24 02:42:58.000000000 -0400 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + EBX = 0x00 + ECX = 0x04 +@@ -130,10 +131,6 @@ + .long 3b,6b; \ + .previous + +-#define GET_CURRENT(reg) \ +- movl $-8192, reg; \ +- andl %esp, reg +- + ENTRY(lcall7) + pushfl # We get a different stack layout with call gates, + pushl %eax # which has to be cleaned up later.. 
+@@ -149,7 +146,7 @@ + movl %ecx,CS(%esp) # + movl %esp,%ebx + pushl %ebx +- andl $-8192,%ebx # GET_CURRENT ++ andl $-THREAD_SIZE,%ebx # GET_CURRENT + movl exec_domain(%ebx),%edx # Get the execution domain + movl 4(%edx),%edx # Get the lcall7 handler for the domain + pushl $0x7 +@@ -173,7 +170,7 @@ + movl %ecx,CS(%esp) # + movl %esp,%ebx + pushl %ebx +- andl $-8192,%ebx # GET_CURRENT ++ andl $-THREAD_SIZE,%ebx # GET_CURRENT + movl exec_domain(%ebx),%edx # Get the execution domain + movl 4(%edx),%edx # Get the lcall7 handler for the domain + pushl $0x27 +Index: linux-2.4.21/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/smpboot.c 2004-04-24 02:39:05.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/smpboot.c 2004-04-24 02:42:58.000000000 -0400 +@@ -837,7 +837,7 @@ + + /* So we see what's up */ + printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); +- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle); ++ stack_start.esp = (void *)idle->thread.esp; + + /* + * This grunge runs the startup process for +@@ -918,7 +918,7 @@ + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; +- if (*((volatile unsigned char *)phys_to_virt(8192)) ++ if (*((volatile unsigned char *)phys_to_virt(THREAD_SIZE)) + == 0xA5) + /* trampoline started but...? */ + printk("Stuck ??\n"); +@@ -941,7 +941,7 @@ + } + + /* mark "stuck" area as not stuck */ +- *((volatile unsigned long *)phys_to_virt(8192)) = 0; ++ *((volatile unsigned long *)phys_to_virt(THREAD_SIZE)) = 0; + + #ifdef CONFIG_ES7000 + if (!es7000_plat) +Index: linux-2.4.21/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/traps.c 2004-04-24 02:39:18.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/traps.c 2004-04-24 02:42:58.000000000 -0400 +@@ -304,7 +304,7 @@ + unsigned long esp = tsk->thread.esp; + + /* User space on another CPU? 
*/ +- if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1)) ++ if ((esp ^ (unsigned long)tsk) & ~(THREAD_SIZE - 1)) + return; + show_trace((unsigned long *)esp); + } +Index: linux-2.4.21/arch/i386/kernel/head.S +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/head.S 2004-04-24 02:38:42.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/head.S 2004-04-24 02:42:58.000000000 -0400 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #define OLD_CL_MAGIC_ADDR 0x90020 + #define OLD_CL_MAGIC 0xA33F +@@ -326,7 +327,7 @@ + ret + + ENTRY(stack_start) +- .long SYMBOL_NAME(init_task_union)+8192 ++ .long SYMBOL_NAME(init_task_union)+THREAD_SIZE + .long __KERNEL_DS + + /* This is the default interrupt "handler" :-) */ +Index: linux-2.4.21/arch/i386/kernel/irq.c +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/irq.c 2004-04-24 02:39:18.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/irq.c 2004-04-24 02:44:26.000000000 -0400 +@@ -602,7 +602,10 @@ + long esp; + + /* Debugging check for stack overflow: is there less than 1KB free? */ +- __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191)); ++ __asm__ __volatile__( ++ "andl %%esp,%0" ++ : "=r" (esp) : "0" (THREAD_SIZE-1)); ++ + if (unlikely(esp < (sizeof(struct task_struct) + sysctl_stackwarn))) { + static unsigned long next_jiffies; /* ratelimiting */ + static long least_esp = THREAD_SIZE; +Index: linux-2.4.21/arch/i386/lib/getuser.S +=================================================================== +--- linux-2.4.21.orig/arch/i386/lib/getuser.S 1998-01-12 16:42:52.000000000 -0500 ++++ linux-2.4.21/arch/i386/lib/getuser.S 2004-04-24 02:42:58.000000000 -0400 +@@ -21,6 +21,10 @@ + * as they get called from within inline assembly. 
+ */ + ++/* Duplicated from asm/processor.h */ ++#include ++#include ++ + addr_limit = 12 + + .text +@@ -28,7 +32,7 @@ + .globl __get_user_1 + __get_user_1: + movl %esp,%edx +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 1: movzbl (%eax),%edx +@@ -41,7 +45,7 @@ + addl $1,%eax + movl %esp,%edx + jc bad_get_user +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 2: movzwl -1(%eax),%edx +@@ -54,7 +58,7 @@ + addl $3,%eax + movl %esp,%edx + jc bad_get_user +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 3: movl -3(%eax),%edx +Index: linux-2.4.21/arch/i386/config.in +=================================================================== +--- linux-2.4.21.orig/arch/i386/config.in 2004-04-24 02:39:21.000000000 -0400 ++++ linux-2.4.21/arch/i386/config.in 2004-04-24 02:42:58.000000000 -0400 +@@ -326,6 +326,29 @@ + if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then + define_bool CONFIG_HAVE_DEC_LOCK y + fi ++ ++choice 'Bigger Stack Size Support' \ ++ "off CONFIG_NOBIGSTACK \ ++ 16KB CONFIG_STACK_SIZE_16KB \ ++ 32KB CONFIG_STACK_SIZE_32KB \ ++ 64KB CONFIG_STACK_SIZE_64KB" off ++ ++if [ "$CONFIG_NOBIGSTACK" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 1 ++else ++ if [ "$CONFIG_STACK_SIZE_16KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 2 ++ else ++ if [ "$CONFIG_STACK_SIZE_32KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 3 ++ else ++ if [ "$CONFIG_STACK_SIZE_64KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 4 ++ fi ++ fi ++ fi ++fi ++ + endmenu + + mainmenu_option next_comment +Index: linux-2.4.21/arch/i386/vmlinux.lds.S +=================================================================== +--- linux-2.4.21.orig/arch/i386/vmlinux.lds.S 2004-04-24 02:38:06.000000000 -0400 ++++ linux-2.4.21/arch/i386/vmlinux.lds.S 2004-04-24 02:42:58.000000000 -0400 +@@ -39,7 
+39,8 @@ + + _edata = .; /* End of data section */ + +- . = ALIGN(8192); /* init_task */ ++/* chose the biggest of the possible stack sizes here? */ ++ . = ALIGN(65536); /* init_task */ + .data.init_task : { *(.data.init_task) } + + . = ALIGN(4096); /* Init code and data */ +Index: linux-2.4.21/include/asm-i386/current.h +=================================================================== +--- linux-2.4.21.orig/include/asm-i386/current.h 1998-08-14 19:35:22.000000000 -0400 ++++ linux-2.4.21/include/asm-i386/current.h 2004-04-24 02:42:58.000000000 -0400 +@@ -1,15 +1,43 @@ + #ifndef _I386_CURRENT_H + #define _I386_CURRENT_H ++#include ++ ++/* ++ * Configurable page sizes on i386, mainly for debugging purposes. ++ * (c) Balbir Singh ++ */ ++ ++#ifdef __ASSEMBLY__ ++ ++#define PAGE_SIZE 4096 /* as cannot handle 1UL << 12 */ ++#define THREAD_SIZE ((1 << CONFIG_STACK_SIZE_SHIFT) * PAGE_SIZE) ++ ++#define GET_CURRENT(reg) \ ++ movl $-THREAD_SIZE, reg; \ ++ andl %esp, reg ++ ++#else /* __ASSEMBLY__ */ ++ ++#define THREAD_SIZE ((1 << CONFIG_STACK_SIZE_SHIFT) * PAGE_SIZE) ++#define alloc_task_struct() \ ++ ((struct task_struct *) __get_free_pages(GFP_KERNEL,CONFIG_STACK_SIZE_SHIFT)) ++ ++#define free_task_struct(p) \ ++ free_pages((unsigned long) (p), CONFIG_STACK_SIZE_SHIFT) ++ ++#define INIT_TASK_SIZE THREAD_SIZE + + struct task_struct; + + static inline struct task_struct * get_current(void) + { + struct task_struct *current; +- __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); ++ __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~(THREAD_SIZE - 1))); + return current; + } + + #define current get_current() + ++#endif /* __ASSEMBLY__ */ ++ + #endif /* !(_I386_CURRENT_H) */ +Index: linux-2.4.21/include/asm-i386/hw_irq.h +=================================================================== +--- linux-2.4.21.orig/include/asm-i386/hw_irq.h 2004-04-24 02:39:05.000000000 -0400 ++++ linux-2.4.21/include/asm-i386/hw_irq.h 2004-04-24 02:42:58.000000000 -0400 +@@ -16,6 +16,7 
@@ + #include + #include + #include ++#include + + /* + * IDT vectors usable for external interrupt sources start +@@ -120,10 +121,6 @@ + #define IRQ_NAME2(nr) nr##_interrupt(void) + #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +-#define GET_CURRENT \ +- "movl %esp, %ebx\n\t" \ +- "andl $-8192, %ebx\n\t" +- + /* + * SMP has a few special interrupts for IPI messages + */ +Index: linux-2.4.21/include/asm-i386/processor.h +=================================================================== +--- linux-2.4.21.orig/include/asm-i386/processor.h 2004-04-24 02:39:21.000000000 -0400 ++++ linux-2.4.21/include/asm-i386/processor.h 2004-04-24 02:42:58.000000000 -0400 +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -466,9 +467,6 @@ + #define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019]) + #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022]) + +-#define THREAD_SIZE (2*PAGE_SIZE) +-#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) +-#define free_task_struct(p) free_pages((unsigned long) (p), 1) + #define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) + + #define init_task (init_task_union.task) +Index: linux-2.4.21/include/linux/sched.h +=================================================================== +--- linux-2.4.21.orig/include/linux/sched.h 2004-04-24 02:39:20.000000000 -0400 ++++ linux-2.4.21/include/linux/sched.h 2004-04-24 02:42:58.000000000 -0400 +@@ -2,6 +2,7 @@ + #define _LINUX_SCHED_H + + #include /* for HZ */ ++#include /* maybe for INIT_TASK_SIZE */ + + extern unsigned long event; + +Index: linux-2.4.21/include/asm-x86_64/current.h +=================================================================== +--- linux-2.4.21.orig/include/asm-x86_64/current.h 2002-11-28 18:53:15.000000000 -0500 ++++ linux-2.4.21/include/asm-x86_64/current.h 2004-04-24 02:42:58.000000000 -0400 +@@ -5,6 +5,7 @@ + struct task_struct; + + #include 
++#include + + static inline struct task_struct *get_current(void) + { diff --git a/lustre/kernel_patches/patches/dev_read_only-suse-2.4.19.patch b/lustre/kernel_patches/patches/dev_read_only-suse-2.4.19.patch new file mode 100644 index 0000000..f2eb39a --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-suse-2.4.19.patch @@ -0,0 +1,76 @@ + drivers/block/blkpg.c | 36 ++++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 3 +++ + drivers/ide/ide-disk.c | 4 ++++ + 3 files changed, 43 insertions(+) + +Index: linux-2.4.19/drivers/block/blkpg.c +=================================================================== +--- linux-2.4.19.orig/drivers/block/blkpg.c 2002-08-02 20:39:43.000000000 -0400 ++++ linux-2.4.19/drivers/block/blkpg.c 2004-04-23 18:24:40.000000000 -0400 +@@ -296,3 +296,37 @@ + } + + EXPORT_SYMBOL(blk_ioctl); ++ ++#define NUM_DEV_NO_WRITE 16 ++static int dev_no_write[NUM_DEV_NO_WRITE]; ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++void dev_set_rdonly(kdev_t dev, int no_write) ++{ ++ if (dev) { ++ printk(KERN_WARNING "Turning device %s read-only\n", ++ bdevname(dev)); ++ dev_no_write[no_write] = 0xdead0000 + dev; ++ } ++} ++ ++int dev_check_rdonly(kdev_t dev) { ++ int i; ++ ++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { ++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && ++ dev == (dev_no_write[i] & 0xffff)) ++ return 1; ++ } ++ return 0; ++} ++ ++void dev_clear_rdonly(int no_write) { ++ dev_no_write[no_write] = 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); +Index: linux-2.4.19/drivers/block/loop.c +=================================================================== +--- linux-2.4.19.orig/drivers/block/loop.c 2004-04-23 17:53:56.000000000 -0400 ++++ linux-2.4.19/drivers/block/loop.c 2004-04-23 18:23:16.000000000 -0400 +@@ -478,6 +478,9 @@ + spin_unlock_irq(&lo->lo_lock); + + if (rw == WRITE) { ++ if (dev_check_rdonly(rbh->b_rdev)) ++ goto err; ++ + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { +Index: linux-2.4.19/drivers/ide/ide-disk.c +=================================================================== +--- linux-2.4.19.orig/drivers/ide/ide-disk.c 2004-04-23 17:53:51.000000000 -0400 ++++ linux-2.4.19/drivers/ide/ide-disk.c 2004-04-23 18:23:16.000000000 -0400 +@@ -558,6 +558,10 @@ + */ + static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) + { ++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { ++ ide_end_request(1, HWGROUP(drive)); ++ return ide_stopped; ++ } + if (IDE_CONTROL_REG) + OUT_BYTE(drive->ctl,IDE_CONTROL_REG); + diff --git a/lustre/kernel_patches/patches/exports_2.4.19-suse.patch b/lustre/kernel_patches/patches/exports_2.4.19-suse.patch index feaeec6..769f411 100644 --- a/lustre/kernel_patches/patches/exports_2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/exports_2.4.19-suse.patch @@ -4,8 +4,10 @@ kernel/ksyms.c | 4 
++++ 4 files changed, 8 insertions(+), 1 deletion(-) ---- linux/fs/ext3/Makefile~exports_2.4.20 Wed Apr 9 10:07:14 2003 -+++ linux-mmonroe/fs/ext3/Makefile Wed Apr 9 10:19:53 2003 +Index: linux-2.4.19/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.orig/fs/ext3/Makefile 2004-04-23 17:53:55.000000000 -0400 ++++ linux-2.4.19/fs/ext3/Makefile 2004-04-23 18:25:03.000000000 -0400 @@ -9,6 +9,8 @@ O_TARGET := ext3.o @@ -15,10 +17,12 @@ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o obj-m := $(O_TARGET) ---- linux/fs/ext3/super.c~exports_2.4.20 Wed Apr 9 10:07:14 2003 -+++ linux-mmonroe/fs/ext3/super.c Wed Apr 9 10:19:53 2003 -@@ -1769,7 +1769,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); +Index: linux-2.4.19/fs/ext3/super.c +=================================================================== +--- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 17:53:55.000000000 -0400 ++++ linux-2.4.19/fs/ext3/super.c 2004-04-23 18:25:03.000000000 -0400 +@@ -1821,7 +1821,7 @@ + exit_ext3_xattr(); } -EXPORT_NO_SYMBOLS; @@ -26,19 +30,23 @@ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux/include/linux/fs.h~exports_2.4.20 Wed Apr 9 10:07:14 2003 -+++ linux-mmonroe/include/linux/fs.h Wed Apr 9 10:19:53 2003 -@@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct +Index: linux-2.4.19/include/linux/fs.h +=================================================================== +--- linux-2.4.19.orig/include/linux/fs.h 2004-04-23 17:54:14.000000000 -0400 ++++ linux-2.4.19/include/linux/fs.h 2004-04-23 18:25:27.000000000 -0400 +@@ -1183,6 +1183,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void 
*); +struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); + extern void umount_tree(struct vfsmount *); #define kern_umount mntput - ---- linux/kernel/ksyms.c~exports_2.4.20 Wed Apr 9 10:07:14 2003 -+++ linux-mmonroe/kernel/ksyms.c Wed Apr 9 10:19:53 2003 -@@ -308,6 +308,10 @@ EXPORT_SYMBOL(dcache_dir_fsync); +Index: linux-2.4.19/kernel/ksyms.c +=================================================================== +--- linux-2.4.19.orig/kernel/ksyms.c 2004-04-23 17:54:14.000000000 -0400 ++++ linux-2.4.19/kernel/ksyms.c 2004-04-23 18:25:03.000000000 -0400 +@@ -330,6 +330,10 @@ EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(dcache_dir_ops); @@ -49,5 +57,3 @@ /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */ EXPORT_SYMBOL(default_llseek); EXPORT_SYMBOL(dentry_open); - -_ diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch new file mode 100644 index 0000000..e937932 --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse-2.4.19.patch @@ -0,0 +1,2551 @@ + fs/ext3/Makefile | 2 + fs/ext3/dir.c | 299 +++++++++ + fs/ext3/file.c | 3 + fs/ext3/hash.c | 215 ++++++ + fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++----- + fs/ext3/super.c | 7 + include/linux/ext3_fs.h | 85 ++ + include/linux/ext3_fs_sb.h | 2 + include/linux/ext3_jbd.h | 2 + include/linux/rbtree.h | 2 + lib/rbtree.c | 42 + + 11 files changed, 1887 insertions(+), 160 deletions(-) + +Index: linux-2.4.19/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.orig/fs/ext3/Makefile 2004-04-23 18:25:03.000000000 -0400 ++++ linux-2.4.19/fs/ext3/Makefile 2004-04-23 18:26:27.000000000 -0400 +@@ -12,7 +12,7 @@ + export-objs := super.o inode.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o hash.o + obj-m := $(O_TARGET) + 
+ obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o +Index: linux-2.4.19/fs/ext3/dir.c +=================================================================== +--- linux-2.4.19.orig/fs/ext3/dir.c 2001-11-09 17:25:04.000000000 -0500 ++++ linux-2.4.19/fs/ext3/dir.c 2004-04-23 18:26:27.000000000 -0400 +@@ -21,12 +21,16 @@ + #include + #include + #include ++#include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -35,6 +39,17 @@ + fsync: ext3_sync_file, /* BKL held */ + }; + ++ ++static unsigned char get_dtype(struct super_block *sb, int filetype) ++{ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || ++ (filetype >= EXT3_FT_MAX)) ++ return DT_UNKNOWN; ++ ++ return (ext3_filetype_table[filetype]); ++} ++ ++ + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -79,6 +94,16 @@ + + sb = inode->i_sb; + ++ if (is_dx(inode)) { ++ err = ext3_dx_readdir(filp, dirent, filldir); ++ if (err != ERR_BAD_DX_DIR) ++ return err; ++ /* ++ * We don't set the inode dirty flag since it's not ++ * critical that it get flushed back to the disk. ++ */ ++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; ++ } + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); +@@ -162,18 +187,12 @@ + * during the copy operation. 
+ */ + unsigned long version = filp->f_version; +- unsigned char d_type = DT_UNKNOWN; + +- if (EXT3_HAS_INCOMPAT_FEATURE(sb, +- EXT3_FEATURE_INCOMPAT_FILETYPE) +- && de->file_type < EXT3_FT_MAX) +- d_type = +- ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), +- d_type); ++ get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) +@@ -188,3 +207,269 @@ + UPDATE_ATIME(inode); + return 0; + } ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * These functions convert from the major/minor hash to an f_pos ++ * value. ++ * ++ * Currently we only use major hash number. This is unfortunate, but ++ * on 32-bit machines, the same VFS interface is used for lseek and ++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of ++ * lseek/telldir/seekdir will blow out spectacularly, and from within ++ * the ext2 low-level routine, we don't know if we're being called by ++ * a 64-bit version of the system call or the 32-bit version of the ++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir ++ * cookie. Sigh. ++ */ ++#define hash2pos(major, minor) (major >> 1) ++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) ++#define pos2min_hash(pos) (0) ++ ++/* ++ * This structure holds the nodes of the red-black tree used to store ++ * the directory entry in hash order. ++ */ ++struct fname { ++ __u32 hash; ++ __u32 minor_hash; ++ rb_node_t rb_hash; ++ struct fname *next; ++ __u32 inode; ++ __u8 name_len; ++ __u8 file_type; ++ char name[0]; ++}; ++ ++/* ++ * This function implements a non-recursive way of freeing all of the ++ * nodes in the red-black tree. 
++ */ ++static void free_rb_tree_fname(rb_root_t *root) ++{ ++ rb_node_t *n = root->rb_node; ++ rb_node_t *parent; ++ struct fname *fname; ++ ++ while (n) { ++ /* Do the node's children first */ ++ if ((n)->rb_left) { ++ n = n->rb_left; ++ continue; ++ } ++ if (n->rb_right) { ++ n = n->rb_right; ++ continue; ++ } ++ /* ++ * The node has no children; free it, and then zero ++ * out parent's link to it. Finally go to the ++ * beginning of the loop and try to free the parent ++ * node. ++ */ ++ parent = n->rb_parent; ++ fname = rb_entry(n, struct fname, rb_hash); ++ kfree(fname); ++ if (!parent) ++ root->rb_node = 0; ++ else if (parent->rb_left == n) ++ parent->rb_left = 0; ++ else if (parent->rb_right == n) ++ parent->rb_right = 0; ++ n = parent; ++ } ++ root->rb_node = 0; ++} ++ ++ ++struct dir_private_info *create_dir_info(loff_t pos) ++{ ++ struct dir_private_info *p; ++ ++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ if (!p) ++ return NULL; ++ p->root.rb_node = 0; ++ p->curr_node = 0; ++ p->extra_fname = 0; ++ p->last_pos = 0; ++ p->curr_hash = pos2maj_hash(pos); ++ p->curr_minor_hash = pos2min_hash(pos); ++ p->next_hash = 0; ++ return p; ++} ++ ++void ext3_htree_free_dir_info(struct dir_private_info *p) ++{ ++ free_rb_tree_fname(&p->root); ++ kfree(p); ++} ++ ++/* ++ * Given a directory entry, enter it into the fname rb tree. 
++ *
++ * hash/minor_hash are the already computed hash of dirent's name.  An
++ * entry whose (hash, minor_hash) pair is already present is chained
++ * onto the existing node's ->next list instead of being inserted into
++ * the tree.  The function cannot report failure: if the allocation
++ * fails the entry is dropped with a warning.
++ */
++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++			     __u32 minor_hash,
++			     struct ext3_dir_entry_2 *dirent)
++{
++	rb_node_t **p, *parent = NULL;
++	struct fname * fname, *new_fn;
++	struct dir_private_info *info;
++	int len;
++
++	info = (struct dir_private_info *) dir_file->private_data;
++	p = &info->root.rb_node;
++
++	/* Create and allocate the fname structure */
++	len = sizeof(struct fname) + dirent->name_len + 1;
++	new_fn = kmalloc(len, GFP_KERNEL);
++	if (!new_fn) {
++		/* void interface: nothing to return, so warn and skip */
++		printk(KERN_WARNING
++		       "ext3_htree_store_dirent: out of memory\n");
++		return;
++	}
++	memset(new_fn, 0, len);
++	new_fn->hash = hash;
++	new_fn->minor_hash = minor_hash;
++	new_fn->inode = le32_to_cpu(dirent->inode);
++	new_fn->name_len = dirent->name_len;
++	new_fn->file_type = dirent->file_type;
++	memcpy(new_fn->name, dirent->name, dirent->name_len);
++	new_fn->name[dirent->name_len] = 0;
++
++	while (*p) {
++		parent = *p;
++		fname = rb_entry(parent, struct fname, rb_hash);
++
++		/*
++		 * If the hash and minor hash match up, then we put
++		 * them on a linked list.  This rarely happens...
++		 */
++		if ((new_fn->hash == fname->hash) &&
++		    (new_fn->minor_hash == fname->minor_hash)) {
++			new_fn->next = fname->next;
++			fname->next = new_fn;
++			return;
++		}
++
++		if (new_fn->hash < fname->hash)
++			p = &(*p)->rb_left;
++		else if (new_fn->hash > fname->hash)
++			p = &(*p)->rb_right;
++		else if (new_fn->minor_hash < fname->minor_hash)
++			p = &(*p)->rb_left;
++		else /* if (new_fn->minor_hash > fname->minor_hash) */
++			p = &(*p)->rb_right;
++	}
++
++	rb_link_node(&new_fn->rb_hash, parent, p);
++	rb_insert_color(&new_fn->rb_hash, &info->root);
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir.  It calls filldir
++ * for all entries on the fname linked list.  (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++			filldir_t filldir, struct fname *fname)
++{
++	struct dir_private_info *info = filp->private_data;
++	loff_t	curr_pos;
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct super_block * sb;
++	int error;
++
++	sb = inode->i_sb;
++
++	if (!fname) {
++		printk("call_filldir: called with null fname?!?\n");
++		return 0;
++	}
++	curr_pos = hash2pos(fname->hash, fname->minor_hash);	/* one position for the whole chain */
++	while (fname) {
++		error = filldir(dirent, fname->name,
++				fname->name_len, curr_pos,
++				fname->inode,
++				get_dtype(sb, fname->file_type));
++		if (error) {
++			filp->f_pos = curr_pos;
++			info->extra_fname = fname->next;	/* picked up first by the next ext3_dx_readdir call */
++			return error;
++		}
++		fname = fname->next;
++	}
++	return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++			 void * dirent, filldir_t filldir)
++{
++	struct dir_private_info *info = filp->private_data;
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct fname *fname;
++	int	ret;
++
++	if (!info) {
++		info = create_dir_info(filp->f_pos);
++		if (!info)
++			return -ENOMEM;
++		filp->private_data = info;
++	}
++
++	/* f_pos differs from where we stopped; drop the cache and restart there */
++	if (info->last_pos != filp->f_pos) {
++		free_rb_tree_fname(&info->root);
++		info->curr_node = 0;
++		info->extra_fname = 0;
++		info->curr_hash = pos2maj_hash(filp->f_pos);
++		info->curr_minor_hash = pos2min_hash(filp->f_pos);
++	}
++
++	/*
++	 * If there are any leftover names on the hash collision
++	 * chain, return them first.
++	 */
++	if (info->extra_fname &&
++	    call_filldir(filp, dirent, filldir, info->extra_fname))
++		goto finished;
++
++	if (!info->curr_node)
++		info->curr_node = rb_get_first(&info->root);
++
++	while (1) {
++		/*
++		 * Fill the rbtree if we have no more entries,
++		 * or the inode has changed since we last read in the
++		 * cached entries.
++		 */
++		if ((!info->curr_node) ||
++		    (filp->f_version != inode->i_version)) {
++			info->curr_node = 0;
++			free_rb_tree_fname(&info->root);
++			filp->f_version = inode->i_version;
++			ret = ext3_htree_fill_tree(filp, info->curr_hash,
++						   info->curr_minor_hash,
++						   &info->next_hash);
++			if (ret < 0)
++				return ret;
++			if (ret == 0)	/* nothing was inserted: end of directory */
++				break;
++			info->curr_node = rb_get_first(&info->root);
++		}
++
++		fname = rb_entry(info->curr_node, struct fname, rb_hash);
++		info->curr_hash = fname->hash;
++		info->curr_minor_hash = fname->minor_hash;
++		if (call_filldir(filp, dirent, filldir, fname))
++			break;
++
++		info->curr_node = rb_get_next(info->curr_node);
++		if (!info->curr_node) {
++			/* cache exhausted: the next fill starts at next_hash */
++			info->curr_hash = info->next_hash;
++			info->curr_minor_hash = 0;
++		}
++	}
++finished:
++	info->last_pos = filp->f_pos;
++	UPDATE_ATIME(inode);
++	return 0;
++}
++#endif
+Index: linux-2.4.19/fs/ext3/file.c
+===================================================================
+--- linux-2.4.19.orig/fs/ext3/file.c	2004-04-23 17:54:02.000000000 -0400
++++ linux-2.4.19/fs/ext3/file.c	2004-04-23 18:26:27.000000000 -0400
+@@ -38,6 +38,9 @@
+ {
+ 	if (filp->f_mode & FMODE_WRITE)
+ 		ext3_discard_prealloc (inode);
++	if (is_dx(inode) && filp->private_data)
++		ext3_htree_free_dir_info(filp->private_data);
++
+ 	return 0;
+ }
+ 
+Index: linux-2.4.19/fs/ext3/hash.c
+===================================================================
+--- linux-2.4.19.orig/fs/ext3/hash.c	2003-01-30 05:24:37.000000000 -0500
++++ linux-2.4.19/fs/ext3/hash.c	2004-04-23 18:26:27.000000000 -0400
+@@ -0,0 +1,215 @@
++/*
++ *  linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ *
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++/* Mix the four words of in[] into buf[0] and buf[1] with 16 TEA rounds. */
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++	__u32	sum = 0;
++	__u32	b0 = buf[0], b1 = buf[1];
++	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
++	int	n = 16;
++
++	do {
++		sum += DELTA;
++		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
++		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
++	} while(--n);
++
++	buf[0] += b0;
++	buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function.  The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s)	\
++	(a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++	__u32	a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++	/* Round 1 */
++	ROUND(F, a, b, c, d, in[0] + K1, 3);
++	ROUND(F, d, a, b, c, in[1] + K1, 7);
++	ROUND(F, c, d, a, b, in[2] + K1, 11);
++	ROUND(F, b, c, d, a, in[3] + K1, 19);
++	ROUND(F, a, b, c, d, in[4] + K1, 3);
++	ROUND(F, d, a, b, c, in[5] + K1, 7);
++	ROUND(F, c, d, a, b, in[6] + K1, 11);
++	ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++	/* Round 2 */
++	ROUND(G, a, b, c, d, in[1] + K2, 3);
++	ROUND(G, d, a, b, c, in[3] + K2, 5);
++	ROUND(G, c, d, a, b, in[5] + K2, 9);
++	ROUND(G, b, c, d, a, in[7] + K2, 13);
++	ROUND(G, a, b, c, d, in[0] + K2, 3);
++	ROUND(G, d, a, b, c, in[2] + K2, 5);
++	ROUND(G, c, d, a, b, in[4] + K2, 9);
++	ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++	/* Round 3 */
++	ROUND(H, a, b, c, d, in[3] + K3, 3);
++	ROUND(H, d, a, b, c, in[7] + K3, 9);
++	ROUND(H, c, d, a, b, in[2] + K3, 11);
++	ROUND(H, b, c, d, a, in[6] + K3, 15);
++	ROUND(H, a, b, c, d, in[1] + K3, 3);
++	ROUND(H, d, a, b, c, in[5] + K3, 9);
++	ROUND(H, c, d, a, b, in[0] + K3, 11);
++	ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++	buf[0] += a;
++	buf[1] += b;
++	buf[2] += c;
++	buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash; the result always has its low bit clear */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++	while (len--) {
++		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++
++		if (hash & 0x80000000) hash -= 0x7fffffff;
++		hash1 = hash0;
++		hash0 = hash;
++	}
++	return (hash0 << 1);
++}
++
++/* Pack at most num*4 bytes of msg into num words of buf, padding with a length-derived word */
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++	__u32	pad, val;
++	int	i;
++
++	pad = (__u32)len | ((__u32)len << 8);
++	pad |= pad << 16;
++
++	val = pad;
++	if (len > num*4)
++		len = num * 4;
++	for (i=0; i < len; i++) {
++		if ((i % 4) == 0)
++			val = pad;
++		val = msg[i] + (val << 8);
++		if ((i % 4) == 3) {
++			*buf++ = val;
++			val = pad;
++			num--;
++		}
++	}
++	if (--num >= 0)
++		*buf++ = val;
++	while (--num >= 0)
++		*buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename.  If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ *
++ * The seed is a 4 longword (32 bits) "secret" which can be used to
++ * uniquify a hash.  If the seed is all zero's, then some default seed
++ * may be used.
++ *
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits.  32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++	__u32	hash;
++	__u32	minor_hash = 0;
++	const char	*p;
++	int		i;
++	__u32		in[8], buf[4];
++
++	/* Initialize the default seed for the hash checksum functions */
++	buf[0] = 0x67452301;
++	buf[1] = 0xefcdab89;
++	buf[2] = 0x98badcfe;
++	buf[3] = 0x10325476;
++
++	/* Check to see if the seed is all zero's */
++	if (hinfo->seed) {
++		for (i=0; i < 4; i++) {
++			if (hinfo->seed[i])
++				break;
++		}
++		if (i < 4)
++			memcpy(buf, hinfo->seed, sizeof(buf));
++	}
++
++	switch (hinfo->hash_version) {
++	case DX_HASH_LEGACY:
++		hash = dx_hack_hash(name, len);
++		break;
++	case DX_HASH_HALF_MD4:
++		p = name;
++		while (len > 0) {
++			str2hashbuf(p, len, in, 8);
++			halfMD4Transform(buf, in);
++			len -= 32;
++			p += 32;
++		}
++		minor_hash = buf[2];
++		hash = buf[1];
++		break;
++	case DX_HASH_TEA:
++		p = name;
++		while (len > 0) {
++			str2hashbuf(p, len, in, 4);
++			TEA_transform(buf, in);
++			len -= 16;
++			p += 16;
++		}
++		hash = buf[0];
++		minor_hash = buf[1];
++		break;
++	default:
++		hinfo->hash = 0;
++		return -1;
++	}
++	hinfo->hash = hash & ~1;	/* low bit cleared before the hash is stored */
++	hinfo->minor_hash = minor_hash;
++	return 0;
++}
+Index: linux-2.4.19/fs/ext3/namei.c
+===================================================================
+--- linux-2.4.19.orig/fs/ext3/namei.c
2004-04-23 17:53:55.000000000 -0400
++++ linux-2.4.19/fs/ext3/namei.c	2004-04-23 22:24:05.000000000 -0400
+@@ -16,6 +16,12 @@
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *  	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ *  Hash Tree Directory indexing (c)
++ *  	Daniel Phillips, 2001
++ *  Hash Tree Directory indexing porting
++ *  	Christopher Li, 2002
++ *  Hash Tree Directory indexing cleanup
++ *  	Theodore Ts'o, 2002
+  */
+ 
+ #include <linux/fs.h>
+@@ -40,6 +46,630 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+ 
++static struct buffer_head *ext3_append(handle_t *handle,
++					struct inode *inode,
++					u32 *block, int *err)
++{
++	struct buffer_head *bh;
++
++	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;	/* index of the first block past i_size */
++
++	if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++		inode->i_size += inode->i_sb->s_blocksize;
++		EXT3_I(inode)->i_disksize = inode->i_size;
++		ext3_journal_get_write_access(handle,bh);	/* NOTE(review): return value is ignored */
++	}
++	return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command)
++#endif
++
++struct fake_dirent
++{
++	/*le*/u32 inode;
++	/*le*/u16 rec_len;
++	u8 name_len;
++	u8 file_type;
++};
++
++struct dx_countlimit
++{
++	le_u16 limit;
++	le_u16 count;
++};
++
++struct dx_entry
++{
++	le_u32 hash;
++	le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero.  Therefore, the
++ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++	struct fake_dirent dot;
++	char dot_name[4];
++	struct fake_dirent dotdot;
++	char dotdot_name[4];
++	struct dx_root_info
++	{
++		le_u32 reserved_zero;
++		u8 hash_version;
++		u8 info_length; /* 8 */
++		u8 indirect_levels;
++		u8 unused_flags;
++	}
++	info;
++	struct dx_entry	entries[0];
++};
++
++struct dx_node
++{
++	struct fake_dirent fake;
++	struct dx_entry	entries[0];
++};
++
++
++struct dx_frame
++{
++	struct buffer_head *bh;
++	struct dx_entry *entries;
++	struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++	u32 hash;
++	u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++				 struct inode *dir,
++				 struct dx_hash_info *hinfo,
++				 struct dx_frame *frame,
++				 int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++		struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++
struct dx_frame *frame,
++				 struct dx_frame *frames, int *err,
++				 __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++		       struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++			     struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++	return le32_to_cpu(entry->block.v) & 0x00ffffff;	/* NOTE(review): this mask clears the top 8 bits, not 4 */
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++	entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++	return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++	entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++	return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);	/* entries[0] is overlaid by a dx_countlimit */
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++	return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++	((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++	((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++		EXT3_DIR_REC_LEN(2) - infosize;
++	return 0? 20: entry_space / sizeof(struct dx_entry);	/* "0?" keeps a small fixed limit handy; it is disabled */
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++	return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{
++	unsigned names;
++	unsigned space;
++	unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++				 int size, int show_names)
++{
++	unsigned names = 0, space = 0;
++	char *base = (char *) de;
++	struct dx_hash_info h = *hinfo;
++
++	printk("names: ");
++	while ((char *) de < base + size)
++	{
++		if (de->inode)
++		{
++			if (show_names)
++			{
++				int len = de->name_len;
++				char *name = de->name;
++				while (len--) printk("%c", *name++);
++				ext3fs_dirhash(de->name, de->name_len, &h);
++				printk(":%x.%u ", h.hash,
++				       ((char *) de - base));
++			}
++			space += EXT3_DIR_REC_LEN(de->name_len);
++			names++;
++		}
++		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++	}
++	printk("(%i)\n", names);
++	return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++			     struct dx_entry *entries, int levels)
++{
++	unsigned blocksize = dir->i_sb->s_blocksize;
++	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++	unsigned bcount = 0;
++	struct buffer_head *bh;
++	int err;
++	printk("%i indexed blocks...\n", count);
++	for (i = 0; i < count; i++, entries++)
++	{
++		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++		struct stats stats;
++		printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
++		if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++		stats = levels?
++		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++		   dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++		names += stats.names;
++		space += stats.space;
++		bcount += stats.bcount;
++		brelse (bh);
++	}
++	if (bcount)
++		printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
++			names, space/bcount,(space/bcount)*100/blocksize);
++	return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++	unsigned count, indirect;
++	struct dx_entry *at, *entries, *p, *q, *m;
++	struct dx_root *root;
++	struct buffer_head *bh;
++	struct dx_frame *frame = frame_in;
++	u32 hash;
++
++	frame->bh = NULL;
++	if (dentry)
++		dir = dentry->d_parent->d_inode;
++	if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++		goto fail;
++	root = (struct dx_root *) bh->b_data;
++	if (root->info.hash_version != DX_HASH_TEA &&
++	    root->info.hash_version != DX_HASH_HALF_MD4 &&
++	    root->info.hash_version != DX_HASH_LEGACY) {
++		ext3_warning(dir->i_sb, __FUNCTION__,
++			     "Unrecognised inode hash code %d",
++			     root->info.hash_version);
++		brelse(bh);
++		*err = ERR_BAD_DX_DIR;
++		goto fail;
++	}
++	hinfo->hash_version = root->info.hash_version;
++	hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++	if (dentry)
++		ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++	hash = hinfo->hash;	/* without a dentry this is the hash the caller preloaded */
++
++	if (root->info.unused_flags & 1) {
++		ext3_warning(dir->i_sb, __FUNCTION__,
++			     "Unimplemented inode hash flags: %#06x",
++			     root->info.unused_flags);
++		brelse(bh);
++		*err = ERR_BAD_DX_DIR;
++		goto fail;
++	}
++
++	if ((indirect = root->info.indirect_levels) > 1) {
++		ext3_warning(dir->i_sb, __FUNCTION__,
++			     "Unimplemented inode hash depth: %#06x",
++			     root->info.indirect_levels);
++		brelse(bh);
++		*err = ERR_BAD_DX_DIR;
++		goto fail;
++	}
++
++	entries = (struct dx_entry *) (((char *)&root->info) +
++				       root->info.info_length);
++	assert(dx_get_limit(entries) == dx_root_limit(dir,
++						      root->info.info_length));
++	dxtrace (printk("Look up %x", hash));
++	while (1)
++	{
++		count = dx_get_count(entries);
++		assert (count && count <= dx_get_limit(entries));
++		p = entries + 1;
++		q = entries + count - 1;
++		while (p <= q)	/* binary search over entries[1..count-1] */
++		{
++			m = p + (q - p)/2;
++			dxtrace(printk("."));
++			if (dx_get_hash(m) > hash)
++				q = m - 1;
++			else
++				p = m + 1;
++		}
++
++		if (0) // linear search cross check
++		{
++			unsigned n = count - 1;
++			at = entries;
++			while (n--)
++			{
++				dxtrace(printk(","));
++				if (dx_get_hash(++at) > hash)
++				{
++					at--;
++					break;
++				}
++			}
++			assert (at == p - 1);
++		}
++
++		at = p - 1;
++		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++		frame->bh = bh;
++		frame->entries = entries;
++		frame->at = at;
++		if (!indirect--) return frame;
++		if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++			goto fail2;
++		at = entries = ((struct dx_node *) bh->b_data)->entries;
++		assert (dx_get_limit(entries) == dx_node_limit (dir));
++		frame++;
++	}
++fail2:
++	while (frame >= frame_in) {
++		brelse(frame->bh);
++		frame--;
++	}
++fail:
++	return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++	if (frames[0].bh == NULL)
++		return;
++
++	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++		brelse(frames[1].bh);
++	brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the necessary intervening nodes if the search
++ * should be necessary.  Whether or not the search is necessary is
++ * controlled by the hash parameter.  If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value.  This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not.  If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++				 struct dx_frame *frame,
++				 struct dx_frame *frames, int *err,
++				 __u32 *start_hash)
++{
++	struct dx_frame *p;
++	struct buffer_head *bh;
++	int num_frames = 0;
++	__u32 bhash;
++
++	*err = ENOENT;	/* NOTE(review): positive errno; ext3_dx_find_entry stores -ENOENT */
++	p = frame;
++	/*
++	 * Find the next leaf page by incrementing the frame pointer.
++	 * If we run out of entries in the interior node, loop around and
++	 * increment pointer in the parent node.  When we break out of
++	 * this loop, num_frames indicates the number of interior
++	 * nodes that need to be read.
++	 */
++	while (1) {
++		if (++(p->at) < p->entries + dx_get_count(p->entries))
++			break;
++		if (p == frames)
++			return 0;
++		num_frames++;
++		p--;
++	}
++
++	/*
++	 * If the hash is 1, then continue only if the next page has a
++	 * continuation hash of any value.  This is used for readdir
++	 * handling.  Otherwise, check to see if the hash matches the
++	 * desired continuation hash.  If it doesn't, return since
++	 * there's no point to read in the successive index pages.
++	 */
++	bhash = dx_get_hash(p->at);
++	if (start_hash)
++		*start_hash = bhash;
++	if ((hash & 1) == 0) {
++		if ((bhash & ~1) != hash)
++			return 0;
++	}
++	/*
++	 * If the hash is HASH_NB_ALWAYS, we always go to the next
++	 * block so no check is necessary
++	 */
++	while (num_frames--) {
++		if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++				      0, err)))
++			return -1; /* Failure */
++		p++;
++		brelse (p->bh);
++		p->bh = bh;
++		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++	}
++	return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++	return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory.  We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++			 __u32 start_minor_hash, __u32 *next_hash)
++{
++	struct dx_hash_info hinfo;
++	struct buffer_head *bh;
++	struct ext3_dir_entry_2 *de, *top;
++	struct dx_frame frames[2], *frame;	/* per call: a static copy would be shared by every caller */
++	struct inode *dir;
++	int block, err;
++	int count = 0;
++	int ret;
++	__u32 hashval;
++
++	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++		       start_minor_hash));
++	dir = dir_file->f_dentry->d_inode;
++	hinfo.hash = start_hash;	/* dx_probe(0, ...) looks up this hash, not a name */
++	hinfo.minor_hash = 0;
++	frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++	if (!frame)
++		return err;
++
++	while (1) {
++		block = dx_get_block(frame->at);
++		dxtrace(printk("Reading block %d\n", block));
++		if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++			goto errout;
++
++		de = (struct ext3_dir_entry_2 *) bh->b_data;
++		top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++				       EXT3_DIR_REC_LEN(0));
++		for (; de < top; de = ext3_next_entry(de)) {
++			ext3fs_dirhash(de->name, de->name_len, &hinfo);
++			/* skip everything that sorts before the start position */
++			if ((hinfo.hash < start_hash) ||
++			    ((hinfo.hash == start_hash) &&
++			     (hinfo.minor_hash < start_minor_hash)))
++				continue;
++			ext3_htree_store_dirent(dir_file, hinfo.hash,
++						hinfo.minor_hash, de);
++			count++;
++		}
++		brelse (bh);
++		hashval = ~1;
++		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++					    frame, frames, &err, &hashval);
++		if (next_hash)
++			*next_hash = hashval;
++		if (ret == -1)
++			goto errout;
++		/*
++		 * Stop if:  (a) there are no more entries, or
++		 * (b) we have inserted at least one entry and the
++		 * next hash value is not a continuation
++		 */
++		if ((ret == 0) ||
++		    (count && ((hashval & 1) == 0)))
++			break;
++	}
++	dx_release(frames);
++	dxtrace(printk("Fill tree: returned %d entries\n", count));
++	return count;
++errout:
++	dx_release(frames);
++	return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++	int count = 0;
++	char *base = (char *) de;
++	struct dx_hash_info h = *hinfo;
++
++	while ((char *) de < base + size)
++	{
++		if (de->name_len && de->inode) {
++			ext3fs_dirhash(de->name, de->name_len, &h);
++			map_tail--;	/* the map grows downwards from map_tail */
++			map_tail->hash = h.hash;
++			map_tail->offs = (u32) ((char *) de - base);
++			count++;
++		}
++		/* XXX: do we need to check rec_len == 0 case? -Chris */
++		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++	}
++	return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++	struct dx_map_entry *p, *q, *top = map + count - 1;
++	int more;
++	/* Combsort until bubble sort doesn't suck */
++	while (count > 2)
++	{
++		count = count*10/13;
++		if (count - 9 < 2) /* 9, 10 -> 11 */
++			count = 11;
++		for (p = top, q = p - count; q >= map; p--, q--)
++			if (p->hash < q->hash)
++				swap(*p, *q);
++	}
++	/* Garden variety bubble sort */
++	do {
++		more = 0;
++		q = top;
++		while (q-- > map)
++		{
++			if (q[1].hash >= q[0].hash)
++				continue;
++			swap(*(q+1), *q);
++			more = 1;
++		}
++	} while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++	struct dx_entry *entries = frame->entries;
++	struct dx_entry *old = frame->at, *new = old + 1;
++	int count = dx_get_count(entries);
++
++	assert(count < dx_get_limit(entries));
++	assert(old < entries + count);
++	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));	/* open a slot right after frame->at */
++	dx_set_hash(new, hash);
++	dx_set_block(new, block);
++	dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++	if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++				     EXT3_FEATURE_COMPAT_DIR_INDEX))
++		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ * +@@ -96,6 +726,7 @@ + return 0; + } + ++ + /* + * ext3_find_entry() + * +@@ -107,6 +738,8 @@ + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -121,12 +754,32 @@ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; +- ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ /* ++ * On success, or if the error was file not found, ++ * return. Otherwise, fall back to doing a search the ++ * old fashioned way. ++ */ ++ if (bh || (err != ERR_BAD_DX_DIR)) ++ return bh; ++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ } ++#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); +- start = dir->u.ext3_i.i_dir_start_lookup; ++ start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +@@ -167,7 +820,7 @@ + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { +- dir->u.ext3_i.i_dir_start_lookup = block; ++ EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { +@@ -198,6 +851,66 @@ + return ret; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err) ++{ ++ struct super_block * sb; ++ struct dx_hash_info hinfo; ++ u32 hash; ++ struct dx_frame frames[2], *frame; ++ struct ext3_dir_entry_2 *de, *top; ++ struct buffer_head *bh; ++ unsigned long block; ++ int retval; ++ int namelen = dentry->d_name.len; ++ const u8 *name = 
dentry->d_name.name; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ sb = dir->i_sb; ++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++ return NULL; ++ hash = hinfo.hash; ++ do { ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ goto errout; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<b_data))) { ++ brelse (bh); ++ goto errout; ++ } ++ *res_dir = de; ++ dx_release (frames); ++ return bh; ++ } ++ brelse (bh); ++ /* Check to see if we should continue to search */ ++ retval = ext3_htree_next_block(dir, hash, frame, ++ frames, err, 0); ++ if (retval == -1) { ++ ext3_warning(sb, __FUNCTION__, ++ "error reading index page in directory #%lu", ++ dir->i_ino); ++ goto errout; ++ } ++ } while (retval == 1); ++ ++ *err = -ENOENT; ++errout: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++} ++#endif ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -214,8 +927,9 @@ + brelse (bh); + inode = iget(dir->i_sb, ino); + +- if (!inode) ++ if (!inode) { + return ERR_PTR(-EACCES); ++ } + } + d_add(dentry, inode); + return NULL; +@@ -239,6 +953,301 @@ + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct ext3_dir_entry_2 * ++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ de->inode = 0; ++ map++; ++ to += rec_len; ++ } ++ 
return (struct ext3_dir_entry_2 *) (to - rec_len); ++} ++ ++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) ++{ ++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ unsigned rec_len = 0; ++ ++ prev = to = de; ++ while ((char*)de < base + size) { ++ next = (struct ext3_dir_entry_2 *) ((char *) de + ++ le16_to_cpu(de->rec_len)); ++ if (de->inode && de->name_len) { ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = rec_len; ++ prev = to; ++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ } ++ de = next; ++ } ++ return prev; ++} ++ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ struct dx_hash_info *hinfo, int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ u32 hash2; ++ struct dx_map_entry *map; ++ char *data1 = (*bh)->b_data, *data2; ++ unsigned split; ++ struct ext3_dir_entry_2 *de = NULL, *de2; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) { ++ brelse(*bh); ++ *bh = NULL; ++ goto errout; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, *bh); ++ if (err) { ++ journal_error: ++ brelse(*bh); ++ brelse(bh2); ++ *bh = NULL; ++ ext3_std_error(dir->i_sb, err); ++ goto errout; ++ } ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ ++ data2 = bh2->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == 
map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de = dx_pack_dirents(data1,blocksize); ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? */ ++ if (hinfo->hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, frame->bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ dxtrace(dx_show_index ("frame", frame->entries)); ++errout: ++ return de; ++} ++#endif ++ ++ ++/* ++ * Add a new entry into a directory (leaf) block. If de is non-NULL, ++ * it points to a directory entry which is guaranteed to be large ++ * enough for the new directory entry. If de is NULL, then ++ * add_dirent_to_buf will attempt to search the directory block for ++ * space. It will return -ENOSPC if no space is available, -EIO if an ++ * entry fails validation, and -EEXIST if the entry already exists. ++ * ++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In ++ * all other cases bh is released. 
++ */ ++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct ext3_dir_entry_2 *de, ++ struct buffer_head * bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset = 0; ++ unsigned short reclen; ++ int nlen, rlen, err; ++ char *top; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ if (!de) { ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, ++ bh, offset)) { ++ brelse (bh); ++ return -EIO; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ break; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ if ((char *) de > top) ++ return -ENOSPC; ++ } ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return err; ++ } ++ ++ /* By now the buffer is marked for journaling */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. 
++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ dir->i_version = ++event; ++ ext3_mark_inode_dirty(handle, dir); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return 0; ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * This converts a one block unindexed directory to a 3 block indexed ++ * directory, and adds the dentry to the indexed directory. ++ */ ++static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct buffer_head *bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de, *de2; ++ char *data1, *top; ++ unsigned len; ++ int retval; ++ unsigned blocksize; ++ struct dx_hash_info hinfo; ++ u32 block; ++ ++ blocksize = dir->i_sb->s_blocksize; ++ dxtrace(printk("Creating index\n")); ++ retval = ext3_journal_get_write_access(handle, bh); ++ if (retval) { ++ ext3_std_error(dir->i_sb, retval); ++ brelse(bh); ++ return retval; ++ } ++ root = (struct dx_root *) bh->b_data; ++ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) { ++ brelse(bh); ++ return retval; ++ } ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy 
(data1, de, len); ++ de = (struct ext3_dir_entry_2 *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (struct ext3_dir_entry_2 *) (&root->dotdot); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hinfo.hash_version = root->info.hash_version; ++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} ++#endif ++ + /* + * ext3_add_entry() + * +@@ -249,127 +1258,197 @@ + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ +- +-/* +- * AKPM: the journalling code here looks wrong on the error paths +- */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; +- struct ext3_dir_entry_2 * de, * de1; ++ struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; ++#ifdef CONFIG_EXT3_INDEX ++ int dx_fallback=0; ++#endif ++ unsigned blocksize; ++ unsigned nlen, rlen; ++ u32 block, blocks; + + sb = dir->i_sb; +- +- if (!namelen) ++ blocksize = sb->s_blocksize; ++ if (!dentry->d_name.len) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ retval = ext3_dx_add_entry(handle, dentry, inode); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ return retval; ++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; ++ dx_fallback++; ++ ext3_mark_inode_dirty(handle, dir); ++ } ++#endif ++ blocks = dir->i_size >> sb->s_blocksize_bits; ++ for (block = 0, offset = 0; block < blocks; block++) { ++ bh = ext3_bread(handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (retval != -ENOSPC) ++ return retval; ++ ++#ifdef CONFIG_EXT3_INDEX ++ if (blocks == 1 && !dx_fallback && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ return make_indexed_dir(handle, dentry, inode, bh); ++#endif ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size 
<= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; +- } +- +- ext3_debug ("creating next block\n"); +- +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- } else { +- +- ext3_debug ("skipping to next block\n"); ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} + +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- } +- } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block * sb = dir->i_sb; ++ struct ext3_dir_entry_2 *de; ++ int err; ++ ++ frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ entries = frame->entries; ++ at = frame->at; ++ ++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ goto cleanup; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (err != -ENOSPC) { ++ bh = 0; ++ goto cleanup; ++ } ++ ++ /* Block full, should compress but for now just split */ ++ dxtrace(printk("using %u of %u node entries\n", ++ 
dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ ++ if (levels && (dx_get_count(frames->entries) == ++ dx_get_limit(frames->entries))) { ++ ext3_warning(sb, __FUNCTION__, ++ "Directory index full!\n"); ++ err = -ENOSPC; ++ goto cleanup; + } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; ++ ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) ++ goto cleanup; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); ++ node2->fake.inode = 0; ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ if (levels) { ++ unsigned icount1 = icount/2, icount2 = icount - icount1;+ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ err = ext3_journal_get_write_access(handle, ++ frames[0].bh); ++ if (err) ++ goto journal_error; ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1),+ icount2 * sizeof(struct dx_entry)); ++ dx_set_count 
(entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries >= icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. +- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- dir->i_version = ++event; +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); +- brelse(bh); +- return 0; ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace(dx_show_index ("node", frames[1].entries)); ++ dxtrace(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ } else { ++ dxtrace(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + 
entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ err = ext3_journal_get_write_access(handle, ++ frame->bh); ++ if (err) ++ goto journal_error; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- brelse (bh); +- return -ENOSPC; ++ de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ if (!de) ++ goto cleanup; ++ err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ bh = 0; ++ goto cleanup; ++ ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++cleanup: ++ if (bh) ++ brelse(bh); ++ dx_release(frames); ++ return err; + } ++#endif + + /* + * ext3_delete_entry deletes a directory entry by merging it with the +@@ -453,9 +1532,11 @@ + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -480,9 +1561,11 @@ + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -512,9 +1595,11 @@ + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -526,7 +1611,8 @@ + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; +- inode->i_size = inode->u.ext3_i.i_disksize = 
inode->i_sb->s_blocksize; ++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; ++ inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ +@@ -555,21 +1641,19 @@ + brelse (dir_block); + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); +- if (err) +- goto out_no_entry; ++ if (err) { ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + dir->i_nlink++; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- inode->i_nlink = 0; +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + /* +@@ -656,7 +1740,7 @@ + int err = 0, rc; + + lock_super(sb); +- if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks +@@ -697,7 +1781,7 @@ + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. 
*/ + if (!err) +- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", +@@ -715,25 +1799,26 @@ + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); +- if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); +- prev = inode->u.ext3_i.i_orphan.prev; ++ prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + +- list_del(&inode->u.ext3_i.i_orphan); +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ list_del(&ei->i_orphan); ++ INIT_LIST_HEAD(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on +@@ -794,8 +1879,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); +@@ -833,7 +1919,7 @@ + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -851,8 +1937,9 @@ + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -879,7 +1966,7 @@ + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- 
dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -905,9 +1992,11 @@ + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -917,7 +2006,7 @@ + if (IS_ERR(inode)) + goto out_stop; + +- if (l > sizeof (inode->u.ext3_i.i_data)) { ++ if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* +@@ -926,25 +2015,23 @@ + * i_size in generic_commit_write(). + */ + err = block_symlink(inode, symname, l); +- if (err) +- goto out_no_entry; ++ if (err) { ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; +- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); ++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- ext3_dec_count(handle, inode); +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, +@@ -957,12 +2044,15 @@ + if (S_ISDIR(inode->i_mode)) + return -EPERM; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (inode->i_nlink >= EXT3_LINK_MAX) { + return -EMLINK; ++ } + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ 
EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -996,9 +2086,11 @@ + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; +@@ -1078,7 +2170,7 @@ + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1090,7 +2182,7 @@ + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } +Index: linux-2.4.19/fs/ext3/super.c +=================================================================== +--- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 18:25:03.000000000 -0400 ++++ linux-2.4.19/fs/ext3/super.c 2004-04-23 18:26:27.000000000 -0400 +@@ -741,6 +741,7 @@ + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO +@@ -751,6 +752,7 @@ + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +@@ -925,6 +927,7 @@ + return res; + } + ++ + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { +@@ -1113,6 +1116,9 @@ + sbi->s_mount_state = le16_to_cpu(es->s_state); + 
sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); ++ for (i=0; i < 4; i++) ++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); ++ sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +@@ -1821,6 +1827,7 @@ + exit_ext3_xattr(); + } + ++EXPORT_SYMBOL(ext3_force_commit); + EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +Index: linux-2.4.19/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.19.orig/include/linux/ext3_fs.h 2004-04-23 17:53:55.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_fs.h 2004-04-23 18:26:27.000000000 -0400 +@@ -40,6 +40,11 @@ + #define EXT3FS_VERSION "2.4-0.9.18" + + /* ++ * Always enable hashed directories ++ */ ++#define CONFIG_EXT3_INDEX ++ ++/* + * Debug code + */ + #ifdef EXT3FS_DEBUG +@@ -414,8 +419,11 @@ + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ +- +-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ ++ __u32 s_hash_seed[4]; /* HTREE hash seed */ ++ __u8 s_def_hash_version; /* Default hash version to use */ ++ __u8 s_reserved_char_pad; ++ __u16 s_reserved_word_pad; ++ __u32 s_reserved[192]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -552,9 +560,46 @@ + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#ifdef CONFIG_EXT3_INDEX ++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) ++#define EXT3_DIR_LINK_MAX(dir) 
(!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif ++ ++/* Legal values for the dx_root hash_version field: */ ++ ++#define DX_HASH_LEGACY 0 ++#define DX_HASH_HALF_MD4 1 ++#define DX_HASH_TEA 2 ++ ++/* hash info structure used by the directory hash */ ++struct dx_hash_info ++{ ++ u32 hash; ++ u32 minor_hash; ++ int hash_version; ++ u32 *seed; ++}; + + #ifdef __KERNEL__ + /* ++ * Control parameters used by ext3_htree_next_block ++ */ ++#define HASH_NB_ALWAYS 1 ++ ++ ++/* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc +@@ -564,6 +609,27 @@ + unsigned long block_group; + }; + ++ ++/* ++ * This structure is stuffed into the struct file's private_data field ++ * for directories. It is where we put information so that we can do ++ * readdir operations in hash tree order. ++ */ ++struct dir_private_info { ++ rb_root_t root; ++ rb_node_t *curr_node; ++ struct fname *extra_fname; ++ loff_t last_pos; ++ __u32 curr_hash; ++ __u32 curr_minor_hash; ++ __u32 next_hash; ++}; ++ ++/* ++ * Special error return code only used by dx_probe() and its callers. 
++ */ ++#define ERR_BAD_DX_DIR -75000 ++ + /* + * Function prototypes + */ +@@ -591,11 +657,20 @@ + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, struct buffer_head *, +- unsigned long); ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent); ++extern void ext3_htree_free_dir_info(struct dir_private_info *p); ++ + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + ++/* hash.c */ ++extern int ext3fs_dirhash(const char *name, int len, struct ++ dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); +@@ -628,6 +703,8 @@ + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); ++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) 
+Index: linux-2.4.19/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.19.orig/include/linux/ext3_fs_sb.h 2004-04-23 17:53:54.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_fs_sb.h 2004-04-23 18:26:27.000000000 -0400 +@@ -62,6 +62,8 @@ + int s_inode_size; + int s_first_ino; + u32 s_next_generation; ++ u32 s_hash_seed[4]; ++ int s_def_hash_version; + + /* Journaling */ + struct inode * s_journal_inode; +Index: linux-2.4.19/include/linux/ext3_jbd.h +=================================================================== +--- linux-2.4.19.orig/include/linux/ext3_jbd.h 2004-04-23 17:53:54.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_jbd.h 2004-04-23 18:26:27.000000000 -0400 +@@ -69,6 +69,8 @@ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +Index: linux-2.4.19/include/linux/rbtree.h +=================================================================== +--- linux-2.4.19.orig/include/linux/rbtree.h 2001-11-22 14:46:18.000000000 -0500 ++++ linux-2.4.19/include/linux/rbtree.h 2004-04-23 18:26:27.000000000 -0400 +@@ -120,6 +120,8 @@ + + extern void rb_insert_color(rb_node_t *, rb_root_t *); + extern void rb_erase(rb_node_t *, rb_root_t *); ++extern rb_node_t *rb_get_first(rb_root_t *root); ++extern rb_node_t *rb_get_next(rb_node_t *n); + + static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) + { +Index: linux-2.4.19/lib/rbtree.c +=================================================================== +--- linux-2.4.19.orig/lib/rbtree.c 2002-08-02 20:39:46.000000000 -0400 ++++ linux-2.4.19/lib/rbtree.c 2004-04-23 18:26:27.000000000 -0400 +@@ -17,6 +17,8 @@ + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c ++ ++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 + */ + + #include <linux/rbtree.h> +@@ -294,3 +296,43 @@ + 
__rb_erase_color(child, parent, root); + } + EXPORT_SYMBOL(rb_erase); ++ ++/* ++ * This function returns the first node (in sort order) of the tree. ++ */ ++rb_node_t *rb_get_first(rb_root_t *root) ++{ ++ rb_node_t *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return 0; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++EXPORT_SYMBOL(rb_get_first); ++ ++/* ++ * Given a node, this function will return the next node in the tree. ++ */ ++rb_node_t *rb_get_next(rb_node_t *n) ++{ ++ rb_node_t *parent; ++ ++ if (n->rb_right) { ++ n = n->rb_right; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++ } else { ++ while ((parent = n->rb_parent)) { ++ if (n == parent->rb_left) ++ return parent; ++ n = parent; ++ } ++ return 0; ++ } ++} ++EXPORT_SYMBOL(rb_get_next); ++ diff --git a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch index 3c6b5e5..1e26b8c 100644 --- a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.4.19.SuSE/fs/ext3/namei.c +Index: linux-2.4.19/fs/ext3/namei.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:14:50 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:18:04 2003 +--- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 ++++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:37:37.000000000 -0400 @@ -1751,8 +1751,8 @@ struct super_block *sb = inode->i_sb; struct ext3_iloc iloc; @@ -25,7 +25,7 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c @@ -1813,20 +1813,19 @@ { struct list_head *prev; - struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); unsigned long ino_next; @@ -33,15 +33,15 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c int err = 0; - 
lock_super(inode->i_sb); -+ down(&sbi->s_orphan_lock); - if (list_empty(&ei->i_orphan)) { ++ down(&sbi->s_orphan_lock); + if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); -+ up(&sbi->s_orphan_lock); ++ up(&sbi->s_orphan_lock); return 0; } ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; + prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); @@ -59,11 +59,11 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c return err; out_brelse: -Index: linux-2.4.19.SuSE/fs/ext3/super.c +Index: linux-2.4.19/fs/ext3/super.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:18:04 2003 -@@ -1182,6 +1182,7 @@ +--- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 22:30:41.000000000 -0400 ++++ linux-2.4.19/fs/ext3/super.c 2004-04-23 22:36:22.000000000 -0400 +@@ -1179,6 +1179,7 @@ */ sb->s_op = &ext3_sops; INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ @@ -71,10 +71,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/super.c sb->s_root = 0; -Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h +Index: linux-2.4.19/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h Sat Nov 15 23:58:28 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h Sun Nov 16 01:18:41 2003 +--- linux-2.4.19.orig/include/linux/ext3_fs_sb.h 2004-04-23 18:26:27.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_fs_sb.h 2004-04-23 22:36:22.000000000 -0400 @@ -69,6 +69,7 @@ struct inode * s_journal_inode; struct journal_s * s_journal; diff --git a/lustre/kernel_patches/patches/ext3-trusted_ea-suse-2.4.19.patch b/lustre/kernel_patches/patches/ext3-trusted_ea-suse-2.4.19.patch new file mode 100644 index 0000000..1c31052 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-trusted_ea-suse-2.4.19.patch 
@@ -0,0 +1,179 @@ + fs/ext3/xattr.c | 12 +++++- + fs/ext3/xattr_trusted.c | 86 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_xattr.h | 6 +++ + 3 files changed, 102 insertions(+), 2 deletions(-) + +Index: linux-2.4.19/fs/ext3/xattr.c +=================================================================== +--- linux-2.4.19.orig/fs/ext3/xattr.c 2004-04-23 22:44:57.000000000 -0400 ++++ linux-2.4.19/fs/ext3/xattr.c 2004-04-23 22:45:20.000000000 -0400 +@@ -1785,18 +1785,25 @@ + int __init + init_ext3_xattr(void) + { ++ int error; ++ + ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, + sizeof(struct mb_cache_entry) + + sizeof(struct mb_cache_entry_index), 1, 61); + if (!ext3_xattr_cache) + return -ENOMEM; + +- return 0; ++ error = init_ext3_xattr_trusted(); ++ if (error) ++ mb_cache_destroy(ext3_xattr_cache); ++ ++ return error; + } + + void + exit_ext3_xattr(void) + { ++ exit_ext3_xattr_trusted(); + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +@@ -1807,12 +1814,13 @@ + int __init + init_ext3_xattr(void) + { +- return 0; ++ return init_ext3_xattr_trusted(); + } + + void + exit_ext3_xattr(void) + { ++ exit_ext3_xattr_trusted(); + } + + #endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +Index: linux-2.4.19/fs/ext3/xattr_trusted.c +=================================================================== +--- linux-2.4.19.orig/fs/ext3/xattr_trusted.c 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.19/fs/ext3/xattr_trusted.c 2004-04-23 22:45:20.000000000 -0400 +@@ -0,0 +1,86 @@ ++/* ++ * linux/fs/ext3/xattr_trusted.c ++ * Handler for trusted extended attributes. ++ * ++ * Copyright (C) 2003 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define XATTR_TRUSTED_PREFIX "trusted." 
++ ++static size_t ++ext3_xattr_trusted_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_trusted_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_trusted_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_TRUSTED, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = ext3_xattr_trusted_list, ++ .get = ext3_xattr_trusted_get, ++ .set = ext3_xattr_trusted_set, ++}; ++ ++int __init ++init_ext3_xattr_trusted(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_TRUSTED, ++ &ext3_xattr_trusted_handler); ++} ++ ++void ++exit_ext3_xattr_trusted(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED, ++ &ext3_xattr_trusted_handler); ++} +Index: linux-2.4.19/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.orig/fs/ext3/Makefile 2004-04-23 22:38:38.000000000 -0400 ++++ linux-2.4.19/fs/ext3/Makefile 2004-04-23 22:49:23.000000000 -0400 
+@@ -12,7 +12,7 @@ + export-objs := super.o inode.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o hash.o xattr_trusted.o + obj-m := $(O_TARGET) + + obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o +Index: linux-2.4.19/include/linux/ext3_xattr.h +=================================================================== +--- linux-2.4.19.orig/include/linux/ext3_xattr.h 2004-04-23 17:53:54.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_xattr.h 2004-04-23 22:45:20.000000000 -0400 +@@ -21,6 +21,9 @@ + #define EXT3_XATTR_INDEX_USER 1 + #define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 + #define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++#define EXT3_XATTR_INDEX_TRUSTED 4 ++#define EXT3_XATTR_INDEX_LUSTRE 5 ++#define EXT3_XATTR_INDEX_SECURITY 6 + + struct ext3_xattr_header { + __u32 h_magic; /* magic number for identification */ +@@ -84,6 +87,9 @@ + extern int init_ext3_xattr(void) __init; + extern void exit_ext3_xattr(void); + ++extern int init_ext3_xattr_trusted(void) __init; ++extern void exit_ext3_xattr_trusted(void); ++ + # else /* CONFIG_EXT3_FS_XATTR */ + # define ext3_setxattr NULL + # define ext3_getxattr NULL diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch b/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch index 1dab6d8..5a5dc5a 100644 --- a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch +++ b/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch @@ -1,11 +1,11 @@ ./fs/ext3/namei.c | 11 +++++------ 1 files changed, 5 insertions(+), 6 deletions(-) -Index: linux-2.4.19.SuSE/./fs/ext3/namei.c +Index: linux-2.4.19/fs/ext3/namei.c =================================================================== ---- linux-2.4.19.SuSE.orig/./fs/ext3/namei.c Sun Nov 16 01:02:51 2003 -+++ linux-2.4.19.SuSE/./fs/ext3/namei.c Sun Nov 16 01:14:50 2003 -@@ -1523,8 +1523,11 @@ +--- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 
22:30:41.000000000 -0400 ++++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 +@@ -1522,8 +1522,11 @@ { int err = ext3_add_entry(handle, dentry, inode); if (!err) { @@ -19,7 +19,7 @@ Index: linux-2.4.19.SuSE/./fs/ext3/namei.c } ext3_dec_count(handle, inode); iput(inode); -@@ -1560,7 +1563,6 @@ +@@ -1559,7 +1562,6 @@ inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; inode->i_mapping->a_ops = &ext3_aops; @@ -27,7 +27,7 @@ Index: linux-2.4.19.SuSE/./fs/ext3/namei.c err = ext3_add_nondir(handle, dentry, inode); } ext3_journal_stop(handle, dir); -@@ -1590,7 +1592,6 @@ +@@ -1589,7 +1591,6 @@ #ifdef CONFIG_EXT3_FS_XATTR inode->i_op = &ext3_special_inode_operations; #endif @@ -38,7 +38,7 @@ Index: linux-2.4.19.SuSE/./fs/ext3/namei.c @@ -2039,7 +2040,6 @@ inode->i_size = l-1; } - EXT3_I(inode)->i_disksize = inode->i_size; + EXT3_I(inode)->i_disksize = inode->i_size; - ext3_mark_inode_dirty(handle, inode); err = ext3_add_nondir(handle, dentry, inode); out_stop: diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..36e59d3 --- /dev/null +++ b/lustre/kernel_patches/patches/invalidate_show-2.4.21-sles8sp3.patch @@ -0,0 +1,134 @@ + + + + fs/inode.c | 21 ++++++++++++++------- + fs/smbfs/inode.c | 2 +- + fs/super.c | 4 ++-- + include/linux/fs.h | 2 +- + 4 files changed, 18 insertions(+), 11 deletions(-) + +Index: linux-2.4.21/fs/inode.c +=================================================================== +--- linux-2.4.21.orig/fs/inode.c 2004-04-24 02:38:51.000000000 -0400 ++++ linux-2.4.21/fs/inode.c 2004-04-26 19:41:58.000000000 -0400 +@@ -651,7 +651,8 @@ + /* + * Invalidate all inodes for a device. 
+ */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -676,6 +677,11 @@ + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR ++ "inode busy: dev %s:%lu (%p) mode %o count %u\n", ++ kdevname(sb->s_dev), inode->i_ino, inode, ++ inode->i_mode, atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -694,22 +700,23 @@ + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -735,7 +742,7 @@ + * hold). 
+ */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_buffers(dev); +Index: linux-2.4.21/fs/super.c +=================================================================== +--- linux-2.4.21.orig/fs/super.c 2004-04-24 02:38:51.000000000 -0400 ++++ linux-2.4.21/fs/super.c 2004-04-26 19:41:58.000000000 -0400 +@@ -932,7 +932,7 @@ + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -941,7 +941,7 @@ + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. Have a nice day...\n"); + } +Index: linux-2.4.21/fs/smbfs/inode.c +=================================================================== +--- linux-2.4.21.orig/fs/smbfs/inode.c 2004-04-24 02:38:44.000000000 -0400 ++++ linux-2.4.21/fs/smbfs/inode.c 2004-04-26 19:41:58.000000000 -0400 +@@ -167,7 +167,7 @@ + { + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); +- invalidate_inodes(SB_of(server)); ++ invalidate_inodes(SB_of(server), 0); + } + + /* +Index: linux-2.4.21/fs/ntfs/super.c +=================================================================== +--- linux-2.4.21.orig/fs/ntfs/super.c 2004-04-24 02:38:38.000000000 -0400 ++++ linux-2.4.21/fs/ntfs/super.c 2004-04-26 19:42:12.000000000 -0400 +@@ -1604,7 +1604,7 @@ + * method again... FIXME: Do we need to do this twice now because of + * attribute inodes? I think not, so leave as is for now... (AIA) + */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 0)) { + ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " + "driver bug."); + /* Copied from fs/super.c. I just love this message. 
(-; */ +Index: linux-2.4.21/include/linux/fs.h +=================================================================== +--- linux-2.4.21.orig/include/linux/fs.h 2004-04-26 19:06:32.000000000 -0400 ++++ linux-2.4.21/include/linux/fs.h 2004-04-26 19:41:58.000000000 -0400 +@@ -1401,7 +1401,7 @@ + extern void set_buffer_flushtime(struct buffer_head *); + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes(struct super_block *, int); + extern int invalidate_device(kdev_t, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch b/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch index ad213c9..01c040c 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch @@ -8,10 +8,10 @@ include/linux/ext3_fs.h | 2 8 files changed, 318 insertions(+), 2 deletions(-) -Index: linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt +Index: linux-2.4.19/Documentation/filesystems/ext2.txt =================================================================== ---- linux-2.4.19.SuSE.orig/Documentation/filesystems/ext2.txt Wed Jul 11 15:44:45 2001 -+++ linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 ++++ linux-2.4.19/Documentation/filesystems/ext2.txt 2004-04-23 22:37:48.000000000 -0400 @@ -35,6 +35,22 @@ sb=n Use alternate superblock at this location. @@ -35,23 +35,23 @@ Index: linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
-Index: linux-2.4.19.SuSE/fs/ext3/Makefile +Index: linux-2.4.19/fs/ext3/Makefile =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/Makefile Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/fs/ext3/Makefile 2004-04-23 18:26:27.000000000 -0400 ++++ linux-2.4.19/fs/ext3/Makefile 2004-04-23 22:38:38.000000000 -0400 @@ -11,7 +11,7 @@ - export-objs := ext3-exports.o + export-objs := super.o inode.o -obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + ioctl.o namei.o super.o symlink.o hash.o obj-m := $(O_TARGET) -Index: linux-2.4.19.SuSE/fs/ext3/inode.c +Index: linux-2.4.19/fs/ext3/inode.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:26:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/fs/ext3/inode.c 2004-04-23 22:37:42.000000000 -0400 ++++ linux-2.4.19/fs/ext3/inode.c 2004-04-23 22:37:48.000000000 -0400 @@ -34,6 +34,7 @@ #include #include @@ -70,10 +70,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/inode.c if(ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; -Index: linux-2.4.19.SuSE/fs/ext3/iopen.c +Index: linux-2.4.19/fs/ext3/iopen.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003 -+++ linux-2.4.19.SuSE/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/fs/ext3/iopen.c 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.19/fs/ext3/iopen.c 2004-04-23 22:37:48.000000000 -0400 @@ -0,0 +1,258 @@ +/* + * linux/fs/ext3/iopen.c @@ -333,10 +333,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.4.19.SuSE/fs/ext3/iopen.h +Index: 
linux-2.4.19/fs/ext3/iopen.h =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003 -+++ linux-2.4.19.SuSE/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/fs/ext3/iopen.h 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.19/fs/ext3/iopen.h 2004-04-23 22:37:48.000000000 -0400 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -353,10 +353,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.h +extern int ext3_iopen_get_inode(struct inode *inode); +extern struct dentry *iopen_connect_dentry(struct dentry *de, + struct inode *inode); -Index: linux-2.4.19.SuSE/fs/ext3/namei.c +Index: linux-2.4.19/fs/ext3/namei.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:27:31 2003 +--- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:37:42.000000000 -0400 ++++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:37:48.000000000 -0400 @@ -36,7 +36,7 @@ #include #include @@ -366,7 +366,7 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c /* * define how far ahead to read directories while searching them. 
-@@ -922,10 +922,14 @@ +@@ -928,10 +928,14 @@ struct inode * inode; struct ext3_dir_entry_2 * de; struct buffer_head * bh; @@ -381,7 +381,7 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -943,7 +948,28 @@ +@@ -943,7 +947,28 @@ return ERR_PTR(-EACCES); } } @@ -411,11 +411,11 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c return NULL; } -Index: linux-2.4.19.SuSE/fs/ext3/super.c +Index: linux-2.4.19/fs/ext3/super.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:27:31 2003 -@@ -864,6 +864,18 @@ +--- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 22:37:42.000000000 -0400 ++++ linux-2.4.19/fs/ext3/super.c 2004-04-23 22:37:48.000000000 -0400 +@@ -861,6 +861,18 @@ || !strcmp (this_char, "quota") || !strcmp (this_char, "usrquota")) /* Don't do anything ;-) */ ; @@ -434,10 +434,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/super.c else if (!strcmp (this_char, "journal")) { /* @@@ FIXME */ /* Eventually we will want to be able to create -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h +Index: linux-2.4.19/include/linux/ext3_fs.h =================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:30:05 2003 +--- linux-2.4.19.orig/include/linux/ext3_fs.h 2004-04-23 22:37:42.000000000 -0400 ++++ linux-2.4.19/include/linux/ext3_fs.h 2004-04-23 22:37:48.000000000 -0400 @@ -324,6 +324,8 @@ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..9258544 --- /dev/null +++ 
b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch @@ -0,0 +1,449 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 4 + fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 13 + + fs/ext3/namei.c | 13 + + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 318 insertions(+), 2 deletions(-) + +Index: linux-2.4.21/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-2.4.21.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 ++++ linux-2.4.21/Documentation/filesystems/ext2.txt 2004-04-24 02:46:32.000000000 -0400 +@@ -35,6 +35,22 @@ + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
+ + +Index: linux-2.4.21/fs/ext3/Makefile +=================================================================== +--- linux-2.4.21.orig/fs/ext3/Makefile 2004-04-24 02:46:18.000000000 -0400 ++++ linux-2.4.21/fs/ext3/Makefile 2004-04-24 02:47:02.000000000 -0400 +@@ -11,7 +11,7 @@ + + export-objs := ext3-exports.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + +Index: linux-2.4.21/fs/ext3/inode.c +=================================================================== +--- linux-2.4.21.orig/fs/ext3/inode.c 2004-04-24 02:46:19.000000000 -0400 ++++ linux-2.4.21/fs/ext3/inode.c 2004-04-24 02:46:32.000000000 -0400 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2252,6 +2253,9 @@ + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +Index: linux-2.4.21/fs/ext3/iopen.c +=================================================================== +--- linux-2.4.21.orig/fs/ext3/iopen.c 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.21/fs/ext3/iopen.c 2004-04-24 02:46:32.000000000 -0400 +@@ -0,0 +1,258 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. 
++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. ++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ ++ /* 
preferably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ * Caller must hold dcache_lock. 
++ */ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ return NULL; ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del_init(&goal->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ __d_rehash(goal, 0); ++ ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. 
++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. ++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.4.21/fs/ext3/iopen.h +=================================================================== +--- linux-2.4.21.orig/fs/ext3/iopen.h 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.21/fs/ext3/iopen.h 2004-04-24 02:46:32.000000000 -0400 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *de, ++ struct inode *inode); +Index: linux-2.4.21/fs/ext3/namei.c +=================================================================== +--- linux-2.4.21.orig/fs/ext3/namei.c 2004-04-24 02:46:19.000000000 -0400 ++++ linux-2.4.21/fs/ext3/namei.c 2004-04-24 02:46:32.000000000 -0400 +@@ -36,7 +36,7 @@ + #include + #include + #include +- ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. +@@ -928,10 +928,14 @@ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -943,7 +947,28 @@ + return ERR_PTR(-EACCES); + } + } +- d_add(dentry, inode); ++ ++ /* verify this dentry is really new */ ++ assert(!dentry->d_inode); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ return alternate; ++ } ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ if (inode) /* d_instantiate */ ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ + return NULL; + } + +Index: linux-2.4.21/fs/ext3/super.c +=================================================================== +--- linux-2.4.21.orig/fs/ext3/super.c 2004-04-24 02:46:19.000000000 -0400 ++++ linux-2.4.21/fs/ext3/super.c 2004-04-24 02:46:32.000000000 
-0400 +@@ -869,6 +869,18 @@ + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +Index: linux-2.4.21/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.21.orig/include/linux/ext3_fs.h 2004-04-24 02:46:19.000000000 -0400 ++++ linux-2.4.21/include/linux/ext3_fs.h 2004-04-24 02:46:32.000000000 -0400 +@@ -324,6 +324,8 @@ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/kernel_text_address-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/kernel_text_address-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..0541a48 --- /dev/null +++ b/lustre/kernel_patches/patches/kernel_text_address-2.4.21-sles8sp3.patch @@ -0,0 +1,115 @@ +Index: linux-2.4.21/arch/um/kernel/Makefile +=================================================================== +--- linux-2.4.21.orig/arch/um/kernel/Makefile 2004-04-24 02:37:58.000000000 -0400 ++++ linux-2.4.21/arch/um/kernel/Makefile 2004-04-24 02:51:03.000000000 -0400 +@@ -37,7 
+37,8 @@ + export-objs-$(CONFIG_GPROF) += gprof_syms.o + export-objs-$(CONFIG_GCOV) += gmon_syms.o + +-export-objs = ksyms.o process_kern.o signal_kern.o user_syms.o $(export-objs-y) ++export-objs = ksyms.o process_kern.o signal_kern.o user_syms.o sysrq.o \ ++ $(export-objs-y) + + CFLAGS_user_syms.o = -D__AUTOCONF_INCLUDED__ $(DMODULES-y) $(DMODVERSIONS-y) \ + -I/usr/include -I../include +Index: linux-2.4.21/arch/um/kernel/sysrq.c +=================================================================== +--- linux-2.4.21.orig/arch/um/kernel/sysrq.c 2004-04-24 02:37:58.000000000 -0400 ++++ linux-2.4.21/arch/um/kernel/sysrq.c 2004-04-24 02:51:03.000000000 -0400 +@@ -86,6 +86,37 @@ + show_trace((unsigned long *)esp); + } + ++#ifdef CONFIG_MODULES ++extern struct module *module_list; ++extern struct module kernel_module; ++#endif ++ ++int is_kernel_text_address(unsigned long addr) ++{ ++ int retval = 0; ++#ifdef CONFIG_MODULES ++ struct module *mod; ++#endif ++ if (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext) ++ return 1; ++ ++#ifdef CONFIG_MODULES ++ for (mod = module_list; mod != &kernel_module; mod = mod->next) { ++ /* mod_bound tests for addr being inside the vmalloc'ed ++ * module area. Of course it'd be better to test only ++ * for the .text subset... */ ++ if (mod_bound(addr, 0, mod)) { ++ retval = 1; ++ break; ++ } ++ } ++#endif ++ return retval; ++} ++ ++EXPORT_SYMBOL(is_kernel_text_address); ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +Index: linux-2.4.21/arch/i386/kernel/Makefile +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/Makefile 2004-04-24 02:39:05.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/Makefile 2004-04-24 02:51:26.000000000 -0400 +@@ -20,7 +20,7 @@ + + O_TARGET := kernel.o + +-export-objs := mca.o mtrr.o msr.o cpuid.o microcode.o i386_ksyms.o time.o traps.o dr_alloc.o ++export-objs := mca.o mtrr.o msr.o cpuid.o microcode.o i386_ksyms.o time.o traps.o dr_alloc.o traps.o + + ifdef CONFIG_X86_SPEEDSTEP_ICH + export-objs += speedstep-lib.o +Index: linux-2.4.21/arch/i386/kernel/traps.c +=================================================================== +--- linux-2.4.21.orig/arch/i386/kernel/traps.c 2004-04-24 02:42:58.000000000 -0400 ++++ linux-2.4.21/arch/i386/kernel/traps.c 2004-04-24 02:51:03.000000000 -0400 +@@ -1339,3 +1339,41 @@ + cobalt_init(); + #endif + } ++ ++#ifdef CONFIG_MODULES ++extern struct module *module_list; ++extern struct module kernel_module; ++#endif ++ ++int is_kernel_text_address(unsigned long addr) ++{ ++ int retval = 0; /* stays 0 unless a module below claims addr */ ++#ifdef CONFIG_MODULES ++ struct module *mod; ++#endif ++ if (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext) ++ return 1; /* addr lies within [_stext, _etext] */ ++ ++#ifdef CONFIG_MODULES ++ for (mod = module_list; mod != &kernel_module; mod = mod->next) { ++ /* mod_bound tests for addr being inside the vmalloc'ed ++ * module area. Of course it'd be better to test only ++ * for the .text subset... 
*/ ++ if (mod_bound(addr, 0, mod)) { ++ retval = 1; ++ break; ++ } ++ } ++#endif ++ ++ return retval; ++} ++ ++int lookup_symbol(unsigned long address, char *buf, int buflen) ++{ ++ return -ENOSYS; ++} ++ ++EXPORT_SYMBOL_GPL(is_kernel_text_address); ++EXPORT_SYMBOL_GPL(lookup_symbol); diff --git a/lustre/kernel_patches/patches/linux-2.4.19-pre1-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.19-pre1-xattr-0.8.54.patch index 4cf7592..e694068 100644 --- a/lustre/kernel_patches/patches/linux-2.4.19-pre1-xattr-0.8.54.patch +++ b/lustre/kernel_patches/patches/linux-2.4.19-pre1-xattr-0.8.54.patch @@ -1881,7 +1881,7 @@ Index: linux-2.4.19-pre1/fs/ext2/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1900,7 +1900,7 @@ Index: linux-2.4.19-pre1/fs/ext2/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3563,7 +3563,7 @@ Index: linux-2.4.19-pre1/fs/ext3/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block.
*/ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3588,7 +3588,7 @@ Index: linux-2.4.19-pre1/fs/ext3/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch b/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch index 26d3af9..79c48fb 100644 --- a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch +++ b/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch @@ -2,10 +2,10 @@ ext3/ext3-exports.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) -Index: linux-2.4.19.SuSE/fs/ext2/super.c +Index: linux-2.4.19/fs/ext2/super.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext2/super.c Sun Nov 16 00:40:59 2003 +--- linux-2.4.19.orig/fs/ext2/super.c 2004-04-23 17:53:55.000000000 -0400 ++++ linux-2.4.19/fs/ext2/super.c 2004-04-23 22:30:41.000000000 -0400 @@ -70,6 +70,7 @@ { va_list args; @@ -14,11 +14,11 @@ Index: linux-2.4.19.SuSE/fs/ext2/super.c if (!(sb->s_flags & MS_RDONLY)) { sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; sb->u.ext2_sb.s_es->s_state = -Index: linux-2.4.19.SuSE/fs/ext3/super.c +Index: linux-2.4.19/fs/ext3/super.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 00:40:59 2003 -@@ -1822,8 +1828,6 @@ +--- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 18:26:27.000000000 -0400 ++++ linux-2.4.19/fs/ext3/super.c 2004-04-23 
22:30:41.000000000 -0400 +@@ -1827,8 +1827,6 @@ exit_ext3_xattr(); } @@ -27,10 +27,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/super.c MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c +Index: linux-2.4.19/fs/ext3/ext3-exports.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c Sun Nov 16 00:40:58 2003 -+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c Sun Nov 16 00:40:59 2003 +--- linux-2.4.19.orig/fs/ext3/ext3-exports.c 2003-01-30 05:24:37.000000000 -0500 ++++ linux-2.4.19/fs/ext3/ext3-exports.c 2004-04-23 22:30:41.000000000 -0400 @@ -0,0 +1,13 @@ +#include +#include @@ -45,3 +45,16 @@ Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c +EXPORT_SYMBOL(ext3_xattr_get); +EXPORT_SYMBOL(ext3_xattr_list); +EXPORT_SYMBOL(ext3_xattr_set); +Index: linux-2.4.19/fs/ext3/Makefile +=================================================================== +--- linux-2.4.19.orig/fs/ext3/Makefile 2004-04-23 18:26:27.000000000 -0400 ++++ linux-2.4.19/fs/ext3/Makefile 2004-04-23 23:05:20.000000000 -0400 +@@ -9,7 +9,7 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch index 811c40f..5bdfaff 100644 --- a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch @@ -1786,7 +1786,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block.
*/ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1805,7 +1805,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3450,7 +3450,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3475,7 +3475,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch index c9fb126..f1365d7 100644 --- a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch @@ -1294,7 +1294,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1313,7 +1313,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -2957,7 +2957,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -2982,7 +2982,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch index 2e4750b..f078ebe 100644 --- a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch @@ -1821,7 +1821,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1840,7 +1840,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3485,7 +3485,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3510,7 +3510,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch index e18ac9d..8e198f8 100644 --- a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch +++ b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch @@ -1315,7 +1315,7 @@ Index: linux-2.4.21-chaos/fs/ext2/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1334,7 +1334,7 @@ Index: linux-2.4.21-chaos/fs/ext2/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse.patch b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse.patch index 22dad3c..da8c15c 100644 --- a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse.patch +++ b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse.patch @@ -1683,7 +1683,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1702,7 +1702,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3347,7 +3347,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3372,7 +3372,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.22-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.22-xattr-0.8.54.patch index b63cc2e..fd5f0c2 100644 --- a/lustre/kernel_patches/patches/linux-2.4.22-xattr-0.8.54.patch +++ b/lustre/kernel_patches/patches/linux-2.4.22-xattr-0.8.54.patch @@ -1738,7 +1738,7 @@ Index: linux-2.4.22-vanilla/fs/ext2/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1757,7 +1757,7 @@ Index: linux-2.4.22-vanilla/fs/ext2/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3426,7 +3426,7 @@ Index: linux-2.4.22-vanilla/fs/ext3/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3451,7 +3451,7 @@ Index: linux-2.4.22-vanilla/fs/ext3/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/linux-2.4.24-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.24-xattr-0.8.54.patch index 0109dd9..4cf5c2c 100644 --- a/lustre/kernel_patches/patches/linux-2.4.24-xattr-0.8.54.patch +++ b/lustre/kernel_patches/patches/linux-2.4.24-xattr-0.8.54.patch @@ -1738,7 +1738,7 @@ Index: linux-2.4.24-vanilla/fs/ext2/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1757,7 +1757,7 @@ Index: linux-2.4.24-vanilla/fs/ext2/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3426,7 +3426,7 @@ Index: linux-2.4.24-vanilla/fs/ext3/xattr.c + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3451,7 +3451,7 @@ Index: linux-2.4.24-vanilla/fs/ext3/xattr.c + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/patches/lustre_version.patch b/lustre/kernel_patches/patches/lustre_version.patch index 7ebb838..77c5531 100644 --- a/lustre/kernel_patches/patches/lustre_version.patch +++ b/lustre/kernel_patches/patches/lustre_version.patch @@ -7,6 +7,6 @@ --- /dev/null Fri Aug 30 17:31:37 2002 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h Thu Feb 13 07:58:33 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 34 ++#define LUSTRE_KERNEL_VERSION 35 _ diff --git a/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch b/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch new file mode 100644 index 0000000..5cc34b8 --- /dev/null +++ b/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch @@ -0,0 +1,50 @@ +Index: linux-2.4.20-30.9/scripts/mkdep.c +=================================================================== 
+--- linux-2.4.20-30.9.orig/scripts/mkdep.c 2004-02-19 19:40:51.000000000 -0500 ++++ linux-2.4.20-30.9/scripts/mkdep.c 2004-04-28 17:24:54.000000000 -0400 +@@ -48,8 +48,6 @@ + char __depname[512] = "\n\t@touch "; + #define depname (__depname+9) + int hasdep; +-char cwd[PATH_MAX]; +-int lcwd; + + struct path_struct { + int len; +@@ -204,22 +202,8 @@ + memcpy(path->buffer+path->len, name, len); + path->buffer[path->len+len] = '\0'; + if (access(path->buffer, F_OK) == 0) { +- int l = lcwd + strlen(path->buffer); +- char name2[l+2], *p; +- if (path->buffer[0] == '/') { +- memcpy(name2, path->buffer, l+1); +- } +- else { +- memcpy(name2, cwd, lcwd); +- name2[lcwd] = '/'; +- memcpy(name2+lcwd+1, path->buffer, path->len+len+1); +- } +- while ((p = strstr(name2, "/../"))) { +- *p = '\0'; +- strcpy(strrchr(name2, '/'), p+3); +- } + do_depname(); +- printf(" \\\n %s", name2); ++ printf(" \\\n %s", path->buffer); + return; + } + } +@@ -601,12 +585,6 @@ + return 1; + } + +- if (!getcwd(cwd, sizeof(cwd))) { +- fprintf(stderr, "mkdep: getcwd() failed %m\n"); +- return 1; +- } +- lcwd = strlen(cwd); +- + add_path("."); /* for #include "..." 
*/ + + while (++argv, --argc > 0) { diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..a7859bd --- /dev/null +++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-sles8sp3.patch @@ -0,0 +1,458 @@ +Index: linux-2.4.21/include/linux/skbuff.h +=================================================================== +--- linux-2.4.21.orig/include/linux/skbuff.h 2004-04-24 02:38:40.000000000 -0400 ++++ linux-2.4.21/include/linux/skbuff.h 2004-04-24 02:47:46.000000000 -0400 +@@ -116,6 +116,30 @@ + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -123,6 +147,12 @@ + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unneccessarily split the packet ++ * where consecutive zero-copy sends abutt. 
++ */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +Index: linux-2.4.21/include/net/tcp.h +=================================================================== +--- linux-2.4.21.orig/include/net/tcp.h 2004-04-24 02:39:20.000000000 -0400 ++++ linux-2.4.21/include/net/tcp.h 2004-04-24 02:48:27.000000000 -0400 +@@ -646,6 +646,8 @@ + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -742,6 +744,10 @@ + struct msghdr *msg, + int len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); ++ + + extern int tcp_listen_start(struct sock *sk); + +Index: linux-2.4.21/net/netsyms.c +=================================================================== +--- linux-2.4.21.orig/net/netsyms.c 2004-04-24 02:39:13.000000000 -0400 ++++ linux-2.4.21/net/netsyms.c 2004-04-24 02:47:46.000000000 -0400 +@@ -403,6 +403,8 @@ + + #endif + ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + EXPORT_SYMBOL(tcp_read_sock); + + EXPORT_SYMBOL(netlink_set_err); +Index: linux-2.4.21/net/core/skbuff.c +=================================================================== +--- linux-2.4.21.orig/net/core/skbuff.c 2004-04-24 02:38:40.000000000 -0400 ++++ linux-2.4.21/net/core/skbuff.c 2004-04-24 02:47:46.000000000 -0400 +@@ -208,6 +208,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; + return skb; + + nodata: +@@ -277,6 +279,10 @@ + { + if (!skb->cloned || + 
atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -535,6 +541,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ skb_shinfo(skb)->zccd2 = NULL; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -581,6 +589,14 @@ + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -623,6 +639,8 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ + + if (skb_shared(skb)) + BUG(); +@@ -644,6 +662,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data+nhead) - skb->head; +@@ -658,6 +681,8 @@ + skb->nh.raw += off; + skb->cloned = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +Index: linux-2.4.21/net/ipv4/tcp.c +=================================================================== +--- linux-2.4.21.orig/net/ipv4/tcp.c 2004-04-24 02:39:21.000000000 -0400 ++++ linux-2.4.21/net/ipv4/tcp.c 2004-04-24 02:50:40.000000000 -0400 +@@ -748,7 +748,7 @@ + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -827,7 +827,7 @@ + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -875,6 +875,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -885,6 +896,20 @@ + goto new_segment; + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref 
count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -948,7 +973,29 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); /* honour caller's page offset, as tcp_sendpage() does */ ++ ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; +@@ -1772,6 +1819,202 @@ + goto out; + } + ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff * skb; ++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. */ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling.
FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. */ ++ ++ skb = skb_peek(&sk->receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->err || ++ sk->state == TCP_CLOSE || ++ (sk->shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sk->done) ++ break; ++ ++ if (sk->err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->state == TCP_CLOSE) { ++ if (!sk->done) { ++ /* This occurs when user tries to read ++ * from never connected socket. ++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ cleanup_rbuf(sk, copied); ++ timeo = tcp_data_wait(sk, timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent date */ ++ if (!sk->urginline) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset + used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ 
__skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ tcp_eat_skb (sk, skb); ++ break; ++ } ++ ++ if (!eaten) ++ tcp_eat_skb (sk, skb); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. */ ++ cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; ++} ++ + /* + * State processing on a close. This implements the state shift for + * sending our FIN frame. 
Note that we only send a FIN for some diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch index dd07ef3..91dc15b 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch @@ -11,6 +11,47 @@ kernel/ksyms.c | 1 11 files changed, 564 insertions(+), 126 deletions(-) +Index: linux-2.4.18-p4smp/fs/dcache.c +=================================================================== +--- linux-2.4.18-p4smp.orig/fs/dcache.c 2004-02-03 01:00:10.000000000 -0500 ++++ linux-2.4.18-p4smp/fs/dcache.c 2004-03-19 16:05:42.000000000 -0500 +@@ -186,6 +186,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -859,13 +866,19 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ Index: linux-2.4.18-p4smp/fs/exec.c =================================================================== --- linux-2.4.18-p4smp.orig/fs/exec.c 2004-02-03 01:00:10.000000000 -0500 @@ -20,7 +61,7 @@ Index: linux-2.4.18-p4smp/fs/exec.c struct nameidata nd; int error; + struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; ++ .it_flags = FMODE_READ|FMODE_EXEC }; - error = user_path_walk(library, &nd); + error = user_path_walk_it(library, &nd, &it); @@ -37,14 +78,14 @@ Index: linux-2.4.18-p4smp/fs/exec.c error = PTR_ERR(file); if (IS_ERR(file)) goto out; -@@ -359,8 +362,9 @@ struct file *open_exec(const char *name) +@@ -359,8 +362,10 @@ struct file *open_exec(const char *name) struct inode *inode; struct file *file; int err = 0; -- -- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); + struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; ++ .it_flags = FMODE_READ|FMODE_EXEC }; + +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); + err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); file = ERR_PTR(err); if (!err) { @@ -76,164 +117,6 @@ Index: linux-2.4.18-p4smp/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.18-p4smp/fs/dcache.c -=================================================================== ---- linux-2.4.18-p4smp.orig/fs/dcache.c 2004-02-03 01:00:10.000000000 -0500 -+++ linux-2.4.18-p4smp/fs/dcache.c 
2004-03-19 16:05:42.000000000 -0500 -@@ -186,6 +186,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -859,13 +866,19 @@ void d_delete(struct dentry * dentry) - * Adds a dentry to the hash according to its name. - */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.18-p4smp/fs/namespace.c -=================================================================== ---- linux-2.4.18-p4smp.orig/fs/namespace.c 2004-02-03 01:00:10.000000000 -0500 -+++ linux-2.4.18-p4smp/fs/namespace.c 2004-03-19 16:05:42.000000000 -0500 -@@ -99,6 +99,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -110,6 +111,7 @@ static void attach_mnt(struct vfsmount * - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -485,14 +487,17 @@ static int do_loopback(struct nameidata - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = 
mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; -- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); -- if (err) -+ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - down_write(¤t->namespace->sem); - err = -EINVAL; -@@ -515,6 +520,7 @@ static int do_loopback(struct nameidata - } - - up_write(¤t->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -698,6 +704,7 @@ long do_mount(char * dev_name, char * di - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -722,10 +729,11 @@ long do_mount(char * dev_name, char * di - flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); - - /* ... and get the mountpoint */ -- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); -- if (retval) -+ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -- -+ } - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); -@@ -736,6 +744,8 @@ long do_mount(char * dev_name, char * di - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -901,6 +911,8 @@ asmlinkage long sys_pivot_root(const cha - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - int error; - - if (!capable(CAP_SYS_ADMIN)) -@@ -908,14 +920,14 @@ asmlinkage long sys_pivot_root(const cha - - lock_kernel(); - -- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); -+ error = __user_walk_it(new_root, 
LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); - if (error) - goto out0; - error = -EINVAL; - if (!check_mnt(new_nd.mnt)) - goto out1; - -- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); -+ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); - if (error) - goto out1; - -@@ -970,8 +982,10 @@ out2: - up(&old_nd.dentry->d_inode->i_zombie); - up_write(¤t->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); Index: linux-2.4.18-p4smp/fs/namei.c =================================================================== --- linux-2.4.18-p4smp.orig/fs/namei.c 2004-02-03 01:00:10.000000000 -0500 @@ -399,13 +282,11 @@ Index: linux-2.4.18-p4smp/fs/namei.c ; err = -ENOENT; -@@ -548,8 +585,8 @@ int link_path_walk(const char * name, st - if (!inode->i_op) +@@ -549,7 +586,7 @@ int link_path_walk(const char * name, st goto out_dput; -- if (inode->i_op->follow_link) { + if (inode->i_op->follow_link) { - err = do_follow_link(dentry, nd); -+ if (inode->i_op->follow_link || inode->i_op->follow_link2) { + err = do_follow_link(dentry, nd, NULL); dput(dentry); if (err) @@ -419,7 +300,7 @@ Index: linux-2.4.18-p4smp/fs/namei.c break; continue; /* here ends the main loop */ -@@ -592,22 +629,23 @@ last_component: +@@ -592,22 +629,22 @@ last_component: if (err < 0) break; } @@ -440,10 +321,8 @@ Index: linux-2.4.18-p4smp/fs/namei.c ; inode = dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) -- && inode && inode->i_op && inode->i_op->follow_link) { + && inode && inode->i_op && inode->i_op->follow_link) { - err = do_follow_link(dentry, nd); -+ && inode && inode->i_op && -+ (inode->i_op->follow_link || inode->i_op->follow_link2)) { + err = do_follow_link(dentry, nd, it); dput(dentry); if (err) @@ -471,7 +350,7 @@ Index: linux-2.4.18-p4smp/fs/namei.c + if (err) + 
break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -1046,6 +925,122 @@ Index: linux-2.4.18-p4smp/fs/namei.c if (page) { kunmap(page); page_cache_release(page); +Index: linux-2.4.18-p4smp/fs/namespace.c +=================================================================== +--- linux-2.4.18-p4smp.orig/fs/namespace.c 2004-02-03 01:00:10.000000000 -0500 ++++ linux-2.4.18-p4smp/fs/namespace.c 2004-03-19 16:05:42.000000000 -0500 +@@ -99,6 +99,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -110,6 +111,7 @@ static void attach_mnt(struct vfsmount * + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -485,14 +487,17 @@ static int do_loopback(struct nameidata + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(¤t->namespace->sem); + err = -EINVAL; +@@ -515,6 +520,7 @@ static int do_loopback(struct nameidata + } + + up_write(¤t->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -698,6 +704,7 @@ long do_mount(char * dev_name, char * di + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = 
IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -722,9 +729,11 @@ long do_mount(char * dev_name, char * di + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ... and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -736,6 +744,8 @@ long do_mount(char * dev_name, char * di + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -901,6 +911,8 @@ asmlinkage long sys_pivot_root(const cha + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -908,14 +920,14 @@ asmlinkage long sys_pivot_root(const cha + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -970,8 +982,10 @@ out2: + up(&old_nd.dentry->d_inode->i_zombie); + up_write(¤t->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); Index: linux-2.4.18-p4smp/fs/open.c 
=================================================================== --- linux-2.4.18-p4smp.orig/fs/open.c 2004-02-03 01:00:10.000000000 -0500 @@ -1229,16 +1224,14 @@ Index: linux-2.4.18-p4smp/fs/open.c path_release(&nd); } -@@ -385,8 +430,11 @@ asmlinkage long sys_chdir(const char * f +@@ -385,8 +430,9 @@ asmlinkage long sys_chdir(const char * f { int error; struct nameidata nd; + struct lookup_intent it = { .it_op = IT_GETATTR }; - error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); -+ error = __user_walk_it(filename, -+ LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, -+ &nd, &it); ++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it); if (error) goto out; @@ -1588,9 +1581,9 @@ Index: linux-2.4.18-p4smp/include/linux/dcache.h + void (*d_unpin)(struct dentry *, struct vfsmount *, int); }; -+#define PIN(de,mnt,flag) if (de->d_op && de->d_op->d_pin) \ ++#define PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ + de->d_op->d_pin(de, mnt, flag); -+#define UNPIN(de,mnt,flag) if (de->d_op && de->d_op->d_unpin) \ ++#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ + de->d_op->d_unpin(de, mnt, flag); + + @@ -1628,7 +1621,7 @@ Index: linux-2.4.18-p4smp/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 /* called from open path, ie O_TRUNC */ ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1666,7 +1659,7 @@ Index: linux-2.4.18-p4smp/include/linux/fs.h /* * File types -@@ -900,21 +908,34 @@ struct file_operations { +@@ -900,21 +908,32 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); @@ -1690,8 +1683,6 @@ Index: linux-2.4.18-p4smp/include/linux/fs.h + int (*rename_raw) (struct nameidata *, struct nameidata *); int (*readlink) (struct dentry *, char *,int); int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); int (*revalidate) (struct dentry *); @@ -1746,24 +1737,6 @@ Index: linux-2.4.18-p4smp/include/linux/fs.h extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; -Index: linux-2.4.18-p4smp/kernel/fork.c -=================================================================== ---- linux-2.4.18-p4smp.orig/kernel/fork.c 2004-02-03 01:00:10.000000000 -0500 -+++ linux-2.4.18-p4smp/kernel/fork.c 2004-03-19 16:05:42.000000000 -0500 -@@ -399,10 +399,13 @@ - fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); -+ PIN(old->pwd, old->pwdmnt, 0); -+ PIN(old->root, old->rootmnt, 1); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { -+ PIN(old->altroot, old->altrootmnt, 1); - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { Index: linux-2.4.18-p4smp/kernel/exit.c =================================================================== --- linux-2.4.18-p4smp.orig/kernel/exit.c 2004-02-03 01:00:10.000000000 -0500 @@ -1783,6 +1756,24 @@ Index: linux-2.4.18-p4smp/kernel/exit.c dput(fs->altroot); mntput(fs->altrootmnt); } +Index: linux-2.4.18-p4smp/kernel/fork.c 
+=================================================================== +--- linux-2.4.18-p4smp.orig/kernel/fork.c 2004-02-03 01:00:10.000000000 -0500 ++++ linux-2.4.18-p4smp/kernel/fork.c 2004-03-19 16:05:42.000000000 -0500 +@@ -399,10 +399,13 @@ + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); ++ PIN(old->pwd, old->pwdmnt, 0); ++ PIN(old->root, old->rootmnt, 1); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { ++ PIN(old->altroot, old->altrootmnt, 1); + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { Index: linux-2.4.18-p4smp/kernel/ksyms.c =================================================================== --- linux-2.4.18-p4smp.orig/kernel/ksyms.c 2004-03-19 16:05:40.000000000 -0500 diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch index 7c4ea56..4ccfa4d 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch @@ -12,6 +12,47 @@ kernel/ksyms.c | 1 12 files changed, 558 insertions(+), 128 deletions(-) +Index: linux-2.4.19-pre1/fs/dcache.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/dcache.c 2003-11-21 02:41:00.000000000 +0300 ++++ linux-2.4.19-pre1/fs/dcache.c 2003-11-21 02:51:38.000000000 +0300 +@@ -181,6 +181,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -831,13 +838,19 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ Index: linux-2.4.19-pre1/fs/exec.c =================================================================== --- linux-2.4.19-pre1.orig/fs/exec.c 2003-11-21 02:41:00.000000000 +0300 @@ -78,165 +119,6 @@ Index: linux-2.4.19-pre1/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.19-pre1/fs/dcache.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/dcache.c 2003-11-21 02:41:00.000000000 +0300 -+++ linux-2.4.19-pre1/fs/dcache.c 2003-11-21 02:51:38.000000000 +0300 -@@ -181,6 +181,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -831,13 +838,19 @@ - * Adds a dentry to the hash according to its name. 
- */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.19-pre1/fs/namespace.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/namespace.c 2003-11-21 02:41:00.000000000 +0300 -+++ linux-2.4.19-pre1/fs/namespace.c 2003-11-21 02:51:38.000000000 +0300 -@@ -107,6 +107,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -118,6 +119,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -500,15 +502,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - down(&mount_sem); - err = -EINVAL; -@@ -531,6 +536,7 @@ - } - - up(&mount_sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -706,6 +712,7 @@ - unsigned long flags, void *data_page) - { - 
struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -731,9 +738,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -745,6 +754,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -830,6 +841,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -844,7 +857,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -858,7 +871,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -914,8 +927,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up(&mount_sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); Index: linux-2.4.19-pre1/fs/namei.c =================================================================== --- linux-2.4.19-pre1.orig/fs/namei.c 2003-11-21 02:41:00.000000000 +0300 @@ -370,7 +252,7 @@ Index: linux-2.4.19-pre1/fs/namei.c if (!dentry) { - dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); + 
dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, -+ NULL); ++ NULL); err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; @@ -431,7 +313,7 @@ Index: linux-2.4.19-pre1/fs/namei.c break; } goto return_base; -@@ -625,21 +663,66 @@ +@@ -625,21 +663,68 @@ nd->last_type = LAST_DOT; else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; @@ -450,7 +332,7 @@ Index: linux-2.4.19-pre1/fs/namei.c + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -459,6 +341,8 @@ Index: linux-2.4.19-pre1/fs/namei.c + } + nd->dentry = new; + } ++ if (!nd->dentry->d_inode) ++ goto no_inode; + } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; @@ -744,12 +628,16 @@ Index: linux-2.4.19-pre1/fs/namei.c if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1252,7 +1392,16 @@ +@@ -1252,7 +1392,20 @@ error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); + ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mknod_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mknod_raw(&nd, mode, dev); @@ -770,11 +658,15 @@ Index: linux-2.4.19-pre1/fs/namei.c path_release(&nd); out: putname(tmp); -@@ -1321,7 +1471,14 @@ +@@ -1321,7 +1471,18 @@ error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mkdir_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mkdir_raw(&nd, mode); @@ -829,11 +721,15 @@ Index: linux-2.4.19-pre1/fs/namei.c error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1557,15 +1730,23 @@ +@@ -1557,15 +1730,27 @@ error = path_walk(to, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->symlink_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->symlink_raw(&nd, from); @@ -855,11 +751,15 @@ Index: linux-2.4.19-pre1/fs/namei.c putname(to); } putname(from); -@@ -1648,7 +1829,14 @@ +@@ -1648,7 +1829,18 @@ error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out_release; ++ } + if (nd.dentry->d_inode->i_op->link_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->link_raw(&old_nd, &nd); @@ -981,6 +881,124 @@ Index: linux-2.4.19-pre1/fs/namei.c if (page) { kunmap(page); page_cache_release(page); +Index: linux-2.4.19-pre1/fs/namespace.c +=================================================================== +--- linux-2.4.19-pre1.orig/fs/namespace.c 2003-11-21 02:41:00.000000000 +0300 ++++ linux-2.4.19-pre1/fs/namespace.c 2003-11-21 02:51:38.000000000 +0300 +@@ -107,6 +107,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -118,6 +119,7 @@ + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -500,15 +502,18 @@ + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; + if 
(path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) +- err = path_walk(old_name, &old_nd); +- if (err) ++ err = path_walk_it(old_name, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down(&mount_sem); + err = -EINVAL; +@@ -531,6 +536,7 @@ + } + + up(&mount_sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -706,6 +712,7 @@ + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -731,9 +738,11 @@ + + /* ... and get the mountpoint */ + if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) +- retval = path_walk(dir_name, &nd); +- if (retval) ++ retval = path_walk_it(dir_name, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -745,6 +754,8 @@ + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -830,6 +841,8 @@ + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + char *name; + int error; + +@@ -844,7 +857,7 @@ + goto out0; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) +- error = path_walk(name, &new_nd); ++ error = path_walk_it(name, &new_nd, &new_it); + putname(name); + if (error) + goto out0; +@@ -858,7 +871,7 @@ + goto out1; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) +- error = path_walk(name, &old_nd); ++ error = path_walk_it(name, &old_nd, &old_it); + putname(name); + if (error) + goto out1; +@@ -914,8 +927,10 @@ + up(&old_nd.dentry->d_inode->i_zombie); + up(&mount_sem); + path_release(&user_nd); ++ 
intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); Index: linux-2.4.19-pre1/fs/open.c =================================================================== --- linux-2.4.19-pre1.orig/fs/open.c 2003-11-21 02:41:00.000000000 +0300 @@ -1397,7 +1415,7 @@ Index: linux-2.4.19-pre1/fs/stat.c =================================================================== --- linux-2.4.19-pre1.orig/fs/stat.c 2003-11-21 02:41:00.000000000 +0300 +++ linux-2.4.19-pre1/fs/stat.c 2003-11-21 02:51:38.000000000 +0300 -@@ -17,10 +17,14 @@ +@@ -17,10 +17,12 @@ * Revalidate the inode. This is required for proper NFS attribute caching. */ static __inline__ int @@ -1406,8 +1424,6 @@ Index: linux-2.4.19-pre1/fs/stat.c { struct inode * inode = dentry->d_inode; - if (inode->i_op && inode->i_op->revalidate) -+ if (!inode) -+ return -ENOENT; + if (inode->i_op && inode->i_op->revalidate_it) + return inode->i_op->revalidate_it(dentry, it); + else if (inode->i_op && inode->i_op->revalidate) @@ -1661,7 +1677,7 @@ Index: linux-2.4.19-pre1/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1804,18 +1820,25 @@ Index: linux-2.4.19-pre1/include/linux/fs_struct.h dput(old_pwd); mntput(old_pwdmnt); } -Index: linux-2.4.19-pre1/kernel/ksyms.c +Index: linux-2.4.19-pre1/kernel/exit.c =================================================================== ---- linux-2.4.19-pre1.orig/kernel/ksyms.c 2003-11-21 02:51:37.000000000 +0300 -+++ linux-2.4.19-pre1/kernel/ksyms.c 2003-11-21 02:51:38.000000000 +0300 -@@ -260,6 +260,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux-2.4.19-pre1.orig/kernel/exit.c 2003-11-21 02:41:00.000000000 +0300 ++++ linux-2.4.19-pre1/kernel/exit.c 2003-11-21 02:51:38.000000000 +0300 +@@ -245,11 +245,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } Index: linux-2.4.19-pre1/kernel/fork.c =================================================================== --- linux-2.4.19-pre1.orig/kernel/fork.c 2003-11-21 02:41:00.000000000 +0300 @@ -1834,22 +1857,15 @@ Index: linux-2.4.19-pre1/kernel/fork.c fs->altrootmnt = mntget(old->altrootmnt); fs->altroot = dget(old->altroot); } else { -Index: linux-2.4.19-pre1/kernel/exit.c +Index: linux-2.4.19-pre1/kernel/ksyms.c =================================================================== ---- linux-2.4.19-pre1.orig/kernel/exit.c 2003-11-21 02:41:00.000000000 +0300 -+++ linux-2.4.19-pre1/kernel/exit.c 2003-11-21 02:51:38.000000000 +0300 -@@ -245,11 +245,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, 
fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } +--- linux-2.4.19-pre1.orig/kernel/ksyms.c 2003-11-21 02:51:37.000000000 +0300 ++++ linux-2.4.19-pre1/kernel/ksyms.c 2003-11-21 02:51:38.000000000 +0300 +@@ -260,6 +260,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch index 7741be4..b6ab3b6 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch @@ -12,6 +12,47 @@ kernel/ksyms.c | 1 12 files changed, 558 insertions(+), 128 deletions(-) +Index: linux-2.4.19.SuSE/fs/dcache.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/dcache.c Mon Jan 27 05:08:04 2003 ++++ linux-2.4.19.SuSE/fs/dcache.c Sat Nov 15 17:29:03 2003 +@@ -186,6 +186,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -838,13 +845,19 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ Index: linux-2.4.19.SuSE/fs/exec.c =================================================================== --- linux-2.4.19.SuSE.orig/fs/exec.c Mon Jan 27 05:08:35 2003 @@ -78,165 +119,6 @@ Index: linux-2.4.19.SuSE/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.19.SuSE/fs/dcache.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/dcache.c Mon Jan 27 05:08:04 2003 -+++ linux-2.4.19.SuSE/fs/dcache.c Sat Nov 15 17:29:03 2003 -@@ -186,6 +186,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -838,13 +845,19 @@ - * Adds a dentry to the hash according to its name. 
- */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.19.SuSE/fs/namespace.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/namespace.c Mon Jan 27 05:08:07 2003 -+++ linux-2.4.19.SuSE/fs/namespace.c Sat Nov 15 17:56:42 2003 -@@ -97,6 +97,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -108,6 +109,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -491,15 +493,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - down_write(&current->namespace->sem); - err = -EINVAL; -@@ -522,6 +527,7 @@ - } - - up_write(&current->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -725,6 +731,7 @@ - unsigned long flags, void *data_page) - {
- struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -750,9 +757,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -764,6 +773,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -929,6 +940,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -943,7 +956,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -957,7 +970,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -1013,8 +1026,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(&current->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); Index: linux-2.4.19.SuSE/fs/namei.c =================================================================== --- linux-2.4.19.SuSE.orig/fs/namei.c Mon Jan 27 05:08:07 2003 @@ -357,7 +239,7 @@ Index: linux-2.4.19.SuSE/fs/namei.c } /* This does the actual lookups..
*/ - dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE,NULL); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); if (!dentry) { err = -EWOULDBLOCKIO; if (atomic) @@ -433,7 +315,7 @@ Index: linux-2.4.19.SuSE/fs/namei.c + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -442,6 +324,8 @@ Index: linux-2.4.19.SuSE/fs/namei.c + } + nd->dentry = new; + } ++ if (!nd->dentry->d_inode) ++ goto no_inode; + } else if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { err = -ESTALE; @@ -722,12 +606,16 @@ Index: linux-2.4.19.SuSE/fs/namei.c if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1286,7 +1414,16 @@ +@@ -1286,7 +1414,20 @@ error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); + ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mknod_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mknod_raw(&nd, mode, dev); @@ -748,11 +636,15 @@ Index: linux-2.4.19.SuSE/fs/namei.c path_release(&nd); out: putname(tmp); -@@ -1356,7 +1494,14 @@ +@@ -1356,7 +1494,18 @@ error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mkdir_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mkdir_raw(&nd, mode); @@ -807,11 +699,15 @@ Index: linux-2.4.19.SuSE/fs/namei.c error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1595,15 +1756,23 @@ +@@ -1595,15 +1756,27 @@ error = path_walk(to, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->symlink_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->symlink_raw(&nd, from); @@ -838,6 +734,10 @@ Index: linux-2.4.19.SuSE/fs/namei.c if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out_release; ++ } + if (nd.dentry->d_inode->i_op->link_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->link_raw(&old_nd, &nd); @@ -959,6 +859,124 @@ Index: linux-2.4.19.SuSE/fs/namei.c if (page) { kunmap(page); page_cache_release(page); +Index: linux-2.4.19.SuSE/fs/namespace.c +=================================================================== +--- linux-2.4.19.SuSE.orig/fs/namespace.c Mon Jan 27 05:08:07 2003 ++++ linux-2.4.19.SuSE/fs/namespace.c Sat Nov 15 17:56:42 2003 +@@ -97,6 +97,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -108,6 +109,7 @@ + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -491,15 +493,18 @@ + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; + if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) +- err = path_walk(old_name, &old_nd); +- if (err) ++ err = 
path_walk_it(old_name, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(&current->namespace->sem); + err = -EINVAL; +@@ -522,6 +527,7 @@ + } + + up_write(&current->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -725,6 +731,7 @@ + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -750,9 +757,11 @@ + + /* ... and get the mountpoint */ + if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) +- retval = path_walk(dir_name, &nd); +- if (retval) ++ retval = path_walk_it(dir_name, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -764,6 +773,8 @@ + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -929,6 +940,8 @@ + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + char *name; + int error; + +@@ -943,7 +956,7 @@ + goto out0; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) +- error = path_walk(name, &new_nd); ++ error = path_walk_it(name, &new_nd, &new_it); + putname(name); + if (error) + goto out0; +@@ -957,7 +970,7 @@ + goto out1; + error = 0; + if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) +- error = path_walk(name, &old_nd); ++ error = path_walk_it(name, &old_nd, &old_it); + putname(name); + if (error) + goto out1; +@@ -1013,8 +1026,10 @@ + up(&old_nd.dentry->d_inode->i_zombie); + up_write(&current->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); +
path_release(&new_nd); + out0: + unlock_kernel(); Index: linux-2.4.19.SuSE/fs/open.c =================================================================== --- linux-2.4.19.SuSE.orig/fs/open.c Mon Jan 27 05:08:00 2003 @@ -1377,7 +1395,7 @@ Index: linux-2.4.19.SuSE/fs/stat.c =================================================================== --- linux-2.4.19.SuSE.orig/fs/stat.c Mon Jan 27 05:08:00 2003 +++ linux-2.4.19.SuSE/fs/stat.c Sat Nov 15 17:29:03 2003 -@@ -17,10 +17,14 @@ +@@ -17,10 +17,16 @@ * Revalidate the inode. This is required for proper NFS attribute caching. */ static __inline__ int @@ -1386,8 +1404,6 @@ Index: linux-2.4.19.SuSE/fs/stat.c { struct inode * inode = dentry->d_inode; - if (inode->i_op && inode->i_op->revalidate) -+ if (!inode) -+ return -ENOENT; + if (inode->i_op && inode->i_op->revalidate_it) + return inode->i_op->revalidate_it(dentry, it); + else if (inode->i_op && inode->i_op->revalidate) @@ -1641,7 +1657,7 @@ Index: linux-2.4.19.SuSE/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1784,18 +1800,25 @@ Index: linux-2.4.19.SuSE/include/linux/fs_struct.h dput(old_pwd); mntput(old_pwdmnt); } -Index: linux-2.4.19.SuSE/kernel/ksyms.c +Index: linux-2.4.19.SuSE/kernel/exit.c =================================================================== ---- linux-2.4.19.SuSE.orig/kernel/ksyms.c Sat Nov 15 17:24:46 2003 -+++ linux-2.4.19.SuSE/kernel/ksyms.c Sat Nov 15 17:29:03 2003 -@@ -315,6 +315,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux-2.4.19.SuSE.orig/kernel/exit.c Mon Jan 27 05:08:16 2003 ++++ linux-2.4.19.SuSE/kernel/exit.c Sat Nov 15 17:29:03 2003 +@@ -288,11 +288,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } Index: linux-2.4.19.SuSE/kernel/fork.c =================================================================== --- linux-2.4.19.SuSE.orig/kernel/fork.c Mon Jan 27 05:08:56 2003 @@ -1814,22 +1837,15 @@ Index: linux-2.4.19.SuSE/kernel/fork.c fs->altrootmnt = mntget(old->altrootmnt); fs->altroot = dget(old->altroot); } else { -Index: linux-2.4.19.SuSE/kernel/exit.c +Index: linux-2.4.19.SuSE/kernel/ksyms.c =================================================================== ---- linux-2.4.19.SuSE.orig/kernel/exit.c Mon Jan 27 05:08:16 2003 -+++ linux-2.4.19.SuSE/kernel/exit.c Sat Nov 15 17:29:03 2003 -@@ -288,11 +288,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - 
mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } +--- linux-2.4.19.SuSE.orig/kernel/ksyms.c Sat Nov 15 17:24:46 2003 ++++ linux-2.4.19.SuSE/kernel/ksyms.c Sat Nov 15 17:29:03 2003 +@@ -315,6 +315,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch index 8585701..424d90e 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch @@ -13,6 +13,47 @@ kernel/ksyms.c | 1 13 files changed, 591 insertions(+), 133 deletions(-) +Index: linux/fs/dcache.c +=================================================================== +--- linux.orig/fs/dcache.c Thu Nov 28 18:53:15 2002 ++++ linux/fs/dcache.c Wed Mar 17 13:11:25 2004 +@@ -181,6 +181,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -830,13 +837,19 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ Index: linux/fs/exec.c =================================================================== --- linux.orig/fs/exec.c Wed Mar 17 13:00:38 2004 @@ -78,164 +119,6 @@ Index: linux/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux/fs/dcache.c -=================================================================== ---- linux.orig/fs/dcache.c Thu Nov 28 18:53:15 2002 -+++ linux/fs/dcache.c Wed Mar 17 13:11:25 2004 -@@ -181,6 +181,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -830,13 +837,19 @@ - * Adds a dentry to the hash according to its name. 
- */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux/fs/namespace.c -=================================================================== ---- linux.orig/fs/namespace.c Thu Nov 28 18:53:15 2002 -+++ linux/fs/namespace.c Wed Mar 17 13:11:25 2004 -@@ -99,6 +99,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -110,6 +111,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -485,14 +487,17 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; -- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); -- if (err) -+ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - down_write(&current->namespace->sem); - err = -EINVAL; -@@ -515,6 +520,7 @@ - } - - up_write(&current->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -698,6 +704,7 @@ - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct
lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -722,10 +729,11 @@ - flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); - - /* ... and get the mountpoint */ -- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); -- if (retval) -+ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -- -+ } - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); -@@ -736,6 +744,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -901,6 +911,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - int error; - - if (!capable(CAP_SYS_ADMIN)) -@@ -908,14 +920,14 @@ - - lock_kernel(); - -- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); -+ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); - if (error) - goto out0; - error = -EINVAL; - if (!check_mnt(new_nd.mnt)) - goto out1; - -- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); -+ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); - if (error) - goto out1; - -@@ -970,8 +982,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(&current->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); Index: linux/fs/namei.c =================================================================== --- linux.orig/fs/namei.c Wed Mar 17 13:00:37 2004 @@ -487,7 +370,7 @@ Index: linux/fs/namei.c + if (err) + break; + new
= real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -1065,6 +948,123 @@ Index: linux/fs/namei.c if (page) { kunmap(page); page_cache_release(page); +Index: linux/fs/namespace.c +=================================================================== +--- linux.orig/fs/namespace.c Thu Nov 28 18:53:15 2002 ++++ linux/fs/namespace.c Wed Mar 17 13:11:25 2004 +@@ -99,6 +99,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -110,6 +111,7 @@ + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -485,14 +487,17 @@ + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(&current->namespace->sem); + err = -EINVAL; +@@ -515,6 +520,7 @@ + } + + up_write(&current->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -698,6 +704,7 @@ + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -722,10 +729,11 @@ + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ...
and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; +- ++ } + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + data_page); +@@ -736,6 +744,8 @@ + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -901,6 +911,8 @@ + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -908,14 +920,14 @@ + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -970,8 +982,10 @@ + up(&old_nd.dentry->d_inode->i_zombie); + up_write(&current->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); Index: linux/fs/open.c =================================================================== --- linux.orig/fs/open.c Thu Nov 28 18:53:15 2002 @@ -1467,6 +1467,20 @@ Index: linux/fs/open.c /* * Find an empty file descriptor entry, and mark it busy.
*/ +Index: linux/fs/proc/base.c +=================================================================== +--- linux.orig/fs/proc/base.c Wed Mar 17 13:00:35 2004 ++++ linux/fs/proc/base.c Wed Mar 17 13:11:25 2004 +@@ -481,6 +481,9 @@ + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->intent != NULL) ++ nd->intent->d.lustre.it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } Index: linux/fs/stat.c =================================================================== --- linux.orig/fs/stat.c Thu Sep 13 19:04:43 2001 @@ -1628,20 +1642,6 @@ Index: linux/fs/stat.c if (!err) err = cp_new_stat64(dentry->d_inode, statbuf); fput(f); -Index: linux/fs/proc/base.c -=================================================================== ---- linux.orig/fs/proc/base.c Wed Mar 17 13:00:35 2004 -+++ linux/fs/proc/base.c Wed Mar 17 13:11:25 2004 -@@ -481,6 +481,9 @@ - - error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); - nd->last_type = LAST_BIND; -+ -+ if (nd->intent != NULL) -+ nd->intent->d.lustre.it_int_flags |= IT_FL_FOLLOWED; - out: - return error; - } Index: linux/include/linux/dcache.h =================================================================== --- linux.orig/include/linux/dcache.h Thu Nov 28 18:53:15 2002 @@ -1747,7 +1747,7 @@ Index: linux/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1890,18 +1890,25 @@ Index: linux/include/linux/fs_struct.h dput(old_pwd); mntput(old_pwdmnt); } -Index: linux/kernel/ksyms.c +Index: linux/kernel/exit.c =================================================================== ---- linux.orig/kernel/ksyms.c Wed Mar 17 13:11:23 2004 -+++ linux/kernel/ksyms.c Wed Mar 17 13:11:25 2004 -@@ -315,6 +315,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux.orig/kernel/exit.c Wed Mar 17 13:00:38 2004 ++++ linux/kernel/exit.c Wed Mar 17 13:11:25 2004 +@@ -239,11 +239,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } Index: linux/kernel/fork.c =================================================================== --- linux.orig/kernel/fork.c Wed Mar 17 13:00:38 2004 @@ -1920,22 +1927,15 @@ Index: linux/kernel/fork.c fs->altrootmnt = mntget(old->altrootmnt); fs->altroot = dget(old->altroot); } else { -Index: linux/kernel/exit.c +Index: linux/kernel/ksyms.c =================================================================== ---- linux.orig/kernel/exit.c Wed Mar 17 13:00:38 2004 -+++ linux/kernel/exit.c Wed Mar 17 13:11:25 2004 -@@ -239,11 +239,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } 
+--- linux.orig/kernel/ksyms.c Wed Mar 17 13:11:23 2004 ++++ linux/kernel/ksyms.c Wed Mar 17 13:11:25 2004 +@@ -315,6 +315,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch index 409c8f0..37bf227 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch @@ -286,7 +286,7 @@ Index: linux-2.4.20/fs/namei.c { struct dentry *dentry; struct inode *inode; -@@ -526,19 +579,18 @@ +@@ -526,18 +579,18 @@ break; } /* This does the actual lookups.. */ @@ -304,11 +304,10 @@ Index: linux-2.4.20/fs/namei.c } /* Check mountpoints.. */ - while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) -- ; -+ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL)); ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL)) + ; err = -ENOENT; - inode = dentry->d_inode; @@ -549,7 +601,7 @@ goto out_dput; @@ -377,7 +376,7 @@ Index: linux-2.4.20/fs/namei.c + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -1011,7 +1010,7 @@ Index: linux-2.4.20/fs/namespace.c int retval = 0; int mnt_flags = 0; -@@ -722,10 +729,11 @@ +@@ -722,9 +729,11 @@ flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); /* ... 
and get the mountpoint */ @@ -1021,11 +1020,10 @@ Index: linux-2.4.20/fs/namespace.c + if (retval) { + intent_release(&it); return retval; -- + } + if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); @@ -736,6 +744,8 @@ else retval = do_add_mount(&nd, type_page, flags, mnt_flags, @@ -1283,7 +1281,7 @@ Index: linux-2.4.20/fs/open.c - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1680,7 +1678,7 @@ Index: linux-2.4.20/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1823,18 +1821,25 @@ Index: linux-2.4.20/include/linux/fs_struct.h dput(old_pwd); mntput(old_pwdmnt); } -Index: linux-2.4.20/kernel/ksyms.c +Index: linux-2.4.20/kernel/exit.c =================================================================== ---- linux-2.4.20.orig/kernel/ksyms.c Wed Mar 17 13:57:11 2004 -+++ linux-2.4.20/kernel/ksyms.c Wed Mar 17 13:57:11 2004 -@@ -297,6 +297,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux-2.4.20.orig/kernel/exit.c Wed Mar 17 13:57:05 2004 ++++ linux-2.4.20/kernel/exit.c Wed Mar 17 13:57:11 2004 +@@ -345,11 +345,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } Index: linux-2.4.20/kernel/fork.c =================================================================== --- linux-2.4.20.orig/kernel/fork.c Wed Mar 17 13:57:05 2004 @@ -1853,22 +1858,15 @@ Index: linux-2.4.20/kernel/fork.c fs->altrootmnt = mntget(old->altrootmnt); fs->altroot = dget(old->altroot); } else { -Index: linux-2.4.20/kernel/exit.c +Index: linux-2.4.20/kernel/ksyms.c =================================================================== ---- linux-2.4.20.orig/kernel/exit.c Wed Mar 17 13:57:05 2004 -+++ linux-2.4.20/kernel/exit.c Wed Mar 17 13:57:11 2004 -@@ -345,11 +345,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if 
(fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } +--- linux-2.4.20.orig/kernel/ksyms.c Wed Mar 17 13:57:11 2004 ++++ linux-2.4.20/kernel/ksyms.c Wed Mar 17 13:57:11 2004 +@@ -297,6 +297,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch index 1ef0b01..dd293f1 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch @@ -12,6 +12,47 @@ kernel/ksyms.c | 1 12 files changed, 558 insertions(+), 128 deletions(-) +Index: linux-2.4.24/fs/dcache.c +=================================================================== +--- linux-2.4.24.orig/fs/dcache.c Fri Jun 13 07:51:37 2003 ++++ linux-2.4.24/fs/dcache.c Wed Mar 17 17:36:14 2004 +@@ -181,6 +181,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -830,13 +837,19 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ Index: linux-2.4.24/fs/exec.c =================================================================== --- linux-2.4.24.orig/fs/exec.c Fri Nov 28 10:26:21 2003 @@ -77,164 +118,6 @@ Index: linux-2.4.24/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.24/fs/dcache.c -=================================================================== ---- linux-2.4.24.orig/fs/dcache.c Fri Jun 13 07:51:37 2003 -+++ linux-2.4.24/fs/dcache.c Wed Mar 17 17:36:14 2004 -@@ -181,6 +181,13 @@ int d_invalidate(struct dentry * dentry) - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -830,13 +837,19 @@ void d_delete(struct dentry * dentry) - * Adds a dentry to the hash according to its name. 
- */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.24/fs/namespace.c -=================================================================== ---- linux-2.4.24.orig/fs/namespace.c Fri Nov 28 10:26:21 2003 -+++ linux-2.4.24/fs/namespace.c Wed Mar 17 17:36:14 2004 -@@ -98,6 +98,7 @@ static void detach_mnt(struct vfsmount * - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -109,6 +110,7 @@ static void attach_mnt(struct vfsmount * - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -488,14 +490,17 @@ static int do_loopback(struct nameidata - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; -- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); -- if (err) -+ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - down_write(¤t->namespace->sem); - err = -EINVAL; -@@ -518,6 +523,7 @@ static int do_loopback(struct nameidata - } - - 
up_write(¤t->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -701,6 +707,7 @@ long do_mount(char * dev_name, char * di - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -725,10 +732,11 @@ long do_mount(char * dev_name, char * di - flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); - - /* ... and get the mountpoint */ -- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); -- if (retval) -+ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -- -+ } - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); -@@ -739,6 +747,8 @@ long do_mount(char * dev_name, char * di - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -904,6 +914,8 @@ asmlinkage long sys_pivot_root(const cha - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - int error; - - if (!capable(CAP_SYS_ADMIN)) -@@ -911,14 +923,14 @@ asmlinkage long sys_pivot_root(const cha - - lock_kernel(); - -- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); -+ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); - if (error) - goto out0; - error = -EINVAL; - if (!check_mnt(new_nd.mnt)) - goto out1; - -- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); -+ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); - if (error) - goto out1; - -@@ -973,8 +985,10 @@ out2: - up(&old_nd.dentry->d_inode->i_zombie); - 
up_write(¤t->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); Index: linux-2.4.24/fs/namei.c =================================================================== --- linux-2.4.24.orig/fs/namei.c Mon Aug 25 04:44:43 2003 @@ -424,7 +307,7 @@ Index: linux-2.4.24/fs/namei.c + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -985,6 +868,122 @@ Index: linux-2.4.24/fs/namei.c if (page) { kunmap(page); page_cache_release(page); +Index: linux-2.4.24/fs/namespace.c +=================================================================== +--- linux-2.4.24.orig/fs/namespace.c Fri Nov 28 10:26:21 2003 ++++ linux-2.4.24/fs/namespace.c Wed Mar 17 17:36:14 2004 +@@ -98,6 +98,7 @@ static void detach_mnt(struct vfsmount * + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -109,6 +110,7 @@ static void attach_mnt(struct vfsmount * + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -488,14 +490,17 @@ static int do_loopback(struct nameidata + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ 
intent_release(&it); + return err; ++ } + + down_write(¤t->namespace->sem); + err = -EINVAL; +@@ -518,6 +523,7 @@ static int do_loopback(struct nameidata + } + + up_write(¤t->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -701,6 +707,7 @@ long do_mount(char * dev_name, char * di + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -725,9 +732,11 @@ long do_mount(char * dev_name, char * di + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ... and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -739,6 +747,8 @@ long do_mount(char * dev_name, char * di + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -904,6 +914,8 @@ asmlinkage long sys_pivot_root(const cha + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -911,14 +923,14 @@ asmlinkage long sys_pivot_root(const cha + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, 
LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -973,8 +985,10 @@ out2: + up(&old_nd.dentry->d_inode->i_zombie); + up_write(¤t->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); Index: linux-2.4.24/fs/open.c =================================================================== --- linux-2.4.24.orig/fs/open.c Mon Aug 25 04:44:43 2003 @@ -1196,7 +1195,7 @@ Index: linux-2.4.24/fs/open.c - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1653,7 +1652,7 @@ Index: linux-2.4.24/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It @@ -1796,18 +1795,25 @@ Index: linux-2.4.24/include/linux/fs_struct.h dput(old_pwd); mntput(old_pwdmnt); } -Index: linux-2.4.24/kernel/ksyms.c +Index: linux-2.4.24/kernel/exit.c =================================================================== ---- linux-2.4.24.orig/kernel/ksyms.c Wed Mar 17 17:36:14 2004 -+++ linux-2.4.24/kernel/ksyms.c Wed Mar 17 17:36:14 2004 -@@ -275,6 +275,7 @@ EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(mark_page_accessed); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux-2.4.24.orig/kernel/exit.c Thu Nov 28 15:53:15 2002 ++++ linux-2.4.24/kernel/exit.c Wed Mar 17 17:36:14 2004 +@@ -238,11 +238,14 @@ static inline void __put_fs_struct(struc + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } Index: linux-2.4.24/kernel/fork.c =================================================================== --- linux-2.4.24.orig/kernel/fork.c Fri Nov 28 10:26:21 2003 @@ -1826,22 +1832,15 @@ Index: linux-2.4.24/kernel/fork.c fs->altrootmnt = mntget(old->altrootmnt); fs->altroot = dget(old->altroot); } else { -Index: linux-2.4.24/kernel/exit.c +Index: linux-2.4.24/kernel/ksyms.c =================================================================== ---- linux-2.4.24.orig/kernel/exit.c Thu Nov 28 15:53:15 2002 -+++ linux-2.4.24/kernel/exit.c Wed Mar 17 17:36:14 2004 -@@ -238,11 +238,14 @@ static inline void __put_fs_struct(struc - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ 
UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } +--- linux-2.4.24.orig/kernel/ksyms.c Wed Mar 17 17:36:14 2004 ++++ linux-2.4.24/kernel/ksyms.c Wed Mar 17 17:36:14 2004 +@@ -275,6 +275,7 @@ EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(mark_page_accessed); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch index 09ef2f9..0026514 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch @@ -309,11 +309,11 @@ Index: linux-ia64/fs/namei.c + if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { + struct dentry *new; + err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); ++ MAY_EXEC); + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -554,7 +554,7 @@ Index: linux-ia64/fs/namei.c if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; - error = vfs_create(dir->d_inode, dentry, mode); -+ error = vfs_create_it(dir->d_inode, dentry, mode, it); ++ error = vfs_create_it(dir->d_inode, dentry, mode, it); up(&dir->d_inode->i_sem); dput(nd->dentry); nd->dentry = dentry; @@ -930,7 +930,7 @@ Index: linux-ia64/fs/namespace.c int retval = 0; int mnt_flags = 0; -@@ -725,10 +732,11 @@ long do_mount(char * dev_name, char * di +@@ -725,9 +732,11 @@ long do_mount(char * dev_name, char * di flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); /* ... 
and get the mountpoint */ @@ -940,11 +940,10 @@ Index: linux-ia64/fs/namespace.c + if (retval) { + intent_release(&it); return retval; -- + } + if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); @@ -739,6 +747,8 @@ long do_mount(char * dev_name, char * di else retval = do_add_mount(&nd, type_page, flags, mnt_flags, @@ -1202,7 +1201,7 @@ Index: linux-ia64/fs/open.c - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1661,7 +1660,7 @@ Index: linux-ia64/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch new file mode 100644 index 0000000..2ff2de8 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch @@ -0,0 +1,1862 @@ + fs/dcache.c | 19 ++ + fs/exec.c | 17 +- + fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++------- + fs/namespace.c | 28 +++- + fs/open.c | 172 +++++++++++++++++++------- + fs/stat.c | 52 +++++--- + include/linux/dcache.h | 60 +++++++++ + include/linux/fs.h | 32 ++++ + include/linux/fs_struct.h | 4 + kernel/exit.c | 3 + kernel/fork.c | 3 + kernel/ksyms.c | 1 + 12 files changed, 558 insertions(+), 128 deletions(-) + +Index: linux-2.4.21/fs/dcache.c +=================================================================== +--- linux-2.4.21.orig/fs/dcache.c 2004-04-24 02:38:00.000000000 -0400 ++++ linux-2.4.21/fs/dcache.c 2004-04-26 19:06:31.000000000 -0400 +@@ -186,6 +186,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -838,13 +845,19 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +Index: linux-2.4.21/fs/exec.c +=================================================================== +--- linux-2.4.21.orig/fs/exec.c 2004-04-24 02:39:01.000000000 -0400 ++++ linux-2.4.21/fs/exec.c 2004-04-26 19:06:31.000000000 -0400 +@@ -113,8 +113,10 @@ + struct file * file; + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_OPEN, ++ .it_flags = FMODE_READ|FMODE_EXEC }; + +- error = user_path_walk(library, &nd); ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -126,7 +128,8 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -383,8 +386,10 @@ + struct inode *inode; + struct file *file; + int err = 0; ++ struct lookup_intent it = { .it_op = IT_OPEN, ++ .it_flags = FMODE_READ|FMODE_EXEC }; + +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -396,7 +401,8 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -408,6 +414,7 @@ + return file; + } + } ++ 
intent_release(&it); + path_release(&nd); + } + goto out; +@@ -1147,7 +1154,7 @@ + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +Index: linux-2.4.21/fs/namei.c +=================================================================== +--- linux-2.4.21.orig/fs/namei.c 2004-04-24 02:39:02.000000000 -0400 ++++ linux-2.4.21/fs/namei.c 2004-04-26 19:06:38.000000000 -0400 +@@ -94,6 +94,13 @@ + * XEmacs seems to be relying on it... + */ + ++void intent_release(struct lookup_intent *it) ++{ ++ if (it && it->it_op_release) ++ it->it_op_release(it); ++ ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -274,10 +281,19 @@ + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -295,11 +311,15 @@ + * make sure that nobody added the entry to the dcache in the meantime.. 
+ * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + ++again: ++ counter++; + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -314,6 +334,9 @@ + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); ++ if (dir->i_op->lookup_it) ++ result = dir->i_op->lookup_it(dir, dentry, it, flags); ++ else + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) +@@ -335,6 +358,15 @@ + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate_it) { ++ if (!result->d_op->d_revalidate_it(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; ++ } + } + return result; + } +@@ -346,7 +378,8 @@ + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) + { + int err; + if (current->link_count >= 8) +@@ -360,10 +393,12 @@ + current->link_count++; + current->total_link_count++; + UPDATE_ATIME(dentry->d_inode); ++ nd->intent = it; + err = dentry->d_inode->i_op->follow_link(dentry, nd); + current->link_count--; + return err; + loop: ++ intent_release(it); + path_release(nd); + return -ELOOP; + } +@@ -462,7 +497,8 @@ + * We expect 'base' to be positive and a directory. 
+ */ + static inline int __attribute__((always_inline)) +-__link_path_walk(const char * name, struct nameidata *nd) ++__link_path_walk_it(const char * name, struct nameidata *nd, ++ struct lookup_intent *it) + { + struct dentry *dentry; + struct inode *inode; +@@ -539,12 +575,12 @@ + break; + } + /* This does the actual lookups.. */ +- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -562,7 +598,7 @@ + goto out_dput; + + if (inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, NULL); + dput(dentry); + if (err) + goto return_err; +@@ -578,7 +614,7 @@ + nd->dentry = dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup_it) + break; + continue; + /* here ends the main loop */ +@@ -605,12 +641,12 @@ + if (err < 0) + break; + } +- dentry = cached_lookup(nd->dentry, &this, 0); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, 0); ++ dentry = real_lookup(nd->dentry, &this, 0, it); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -620,7 +656,7 @@ + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) + && inode && inode->i_op && inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, it); + dput(dentry); + if (err) + goto return_err; +@@ -634,7 +670,8 @@ + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup_it)) + break; + } + goto 
return_base; +@@ -658,6 +695,27 @@ + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, it); ++ d_invalidate(dentry); ++ dput(dentry); ++ if (IS_ERR(new)) { ++ err = PTR_ERR(new); ++ break; ++ } ++ nd->dentry = new; ++ } ++ if (!nd->dentry->d_inode) ++ goto no_inode; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, lookup_flags & LOOKUP_PARENT)) { +@@ -671,6 +729,8 @@ + dput(dentry); + break; + } ++ if (err) ++ intent_release(it); + path_release(nd); + return_err: + return err; +@@ -678,13 +738,13 @@ + + int link_path_walk(const char * name, struct nameidata *nd) + { +- return __link_path_walk(name,nd); ++ return __link_path_walk_it(name, nd, NULL); + } + + static inline int __path_walk(const char * name, struct nameidata *nd) + { + current->total_link_count = 0; +- return __link_path_walk(name, nd); ++ return __link_path_walk_it(name, nd, NULL); + } + + int path_walk(const char * name, struct nameidata *nd) +@@ -692,6 +752,12 @@ + return __path_walk(name, nd); + } + ++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) ++{ ++ current->total_link_count = 0; ++ return __link_path_walk_it(name, nd, it); ++} ++ + /* SMP-safe */ + /* returns 1 if everything is done */ + static int __emul_lookup_dentry(const char *name, struct nameidata *nd) +@@ -774,6 +840,17 @@ + } + + /* SMP-safe */ ++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ int error = 0; ++ if (path_init(path, flags, nd)) ++ error = path_walk_it(path, nd, it); ++ return error; ++} ++ ++ ++/* SMP-safe */ + int 
path_lookup(const char *path, unsigned flags, struct nameidata *nd) + { + int error = 0; +@@ -788,6 +865,7 @@ + { + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags; ++ nd->intent = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(&current->fs->lock); +@@ -802,7 +880,8 @@ + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -825,13 +904,16 @@ + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; + lock_kernel(); ++ if (inode->i_op->lookup_it) ++ dentry = inode->i_op->lookup_it(inode, new, it, 0); ++ else + dentry = inode->i_op->lookup(inode, new); + unlock_kernel(); + if (!dentry) +@@ -843,6 +925,12 @@ + return dentry; + } + ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++{ ++ return lookup_hash_it(name, base, NULL); ++} ++ ++ + /* SMP-safe */ + struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) + { +@@ -864,7 +952,7 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash_it(&this, base, NULL); + access: + return ERR_PTR(-EACCES); + } +@@ -895,6 +983,23 @@ + return err; + } + ++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ char *tmp; ++ int err; ++ ++ tmp = getname(name); ++ err = PTR_ERR(tmp); ++ if (!IS_ERR(tmp)) { ++ err = 0; ++ if (path_init(tmp, flags, nd)) ++ err = path_walk_it(tmp, nd, it); ++ putname(tmp); ++ } ++ return err; ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. 
+@@ -992,7 +1097,8 @@ + return retval; + } + +-int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, ++ struct lookup_intent *it) + { + int error; + +@@ -1005,12 +1111,15 @@ + goto exit_lock; + + error = -EACCES; /* shouldn't it be ENOSYS? */ +- if (!dir->i_op || !dir->i_op->create) ++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); +- error = dir->i_op->create(dir, dentry, mode); ++ if (dir->i_op->create_it) ++ error = dir->i_op->create_it(dir, dentry, mode, it); ++ else ++ error = dir->i_op->create(dir, dentry, mode); + unlock_kernel(); + exit_lock: + up(&dir->i_zombie); +@@ -1019,6 +1128,11 @@ + return error; + } + ++int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ return vfs_create_it(dir, dentry, mode, NULL); ++} ++ + /* + * open_namei() + * +@@ -1033,7 +1147,8 @@ + * for symlinks (where the permissions are checked later). + * SMP-safe + */ +-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) ++int open_namei_it(const char *pathname, int flag, int mode, ++ struct nameidata *nd, struct lookup_intent *it) + { + int acc_mode, error = 0; + struct inode *inode; +@@ -1043,11 +1158,14 @@ + + acc_mode = ACC_MODE(flag); + ++ if (it) ++ it->it_flags = flag; ++ + /* + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag), nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd, it); + if (error) + return error; + dentry = nd->dentry; +@@ -1057,6 +1175,10 @@ + /* + * Create - we need to know the parent. 
+ */ ++ if (it) { ++ it->it_create_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + error = path_lookup(pathname, LOOKUP_PARENT, nd); + if (error) + return error; +@@ -1072,7 +1194,7 @@ + + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); +@@ -1081,11 +1203,12 @@ + goto exit; + } + ++ if (it) it->it_create_mode = mode; /* it == NULL when called via open_namei() */ + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current->fs->umask; +- error = vfs_create(dir->d_inode, dentry, mode); ++ error = vfs_create_it(dir->d_inode, dentry, mode, it); + up(&dir->d_inode->i_sem); + #ifndef DENTRY_WASTE_RAM + if (error) +@@ -1193,7 +1316,7 @@ + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1205,8 +1328,10 @@ + return 0; + + exit_dput: ++ intent_release(it); + dput(dentry); + exit: ++ intent_release(it); + path_release(nd); + return error; + +@@ -1225,7 +1350,10 @@ + * are done. Procfs-like symlinks just set LAST_BIND. 
+ */ + UPDATE_ATIME(dentry->d_inode); ++ nd->intent = it; + error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) ++ intent_release(it); + dput(dentry); + if (error) + return error; +@@ -1247,13 +1375,20 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; + } + ++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) ++{ ++ return open_namei_it(pathname, flag, mode, nd, NULL); ++} ++ ++ + /* SMP-safe */ +-struct dentry *lookup_create(struct nameidata *nd, int is_dir) ++struct dentry *lookup_create(struct nameidata *nd, int is_dir, ++ struct lookup_intent *it) + { + struct dentry *dentry; + +@@ -1261,7 +1396,7 @@ + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1317,7 +1452,20 @@ + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + + if (!IS_POSIXACL(nd.dentry->d_inode)) +@@ -1339,6 +1487,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1386,7 +1535,18 @@ + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 1); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ 
} ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + if (!IS_POSIXACL(nd.dentry->d_inode)) +@@ -1395,6 +1555,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1495,8 +1656,16 @@ + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1554,8 +1723,15 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? 
Because we want correct error value */ +@@ -1622,15 +1798,27 @@ + error = path_lookup(to, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1706,7 +1894,18 @@ + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +- new_dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out_release; ++ } ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); +@@ -1750,7 +1949,7 @@ + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + struct inode *target; +@@ -1829,7 +2028,7 @@ + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + +@@ -1917,9 +2116,18 @@ + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ unlock_kernel(); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + double_lock(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -1935,16 +2143,16 @@ + if (newnd.last.name[newnd.last.len]) + goto exit4; + } +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + ++ + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); + unlock_kernel(); +- + dput(new_dentry); + exit4: + dput(old_dentry); +@@ -1995,20 +2203,26 @@ + } + + static inline int __attribute__((always_inline)) +-__vfs_follow_link(struct nameidata *nd, const char *link) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) + { + int res = 0; + char *name; + if (IS_ERR(link)) + goto fail; + ++ if (it == NULL) ++ it = nd->intent; ++ else if (it != nd->intent) ++ printk("it != nd->intent: tell phil@clusterfs.com\n"); ++ + if (*link == '/') { + path_release(nd); + if (!walk_init_root(link, nd)) + /* weird __emul_prefix() stuff did it */ + goto out; 
+ } +- res = __link_path_walk(link, nd); ++ res = __link_path_walk_it(link, nd, it); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; +@@ -2032,7 +2246,13 @@ + + int vfs_follow_link(struct nameidata *nd, const char *link) + { +- return __vfs_follow_link(nd, link); ++ return __vfs_follow_link(nd, link, NULL); ++} ++ ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) ++{ ++ return __vfs_follow_link(nd, link, it); + } + + /* get the link contents into pagecache */ +@@ -2074,7 +2294,7 @@ + { + struct page *page = NULL; + char *s = page_getlink(dentry, &page); +- int res = __vfs_follow_link(nd, s); ++ int res = __vfs_follow_link(nd, s, NULL); + if (page) { + kunmap(page); + page_cache_release(page); +Index: linux-2.4.21/fs/namespace.c +=================================================================== +--- linux-2.4.21.orig/fs/namespace.c 2004-04-24 02:38:41.000000000 -0400 ++++ linux-2.4.21/fs/namespace.c 2004-04-26 19:06:32.000000000 -0400 +@@ -98,6 +98,7 @@ + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -109,6 +110,7 @@ + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -488,14 +490,17 @@ + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + 
return err; ++ } + + down_write(&current->namespace->sem); + err = -EINVAL; +@@ -518,6 +523,7 @@ + } + + up_write(&current->namespace->sem); ++ intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -701,6 +707,7 @@ + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -725,9 +732,11 @@ + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ... and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; ++ } + + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, +@@ -739,6 +748,8 @@ + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -904,6 +915,8 @@ + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -911,14 +924,14 @@ + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -973,8 +986,10 @@ + up(&old_nd.dentry->d_inode->i_zombie); + up_write(&current->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ 
intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); +Index: linux-2.4.21/fs/open.c +=================================================================== +--- linux-2.4.21.orig/fs/open.c 2004-04-24 02:39:01.000000000 -0400 ++++ linux-2.4.21/fs/open.c 2004-04-26 19:06:32.000000000 -0400 +@@ -20,6 +20,8 @@ + #include + + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) ++extern int path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it); + + int vfs_statfs(struct super_block *sb, struct statfs *buf) + { +@@ -96,9 +98,10 @@ + write_unlock(&files->file_lock); + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + struct inode *inode = dentry->d_inode; ++ struct inode_operations *op = dentry->d_inode->i_op; + int error; + struct iattr newattrs; + +@@ -110,7 +113,13 @@ + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; +- error = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ } else ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + up_write(&inode->i_alloc_sem); + return error; +@@ -121,12 +130,13 @@ + struct nameidata nd; + struct inode * inode; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... 
*/ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -166,11 +176,13 @@ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ intent_release(&it); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -218,7 +230,7 @@ + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -263,11 +275,13 @@ + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -282,11 +296,25 @@ + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!times) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } ++ + error = notify_change(nd.dentry, &newattrs); + dput_and_out: + path_release(&nd); +@@ -307,12 +335,14 @@ + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre 
lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -327,7 +357,20 @@ + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!utimes) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; +@@ -350,6 +393,7 @@ + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; +@@ -367,13 +411,14 @@ + else + current->cap_effective = current->cap_permitted; + +- res = user_path_walk(filename, &nd); ++ res = user_path_walk_it(filename, &nd, &it); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ intent_release(&it); + path_release(&nd); + } + +@@ -388,8 +433,9 @@ + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); ++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it); + if (error) + goto out; + +@@ -400,6 +446,7 @@ + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -439,9 +486,10 @@ + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR 
}; + +- error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | +- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); + if (error) + goto out; + +@@ -457,39 +505,56 @@ + set_fs_altroot(); + error = 0; + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; + } + +-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++int chmod_common(struct dentry *dentry, mode_t mode) + { +- struct inode * inode; +- struct dentry * dentry; +- struct file * file; +- int err = -EBADF; ++ struct inode *inode = dentry->d_inode; + struct iattr newattrs; ++ int err = -EROFS; + +- file = fget(fd); +- if (!file) ++ if (IS_RDONLY(inode)) + goto out; + +- dentry = file->f_dentry; +- inode = dentry->d_inode; ++ if (inode->i_op->setattr_raw) { ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ err = inode->i_op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (err != -EOPNOTSUPP) ++ goto out; ++ } + +- err = -EROFS; +- if (IS_RDONLY(inode)) +- goto out_putf; + err = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto out_putf; ++ goto out; ++ + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); + +-out_putf: ++out: ++ return err; ++} ++ ++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++{ ++ struct file * file; ++ int err = -EBADF; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ err = chmod_common(file->f_dentry, mode); ++ + fput(file); + out: + return err; +@@ -498,30 +563,14 @@ + asmlinkage long sys_chmod(const char * filename, mode_t mode) + { + struct nameidata nd; +- struct inode * inode; + int error; +- struct iattr newattrs; + + error = 
user_path_walk(filename, &nd); + if (error) + goto out; +- inode = nd.dentry->d_inode; +- +- error = -EROFS; +- if (IS_RDONLY(inode)) +- goto dput_and_out; + +- error = -EPERM; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto dput_and_out; ++ error = chmod_common(nd.dentry, mode); + +- if (mode == (mode_t) -1) +- mode = inode->i_mode; +- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); +- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- error = notify_change(nd.dentry, &newattrs); +- +-dput_and_out: + path_release(&nd); + out: + return error; +@@ -541,6 +590,20 @@ + error = -EROFS; + if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; +@@ -645,6 +708,7 @@ + { + int namei_flags, error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN }; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -652,14 +716,15 @@ + if (namei_flags & O_TRUNC) + namei_flags |= 2; + +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ error = open_namei_it(filename, namei_flags, mode, &nd, &it); ++ if (error) ++ return ERR_PTR(error); + +- return ERR_PTR(error); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); + } + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -687,7 +752,9 @@ + file_move(f, 
&inode->i_sb->s_files); + + if (f->f_op && f->f_op->open) { ++ f->f_it = it; + error = f->f_op->open(inode,f); ++ f->f_it = NULL; + if (error) + goto cleanup_all; + } +@@ -699,6 +766,7 @@ + !inode->i_mapping->a_ops->direct_IO)) + goto cleanup_all; + ++ intent_release(it); + return f; + + cleanup_all: +@@ -711,11 +779,17 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ return dentry_open_it(dentry, mnt, flags, NULL); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +Index: linux-2.4.21/fs/stat.c +=================================================================== +--- linux-2.4.21.orig/fs/stat.c 2004-04-24 02:37:58.000000000 -0400 ++++ linux-2.4.21/fs/stat.c 2004-04-26 19:06:32.000000000 -0400 +@@ -17,10 +17,12 @@ + * Revalidate the inode. This is required for proper NFS attribute caching. 
+ */ + static __inline__ int +-do_revalidate(struct dentry *dentry) ++do_revalidate(struct dentry *dentry, struct lookup_intent *it) + { + struct inode * inode = dentry->d_inode; +- if (inode->i_op && inode->i_op->revalidate) ++ if (inode->i_op && inode->i_op->revalidate_it) ++ return inode->i_op->revalidate_it(dentry, it); ++ else if (inode->i_op && inode->i_op->revalidate) + return inode->i_op->revalidate(dentry); + return 0; + } +@@ -141,13 +143,15 @@ + asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -157,13 +161,15 @@ + asmlinkage long sys_newstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -178,13 +184,15 @@ + asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -195,13 
+203,15 @@ + asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -222,7 +232,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_old_stat(dentry->d_inode, statbuf); + fput(f); +@@ -241,7 +251,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_new_stat(dentry->d_inode, statbuf); + fput(f); +@@ -263,7 +273,7 @@ + + error = -EINVAL; + if (inode->i_op && inode->i_op->readlink && +- !(error = do_revalidate(nd.dentry))) { ++ !(error = do_revalidate(nd.dentry, NULL))) { + UPDATE_ATIME(inode); + error = inode->i_op->readlink(nd.dentry, buf, bufsiz); + } +@@ -339,12 +349,14 @@ + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -354,12 +366,14 @@ + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { +- error = do_revalidate(nd.dentry); ++ error = do_revalidate(nd.dentry, &it); + if (!error) + error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -374,7 +388,7 @@ + if (f) { + struct dentry * dentry = f->f_dentry; + +- err = do_revalidate(dentry); ++ err = do_revalidate(dentry, NULL); + if (!err) + err = cp_new_stat64(dentry->d_inode, statbuf); + fput(f); +Index: linux-2.4.21/include/linux/dcache.h +=================================================================== +--- linux-2.4.21.orig/include/linux/dcache.h 2004-04-24 02:37:59.000000000 -0400 ++++ linux-2.4.21/include/linux/dcache.h 2004-04-26 19:06:32.000000000 -0400 +@@ -7,6 +7,51 @@ + #include + #include + #include ++#include ++ ++#define IT_OPEN 0x0001 ++#define IT_CREAT 0x0002 ++#define IT_READDIR 0x0004 ++#define IT_GETATTR 0x0008 ++#define IT_LOOKUP 0x0010 ++#define IT_UNLINK 0x0020 ++#define IT_GETXATTR 0x0040 ++#define IT_EXEC 0x0080 ++#define IT_PIN 0x0100 ++ ++#define IT_FL_LOCKED 0x0001 ++#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ ++ ++#define INTENT_MAGIC 0x19620323 ++ ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; ++ int it_int_flags; ++}; ++struct lookup_intent { ++ int it_magic; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_op; ++ int it_flags; ++ int it_create_mode; ++ union { ++ struct lustre_intent_data lustre; ++ } d; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op, int flags) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++ it->it_flags = flags; ++} ++ + + /* + * linux/include/linux/dcache.h +@@ -94,8 +139,22 @@ + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); ++ void (*d_pin)(struct dentry *, struct vfsmount * , int); ++ void (*d_unpin)(struct dentry *, struct vfsmount *, int); + }; + 
++#define PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ ++ de->d_op->d_pin(de, mnt, flag); ++#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ ++ de->d_op->d_unpin(de, mnt, flag); ++ ++ ++/* defined in fs/namei.c */ ++extern void intent_release(struct lookup_intent *it); ++/* defined in fs/dcache.c */ ++extern void __d_rehash(struct dentry * entry, int lock); ++ + /* the dentry parameter passed to d_hash and d_compare is the parent + * directory of the entries to be compared. It is used in case these + * functions need any directory specific information for determining +@@ -127,6 +186,7 @@ + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +Index: linux-2.4.21/include/linux/fs.h +=================================================================== +--- linux-2.4.21.orig/include/linux/fs.h 2004-04-26 18:56:44.000000000 -0400 ++++ linux-2.4.21/include/linux/fs.h 2004-04-26 19:06:32.000000000 -0400 +@@ -74,6 +74,7 @@ + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + #define READ 0 + #define WRITE 1 +@@ -360,6 +361,9 @@ + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 ++#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ ++#define ATTR_CTIME_SET 0x2000 + + /* + * This is the Inode Attributes structure, used for notify_change(). 
It +@@ -504,6 +508,7 @@ + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct char_device *i_cdev; ++ void *i_filterdata; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ +@@ -666,6 +671,7 @@ + + /* needed for tty driver, and maybe others */ + void *private_data; ++ struct lookup_intent *f_it; + struct list_head f_ep_links; + spinlock_t f_ep_lock; + }; +@@ -795,6 +801,7 @@ + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *intent; + }; + + /* +@@ -916,7 +923,8 @@ + extern int __vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry); + + /* + * File types +@@ -991,21 +999,32 @@ + + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); ++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod_raw) (struct nameidata 
*,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); ++ int (*revalidate_it) (struct dentry *, struct lookup_intent *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); + int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); +@@ -1204,10 +1223,14 @@ + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1503,6 +1526,7 @@ + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int 
FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); +@@ -1515,6 +1539,8 @@ + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) ++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) ++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) + + extern void inode_init_once(struct inode *); + extern void _inode_init_once(struct inode *); +@@ -1666,6 +1692,8 @@ + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); + extern int page_readlink(struct dentry *, char *, int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; +Index: linux-2.4.21/include/linux/fs_struct.h +=================================================================== +--- linux-2.4.21.orig/include/linux/fs_struct.h 2001-07-13 18:10:44.000000000 -0400 ++++ linux-2.4.21/include/linux/fs_struct.h 2004-04-26 19:06:32.000000000 -0400 +@@ -34,10 +34,12 @@ + write_lock(&fs->lock); + old_root = fs->root; + old_rootmnt = fs->rootmnt; ++ PIN(dentry, mnt, 1); + fs->rootmnt = mntget(mnt); + fs->root = dget(dentry); + write_unlock(&fs->lock); + if (old_root) { ++ UNPIN(old_root, old_rootmnt, 1); + dput(old_root); + mntput(old_rootmnt); + } +@@ -57,10 +59,12 @@ + write_lock(&fs->lock); + old_pwd = fs->pwd; + old_pwdmnt = fs->pwdmnt; ++ PIN(dentry, mnt, 0); + fs->pwdmnt = mntget(mnt); + fs->pwd = dget(dentry); + write_unlock(&fs->lock); + if (old_pwd) { ++ UNPIN(old_pwd, old_pwdmnt, 0); + dput(old_pwd); + mntput(old_pwdmnt); + } +Index: linux-2.4.21/kernel/exit.c 
+=================================================================== +--- linux-2.4.21.orig/kernel/exit.c 2004-04-24 02:39:01.000000000 -0400 ++++ linux-2.4.21/kernel/exit.c 2004-04-26 19:06:32.000000000 -0400 +@@ -292,11 +292,14 @@ + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } +Index: linux-2.4.21/kernel/fork.c +=================================================================== +--- linux-2.4.21.orig/kernel/fork.c 2004-04-24 02:39:01.000000000 -0400 ++++ linux-2.4.21/kernel/fork.c 2004-04-26 19:06:32.000000000 -0400 +@@ -466,10 +466,13 @@ + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); ++ PIN(old->pwd, old->pwdmnt, 0); ++ PIN(old->root, old->rootmnt, 1); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { ++ PIN(old->altroot, old->altrootmnt, 1); + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { +Index: linux-2.4.21/kernel/ksyms.c +=================================================================== +--- linux-2.4.21.orig/kernel/ksyms.c 2004-04-26 18:56:44.000000000 -0400 ++++ linux-2.4.21/kernel/ksyms.c 2004-04-26 19:06:32.000000000 -0400 +@@ -329,6 +329,7 @@ + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch index ec3b64c..71b46e5 100644 --- 
a/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch @@ -309,11 +309,11 @@ Index: linux-2.4.21-x86_64/fs/namei.c + if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { + struct dentry *new; + err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); ++ MAY_EXEC); + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -332,8 +332,8 @@ Index: linux-2.4.21-x86_64/fs/namei.c dput(dentry); break; } -+ if (err) -+ intent_release(it); ++ if (err) ++ intent_release(it); path_release(nd); return_err: return err; @@ -938,7 +938,7 @@ Index: linux-2.4.21-x86_64/fs/namespace.c int retval = 0; int mnt_flags = 0; -@@ -725,10 +732,11 @@ +@@ -725,9 +732,11 @@ flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); /* ... and get the mountpoint */ @@ -948,11 +948,10 @@ Index: linux-2.4.21-x86_64/fs/namespace.c + if (retval) { + intent_release(&it); return retval; -- + } + if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); @@ -739,6 +747,8 @@ else retval = do_add_mount(&nd, type_page, flags, mnt_flags, @@ -1210,7 +1209,7 @@ Index: linux-2.4.21-x86_64/fs/open.c - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1669,7 +1668,7 @@ Index: linux-2.4.21-x86_64/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch index 29ad74f..7758b2c 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch @@ -297,11 +297,11 @@ + if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { + struct dentry *new; + err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); ++ MAY_EXEC); + if (err) + break; + new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); ++ &dentry->d_name, 0, it); + d_invalidate(dentry); + dput(dentry); + if (IS_ERR(new)) { @@ -607,12 +607,16 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1278,7 +1416,16 @@ asmlinkage long sys_mknod(const char * f +@@ -1278,7 +1416,20 @@ asmlinkage long sys_mknod(const char * f error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); + ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mknod_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mknod_raw(&nd, mode, dev); @@ -633,11 +637,15 @@ path_release(&nd); out: putname(tmp); -@@ -1346,7 +1494,14 @@ asmlinkage long sys_mkdir(const char * p +@@ -1346,7 +1494,20 @@ asmlinkage long sys_mkdir(const char * p error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->mkdir_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->mkdir_raw(&nd, mode); @@ -692,11 +700,15 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1581,15 +1752,23 @@ asmlinkage long sys_symlink(const char * +@@ -1581,15 +1752,27 @@ asmlinkage long sys_symlink(const char * error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->symlink_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->symlink_raw(&nd, from); @@ -718,11 +730,15 @@ putname(to); } putname(from); -@@ -1665,7 +1844,14 @@ asmlinkage long sys_link(const char * ol +@@ -1665,7 +1844,18 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); ++ if (nd.last_type != LAST_NORM) { ++ error = -EEXIST; ++ goto out2; ++ } + if (nd.dentry->d_inode->i_op->link_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; + error = op->link_raw(&old_nd, &nd); @@ -898,7 +914,7 @@ int retval = 0; int mnt_flags = 0; -@@ -725,10 +732,11 @@ long do_mount(char * dev_name, char * di +@@ -725,9 +732,11 @@ long do_mount(char * dev_name, char * di flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); /* ... 
and get the mountpoint */ @@ -908,11 +924,10 @@ + if (retval) { + intent_release(&it); return retval; -- + } + if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, - data_page); @@ -739,6 +747,8 @@ long do_mount(char * dev_name, char * di else retval = do_add_mount(&nd, type_page, flags, mnt_flags, @@ -1168,7 +1183,7 @@ - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1361,7 +1376,7 @@ */ --- linux-2.4.22-ac1/fs/stat.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:27.000000000 +0400 +++ linux-2.4.22-ac1-alexey/fs/stat.c 2003-09-25 14:42:46.000000000 +0400 -@@ -17,10 +17,14 @@ +@@ -17,10 +17,12 @@ * Revalidate the inode. This is required for proper NFS attribute caching. */ static __inline__ int @@ -1370,8 +1385,6 @@ { struct inode * inode = dentry->d_inode; - if (inode->i_op && inode->i_op->revalidate) -+ if (!inode) -+ return -ENOENT; + if (inode->i_op && inode->i_op->revalidate_it) + return inode->i_op->revalidate_it(dentry, it); + else if (inode->i_op && inode->i_op->revalidate) @@ -1621,7 +1634,7 @@ #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch index 9fcec3f..2bd3c6d 100644 --- a/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch @@ -82,20 +82,18 @@ Index: linux-2.6.4-51.0/fs/namei.c name = getname(pathname); if(IS_ERR(name)) -@@ -1716,6 +1737,16 @@ +@@ -1716,6 +1737,14 @@ error = -EBUSY; goto exit1; } -+ + if (nd.dentry->d_inode->i_op->rmdir_raw) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ ++ + error = op->rmdir_raw(&nd); + /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto exit1; + } -+ down(&nd.dentry->d_inode->i_sem); dentry = lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); diff --git a/lustre/kernel_patches/patches/xattr-0.8.54-2.4.22-rh.patch b/lustre/kernel_patches/patches/xattr-0.8.54-2.4.22-rh.patch index 9d6bc19..b221045 100644 --- a/lustre/kernel_patches/patches/xattr-0.8.54-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/xattr-0.8.54-2.4.22-rh.patch @@ -1716,7 +1716,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. */ + new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT2_I(inode)->i_file_acl != 0; @@ -1735,7 +1735,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); ++ (void)ext2_xattr_cache_insert(new_bh); + + ext2_xattr_update_super_block(sb); + } @@ -3402,7 +3402,7 @@ + } else if (old_bh && header == HDR(old_bh)) { + /* Keep this block. 
*/ + new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + } else { + /* We need to allocate a new block */ + int force = EXT3_I(inode)->i_file_acl != 0; @@ -3427,7 +3427,7 @@ + memcpy(new_bh->b_data, header, new_bh->b_size); + mark_buffer_uptodate(new_bh, 1); + unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); ++ (void)ext3_xattr_cache_insert(new_bh); + + ext3_xattr_update_super_block(handle, sb); + } diff --git a/lustre/kernel_patches/series/chaos-2.4.18 b/lustre/kernel_patches/series/chaos-2.4.18 index c69f42f..99cdf04 100644 --- a/lustre/kernel_patches/series/chaos-2.4.18 +++ b/lustre/kernel_patches/series/chaos-2.4.18 @@ -38,3 +38,4 @@ procfs-ndynamic-2.4.patch ext3-truncate-buffer-head.patch inode-max-readahead-2.4.24.patch dcache_refcount_debug.patch +mkdep-revert-rh-2.4.patch diff --git a/lustre/kernel_patches/series/chaos-2.4.18-pdirops b/lustre/kernel_patches/series/chaos-2.4.18-pdirops index c180a5b..456c2eb 100644 --- a/lustre/kernel_patches/series/chaos-2.4.18-pdirops +++ b/lustre/kernel_patches/series/chaos-2.4.18-pdirops @@ -33,3 +33,4 @@ ext3-extents-2.4.18-chaos-pdirops.patch nfs_export_kernel-2.4.18.patch ext3-raw-lookup-pdirops.patch ext3-truncate-buffer-head.patch +mkdep-revert-rh-2.4.patch diff --git a/lustre/kernel_patches/series/rh-2.4.20 b/lustre/kernel_patches/series/rh-2.4.20 index 06b2642..0ad83a9 100644 --- a/lustre/kernel_patches/series/rh-2.4.20 +++ b/lustre/kernel_patches/series/rh-2.4.20 @@ -45,3 +45,4 @@ ext3-xattr-ptr-arith-fix.patch procfs-ndynamic-2.4.patch ext3-truncate-buffer-head.patch inode-max-readahead-2.4.24.patch +mkdep-revert-rh-2.4.patch diff --git a/lustre/kernel_patches/series/suse-2.4.19 b/lustre/kernel_patches/series/suse-2.4.19 index 9905491..12eab3d 100644 --- a/lustre/kernel_patches/series/suse-2.4.19 +++ b/lustre/kernel_patches/series/suse-2.4.19 @@ -1,4 +1,4 @@ -dev_read_only_hp_2.4.20.patch +dev_read_only-suse-2.4.19.patch exports_2.4.19-suse.patch 
lustre_version.patch vfs_intent-2.4.19-suse.patch @@ -6,11 +6,10 @@ invalidate_show.patch export-truncate.patch iod-stock-24-exports-2.4.19-suse.patch jbd-2.4.18-jcberr.patch -ext-2.4-patch-1-chaos.patch +ext-2.4-patch-1-suse-2.4.19.patch ext-2.4-patch-2.patch ext-2.4-patch-3.patch ext-2.4-patch-4.patch -linux-2.4.20-xattr-0.8.54-hp.patch linux-2.4.19-xattr-0.8.54-suse.patch ext3-2.4-ino_t.patch ext3-largefile.patch @@ -32,5 +31,5 @@ jbd-flushtime-2.4.19-suse.patch jbd-get_write_access.patch ext3-ea-in-inode-2.4.20.patch listman-2.4.20.patch -ext3-trusted_ea-2.4.20.patch +ext3-trusted_ea-suse-2.4.19.patch ext3-truncate-buffer-head.patch diff --git a/lustre/kernel_patches/series/suse-sles8sp3-2.4.21 b/lustre/kernel_patches/series/suse-sles8sp3-2.4.21 new file mode 100644 index 0000000..5bc8822 --- /dev/null +++ b/lustre/kernel_patches/series/suse-sles8sp3-2.4.21 @@ -0,0 +1,31 @@ +configurable-x86-stack-2.4.21-sles8sp3.patch +dev_read_only_2.4.20-rh.patch +exports_2.4.20-rh-hp.patch +lustre_version.patch +vfs_intent-2.4.21-sles8sp3.patch +invalidate_show-2.4.21-sles8sp3.patch +iod-stock-24-exports_hp.patch +ext3-htree-2.4.21-chaos.patch +linux-2.4.21-xattr-0.8.54-suse2.patch +ext3-orphan_lock-2.4.22-rh.patch +ext3-noread-2.4.21-suse2.patch +ext3-delete_thread-2.4.21-chaos.patch +extN-wantedi-2.4.21-suse2.patch +ext3-san-2.4.20.patch +ext3-map_inode_page-2.4.21-suse2.patch +ext3-error-export.patch +iopen-2.4.21-sles8sp3.patch +tcp-zero-copy-2.4.21-sles8sp3.patch +jbd-dont-account-blocks-twice.patch +jbd-commit-tricks.patch +ext3-no-write-super-chaos.patch +add_page_private.patch +nfs_export_kernel-2.4.21-suse2.patch +ext3-raw-lookup.patch +ext3-ea-in-inode-2.4.21-suse2.patch +listman-2.4.20.patch +gfp_memalloc-2.4.24.patch +ext3-xattr-ptr-arith-fix.patch +kernel_text_address-2.4.21-sles8sp3.patch +ext3-truncate-buffer-head.patch +export-truncate.patch diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index f86c76d..93f3411 100644 --- 
a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -6,3 +6,4 @@ vanilla-2.4.20 linux-2.4.20 patch with uml-2.4.20-6 um chaos-2.4.20 linux-chaos-2.4.20 same as rh-2.4.20-8 i386 kgdb-2.5.73 linux-2.5.73 vanilla 2.5.73 with kgdb i386 bproc-2.4.20-hp-pnnl linux-2.4.20-hp4_pnnl9 hp-pnnl + bproc i386 +suse-2.4.19 SUSE ES 8 diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index bc174d8..7e75089 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -534,7 +534,7 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) { LASSERT(res != NULL); - LASSERT(res != (void *)0x5a5a5a5a); + LASSERT(res != LP_POISON); atomic_inc(&res->lr_refcount); CDEBUG(D_INFO, "getref res: %p count: %d\n", res, atomic_read(&res->lr_refcount)); @@ -550,7 +550,7 @@ int ldlm_resource_putref(struct ldlm_resource *res) CDEBUG(D_INFO, "putref res: %p count: %d\n", res, atomic_read(&res->lr_refcount) - 1); LASSERT(atomic_read(&res->lr_refcount) > 0); - LASSERT(atomic_read(&res->lr_refcount) < 0x5a5a5a5a); + LASSERT(atomic_read(&res->lr_refcount) < LI_POISON); if (atomic_dec_and_test(&res->lr_refcount)) { struct ldlm_namespace *ns = res->lr_namespace; diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index c31ea2f..f371650 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -83,6 +83,6 @@ $RANLIB $CWD/liblustre.a # create shared lib lustre rm -f $CWD/liblustre.so $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \ - $ALL_OBJS -lcap -lpthread + $ALL_OBJS -lpthread #rm -rf $sysio_tmp diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index a61e368..fbd199b 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -99,112 +98,18 @@ char 
*portals_nid2str(int nal, ptl_nid_t nid, char *str) return str; } -int in_group_p(gid_t gid) -{ - int i; - - if (gid == current->fsgid) - return 1; - - for (i = 0; i < current->ngroups; i++) { - if (gid == current->groups[i]) - return 1; - } - - return 0; -} - -static void init_capability(int *res) -{ - cap_value_t cap_types[] = { - CAP_CHOWN, - CAP_DAC_OVERRIDE, - CAP_DAC_READ_SEARCH, - CAP_FOWNER, - CAP_FSETID, - CAP_KILL, - CAP_SETGID, - CAP_SETUID, - /* following are linux specific, we could simply - * remove them I think */ - CAP_SETPCAP, - CAP_LINUX_IMMUTABLE, - CAP_NET_BIND_SERVICE, - CAP_NET_BROADCAST, - CAP_NET_ADMIN, - CAP_NET_RAW, - CAP_IPC_LOCK, - CAP_IPC_OWNER, - CAP_SYS_MODULE, - CAP_SYS_RAWIO, - CAP_SYS_CHROOT, - CAP_SYS_PTRACE, - CAP_SYS_PACCT, - CAP_SYS_ADMIN, - CAP_SYS_BOOT, - CAP_SYS_NICE, - CAP_SYS_RESOURCE, - CAP_SYS_TIME, - CAP_SYS_TTY_CONFIG, - CAP_MKNOD, - CAP_LEASE, - }; - cap_t syscap; - cap_flag_value_t capval; - int i; - - *res = 0; - - syscap = cap_get_proc(); - if (!syscap) { - printf("Liblustre: Warning: failed to get system capability, " - "set to minimal\n"); - return; - } - - for (i = 0; i < sizeof(cap_types)/sizeof(cap_t); i++) { - LASSERT(cap_types[i] < 32); - if (!cap_get_flag(syscap, cap_types[i], - CAP_EFFECTIVE, &capval)) { - if (capval == CAP_SET) { - *res |= 1 << cap_types[i]; - } - } - } -} - -static int init_current(char *comm) +void init_current(char *comm) { current = malloc(sizeof(*current)); - if (!current) { - CERROR("Not enough memory\n"); - return -ENOMEM; - } - current->fs = &current->__fs; + current->fs = malloc(sizeof(*current->fs)); current->fs->umask = umask(0777); umask(current->fs->umask); - strncpy(current->comm, comm, sizeof(current->comm)); current->pid = getpid(); - current->fsuid = geteuid(); - current->fsgid = getegid(); + current->fsuid = 0; + current->fsgid = 0; + current->cap_effective = -1; memset(&current->pending, 0, sizeof(current->pending)); - - current->max_groups = sysconf(_SC_NGROUPS_MAX); - 
current->groups = malloc(sizeof(gid_t) * current->max_groups); - if (!current->groups) { - CERROR("Not enough memory\n"); - return -ENOMEM; - } - current->ngroups = getgroups(current->max_groups, current->groups); - if (current->ngroups < 0) { - perror("Error getgroups"); - return -EINVAL; - } - - init_capability(&current->cap_effective); - - return 0; } /* FIXME */ @@ -298,8 +203,8 @@ int lllib_init(char *dumpfile) printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid); } - if (init_current("dummy") || - init_obdclass() || + init_current("dummy"); + if (init_obdclass() || init_lib_portals() || ptlrpc_init() || mdc_init() || @@ -426,6 +331,11 @@ out: RETURN(rc); } +static void sighandler_USR1(int signum) +{ + /* do nothing */ +} + /* parse host:/mdsname/profile string */ int ll_parse_mount_target(const char *target, char **mdsnid, char **mdsname, char **profile) @@ -488,6 +398,8 @@ void __liblustre_setup_(void) */ srand(time(NULL) + getpid()); + signal(SIGUSR1, sighandler_USR1); + lustre_path = getenv(ENV_LUSTRE_MNTPNT); if (!lustre_path) { lustre_path = "/mnt/lustre"; diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 6e596d2..0403ad5 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -319,11 +319,7 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, /* NB 1 request reference will be taken away by ll_intent_lock() * when I return - */ - /* FIXME: for CREAT, libsysio require the inode must be generated here - * currently here we don't know the whether the create is successful - * or failed on mds. thus blinded return -EPERM in llu_iget(). need - * a fix later. 
+ * Note: libsysio require the inode must be generated here */ if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) { struct lustre_md md; @@ -335,11 +331,11 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, RETURN(rc); inode = llu_iget(parent->i_fs, &md); - if (!inode || IS_ERR(inode)) { + if (!inode) { /* free the lsm if we allocated one above */ if (md.lsm != NULL) obd_free_memmd(sbi->ll_osc_exp, &md.lsm); - RETURN(inode ? PTR_ERR(inode) : -ENOMEM); + RETURN(-ENOMEM); } else if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm) { obd_free_memmd(sbi->ll_osc_exp, &md.lsm); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index de8cf3b..86048e6 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -31,7 +31,6 @@ #include #include #include -#include #ifndef __CYGWIN__ # include #else @@ -1291,11 +1290,8 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) if ((md->body->valid & (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) != - (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) { - /* FIXME this is workaround for for open(O_CREAT), - * see lookup_it_finish(). 
*/ - return ERR_PTR(-EPERM); - } + (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) + CERROR("invalide fields!\n"); /* try to find existing inode */ fid.id = md->body->ino; @@ -1494,7 +1490,7 @@ llu_fsswop_mount(const char *source, LASSERT(sbi->ll_rootino != 0); root = llu_iget(fs, &md); - if (!root || IS_ERR(root)) { + if (root == NULL) { CERROR("fail to generate root inode\n"); GOTO(out_request, err = -EBADF); } diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am index ff73edf..81e7058 100644 --- a/lustre/liblustre/tests/Makefile.am +++ b/lustre/liblustre/tests/Makefile.am @@ -4,7 +4,7 @@ AM_CPPFLAGS = -I$(SYSIO)/include -I/opt/lam/include $(LLCPPFLAGS) -I$(top_srcdir AM_CFLAGS = $(LLCFLAGS) LIBS = $(LIBEFENCE) $(LIBREADLINE) -LLIB_EXEC= ../liblustre.a -lcap -lpthread +LLIB_EXEC= ../liblustre.a -lpthread if LIBLUSTRE noinst_LIBRARIES = libtestcommon.a @@ -21,7 +21,7 @@ libtestcommon_a_SOURCES = test_common.c test_common.h echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c echo_test_CFLAGS = $(LL_CFLAGS) -echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lcap -lpthread +echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread echo_test_DEPENDENCIES=$(top_builddir)/liblustre/liblsupport.a sanity_SOURCES = sanity.c diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c index 48d0e6f..f2230ab 100644 --- a/lustre/liblustre/tests/echo_test.c +++ b/lustre/liblustre/tests/echo_test.c @@ -81,11 +81,6 @@ libcfs_nal_cmd(struct portals_cfg *pcfg) return 0; } -int in_group_p(gid_t gid) -{ - return 0; -} - int init_current(int argc, char **argv) { current = malloc(sizeof(*current)); diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 5307cb12..05f6573 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -201,7 +201,6 @@ Eend: fail: SetPageChecked(page); SetPageError(page); - LBUG(); } static struct page *ll_get_dir_page(struct inode *dir, unsigned long 
n) diff --git a/lustre/llite/file.c b/lustre/llite/file.c index f9d58f7..61bb36d 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -699,7 +699,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, struct lov_stripe_md *lsm = lli->lli_smd; struct lustre_handle lockh = { 0 }; ldlm_policy_data_t policy; - ldlm_error_t err; + int rc; ssize_t retval; __u64 kms; ENTRY; @@ -720,11 +720,11 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, policy.l_extent.start = *ppos; policy.l_extent.end = *ppos + count - 1; - err = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, - (filp->f_flags & O_NONBLOCK)?LDLM_FL_BLOCK_NOWAIT: - 0); - if (err != ELDLM_OK) - RETURN(err); + rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, + (filp->f_flags & O_NONBLOCK) ? + LDLM_FL_BLOCK_NOWAIT: 0); + if (rc != 0) + RETURN(rc); kms = lov_merge_size(lsm, 1); if (*ppos + count - 1 > kms) { @@ -767,9 +767,8 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, struct lustre_handle lockh = { 0 }; ldlm_policy_data_t policy; loff_t maxbytes = ll_file_maxbytes(inode); - ldlm_error_t err; ssize_t retval; - int nonblock = 0; + int nonblock = 0, rc; ENTRY; if (file->f_flags & O_NONBLOCK) nonblock = LDLM_FL_BLOCK_NOWAIT; @@ -797,9 +796,9 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, policy.l_extent.end = *ppos + count - 1; } - err = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, nonblock); - if (err != ELDLM_OK) - RETURN(err); + rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, nonblock); + if (rc != 0) + RETURN(rc); /* this is ok, g_f_w will overwrite this under i_sem if it races * with a local truncate, it just makes our maxbyte checking easier */ @@ -1018,8 +1017,7 @@ static int ll_get_grouplock(struct inode *inode, struct file *file, struct lustre_handle lockh = { 0 }; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = 
lli->lli_smd; - ldlm_error_t err; - int flags = 0; + int flags = 0, rc; ENTRY; if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { @@ -1030,9 +1028,9 @@ static int ll_get_grouplock(struct inode *inode, struct file *file, if (file->f_flags & O_NONBLOCK) flags = LDLM_FL_BLOCK_NOWAIT; - err = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags); - if (err) - RETURN(err); + rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags); + if (rc != 0) + RETURN(rc); fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK; fd->fd_gid = arg; @@ -1047,7 +1045,7 @@ static int ll_put_grouplock(struct inode *inode, struct file *file, struct ll_file_data *fd = file->private_data; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - ldlm_error_t err; + int rc; ENTRY; if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { @@ -1057,18 +1055,18 @@ static int ll_put_grouplock(struct inode *inode, struct file *file, if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? 
*/ RETURN(-EINVAL); - + fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); - err = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh); - if (err) - RETURN(err); + rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh); + if (rc) + RETURN(rc); fd->fd_gid = 0; memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh)); RETURN(0); -} +} int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) @@ -1145,17 +1143,16 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK); if (origin == 2) { /* SEEK_END */ - ldlm_error_t err; - int nonblock = 0; + int nonblock = 0, rc; ldlm_policy_data_t policy = { .l_extent = {0, OBD_OBJECT_EOF }}; if (file->f_flags & O_NONBLOCK) nonblock = LDLM_FL_BLOCK_NOWAIT; - err = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, + rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, nonblock); - if (err != ELDLM_OK) - RETURN(err); + if (rc != 0) + RETURN(rc); offset += inode->i_size; } else if (origin == 1) { /* SEEK_CUR */ diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 5ff6eb3..5031b20 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -10,27 +10,24 @@ #ifndef LLITE_INTERNAL_H #define LLITE_INTERNAL_H +/* default to about 40meg of readahead on a given system. That much tied + * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */ +#define SBI_DEFAULT_RA_MAX ((40 << 20) >> PAGE_CACHE_SHIFT) + struct ll_sb_info { + /* this protects pglist and max_r_a_pages. 
It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; struct obd_uuid ll_sb_uuid; -// struct lustre_handle ll_mdc_conn; struct obd_export *ll_mdc_exp; struct obd_export *ll_osc_exp; struct proc_dir_entry* ll_proc_root; obd_id ll_rootino; /* number of root inode */ - struct obd_uuid ll_mds_uuid; - struct obd_uuid ll_mds_peer_uuid; struct lustre_mount_data *ll_lmd; char *ll_instance; int ll_flags; - wait_queue_head_t ll_commitcbd_waitq; - wait_queue_head_t ll_commitcbd_ctl_waitq; - int ll_commitcbd_flags; - struct task_struct *ll_commitcbd_thread; - time_t ll_commitcbd_waketime; - time_t ll_commitcbd_timeout; - spinlock_t ll_commitcbd_lock; struct list_head ll_conn_chain; /* per-conn chain of SBs */ struct hlist_head ll_orphan_dentry_list; /*please don't ask -p*/ @@ -38,14 +35,20 @@ struct ll_sb_info { struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ - spinlock_t ll_pglist_lock; unsigned long ll_pglist_gen; struct list_head ll_pglist; + + unsigned long ll_read_ahead_pages; + unsigned long ll_max_read_ahead_pages; + }; struct ll_readahead_state { spinlock_t ras_lock; - unsigned long ras_last, ras_window, ras_next_index; + unsigned long ras_last_readpage, ras_consecutive; + unsigned long ras_window_start, ras_window_len; + unsigned long ras_next_readahead; + }; extern kmem_cache_t *ll_file_data_slab; @@ -149,7 +152,6 @@ int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa); void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc); void ll_removepage(struct page *page); -int ll_sync_page(struct page *page); int ll_readpage(struct file *file, struct page *page); struct ll_async_page *llap_from_cookie(void *cookie); struct ll_async_page *llap_from_page(struct page *page); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index bf8fb4c..4c68ca7 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -53,9 
+53,10 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) if (!sbi) RETURN(NULL); - spin_lock_init(&sbi->ll_pglist_lock); + spin_lock_init(&sbi->ll_lock); INIT_LIST_HEAD(&sbi->ll_pglist); sbi->ll_pglist_gen = 0; + sbi->ll_max_read_ahead_pages = SBI_DEFAULT_RA_MAX; INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); ll_s2sbi(sb) = sbi; @@ -905,7 +906,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh, ast_flags); down(&inode->i_sem); - if (rc != ELDLM_OK) + if (rc != 0) RETURN(rc); rc = vmtruncate(inode, attr->ia_size); diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 9f1987c..35676f2 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -161,14 +161,6 @@ static int ll_rd_filesfree(char *page, char **start, off_t off, int count, } -#if 0 -static int ll_rd_path(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -#endif - static int ll_rd_fstype(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -222,6 +214,41 @@ static int ll_wr_read_ahead(struct file *file, const char *buffer, RETURN(count); } +static int ll_rd_max_read_ahead_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned val; + + spin_lock(&sbi->ll_lock); + val = (sbi->ll_max_read_ahead_pages << PAGE_CACHE_SHIFT) >> 20; + spin_unlock(&sbi->ll_lock); + + return snprintf(page, count, "%u\n", val); +} + +static int ll_wr_max_read_ahead_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0 || val > (num_physpages << PAGE_SHIFT) >> 20) + return -ERANGE; + + 
spin_lock(&sbi->ll_lock); + sbi->ll_max_read_ahead_pages = (val << 20) >> PAGE_CACHE_SHIFT; + spin_unlock(&sbi->ll_lock); + + return count; +} + static struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", ll_rd_sb_uuid, 0, 0 }, //{ "mntpt_path", ll_rd_path, 0, 0 }, @@ -234,6 +261,8 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "filesfree", ll_rd_filesfree, 0, 0 }, //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, { "read_ahead", ll_rd_read_ahead, ll_wr_read_ahead, 0 }, + { "max_read_ahead_mb", ll_rd_max_read_ahead_mb, + ll_wr_max_read_ahead_mb, 0 }, { 0 } }; @@ -463,7 +492,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) return 0; } - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); if (llap != NULL) { @@ -486,7 +515,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "]\n"); } - spin_unlock(&sbi->ll_pglist_lock); + spin_unlock(&sbi->ll_lock); return 0; } @@ -516,14 +545,14 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, /* we've just displayed the llap that is after us in the list. * we advance to a position beyond it, returning null if there * isn't another llap in the list beyond that new position. 
*/ - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); list_del_init(&dummy_llap->llap_proc_item); if (llap) { list_add(&dummy_llap->llap_proc_item, &llap->llap_proc_item); llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); } - spin_unlock(&sbi->ll_pglist_lock); + spin_unlock(&sbi->ll_lock); ++*pos; if (llap == NULL) { @@ -578,9 +607,9 @@ static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) seq = file->private_data; seq->private = llap; - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); list_add(&llap->llap_proc_item, &sbi->ll_pglist); - spin_unlock(&sbi->ll_pglist_lock); + spin_unlock(&sbi->ll_lock); return 0; } @@ -592,10 +621,10 @@ static int llite_dump_pgcache_seq_release(struct inode *inode, struct ll_async_page *llap = seq->private; struct ll_sb_info *sbi = llap->llap_cookie; - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); if (!list_empty(&llap->llap_proc_item)) list_del_init(&llap->llap_proc_item); - spin_unlock(&sbi->ll_pglist_lock); + spin_unlock(&sbi->ll_lock); OBD_FREE(llap, sizeof(*llap)); return seq_release(inode, file); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 2e71def..18551bc 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -228,8 +228,7 @@ static int ll_ap_make_ready(void *data, int cmd) page = llap->llap_page; - if (cmd == OBD_BRW_READ) - RETURN(0); + LASSERT(cmd != OBD_BRW_READ); /* we're trying to write, but the page is locked.. 
come back later */ if (TryLockPage(page)) @@ -375,10 +374,10 @@ struct ll_async_page *llap_from_page(struct page *page) page->private = (unsigned long)llap; llap->llap_page = page; - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); sbi->ll_pglist_gen++; list_add_tail(&llap->llap_proc_item, &sbi->ll_pglist); - spin_unlock(&sbi->ll_pglist_lock); + spin_unlock(&sbi->ll_lock); RETURN(llap); } @@ -475,6 +474,29 @@ out: RETURN(rc); } +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len) +{ + unsigned long ret; + ENTRY; + + spin_lock(&sbi->ll_lock); + ret = min(sbi->ll_max_read_ahead_pages - sbi->ll_read_ahead_pages, + len); + sbi->ll_read_ahead_pages += ret; + spin_unlock(&sbi->ll_lock); + + RETURN(ret); +} + +static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + spin_lock(&sbi->ll_lock); + LASSERTF(sbi->ll_read_ahead_pages >= len, "r_a_p %lu len %lu\n", + sbi->ll_read_ahead_pages, len); + sbi->ll_read_ahead_pages -= len; + spin_unlock(&sbi->ll_lock); +} + /* called for each page in a completed rpc.*/ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) { @@ -493,6 +515,9 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc); + if (cmd == OBD_BRW_READ && llap->llap_defer_uptodate) + ll_ra_count_put(ll_i2sbi(page->mapping->host), 1); + if (rc == 0) { if (cmd == OBD_BRW_READ) { if (!llap->llap_defer_uptodate) @@ -507,7 +532,6 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) SetPageError(page); } - unlock_page(page); if (0 && cmd == OBD_BRW_WRITE) { @@ -568,11 +592,11 @@ void ll_removepage(struct page *page) * is providing exclusivity to memory pressure/truncate/writeback..*/ page->private = 0; - spin_lock(&sbi->ll_pglist_lock); + spin_lock(&sbi->ll_lock); if (!list_empty(&llap->llap_proc_item)) list_del_init(&llap->llap_proc_item); sbi->ll_pglist_gen++; - spin_unlock(&sbi->ll_pglist_lock); + 
spin_unlock(&sbi->ll_lock); OBD_FREE(llap, sizeof(*llap)); EXIT; } @@ -610,7 +634,8 @@ static int ll_issue_page_read(struct obd_export *exp, llap->llap_defer_uptodate = defer; rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd, NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0, - PAGE_SIZE, 0, ASYNC_COUNT_STABLE); + PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY + | ASYNC_URGENT); if (rc) { LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc); page_cache_release(page); @@ -618,171 +643,155 @@ static int ll_issue_page_read(struct obd_export *exp, RETURN(rc); } -#define LL_RA_MIN(inode) ((unsigned long)PTLRPC_MAX_BRW_PAGES / 2) -#define LL_RA_MAX(inode) ((ll_i2info(inode)->lli_smd->lsm_xfersize * 3) >> \ - PAGE_CACHE_SHIFT) +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, "lrp %lu c %lu ws %lu wl %lu nra %lu\n", \ + ras->ras_last_readpage, ras->ras_consecutive, \ + ras->ras_window_start, ras->ras_window_len, \ + ras->ras_next_readahead); -static void ll_readahead(struct ll_readahead_state *ras, +static int ll_readahead(struct ll_readahead_state *ras, struct obd_export *exp, struct address_space *mapping, struct obd_io_group *oig, int flags) { - unsigned long i, start, end; + unsigned long i, start = 0, end = 0, reserved; struct ll_async_page *llap; struct page *page; - int rc; + int rc, ret = 0; + __u64 kms; + ENTRY; - if (mapping->host->i_size == 0) - return; + kms = lov_merge_size(ll_i2info(mapping->host)->lli_smd, 1); + if (kms == 0) + RETURN(0); spin_lock(&ras->ras_lock); - /* make sure to issue a window's worth of read-ahead pages */ - end = ras->ras_last; - start = end - ras->ras_window; - if (start > end) - start = 0; - - /* but don't iterate over pages that we've already issued. 
this - * will set start to end + 1 if we've already read-ahead up to - * ras_last sothe for() won't be entered */ - if (ras->ras_next_index > start) - start = ras->ras_next_index; - if (end != ~0UL) - ras->ras_next_index = end + 1; + if (ras->ras_window_len) { + start = ras->ras_next_readahead; + end = ras->ras_window_start + ras->ras_window_len - 1; + end = min(end, (unsigned long)(kms >> PAGE_CACHE_SHIFT)); + ras->ras_next_readahead = max(end, end + 1); - CDEBUG(D_READA, "ni %lu last %lu win %lu: reading from %lu to %lu\n", - ras->ras_next_index, ras->ras_last, ras->ras_window, - start, end); + RAS_CDEBUG(ras); + } spin_unlock(&ras->ras_lock); - /* clamp to filesize */ - i = (mapping->host->i_size - 1) >> PAGE_CACHE_SHIFT; - end = min(end, i); + if (end == 0) + RETURN(0); + + reserved = ll_ra_count_get(ll_i2sbi(mapping->host), end - start + 1); - for (i = start; i <= end; i++) { - /* grab_cache_page_nowait returns null if this races with - * truncating the page (page->mapping == NULL) */ + for (i = start; reserved > 0 && i <= end; i++) { + /* skip locked pages from previous readpage calls */ page = grab_cache_page_nowait(mapping, i); - if (page == NULL) - break; + if (page == NULL) { + CDEBUG(D_READA, "g_c_p_n failed\n"); + continue; + } + + /* we do this first so that we can see the page in the /proc + * accounting */ + llap = llap_from_page(page); + if (IS_ERR(llap) || llap->llap_defer_uptodate) + goto next_page; - /* the book-keeping above promises that we've tried - * all the indices from start to end, so we don't - * stop if anyone returns an error. This may not be good. */ + /* skip completed pages */ if (Page_Uptodate(page)) goto next_page; + /* bail when we hit the end of the lock. 
*/ if ((rc = ll_page_matches(page, flags)) <= 0) { LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "lock match failed: rc %d\n", rc); + i = end; goto next_page; } - llap = llap_from_page(page); - if (IS_ERR(llap) || llap->llap_defer_uptodate) - goto next_page; - rc = ll_issue_page_read(exp, llap, oig, 1); - if (rc == 0) - LL_CDEBUG_PAGE(D_PAGE, page, "started read-ahead\n"); + if (rc == 0) { + reserved--; + ret++; + LL_CDEBUG_PAGE(D_READA| D_PAGE, page, + "started read-ahead\n"); + } if (rc) { next_page: - LL_CDEBUG_PAGE(D_PAGE, page, "skipping read-ahead\n"); + LL_CDEBUG_PAGE(D_READA | D_PAGE, page, + "skipping read-ahead\n"); unlock_page(page); } page_cache_release(page); } + + LASSERTF(reserved >= 0, "reserved %lu\n", reserved); + if (reserved != 0) + ll_ra_count_put(ll_i2sbi(mapping->host), reserved); + RETURN(ret); +} + +static void ras_set_start(struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = index & (~(PTLRPC_MAX_BRW_PAGES - 1)); + ras->ras_next_readahead = max(ras->ras_window_start, + ras->ras_next_readahead); } /* called with the ras_lock held or from places where it doesn't matter */ -static void ll_readahead_set(struct inode *inode, - struct ll_readahead_state *ras, - unsigned long index) +static void ras_reset(struct ll_readahead_state *ras, + unsigned long index) { - ras->ras_next_index = index; - if (ras->ras_next_index != ~0UL) - ras->ras_next_index++; - ras->ras_window = LL_RA_MIN(inode); - ras->ras_last = ras->ras_next_index + ras->ras_window; - if (ras->ras_last < ras->ras_next_index) - ras->ras_last = ~0UL; - CDEBUG(D_READA, "ni %lu last %lu win %lu: set %lu\n", - ras->ras_next_index, ras->ras_last, ras->ras_window, - index); + ras->ras_last_readpage = index; + ras->ras_consecutive = 1; + ras->ras_window_len = 0; + ras_set_start(ras, index); + ras->ras_next_readahead = ras->ras_window_start; + + RAS_CDEBUG(ras); } void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) { 
spin_lock_init(&ras->ras_lock); - ll_readahead_set(inode, ras, 0); + ras_reset(ras, 0); } -static void ll_readahead_update(struct inode *inode, - struct ll_readahead_state *ras, - unsigned long index, int hit) +static void ras_update(struct ll_readahead_state *ras, + unsigned long index, unsigned long max) { - unsigned long issued_start, new_last; + ENTRY; spin_lock(&ras->ras_lock); - /* we're interested in noticing the index's relation to the - * previously issued read-ahead pages */ - issued_start = ras->ras_next_index - ras->ras_window - 1; - if (issued_start > ras->ras_next_index) - issued_start = 0; - - CDEBUG(D_READA, "ni %lu last %lu win %lu: %s ind %lu start %lu\n", - ras->ras_next_index, ras->ras_last, ras->ras_window, - hit ? "hit" : "miss", index, issued_start); - if (!hit && - index == ras->ras_next_index && index == ras->ras_last + 1) { - /* special case the kernel's read-ahead running into the - * page just beyond our read-ahead window as an extension - * of our read-ahead. sigh. wishing it was easier to - * turn off 2.4's read-ahead. */ - ras->ras_window = min(LL_RA_MAX(inode), ras->ras_window + 1); - if (index != ~0UL) - ras->ras_next_index = index + 1; - ras->ras_last = index; - } else if (!hit && - (index > issued_start || ras->ras_next_index >= index)) { - /* deal with a miss way out of the window. we interpret - * this as a seek and restart the window */ - ll_readahead_set(inode, ras, index); - - } else if (!hit && - issued_start <= index && index < ras->ras_next_index) { - /* a miss inside the window? surely its memory pressure - * evicting our read pages before the app can see them. 
- * we shrink the window aggressively */ - unsigned long old_window = ras->ras_window; - - ras->ras_window = max(ras->ras_window / 2, LL_RA_MIN(inode)); - ras->ras_last -= old_window - ras->ras_window; - if (ras->ras_next_index > ras->ras_last) - ras->ras_next_index = ras->ras_last + 1; - CDEBUG(D_READA, "ni %lu last %lu win %lu: miss inside\n", - ras->ras_next_index, ras->ras_last, ras->ras_window); - - } else if (hit && - issued_start <= index && index < ras->ras_next_index) { - /* a hit inside the window. grow the window by twice the - * number of pages that are satisified within the window. */ - ras->ras_window = min(LL_RA_MAX(inode), ras->ras_window + 2); - - /* we want the next readahead pass to issue a windows worth - * beyond where the app currently is */ - new_last = index + ras->ras_window; - if (new_last > ras->ras_last) - ras->ras_last = new_last; - - CDEBUG(D_READA, "ni %lu last %lu win %lu: extended window/last\n", - ras->ras_next_index, ras->ras_last, ras->ras_window); + if (index != ras->ras_last_readpage + 1) { + ras_reset(ras, index); + GOTO(out_unlock, 0); } + ras->ras_last_readpage = index; + ras->ras_consecutive++; + ras_set_start(ras, index); + + if (ras->ras_consecutive == 2) { + ras->ras_window_len = PTLRPC_MAX_BRW_PAGES; + GOTO(out_unlock, 0); + } + + /* we need to increase the window sometimes. 
we'll arbitrarily + * do it half-way through the pages in an rpc */ + if ((index & (PTLRPC_MAX_BRW_PAGES - 1)) == + (PTLRPC_MAX_BRW_PAGES >> 1)) { + ras->ras_window_len += PTLRPC_MAX_BRW_PAGES; + ras->ras_window_len = min(ras->ras_window_len, max); + } + + EXIT; +out_unlock: + RAS_CDEBUG(ras); spin_unlock(&ras->ras_lock); + return; } /* @@ -823,19 +832,22 @@ int ll_readpage(struct file *filp, struct page *page) if (IS_ERR(llap)) GOTO(out, rc = PTR_ERR(llap)); + if (ll_i2sbi(inode)->ll_flags & LL_SBI_READAHEAD) + ras_update(&fd->fd_ras, page->index, + ll_i2sbi(inode)->ll_max_read_ahead_pages); + if (llap->llap_defer_uptodate) { - ll_readahead_update(inode, &fd->fd_ras, page->index, 1); - ll_readahead(&fd->fd_ras, exp, page->mapping, oig,fd->fd_flags); - obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, - oig); + rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig, + fd->fd_flags); + if (rc > 0) + obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, + NULL, oig); LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n"); SetPageUptodate(page); unlock_page(page); GOTO(out_oig, rc = 0); } - ll_readahead_update(inode, &fd->fd_ras, page->index, 0); - rc = ll_page_matches(page, fd->fd_flags); if (rc < 0) { LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc); @@ -861,8 +873,9 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out, rc); LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n"); - if ((ll_i2sbi(inode)->ll_flags & LL_SBI_READAHEAD)) - ll_readahead(&fd->fd_ras, exp, page->mapping, oig,fd->fd_flags); + if (ll_i2sbi(inode)->ll_flags & LL_SBI_READAHEAD) + ll_readahead(&fd->fd_ras, exp, page->mapping, oig, + fd->fd_flags); rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig); @@ -874,38 +887,3 @@ out_oig: oig_release(oig); RETURN(rc); } - -#if 0 -/* this is for read pages. we issue them as ready but not urgent. 
when - * someone waits on them we fire them off, hopefully merged with adjacent - * reads that were queued by read-ahead. */ -int ll_sync_page(struct page *page) -{ - struct obd_export *exp; - struct ll_async_page *llap; - int rc; - ENTRY; - - /* we're using a low bit flag to signify that a queued read should - * be issued once someone goes to lock it. it is also cleared - * as the page is built into an RPC */ - if (!test_and_clear_bit(LL_PRIVBITS_READ, &page->private)) - RETURN(0); - - /* careful to only deref page->mapping after checking the bit */ - exp = ll_i2obdexp(page->mapping->host); - if (exp == NULL) - RETURN(-EINVAL); - - llap = llap_from_page(page); - if (IS_ERR(llap)) - RETURN(PTR_ERR(llap)); - - LL_CDEBUG_PAGE(D_PAGE, page, "setting ready|urgent\n"); - - rc = obd_set_async_flags(exp, ll_i2info(page->mapping->host)->lli_smd, - NULL, llap->llap_cookie, - ASYNC_READY|ASYNC_URGENT); - return rc; -} -#endif diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 8f7b3b6..a2bbab9 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -346,6 +346,9 @@ int mdc_enqueue(struct obd_export *exp, spin_unlock(&req->rq_lock); } + DEBUG_REQ(D_RPCTRACE, req, "disposition: %x, status: %d", + it->d.lustre.it_disposition, it->d.lustre.it_status); + /* We know what to expect, so we do any byte flipping required here */ LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1); if (reply_buffers >= 3) { diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index afa8ec0..2cb67a2 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -366,7 +366,7 @@ void mdc_clear_open_replay_data(struct obd_client_handle *och) * we're sure we won't need to fix up the close request in the future), * but make sure that replay doesn't poke at the och, which is about to * be freed. 
*/ - LASSERT(mod != (void *)0x5a5a5a5a); + LASSERT(mod != LP_POISON); if (mod != NULL) mod->mod_och = NULL; och->och_mod = NULL; @@ -388,7 +388,8 @@ static void mdc_commit_close(struct ptlrpc_request *req) open_req = mod->mod_open_req; LASSERT(open_req != NULL); - LASSERT(open_req != (void *)0x5a5a5a5a); + LASSERT(open_req != LP_POISON); + LASSERT(open_req->rq_type != LI_POISON); DEBUG_REQ(D_HA, open_req, "open req balanced"); LASSERT(open_req->rq_transno != 0); @@ -465,6 +466,7 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo, mod = och->och_mod; if (likely(mod != NULL)) { mod->mod_close_req = req; + LASSERT(mod->mod_open_req->rq_type != LI_POISON); DEBUG_REQ(D_HA, mod->mod_open_req, "matched open req %p", mod->mod_open_req); } else { @@ -499,7 +501,7 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo, if (req->rq_repmsg == NULL) { CDEBUG(D_HA, "request failed to send: %p, %d\n", req, req->rq_status); - rc = req->rq_status; + rc = req->rq_status ? req->rq_status : -EIO; } else if (rc == 0) { rc = req->rq_repmsg->status; if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) { diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index d62f50e..8954cb5 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -981,6 +981,7 @@ int mds_open(struct mds_update_record *rec, int offset, GOTO(cleanup, rc = -EISDIR); } if (ll_permission(dchild->d_inode, acc_mode, NULL)) { + intent_set_disposition(rep, DISP_OPEN_OPEN); GOTO(cleanup, rc = -EACCES); } } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 3978cb82..43be2ae 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -647,7 +647,7 @@ static void cleanup_obdclass(void) * kernel patch */ #include #define LUSTRE_MIN_VERSION 28 -#define LUSTRE_MAX_VERSION 34 +#define LUSTRE_MAX_VERSION 35 #if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION) # error Cannot continue: Your Lustre kernel patch is older than the sources #elif (LUSTRE_KERNEL_VERSION > 
LUSTRE_MAX_VERSION) diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index aee57a7..8df70b3 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -537,7 +537,7 @@ struct obd_import *class_new_import(void) void class_destroy_import(struct obd_import *import) { LASSERT(import != NULL); - LASSERT((unsigned long)import != 0x5a5a5a5a); + LASSERT(import != LP_POISON); class_handle_unhash(&import->imp_handle); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index e6471c8..67935cb 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -31,7 +31,6 @@ #include #else #include -#include #endif #include diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 4809e22..2ab2652 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -320,8 +320,8 @@ int filter_update_last_objid(struct obd_device *obd, obd_gr group, int rc; ENTRY; - CDEBUG(D_INODE, "server last_objid for group "LPU64": "LPU64"\n", - group, filter->fo_last_objids[group]); + CDEBUG(D_INODE, "%s: server last_objid for group "LPU64": "LPU64"\n", + obd->obd_name, group, filter->fo_last_objids[group]); tmp = cpu_to_le64(filter->fo_last_objids[group]); rc = fsfilt_write_record(obd, filter->fo_last_objid_files[group], @@ -696,7 +696,7 @@ static int filter_prep_groups(struct obd_device *obd) } filter->fo_last_objids[i] = le64_to_cpu(filter->fo_last_objids[i]); - CDEBUG(D_INODE, "%s: server last_objid group %d: "LPU64"\n", + CDEBUG(D_HA, "%s: server last_objid group %d: "LPU64"\n", obd->obd_name, i, filter->fo_last_objids[i]); } @@ -1755,12 +1755,16 @@ static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, doa.o_mode = S_IFREG; last = filter_last_id(filter, &doa); - CWARN("deleting orphan objects from "LPU64" to "LPU64"\n", - oa->o_id + 1, last); + CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"\n", + exp->exp_obd->obd_name, oa->o_id + 1, last); for (id = oa->o_id + 1; id 
<= last; id++) { doa.o_id = id; filter_destroy(exp, &doa, NULL, NULL); } + + CDEBUG(D_HA, "%s: after destroy: set last_objids["LPU64"] = "LPU64"\n", + exp->exp_obd->obd_name, doa.o_gr, oa->o_id); + spin_lock(&filter->fo_objidlock); filter->fo_last_objids[doa.o_gr] = oa->o_id; spin_unlock(&filter->fo_objidlock); @@ -1836,6 +1840,8 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, recreate_obj = 1; } + CDEBUG(D_HA, "%s: precreating %d objects\n", obd->obd_name, *num); + for (i = 0; i < *num && err == 0; i++) { int cleanup_phase = 0; @@ -1870,13 +1876,17 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, * already exists */ if (recreate_obj) { - CERROR("Serious error: recreating obj %*s but " - "obj already exists \n", - dchild->d_name.len, dchild->d_name.name); + CERROR("%s: Serious error: recreating obj %*s " + "but obj already exists \n", + obd->obd_name, dchild->d_name.len, + dchild->d_name.name); + LBUG(); } else { - CERROR("Serious error: objid %*s already " + CERROR("%s: Serious error: objid %*s already " "exists; is this filesystem corrupt?\n", - dchild->d_name.len, dchild->d_name.name); + obd->obd_name, dchild->d_name.len, + dchild->d_name.name); + LBUG(); } GOTO(cleanup, rc = -EEXIST); } @@ -1923,7 +1933,11 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, } *num = i; - CDEBUG(D_INFO, "filter_precreate() created %d objects\n", i); + CDEBUG(D_HA, "%s: server last_objid for group "LPU64": "LPU64"\n", + obd->obd_name, group, filter->fo_last_objids[group]); + + CDEBUG(D_HA, "%s: filter_precreate() created %d objects\n", + obd->obd_name, i); RETURN(rc); } diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index b9b7ab3..ccbbc74 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -628,7 +628,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, i++, lnb++, rnb++) { /* We still set up for ungranted pages so that granted 
pages * can be written to disk as they were promised, and portals - * needs to keep the pages all aligned properly. */ + * needs to keep the pages all aligned properly. */ lnb->dentry = dentry; lnb->offset = rnb->offset; lnb->len = rnb->len; diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 88b4d2a..078d0d1 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -108,14 +108,13 @@ int osc_rd_max_dirty_mb(char *page, char **start, off_t off, int count, { struct obd_device *dev = data; struct client_obd *cli = &dev->u.cli; - int val; - int rc; + unsigned val; spin_lock(&cli->cl_loi_list_lock); val = cli->cl_dirty_max >> 20; - rc = snprintf(page, count, "%d\n", val); spin_unlock(&cli->cl_loi_list_lock); - return rc; + + return snprintf(page, count, "%u\n", val); } int osc_wr_max_dirty_mb(struct file *file, const char *buffer, diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index d4db2c7..50b4d12 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -87,7 +87,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, oscc->oscc_flags &= ~OSCC_FLAG_CREATING; spin_unlock(&oscc->oscc_lock); - CDEBUG(D_INFO, "preallocated through id "LPU64" (last used "LPU64")\n", + CDEBUG(D_HA, "preallocated through id "LPU64" (last used "LPU64")\n", oscc->oscc_last_id, oscc->oscc_next_id); wake_up(&oscc->oscc_waitq); @@ -102,7 +102,8 @@ static int oscc_internal_create(struct osc_creator *oscc) ENTRY; spin_lock(&oscc->oscc_lock); - if (oscc->oscc_flags & OSCC_FLAG_CREATING) { + if (oscc->oscc_flags & OSCC_FLAG_CREATING || + oscc->oscc_flags & OSCC_FLAG_RECOVERING) { spin_unlock(&oscc->oscc_lock); RETURN(0); } @@ -124,7 +125,7 @@ static int oscc_internal_create(struct osc_creator *oscc) spin_lock(&oscc->oscc_lock); body->oa.o_id = oscc->oscc_last_id + oscc->oscc_grow_count; body->oa.o_valid |= OBD_MD_FLID; - CDEBUG(D_INFO, "preallocating through id "LPU64" (last used "LPU64")\n", + CDEBUG(D_HA, "preallocating 
through id "LPU64" (last used "LPU64")\n", body->oa.o_id, oscc->oscc_next_id); spin_unlock(&oscc->oscc_lock); @@ -235,6 +236,10 @@ int osc_create(struct obd_export *exp, struct obdo *oa, oa->o_valid |= OBD_MD_FLID; oa->o_id = oscc->oscc_next_id - 1; + CDEBUG(D_HA, "%s: deleting to next_id: "LPU64"\n", + oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid, + oa->o_id); + rc = osc_real_create(exp, oa, ea, NULL); spin_lock(&oscc->oscc_lock); @@ -250,26 +255,28 @@ int osc_create(struct obd_export *exp, struct obdo *oa, RETURN(rc); } - /* If orphans are being recovered, then we must wait until it is - finished before we can continue with create. */ - if (oscc_recovering(oscc)) { - struct l_wait_info lwi; - - CDEBUG(D_HA, "%p: oscc recovery in progress, waiting\n", oscc); - - lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL); - rc = l_wait_event(oscc->oscc_waitq, !oscc_recovering(oscc), - &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); - if (rc == -ETIMEDOUT) { - CDEBUG(D_HA, "%p: timed out waiting for recovery\n", oscc); - RETURN(rc); - } - CDEBUG(D_HA, "%p: oscc recovery over, waking up\n", oscc); - } - - while (try_again) { + /* If orphans are being recovered, then we must wait until + it is finished before we can continue with create. 
*/ + if (oscc_recovering(oscc)) { + struct l_wait_info lwi; + + CDEBUG(D_HA,"%p: oscc recovery in progress, waiting\n", + oscc); + + lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL); + rc = l_wait_event(oscc->oscc_waitq, + !oscc_recovering(oscc), &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + if (rc == -ETIMEDOUT) { + CDEBUG(D_HA, "%p: timed out waiting for " + "recovery\n", oscc); + RETURN(rc); + } + CDEBUG(D_HA, "%p: oscc recovery over, waking up\n", + oscc); + } + spin_lock(&oscc->oscc_lock); if (oscc->oscc_last_id >= oscc->oscc_next_id) { memcpy(oa, &oscc->oscc_oa, sizeof(*oa)); @@ -290,7 +297,9 @@ int osc_create(struct obd_export *exp, struct obdo *oa, } if (rc == 0) - CDEBUG(D_INFO, "returning objid "LPU64"\n", lsm->lsm_object_id); + CDEBUG(D_HA, "%s: returning objid "LPU64"\n", + oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid, + lsm->lsm_object_id); else if (*ea == NULL) obd_free_memmd(exp, &lsm); RETURN(rc); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 85f33d5..458529d 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2739,7 +2739,7 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen, if (vallen != sizeof(obd_id)) RETURN(-EINVAL); obd->u.cli.cl_oscc.oscc_next_id = *((obd_id*)val) + 1; - CDEBUG(D_INODE, "%s: set oscc_next_id = "LPU64"\n", + CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n", exp->exp_obd->obd_name, obd->u.cli.cl_oscc.oscc_next_id); diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index 181594f..8a56b55 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -689,27 +689,30 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # endif #endif +#ifndef LP_POISON +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) +#endif + #if defined(__x86_64__) # define LPU64 "%Lu" # define 
LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" -# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" -# define LP_POISON ((void *)0x5a5a5a5a) #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" -# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #endif #ifndef LPU64 # error "No word size defined" diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index d17540c..e78176b 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -168,7 +168,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) ENTRY; LASSERT(desc != NULL); - LASSERT(desc->bd_iov_count != 0x5a5a5a5a); /* not freed already */ + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ LASSERT(!desc->bd_network_rw); /* network hands off or */ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); if (desc->bd_export) diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index c6d5c84..bc137f37 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -81,7 +81,8 @@ static int ptlrpc_pinger_main(void *arg) SIGNAL_MASK_UNLOCK(current, flags); LASSERTF(strlen(data->name) < sizeof(current->comm), - "name %d > len %d\n",strlen(data->name),sizeof(current->comm)); + "name %d > len %d\n", + (int)strlen(data->name), (int)sizeof(current->comm)); THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name); unlock_kernel(); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 8db67c7..14c9d60 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -275,8 +275,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) /* Wait for recovery to complete and resend. 
If evicted, then this request will be errored out later.*/ spin_lock_irqsave(&failed_req->rq_lock, flags); - if (!failed_req->rq_no_resend) - failed_req->rq_resend = 1; + failed_req->rq_resend = 1; spin_unlock_irqrestore(&failed_req->rq_lock, flags); EXIT; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index bd0bb45..8a7474f 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -668,7 +668,8 @@ static int ptlrpc_main(void *arg) SIGNAL_MASK_UNLOCK(current, flags); LASSERTF(strlen(data->name) < sizeof(current->comm), - "name %d > len %d\n",strlen(data->name),sizeof(current->comm)); + "name %d > len %d\n", + (int)strlen(data->name), (int)sizeof(current->comm)); THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name); unlock_kernel(); diff --git a/lustre/scripts/version_tag.pl.in b/lustre/scripts/version_tag.pl.in index 1212441..0e31be6 100644 --- a/lustre/scripts/version_tag.pl.in +++ b/lustre/scripts/version_tag.pl.in @@ -18,7 +18,7 @@ sub get_tag() my $tagfile = new IO::File; if (!$tagfile->open("CVS/Tag")) { my $verfile = new IO::File; - if (!$verfile->open("portals/include/config.h")) { + if (!$verfile->open("include/config.h")) { return "UNKNOWN"; } while(defined($line = <$verfile>)) { diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index f10da1d..1387cf3 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -40,7 +40,7 @@ for NAME in $CONFIGS; do if [ "$DBENCH" != "no" ]; then mount | grep $MOUNT || sh llmount.sh - SPACE=`df $MOUNT | tail -1 | awk '{ print $4 }'` + SPACE=`df $MOUNT | tail -n 1 | awk '{ print $4 }'` DB_THREADS=`expr $SPACE / 50000` [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS @@ -82,7 +82,7 @@ for NAME in $CONFIGS; do fi if [ "$IOZONE_DIR" != "no" ]; then mount | grep $MOUNT || sh llmount.sh - SPACE=`df $MOUNT | tail -1 | awk '{ print $4 }'` + SPACE=`df $MOUNT | tail -n 1 | awk '{ print $4 }'` IOZ_THREADS=`expr $SPACE / \( 
$SIZE + $SIZE / 512 \)` [ $THREADS -lt $IOZ_THREADS ] && IOZ_THREADS=$THREADS diff --git a/lustre/tests/cfg/insanity-local.sh b/lustre/tests/cfg/insanity-local.sh index c6de54c..2ca1485 100644 --- a/lustre/tests/cfg/insanity-local.sh +++ b/lustre/tests/cfg/insanity-local.sh @@ -10,8 +10,8 @@ FAIL_CLIENTS=${FAIL_CLIENTS:-""} NETTYPE=${NETTYPE:-tcp} TIMEOUT=${TIMEOUT:-30} -PTLDEBUG=${PTLDEBUG:-0} -SUBSYSTEM=${SUBSYSTEM:-0} +PTLDEBUG=${PTLDEBUG:-0x3f0400} +SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} MOUNT=${MOUNT:-"/mnt/lustre"} #CLIENT_UPCALL=${CLIENT_UPCALL:-`pwd`/client-upcall-mdev.sh} UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh} diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 7ad2c1c..68d0ff9 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -137,7 +137,7 @@ setup() { wait_for mds start mds $MDSLCONFARGS ${REFORMAT} while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done - zconf_mount $CLIENTS $MOUNT + grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT } @@ -156,7 +156,7 @@ client_touch() { file=$1 for c in $LIVE_CLIENT $FAIL_CLIENTS; do if echo $DOWN_CLIENTS | grep -q $c; then continue; fi - $PDSH $c touch $MOUNT/${c}_$file + $PDSH $c touch $MOUNT/${c}_$file || return 1 done } diff --git a/lustre/tests/llmountcleanup.sh b/lustre/tests/llmountcleanup.sh index 15277d6..35eca9b 100755 --- a/lustre/tests/llmountcleanup.sh +++ b/lustre/tests/llmountcleanup.sh @@ -39,8 +39,8 @@ if [ "$BUSY" ]; then mv $TMP/debug $TMP/debug-busy.`date +%s` exit 255 fi -LEAK_LUSTRE=`dmesg | tail -30 | grep "obd mem.*leaked"` -LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"` +LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked"` +LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked"` if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then echo "$LEAK_LUSTRE" 1>&2 echo "$LEAK_PORTALS" 1>&2 diff --git a/lustre/tests/multiop.c b/lustre/tests/multiop.c index c8ac394..7ab0208 100755 --- 
a/lustre/tests/multiop.c +++ b/lustre/tests/multiop.c @@ -210,14 +210,6 @@ int main(int argc, char **argv) perror("write"); exit(1); } - /* b=3043 write() on Suse x86-64 is returning -errno - instead of -1, and not setting errno. */ - if (rc < 0) { - fprintf(stderr, "MULTIOP: broken write() " - "returned %d, errno %d\n", - rc, errno); - exit(1); - } break; } case 'W': diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 4e6b261..5894c4f 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -22,8 +22,8 @@ sleep 1 # to ensure we get up-to-date statfs info #lctl clear #lctl debug_daemon start /r/tmp/debug 1024 -STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -1` -ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -1` +STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1` +ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1` MAXFREE=${MAXFREE:-$((200000 * $STRIPECOUNT))} if [ $ORIGFREE -gt $MAXFREE ]; then echo "skipping out-of-space test on $OSC" diff --git a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh index c9755cb..84c87f1 100644 --- a/lustre/tests/oos2.sh +++ b/lustre/tests/oos2.sh @@ -20,8 +20,8 @@ rm -f $OOS $OOS2 $LOG $LOG2 sleep 1 # to ensure we get up-to-date statfs info -STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -1` -ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -1` +STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1` +ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1` MAXFREE=${MAXFREE:-$((200000 * $STRIPECOUNT))} if [ $ORIGFREE -gt $MAXFREE ]; then echo "skipping out-of-space test on $OSC" diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index d0dd6f0..47d77ef 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -39,7 +39,7 @@ setup() { start ost2 --reformat $OSTLCONFARGS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE start mds $MDSLCONFARGS --reformat - zconf_mount `hostname` $MOUNT 
+ grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT } cleanup() { @@ -218,7 +218,7 @@ test_16() { do_facet client "cmp /etc/termcap $MOUNT/termcap" && return 1 sysctl -w lustre.fail_loc=0 # give recovery a chance to finish (shouldn't take long) - sleep 1 + sleep $TIMEOUT do_facet client "cmp /etc/termcap $MOUNT/termcap" || return 2 } run_test 16 "timeout bulk put, evict client (2732)" diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 8e10631..9c1f1e1 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -62,8 +62,8 @@ fi start ost2 --reformat $OSTLCONFARGS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE start mds $MDSLCONFARGS --reformat -zconf_mount `hostname` $MOUNT -zconf_mount `hostname` $MOUNT2 +grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT +grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2 echo $TIMEOUT > /proc/sys/lustre/timeout echo $UPCALL > /proc/sys/lustre/upcall diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 4331be5..f1523bb 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -56,7 +56,7 @@ setup() { start ost --reformat $OSTLCONFARGS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE start mds --reformat $MDSLCONFARGS - zconf_mount `hostname` $MOUNT + grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT } mkdir -p $DIR diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index fbf8d19..76ce388 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -62,8 +62,7 @@ setup() { start ost2 --reformat $OSTLCONFARGS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE start mds $MDSLCONFARGS --reformat - zconf_mount `hostname` $MOUNT - echo 0x3f0410 > /proc/sys/portals/debug + grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT } $SETUP @@ -80,6 +79,15 @@ test_0() { } 
run_test 0 "empty replay" +test_0b() { + # this test attempts to trigger a race in the precreation code, + # and must run before any other objects are created on the filesystem + fail ost + createmany -o $DIR/$tfile 20 || return 1 + unlinkmany $DIR/$tfile 20 || return 2 +} +run_test 0b "ensure object created after recover exists. (3284)" + test_1() { replay_barrier mds mcreate $DIR/$tfile @@ -825,14 +833,14 @@ run_test 41 "read from a valid osc while other oscs are invalid" # test MDS recovery after ost failure test_42() { - blocks=`df $MOUNT | tail -1 | awk '{ print $1 }'` + blocks=`df $MOUNT | tail -n 1 | awk '{ print $1 }'` createmany -o $DIR/$tfile-%d 800 replay_barrier ost unlinkmany $DIR/$tfile-%d 0 400 facet_failover ost # osc is evicted, fs is smaller - blocks_after=`df $MOUNT | tail -1 | awk '{ print $1 }'` + blocks_after=`df $MOUNT | tail -n 1 | awk '{ print $1 }'` [ $blocks_after -lt $blocks ] || return 1 echo wait for MDS to timeout and recover sleep $((TIMEOUT * 2)) diff --git a/lustre/tests/run-llog.sh b/lustre/tests/run-llog.sh index 5d46e2b..b7201f2 100644 --- a/lustre/tests/run-llog.sh +++ b/lustre/tests/run-llog.sh @@ -2,7 +2,7 @@ PATH=`dirname $0`:`dirname $0`/../utils:$PATH TMP=${TMP:-/tmp} -MDS=`ls /proc/fs/lustre/mds | grep -v num_refs | head -1` +MDS=`ls /proc/fs/lustre/mds | grep -v num_refs | head -n 1` [ -z "$MDS" ] && echo "no MDS available, skipping llog test" && exit 0 insmod ../obdclass/llog_test.o || exit 1 diff --git a/lustre/tests/runas.c b/lustre/tests/runas.c index f1df775..91c9c7f 100644 --- a/lustre/tests/runas.c +++ b/lustre/tests/runas.c @@ -6,17 +6,22 @@ #include #include #include +#include #include #include #include #define DEBUG 0 +#ifndef NGROUPS_MAX +#define NGROUPS_MAX 32 +#endif + static const char usage[] = "Usage: %s -u user_id [-g grp_id ] [ -G ] command\n" -" -u user_id switch to UID user_id\n" -" -g grp_id switch to GID grp_id\n" -" -G clear supplementary groups\n"; +" -u user_id switch to UID user_id\n" +" -g 
grp_id switch to GID grp_id\n" +" -G[gid0,gid1,...] set supplementary groups\n"; void Usage_and_abort(const char *name) { @@ -26,20 +31,17 @@ void Usage_and_abort(const char *name) int main(int argc, char **argv) { - char **my_argv, *name = argv[0]; - int status; - int c,i; - int gid_is_set = 0; - int uid_is_set = 0; - int clear_supp_groups = 0; + char **my_argv, *name = argv[0], *grp; + int status, c, i; + int gid_is_set = 0, uid_is_set = 0, num_supp = -1; uid_t user_id = 0; - gid_t grp_id = 0; + gid_t grp_id = 0, supp_groups[NGROUPS_MAX] = { 0 }; if (argc == 1) Usage_and_abort(name); // get UID and GID - while ((c = getopt (argc, argv, "+u:g:hG")) != -1) { + while ((c = getopt(argc, argv, "+u:g:hG::")) != -1) { switch (c) { case 'u': user_id = (uid_t)atoi(optarg); @@ -54,7 +56,15 @@ int main(int argc, char **argv) break; case 'G': - clear_supp_groups = 1; + num_supp = 0; + if (optarg == NULL || !isdigit(optarg[0])) + break; + while ((grp = strsep(&optarg, ",")) != NULL) { + printf("adding supp group %d\n", atoi(grp)); + supp_groups[num_supp++] = atoi(grp); + if (num_supp >= NGROUPS_MAX) + break; + } break; default: @@ -98,14 +108,14 @@ int main(int argc, char **argv) exit(-1); } - if (clear_supp_groups) { - status = setgroups(0, NULL); + if (num_supp >= 0) { + status = setgroups(num_supp, supp_groups); if (status == -1) { - perror("clearing supplementary groups"); + perror("setting supplementary groups"); exit(-1); } } - + // set UID status = setreuid(user_id, user_id ); if(status == -1) { @@ -114,8 +124,10 @@ int main(int argc, char **argv) exit(-1); } - fprintf(stderr, "running as UID %d, GID %d%s:", user_id, grp_id, - clear_supp_groups ? 
", cleared groups" : ""); + fprintf(stderr, "running as UID %d, GID %d", user_id, grp_id); + for (i = 0; i < num_supp; i++) + fprintf(stderr, ":%d", supp_groups[i]); + fprintf(stderr, "\n"); for (i = 0; i < argc - optind; i++) fprintf(stderr, " [%s]", my_argv[i]); diff --git a/lustre/tests/runregression-brw.sh b/lustre/tests/runregression-brw.sh index 34a55b0..666b253 100644 --- a/lustre/tests/runregression-brw.sh +++ b/lustre/tests/runregression-brw.sh @@ -9,7 +9,7 @@ COUNT_100=`expr $COUNT / 100` ENDRUN=endrun-`hostname` -ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" +ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -n 1`" if [ -z "$ECHONAME" ]; then echo "$0: needs an ECHO_CLIENT set up first" 1>&2 diff --git a/lustre/tests/runregression-mds.sh b/lustre/tests/runregression-mds.sh index ecfe0d9..7167d2d 100755 --- a/lustre/tests/runregression-mds.sh +++ b/lustre/tests/runregression-mds.sh @@ -23,11 +23,11 @@ cleanup() { [ -z "$*" ] && fail "usage: $0 [--reformat] .xml" 1 -OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -1`" +OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" if [ -z "$OSCMT" ]; then $LCONF $@ || exit 1 trap cleanup 0 - OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -1`" + OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" [ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1 fi @@ -42,7 +42,7 @@ while [ "$1" ]; do done OSCTMP=`echo $OSCMT | tr "/" "."` -USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1` +USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` USED=`expr $USED + 16` # Some space for the status file THREADS=1 @@ -58,7 +58,7 @@ done rm -f $ENDRUN -NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1` +NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` if [ $NOWUSED -gt $USED ]; then echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 
1>&2 echo "This is normal on BA OSTs, because of subdirectories." 1>&2 diff --git a/lustre/tests/runregression-net.sh b/lustre/tests/runregression-net.sh index 73e81ca..77d6768 100644 --- a/lustre/tests/runregression-net.sh +++ b/lustre/tests/runregression-net.sh @@ -9,7 +9,7 @@ COUNT_1000=`expr $COUNT / 1000` ENDRUN=endrun-`hostname` -ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" +ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -n 1`" if [ -z "$ECHONAME" ]; then echo "$0: needs an ECHO_CLIENT set up first" 1>&2 diff --git a/lustre/tests/runtests b/lustre/tests/runtests index 6a8aac8..d97bdea 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -35,16 +35,16 @@ while [ "$1" ]; do shift done -MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -1`" +MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" if [ -z "$MOUNT" ]; then sh llmount.sh - MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -1`" + MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" [ -z "$MOUNT" ] && fail "no lustre filesystem mounted" 1 I_MOUNTED="yes" fi OSCTMP=`echo $MOUNT | tr "/" "."` -USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1` +USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` USED=`expr $USED + 16` # Some space for the status file # let's start slowly here... 
@@ -77,7 +77,7 @@ mkdir $DST || fail "can't mkdir $DST" 10 # ok, that hopefully worked, so let's do a little more, with files that # haven't changed in the last day (hopefully they don't change during test) -FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -$COUNT` +FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -n $COUNT` log "copying files from $SRC to $DST$SRC" tar cf - $FILES | tar xvf - -C $DST || fail "copying $SRC" 11 @@ -123,7 +123,7 @@ rmdir $MOUNT/base$$* || fail "mkdirmany cleanup failed" log "done" -NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1` +NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1` if [ `expr $NOWUSED - $USED` -gt 1024 ]; then echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2 fi diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 6b15107..201de89 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1192,7 +1192,7 @@ test_36d() { run_test 36d "non-root OST utime check (open, utime) ===========" test_36e() { - [ $RUNAS_ID -eq $UID ] && return + [ $RUNAS_ID -eq $UID ] && echo "skipping test 36e" && return [ ! -d $DIR/d36 ] && mkdir $DIR/d36 touch $DIR/d36/f36e $RUNAS utime $DIR/d36/f36e && error "utime worked, want failure" || true @@ -1832,7 +1832,7 @@ run_test 62 "verify obd_match failure doesn't LBUG (should -EIO)" # bug 2319 - oig_wait() interrupted causes crash because of invalid waitq. 
test_63() { - MAX_DIRTY_MB=`cat /proc/fs/lustre/osc/*/max_dirty_mb | head -1` + MAX_DIRTY_MB=`cat /proc/fs/lustre/osc/*/max_dirty_mb | head -n 1` for i in /proc/fs/lustre/osc/*/max_dirty_mb ; do echo 0 > $i done @@ -1923,6 +1923,15 @@ test_66() { } run_test 66 "update inode blocks count on client ===============" +test_67() { # bug 3285 - supplementary group fails on MDS, passes on client + [ "$RUNAS_ID" = "$UID" ] && echo "skipping test 67" && return + mkdir $DIR/d67 + chmod 771 $DIR/d67 + chgrp $RUNAS_ID $DIR/d67 + $RUNAS -g $((RUNAS_ID + 1)) -G1,2,$RUNAS_ID ls $DIR/d67 && error || true +} +run_test 67 "supplementary group failure (should return error) =" + # on the LLNL clusters, runas will still pick up root's $TMP settings, # which will not be writable for the runas user, and then you get a CVS # error message with a corrupt path string (CVS bug) and panic. diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index e884b73..0c34d6b 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -103,8 +103,8 @@ pass() { echo PASS } -export MOUNT1=`mount| awk '/ lustre/ { print $3 }'| head -1` -export MOUNT2=`mount| awk '/ lustre/ { print $3 }'| tail -1` +export MOUNT1=`mount| awk '/ lustre/ { print $3 }'| head -n 1` +export MOUNT2=`mount| awk '/ lustre/ { print $3 }'| tail -n 1` [ -z "$MOUNT1" ] && error "NAME=$NAME not mounted once" [ "$MOUNT1" = "$MOUNT2" ] && error "NAME=$NAME not mounted twice" [ `mount| awk '/ lustre/ { print $3 }'| wc -l` -ne 2 ] && \ diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 791b523..33f9786 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -27,7 +27,7 @@ usage() { init_test_env() { export LUSTRE=`absolute_path $LUSTRE` export TESTSUITE=`basename $0 .sh` - export XMLCONFIG="${TESTSUITE}.xml" + export XMLCONFIG=${XMLCONFIG:-${TESTSUITE}.xml} export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} [ -d /r ] && export ROOT=/r -- 1.8.3.1