sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
rm -rf linux-stage linux sources $(ldiskfs_SOURCES)
mkdir -p linux-stage/fs/ext3 linux-stage/include/linux
- cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3
cp $(linux_headers) linux-stage/include/linux
+if USE_QUILT
+ cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
cd linux-stage && quilt push -a -q
+else
+ @cd linux-stage && for i in $$(<../$(series)) ; do \
+ echo "patch -p1 < ../$(patches)/$$i" ; \
+ patch -p1 < ../$(patches)/$$i || exit 1 ; \
+ done
+endif
mkdir linux
@echo -n "Replacing 'ext3' with 'ldiskfs':"
@for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \
linux-stage/include/linux/ext3$$i \
> linux/ldiskfs$$i ; \
done
+ @echo
touch sources
foo-check:
[
linux25="yes"
KMODEXT=".ko"
+ enable_ldiskfs="yes"
],[
KMODEXT=".o"
linux25="no"
AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
AC_SUBST(KMODEXT)
+AC_PATH_PROG(PATCH, patch, [no])
+AC_PATH_PROG(QUILT, quilt, [no])
+# AC_PATH_PROG leaves QUILT set to "no" when quilt is not installed, so
+# USE_QUILT must be true only when QUILT is something other than "no".
+# Makefiles guarded by "if USE_QUILT" run quilt; the else branch uses patch.
+AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
+
+if test x$enable_ldiskfs$enable_modules = xyesyes ; then
+ if test x$PATCH$QUILT = xnono ; then
+ AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)])
+ fi
+fi
+
# ------- Makeflags ------------------
CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE],
[m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl
rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko
-AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
+AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
[$4],
[_AC_MSG_LOG_CONFTEST
m4_ifvaln([$5],[$5])dnl])dnl
# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
# This needs to run after we've defined the KCPPFLAGS
-AC_MSG_CHECKING([for kernel version])
+AC_MSG_CHECKING([if task_struct has a sighand field])
LUSTRE_MODULE_TRY_COMPILE(
[
#include <linux/sched.h>
p.sighand = NULL;
],[
AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches])
- AC_MSG_RESULT([redhat-2.4.20])
+ AC_MSG_RESULT([yes])
],[
- AC_MSG_RESULT([$LINUXRELEASE])
+ AC_MSG_RESULT([no])
+ ])
+
+# ---------- 2.4.20 introduced cond_resched --------------
+
+AC_MSG_CHECKING([if kernel offers cond_resched])
+LUSTRE_MODULE_TRY_COMPILE(
+ [
+ #include <linux/sched.h>
+ ],[
+ cond_resched();
+ ],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found])
+ ],[
+ AC_MSG_RESULT([no])
])
# ---------- Red Hat 2.4.21 backports some more 2.5 bits --------
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-EXTRA_DIST = archdep.m4 build.m4 include
+EXTRA_DIST = archdep.m4 build.m4
-SUBDIRS = portals libcfs knals unals router tests doc utils
+SUBDIRS = portals libcfs knals unals router tests doc utils include
stamp-h
stamp-h1
stamp-h.in
+Makefile
+Makefile.in
--- /dev/null
+SUBDIRS = linux portals
+
+EXTRA_DIST = cygwin-ioctl.h
--- /dev/null
+Makefile
+Makefile.in
--- /dev/null
+linuxdir = $(includedir)/linux
+
+EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h \
+ portals_lib.h
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
#ifndef _LIBCFS_H
-
+#define _LIBCFS_H
#define PORTAL_DEBUG
--- /dev/null
+Makefile
+Makefile.in
--- /dev/null
+portalsdir=$(includedir)/portals
+
+if UTILS
+portals_HEADERS = list.h
+endif
+
+EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h \
+ internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h \
+ list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h \
+ socknal.h stringtab.h types.h
#ifndef _P30_TYPES_H_
#define _P30_TYPES_H_
-#ifdef __linux__
-# include <asm/types.h>
-# if defined(__powerpc__) && !defined(__KERNEL__)
-# define __KERNEL__
-# include <asm/timex.h>
-# undef __KERNEL__
-# else
-# include <asm/timex.h>
-# endif
-#else
-# include <sys/types.h>
-typedef u_int32_t __u32;
-typedef u_int64_t __u64;
-#endif
+#include <asm/types.h>
#ifdef __KERNEL__
# include <linux/time.h>
+# include <asm/timex.h>
#else
# include <sys/time.h>
# define do_gettimeofday(tv) gettimeofday(tv, NULL);
+typedef unsigned long long cycles_t;
#endif
#include <portals/errno.h>
/* Don't block for transmit descriptor if we're in interrupt context */
int attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
int dest = kqswnal_nid2elanid (ktx->ktx_nid);
- long flags;
+ unsigned long flags;
int rc;
ktx->ktx_launchtime = jiffies;
void
kqswnal_rxhandler(EP_RXD *rxd)
{
- long flags;
+ unsigned long flags;
int nob = ep_rxd_len (rxd);
int status = ep_rxd_status (rxd);
kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd);
kqswnal_rx_t *krx;
kqswnal_tx_t *ktx;
kpr_fwd_desc_t *fwd;
- long flags;
+ unsigned long flags;
int rc;
int counter = 0;
int shuttingdown = 0;
{
ksock_fmb_t *fmb = (ksock_fmb_t *)arg;
ksock_fmb_pool_t *fmp = fmb->fmb_pool;
- ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page);
+ ptl_hdr_t *hdr = &fmb->fmb_hdr;
ksock_conn_t *conn = NULL;
ksock_sched_t *sched;
unsigned long flags;
noinst_LIBRARIES = libtcpnal.a
endif
-pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \
+ ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \
+ dispatch.h table.h timer.h address.c procapi.c proclib.c \
+ connection.c tcpnal.c connection.h
+
libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
libtcpnal_a_CFLAGS = $(LLCFLAGS)
noinst_LIBRARIES = libtcpnal.a
endif
-pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \
+ ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \
+ dispatch.h table.h timer.h address.c procapi.c proclib.c \
+ connection.c tcpnal.c connection.h
+
libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
libtcpnal_a_CFLAGS = $(LLCFLAGS)
libuptlctl_a_CFLAGS = $(LLCFLAGS)
endif
+if UTILS
sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
lib_LIBRARIES = libptlctl.a
+endif
acceptor_SOURCES = acceptor.c
- strip trailing '/'s before comparing paths with /proc/mounts (3486)
- remove assertions to work around "in-flight rpcs" recovery bug (3063)
- change init script to fail more clearly if not run as root (1528)
+ - allow clients to reconnect during replay (1742)
- fix ns_lock/i_sem lock ordering deadlock for kms update (3477)
- don't do DNS lookups on NIDs too small for IP addresses (3442)
+ - re-awaken ptlrpcd if new requests arrive during check_set (3554)
+ - fix cond_resched (3554)
+ - only evict unfinished clients after recovery (3515)
+ - allow bulk resend, prevent data loss (3570)
- dynamic ptlrpc request buffer allocation (2102)
- don't allow unlinking open directory if it isn't empty (2904)
- set MDS/OST threads to umask 0 to not clobber client modes (3359)
+ - remove extraneous obd dereference causing LASSERT failure (3334)
+ - don't use get_cycles() when creating temp. files on the mds (3156)
+ - hold i_sem when setting i_size in ll_extent_lock() (3564)
* miscellania
- servers can dump a log evicting a client - lustre.dump_on_timeout=1
+ - fix ksocknal_fmb_callback() error messages (2918)
2004-05-27 Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.2.2
EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif
ldapconfdir = $(sysconfdir)/openldap
ldapschemadir = $(sysconfdir)/openldap/schema
+pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)'
+
+if UTILS
ldapconf_SCRIPTS = slapd-lustre.conf
ldapschema_SCRIPTS = lustre.schema
-pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)'
pkglib_DATA = top.ldif lustre2ldif.xsl
+endif
AC_INIT
AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE(lustre, 1.2.2.3)
+AM_INIT_AUTOMAKE(lustre, 1.2.2.4)
# AM_MAINTAINER_MODE
# Four main targets: lustre kernel modules, utilities, tests, and liblustre
[use ldiskfs for the Lustre backing FS]),
[BACKINGFS='ldiskfs'],[enable_ldiskfs='no'])
AC_MSG_RESULT([$enable_ldiskfs])
-AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes)
AC_MSG_CHECKING([which backing filesystem to use])
AC_MSG_RESULT([$BACKINGFS])
sinclude(portals/build.m4)
sinclude(portals/archdep.m4)
+AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes)
+
if test x$enable_inkernel = xyes ; then
find . -name Makefile.mk | sed 's/.mk$//' | xargs -n 1 \
sh -e -x -c '(cp -f $0.mk $0.in)'
portals/Makefile
portals/autoMakefile
portals/doc/Makefile
+portals/include/Makefile
+portals/include/linux/Makefile
+portals/include/portals/Makefile
portals/knals/Makefile
portals/knals/autoMakefile
portals/knals/gmnal/Makefile
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
+linuxdir = $(includedir)/linux
+
+if UTILS
+linux_HEADERS = lustre_idl.h
+endif
+
EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \
lustre_mgmt.h obd_cache.h obd_lov.h lustre_dlm.h lustre_handles.h \
lustre_net.h obd_class.h obd_ost.h obd_support.h lustre_commit_confd.h \
#define smp_num_cpus NR_CPUS
-#ifndef conditional_schedule
-#define conditional_schedule() cond_resched()
-#endif
-
#include <linux/proc_fs.h>
#else /* 2.4.. */
return 0;
}
-#ifndef conditional_schedule
-#define conditional_schedule() if (unlikely(need_resched())) schedule()
+#ifndef HAVE_COND_RESCHED
+/* Fallback used when HAVE_COND_RESCHED is not defined: if a reschedule is
+ * pending, mark the task runnable and call schedule() to yield the CPU. */
+static inline void cond_resched(void)
+{
+ if (unlikely(need_resched())) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
+}
#endif
/* to find proc_dir_entry from inode. 2.6 has native one -bzzz */
spinlock_t exp_lock; /* protects flags int below */
/* ^ protects exp_outstanding_replies too */
int exp_flags;
- int exp_failed:1;
- int exp_libclient:1; /* liblustre client? */
+ int exp_failed:1,
+ exp_replay_needed:1,
+ exp_libclient:1; /* liblustre client? */
union {
struct mds_export_data eu_mds_data;
struct filter_export_data eu_filter_data;
void *parent_handle = oti ? oti->oti_handle : NULL;
void *handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb,
parent_handle, logs);
- CDEBUG(D_HA, "started handle %p (%p)\n", handle, parent_handle);
+ CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
if (oti != NULL) {
if (parent_handle == NULL) {
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
- CDEBUG(D_HA, "committing handle %p (async)\n", *wait_handle);
+ CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
if (time_after(jiffies, now + 15 * HZ))
CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
{
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
- CDEBUG(D_HA, "waiting for completion %p\n", handle);
+ CDEBUG(D_INFO, "waiting for completion %p\n", handle);
if (time_after(jiffies, now + 15 * HZ))
CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
return rc;
int imp_invalid:1, imp_replayable:1,
imp_dlm_fake:1, imp_server_timeout:1,
imp_initial_recov:1, imp_force_verify:1,
- imp_pingable:1;
+ imp_pingable:1, imp_resend_replay:1;
__u32 imp_connect_op;
};
static inline int ll_insecure_random_int(void)
{
-#ifdef __arch_um__
struct timeval t;
do_gettimeofday(&t);
return (int)(t.tv_usec);
-#else
- return (int)(get_cycles() >> 2);
-#endif
}
/*
/* llog_ioctl.c */
int llog_ioctl(struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data);
-int llog_catlog_list(struct obd_device *obd, int count,
- struct obd_ioctl_data *data);
+int llog_catalog_list(struct obd_device *obd, int count,
+ struct obd_ioctl_data *data);
/* llog_net.c */
int llog_initiator_connect(struct llog_ctxt *ctxt);
struct obd_uuid *cluuid);
int class_disconnect(struct obd_export *exp, int failover);
void class_disconnect_exports(struct obd_device *obddev, int failover);
+void class_disconnect_stale_exports(struct obd_device *obddev, int failover);
/* generic operations shared by various OBD types */
int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
int class_multi_cleanup(struct obd_device *obddev);
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-
+if UTILS
pkginclude_HEADERS = lustre_user.h liblustreapi.h
+endif
-EXTRA_DIST = $(pkginclude_HEADERS)
+EXTRA_DIST = lustre_user.h liblustreapi.h
--- /dev/null
+Index: linux-p4smp/arch/i386/kernel/entry.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/entry.S 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/entry.S 2004-06-14 13:14:19.000000000 -0700
+@@ -46,6 +46,7 @@
+ #include <asm/segment.h>
+ #include <asm/page.h>
+ #include <asm/smp.h>
++#include <asm/current.h>
+ #include <asm/unistd.h>
+
+ EBX = 0x00
+@@ -94,10 +95,6 @@ pt_sys_exit = 8
+
+ ENOSYS = 38
+
+-#define GET_CURRENT(reg) \
+- movl $-8192, reg; \
+- andl %esp, reg
+-
+ #if CONFIG_X86_HIGH_ENTRY
+
+ #define call_SYMBOL_NAME_ABS(X) movl $X, %ebp; call *%ebp
+@@ -193,7 +190,7 @@ ENOSYS = 38
+ GET_CURRENT(%ebx); \
+ movl real_stack(%ebx), %edx; \
+ movl %esp, %ebx; \
+- andl $0x1fff, %ebx; \
++ andl $(THREAD_SIZE-1), %ebx; \
+ orl %ebx, %edx; \
+ movl %edx, %esp;
+
+@@ -228,7 +225,7 @@ ENOSYS = 38
+ return_path_start_marker: \
+ nop; \
+ movl %esp, %ebx; \
+- andl $0x1fff, %ebx; \
++ andl $(THREAD_SIZE-1), %ebx; \
+ orl %ebx, %edx; \
+ movl %esp, %eax; \
+ movl %edx, %esp; \
+Index: linux-p4smp/arch/i386/kernel/smpboot.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/smpboot.c 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/smpboot.c 2004-06-14 13:14:19.000000000 -0700
+@@ -814,7 +814,7 @@ static void __init do_boot_cpu (int apic
+
+ /* So we see what's up */
+ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
++ stack_start.esp = (void *)idle->thread.esp;
+
+ /*
+ * This grunge runs the startup process for
+@@ -887,7 +887,7 @@ static void __init do_boot_cpu (int apic
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error= 1;
+- if (*((volatile unsigned char *)phys_to_virt(8192))
++ if (*((volatile unsigned char *)phys_to_virt(THREAD_SIZE))
+ == 0xA5)
+ /* trampoline started but...? */
+ printk("Stuck ??\n");
+@@ -910,7 +910,7 @@ static void __init do_boot_cpu (int apic
+ }
+
+ /* mark "stuck" area as not stuck */
+- *((volatile unsigned long *)phys_to_virt(8192)) = 0;
++ *((volatile unsigned long *)phys_to_virt(THREAD_SIZE)) = 0;
+
+ if(clustered_apic_mode == CLUSTERED_APIC_NUMAQ) {
+ printk("Restoring NMI vector\n");
+Index: linux-p4smp/arch/i386/kernel/traps.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/traps.c 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/traps.c 2004-06-14 13:14:19.000000000 -0700
+@@ -273,7 +273,7 @@ void show_trace_task(struct task_struct
+ unsigned long esp = tsk->thread.esp;
+
+ /* User space on another CPU? */
+- if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1))
++ if ((esp ^ (unsigned long)tsk) & ~(THREAD_SIZE - 1))
+ return;
+ show_trace((unsigned long *)esp);
+ }
+Index: linux-p4smp/arch/i386/kernel/head.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/head.S 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/head.S 2004-06-14 13:14:19.000000000 -0700
+@@ -15,6 +15,7 @@
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+ #include <asm/desc.h>
++#include <asm/current.h>
+
+ #define OLD_CL_MAGIC_ADDR 0x90020
+ #define OLD_CL_MAGIC 0xA33F
+@@ -328,7 +329,7 @@ rp_sidt:
+ ret
+
+ ENTRY(stack_start)
+- .long SYMBOL_NAME(init_task_union)+8192
++ .long SYMBOL_NAME(init_task_union)+THREAD_SIZE
+ .long __KERNEL_DS
+
+ /* This is the default interrupt "handler" :-) */
+Index: linux-p4smp/arch/i386/kernel/irq.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/irq.c 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/irq.c 2004-06-14 13:14:19.000000000 -0700
+@@ -45,6 +45,7 @@
+ #include <asm/delay.h>
+ #include <asm/desc.h>
+ #include <asm/irq.h>
++#include <asm/current.h>
+
+
+
+@@ -585,7 +586,7 @@ asmlinkage unsigned int do_IRQ(struct pt
+ long esp;
+
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+- __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191));
++ __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE-1));
+ if (unlikely(esp < (sizeof(struct task_struct) + 1024))) {
+ extern void show_stack(unsigned long *);
+
+Index: linux-p4smp/arch/i386/lib/getuser.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/lib/getuser.S 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/lib/getuser.S 2004-06-14 13:14:19.000000000 -0700
+@@ -21,6 +21,10 @@
+ * as they get called from within inline assembly.
+ */
+
++/* Duplicated from asm/processor.h */
++#include <asm/current.h>
++#include <linux/config.h>
++
+ addr_limit = 12
+
+ .text
+@@ -28,7 +32,7 @@ addr_limit = 12
+ .globl __get_user_1
+ __get_user_1:
+ movl %esp,%edx
+- andl $0xffffe000,%edx
++ andl $~(THREAD_SIZE - 1),%edx
+ cmpl addr_limit(%edx),%eax
+ jae bad_get_user
+ 1: movzbl (%eax),%edx
+@@ -41,7 +45,7 @@ __get_user_2:
+ addl $1,%eax
+ movl %esp,%edx
+ jc bad_get_user
+- andl $0xffffe000,%edx
++ andl $~(THREAD_SIZE - 1),%edx
+ cmpl addr_limit(%edx),%eax
+ jae bad_get_user
+ 2: movzwl -1(%eax),%edx
+@@ -54,7 +58,7 @@ __get_user_4:
+ addl $3,%eax
+ movl %esp,%edx
+ jc bad_get_user
+- andl $0xffffe000,%edx
++ andl $~(THREAD_SIZE - 1),%edx
+ cmpl addr_limit(%edx),%eax
+ jae bad_get_user
+ 3: movl -3(%eax),%edx
+Index: linux-p4smp/arch/i386/config.in
+===================================================================
+--- linux-p4smp.orig/arch/i386/config.in 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/config.in 2004-06-14 13:14:05.000000000 -0700
+@@ -310,6 +310,28 @@ if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86
+ define_bool CONFIG_HAVE_DEC_LOCK y
+ fi
+
++choice 'Bigger Stack Size Support' \
++ "off CONFIG_NOBIGSTACK \
++ 16KB CONFIG_STACK_SIZE_16KB \
++ 32KB CONFIG_STACK_SIZE_32KB \
++ 64KB CONFIG_STACK_SIZE_64KB" off
++
++if [ "$CONFIG_NOBIGSTACK" = "y" ]; then
++ define_int CONFIG_STACK_SIZE_SHIFT 1
++else
++ if [ "$CONFIG_STACK_SIZE_16KB" = "y" ]; then
++ define_int CONFIG_STACK_SIZE_SHIFT 2
++ else
++ if [ "$CONFIG_STACK_SIZE_32KB" = "y" ]; then
++ define_int CONFIG_STACK_SIZE_SHIFT 3
++ else
++ if [ "$CONFIG_STACK_SIZE_64KB" = "y" ]; then
++ define_int CONFIG_STACK_SIZE_SHIFT 4
++ fi
++ fi
++ fi
++fi
++
+ source drivers/perfctr/Config.in
+
+ endmenu
+Index: linux-p4smp/include/asm-i386/current.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/current.h 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/current.h 2004-06-14 13:41:19.000000000 -0700
+@@ -1,15 +1,64 @@
+ #ifndef _I386_CURRENT_H
+ #define _I386_CURRENT_H
++#include <asm/page.h>
++
++/*
++ * Configurable page sizes on i386, mainly for debugging purposes.
++ * (c) Balbir Singh
++ */
++
++/* enumerate the values, include/asm-i386/hw_irq.h in particular needs this */
++#if (PAGE_SIZE != 4096)
++#error PAGE_SIZE != 4096 unsupported
++#endif
++
++#if (CONFIG_STACK_SIZE_SHIFT == 0)
++#define THREAD_SIZE 4096
++#elif (CONFIG_STACK_SIZE_SHIFT == 1)
++#define THREAD_SIZE 8192
++#elif (CONFIG_STACK_SIZE_SHIFT == 2)
++#define THREAD_SIZE 16384
++#elif (CONFIG_STACK_SIZE_SHIFT == 3)
++#define THREAD_SIZE 32768
++#elif (CONFIG_STACK_SIZE_SHIFT == 4)
++#define THREAD_SIZE 65536
++#else
++#error CONFIG_STACK_SIZE_SHIFT > 4 unsupported
++#endif
++
++#if (CONFIG_STACK_SIZE_SHIFT != 1) && defined(CONFIG_X86_4G)
++#error Large stacks with 4G/4G split unsupported
++#endif
++
++#ifdef __ASSEMBLY__
++
++#define GET_CURRENT(reg) \
++ movl $-THREAD_SIZE, reg; \
++ andl %esp, reg
++
++#else /* __ASSEMBLY__ */
++
++#define __alloc_task_struct() \
++ ((struct task_struct *) __get_free_pages(GFP_KERNEL, CONFIG_STACK_SIZE_SHIFT))
++
++#define __free_task_struct(p) do { \
++ BUG_ON((p)->state < TASK_ZOMBIE); \
++ free_pages((unsigned long) (p), CONFIG_STACK_SIZE_SHIFT); \
++} while(0)
++
++#define INIT_TASK_SIZE THREAD_SIZE
+
+ struct task_struct;
+
+ static inline struct task_struct * get_current(void)
+ {
+ struct task_struct *current;
+- __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
++ __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~(THREAD_SIZE - 1)));
+ return current;
+ }
+
+ #define current get_current()
+
++#endif /* __ASSEMBLY__ */
++
+ #endif /* !(_I386_CURRENT_H) */
+Index: linux-p4smp/include/asm-i386/hw_irq.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/hw_irq.h 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/hw_irq.h 2004-06-14 13:14:19.000000000 -0700
+@@ -136,21 +136,17 @@ extern char _stext, _etext;
+ " \
+ /* load the real stack - keep the offset */ \
+ \
+- movl $-8192, %ebx; \
++ movl $- " STR(THREAD_SIZE) ", %ebx; \
+ andl %esp, %ebx; \
+ movl 36(%ebx), %edx; \
+ movl %esp, %ebx; \
+- andl $0x1fff, %ebx; \
++ andl $( " STR(THREAD_SIZE) "-1), %ebx; \
+ orl %ebx, %edx; \
+ movl %edx, %esp;"
+
+ #define IRQ_NAME2(nr) nr##_interrupt(void)
+ #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+-#define GET_CURRENT \
+- "movl %esp, %ebx\n\t" \
+- "andl $-8192, %ebx\n\t"
+-
+ /*
+ * SMP has a few special interrupts for IPI messages
+ */
+Index: linux-p4smp/include/asm-i386/processor.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/processor.h 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/processor.h 2004-06-14 13:14:19.000000000 -0700
+@@ -14,6 +14,7 @@
+ #include <asm/types.h>
+ #include <asm/sigcontext.h>
+ #include <asm/cpufeature.h>
++#include <asm/current.h>
+ #include <linux/cache.h>
+ #include <linux/config.h>
+ #include <linux/threads.h>
+@@ -498,10 +499,6 @@ unsigned long get_wchan(struct task_stru
+ #define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
+ #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+
+-#define THREAD_SIZE (2*PAGE_SIZE)
+-#define __alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
+-#define __free_task_struct(p) do { BUG_ON((p)->state < TASK_ZOMBIE); free_pages((unsigned long) (p), 1); } while (0)
+-
+ #define init_task (init_task_union.task)
+ #define init_stack (init_task_union.stack)
+
+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
-+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
+ de->inode = 0;
+ map++;
+ to += rec_len;
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ if (de > to)
+ memmove(to, de, rec_len);
-+ to->rec_len = rec_len;
++ to->rec_len = cpu_to_le16(rec_len);
+ prev = to;
-+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
+ }
+ de = next;
+ }
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ de = (struct ext3_dir_entry_2 *) data1;
+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
-+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
+ de->inode = 0;
+ map++;
+ to += rec_len;
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ if (de > to)
+ memmove(to, de, rec_len);
-+ to->rec_len = rec_len;
++ to->rec_len = cpu_to_le16(rec_len);
+ prev = to;
-+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
+ }
+ de = next;
+ }
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ de = (struct ext3_dir_entry_2 *) data1;
+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
-+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
+ de->inode = 0;
+ map++;
+ to += rec_len;
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ if (de > to)
+ memmove(to, de, rec_len);
-+ to->rec_len = rec_len;
++ to->rec_len = cpu_to_le16(rec_len);
+ prev = to;
-+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
+ }
+ de = next;
+ }
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ de = (struct ext3_dir_entry_2 *) data1;
+ EXT3_I(inode)->i_extra_isize = 0;
+
ei->i_state = EXT3_STATE_NEW;
- err = ext3_get_inode_loc_new(inode, &iloc, 1);
- if (err) goto fail;
+ err = ext3_get_inode_loc_new(inode, &iloc, 1);
+ if (err) goto fail;
Index: linux-2.4.21-chaos/fs/ext3/inode.c
===================================================================
--- linux-2.4.21-chaos.orig/fs/ext3/inode.c 2003-12-12 17:39:11.000000000 +0300
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
-+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
++ de = (struct ext3_dir_entry_2 *)&root->dotdot;
+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
--- /dev/null
+ fs/ext3/ialloc.c | 3
+ fs/ext3/inode.c | 3
+ fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++---------
+ fs/ext3/super.c | 14 +
+ include/linux/ext3_fs.h | 1
+ include/linux/ext3_fs_i.h | 6
+ 6 files changed, 500 insertions(+), 109 deletions(-)
+
+Index: linux-2.4.20/fs/ext3/namei.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/namei.c 2004-05-27 15:10:40.000000000 -0400
++++ linux-2.4.20/fs/ext3/namei.c 2004-05-27 15:29:52.000000000 -0400
+@@ -51,6 +51,9 @@
+ {
+ struct buffer_head *bh;
+
++ /* with parallel dir operations all appends
++ * have to be serialized -bzzz */
++ down(&EXT3_I(inode)->i_append_sem);
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+@@ -58,6 +61,8 @@
+ EXT3_I(inode)->i_disksize = inode->i_size;
+ ext3_journal_get_write_access(handle,bh);
+ }
++ up(&EXT3_I(inode)->i_append_sem);
++
+ return bh;
+ }
+
+@@ -134,6 +139,8 @@
+ struct buffer_head *bh;
+ struct dx_entry *entries;
+ struct dx_entry *at;
++ unsigned long leaf;
++ unsigned int curidx;
+ };
+
+ struct dx_map_entry
+@@ -142,6 +149,30 @@
+ u32 offs;
+ };
+
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock 25
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++#ifdef CONFIG_SMP
++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++ while (test_bit(BH_DXLock, &bh->b_state))
++ cpu_relax();
++ }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++ smp_mb__before_clear_bit();
++ clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -153,7 +184,7 @@
+ static void dx_set_limit (struct dx_entry *entries, unsigned value);
+ static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+ static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
++static struct dx_frame *dx_probe(struct qstr *name,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct dx_frame *frame,
+@@ -165,15 +196,18 @@
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames, int *err,
+ __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+- struct ext3_dir_entry_2 **res_dir, int *err);
++ struct ext3_dir_entry_2 **res_dir, int *err,
++ int rwlock, void **lock);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
++static inline void *ext3_lock_htree(struct inode *, unsigned long, int);
++static inline void ext3_unlock_htree(struct inode *, void *);
+
+ /*
+ * Future: use high four bits of block for coalesce-on-delete flags
+@@ -306,6 +340,94 @@
+ #endif /* DX_DEBUG */
+
+ /*
++ * dx_find_position
++ *
++ * binary-search the index for the last entry whose hash is <= hash;
++ * entry 0 is returned when every later entry has a larger hash
++ */
++
++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
++{
++ struct dx_entry *p, *q, *m;
++ int count;
++
++ count = dx_get_count(entries);
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++ return p - 1;
++}
++
++/*
++ * returns 1 if path is unchanged
++ */
++int dx_check_path(struct dx_frame *frame, u32 hash)
++{
++ struct dx_entry *p;
++ int ret = 1;
++
++ dx_lock_bh(frame->bh);
++ p = dx_find_position(frame->entries, hash);
++ if (frame->leaf != dx_get_block(p))
++ ret = 0;
++ dx_unlock_bh(frame->bh);
++
++ return ret;
++}
++
++/*
++ * 0 - changed
++ * 1 - hasn't changed
++ */
++static int
++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
++{
++ struct dx_entry *p;
++ struct dx_frame *frame = frames;
++ u32 leaf;
++
++ /* check first level */
++ dx_lock_bh(frame->bh);
++ p = dx_find_position(frame->entries, hinfo->hash);
++ leaf = dx_get_block(p);
++ dx_unlock_bh(frame->bh);
++
++ if (leaf != frame->leaf)
++ return 0;
++
++ /* is there 2nd level? */
++ frame++;
++ if (frame->bh == NULL)
++ return 1;
++
++ /* check second level */
++ dx_lock_bh(frame->bh);
++
++ /* probably 1st level got changed, check it */
++ if (!dx_check_path(frames, hinfo->hash)) {
++ /* path changed */
++ dx_unlock_bh(frame->bh);
++ return 0;
++ }
++
++ p = dx_find_position(frame->entries, hinfo->hash);
++ leaf = dx_get_block(p);
++ dx_unlock_bh(frame->bh);
++
++ if (leaf != frame->leaf)
++ return 0;
++
++ return 1;
++}
++
++/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+@@ -315,19 +437,20 @@
+ * back to userspace.
+ */
+ static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
++dx_probe(struct qstr *name, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+ {
+- unsigned count, indirect;
+- struct dx_entry *at, *entries, *p, *q, *m;
++ unsigned indirect;
++ struct dx_entry *at, *entries;
+ struct dx_root *root;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+ u32 hash;
++ unsigned int curidx;
+
+ frame->bh = NULL;
+- if (dentry)
+- dir = dentry->d_parent->d_inode;
++ frame[1].bh = NULL;
++
+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+ root = (struct dx_root *) bh->b_data;
+@@ -343,8 +466,8 @@
+ }
+ hinfo->hash_version = root->info.hash_version;
+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
+- if (dentry)
+- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ if (name)
++ ext3fs_dirhash(name->name, name->len, hinfo);
+ hash = hinfo->hash;
+
+ if (root->info.unused_flags & 1) {
+@@ -356,7 +479,19 @@
+ goto fail;
+ }
+
++repeat:
++ curidx = 0;
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ dx_lock_bh(bh);
++ /* indirect must be read under the bh lock: creation of the
++ * 2nd level may change it, and dx_probe() would otherwise
++ * assume the htree is still single-level -bzzz */
+ if ((indirect = root->info.indirect_levels) > 1) {
++ dx_unlock_bh(bh);
+ ext3_warning(dir->i_sb, __FUNCTION__,
+ "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+@@ -364,56 +499,46 @@
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+-
+- entries = (struct dx_entry *) (((char *)&root->info) +
+- root->info.info_length);
+- assert(dx_get_limit(entries) == dx_root_limit(dir,
+- root->info.info_length));
+- dxtrace (printk("Look up %x", hash));
++
+ while (1)
+ {
+- count = dx_get_count(entries);
+- assert (count && count <= dx_get_limit(entries));
+- p = entries + 1;
+- q = entries + count - 1;
+- while (p <= q)
+- {
+- m = p + (q - p)/2;
+- dxtrace(printk("."));
+- if (dx_get_hash(m) > hash)
+- q = m - 1;
+- else
+- p = m + 1;
+- }
+-
+- if (0) // linear search cross check
+- {
+- unsigned n = count - 1;
+- at = entries;
+- while (n--)
+- {
+- dxtrace(printk(","));
+- if (dx_get_hash(++at) > hash)
+- {
+- at--;
+- break;
+- }
+- }
+- assert (at == p - 1);
+- }
+-
+- at = p - 1;
+- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ at = dx_find_position(entries, hinfo->hash);
++ dxtrace(printk(" %x->%u\n",
++ at == entries? 0: dx_get_hash(at),
++ dx_get_block(at)));
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+- if (!indirect--) return frame;
+- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ frame->curidx = curidx;
++ frame->leaf = dx_get_block(at);
++ if (!indirect--) {
++ dx_unlock_bh(bh);
++ return frame;
++ }
++
++ /* step into next htree level */
++ curidx = dx_get_block(at);
++ dx_unlock_bh(bh);
++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
+ goto fail2;
++
++ dx_lock_bh(bh);
++ /* a concurrent split may change the root index block and
++ * move the hash we're looking for into another index block,
++ * so re-check the path and restart from the root if it
++ * changed; indirect is reloaded at repeat: -bzzz */
++ if (!dx_check_path(frame, hash)) {
++ dx_unlock_bh(bh);
++ brelse(bh); /* drop the stale block read above */
++ bh = frame->bh;
++ goto repeat;
++ }
++
+ at = entries = ((struct dx_node *) bh->b_data)->entries;
+ assert (dx_get_limit(entries) == dx_node_limit (dir));
+ frame++;
+ }
++ dx_unlock_bh(bh);
+ fail2:
+ while (frame >= frame_in) {
+ brelse(frame->bh);
+@@ -427,8 +552,7 @@
+ {
+ if (frames[0].bh == NULL)
+ return;
+-
+- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ if (frames[1].bh != NULL)
+ brelse(frames[1].bh);
+ brelse(frames[0].bh);
+ }
+@@ -470,8 +594,10 @@
+ * nodes need to be read.
+ */
+ while (1) {
+- if (++(p->at) < p->entries + dx_get_count(p->entries))
++ if (++(p->at) < p->entries + dx_get_count(p->entries)) {
++ p->leaf = dx_get_block(p->at);
+ break;
++ }
+ if (p == frames)
+ return 0;
+ num_frames++;
+@@ -497,13 +623,17 @@
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+- 0, err)))
++ u32 idx;
++
++ idx = p->leaf = dx_get_block(p->at);
++ if (!(bh = ext3_bread(NULL, dir, idx, 0, err)))
+ return -1; /* Failure */
+ p++;
+ brelse (p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ p->curidx = idx;
++ p->leaf = dx_get_block(p->at);
+ }
+ return 1;
+ }
+@@ -543,7 +673,7 @@
+ dir = dir_file->f_dentry->d_inode;
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+
+@@ -625,7 +755,8 @@
+ count++;
+ }
+ /* XXX: do we need to check rec_len == 0 case? -Chris */
+- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ de = (struct ext3_dir_entry_2 *)((char*)de +
++ le16_to_cpu(de->rec_len));
+ }
+ return count;
+ }
+@@ -658,7 +789,8 @@
+ } while(more);
+ }
+
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
++ u32 hash, u32 block, u32 idx)
+ {
+ struct dx_entry *entries = frame->entries;
+ struct dx_entry *old = frame->at, *new = old + 1;
+@@ -670,6 +802,7 @@
+ dx_set_hash(new, hash);
+ dx_set_block(new, block);
+ dx_set_count(entries, count + 1);
++
+ }
+ #endif
+
+@@ -752,7 +885,8 @@
+
+
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+- struct ext3_dir_entry_2 ** res_dir)
++ struct ext3_dir_entry_2 ** res_dir,
++ int rwlock, void **lock)
+ {
+ struct super_block * sb;
+ struct buffer_head * bh_use[NAMEI_RA_SIZE];
+@@ -768,6 +902,7 @@
+ int namelen;
+ const u8 *name;
+ unsigned blocksize;
++ int do_not_use_dx = 0;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+@@ -776,9 +911,10 @@
+ name = dentry->d_name.name;
+ if (namelen > EXT3_NAME_LEN)
+ return NULL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+ if (is_dx(dir)) {
+- bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
+ /*
+ * On success, or if the error was file not found,
+ * return. Otherwise, fall back to doing a search the
+@@ -787,8 +923,14 @@
+ if (bh || (err != ERR_BAD_DX_DIR))
+ return bh;
+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ do_not_use_dx = 1;
+ }
+ #endif
++ *lock = ext3_lock_htree(dir, 0, rwlock);
++ if (is_dx(dir) && !do_not_use_dx) {
++ ext3_unlock_htree(dir, *lock);
++ goto repeat;
++ }
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+@@ -860,12 +1002,17 @@
+ /* Clean up the read-ahead blocks */
+ for (; ra_ptr < ra_max; ra_ptr++)
+ brelse (bh_use[ra_ptr]);
++ if (!ret) {
++ ext3_unlock_htree(dir, *lock);
++ *lock = NULL;
++ }
+ return ret;
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+- struct ext3_dir_entry_2 **res_dir, int *err)
++ struct ext3_dir_entry_2 **res_dir, int *err,
++ int rwlock, void **lock)
+ {
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+@@ -880,11 +1027,22 @@
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ sb = dir->i_sb;
+- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++repeat:
++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
+ return NULL;
++
++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
++ /* the leaf we just found may have been split while we waited
++ * for its lock, so re-check the path and retry if it changed */
++ if (!dx_check_full_path(frames, &hinfo)) {
++ ext3_unlock_htree(dir, *lock);
++ dx_release(frames);
++ goto repeat;
++ }
++
+ hash = hinfo.hash;
+ do {
+- block = dx_get_block(frame->at);
++ block = frame->leaf;
+ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+ goto errout;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -918,6 +1076,8 @@
+ *err = -ENOENT;
+ errout:
+ dxtrace(printk("%s not found\n", name));
++ ext3_unlock_htree(dir, *lock);
++ *lock = NULL;
+ dx_release (frames);
+ return NULL;
+ }
+@@ -928,6 +1088,7 @@
+ struct inode * inode;
+ struct ext3_dir_entry_2 * de;
+ struct buffer_head * bh;
++ void *lock = NULL;
+
+ if (dentry->d_name.len > EXT3_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+@@ -935,10 +1096,11 @@
+ if (ext3_check_for_iopen(dir, dentry))
+ return NULL;
+
+- bh = ext3_find_entry(dentry, &de);
++ bh = ext3_find_entry(dentry, &de, 0, &lock);
+ inode = NULL;
+ if (bh) {
+ unsigned long ino = le32_to_cpu(de->inode);
++ ext3_unlock_htree(dir, lock);
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+@@ -975,7 +1137,8 @@
+ unsigned rec_len = 0;
+
+ while (count--) {
+- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ struct ext3_dir_entry_2 *de =
++ (struct ext3_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT3_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
+@@ -988,7 +1151,8 @@
+
+ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+ {
+- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ struct ext3_dir_entry_2 *next, *to, *prev;
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
+ unsigned rec_len = 0;
+
+ prev = to = de;
+@@ -1010,7 +1174,8 @@
+
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+ struct buffer_head **bh,struct dx_frame *frame,
+- struct dx_hash_info *hinfo, int *error)
++ struct dx_hash_info *hinfo, void **target,
++ int *error)
+ {
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+@@ -1057,23 +1222,30 @@
+ hash2 = map[split].hash;
+ continued = hash2 == map[split - 1].hash;
+ dxtrace(printk("Split block %i at %x, %i/%i\n",
+- dx_get_block(frame->at), hash2, split, count-split));
+-
++ frame->leaf, hash2, split, count-split));
++
+ /* Fancy dance to stay within two buffers */
+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
+ de = dx_pack_dirents(data1,blocksize);
+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
+
+ /* Which block gets the new entry? */
++ *target = NULL;
+ if (hinfo->hash >= hash2)
+ {
+ swap(*bh, bh2);
+ de = de2;
+- }
+- dx_insert_block (frame, hash2 + continued, newblock);
++
++ /* entry will be stored into new block
++ * we have to lock it before add_dirent_to_buf */
++ *target = ext3_lock_htree(dir, newblock, 1);
++ }
++ dx_lock_bh(frame->bh);
++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
++ dx_unlock_bh(frame->bh);
+ err = ext3_journal_dirty_metadata (handle, bh2);
+ if (err)
+ goto journal_error;
+@@ -1147,7 +1319,8 @@
+ nlen = EXT3_DIR_REC_LEN(de->name_len);
+ rlen = le16_to_cpu(de->rec_len);
+ if (de->inode) {
+- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ struct ext3_dir_entry_2 *de1 =
++ (struct ext3_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = cpu_to_le16(rlen - nlen);
+ de->rec_len = cpu_to_le16(nlen);
+ de = de1;
+@@ -1205,7 +1378,8 @@
+ unsigned blocksize;
+ struct dx_hash_info hinfo;
+ u32 block;
+-
++ void *lock, *new_lock = NULL; /* cleanup: tests it even if do_split bails early */
++
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+ retval = ext3_journal_get_write_access(handle, bh);
+@@ -1216,7 +1390,6 @@
+ }
+ root = (struct dx_root *) bh->b_data;
+
+- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
+ bh2 = ext3_append (handle, dir, &block, &retval);
+ if (!(bh2)) {
+ brelse(bh);
+@@ -1224,6 +1397,8 @@
+ }
+ data1 = bh2->b_data;
+
++ lock = ext3_lock_htree(dir, block, 1);
++
+ /* The 0th block becomes the root, move the dirents out */
+ de = (struct ext3_dir_entry_2 *) &root->dotdot;
+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
+@@ -1253,13 +1428,25 @@
+ frame->entries = entries;
+ frame->at = entries;
+ frame->bh = bh;
++ frame->curidx = 0;
++ frame->leaf = 0;
++ frame[1].bh = NULL;
+ bh = bh2;
+- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
+ dx_release (frames);
+ if (!(de))
+- return retval;
++ goto cleanup;
++
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++cleanup:
++ if (new_lock)
++ ext3_unlock_htree(dir, new_lock);
++ /* we mark directory indexed in order to
++ * avoid races while htree being created -bzzz */
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ ext3_unlock_htree(dir, lock);
+
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ return retval;
+ }
+ #endif
+
+@@ -1288,11 +1475,13 @@
+ unsigned blocksize;
+ unsigned nlen, rlen;
+ u32 block, blocks;
++ void *lock;
+
+ sb = dir->i_sb;
+ blocksize = sb->s_blocksize;
+ if (!dentry->d_name.len)
+ return -EINVAL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+ if (is_dx(dir)) {
+ retval = ext3_dx_add_entry(handle, dentry, inode);
+@@ -1303,36 +1492,53 @@
+ ext3_mark_inode_dirty(handle, dir);
+ }
+ #endif
++ lock = ext3_lock_htree(dir, 0, 1);
++ if (is_dx(dir)) {
++ /* we got lock for block 0
++ * probably previous holder of the lock
++ * created htree -bzzz */
++ ext3_unlock_htree(dir, lock);
++ goto repeat;
++ }
++
+ blocks = dir->i_size >> sb->s_blocksize_bits;
+ for (block = 0, offset = 0; block < blocks; block++) {
+ bh = ext3_bread(handle, dir, block, 0, &retval);
+- if(!bh)
++ if(!bh) {
++ ext3_unlock_htree(dir, lock);
+ return retval;
++ }
+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
+- if (retval != -ENOSPC)
++ if (retval != -ENOSPC) {
++ ext3_unlock_htree(dir, lock);
+ return retval;
++ }
+
+ #ifdef CONFIG_EXT3_INDEX
+ if (blocks == 1 && !dx_fallback &&
+- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+- return make_indexed_dir(handle, dentry, inode, bh);
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
++ retval = make_indexed_dir(handle, dentry, inode, bh);
++ ext3_unlock_htree(dir, lock);
++ return retval;
++ }
+ #endif
+ brelse(bh);
+ }
+ bh = ext3_append(handle, dir, &block, &retval);
+- if (!bh)
++ if (!bh) {
++ ext3_unlock_htree(dir, lock);
+ return retval;
++ }
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+ de->inode = 0;
+ de->rec_len = cpu_to_le16(rlen = blocksize);
+ nlen = 0;
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ ext3_unlock_htree(dir, lock);
++ return retval;
+ }
+
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+@@ -1344,15 +1550,28 @@
+ struct super_block * sb = dir->i_sb;
+ struct ext3_dir_entry_2 *de;
+ int err;
+-
+- frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ int curidx;
++ void *idx_lock, *leaf_lock, *newleaf_lock;
++
++repeat:
++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+- entries = frame->entries;
+- at = frame->at;
+
+- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ /* we're going to change leaf, so lock it first */
++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
++
++ /* the leaf we just found may have been split while we waited
++ * for its lock, so we need to check this */
++ if (!dx_check_full_path(frames, &hinfo)) {
++ ext3_unlock_htree(dir, leaf_lock);
++ dx_release(frames);
++ goto repeat;
++ }
++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
+ goto cleanup;
++ }
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle, bh);
+@@ -1365,6 +1584,35 @@
+ goto cleanup;
+ }
+
++ /* our leaf does not have enough space, hence we have to
++ * split it. so lock the index for this leaf first */
++ curidx = frame->curidx;
++ idx_lock = ext3_lock_htree(dir, curidx, 1);
++
++ /* now check did path get changed? */
++ dx_release(frames);
++
++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
++ &hinfo, frames, &err);
++ if (!frame) {
++ brelse(bh);
++ ext3_unlock_htree(dir, idx_lock);
++ ext3_unlock_htree(dir, leaf_lock); /* taken before the first probe check */
++ return err;
++ }
++
++ if (frame->curidx != curidx) {
++ /* path has been changed. we have to drop old lock
++ * and repeat */
++ brelse(bh);
++ ext3_unlock_htree(dir, idx_lock);
++ ext3_unlock_htree(dir, leaf_lock);
++ dx_release(frames);
++ goto repeat;
++ }
++ entries = frame->entries;
++ at = frame->at;
++
+ /* Block full, should compress but for now just split */
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+@@ -1376,7 +1624,8 @@
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+-
++ void *nb_lock;
++
+ if (levels && (dx_get_count(frames->entries) ==
+ dx_get_limit(frames->entries))) {
+ ext3_warning(sb, __FUNCTION__,
+@@ -1387,6 +1636,7 @@
+ bh2 = ext3_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
++ nb_lock = ext3_lock_htree(dir, newblock, 1);
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+@@ -1398,27 +1648,73 @@
+ if (levels) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
++ void *ri_lock;
++
++ /* we have to protect root htree index against
++ * another dx_add_entry() which would want to
++ * split it too -bzzz */
++ ri_lock = ext3_lock_htree(dir, 0, 1);
++
++ /* now that the root index block is locked we must search
++ * again for the current position of our 2nd index -bzzz */
++ dx_lock_bh(frame->bh);
++ frames->at = dx_find_position(frames->entries, hinfo.hash);
++ dx_unlock_bh(frame->bh);
++
+ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-
+- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++
++ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext3_journal_get_write_access(handle,
+ frames[0].bh);
+ if (err)
+ goto journal_error;
+-
++
++ /* copy index into new one */
+ memcpy ((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry));
+- dx_set_count (entries, icount1);
+ dx_set_count (entries2, icount2);
+ dx_set_limit (entries2, dx_node_limit(dir));
+
+ /* Which index block gets the new entry? */
+ if (at - entries >= icount1) {
++ /* unlock index we won't use */
++ ext3_unlock_htree(dir, idx_lock);
++ idx_lock = nb_lock;
+ frame->at = at = at - entries - icount1 + entries2;
+- frame->entries = entries = entries2;
++ frame->entries = entries2;
++ frame->curidx = curidx = newblock;
+ swap(frame->bh, bh2);
++ } else {
++ /* we'll use old index,so new one may be freed */
++ ext3_unlock_htree(dir, nb_lock);
+ }
+- dx_insert_block (frames + 0, hash2, newblock);
++
++ /* NOTE: very subtle piece of code
++ * competing dx_probe() may find 2nd level index in root
++ * index, then we insert new index here and set new count
++ * in that 2nd level index. so, dx_probe() may see 2nd
++ * level index w/o hash it looks for. the solution is
++ * to check root index after we locked the just-found 2nd
++ * level index -bzzz */
++ dx_lock_bh(frames[0].bh);
++ dx_insert_block (dir, frames + 0, hash2, newblock, 0);
++ dx_unlock_bh(frames[0].bh);
++
++ /* now old and new 2nd level index blocks contain
++ * all pointers, so dx_probe() may find it in the both.
++ * it's OK -bzzz */
++
++ dx_lock_bh(frame->bh);
++ dx_set_count(entries, icount1);
++ dx_unlock_bh(frame->bh);
++
++ /* now old 2nd level index block points to first half
++ * of leaves. it's important that dx_probe() checks
++ * the root index block for changes under
++ * dx_lock_bh(frame->bh) -bzzz */
++
++ ext3_unlock_htree(dir, ri_lock);
++
+ dxtrace(dx_show_index ("node", frames[1].entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+@@ -1427,38 +1723,61 @@
+ goto journal_error;
+ brelse (bh2);
+ } else {
++ unsigned long leaf = frame->leaf;
++
+ dxtrace(printk("Creating second level index...\n"));
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+ /* Set up root */
++ dx_lock_bh(frames[0].bh);
+ dx_set_count(entries, 1);
+ dx_set_block(entries + 0, newblock);
+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++ dx_unlock_bh(frames[0].bh);
+
+ /* Add new access path frame */
+ frame = frames + 1;
+ frame->at = at = at - entries + entries2;
+ frame->entries = entries = entries2;
+ frame->bh = bh2;
++ frame->curidx = newblock;
++ frame->leaf = leaf;
+ err = ext3_journal_get_write_access(handle,
+ frame->bh);
+ if (err)
+ goto journal_error;
++
++ /* first level index was root. it's already initialized */
++ /* we may unlock it now */
++ ext3_unlock_htree(dir, idx_lock);
++
++ /* current index is just created 2nd level index */
++ curidx = newblock;
++ idx_lock = nb_lock;
+ }
+ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
+ if (!de)
+ goto cleanup;
++
++ /* index has been split */
++ ext3_unlock_htree(dir, idx_lock);
++
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++
++ if (newleaf_lock)
++ ext3_unlock_htree(dir, newleaf_lock);
++
+ bh = 0;
+ goto cleanup;
+
+ journal_error:
+ ext3_std_error(dir->i_sb, err);
+ cleanup:
++ ext3_unlock_htree(dir, leaf_lock);
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
+@@ -1902,6 +2221,7 @@
+ struct buffer_head * bh;
+ struct ext3_dir_entry_2 * de;
+ handle_t *handle;
++ void *lock;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+ if (IS_ERR(handle)) {
+@@ -1909,7 +2229,7 @@
+ }
+
+ retval = -ENOENT;
+- bh = ext3_find_entry (dentry, &de);
++ bh = ext3_find_entry (dentry, &de, 1, &lock);
+ if (!bh)
+ goto end_rmdir;
+
+@@ -1920,14 +2240,19 @@
+ DQUOT_INIT(inode);
+
+ retval = -EIO;
+- if (le32_to_cpu(de->inode) != inode->i_ino)
++ if (le32_to_cpu(de->inode) != inode->i_ino) {
++ ext3_unlock_htree(dir, lock);
+ goto end_rmdir;
++ }
+
+ retval = -ENOTEMPTY;
+- if (!empty_dir (inode))
++ if (!empty_dir (inode)) {
++ ext3_unlock_htree(dir, lock);
+ goto end_rmdir;
++ }
+
+ retval = ext3_delete_entry(handle, dir, de, bh);
++ ext3_unlock_htree(dir, lock);
+ if (retval)
+ goto end_rmdir;
+ if (inode->i_nlink != 2)
+@@ -1956,6 +2281,7 @@
+ struct buffer_head * bh;
+ struct ext3_dir_entry_2 * de;
+ handle_t *handle;
++ void *lock;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+ if (IS_ERR(handle)) {
+@@ -1966,7 +2292,7 @@
+ handle->h_sync = 1;
+
+ retval = -ENOENT;
+- bh = ext3_find_entry (dentry, &de);
++ bh = ext3_find_entry (dentry, &de, 1, &lock);
+ if (!bh)
+ goto end_unlink;
+
+@@ -1974,8 +2300,10 @@
+ DQUOT_INIT(inode);
+
+ retval = -EIO;
+- if (le32_to_cpu(de->inode) != inode->i_ino)
++ if (le32_to_cpu(de->inode) != inode->i_ino) {
++ ext3_unlock_htree(dir, lock);
+ goto end_unlink;
++ }
+
+ if (!inode->i_nlink) {
+ ext3_warning (inode->i_sb, "ext3_unlink",
+@@ -1984,6 +2312,7 @@
+ inode->i_nlink = 1;
+ }
+ retval = ext3_delete_entry(handle, dir, de, bh);
++ ext3_unlock_htree(dir, lock);
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2121,6 +2450,7 @@
+ struct buffer_head * old_bh, * new_bh, * dir_bh;
+ struct ext3_dir_entry_2 * old_de, * new_de;
+ int retval;
++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
+
+ old_bh = new_bh = dir_bh = NULL;
+
+@@ -2133,7 +2463,10 @@
+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ handle->h_sync = 1;
+
+- old_bh = ext3_find_entry (old_dentry, &old_de);
++ if (old_dentry->d_parent == new_dentry->d_parent)
++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
++
++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+@@ -2146,7 +2479,7 @@
+ goto end_rename;
+
+ new_inode = new_dentry->d_inode;
+- new_bh = ext3_find_entry (new_dentry, &new_de);
++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
+ if (new_bh) {
+ if (!new_inode) {
+ brelse (new_bh);
+@@ -2213,7 +2546,7 @@
+ struct buffer_head *old_bh2;
+ struct ext3_dir_entry_2 *old_de2;
+
+- old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
+ if (old_bh2) {
+ retval = ext3_delete_entry(handle, old_dir,
+ old_de2, old_bh2);
+@@ -2256,6 +2589,14 @@
+ retval = 0;
+
+ end_rename:
++ if (lock1)
++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
++ if (lock2)
++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
++ if (lock3)
++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
++ if (old_dentry->d_parent == new_dentry->d_parent)
++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
+ brelse (dir_bh);
+ brelse (old_bh);
+ brelse (new_bh);
+@@ -2264,6 +2605,29 @@
+ }
+
+ /*
++ * these locking primitives are used to protect parts
++ * of dir's htree. protection unit is block: leaf or index
++ */
++static inline void *ext3_lock_htree(struct inode *dir,
++ unsigned long value, int rwlock)
++{
++ void *lock; /* NOTE(review): rwlock is unused; mode below is always 1 - confirm */
++
++ if (!test_opt(dir->i_sb, PDIROPS))
++ return NULL;
++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
++ return lock;
++}
++
++static inline void ext3_unlock_htree(struct inode *dir,
++ void *lock)
++{
++ if (!test_opt(dir->i_sb, PDIROPS) || !lock)
++ return;
++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
++}
++
++/*
+ * directories can handle most operations...
+ */
+ struct inode_operations ext3_dir_inode_operations = {
+Index: linux-2.4.20/fs/ext3/super.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/super.c 2004-05-27 15:10:41.000000000 -0400
++++ linux-2.4.20/fs/ext3/super.c 2004-05-27 15:10:45.000000000 -0400
+@@ -796,6 +796,8 @@
+ return 0;
+ }
+ }
++ else if (!strcmp (this_char, "pdirops"))
++ set_opt (sbi->s_mount_opt, PDIROPS);
+ else if (!strcmp (this_char, "grpid") ||
+ !strcmp (this_char, "bsdgroups"))
+ set_opt (*mount_options, GRPID);
+@@ -822,6 +824,9 @@
+ if (want_numeric(value, "sb", sb_block))
+ return 0;
+ }
++ else if (!strcmp (this_char, "pdirops")) {
++ set_opt (sbi->s_mount_opt, PDIROPS);
++ }
+ #ifdef CONFIG_JBD_DEBUG
+ else if (!strcmp (this_char, "ro-after")) {
+ unsigned long v;
+@@ -985,6 +990,10 @@
+ ext3_check_inodes_bitmap (sb);
+ }
+ #endif
++#ifdef S_PDIROPS
++ if (test_opt (sb, PDIROPS))
++ sb->s_flags |= S_PDIROPS;
++#endif
+ setup_ro_after(sb);
+ return res;
+ }
+@@ -1484,6 +1493,11 @@
+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+ "writeback");
+
++ if (test_opt(sb, PDIROPS)) {
++ printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n");
++ sb->s_flags |= S_PDIROPS;
++ }
++
+ return sb;
+
+ failed_mount3:
+Index: linux-2.4.20/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/inode.c 2004-05-27 15:10:41.000000000 -0400
++++ linux-2.4.20/fs/ext3/inode.c 2004-05-27 15:10:45.000000000 -0400
+@@ -2435,6 +2435,9 @@
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
++ dynlock_init(&EXT3_I(inode)->i_htree_lock);
++ sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++ sema_init(&EXT3_I(inode)->i_append_sem, 1);
+ } else if (S_ISLNK(inode->i_mode)) {
+ if (ext3_inode_is_fast_symlink(inode))
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+Index: linux-2.4.20/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-05-27 15:10:39.000000000 -0400
++++ linux-2.4.20/fs/ext3/ialloc.c 2004-05-27 15:10:45.000000000 -0400
+@@ -601,6 +601,9 @@
+ return ERR_PTR(-EDQUOT);
+ }
+ ext3_debug ("allocating inode %lu\n", inode->i_ino);
++ dynlock_init(&EXT3_I(inode)->i_htree_lock);
++ sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++ sema_init(&EXT3_I(inode)->i_append_sem, 1);
+ return inode;
+
+ fail:
+Index: linux-2.4.20/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-05-27 15:10:40.000000000 -0400
++++ linux-2.4.20/include/linux/ext3_fs.h 2004-05-27 15:10:45.000000000 -0400
+@@ -306,6 +306,7 @@
+ /*
+ * Mount flags
+ */
++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */
+ #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */
+Index: linux-2.4.20/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs_i.h 2001-11-22 14:46:19.000000000 -0500
++++ linux-2.4.20/include/linux/ext3_fs_i.h 2004-05-27 15:10:45.000000000 -0400
+@@ -17,6 +17,7 @@
+ #define _LINUX_EXT3_FS_I
+
+ #include <linux/rwsem.h>
++#include <linux/dynlocks.h>
+
+ /*
+ * second extended file system inode data in memory
+@@ -73,6 +74,11 @@
+ * by other means, so we have truncate_sem.
+ */
+ struct rw_semaphore truncate_sem;
++
++ /* following fields for parallel directory operations -bzzz */
++ struct dynlock i_htree_lock;
++ struct semaphore i_append_sem;
++ struct semaphore i_rename_sem;
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
--- /dev/null
+ fs/ext3/xattr.c | 12 +++++-
+ fs/ext3/xattr_trusted.c | 86 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/ext3_xattr.h | 6 +++
+ 3 files changed, 102 insertions(+), 2 deletions(-)
+
+Index: linux-p4smp/fs/ext3/Makefile
+===================================================================
+--- linux-p4smp.orig/fs/ext3/Makefile 2004-06-14 13:46:11.000000000 -0700
++++ linux-p4smp/fs/ext3/Makefile 2004-06-14 13:50:46.000000000 -0700
+@@ -12,7 +12,8 @@ O_TARGET := ext3.o
+ export-objs := ext3-exports.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \
++ xattr_trusted.o
+ obj-m := $(O_TARGET)
+
+ export-objs += xattr.o
+Index: linux-p4smp/fs/ext3/xattr.c
+===================================================================
+--- linux-p4smp.orig/fs/ext3/xattr.c 2004-06-14 13:46:44.000000000 -0700
++++ linux-p4smp/fs/ext3/xattr.c 2004-06-14 13:50:46.000000000 -0700
+@@ -1780,18 +1780,25 @@ static void ext3_xattr_rehash(struct ext
+ int __init
+ init_ext3_xattr(void)
+ {
++ int error;
++
+ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
+ sizeof(struct mb_cache_entry) +
+ sizeof(struct mb_cache_entry_index), 1, 61);
+ if (!ext3_xattr_cache)
+ return -ENOMEM;
+
+- return 0;
++ error = init_ext3_xattr_trusted();
++ if (error)
++ mb_cache_destroy(ext3_xattr_cache);
++
++ return error;
+ }
+
+ void
+ exit_ext3_xattr(void)
+ {
++ exit_ext3_xattr_trusted();
+ if (ext3_xattr_cache)
+ mb_cache_destroy(ext3_xattr_cache);
+ ext3_xattr_cache = NULL;
+@@ -1802,12 +1809,13 @@ exit_ext3_xattr(void)
+ int __init
+ init_ext3_xattr(void)
+ {
+- return 0;
++ return init_ext3_xattr_trusted();
+ }
+
+ void
+ exit_ext3_xattr(void)
+ {
++ exit_ext3_xattr_trusted();
+ }
+
+ #endif /* CONFIG_EXT3_FS_XATTR_SHARING */
+Index: linux-p4smp/fs/ext3/xattr_trusted.c
+===================================================================
+--- linux-p4smp.orig/fs/ext3/xattr_trusted.c 2004-06-14 13:41:58.000000000 -0700
++++ linux-p4smp/fs/ext3/xattr_trusted.c 2004-06-14 13:50:46.000000000 -0700
+@@ -0,0 +1,86 @@
++/*
++ * linux/fs/ext3/xattr_trusted.c
++ * Handler for trusted extended attributes.
++ *
++ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#define XATTR_TRUSTED_PREFIX "trusted."
++
++static size_t
++ext3_xattr_trusted_list(char *list, struct inode *inode,
++ const char *name, int name_len)
++{
++ const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
++
++ if (!capable(CAP_SYS_ADMIN))
++ return 0;
++
++ if (list) {
++ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
++ memcpy(list+prefix_len, name, name_len);
++ list[prefix_len + name_len] = '\0';
++ }
++ return prefix_len + name_len + 1;
++}
++
++static int
++ext3_xattr_trusted_get(struct inode *inode, const char *name,
++ void *buffer, size_t size)
++{
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
++ buffer, size);
++}
++
++static int
++ext3_xattr_trusted_set(struct inode *inode, const char *name,
++ const void *value, size_t size, int flags)
++{
++ handle_t *handle;
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!capable(CAP_SYS_ADMIN))
++ return -EPERM;
++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_TRUSTED, name,
++ value, size, flags);
++ ext3_journal_stop(handle, inode);
++
++ return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_trusted_handler = {
++ .prefix = XATTR_TRUSTED_PREFIX,
++ .list = ext3_xattr_trusted_list,
++ .get = ext3_xattr_trusted_get,
++ .set = ext3_xattr_trusted_set,
++};
++
++int __init
++init_ext3_xattr_trusted(void)
++{
++ return ext3_xattr_register(EXT3_XATTR_INDEX_TRUSTED,
++ &ext3_xattr_trusted_handler);
++}
++
++void
++exit_ext3_xattr_trusted(void)
++{
++ ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED,
++ &ext3_xattr_trusted_handler);
++}
+Index: linux-p4smp/include/linux/ext3_xattr.h
+===================================================================
+--- linux-p4smp.orig/include/linux/ext3_xattr.h 2004-06-14 13:41:58.000000000 -0700
++++ linux-p4smp/include/linux/ext3_xattr.h 2004-06-14 13:50:46.000000000 -0700
+@@ -93,6 +93,9 @@ extern void ext3_xattr_put_super(struct
+ extern int init_ext3_xattr(void) __init;
+ extern void exit_ext3_xattr(void);
+
++extern int init_ext3_xattr_trusted(void) __init;
++extern void exit_ext3_xattr_trusted(void);
++
+ # else /* CONFIG_EXT3_FS_XATTR */
+ # define ext3_setxattr NULL
+ # define ext3_getxattr NULL
if(ext3_get_inode_loc(inode, &iloc))
goto bad_inode;
bh = iloc.bh;
-Index: linux-2.4.19.SuSE/fs/ext3/iopen.c
+Index: lum/fs/ext3/iopen.c
===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003
-+++ linux-2.4.19.SuSE/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003
-@@ -0,0 +1,258 @@
+--- lum.orig/fs/ext3/iopen.c 2004-03-09 16:46:37.000000000 -0700
++++ lum/fs/ext3/iopen.c 2004-03-09 16:48:03.000000000 -0700
+@@ -0,0 +1,282 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+
+/* This function is spliced into ext3_lookup and does the move of a
+ * disconnected dentry (if it exists) to a connected dentry.
-+ * Caller must hold dcache_lock.
+ */
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++ int rehash)
+{
+ struct dentry *tmp, *goal = NULL;
+ struct list_head *lp;
+
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ if (rehash)
++ assert(list_empty(&dentry->d_hash)); /* d_rehash */
++ assert(list_empty(&dentry->d_subdirs));
++
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_rehash;
++
+ /* preferably return a connected dentry */
+ list_for_each(lp, &inode->i_dentry) {
+ tmp = list_entry(lp, struct dentry, d_alias);
+ }
+
+ if (!goal)
-+ return NULL;
++ goto do_instantiate;
+
+ /* Move the goal to the de hash queue - like d_move() */
+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
+ list_del_init(&goal->d_hash);
+
+ list_del(&goal->d_child);
-+ list_del(&de->d_child);
++ list_del(&dentry->d_child);
+
+ /* Switch the parents and the names.. */
-+ switch_names(goal, de);
-+ do_switch(goal->d_parent, de->d_parent);
-+ do_switch(goal->d_name.len, de->d_name.len);
-+ do_switch(goal->d_name.hash, de->d_name.hash);
++ switch_names(goal, dentry);
++ do_switch(goal->d_parent, dentry->d_parent);
++ do_switch(goal->d_name.len, dentry->d_name.len);
++ do_switch(goal->d_name.hash, dentry->d_name.hash);
+
+ /* And add them back to the (new) parent lists */
+ list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+ list_add(&de->d_child, &de->d_parent->d_subdirs);
++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ __d_rehash(goal, 0);
++ spin_unlock(&dcache_lock);
++ iput(inode);
+
+ return goal;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++do_rehash:
++ if (rehash)
++ __d_rehash(dentry, 0); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
+}
+
+/*
+
+ return 1;
+}
-Index: linux-2.4.19.SuSE/fs/ext3/iopen.h
+Index: lum/fs/ext3/iopen.h
===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003
-+++ linux-2.4.19.SuSE/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003
+--- lum.orig/fs/ext3/iopen.h 2004-03-09 16:46:37.000000000 -0700
++++ lum/fs/ext3/iopen.h 2004-03-09 16:48:03.000000000 -0700
@@ -0,0 +1,15 @@
+/*
+ * iopen.h
+
+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
+extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *de,
-+ struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++ struct inode *inode, int rehash);
Index: linux-2.4.19.SuSE/fs/ext3/namei.c
===================================================================
--- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003
/*
* define how far ahead to read directories while searching them.
-@@ -922,10 +922,14 @@
- struct inode * inode;
- struct ext3_dir_entry_2 * de;
- struct buffer_head * bh;
-+ struct dentry *alternate = NULL;
-
+@@ -926,6 +927,9 @@
if (dentry->d_name.len > EXT3_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
bh = ext3_find_entry(dentry, &de);
inode = NULL;
if (bh) {
-@@ -943,7 +948,28 @@
+@@ -943,8 +948,8 @@
return ERR_PTR(-EACCES);
}
}
- d_add(dentry, inode);
+- return NULL;
+
-+ /* verify this dentry is really new */
-+ assert(!dentry->d_inode);
-+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
-+ assert(list_empty(&dentry->d_subdirs));
-+
-+ spin_lock(&dcache_lock);
-+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+ spin_unlock(&dcache_lock);
-+ iput(inode);
-+ return alternate;
++ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
+ #define S_SHIFT 12
+@@ -1932,10 +1935,6 @@
+ inode->i_nlink);
+ inode->i_version = ++event;
+ inode->i_nlink = 0;
+- /* There's no need to set i_disksize: the fact that i_nlink is
+- * zero will ensure that the right thing happens during any
+- * recovery. */
+- inode->i_size = 0;
+ ext3_orphan_add(handle, inode);
+ dir->i_nlink--;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2086,6 +2085,23 @@
+ return err;
+ }
+
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ int err = ext3_add_entry(handle, dentry, inode);
++ if (!err) {
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ dput(iopen_connect_dentry(dentry, inode, 0));
++ return 0;
++ }
+ }
++ ext3_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
+
-+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+ if (inode) /* d_instantiate */
-+ list_add(&dentry->d_alias, &inode->i_dentry);
-+ dentry->d_inode = inode;
-+
-+ __d_rehash(dentry, 0); /* d_rehash */
-+ spin_unlock(&dcache_lock);
-+
- return NULL;
- }
+ static int ext3_link (struct dentry * old_dentry,
+ struct inode * dir, struct dentry *dentry)
+ {
+@@ -2113,7 +2129,8 @@
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+- err = ext3_add_nondir(handle, dentry, inode);
++ err = ext3_add_link(handle, dentry, inode);
++ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle, dir);
+ return err;
+ }
Index: linux-2.4.19.SuSE/fs/ext3/super.c
===================================================================
--- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003
===================================================================
--- linux-ia64.orig/fs/ext3/iopen.c 2004-03-17 18:02:08.000000000 -0800
+++ linux-ia64/fs/ext3/iopen.c 2004-03-17 18:10:58.000000000 -0800
-@@ -8,3 +8,275 @@
- * This file may be redistributed under the terms of the GNU General
- * Public License.
- *
+@@ -0,0 +1,282 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
+ *
+ * Invariants:
+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
+ if (!err) {
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err == 0) {
-+ (void)iopen_connect_dentry(dentry, inode, 0);
++ dput(iopen_connect_dentry(dentry, inode, 0));
+ return 0;
+ }
+ }
Index: linux-2.6.5-12.1/fs/exec.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/exec.c 2004-05-25 17:32:14.038494200 +0300
+--- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/exec.c 2004-06-03 18:31:28.000000000 -0400
@@ -125,9 +125,10 @@
struct nameidata nd;
int error;
if (err) {
Index: linux-2.6.5-12.1/fs/namei.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/namei.c 2004-05-25 17:32:14.040493896 +0300
+--- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/namei.c 2004-06-03 18:42:17.000000000 -0400
@@ -270,8 +270,19 @@
return 0;
}
dput(next.dentry);
mntput(next.mnt);
if (err)
-@@ -703,14 +749,29 @@
+@@ -703,14 +749,24 @@
inode = nd->dentry->d_inode;
/* fallthrough */
case 1:
+ nd->flags |= LOOKUP_LAST;
+ err = revalidate_special(nd);
+ nd->flags &= ~LOOKUP_LAST;
++ if (!nd->dentry->d_inode)
++ err = -ENOENT;
+ if (err)
-+ break;
++ goto return_err;
goto return_reval;
}
-+
-+ if (err) {
-+ if (!nd->dentry->d_inode)
-+ err = -ENOENT;
-+
-+ goto return_err;
-+ }
-+
++
if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
err = nd->dentry->d_op->d_hash(nd->dentry, &this);
if (err < 0)
if (err)
break;
follow_mount(&next.mnt, &next.dentry);
-@@ -936,7 +997,7 @@
+@@ -936,7 +992,7 @@
}
/* SMP-safe */
{
unsigned long hash;
struct qstr this;
-@@ -956,11 +1017,16 @@
+@@ -956,11 +1012,16 @@
}
this.hash = end_name_hash(hash);
/*
* namei()
*
-@@ -972,7 +1038,8 @@
+@@ -972,7 +1033,8 @@
* that namei follows links, while lnamei does not.
* SMP-safe
*/
{
char *tmp = getname(name);
int err = PTR_ERR(tmp);
-@@ -987,6 +1054,13 @@
+@@ -987,6 +1049,13 @@
return err;
}
-+int __user_walk(const char __user *name, unsigned flags,
-+ struct nameidata *nd, const char **pname)
++int fastcall __user_walk(const char __user *name, unsigned flags,
++ struct nameidata *nd, const char **pname)
+{
+ intent_init(&nd->intent, IT_LOOKUP);
+ return __user_walk_it(name, flags, nd, pname);
/*
* It's inline, so penalty for filesystems that don't use sticky bit is
* minimal.
-@@ -1259,8 +1333,8 @@
+@@ -1259,8 +1328,8 @@
acc_mode |= MAY_APPEND;
/* Fill in the open() intent data */
/*
* The simplest case - just a plain lookup.
-@@ -1275,6 +1349,7 @@
+@@ -1275,6 +1344,7 @@
/*
* Create - we need to know the parent.
*/
error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
if (error)
return error;
-@@ -1291,7 +1366,9 @@
+@@ -1291,7 +1361,9 @@
dir = nd->dentry;
nd->flags &= ~LOOKUP_PARENT;
down(&dir->d_inode->i_sem);
do_last:
error = PTR_ERR(dentry);
-@@ -1396,7 +1473,9 @@
+@@ -1396,7 +1468,9 @@
}
dir = nd->dentry;
down(&dir->d_inode->i_sem);
putname(nd->last.name);
goto do_last;
}
-@@ -2196,7 +2275,9 @@
+@@ -2196,7 +2270,9 @@
__vfs_follow_link(struct nameidata *nd, const char *link)
{
int res = 0;
if (IS_ERR(link))
goto fail;
-@@ -2206,6 +2287,10 @@
+@@ -2206,6 +2282,10 @@
/* weird __emul_prefix() stuff did it */
goto out;
}
if (current->link_count || res || nd->last_type!=LAST_NORM)
Index: linux-2.6.5-12.1/fs/namespace.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/namespace.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/namespace.c 2004-05-25 17:33:44.385759328 +0300
+--- linux-2.6.5-12.1.orig/fs/namespace.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/namespace.c 2004-06-03 18:31:28.000000000 -0400
@@ -108,6 +108,7 @@
static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
flags &= ~MS_MGC_MSK;
Index: linux-2.6.5-12.1/fs/open.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/open.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/open.c 2004-05-25 17:32:14.042493592 +0300
+--- linux-2.6.5-12.1.orig/fs/open.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/open.c 2004-06-03 18:31:28.000000000 -0400
@@ -227,12 +227,12 @@
struct nameidata nd;
struct inode * inode;
*/
Index: linux-2.6.5-12.1/fs/stat.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/stat.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/stat.c 2004-05-25 17:32:14.042493592 +0300
+--- linux-2.6.5-12.1.orig/fs/stat.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/stat.c 2004-06-03 18:31:28.000000000 -0400
@@ -37,7 +37,7 @@
EXPORT_SYMBOL(generic_fillattr);
Index: linux-2.6.5-12.1/fs/nfs/dir.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 19:21:53.000000000 +0300
-+++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-05-25 17:32:14.043493440 +0300
+--- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400
++++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-06-03 18:31:28.000000000 -0400
@@ -709,7 +709,7 @@
return 0;
if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
* The 0 argument passed into the create function should one day
Index: linux-2.6.5-12.1/fs/inode.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/inode.c 2004-05-25 17:32:14.044493288 +0300
+--- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/inode.c 2004-06-03 18:31:28.000000000 -0400
@@ -221,6 +221,7 @@
inodes_stat.nr_unused--;
}
* @inode: inode to clear
Index: linux-2.6.5-12.1/fs/super.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/super.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/super.c 2004-05-25 17:32:14.045493136 +0300
+--- linux-2.6.5-12.1.orig/fs/super.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/super.c 2004-06-03 18:31:28.000000000 -0400
@@ -789,6 +789,8 @@
return (struct vfsmount *)sb;
}
struct vfsmount *kern_mount(struct file_system_type *type)
{
return do_kern_mount(type->name, 0, type->name, NULL);
+Index: linux-2.6.5-12.1/fs/block_dev.c
+===================================================================
+--- linux-2.6.5-12.1.orig/fs/block_dev.c 2004-05-10 12:21:55.000000000 -0400
++++ linux-2.6.5-12.1/fs/block_dev.c 2004-06-03 18:31:28.000000000 -0400
+@@ -834,6 +834,7 @@
+ if (!path || !*path)
+ return ERR_PTR(-EINVAL);
+
++ intent_init(&nd.intent, IT_LOOKUP);
+ error = path_lookup(path, LOOKUP_FOLLOW, &nd);
+ if (error)
+ return ERR_PTR(error);
Index: linux-2.6.5-12.1/include/linux/dcache.h
===================================================================
---- linux-2.6.5-12.1.orig/include/linux/dcache.h 2004-04-04 06:38:24.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/dcache.h 2004-05-25 17:32:14.045493136 +0300
+--- linux-2.6.5-12.1.orig/include/linux/dcache.h 2004-04-03 22:38:24.000000000 -0500
++++ linux-2.6.5-12.1/include/linux/dcache.h 2004-06-03 18:31:28.000000000 -0400
@@ -4,6 +4,7 @@
#ifdef __KERNEL__
int nr_unused;
Index: linux-2.6.5-12.1/include/linux/fs.h
===================================================================
---- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/fs.h 2004-05-25 17:32:14.046492984 +0300
+--- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/fs.h 2004-06-03 18:31:28.000000000 -0400
@@ -250,6 +250,8 @@
#define ATTR_ATTR_FLAG 1024
#define ATTR_KILL_SUID 2048
Index: linux-2.6.5-12.1/include/linux/namei.h
===================================================================
---- linux-2.6.5-12.1.orig/include/linux/namei.h 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/namei.h 2004-05-25 17:32:14.047492832 +0300
+--- linux-2.6.5-12.1.orig/include/linux/namei.h 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/namei.h 2004-06-03 18:31:28.000000000 -0400
@@ -2,25 +2,55 @@
#define _LINUX_NAMEI_H
extern int follow_down(struct vfsmount **, struct dentry **);
extern int follow_up(struct vfsmount **, struct dentry **);
-Index: linux-2.6.5-12.1/kernel/exit.c
-===================================================================
---- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/kernel/exit.c 2004-05-25 17:32:14.047492832 +0300
-@@ -260,6 +260,8 @@
- write_unlock_irq(&tasklist_lock);
- }
-
-+EXPORT_SYMBOL(reparent_to_init);
-+
- void __set_special_pids(pid_t session, pid_t pgrp)
- {
- struct task_struct *curr = current;
-@@ -429,6 +431,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
Index: linux-2.6.5-12.1/include/linux/fshooks.h
===================================================================
---- linux-2.6.5-12.1.orig/include/linux/fshooks.h 2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/fshooks.h 2004-05-25 17:32:14.048492680 +0300
+--- linux-2.6.5-12.1.orig/include/linux/fshooks.h 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/fshooks.h 2004-06-03 18:31:28.000000000 -0400
@@ -90,12 +90,18 @@
#define FSHOOK_BEGIN_USER_WALK(type, err, path, flags, nd, field, args...) \
#define FSHOOK_END_USER_WALK(type, err, field) ((void)0);}
-Index: linux-2.6.5-12.1/fs/block_dev.c
+Index: linux-2.6.5-12.1/kernel/exit.c
===================================================================
---- linux-2.6.5-12.1.orig/fs/block_dev.c 2004-05-10 19:21:55.000000000 +0300
-+++ linux-2.6.5-12.1/fs/block_dev.c 2004-05-25 17:32:39.517620784 +0300
-@@ -834,6 +834,7 @@
- if (!path || !*path)
- return ERR_PTR(-EINVAL);
+--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400
+@@ -260,6 +260,8 @@
+ write_unlock_irq(&tasklist_lock);
+ }
-+ intent_init(&nd.intent, IT_LOOKUP);
- error = path_lookup(path, LOOKUP_FOLLOW, &nd);
- if (error)
- return ERR_PTR(error);
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+ struct task_struct *curr = current;
+@@ -429,6 +431,8 @@
+ __exit_files(tsk);
+ }
+
++EXPORT_SYMBOL(exit_files);
++
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+ /* No need to hold fs->lock if we are killing it */
+configurable-x86-stack-2.4.21-chaos.patch
dev_read_only_2.4.21-chaos.patch
exports_2.4.19-suse.patch
lustre_version.patch
ext3-raw-lookup.patch
nfs_export_kernel-2.4.21-chaos.patch
ext3-ea-in-inode-2.4.21-chaos.patch
+ext3-trusted_ea-2.4.21-chaos.patch
listman-2.4.21-chaos.patch
gfp_memalloc-2.4.21-chaos.patch
ext3-xattr-ptr-arith-fix.patch
pagecache-lock-2.4.21-chaos.patch
ext3-truncate-buffer-head.patch
inode-max-readahead-2.4.24.patch
+dcache_refcount_debug.patch
ext3-no-write-super-chaos.patch
dynamic-locks-2.4.20-rh.patch
vfs-pdirops-2.4.20-rh.patch
-ext3-pdirops-2.4.20-chaos.patch
+ext3-pdirops-2.4.20-rh.patch
tcp_zero_copy_2.4.20_chaos.patch
gpl_header-chaos-2.4.20.patch
add_page_private.patch
ext-2.4-patch-2.patch
ext-2.4-patch-3.patch
ext-2.4-patch-4.patch
-linux-2.4.20-xattr-0.8.54-hp.patch
linux-2.4.19-xattr-0.8.54-suse.patch
ext3-2.4-ino_t.patch
ext3-largefile.patch
ext3-xattr-ptr-arith-fix.patch
gfp_memalloc-2.4.22.patch
procfs-ndynamic-2.4.patch
-linux-2.4.20-tmpfs-xattr.patch
-linux-2.4.20-tmpfs-iopen.patch
linux-2.4.20-filemap.patch
ext3-truncate-buffer-head.patch
-KERNEL=linux-2.4.20-28.9.tar.gz
+KERNEL=linux-2.4.20-31.9.tar.gz
SERIES=rh-2.4.20
VERSION=2.4.20
-EXTRA_VERSION=28.9_lustre
+EXTRA_VERSION=31.9_lustre.1.2.2
RHBUILD=1
BASE_ARCHS="i586"
SMP_ARCHS="i586"
UP_ARCHS=""
SRC_ARCHS="i586"
+
+# the modules in this kernel do not build with gcc 3
+for cc in i386-redhat-linux-gcc-2.96 gcc296 gcc ; do
+ if which $cc >/dev/null 2>/dev/null ; then
+ CC=$cc
+ break
+ fi
+done
sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
rm -rf linux-stage linux sources $(ldiskfs_SOURCES)
mkdir -p linux-stage/fs/ext3 linux-stage/include/linux
- cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3
cp $(linux_headers) linux-stage/include/linux
+if USE_QUILT
+ cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
cd linux-stage && quilt push -a -q
+else
+ @cd linux-stage && for i in $$(<../$(series)) ; do \
+ echo "patch -p1 < ../$(patches)/$$i" ; \
+ patch -p1 < ../$(patches)/$$i || exit 1 ; \
+ done
+endif
mkdir linux
@echo -n "Replacing 'ext3' with 'ldiskfs':"
@for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \
linux-stage/include/linux/ext3$$i \
> linux/ldiskfs$$i ; \
done
+ @echo
touch sources
foo-check:
if (rc && rc != EALREADY)
GOTO(out, rc);
- /* XXX track this all the time? */
- if (target->obd_recovering) {
- target->obd_connected_clients++;
- }
-
req->rq_repmsg->handle = conn;
/* If the client and the server are the same node, we will already
GOTO(out, rc = 0);
}
+ if (target->obd_recovering)
+ target->obd_connected_clients++;
+
memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn),
sizeof conn);
* Recovery functions
*/
-static void abort_delayed_replies(struct obd_device *obd)
+static void target_finish_recovery(struct obd_device *obd)
{
- struct ptlrpc_request *req;
struct list_head *tmp, *n;
+ int rc;
+
+ CWARN("%s: sending delayed replies to recovered clients\n",
+ obd->obd_name);
+
+ ldlm_reprocess_all_ns(obd->obd_namespace);
+
+ /* when recovery finished, cleanup orphans on mds and ost */
+ if (OBT(obd) && OBP(obd, postrecov)) {
+ rc = OBP(obd, postrecov)(obd);
+ if (rc >= 0)
+ CWARN("%s: all clients recovered, %d MDS "
+ "orphans deleted\n", obd->obd_name, rc);
+ else
+ CERROR("postrecov failed %d\n", rc);
+ }
+
list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+ struct ptlrpc_request *req;
req = list_entry(tmp, struct ptlrpc_request, rq_list);
- DEBUG_REQ(D_ERROR, req, "aborted:");
- req->rq_status = -ENOTCONN;
- req->rq_type = PTL_RPC_MSG_ERR;
+ DEBUG_REQ(D_ERROR, req, "delayed:");
ptlrpc_reply(req);
class_export_put(req->rq_export);
list_del(&req->rq_list);
OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
OBD_FREE(req, sizeof *req);
}
+ return;
}
static void abort_recovery_queue(struct obd_device *obd)
void target_abort_recovery(void *data)
{
struct obd_device *obd = data;
- int rc;
- CERROR("disconnecting clients and aborting recovery\n");
spin_lock_bh(&obd->obd_processing_task_lock);
if (!obd->obd_recovering) {
spin_unlock_bh(&obd->obd_processing_task_lock);
EXIT;
return;
}
-
obd->obd_recovering = obd->obd_abort_recovery = 0;
-
- wake_up(&obd->obd_next_transno_waitq);
target_cancel_recovery_timer(obd);
spin_unlock_bh(&obd->obd_processing_task_lock);
- class_disconnect_exports(obd, 0);
+ CERROR("%s: recovery period over; disconnecting unfinished clients.\n",
+ obd->obd_name);
+ class_disconnect_stale_exports(obd, 0);
+ abort_recovery_queue(obd);
- /* when recovery was aborted, cleanup orphans on mds and ost */
- if (OBT(obd) && OBP(obd, postrecov)) {
- rc = OBP(obd, postrecov)(obd);
- if (rc >= 0)
- CWARN("Cleanup %d orphans after recovery was aborted\n", rc);
- else
- CERROR("postrecov failed %d\n", rc);
- }
+ target_finish_recovery(obd);
- abort_delayed_replies(obd);
- abort_recovery_queue(obd);
ptlrpc_run_recovery_over_upcall(obd);
}
struct obd_device *obd = (struct obd_device *)castmeharder;
CERROR("recovery timed out, aborting\n");
spin_lock_bh(&obd->obd_processing_task_lock);
- obd->obd_abort_recovery = 1;
+ if (obd->obd_recovering)
+ obd->obd_abort_recovery = 1;
wake_up(&obd->obd_next_transno_waitq);
spin_unlock_bh(&obd->obd_processing_task_lock);
}
queue_len = obd->obd_requests_queued_for_recovery;
next_transno = obd->obd_next_recovery_transno;
+ CDEBUG(D_HA,"max: %d, connected: %d, completed: %d, queue_len: %d, "
+ "req_transno: "LPU64", next_transno: "LPU64"\n",
+ max, connected, completed, queue_len, req_transno, next_transno);
if (obd->obd_abort_recovery) {
CDEBUG(D_HA, "waking for aborted recovery\n");
wake_up = 1;
* Also, if this request has a transno less than the one we're waiting
* for, we should process it now. It could (and currently always will)
* be an open request for a descriptor that was opened some time ago.
+ *
+ * Also, a resent, replayed request that has already been
+ * handled will pass through here and be processed immediately.
*/
if (obd->obd_processing_task == current->pid ||
transno < obd->obd_next_recovery_transno) {
return 1;
}
+ /* A resent, replayed request that is still on the queue; just drop it.
+ The queued request will handle this. */
+ if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) ==
+ (MSG_RESENT | MSG_REPLAY)) {
+ DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+ spin_unlock_bh(&obd->obd_processing_task_lock);
+ OBD_FREE(reqmsg, req->rq_reqlen);
+ OBD_FREE(saved_req, sizeof *saved_req);
+ return 0;
+ }
+
memcpy(saved_req, req, sizeof *req);
memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
req = saved_req;
struct ptlrpc_request *saved_req;
struct lustre_msg *reqmsg;
int recovery_done = 0;
- int rc2;
LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
spin_lock_bh(&obd->obd_processing_task_lock);
- --obd->obd_recoverable_clients;
+ /* only count the first "replay over" request from each
+ export */
+ if (req->rq_export->exp_replay_needed) {
+ --obd->obd_recoverable_clients;
+ req->rq_export->exp_replay_needed = 0;
+ }
recovery_done = (obd->obd_recoverable_clients == 0);
spin_unlock_bh(&obd->obd_processing_task_lock);
if (recovery_done) {
- struct list_head *tmp, *n;
- ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
- CWARN("%s: all clients recovered, sending delayed replies\n",
- obd->obd_name);
spin_lock_bh(&obd->obd_processing_task_lock);
- obd->obd_recovering = 0;
+ obd->obd_recovering = obd->obd_abort_recovery = 0;
target_cancel_recovery_timer(obd);
spin_unlock_bh(&obd->obd_processing_task_lock);
- /* when recovery finished, cleanup orphans on mds and ost */
- if (OBT(obd) && OBP(obd, postrecov)) {
- rc2 = OBP(obd, postrecov)(obd);
- if (rc2 >= 0)
- CWARN("%s: all clients recovered, %d MDS "
- "orphans deleted\n", obd->obd_name, rc2);
- else
- CERROR("postrecov failed %d\n", rc2);
- }
-
- list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
- req = list_entry(tmp, struct ptlrpc_request, rq_list);
- DEBUG_REQ(D_ERROR, req, "delayed:");
- ptlrpc_reply(req);
- class_export_put(req->rq_export);
- list_del(&req->rq_list);
- OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
- OBD_FREE(req, sizeof *req);
- }
+ target_finish_recovery(obd);
ptlrpc_run_recovery_over_upcall(obd);
} else {
CWARN("%s: %d recoverable clients remain\n",
rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy,
LCK_PR, &flags, llu_extent_lock_callback,
ldlm_completion_ast, llu_glimpse_callback, inode,
- sizeof(*lvb), lustre_swab_ost_lvb, &lockh);
+ sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
if (rc > 0)
RETURN(-EIO);
break;
}
- conditional_schedule();
+ cond_resched();
page = find_get_page(inode->i_mapping, i);
if (page == NULL)
rc = -EIO;
if (policy->l_extent.start == 0 &&
- policy->l_extent.end == OBD_OBJECT_EOF)
+ policy->l_extent.end == OBD_OBJECT_EOF) {
+ /* vmtruncate()->ll_truncate() first sets the i_size and then
+ * the kms under both a DLM lock and the i_sem. If we don't
+ * get the i_sem here we can match the DLM lock and reset
+ * i_size from the kms before the truncating path has updated
+ * the kms. generic_file_write can then trust the stale i_size
+ * when doing appending writes and effectively cancel the
+ * result of the truncate. Getting the i_sem after the enqueue
+ * maintains the DLM -> i_sem acquisition order. */
+ down(&inode->i_sem);
inode->i_size = lov_merge_size(lsm, 1);
+ up(&inode->i_sem);
+ }
//inode->i_mtime = lov_merge_mtime(lsm, inode->i_mtime);
static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
int activate)
{
- struct obd_device *obd;
struct lov_tgt_desc *tgt;
int i, rc = 0;
ENTRY;
if (i == lov->desc.ld_tgt_count)
GOTO(out, rc = -EINVAL);
- obd = class_exp2obd(tgt->ltd_exp);
- if (obd == NULL) {
- /* This can happen if OST failure races with node shutdown */
- GOTO(out, rc = -ENOTCONN);
- }
-
- CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n",
- obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
- obd->obd_type->typ_name, i);
- LASSERT(strcmp(obd->obd_type->typ_name, "osc") == 0);
-
if (tgt->active == activate) {
- CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+ CDEBUG(D_INFO, "OSC %s already %sactive!\n", uuid->uuid,
activate ? "" : "in");
GOTO(out, rc);
}
- CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
+ CDEBUG(D_INFO, "Marking OSC %s %sactive\n", uuid->uuid,
+ activate ? "" : "in");
tgt->active = activate;
if (activate)
if (tmp > lock->l_policy_data.l_extent.end)
tmp = lock->l_policy_data.l_extent.end + 1;
if (tmp >= loi->loi_kms) {
- CDEBUG(D_INODE, "lock acquired, setting rss="
+ CDEBUG(D_DLMTRACE, "lock acquired, setting rss="
LPU64", kms="LPU64"\n", loi->loi_rss,
tmp);
loi->loi_kms = tmp;
loi->loi_kms_valid = 1;
} else {
- CDEBUG(D_INODE, "lock acquired, setting rss="
+ CDEBUG(D_DLMTRACE, "lock acquired, setting rss="
LPU64"; leaving kms="LPU64", end="LPU64
"\n", loi->loi_rss, loi->loi_kms,
lock->l_policy_data.l_extent.end);
memset(lov_lockhp, 0, sizeof(*lov_lockhp));
loi->loi_rss = submd->lsm_oinfo->loi_rss;
loi->loi_blocks = submd->lsm_oinfo->loi_blocks;
- CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
- " kms="LPU64"\n", loi->loi_rss, loi->loi_kms);
+ CDEBUG(D_DLMTRACE, "glimpsed, setting rss="LPU64
+ "; leaving kms="LPU64"\n", loi->loi_rss,
+ loi->loi_kms);
} else {
memset(lov_lockhp, 0, sizeof(*lov_lockhp));
if (lov->tgts[loi->loi_ost_idx].active) {
return 0;
}
-static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
+static int mds_init_server_data(struct obd_device *obd, struct file *file)
{
struct mds_obd *mds = &obd->u.mds;
struct mds_server_data *msd;
spin_lock_init(&med->med_open_lock);
mcd = NULL;
+ exp->exp_replay_needed = 1;
obd->obd_recoverable_clients++;
obd->obd_max_recoverable_clients++;
class_export_put(exp);
mds->mds_last_transno = last_transno;
}
+ if (mcd)
+ OBD_FREE(mcd, sizeof(*mcd));
+
obd->obd_last_committed = mds->mds_last_transno;
+
if (obd->obd_recoverable_clients) {
CWARN("RECOVERY: service %s, %d recoverable clients, "
"last_transno "LPU64"\n", obd->obd_name,
obd->obd_recovering = 1;
}
- if (mcd)
- OBD_FREE(mcd, sizeof(*mcd));
-
mds->mds_mount_count = mount_count + 1;
msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
/* save it, so mount count and last_transno is current */
rc = mds_update_server_data(obd, 1);
+ if (rc)
+ GOTO(err_client, rc);
- RETURN(rc);
+ RETURN(0);
err_client:
class_disconnect_exports(obd, 0);
GOTO(err_last_rcvd, rc = -ENOENT);
}
- rc = mds_read_last_rcvd(obd, file);
+ rc = mds_init_server_data(obd, file);
if (rc) {
CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc);
GOTO(err_last_rcvd, rc);
ENTRY;
push_ctxt(&saved, &exp->exp_obd->obd_ctxt, NULL);
-
- sprintf(fidname, "OBJECTS/%u", tmpname);
+
+ sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid);
filp = filp_open(fidname, O_CREAT | O_EXCL, 0644);
if (IS_ERR(filp)) {
rc = PTR_ERR(filp);
case OBD_IOC_CATLOGLIST: {
int count = mds->mds_lov_desc.ld_tgt_count;
- rc = llog_catlog_list(obd, count, data);
+ rc = llog_catalog_list(obd, count, data);
RETURN(rc);
}
EXPORT_SYMBOL(class_conn2cliimp);
EXPORT_SYMBOL(class_disconnect);
EXPORT_SYMBOL(class_disconnect_exports);
+EXPORT_SYMBOL(class_disconnect_stale_exports);
EXPORT_SYMBOL(oig_init);
EXPORT_SYMBOL(oig_release);
RETURN(0);
}
-void class_disconnect_exports(struct obd_device *obd, int flags)
+static void class_disconnect_export_list(struct list_head *list, int flags)
{
int rc;
- struct list_head *tmp, *n, work_list;
struct lustre_handle fake_conn;
struct obd_export *fake_exp, *exp;
ENTRY;
- /* Move all of the exports from obd_exports to a work list, en masse. */
- spin_lock(&obd->obd_dev_lock);
- list_add(&work_list, &obd->obd_exports);
- list_del_init(&obd->obd_exports);
- spin_unlock(&obd->obd_dev_lock);
-
- CDEBUG(D_HA, "OBD device %d (%p) has exports, "
- "disconnecting them\n", obd->obd_minor, obd);
- list_for_each_safe(tmp, n, &work_list) {
- exp = list_entry(tmp, struct obd_export, exp_obd_chain);
+ /* It's possible that an export may disconnect itself, but
+ * nothing else will be added to this list. */
+ while(!list_empty(list)) {
+ exp = list_entry(list->next, struct obd_export, exp_obd_chain);
class_export_get(exp);
if (obd_uuid_equals(&exp->exp_client_uuid,
EXIT;
}
+/* Disconnect every export currently linked on obd->obd_exports.
+ *
+ * The exports are detached onto a local work list while obd_dev_lock is
+ * held, and the actual disconnects are done by
+ * class_disconnect_export_list() after the lock has been dropped.
+ * 'flags' is passed through to class_disconnect_export_list() unchanged. */
+void class_disconnect_exports(struct obd_device *obd, int flags)
+{
+ struct list_head work_list;
+ ENTRY;
+
+ /* Move all of the exports from obd_exports to a work list, en masse. */
+ spin_lock(&obd->obd_dev_lock);
+ /* Link work_list into the chain next to the obd_exports head, then
+ * unlink the head and re-initialise it as an empty list: work_list is
+ * left heading the former entries. work_list needs no prior
+ * INIT_LIST_HEAD because list_add() sets both of its pointers. */
+ list_add(&work_list, &obd->obd_exports);
+ list_del_init(&obd->obd_exports);
+ spin_unlock(&obd->obd_dev_lock);
+
+ CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+ "disconnecting them\n", obd->obd_minor, obd);
+ class_disconnect_export_list(&work_list, flags);
+ EXIT;
+}
+
+/* Remove exports that have not completed recovery.
+ *
+ * Every export on obd->obd_exports whose exp_replay_needed flag is still
+ * set is moved to a local work list while obd_dev_lock is held; exports
+ * with the flag clear are left where they are. The collected exports are
+ * then handed to class_disconnect_export_list() together with 'flags'
+ * once the lock has been released.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd, int flags)
+{
+ struct list_head work_list;
+ struct list_head *pos, *n;
+ struct obd_export *exp;
+ int cnt = 0;
+ ENTRY;
+
+ INIT_LIST_HEAD(&work_list);
+ spin_lock(&obd->obd_dev_lock);
+ /* The _safe iterator is needed because the current entry is unlinked
+ * from obd_exports inside the loop body. */
+ list_for_each_safe(pos, n, &obd->obd_exports) {
+ exp = list_entry(pos, struct obd_export, exp_obd_chain);
+ if (exp->exp_replay_needed) {
+ list_del(&exp->exp_obd_chain);
+ list_add(&exp->exp_obd_chain, &work_list);
+ cnt++;
+ }
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* cnt is the number of exports moved to the work list above. */
+ CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
+ obd->obd_name, cnt);
+ class_disconnect_export_list(&work_list, flags);
+ EXIT;
+}
+
int oig_init(struct obd_io_group **oig_out)
{
struct obd_io_group *oig;
}
EXPORT_SYMBOL(llog_ioctl);
-int llog_catlog_list(struct obd_device *obd, int count,
+int llog_catalog_list(struct obd_device *obd, int count,
struct obd_ioctl_data *data)
{
int size, i;
RETURN(0);
}
-EXPORT_SYMBOL(llog_catlog_list);
+EXPORT_SYMBOL(llog_catalog_list);
spin_lock_init(&fed->fed_lock);
fcd = NULL;
+ exp->exp_replay_needed = 1;
obd->obd_recoverable_clients++;
class_export_put(exp);
}
+ if (fcd)
+ OBD_FREE(fcd, sizeof(*fcd));
+
obd->obd_last_committed = le64_to_cpu(fsd->fsd_last_transno);
if (obd->obd_recoverable_clients) {
obd->obd_recovering = 1;
}
- if (fcd)
- OBD_FREE(fcd, sizeof(*fcd));
-
out:
filter->fo_mount_count = mount_count + 1;
fsd->fsd_mount_count = cpu_to_le64(filter->fo_mount_count);
/* save it, so mount count and last_transno is current */
rc = filter_update_server_data(obd, filp, filter->fo_fsd, 1);
+ if (rc)
+ GOTO(err_client, rc);
- RETURN(rc);
+ RETURN(0);
err_client:
class_disconnect_exports(obd, 0);
}
case OBD_IOC_CATLOGLIST: {
- rc = llog_catlog_list(obd, 1, data);
+ rc = llog_catalog_list(obd, 1, data);
RETURN(rc);
}
lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)));
osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
spin_lock_irqsave(&req->rq_lock, flags);
- req->rq_no_resend = 1;
spin_unlock_irqrestore(&req->rq_lock, flags);
/* size[0] still sizeof (*body) */
rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
page_count, pga, &requested_nob, &niocount,
&request);
- /* NB ^ sets rq_no_resend */
-
if (rc != 0)
return (rc);
struct brw_page *pga = aa->aa_pga;
ENTRY;
- /* XXX bug 937 here */
- if (rc == -ETIMEDOUT && request->rq_resend) {
- DEBUG_REQ(D_HA, request, "BULK TIMEOUT");
- LBUG(); /* re-send. later. */
- //goto restart_bulk;
- }
-
rc = osc_brw_fini_request(request, oa, requested_nob, niocount,
page_count, pga, rc);
RETURN (rc);
rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
page_count, pga, &requested_nob, &nio_count,
&request);
- /* NB ^ sets rq_no_resend */
if (rc == 0) {
LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
[
linux25="yes"
KMODEXT=".ko"
+ enable_ldiskfs="yes"
],[
KMODEXT=".o"
linux25="no"
AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
AC_SUBST(KMODEXT)
+# Locate the tools able to apply the ldiskfs patch series. Each variable
+# is set to "no" when the program is not found in PATH.
+AC_PATH_PROG(PATCH, patch, [no])
+AC_PATH_PROG(QUILT, quilt, [no])
+# USE_QUILT must be true only when quilt was actually found, so that the
+# Makefile runs quilt when it exists and falls back to plain patch when it
+# does not. The previous test was inverted.
+AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
+
+# The ldiskfs module sources are generated by patching ext3, so at least
+# one of the two tools is required when ldiskfs modules are being built.
+if test x$enable_ldiskfs$enable_modules = xyesyes ; then
+ if test x$PATCH$QUILT = xnono ; then
+ AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)])
+ fi
+fi
+
# ------- Makeflags ------------------
CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE],
[m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl
rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko
-AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
+AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
[$4],
[_AC_MSG_LOG_CONFTEST
m4_ifvaln([$5],[$5])dnl])dnl
# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
# This needs to run after we've defined the KCPPFLAGS
-AC_MSG_CHECKING([for kernel version])
+AC_MSG_CHECKING([if task_struct has a sighand field])
LUSTRE_MODULE_TRY_COMPILE(
[
#include <linux/sched.h>
p.sighand = NULL;
],[
AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches])
- AC_MSG_RESULT([redhat-2.4.20])
+ AC_MSG_RESULT([yes])
],[
- AC_MSG_RESULT([$LINUXRELEASE])
+ AC_MSG_RESULT([no])
+ ])
+
+# ---------- 2.4.20 introduced cond_resched --------------
+
+AC_MSG_CHECKING([if kernel offers cond_resched])
+LUSTRE_MODULE_TRY_COMPILE(
+ [
+ #include <linux/sched.h>
+ ],[
+ cond_resched();
+ ],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found])
+ ],[
+ AC_MSG_RESULT([no])
])
# ---------- Red Hat 2.4.21 backports some more 2.5 bits --------
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-EXTRA_DIST = archdep.m4 build.m4 include
+EXTRA_DIST = archdep.m4 build.m4
-SUBDIRS = portals libcfs knals unals router tests doc utils
+SUBDIRS = portals libcfs knals unals router tests doc utils include
stamp-h
stamp-h1
stamp-h.in
+Makefile
+Makefile.in
--- /dev/null
+SUBDIRS = linux portals
+
+EXTRA_DIST = cygwin-ioctl.h
--- /dev/null
+Makefile
+Makefile.in
--- /dev/null
+linuxdir = $(includedir)/linux
+
+EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h \
+ portals_lib.h
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
#ifndef _LIBCFS_H
-
+#define _LIBCFS_H
#define PORTAL_DEBUG
--- /dev/null
+Makefile
+Makefile.in
--- /dev/null
+portalsdir=$(includedir)/portals
+
+if UTILS
+portals_HEADERS = list.h
+endif
+
+EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h \
+ internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h \
+ list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h \
+ socknal.h stringtab.h types.h
#ifndef _P30_TYPES_H_
#define _P30_TYPES_H_
-#ifdef __linux__
-# include <asm/types.h>
-# if defined(__powerpc__) && !defined(__KERNEL__)
-# define __KERNEL__
-# include <asm/timex.h>
-# undef __KERNEL__
-# else
-# include <asm/timex.h>
-# endif
-#else
-# include <sys/types.h>
-typedef u_int32_t __u32;
-typedef u_int64_t __u64;
-#endif
+#include <asm/types.h>
#ifdef __KERNEL__
# include <linux/time.h>
+# include <asm/timex.h>
#else
# include <sys/time.h>
# define do_gettimeofday(tv) gettimeofday(tv, NULL);
+typedef unsigned long long cycles_t;
#endif
#include <portals/errno.h>
/* Don't block for transmit descriptor if we're in interrupt context */
int attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
int dest = kqswnal_nid2elanid (ktx->ktx_nid);
- long flags;
+ unsigned long flags;
int rc;
ktx->ktx_launchtime = jiffies;
void
kqswnal_rxhandler(EP_RXD *rxd)
{
- long flags;
+ unsigned long flags;
int nob = ep_rxd_len (rxd);
int status = ep_rxd_status (rxd);
kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd);
kqswnal_rx_t *krx;
kqswnal_tx_t *ktx;
kpr_fwd_desc_t *fwd;
- long flags;
+ unsigned long flags;
int rc;
int counter = 0;
int shuttingdown = 0;
{
ksock_fmb_t *fmb = (ksock_fmb_t *)arg;
ksock_fmb_pool_t *fmp = fmb->fmb_pool;
- ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page);
+ ptl_hdr_t *hdr = &fmb->fmb_hdr;
ksock_conn_t *conn = NULL;
ksock_sched_t *sched;
unsigned long flags;
noinst_LIBRARIES = libtcpnal.a
endif
-pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \
+ ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \
+ dispatch.h table.h timer.h address.c procapi.c proclib.c \
+ connection.c tcpnal.c connection.h
+
libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
libtcpnal_a_CFLAGS = $(LLCFLAGS)
libuptlctl_a_CFLAGS = $(LLCFLAGS)
endif
+if UTILS
sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
lib_LIBRARIES = libptlctl.a
+endif
acceptor_SOURCES = acceptor.c
LASSERT_SPIN_LOCKED(&imp->imp_lock);
+ /* clear this for new requests that were resent as well
+ as resent replayed requests. */
+ lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
/* don't re-add requests that have been replayed */
if (!list_empty(&req->rq_replay_list))
return;
- lustre_msg_add_flags(req->rq_reqmsg,
- MSG_REPLAY);
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
LASSERT(imp->imp_replayable);
/* Balanced in ptlrpc_free_committed, usually. */
aa->praa_old_state = req->rq_send_state;
req->rq_send_state = LUSTRE_IMP_REPLAY;
req->rq_phase = RQ_PHASE_NEW;
- /*
- * Q: "How can a req get on the replay list if it wasn't replied?"
- * A: "If we failed during the replay of this request, it will still
- * be on the list, but rq_replied will have been reset to 0."
- */
- if (req->rq_replied) {
- aa->praa_old_status = req->rq_repmsg->status;
- req->rq_status = 0;
- req->rq_replied = 0;
- }
+ aa->praa_old_status = req->rq_repmsg->status;
+ req->rq_status = 0;
req->rq_interpret_reply = ptlrpc_replay_interpret;
atomic_inc(&req->rq_import->imp_replay_inflight);
spin_lock_irqsave(&imp->imp_lock, flags);
if (imp->imp_state == LUSTRE_IMP_FULL) {
+ CERROR("%s: connection lost to %s@%s\n",
+ imp->imp_obd->obd_name,
+ imp->imp_target_uuid.uuid,
+ imp->imp_connection->c_remote_uuid.uuid);
IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
spin_unlock_irqrestore(&imp->imp_lock, flags);
obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
imp->imp_conn_cnt++;
- imp->imp_last_replay_transno = 0;
+ imp->imp_resend_replay = 0;
if (imp->imp_remote_handle.cookie == 0) {
initial_connect = 1;
request->rq_repmsg->handle.cookie);
imp->imp_remote_handle = request->rq_repmsg->handle;
} else {
- CERROR("reconnected to %s@%s after partition\n",
+ CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
imp->imp_target_uuid.uuid,
imp->imp_connection->c_remote_uuid.uuid);
}
- if (imp->imp_invalid)
+ if (imp->imp_invalid) {
IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
- else
+ } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+ CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+ imp->imp_obd->obd_name,
+ imp->imp_target_uuid.uuid);
+ imp->imp_resend_replay = 1;
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+ } else {
IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+ }
}
else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
LASSERT(imp->imp_replayable);
imp->imp_remote_handle = request->rq_repmsg->handle;
+ imp->imp_last_replay_transno = 0;
IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
}
else {
if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
ptlrpc_deactivate_import(imp);
}
- CDEBUG(D_ERROR, "recovery of %s on %s failed (%d)\n",
+ CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
imp->imp_target_uuid.uuid,
(char *)imp->imp_connection->c_remote_uuid.uuid, rc);
}
void * data, int rc)
{
atomic_dec(&req->rq_import->imp_replay_inflight);
- ptlrpc_import_recovery_state_machine(req->rq_import);
+ if (req->rq_status == 0) {
+ ptlrpc_import_recovery_state_machine(req->rq_import);
+ } else {
+ CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+ "reconnecting\n",
+ req->rq_import->imp_obd->obd_name, req->rq_status);
+ ptlrpc_connect_import(req->rq_import, NULL);
+ }
+
RETURN(0);
}
GOTO(out, rc);
IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
ptlrpc_activate_import(imp);
+ CERROR("%s: connection restored to %s@%s\n",
+ imp->imp_obd->obd_name,
+ imp->imp_target_uuid.uuid,
+ imp->imp_connection->c_remote_uuid.uuid);
}
if (imp->imp_state == LUSTRE_IMP_FULL) {
#include <linux/obd.h>
#include "ptlrpc_internal.h"
-static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
+static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
struct ptlrpc_connection *conn, int portal, __u64 xid)
{
ptl_process_id_t remote_id;
int rc;
- int rc2;
ptl_md_t md;
char str[PTL_NALFMT_SIZE];
ENTRY;
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
len, portal, xid);
- rc2 = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
+ rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
if (rc != PTL_OK) {
+ int rc2;
/* We're going to get an UNLINK event when I unlink below,
* which will complete just like any other failed send, so
* I fall through and return success here! */
CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
remote_id.nid, portal, xid, rc);
rc2 = PtlMDUnlink(*mdh);
- LASSERT (rc2 == PTL_OK);
+ LASSERTF(rc2 == PTL_OK, "rc2 = %d\n", rc2);
}
RETURN (0);
#include <linux/kp30.h>
#include <linux/lustre_net.h>
-#ifndef __CYGWIN__
+#ifdef __KERNEL__
# include <linux/ctype.h>
# include <linux/init.h>
#else
}
}
+ if (rc == 0) {
+ /* If new requests have been added, make sure to wake up */
+ spin_lock_irqsave(&pc->pc_set->set_new_req_lock, flags);
+ rc = !list_empty(&pc->pc_set->set_new_requests);
+ spin_unlock_irqrestore(&pc->pc_set->set_new_req_lock, flags);
+ }
+
RETURN(rc);
}
LASSERT (obd_lustre_upcall != NULL);
if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
- CDEBUG(D_ERROR, "%s: starting recovery without upcall\n",
+ CDEBUG(D_HA, "%s: starting recovery without upcall\n",
imp->imp_target_uuid.uuid);
ptlrpc_connect_import(imp, NULL);
}
else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
- CDEBUG(D_ERROR, "%s: recovery diabled\n",
+ CDEBUG(D_HA, "%s: recovery disabled\n",
imp->imp_target_uuid.uuid);
}
else {
- CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n",
+ CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
imp->imp_target_uuid.uuid);
ptlrpc_run_failed_import_upcall(imp);
}
{
int rc = 0;
struct list_head *tmp, *pos;
- struct ptlrpc_request *req;
+ struct ptlrpc_request *req = NULL;
unsigned long flags;
__u64 last_transno;
ENTRY;
*/
list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+
+ /* If need to resend the last sent transno (because a
+ reconnect has occurred), then stop on the matching
+ req and send it again. If, however, the last sent
+ transno has been committed then we continue replay
+ from the next request. */
+ if (imp->imp_resend_replay &&
+ req->rq_transno == last_transno) {
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+ break;
+ }
+
if (req->rq_transno > last_transno) {
- rc = ptlrpc_replay_req(req);
- if (rc) {
- CERROR("recovery replay error %d for req "
- LPD64"\n", rc, req->rq_xid);
- RETURN(rc);
- }
- *inflight = 1;
+ imp->imp_last_replay_transno = req->rq_transno;
break;
}
+
+ req = NULL;
+ }
+
+ imp->imp_resend_replay = 0;
+
+ if (req != NULL) {
+ rc = ptlrpc_replay_req(req);
+ if (rc) {
+ CERROR("recovery replay error %d for req "
+ LPD64"\n", rc, req->rq_xid);
+ RETURN(rc);
+ }
+ *inflight = 1;
}
RETURN(rc);
}
if (rc)
RETURN(rc);
- CDEBUG(D_ERROR, "%s: recovery started, waiting\n",
+ CDEBUG(D_HA, "%s: recovery started, waiting\n",
imp->imp_target_uuid.uuid);
lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
!ptlrpc_import_in_recovery(imp), &lwi);
- CDEBUG(D_ERROR, "%s: recovery finished\n",
+ CDEBUG(D_HA, "%s: recovery finished\n",
imp->imp_target_uuid.uuid);
RETURN(rc);
# See the file COPYING in this distribution
EXTRA_DIST = license-status maketags.sh lustre.spec version_tag.pl.in \
- $(initd_SCRIPTS) lustre.spec.in lustre-kernel-2.4.spec.in \
+ lustre lustre.spec.in lustre-kernel-2.4.spec.in \
lmake linux-merge-config.awk linux-merge-modules.awk \
linux-rhconfig.h
initddir = $(sysconfdir)/init.d
+if UTILS
initd_SCRIPTS = lustre
+endif
CONFIG=
VERSION=
+RHBUILD=0
+LINUX26=0
+SUSEBUILD=0
+
BASE_ARCH=
BIGMEM_ARCHS=
BOOT_ARCHS=
CONFIG_FILE="$TOPDIR/lustre/kernel_patches/kernel_configs/$CONFIG"
[ -r "$CONFIG_FILE" ] || \
- fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/configs."
+ fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/."
if [ "$EXTRA_VERSION_save" ] ; then
EXTRA_VERSION="$EXTRA_VERSION_save"
elif ! (( $RELEASE )) ; then
- EXTRA_VERSION="${EXTRA_VERSION}-${TAG//_/}.${TIMESTAMP}"
+ EXTRA_VERSION="${EXTRA_VERSION}-${TAG}.${TIMESTAMP}"
fi
# EXTRA_VERSION=${EXTRA_VERSION//-/_}
BUILD_ARCHS=
for arch in $(uniqify "$ALL_ARCHS") ; do
- if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep -s "$arch" ; then
+ if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep "$arch" >/dev/null 2>/dev/null ; then
BUILD_ARCHS="$BUILD_ARCHS $arch"
fi
done
popd >/dev/null
echo "Full patch has been saved in ${FULL_PATCH##*/}."
echo "Replacing .config files..."
- [ -d linux/configs ] || mkdir linux/configs
+ [ -d linux/configs ] || mkdir linux/configs || \
+ fatal 1 "Error creating configs directory."
rm -f linux/configs/*
- cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/
+ cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/ || \
+ fatal 1 "Error copying in kernel configs."
}
pack_linux()
-e "s/@SMP_ARCHS@/$SMP_ARCHS/g" \
-e "s/@UP_ARCHS@/$UP_ARCHS/g" \
-e "s/@RHBUILD@/$RHBUILD/g" \
+ -e "s/@LINUX26@/$LINUX26/g" \
+ -e "s/@SUSEBUILD@/$SUSEBUILD/g" \
< $TOPDIR/lustre/scripts/lustre-kernel-2.4.spec.in \
> lustre-kernel-2.4.spec
[ -d SRPMS ] || mkdir SRPMS
--kerneldir $RPM_SOURCE_DIR \
-j $RPM_BUILD_NCPUS \
--destdir $RPM_BUILD_ROOT \
- -- @CONFIGURE_FLAGS@
+ -- --enable-modules \
+ --disable-doc --disable-tests \
+ --disable-utils --disable-liblustre \
+ @CONFIGURE_FLAGS@
}
BuildLustre()
--kerneldir $RPM_SOURCE_DIR \
-j $RPM_BUILD_NCPUS \
--destdir $RPM_BUILD_ROOT \
- -- @CONFIGURE_FLAGS@
+ -- --enable-utils \
+ --disable-doc --disable-tests \
+ --disable-modules --disable-liblustre \
+ @CONFIGURE_FLAGS@
}
SaveHeaders()
BuildKernel smp
%endif
-# we want this one last, so that it is the one populating /usr/bin
-%if %{buildup} && %{buildbase}
+%if %{buildup}
BuildKernel
-%elseif %{buildbase}
-BuildLustre
%endif
%if %{buildbase}
+BuildLustre
SaveHeaders
%endif
HEADER_FILE=../../savedheaders/%{_target_cpu}/up/version.h
else
# test build not including uniprocessor, must get info from somewhere
- HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -1)
+ HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -n 1)
fi
grep -v UTS_RELEASE $HEADER_FILE >> version.h
rm -rf ../../savedheaders
} ; popd
touch $RPM_BUILD_ROOT/boot/kernel.h-%{kversion}
-rm -f $RPM_BUILD_ROOT/usr/include/linux
+# rm -f $RPM_BUILD_ROOT/usr/include/linux
rm -rf $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/savedheaders
/usr/bin/*
/usr/lib/lustre/python
/etc/init.d/lustre
-/usr/include/lustre
+/usr/include/lustre/*
+/usr/include/portals/*
+/usr/include/linux/*
/lib/lib*.a
#%files -n lustre-doc
%endif
%build
+# if RPM_BUILD_NCPUS unset, set it
+# The value is the number of per-CPU lines in /proc/stat, clamped to 1..8.
+if [ -z "$RPM_BUILD_NCPUS" ] ; then
+ # egrep -c exits non-zero when nothing matches; the trailing "|| :"
+ # discards that status (presumably so it cannot abort a script run
+ # with -e -- TODO confirm).
+ RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat || :)
+ # egrep prints nothing at all when /proc/stat cannot be read. Treat an
+ # empty count as zero so the test below stays well-formed and the
+ # later "make -j" never receives an empty job count.
+ if [ "${RPM_BUILD_NCPUS:-0}" -eq 0 ] ; then
+ RPM_BUILD_NCPUS=1
+ fi
+ if [ "$RPM_BUILD_NCPUS" -gt 8 ] ; then
+ RPM_BUILD_NCPUS=8
+ fi
+fi
+
rm -rf $RPM_BUILD_ROOT
# Set an explicit path to our Linux tree, if we can.
cd $RPM_BUILD_DIR/lustre-%{version}
./configure --with-linux='%{linuxdir}' %{disable_doc} --disable-liblustre
-make
+make -j $RPM_BUILD_NCPUS -s
%install
cd $RPM_BUILD_DIR/lustre-%{version}
ostactive
ll_dirstripe_verify
rename_many
+openfilleddirunlink
AM_CFLAGS = $(LLCFLAGS)
# LDADD = -lldap
# LDADD := -lreadline -ltermcap # -lefence
-EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \
- sanity.sh rundbench
-if TESTS
-pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
-pkgexample_SCRIPTS += local.sh echo.sh uml.sh lov.sh
+
+pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
+pkgexample_scripts += local.sh echo.sh uml.sh lov.sh
noinst_DATA =
noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh
noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net
noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests
noinst_SCRIPTS += sanity.sh rundbench
+
+EXTRA_DIST = $(pkgexample_scripts) $(noinst_SCRIPTS) $(noinst_DATA) \
+ sanity.sh rundbench
+if TESTS
+pkgexample_SCRIPTS = $(pkgexample_scripts)
noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay
noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime cmknod
client_HOST=client
LIVE_CLIENT=${LIVE_CLIENT:-mdev6}
# This should always be a list, not a regexp
-#FAIL_CLIENTS=${FAIL_CLIENTS:-mdev7}
-FAIL_CLIENTS=${FAIL_CLIENTS:-""}
+FAIL_CLIENTS=${FAIL_CLIENTS:-mdev8}
+#FAIL_CLIENTS=${FAIL_CLIENTS:-""}
NETTYPE=${NETTYPE:-tcp}
TIMEOUT=${TIMEOUT:-30}
-PTLDEBUG=${PTLDEBUG:-0}
-SUBSYSTEM=${SUBSYSTEM:-0}
+PTLDEBUG=${PTLDEBUG:-0x3f0400}
+SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
MOUNT=${MOUNT:-"/mnt/lustre"}
UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh}
MDSDEV=${MDSDEV:-/dev/sda1}
MDSSIZE=${MDSSIZE:-50000}
+MDSJOURNALSIZE=${MDSJOURNALSIZE:-0}
OSTDEV=${OSTDEV:-$TMP/ost%d-`hostname`}
-OSTSIZE=${OSTSIZE:=50000}
+OSTSIZE=${OSTSIZE:=500000}
+OSTJOURNALSIZE=${OSTJOURNALSIZE:-0}
+
FSTYPE=${FSTYPE:-ext3}
STRIPE_BYTES=${STRIPE_BYTES:-1048576}
STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
OSTSIZE=${OSTSIZE:-50000}
FSTYPE=${FSTYPE:-ext3}
TIMEOUT=${TIMEOUT:-20}
-UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
+UPCALL=${UPCALL:-DEFAULT}
STRIPE_BYTES=${STRIPE_BYTES:-65536}
STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
stop_mds || return 2
stop_ost || return 3
- lsmod | grep -q portals && return 3
+ lsmod | grep -q portals && return 4
return 0
}
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && exit 1
+ llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
stop_mds || return 2
stop_ost || return 3
- lsmod | grep -q portals && return 3
+ lsmod | grep -q portals && return 4
return 0
}
run_test 5c "cleanup after failed mount (bug 2712)"
+# test_5d: mount a client while the OST is down.
+# Return codes: 1 = llmount failed, 2 = umount failed, 3 = stop_mds failed,
+# 4 = the portals module is still loaded afterwards, 0 = success.
+test_5d() {
+ # Bring both servers up, then force the OST down before mounting.
+ start_ost
+ start_mds
+ stop_ost --force
+
+ [ -d $MOUNT ] || mkdir -p $MOUNT
+ $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+ # The mount itself is expected to succeed even though the OST is down.
+ llmount $mds_HOST://mds_svc/client_facet $MOUNT || return 1
+
+ umount $MOUNT || return 2
+ # cleanup client modules
+ $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+
+ stop_mds || return 3
+
+ # A "portals" entry still listed by lsmod counts as a failed cleanup.
+ lsmod | grep -q portals && return 4
+ return 0
+
+}
+run_test 5d "ost down, don't crash during mount attempt"
+
test_6() {
setup
manual_umount_client
ALWAYS_EXCEPT="10"
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
+
build_test_filter
assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
}
setup() {
+ gen_config
+
rm -rf logs/*
for i in `seq $NUMOST`; do
wait_for ost$i
if [ "$ONLY" == "cleanup" ]; then
- cleanup
+ $CLEANUP
exit
fi
-if [ -z "$NOSETUP" ]; then
- gen_config
- setup
-fi
-
if [ ! -z "$EVAL" ]; then
eval "$EVAL"
exit $?
fi
+$SETUP
+
if [ "$ONLY" == "setup" ]; then
exit 0
fi
run_test 10 "Running Availability for 6 hours..."
equals_msg "Done, cleaning up"
-cleanup
+$CLEANUP
LUSTRE=${LUSTRE:-`dirname $0`/..}
-UPCALL=${UPCALL:-$PWD/recovery-small-upcall.sh}
+
. $LUSTRE/tests/test-framework.sh
init_test_env $@
mkdir -p $DIR/$tdir
multiop $DIR/$tdir/${tfile} O_wc &
MULTI_PID=$!
- usleep 500
+ sleep 1
cancel_lru_locks OSC
#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
do_facet ost sysctl -w lustre.fail_loc=0x80000308
. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
+
gen_config() {
rm -f $XMLCONFIG
add_mds mds --dev $MDSDEV --size $MDSSIZE
fail mds
fi
- umount $MOUNT2
- umount $MOUNT
+ umount $MOUNT2 || true
+ umount $MOUNT || true
rmmod llite
stop mds ${FORCE}
stop ost2 ${FORCE}
exit
fi
-gen_config
-start ost --reformat $OSTLCONFARGS
-PINGER=`cat /proc/fs/lustre/pinger`
+# setup: generate the configuration, start ost, ost2 and mds (all with
+# --reformat) and mount $MOUNT and $MOUNT2 through zconf_mount unless they
+# are already listed in /proc/mounts.
+# This hunk also removes the old top-level startup sequence, including the
+# PINGER check and the writes to /proc/sys/lustre/timeout and upcall.
+setup() {
+ gen_config
+ start ost --reformat $OSTLCONFARGS
+ start ost2 --reformat $OSTLCONFARGS
+ start mds $MDSLCONFARGS --reformat
+ grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
+ grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
-if [ "$PINGER" != "on" ]; then
- echo "ERROR: Lustre must be built with --enable-pinger for replay-dual"
- stop mds
- exit 1
-fi
-
-start ost2 --reformat $OSTLCONFARGS
-[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-start mds $MDSLCONFARGS --reformat
-grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
-grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
-
-echo $TIMEOUT > /proc/sys/lustre/timeout
-echo $UPCALL > /proc/sys/lustre/upcall
+# NOTE(review): the timeout write is kept only as a comment; the reason is
+# not visible in this hunk.
+# echo $TIMEOUT > /proc/sys/lustre/timeout
+}
+$SETUP
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
test_1() {
}
run_test 6 "open1, open2, unlink |X| close1 [fail mds] close2"
+# test_8: "replay of resent request".
+# Creates $tfile through $MOUNT1 via drop_reint_reply, fails the mds, then
+# checks that the file is visible through $MOUNT2 and removes it.
+# Return codes: 1 = the drop_reint_reply create failed, 2 = checkstat on
+# $MOUNT2 failed, 3 = rm failed, 0 = success.
+# NOTE(review): drop_reint_reply is defined elsewhere; by its name it runs
+# the command with the reint reply dropped -- confirm.
+test_8() {
+ replay_barrier mds
+ drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
+ fail mds
+ checkstat $MOUNT2/$tfile || return 2
+ rm $MOUNT1/$tfile || return 3
+
+ return 0
+}
+run_test 8 "replay of resent request"
+
+# test_9: "resending a replayed create".
+# After the replay barrier one file is created through each client mount,
+# the mds is failed with fail_loc=0x80000119 set (per the inline comment:
+# drop the first reint reply), and both files are then removed.
+# Return codes: 1 = removing the two files failed, 0 = success.
+test_9() {
+ replay_barrier mds
+ mcreate $MOUNT1/$tfile-1
+ mcreate $MOUNT2/$tfile-2
+ # drop first reint reply
+ sysctl -w lustre.fail_loc=0x80000119
+ fail mds
+ sysctl -w lustre.fail_loc=0
+
+ # a bracket expression lists single characters, so no comma separator:
+ # [12] matches exactly the suffixes 1 and 2
+ rm $MOUNT1/$tfile-[12] || return 1
+
+ return 0
+}
+run_test 9 "resending a replayed create"
+
+# test_10: "resending a replayed unlink".
+# $tfile-1 is created before the replay barrier and unlinked after it;
+# $tfile-2 is created after the barrier through $MOUNT2.  The mds is then
+# failed with fail_loc=0x80000119 set (inline comment: drop first reint reply).
+# Return codes: 1 = $tfile-1 can still be stat'ed after failover,
+# 2 = $tfile-2 is missing, 0 = success.
+test_10() {
+ mcreate $MOUNT1/$tfile-1
+ replay_barrier mds
+ munlink $MOUNT1/$tfile-1
+ mcreate $MOUNT2/$tfile-2
+ # drop first reint reply
+ sysctl -w lustre.fail_loc=0x80000119
+ fail mds
+ sysctl -w lustre.fail_loc=0
+
+ # the unlinked file must be gone, the newly created one must exist
+ checkstat $MOUNT1/$tfile-1 && return 1
+ checkstat $MOUNT1/$tfile-2 || return 2
+ rm $MOUNT1/$tfile-2
+
+ return 0
+}
+run_test 10 "resending a replayed unlink"
+
+# test_11: "both clients timeout during replay".
+# Five files are created after the replay barrier, alternating between
+# $MOUNT1 and $MOUNT2.  With fail_loc=0x0119 set the mds is failed over and
+# the test sleeps for twice $TIMEOUT before clearing fail_loc.
+# Return codes: 1 = removing the five files failed, 0 = success.
+# NOTE(review): neighbouring tests use 0x80000119 for "drop first reint
+# reply"; presumably the missing high bit here makes the drop persistent,
+# matching the "drop all" comment below -- confirm.
+test_11() {
+ replay_barrier mds
+ mcreate $MOUNT1/$tfile-1
+ mcreate $MOUNT2/$tfile-2
+ mcreate $MOUNT1/$tfile-3
+ mcreate $MOUNT2/$tfile-4
+ mcreate $MOUNT1/$tfile-5
+ # drop all reint replies for a while
+ sysctl -w lustre.fail_loc=0x0119
+ facet_failover mds
+ #sleep for while, let both clients reconnect and timeout
+ sleep $((TIMEOUT * 2))
+ sysctl -w lustre.fail_loc=0
+
+ rm $MOUNT1/$tfile-[1-5] || return 1
+
+ return 0
+}
+run_test 11 "both clients timeout during replay"
+
+# test_12: "open resend timeout".
+# A background multiop is started on $DIR/$tfile after the replay barrier;
+# the mds is failed over with fail_loc=0x80000302 set (inline comment: drop
+# first enqueue).  Afterwards the file must be a regular file and multiop
+# must exit cleanly once it receives SIGUSR1.
+# Return codes: 1 = df on $MOUNT failed after failover, 2 = $CHECKSTAT
+# failed, 3 = kill failed, 4 = multiop exited non-zero, 0 = success.
+# NOTE(review): on "return 1" fail_loc is not reset to 0 and multiop is left
+# running in the background.
+test_12() {
+ replay_barrier mds
+
+ multiop $DIR/$tfile mo_c &
+ MULTIPID=$!
+ # give the background multiop time to start before injecting the failure
+ sleep 5
+
+ # drop first enqueue
+ sysctl -w lustre.fail_loc=0x80000302
+ facet_failover mds
+ df $MOUNT || return 1
+ sysctl -w lustre.fail_loc=0
+
+ ls $DIR/$tfile
+ $CHECKSTAT -t file $DIR/$tfile || return 2
+ kill -USR1 $MULTIPID || return 3
+ wait $MULTIPID || return 4
+ rm $DIR/$tfile
+
+ return 0
+}
+run_test 12 "open resend timeout"
+
+# test_13: "close resend timeout".
+# A background multiop is started on $DIR/$tfile before the replay barrier
+# and told to finish (SIGUSR1) after it; the mds is then failed over with
+# fail_loc=0x80000115 set (inline comment: drop close).
+# Return codes (not in numeric order in the body): 3 = kill failed,
+# 4 = multiop exited non-zero, 1 = df on $MOUNT failed after failover,
+# 2 = $CHECKSTAT failed, 0 = success.
+# NOTE(review): on "return 1" fail_loc is not reset to 0.
+test_13() {
+ multiop $DIR/$tfile mo_c &
+ MULTIPID=$!
+ # give the background multiop time to start before the barrier
+ sleep 5
+
+ replay_barrier mds
+
+ kill -USR1 $MULTIPID || return 3
+ wait $MULTIPID || return 4
+
+ # drop close
+ sysctl -w lustre.fail_loc=0x80000115
+ facet_failover mds
+ df $MOUNT || return 1
+ sysctl -w lustre.fail_loc=0
+
+ ls $DIR/$tfile
+ $CHECKSTAT -t file $DIR/$tfile || return 2
+ rm $DIR/$tfile
+
+ return 0
+}
+run_test 13 "close resend timeout"
+
+# test_14: "timeouts waiting for lost client during replay".
+# After the replay barrier files are created through both mounts, then
+# $MOUNT2 is unmounted before the mds fails over, so that client is absent
+# during replay.  $MOUNT2 is mounted again at the end.
+# Return codes: 1 = df on $MOUNT succeeded although failure is expected,
+# 2 = unlinking the first 25 files failed, 0 = success.
+# NOTE(review): both early returns happen before zconf_mount, leaving
+# $MOUNT2 unmounted for whatever runs next -- confirm this is acceptable.
+test_14() {
+ replay_barrier mds
+ createmany -o $MOUNT1/$tfile- 25
+ createmany -o $MOUNT2/$tfile-2- 1
+ createmany -o $MOUNT1/$tfile-3- 25
+ umount $MOUNT2
+
+ facet_failover mds
+ # expect failover to fail
+ df $MOUNT && return 1
+
+ # first 25 files should have been
+ # replayed
+ unlinkmany $MOUNT1/$tfile- 25 || return 2
+
+ zconf_mount `hostname` $MOUNT2
+ return 0
+}
+run_test 14 "timeouts waiting for lost client during replay"
+
+# test_15: "timeout waiting for lost client during replay, 1 client completes".
+# After the replay barrier files are created through both mounts, then
+# $MOUNT2 is unmounted before the mds fails over.  Here the remaining
+# client is expected to recover: df on $MOUNT must succeed.
+# Return codes: 1 = df on $MOUNT failed after failover, 2 = unlinking the
+# 25 files failed, 0 = success.
+# NOTE(review): both early returns happen before zconf_mount, leaving
+# $MOUNT2 unmounted -- confirm this is acceptable.
+test_15() {
+ replay_barrier mds
+ createmany -o $MOUNT1/$tfile- 25
+ createmany -o $MOUNT2/$tfile-2- 1
+ umount $MOUNT2
+
+ facet_failover mds
+ df $MOUNT || return 1
+
+ # use $LCTL, as the debug_daemon call in this script does, instead of
+ # whichever lctl happens to come first in PATH
+ # NOTE(review): presumably "dk dk" dumps the kernel debug log into a file
+ # named dk in the current directory -- confirm it is still wanted.
+ $LCTL dk dk
+ unlinkmany $MOUNT1/$tfile- 25 || return 2
+
+ zconf_mount `hostname` $MOUNT2
+ return 0
+}
+run_test 15 "timeout waiting for lost client during replay, 1 client completes"
+
+
if [ "$ONLY" != "setup" ]; then
equals_msg test complete, cleaning up
- cleanup
+ # run the cleanup command named by $CLEANUP rather than calling cleanup
+ # directly; the script defaults CLEANUP to "cleanup" when it is unset
+ $CLEANUP
fi
facet_active() {
local facet=$1
local activevar=${facet}active
+
+ if [ -f ./${facet}active ] ; then
+ source ./${facet}active
+ fi
+
active=${!activevar}
if [ -z "$active" ] ; then
echo -n ${facet}
+# Install the python modules only when UTILS is enabled, but list them in
+# EXTRA_DIST unconditionally: EXTRA_DIST no longer depends on the
+# conditionally defined pymod_SCRIPTS.
+# NOTE(review): the file list is now spelled out twice and must be kept in
+# sync by hand.
+if UTILS
pymod_SCRIPTS = __init__.py lustredb.py error.py cmdline.py
-EXTRA_DIST = $(pymod_SCRIPTS)
+endif
+EXTRA_DIST = __init__.py lustredb.py error.py cmdline.py
AM_CPPFLAGS=$(LLCPPFLAGS)
AM_LDFLAGS := -L$(top_builddir)/portals/utils
+# Script lists are defined outside the UTILS conditional so they can be
+# used both for installation (inside "if UTILS") and for EXTRA_DIST, which
+# is set unconditionally further down.
+sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive \
+ load_ldap.sh lrun lwizard
+bin_scripts = lfind lstripe
+
if UTILS
rootsbin_SCRIPTS = mount.lustre
sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount
-sbin_SCRIPTS = lconf lmc llanalyze llstat.pl llobdstat.pl lactive load_ldap.sh lrun
-sbin_SCRIPTS += lwizard
-bin_SCRIPTS = lfind lstripe
bin_PROGRAMS = lfs
lib_LIBRARIES = liblustreapi.a
-if LIBLUSTRE
-sbin_SCRIPTS += lrun
-endif # LIBLUSTRE
+# the installed script sets are taken from the unconditional lists above
+sbin_SCRIPTS = $(sbin_scripts)
+bin_SCRIPTS = $(bin_scripts)
endif # UTILS
lctl_LDADD := $(LIBREADLINE) -lptlctl
llmount_SOURCES = llmount.c
llmount_LDADD = $(LIBREADLINE) -lptlctl
-EXTRA_DIST = $(bin_SCRIPTS) $(sbin_SCRIPTS)
+# distribute the scripts whether or not UTILS is enabled
+EXTRA_DIST = $(bin_scripts) $(sbin_scripts)
# NOTE: this should only be run on i386.
newwiretest: wirehdr.c wirecheck