Whamcloud - gitweb
Land b1_2_smallfix onto b1_2 (20040616_1009)
author adilger <adilger>
Wed, 16 Jun 2004 16:50:40 +0000 (16:50 +0000)
committer adilger <adilger>
Wed, 16 Jun 2004 16:50:40 +0000 (16:50 +0000)
- allow clients to reconnect during replay (b=1742)
- re-awaken ptlrpcd if new requests arrive during check_set (b=3554)
- fix cond_resched (b=3554)
- only evict unfinished clients after recovery (b=3515)
- allow bulk resend, prevent data loss (b=3570)
- remove extraneous obd dereference causing LASSERT failure (b=3334)
- don't use get_cycles() when creating temp. files on the mds (b=3156)
- hold i_sem when setting i_size in ll_extent_lock() (b=3564)
- fix ksocknal_fmb_callback() error messages (b=2918)

94 files changed:
ldiskfs/ldiskfs/autoMakefile.am
lnet/archdep.m4
lnet/autoMakefile.am
lnet/include/.cvsignore
lnet/include/Makefile.am [new file with mode: 0644]
lnet/include/linux/.cvsignore [new file with mode: 0644]
lnet/include/linux/Makefile.am [new file with mode: 0644]
lnet/include/linux/libcfs.h
lnet/include/lnet/.cvsignore [new file with mode: 0644]
lnet/include/lnet/Makefile.am [new file with mode: 0644]
lnet/include/lnet/types.h
lnet/klnds/qswlnd/qswlnd_cb.c
lnet/klnds/socklnd/socklnd_cb.c
lnet/ulnds/Makefile.am
lnet/ulnds/socklnd/Makefile.am
lnet/utils/Makefile.am
lustre/ChangeLog
lustre/conf/Makefile.am
lustre/configure.in
lustre/include/linux/Makefile.am
lustre/include/linux/lustre_compat25.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lustre_import.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_log.h
lustre/include/linux/obd_class.h
lustre/include/lustre/Makefile.am
lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch
lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch
lustre/kernel_patches/patches/ext-2.4-patch-1.patch
lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch
lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch
lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch
lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch
lustre/kernel_patches/patches/ext3-htree-suse.patch
lustre/kernel_patches/patches/ext3-htree.patch
lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.4.19-suse.patch
lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch
lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch
lustre/kernel_patches/series/chaos-2.4.21
lustre/kernel_patches/series/rh-2.4.20
lustre/kernel_patches/series/suse-2.4.19
lustre/kernel_patches/series/vanilla-2.4.20
lustre/kernel_patches/targets/rh-2.4.target
lustre/ldiskfs/autoMakefile.am
lustre/ldlm/ldlm_lib.c
lustre/liblustre/rw.c
lustre/llite/file.c
lustre/lov/lov_obd.c
lustre/mds/mds_fs.c
lustre/mds/mds_lov.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/llog_ioctl.c
lustre/obdfilter/filter.c
lustre/osc/osc_request.c
lustre/portals/archdep.m4
lustre/portals/autoMakefile.am
lustre/portals/include/.cvsignore
lustre/portals/include/Makefile.am [new file with mode: 0644]
lustre/portals/include/linux/.cvsignore [new file with mode: 0644]
lustre/portals/include/linux/Makefile.am [new file with mode: 0644]
lustre/portals/include/linux/libcfs.h
lustre/portals/include/portals/.cvsignore [new file with mode: 0644]
lustre/portals/include/portals/Makefile.am [new file with mode: 0644]
lustre/portals/include/portals/types.h
lustre/portals/knals/qswnal/qswnal_cb.c
lustre/portals/knals/socknal/socknal_cb.c
lustre/portals/unals/Makefile.am
lustre/portals/utils/Makefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/ptlrpcd.c
lustre/ptlrpc/recover.c
lustre/scripts/Makefile.am
lustre/scripts/lbuild
lustre/scripts/lustre-kernel-2.4.spec.in
lustre/scripts/lustre.spec.in
lustre/tests/.cvsignore
lustre/tests/Makefile.am
lustre/tests/cfg/insanity-mdev.sh
lustre/tests/cfg/local.sh
lustre/tests/conf-sanity.sh
lustre/tests/insanity.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/test-framework.sh
lustre/utils/Lustre/Makefile.am
lustre/utils/Makefile.am

index f81e6e7..eacc902 100644 (file)
@@ -33,10 +33,17 @@ patches := @top_srcdir@/kernel_patches/patches
 sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
        rm -rf linux-stage linux sources $(ldiskfs_SOURCES)
        mkdir -p linux-stage/fs/ext3 linux-stage/include/linux
-       cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
        cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3
        cp $(linux_headers) linux-stage/include/linux
+if USE_QUILT
+       cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
        cd linux-stage && quilt push -a -q
+else
+       @cd linux-stage && for i in $$(cat ../$(series)) ; do \
+               echo "patch -p1 < ../$(patches)/$$i" ; \
+               patch -p1 < ../$(patches)/$$i || exit 1 ; \
+       done
+endif
        mkdir linux
        @echo -n "Replacing 'ext3' with 'ldiskfs':"
        @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \
@@ -50,6 +57,7 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
                        linux-stage/include/linux/ext3$$i \
                        > linux/ldiskfs$$i ; \
        done
+       @echo
        touch sources
 
 foo-check:
index 2a42368..c78fc34 100644 (file)
@@ -92,6 +92,7 @@ AC_CHECK_FILE([$LINUX/include/linux/namei.h],
        [
                linux25="yes"
                KMODEXT=".ko"
+               enable_ldiskfs="yes"
        ],[
                KMODEXT=".o"
                linux25="no"
@@ -101,6 +102,16 @@ AC_MSG_RESULT([$linux25])
 AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
 AC_SUBST(KMODEXT)
 
+AC_PATH_PROG(PATCH, patch, [no])
+AC_PATH_PROG(QUILT, quilt, [no])
+# USE_QUILT gates the quilt commands in the ldiskfs rule: true only if quilt was found
+AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
+if test x$enable_ldiskfs$enable_modules = xyesyes ; then
+       if test x$PATCH$QUILT = xnono ; then
+               AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)])
+       fi
+fi
+
 # -------  Makeflags ------------------
 
 CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
@@ -135,7 +146,7 @@ _ACEOF
 AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE],
 [m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl
 rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko
-AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
+AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
        [$4],
        [_AC_MSG_LOG_CONFTEST
 m4_ifvaln([$5],[$5])dnl])dnl
@@ -446,7 +457,7 @@ LUSTRE_MODULE_TRY_COMPILE(
 # ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 # This needs to run after we've defined the KCPPFLAGS
 
-AC_MSG_CHECKING([for kernel version])
+AC_MSG_CHECKING([if task_struct has a sighand field])
 LUSTRE_MODULE_TRY_COMPILE(
        [
                #include <linux/sched.h>
@@ -455,9 +466,24 @@ LUSTRE_MODULE_TRY_COMPILE(
                p.sighand = NULL;
        ],[
                AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches])
-               AC_MSG_RESULT([redhat-2.4.20])
+               AC_MSG_RESULT([yes])
        ],[
-               AC_MSG_RESULT([$LINUXRELEASE])
+               AC_MSG_RESULT([no])
+       ])
+
+# ---------- 2.4.20 introduced cond_resched --------------
+
+AC_MSG_CHECKING([if kernel offers cond_resched])
+LUSTRE_MODULE_TRY_COMPILE(
+       [
+               #include <linux/sched.h>
+       ],[
+               cond_resched();
+       ],[
+               AC_MSG_RESULT([yes])
+               AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found])
+       ],[
+               AC_MSG_RESULT([no])
        ])
 
 # ---------- Red Hat 2.4.21 backports some more 2.5 bits --------
index bd57e6e..485ff04 100644 (file)
@@ -3,6 +3,6 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-EXTRA_DIST = archdep.m4 build.m4 include 
+EXTRA_DIST = archdep.m4 build.m4
 
-SUBDIRS = portals libcfs knals unals router tests doc utils
+SUBDIRS = portals libcfs knals unals router tests doc utils include
index d45f796..94d3790 100644 (file)
@@ -2,3 +2,5 @@ config.h
 stamp-h
 stamp-h1
 stamp-h.in
+Makefile
+Makefile.in
diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am
new file mode 100644 (file)
index 0000000..2b3eb8c
--- /dev/null
@@ -0,0 +1,3 @@
+SUBDIRS = linux portals
+
+EXTRA_DIST = cygwin-ioctl.h
diff --git a/lnet/include/linux/.cvsignore b/lnet/include/linux/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..3c28c6e
--- /dev/null
@@ -0,0 +1,4 @@
+linuxdir = $(includedir)/linux
+
+EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h    \
+       portals_lib.h
index efdc8fe..6772e82 100644 (file)
@@ -2,7 +2,7 @@
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
 #ifndef _LIBCFS_H
-
+#define _LIBCFS_H
 
 #define PORTAL_DEBUG
 
diff --git a/lnet/include/lnet/.cvsignore b/lnet/include/lnet/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..5ed6090
--- /dev/null
@@ -0,0 +1,10 @@
+portalsdir=$(includedir)/portals
+
+if UTILS
+portals_HEADERS = list.h
+endif
+
+EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h                \
+       internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h       \
+       list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h  \
+       socknal.h stringtab.h types.h
index 74ef493..80995e9 100644 (file)
@@ -1,26 +1,15 @@
 #ifndef _P30_TYPES_H_
 #define _P30_TYPES_H_
 
-#ifdef __linux__
-# include <asm/types.h>
-# if defined(__powerpc__) && !defined(__KERNEL__)
-#  define __KERNEL__
-#  include <asm/timex.h>
-#  undef __KERNEL__
-# else
-#  include <asm/timex.h>
-# endif
-#else
-# include <sys/types.h>
-typedef u_int32_t __u32;
-typedef u_int64_t __u64;
-#endif
+#include <asm/types.h>
 
 #ifdef __KERNEL__
 # include <linux/time.h>
+# include <asm/timex.h>
 #else
 # include <sys/time.h>
 # define do_gettimeofday(tv) gettimeofday(tv, NULL);
+typedef unsigned long long cycles_t;
 #endif
 
 #include <portals/errno.h>
index 6bff730..08453a0 100644 (file)
@@ -585,7 +585,7 @@ kqswnal_launch (kqswnal_tx_t *ktx)
         /* Don't block for transmit descriptor if we're in interrupt context */
         int   attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
         int   dest = kqswnal_nid2elanid (ktx->ktx_nid);
-        long  flags;
+        unsigned long flags;
         int   rc;
 
         ktx->ktx_launchtime = jiffies;
@@ -1429,7 +1429,7 @@ kqswnal_rx (kqswnal_rx_t *krx)
 void 
 kqswnal_rxhandler(EP_RXD *rxd)
 {
-        long          flags;
+        unsigned long flags;
         int           nob    = ep_rxd_len (rxd);
         int           status = ep_rxd_status (rxd);
         kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
@@ -1732,7 +1732,7 @@ kqswnal_scheduler (void *arg)
         kqswnal_rx_t    *krx;
         kqswnal_tx_t    *ktx;
         kpr_fwd_desc_t  *fwd;
-        long             flags;
+        unsigned long    flags;
         int              rc;
         int              counter = 0;
         int              shuttingdown = 0;
index f02cbda..37695c9 100644 (file)
@@ -1187,7 +1187,7 @@ ksocknal_fmb_callback (void *arg, int error)
 {
         ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
         ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
-        ptl_hdr_t         *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page);
+        ptl_hdr_t         *hdr = &fmb->fmb_hdr;
         ksock_conn_t      *conn = NULL;
         ksock_sched_t     *sched;
         unsigned long      flags;
index 4c842a1..15080b0 100644 (file)
@@ -2,7 +2,12 @@ if LIBLUSTRE
 noinst_LIBRARIES = libtcpnal.a
 endif
 
-pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h     \
+       ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h     \
+       dispatch.h table.h timer.h address.c procapi.c proclib.c        \
+       connection.c tcpnal.c connection.h
+
 libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
 libtcpnal_a_CFLAGS = $(LLCFLAGS)
index 4c842a1..15080b0 100644 (file)
@@ -2,7 +2,12 @@ if LIBLUSTRE
 noinst_LIBRARIES = libtcpnal.a
 endif
 
-pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h     \
+       ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h     \
+       dispatch.h table.h timer.h address.c procapi.c proclib.c        \
+       connection.c tcpnal.c connection.h
+
 libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
 libtcpnal_a_CFLAGS = $(LLCFLAGS)
index 15c1774..851a8e1 100644 (file)
@@ -14,8 +14,10 @@ libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS)
 libuptlctl_a_CFLAGS = $(LLCFLAGS)
 endif
 
+if UTILS
 sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
 lib_LIBRARIES = libptlctl.a
+endif
 
 acceptor_SOURCES = acceptor.c
 
index df7d863..30da8bf 100644 (file)
@@ -5,13 +5,22 @@ tbd  Cluster File Systems, Inc. <info@clusterfs.com>
        - strip trailing '/'s before comparing paths with /proc/mounts (3486)
        - remove assertions to work around "in-flight rpcs" recovery bug (3063)
        - change init script to fail more clearly if not run as root (1528)
+       - allow clients to reconnect during replay (1742)
        - fix ns_lock/i_sem lock ordering deadlock for kms update (3477)
        - don't do DNS lookups on NIDs too small for IP addresses (3442)
+       - re-awaken ptlrpcd if new requests arrive during check_set  (3554)
+       - fix cond_resched  (3554)
+       - only evict unfinished clients after recovery   (3515)
+       - allow bulk resend, prevent data loss (3570)
        - dynamic ptlrpc request buffer allocation (2102)
        - don't allow unlinking open directory if it isn't empty (2904)
        - set MDS/OST threads to umask 0 to not clobber client modes (3359)
+       - remove extraneous obd dereference causing LASSERT failure (3334)
+       - don't use get_cycles() when creating temp. files on the mds (3156)
+       - hold i_sem when setting i_size in ll_extent_lock() (3564)
        * miscellania
        - servers can dump a log evicting a client - lustre.dump_on_timeout=1
+       - fix ksocknal_fmb_callback() error messages (2918)
 
 2004-05-27  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.2.2
index 6e3666b..627f2a2 100644 (file)
@@ -6,7 +6,10 @@
 EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif
 ldapconfdir = $(sysconfdir)/openldap
 ldapschemadir = $(sysconfdir)/openldap/schema
+pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)'
+
+if UTILS
 ldapconf_SCRIPTS = slapd-lustre.conf
 ldapschema_SCRIPTS = lustre.schema
-pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)'
 pkglib_DATA = top.ldif lustre2ldif.xsl
+endif
index 7b14e69..99a1347 100644 (file)
@@ -5,7 +5,7 @@
 
 AC_INIT
 AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE(lustre, 1.2.2.3)
+AM_INIT_AUTOMAKE(lustre, 1.2.2.4)
 # AM_MAINTAINER_MODE
 
 # Four main targets: lustre kernel modules, utilities, tests, and liblustre
@@ -77,7 +77,6 @@ AC_ARG_ENABLE([ldiskfs],
                        [use ldiskfs for the Lustre backing FS]),
        [BACKINGFS='ldiskfs'],[enable_ldiskfs='no'])
 AC_MSG_RESULT([$enable_ldiskfs])
-AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes)
 
 AC_MSG_CHECKING([which backing filesystem to use])
 AC_MSG_RESULT([$BACKINGFS])
@@ -158,6 +157,8 @@ AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes)
 sinclude(portals/build.m4)
 sinclude(portals/archdep.m4)
 
+AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes)
+
 if test x$enable_inkernel = xyes ; then
        find . -name Makefile.mk | sed 's/.mk$//' | xargs -n 1 \
                sh -e -x -c '(cp -f $0.mk $0.in)'
@@ -217,6 +218,9 @@ portals/Kernelenv
 portals/Makefile
 portals/autoMakefile
 portals/doc/Makefile
+portals/include/Makefile
+portals/include/linux/Makefile
+portals/include/portals/Makefile
 portals/knals/Makefile
 portals/knals/autoMakefile
 portals/knals/gmnal/Makefile
index cb75fe5..4c67b12 100644 (file)
@@ -3,6 +3,12 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
+linuxdir = $(includedir)/linux
+
+if UTILS
+linux_HEADERS = lustre_idl.h
+endif
+
 EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \
   lustre_mgmt.h obd_cache.h obd_lov.h lustre_dlm.h lustre_handles.h \
   lustre_net.h obd_class.h obd_ost.h obd_support.h lustre_commit_confd.h \
index 13363bd..b9a295e 100644 (file)
@@ -102,10 +102,6 @@ static inline int cleanup_group_info(void)
 
 #define smp_num_cpus    NR_CPUS
 
-#ifndef conditional_schedule
-#define conditional_schedule() cond_resched()
-#endif
-
 #include <linux/proc_fs.h>
 
 #else /* 2.4.. */
@@ -183,8 +179,14 @@ static inline int cleanup_group_info(void)
         return 0;
 }
 
-#ifndef conditional_schedule
-#define conditional_schedule() if (unlikely(need_resched())) schedule()
+#ifndef HAVE_COND_RESCHED
+static inline void cond_resched(void)
+{
+        if (unlikely(need_resched())) {
+                set_current_state(TASK_RUNNING);
+                schedule();
+        }
+}
 #endif
 
 /* to find proc_dir_entry from inode. 2.6 has native one -bzzz */
index 9be781f..52b5c7a 100644 (file)
@@ -73,8 +73,9 @@ struct obd_export {
         spinlock_t                exp_lock; /* protects flags int below */
         /* ^ protects exp_outstanding_replies too */
         int                       exp_flags;
-        int                       exp_failed:1;
-        int                       exp_libclient:1; /* liblustre client? */
+        int                       exp_failed:1,
+                                  exp_replay_needed:1,
+                                  exp_libclient:1; /* liblustre client? */
         union {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
index 72f3817..b9beff5 100644 (file)
@@ -132,7 +132,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd,
         void *parent_handle = oti ? oti->oti_handle : NULL;
         void *handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb,
                                                     parent_handle, logs);
-        CDEBUG(D_HA, "started handle %p (%p)\n", handle, parent_handle);
+        CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
 
         if (oti != NULL) {
                 if (parent_handle == NULL) {
@@ -177,7 +177,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd,
         unsigned long now = jiffies;
         int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
 
-        CDEBUG(D_HA, "committing handle %p (async)\n", *wait_handle);
+        CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
 
@@ -189,7 +189,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd,
 {
         unsigned long now = jiffies;
         int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
-        CDEBUG(D_HA, "waiting for completion %p\n", handle);
+        CDEBUG(D_INFO, "waiting for completion %p\n", handle);
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
         return rc;
index d2af141..74be113 100644 (file)
@@ -83,7 +83,7 @@ struct obd_import {
         int                       imp_invalid:1, imp_replayable:1,
                                   imp_dlm_fake:1, imp_server_timeout:1,
                                   imp_initial_recov:1, imp_force_verify:1,
-                                  imp_pingable:1;
+                                  imp_pingable:1, imp_resend_replay:1;
         __u32                     imp_connect_op;
 };
 
index a529860..c55e5ff 100644 (file)
@@ -493,13 +493,9 @@ static inline void ost_checksum(obd_count *cksum, void *addr, int len)
 
 static inline int ll_insecure_random_int(void)
 {
-#ifdef __arch_um__
         struct timeval t;
         do_gettimeofday(&t);
         return (int)(t.tv_usec);
-#else
-        return (int)(get_cycles() >> 2);
-#endif
 }
 
 /*
index 1d0ff9f..3eb75da 100644 (file)
@@ -127,8 +127,8 @@ int obd_llog_finish(struct obd_device *obd, int count);
 
 /* llog_ioctl.c */
 int llog_ioctl(struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data);
-int llog_catlog_list(struct obd_device *obd, int count,
-                     struct obd_ioctl_data *data);
+int llog_catalog_list(struct obd_device *obd, int count,
+                      struct obd_ioctl_data *data);
 
 /* llog_net.c */
 int llog_initiator_connect(struct llog_ctxt *ctxt);
index 1a577f0..8f2f9e2 100644 (file)
@@ -137,6 +137,7 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd,
                   struct obd_uuid *cluuid);
 int class_disconnect(struct obd_export *exp, int failover);
 void class_disconnect_exports(struct obd_device *obddev, int failover);
+void class_disconnect_stale_exports(struct obd_device *obddev, int failover);
 /* generic operations shared by various OBD types */
 int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
 int class_multi_cleanup(struct obd_device *obddev);
index a785ada..6faa7cd 100644 (file)
@@ -3,7 +3,8 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-
+if UTILS
 pkginclude_HEADERS = lustre_user.h liblustreapi.h
+endif
 
-EXTRA_DIST = $(pkginclude_HEADERS)
+EXTRA_DIST = lustre_user.h liblustreapi.h
diff --git a/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch b/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch
new file mode 100644 (file)
index 0000000..431bdc7
--- /dev/null
@@ -0,0 +1,323 @@
+Index: linux-p4smp/arch/i386/kernel/entry.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/entry.S  2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/entry.S       2004-06-14 13:14:19.000000000 -0700
+@@ -46,6 +46,7 @@
+ #include <asm/segment.h>
+ #include <asm/page.h>
+ #include <asm/smp.h>
++#include <asm/current.h>
+ #include <asm/unistd.h>
+       
+ EBX           = 0x00
+@@ -94,10 +95,6 @@ pt_sys_exit = 8
+ ENOSYS = 38
+-#define GET_CURRENT(reg) \
+-      movl $-8192, reg; \
+-      andl %esp, reg
+-
+ #if CONFIG_X86_HIGH_ENTRY
+ #define call_SYMBOL_NAME_ABS(X) movl $X, %ebp; call *%ebp
+@@ -193,7 +190,7 @@ ENOSYS = 38
+       GET_CURRENT(%ebx);                              \
+       movl real_stack(%ebx), %edx;                    \
+       movl %esp, %ebx;                                \
+-      andl $0x1fff, %ebx;                             \
++      andl $(THREAD_SIZE-1), %ebx;                    \
+       orl %ebx, %edx;                                 \
+       movl %edx, %esp;
+@@ -228,7 +225,7 @@ ENOSYS = 38
+ return_path_start_marker:                             \
+       nop;                                            \
+       movl %esp, %ebx;                                \
+-      andl $0x1fff, %ebx;                             \
++      andl $(THREAD_SIZE-1), %ebx;                    \
+       orl %ebx, %edx;                                 \
+       movl %esp, %eax;                                \
+       movl %edx, %esp;                                \
+Index: linux-p4smp/arch/i386/kernel/smpboot.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/smpboot.c        2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/smpboot.c     2004-06-14 13:14:19.000000000 -0700
+@@ -814,7 +814,7 @@ static void __init do_boot_cpu (int apic
+       /* So we see what's up   */
+       printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+-      stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
++      stack_start.esp = (void *)idle->thread.esp;
+       /*
+        * This grunge runs the startup process for
+@@ -887,7 +887,7 @@ static void __init do_boot_cpu (int apic
+                       Dprintk("CPU has booted.\n");
+               } else {
+                       boot_error= 1;
+-                      if (*((volatile unsigned char *)phys_to_virt(8192))
++                      if (*((volatile unsigned char *)phys_to_virt(THREAD_SIZE))
+                                       == 0xA5)
+                               /* trampoline started but...? */
+                               printk("Stuck ??\n");
+@@ -910,7 +910,7 @@ static void __init do_boot_cpu (int apic
+       }
+       /* mark "stuck" area as not stuck */
+-      *((volatile unsigned long *)phys_to_virt(8192)) = 0;
++      *((volatile unsigned long *)phys_to_virt(THREAD_SIZE)) = 0;
+       if(clustered_apic_mode == CLUSTERED_APIC_NUMAQ) {
+               printk("Restoring NMI vector\n");
+Index: linux-p4smp/arch/i386/kernel/traps.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/traps.c  2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/traps.c       2004-06-14 13:14:19.000000000 -0700
+@@ -273,7 +273,7 @@ void show_trace_task(struct task_struct 
+       unsigned long esp = tsk->thread.esp;
+       /* User space on another CPU? */
+-      if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1))
++      if ((esp ^ (unsigned long)tsk) & ~(THREAD_SIZE - 1))
+               return;
+       show_trace((unsigned long *)esp);
+ }
+Index: linux-p4smp/arch/i386/kernel/head.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/head.S   2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/head.S        2004-06-14 13:14:19.000000000 -0700
+@@ -15,6 +15,7 @@
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+ #include <asm/desc.h>
++#include <asm/current.h>
+ #define OLD_CL_MAGIC_ADDR     0x90020
+ #define OLD_CL_MAGIC          0xA33F
+@@ -328,7 +329,7 @@ rp_sidt:
+       ret
+ ENTRY(stack_start)
+-      .long SYMBOL_NAME(init_task_union)+8192
++      .long SYMBOL_NAME(init_task_union)+THREAD_SIZE
+       .long __KERNEL_DS
+ /* This is the default interrupt "handler" :-) */
+Index: linux-p4smp/arch/i386/kernel/irq.c
+===================================================================
+--- linux-p4smp.orig/arch/i386/kernel/irq.c    2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/kernel/irq.c 2004-06-14 13:14:19.000000000 -0700
+@@ -45,6 +45,7 @@
+ #include <asm/delay.h>
+ #include <asm/desc.h>
+ #include <asm/irq.h>
++#include <asm/current.h>
+@@ -585,7 +586,7 @@ asmlinkage unsigned int do_IRQ(struct pt
+       long esp;
+       /* Debugging check for stack overflow: is there less than 1KB free? */
+-      __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191));
++      __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE-1));
+       if (unlikely(esp < (sizeof(struct task_struct) + 1024))) {
+               extern void show_stack(unsigned long *);
+Index: linux-p4smp/arch/i386/lib/getuser.S
+===================================================================
+--- linux-p4smp.orig/arch/i386/lib/getuser.S   2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/lib/getuser.S        2004-06-14 13:14:19.000000000 -0700
+@@ -21,6 +21,10 @@
+  * as they get called from within inline assembly.
+  */
++/* Duplicated from asm/processor.h */
++#include <asm/current.h>
++#include <linux/config.h>
++
+ addr_limit = 12
+ .text
+@@ -28,7 +32,7 @@ addr_limit = 12
+ .globl __get_user_1
+ __get_user_1:
+       movl %esp,%edx
+-      andl $0xffffe000,%edx
++      andl $~(THREAD_SIZE - 1),%edx
+       cmpl addr_limit(%edx),%eax
+       jae bad_get_user
+ 1:    movzbl (%eax),%edx
+@@ -41,7 +45,7 @@ __get_user_2:
+       addl $1,%eax
+       movl %esp,%edx
+       jc bad_get_user
+-      andl $0xffffe000,%edx
++      andl $~(THREAD_SIZE - 1),%edx
+       cmpl addr_limit(%edx),%eax
+       jae bad_get_user
+ 2:    movzwl -1(%eax),%edx
+@@ -54,7 +58,7 @@ __get_user_4:
+       addl $3,%eax
+       movl %esp,%edx
+       jc bad_get_user
+-      andl $0xffffe000,%edx
++      andl $~(THREAD_SIZE - 1),%edx
+       cmpl addr_limit(%edx),%eax
+       jae bad_get_user
+ 3:    movl -3(%eax),%edx
+Index: linux-p4smp/arch/i386/config.in
+===================================================================
+--- linux-p4smp.orig/arch/i386/config.in       2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/arch/i386/config.in    2004-06-14 13:14:05.000000000 -0700
+@@ -310,6 +310,28 @@ if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86
+    define_bool CONFIG_HAVE_DEC_LOCK y
+ fi
++choice 'Bigger Stack Size Support' \
++     "off    CONFIG_NOBIGSTACK \
++      16KB   CONFIG_STACK_SIZE_16KB \
++      32KB   CONFIG_STACK_SIZE_32KB \
++      64KB   CONFIG_STACK_SIZE_64KB" off
++
++if [ "$CONFIG_NOBIGSTACK" = "y" ]; then
++   define_int CONFIG_STACK_SIZE_SHIFT 1
++else
++  if [ "$CONFIG_STACK_SIZE_16KB" = "y" ]; then
++     define_int CONFIG_STACK_SIZE_SHIFT 2
++  else
++    if [ "$CONFIG_STACK_SIZE_32KB" = "y" ]; then
++      define_int CONFIG_STACK_SIZE_SHIFT 3
++    else
++      if [ "$CONFIG_STACK_SIZE_64KB" = "y" ]; then
++        define_int CONFIG_STACK_SIZE_SHIFT 4
++      fi
++    fi
++  fi
++fi
++
+ source drivers/perfctr/Config.in
+ endmenu
+Index: linux-p4smp/include/asm-i386/current.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/current.h        2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/current.h     2004-06-14 13:41:19.000000000 -0700
+@@ -1,15 +1,64 @@
+ #ifndef _I386_CURRENT_H
+ #define _I386_CURRENT_H
++#include <asm/page.h>
++
++/*
++ * Configurable page sizes on i386, mainly for debugging purposes.
++ * (c) Balbir Singh
++ */
++
++/* enumerate the values, include/asm-i386/hw_irq.h in particular needs this */
++#if (PAGE_SIZE != 4096)
++#error PAGE_SIZE != 4096 unsupported
++#endif
++
++#if (CONFIG_STACK_SIZE_SHIFT == 0)
++#define THREAD_SIZE   4096
++#elif (CONFIG_STACK_SIZE_SHIFT == 1)
++#define THREAD_SIZE   8192
++#elif (CONFIG_STACK_SIZE_SHIFT == 2)
++#define THREAD_SIZE   16384
++#elif (CONFIG_STACK_SIZE_SHIFT == 3)
++#define THREAD_SIZE   32768
++#elif (CONFIG_STACK_SIZE_SHIFT == 4)
++#define THREAD_SIZE   65536
++#else
++#error CONFIG_STACK_SIZE_SHIFT > 4 unsupported
++#endif
++
++#if (CONFIG_STACK_SIZE_SHIFT != 1) && defined(CONFIG_X86_4G)
++#error Large stacks with 4G/4G split unsupported
++#endif
++
++#ifdef __ASSEMBLY__
++
++#define GET_CURRENT(reg) \
++      movl $-THREAD_SIZE, reg; \
++      andl %esp, reg
++
++#else /* __ASSEMBLY__ */
++
++#define __alloc_task_struct() \
++  ((struct task_struct *) __get_free_pages(GFP_KERNEL, CONFIG_STACK_SIZE_SHIFT))
++
++#define __free_task_struct(p) do { \
++  BUG_ON((p)->state < TASK_ZOMBIE); \
++  free_pages((unsigned long) (p), CONFIG_STACK_SIZE_SHIFT); \
++} while(0)
++
++#define INIT_TASK_SIZE THREAD_SIZE
+ struct task_struct;
+ static inline struct task_struct * get_current(void)
+ {
+       struct task_struct *current;
+-      __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
++      __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~(THREAD_SIZE - 1)));
+       return current;
+  }
+  
+ #define current get_current()
++#endif /* __ASSEMBLY__ */
++
+ #endif /* !(_I386_CURRENT_H) */
+Index: linux-p4smp/include/asm-i386/hw_irq.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/hw_irq.h 2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/hw_irq.h      2004-06-14 13:14:19.000000000 -0700
+@@ -136,21 +136,17 @@ extern char _stext, _etext;
+       "                                                               \
+       /* load the real stack - keep the offset */                     \
+                                                                       \
+-      movl $-8192, %ebx;                                              \
++      movl $- " STR(THREAD_SIZE) ", %ebx;                             \
+       andl %esp, %ebx;                                                \
+       movl 36(%ebx), %edx;                                            \
+       movl %esp, %ebx;                                                \
+-      andl $0x1fff, %ebx;                                             \
++      andl $( " STR(THREAD_SIZE) "-1), %ebx;                          \
+       orl %ebx, %edx;                                                 \
+       movl %edx, %esp;"
+ #define IRQ_NAME2(nr) nr##_interrupt(void)
+ #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+-#define GET_CURRENT \
+-      "movl %esp, %ebx\n\t" \
+-      "andl $-8192, %ebx\n\t"
+-
+ /*
+  *    SMP has a few special interrupts for IPI messages
+  */
+Index: linux-p4smp/include/asm-i386/processor.h
+===================================================================
+--- linux-p4smp.orig/include/asm-i386/processor.h      2004-06-14 13:13:07.000000000 -0700
++++ linux-p4smp/include/asm-i386/processor.h   2004-06-14 13:14:19.000000000 -0700
+@@ -14,6 +14,7 @@
+ #include <asm/types.h>
+ #include <asm/sigcontext.h>
+ #include <asm/cpufeature.h>
++#include <asm/current.h>
+ #include <linux/cache.h>
+ #include <linux/config.h>
+ #include <linux/threads.h>
+@@ -498,10 +499,6 @@ unsigned long get_wchan(struct task_stru
+ #define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
+ #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+-#define THREAD_SIZE (2*PAGE_SIZE)
+-#define __alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
+-#define __free_task_struct(p) do { BUG_ON((p)->state < TASK_ZOMBIE); free_pages((unsigned long) (p), 1); } while (0)
+-
+ #define init_task     (init_task_union.task)
+ #define init_stack    (init_task_union.stack)
index 3de6a8f..f6b2f43 100644 (file)
 +              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
 +              rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +              memcpy (to, de, rec_len);
-+              ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++              ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
 +              de->inode = 0;
 +              map++;
 +              to += rec_len;
 +                      rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +                      if (de > to)
 +                              memmove(to, de, rec_len);
-+                      to->rec_len = rec_len;
++                      to->rec_len = cpu_to_le16(rec_len);
 +                      prev = to;
-+                      to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++                      to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
 +              }
 +              de = next;
 +      }
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+      de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
++      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
 +      de = (struct ext3_dir_entry_2 *) data1;
index 748671f..28a1ad6 100644 (file)
 +              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
 +              rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +              memcpy (to, de, rec_len);
-+              ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++              ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
 +              de->inode = 0;
 +              map++;
 +              to += rec_len;
 +                      rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +                      if (de > to)
 +                              memmove(to, de, rec_len);
-+                      to->rec_len = rec_len;
++                      to->rec_len = cpu_to_le16(rec_len);
 +                      prev = to;
-+                      to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++                      to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
 +              }
 +              de = next;
 +      }
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+      de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
++      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
 +      de = (struct ext3_dir_entry_2 *) data1;
index 748671f..28a1ad6 100644 (file)
 +              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
 +              rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +              memcpy (to, de, rec_len);
-+              ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++              ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len);
 +              de->inode = 0;
 +              map++;
 +              to += rec_len;
 +                      rec_len = EXT3_DIR_REC_LEN(de->name_len);
 +                      if (de > to)
 +                              memmove(to, de, rec_len);
-+                      to->rec_len = rec_len;
++                      to->rec_len = cpu_to_le16(rec_len);
 +                      prev = to;
-+                      to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++                      to = (struct ext3_dir_entry_2 *)((char *)to + rec_len);
 +              }
 +              de = next;
 +      }
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
-+      de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
++      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
 +      de = (struct ext3_dir_entry_2 *) data1;
index 031e46d..63684c5 100644 (file)
@@ -21,8 +21,8 @@ Index: linux-2.4.21-chaos/fs/ext3/ialloc.c
 +              EXT3_I(inode)->i_extra_isize = 0;
 +
        ei->i_state = EXT3_STATE_NEW;
-       err = ext3_get_inode_loc_new(inode, &iloc, 1);
-       if (err) goto fail;
+       err = ext3_get_inode_loc_new(inode, &iloc, 1);
+       if (err) goto fail;
 Index: linux-2.4.21-chaos/fs/ext3/inode.c
 ===================================================================
 --- linux-2.4.21-chaos.orig/fs/ext3/inode.c    2003-12-12 17:39:11.000000000 +0300
index c168149..0806c38 100644 (file)
@@ -1667,7 +1667,7 @@ Index: linux-2.4.19-pre1/fs/ext3/namei.c
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
 +      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
index b045d53..4b445f5 100644 (file)
@@ -1667,7 +1667,7 @@ Index: linux-2.4.21-chaos/fs/ext3/namei.c
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
 +      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
index 853fb0c..ca2cacf 100644 (file)
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
 +      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
index 1278f8f..3e5148e 100644 (file)
@@ -1667,7 +1667,7 @@ Index: linux-2.4.21-suse/fs/ext3/namei.c
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
 +      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
index 86b0061..31f2ae3 100644 (file)
 +      data1 = bh2->b_data;
 +
 +      /* The 0th block becomes the root, move the dirents out */
-+      de = (struct ext3_dir_entry_2 *) &root->dotdot;
++      de = (struct ext3_dir_entry_2 *)&root->dotdot;
 +      de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len));
 +      len = ((char *) root) + blocksize - (char *) de;
 +      memcpy (data1, de, len);
diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..2733e7d
--- /dev/null
@@ -0,0 +1,1248 @@
+ fs/ext3/ialloc.c          |    3 
+ fs/ext3/inode.c           |    3 
+ fs/ext3/namei.c           |  582 +++++++++++++++++++++++++++++++++++++---------
+ fs/ext3/super.c           |   14 +
+ include/linux/ext3_fs.h   |    1 
+ include/linux/ext3_fs_i.h |    6 
+ 6 files changed, 500 insertions(+), 109 deletions(-)
+
+Index: linux-2.4.20/fs/ext3/namei.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/namei.c  2004-05-27 15:10:40.000000000 -0400
++++ linux-2.4.20/fs/ext3/namei.c       2004-05-27 15:29:52.000000000 -0400
+@@ -51,6 +51,9 @@
+ {
+       struct buffer_head *bh;
++      /* with parallel dir operations all appends
++       * have to be serialized -bzzz */
++      down(&EXT3_I(inode)->i_append_sem);
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
+@@ -58,6 +61,8 @@
+               EXT3_I(inode)->i_disksize = inode->i_size;
+               ext3_journal_get_write_access(handle,bh);
+       }
++      up(&EXT3_I(inode)->i_append_sem);
++      
+       return bh;
+ }
+@@ -134,6 +139,8 @@
+       struct buffer_head *bh;
+       struct dx_entry *entries;
+       struct dx_entry *at;
++      unsigned long leaf;
++      unsigned int curidx;
+ };
+ struct dx_map_entry
+@@ -142,6 +149,30 @@
+       u32 offs;
+ };
++/* FIXME: this should be reworked using bb_spin_lock
++ * introduced in -mm tree
++ */
++#define BH_DXLock     25
++
++static inline void dx_lock_bh(struct buffer_head volatile *bh)
++{
++#ifdef CONFIG_SMP
++        while (test_and_set_bit(BH_DXLock, &bh->b_state)) {
++                while (test_bit(BH_DXLock, &bh->b_state))
++                        cpu_relax();
++        }
++#endif
++}
++
++static inline void dx_unlock_bh(struct buffer_head *bh)
++{
++#ifdef CONFIG_SMP
++        smp_mb__before_clear_bit();
++        clear_bit(BH_DXLock, &bh->b_state);
++#endif
++}
++
++
+ #ifdef CONFIG_EXT3_INDEX
+ static inline unsigned dx_get_block (struct dx_entry *entry);
+ static void dx_set_block (struct dx_entry *entry, unsigned value);
+@@ -153,7 +184,7 @@
+ static void dx_set_limit (struct dx_entry *entries, unsigned value);
+ static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+ static unsigned dx_node_limit (struct inode *dir);
+-static struct dx_frame *dx_probe(struct dentry *dentry,
++static struct dx_frame *dx_probe(struct qstr *name,
+                                struct inode *dir,
+                                struct dx_hash_info *hinfo,
+                                struct dx_frame *frame,
+@@ -165,15 +196,18 @@
+ static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
+               struct dx_map_entry *offsets, int count);
+ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32);
+ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+                                struct dx_frame *frames, int *err,
+                                __u32 *start_hash);
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+-                     struct ext3_dir_entry_2 **res_dir, int *err);
++                     struct ext3_dir_entry_2 **res_dir, int *err,
++                     int rwlock, void **lock);
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode);
++static inline void *ext3_lock_htree(struct inode *, unsigned long, int);
++static inline void ext3_unlock_htree(struct inode *, void *);
+ /*
+  * Future: use high four bits of block for coalesce-on-delete flags
+@@ -306,6 +340,94 @@
+ #endif /* DX_DEBUG */
+ /*
++ * dx_find_position
++ *
++ * search position of specified hash in index
++ *
++ */
++
++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash)
++{
++      struct dx_entry *p, *q, *m;
++      int count;
++
++      count = dx_get_count(entries);
++      p = entries + 1;
++      q = entries + count - 1;
++      while (p <= q)
++      {
++              m = p + (q - p)/2;
++              if (dx_get_hash(m) > hash)
++                      q = m - 1;
++              else
++                      p = m + 1;
++      }
++      return p - 1;
++}
++
++/*
++ * returns 1 if path is unchanged
++ */
++int dx_check_path(struct dx_frame *frame, u32 hash)
++{
++      struct dx_entry *p;
++      int ret = 1;
++
++      dx_lock_bh(frame->bh);
++      p = dx_find_position(frame->entries, hash);
++      if (frame->leaf != dx_get_block(p))
++              ret = 0;
++      dx_unlock_bh(frame->bh);
++      
++      return ret;
++}
++
++/*
++ * 0 - changed
++ * 1 - hasn't changed
++ */
++static int
++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo)
++{
++      struct dx_entry *p;
++      struct dx_frame *frame = frames;
++      u32 leaf;
++
++      /* check first level */
++      dx_lock_bh(frame->bh);
++      p = dx_find_position(frame->entries, hinfo->hash);
++      leaf = dx_get_block(p);
++      dx_unlock_bh(frame->bh);
++      
++      if (leaf != frame->leaf) 
++              return 0;
++      
++      /* is there 2nd level? */
++      frame++;
++      if (frame->bh == NULL)
++              return 1;
++
++      /* check second level */
++      dx_lock_bh(frame->bh);
++
++      /* the 1st level may have changed meanwhile, check it */
++      if (!dx_check_path(frames, hinfo->hash)) {
++              /* path changed */
++              dx_unlock_bh(frame->bh);
++              return 0;
++      }
++
++      p = dx_find_position(frame->entries, hinfo->hash);
++      leaf = dx_get_block(p);
++      dx_unlock_bh(frame->bh);
++      
++      if (leaf != frame->leaf)
++              return 0;
++
++      return 1;
++}
++
++/*
+  * Probe for a directory leaf block to search.
+  *
+  * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+@@ -315,19 +437,20 @@
+  * back to userspace.
+  */
+ static struct dx_frame *
+-dx_probe(struct dentry *dentry, struct inode *dir,
++dx_probe(struct qstr *name, struct inode *dir,
+        struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+ {
+-      unsigned count, indirect;
+-      struct dx_entry *at, *entries, *p, *q, *m;
++      unsigned indirect;
++      struct dx_entry *at, *entries;
+       struct dx_root *root;
+       struct buffer_head *bh;
+       struct dx_frame *frame = frame_in;
+       u32 hash;
++      unsigned int curidx;
+       frame->bh = NULL;
+-      if (dentry)
+-              dir = dentry->d_parent->d_inode;
++      frame[1].bh = NULL;
++
+       if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
+               goto fail;
+       root = (struct dx_root *) bh->b_data;
+@@ -343,8 +466,8 @@
+       }
+       hinfo->hash_version = root->info.hash_version;
+       hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
+-      if (dentry)
+-              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++      if (name)
++              ext3fs_dirhash(name->name, name->len, hinfo);
+       hash = hinfo->hash;
+       if (root->info.unused_flags & 1) {
+@@ -356,7 +479,19 @@
+               goto fail;
+       }
++repeat:
++      curidx = 0;
++      entries = (struct dx_entry *) (((char *)&root->info) +
++                                     root->info.info_length);
++      assert(dx_get_limit(entries) == dx_root_limit(dir,
++                                                    root->info.info_length));
++      dxtrace (printk("Look up %x", hash));
++      dx_lock_bh(bh);
++      /* indirect must be read under the bh lock because the
++       * 2nd level creation procedure may change it; otherwise dx_probe()
++       * could wrongly assume the htree is still single-level -bzzz */
+       if ((indirect = root->info.indirect_levels) > 1) {
++              dx_unlock_bh(bh);
+               ext3_warning(dir->i_sb, __FUNCTION__,
+                            "Unimplemented inode hash depth: %#06x",
+                            root->info.indirect_levels);
+@@ -364,56 +499,46 @@
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+       }
+-
+-      entries = (struct dx_entry *) (((char *)&root->info) +
+-                                     root->info.info_length);
+-      assert(dx_get_limit(entries) == dx_root_limit(dir,
+-                                                    root->info.info_length));
+-      dxtrace (printk("Look up %x", hash));
++      
+       while (1)
+       {
+-              count = dx_get_count(entries);
+-              assert (count && count <= dx_get_limit(entries));
+-              p = entries + 1;
+-              q = entries + count - 1;
+-              while (p <= q)
+-              {
+-                      m = p + (q - p)/2;
+-                      dxtrace(printk("."));
+-                      if (dx_get_hash(m) > hash)
+-                              q = m - 1;
+-                      else
+-                              p = m + 1;
+-              }
+-
+-              if (0) // linear search cross check
+-              {
+-                      unsigned n = count - 1;
+-                      at = entries;
+-                      while (n--)
+-                      {
+-                              dxtrace(printk(","));
+-                              if (dx_get_hash(++at) > hash)
+-                              {
+-                                      at--;
+-                                      break;
+-                              }
+-                      }
+-                      assert (at == p - 1);
+-              }
+-
+-              at = p - 1;
+-              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++              at = dx_find_position(entries, hinfo->hash);
++              dxtrace(printk(" %x->%u\n",
++                              at == entries? 0: dx_get_hash(at),
++                              dx_get_block(at)));
+               frame->bh = bh;
+               frame->entries = entries;
+               frame->at = at;
+-              if (!indirect--) return frame;
+-              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++              frame->curidx = curidx;
++              frame->leaf = dx_get_block(at);
++              if (!indirect--) {
++                      dx_unlock_bh(bh);
++                      return frame;
++              }
++              
++              /* step into next htree level */
++              curidx = dx_get_block(at);
++              dx_unlock_bh(bh);
++              if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err)))
+                       goto fail2;
++              
++              dx_lock_bh(bh);
++              /* splitting may change root index block and move
++               * hash we're looking for into another index block
++               * so, we have to check this situation and repeat
++               * from the beginning if the path got changed -bzzz */
++              if (!dx_check_path(frame, hash)) {
++                      dx_unlock_bh(bh);
++                      bh = frame->bh;
++                      indirect++;
++                      goto repeat;
++              }
++              
+               at = entries = ((struct dx_node *) bh->b_data)->entries;
+               assert (dx_get_limit(entries) == dx_node_limit (dir));
+               frame++;
+       }
++      dx_unlock_bh(bh);
+ fail2:
+       while (frame >= frame_in) {
+               brelse(frame->bh);
+@@ -427,8 +552,7 @@
+ {
+       if (frames[0].bh == NULL)
+               return;
+-
+-      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++      if (frames[1].bh != NULL)
+               brelse(frames[1].bh);
+       brelse(frames[0].bh);
+ }
+@@ -470,8 +594,10 @@
+        * nodes need to be read.
+        */
+       while (1) {
+-              if (++(p->at) < p->entries + dx_get_count(p->entries))
++              if (++(p->at) < p->entries + dx_get_count(p->entries)) {
++                      p->leaf = dx_get_block(p->at);
+                       break;
++              }
+               if (p == frames)
+                       return 0;
+               num_frames++;
+@@ -497,13 +623,17 @@
+        * block so no check is necessary
+        */
+       while (num_frames--) {
+-              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
+-                                    0, err)))
++              u32 idx;
++              
++              idx = p->leaf = dx_get_block(p->at);
++              if (!(bh = ext3_bread(NULL, dir, idx, 0, err)))
+                       return -1; /* Failure */
+               p++;
+               brelse (p->bh);
+               p->bh = bh;
+               p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++              p->curidx = idx;
++              p->leaf = dx_get_block(p->at);
+       }
+       return 1;
+ }
+@@ -543,7 +673,7 @@
+       dir = dir_file->f_dentry->d_inode;
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++      frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+@@ -625,7 +755,8 @@
+                       count++;
+               }
+               /* XXX: do we need to check rec_len == 0 case? -Chris */
+-              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++              de = (struct ext3_dir_entry_2 *)((char*)de +
++                              le16_to_cpu(de->rec_len));
+       }
+       return count;
+ }
+@@ -658,7 +789,8 @@
+       } while(more);
+ }
+-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++static void dx_insert_block(struct inode *dir, struct dx_frame *frame,
++                      u32 hash, u32 block, u32 idx)
+ {
+       struct dx_entry *entries = frame->entries;
+       struct dx_entry *old = frame->at, *new = old + 1;
+@@ -670,6 +802,7 @@
+       dx_set_hash(new, hash);
+       dx_set_block(new, block);
+       dx_set_count(entries, count + 1);
++      
+ }
+ #endif
+@@ -752,7 +885,8 @@
+       
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+-                                      struct ext3_dir_entry_2 ** res_dir)
++                                      struct ext3_dir_entry_2 ** res_dir,
++                                      int rwlock, void **lock)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh_use[NAMEI_RA_SIZE];
+@@ -768,6 +902,7 @@
+       int namelen;
+       const u8 *name;
+       unsigned blocksize;
++      int do_not_use_dx = 0;
+       *res_dir = NULL;
+       sb = dir->i_sb;
+@@ -776,9 +911,10 @@
+       name = dentry->d_name.name;
+       if (namelen > EXT3_NAME_LEN)
+               return NULL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+       if (is_dx(dir)) {
+-              bh = ext3_dx_find_entry(dentry, res_dir, &err);
++              bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock);
+               /*
+                * On success, or if the error was file not found,
+                * return.  Otherwise, fall back to doing a search the
+@@ -787,8 +923,14 @@
+               if (bh || (err != ERR_BAD_DX_DIR))
+                       return bh;
+               dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++              do_not_use_dx = 1;
+       }
+ #endif
++      *lock = ext3_lock_htree(dir, 0, rwlock);
++      if (is_dx(dir) && !do_not_use_dx) {
++              ext3_unlock_htree(dir, *lock);
++              goto repeat;
++      }
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       start = EXT3_I(dir)->i_dir_start_lookup;
+       if (start >= nblocks)
+@@ -860,12 +1002,17 @@
+       /* Clean up the read-ahead blocks */
+       for (; ra_ptr < ra_max; ra_ptr++)
+               brelse (bh_use[ra_ptr]);
++      if (!ret) {
++              ext3_unlock_htree(dir, *lock);
++              *lock = NULL;
++      }
+       return ret;
+ }
+ #ifdef CONFIG_EXT3_INDEX
+ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
+-                     struct ext3_dir_entry_2 **res_dir, int *err)
++                     struct ext3_dir_entry_2 **res_dir, int *err,
++                     int rwlock, void **lock)
+ {
+       struct super_block * sb;
+       struct dx_hash_info     hinfo;
+@@ -880,11 +1027,22 @@
+       struct inode *dir = dentry->d_parent->d_inode;
+       
+       sb = dir->i_sb;
+-      if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++repeat:
++      if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err)))
+               return NULL;
++      
++      *lock = ext3_lock_htree(dir, frame->leaf, rwlock);
++      /* while we were taking the lock, the leaf we just found may
++       * have been split, so we may need another leaf. check this */
++      if (!dx_check_full_path(frames, &hinfo)) {
++              ext3_unlock_htree(dir, *lock);
++              dx_release(frames);
++              goto repeat;
++      }
++
+       hash = hinfo.hash;
+       do {
+-              block = dx_get_block(frame->at);
++              block = frame->leaf;
+               if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
+                       goto errout;
+               de = (struct ext3_dir_entry_2 *) bh->b_data;
+@@ -918,6 +1076,8 @@
+       *err = -ENOENT;
+ errout:
+       dxtrace(printk("%s not found\n", name));
++      ext3_unlock_htree(dir, *lock);
++      *lock = NULL;
+       dx_release (frames);
+       return NULL;
+ }
+@@ -928,6 +1088,7 @@
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
++    void *lock = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+@@ -935,10 +1096,11 @@
+       if (ext3_check_for_iopen(dir, dentry))
+               return NULL;
+-      bh = ext3_find_entry(dentry, &de);
++      bh = ext3_find_entry(dentry, &de, 0, &lock);
+       inode = NULL;
+       if (bh) {
+               unsigned long ino = le32_to_cpu(de->inode);
++              ext3_unlock_htree(dir, lock);
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+@@ -975,7 +1137,8 @@
+       unsigned rec_len = 0;
+       while (count--) {
+-              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++              struct ext3_dir_entry_2 *de =
++                      (struct ext3_dir_entry_2 *) (from + map->offs);
+               rec_len = EXT3_DIR_REC_LEN(de->name_len);
+               memcpy (to, de, rec_len);
+               ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
+@@ -988,7 +1151,8 @@
+ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+ {
+-      struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++      struct ext3_dir_entry_2 *next, *to, *prev;
++      struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base;
+       unsigned rec_len = 0;
+       prev = to = de;
+@@ -1010,7 +1174,8 @@
+ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+                       struct buffer_head **bh,struct dx_frame *frame,
+-                      struct dx_hash_info *hinfo, int *error)
++                      struct dx_hash_info *hinfo, void **target,
++                      int *error)
+ {
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+@@ -1057,23 +1222,30 @@
+       hash2 = map[split].hash;
+       continued = hash2 == map[split - 1].hash;
+       dxtrace(printk("Split block %i at %x, %i/%i\n",
+-              dx_get_block(frame->at), hash2, split, count-split));
+-
++              frame->leaf, hash2, split, count-split));
++      
+       /* Fancy dance to stay within two buffers */
+       de2 = dx_move_dirents(data1, data2, map + split, count - split);
+       de = dx_pack_dirents(data1,blocksize);
+       de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+       de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
+-      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1));
++      dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1));
+       /* Which block gets the new entry? */
++      *target = NULL;
+       if (hinfo->hash >= hash2)
+       {
+               swap(*bh, bh2);
+               de = de2;
+-      }
+-      dx_insert_block (frame, hash2 + continued, newblock);
++
++              /* entry will be stored into new block
++               * we have to lock it before add_dirent_to_buf */
++              *target = ext3_lock_htree(dir, newblock, 1);
++      }
++      dx_lock_bh(frame->bh);
++      dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx);
++      dx_unlock_bh(frame->bh);
+       err = ext3_journal_dirty_metadata (handle, bh2);
+       if (err)
+               goto journal_error;
+@@ -1147,7 +1319,8 @@
+       nlen = EXT3_DIR_REC_LEN(de->name_len);
+       rlen = le16_to_cpu(de->rec_len);
+       if (de->inode) {
+-              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              struct ext3_dir_entry_2 *de1 =
++                      (struct ext3_dir_entry_2 *)((char *)de + nlen);
+               de1->rec_len = cpu_to_le16(rlen - nlen);
+               de->rec_len = cpu_to_le16(nlen);
+               de = de1;
+@@ -1205,7 +1378,8 @@
+       unsigned        blocksize;
+       struct dx_hash_info hinfo;
+       u32             block;
+-              
++      void            *lock, *new_lock;
++
+       blocksize =  dir->i_sb->s_blocksize;
+       dxtrace(printk("Creating index\n"));
+       retval = ext3_journal_get_write_access(handle, bh);
+@@ -1216,7 +1390,6 @@
+       }
+       root = (struct dx_root *) bh->b_data;
+               
+-      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
+       bh2 = ext3_append (handle, dir, &block, &retval);
+       if (!(bh2)) {
+               brelse(bh);
+@@ -1224,6 +1397,8 @@
+       }
+       data1 = bh2->b_data;
++      lock = ext3_lock_htree(dir, block, 1);
++
+       /* The 0th block becomes the root, move the dirents out */
+       de = (struct ext3_dir_entry_2 *) &root->dotdot;
+       de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
+@@ -1253,13 +1428,25 @@
+       frame->entries = entries;
+       frame->at = entries;
+       frame->bh = bh;
++      frame->curidx = 0;
++      frame->leaf = 0;
++      frame[1].bh = NULL;
+       bh = bh2;
+-      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++      de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval);
+       dx_release (frames);
+       if (!(de))
+-              return retval;
++              goto cleanup;
++
++      retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++cleanup:
++      if (new_lock)
++              ext3_unlock_htree(dir, new_lock);
++      /* we mark directory indexed in order to
++       * avoid races while htree being created -bzzz */
++      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++      ext3_unlock_htree(dir, lock);
+-      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++      return retval;
+ }
+ #endif
+@@ -1288,11 +1475,13 @@
+       unsigned blocksize;
+       unsigned nlen, rlen;
+       u32 block, blocks;
++      void *lock;
+       sb = dir->i_sb;
+       blocksize = sb->s_blocksize;
+       if (!dentry->d_name.len)
+               return -EINVAL;
++repeat:
+ #ifdef CONFIG_EXT3_INDEX
+       if (is_dx(dir)) {
+               retval = ext3_dx_add_entry(handle, dentry, inode);
+@@ -1303,36 +1492,53 @@
+               ext3_mark_inode_dirty(handle, dir);
+       }
+ #endif
++      lock = ext3_lock_htree(dir, 0, 1);
++      if (is_dx(dir)) {
++              /* we got lock for block 0
++               * probably previous holder of the lock
++               * created htree -bzzz */
++              ext3_unlock_htree(dir, lock);
++              goto repeat;
++      }
++      
+       blocks = dir->i_size >> sb->s_blocksize_bits;
+       for (block = 0, offset = 0; block < blocks; block++) {
+               bh = ext3_bread(handle, dir, block, 0, &retval);
+-              if(!bh)
++              if(!bh) {
++                      ext3_unlock_htree(dir, lock);
+                       return retval;
++              }
+               retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
+-              if (retval != -ENOSPC)
++              if (retval != -ENOSPC) {
++                      ext3_unlock_htree(dir, lock);
+                       return retval;
++              }
+ #ifdef CONFIG_EXT3_INDEX
+               if (blocks == 1 && !dx_fallback &&
+-                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+-                      return make_indexed_dir(handle, dentry, inode, bh);
++                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) {
++                      retval = make_indexed_dir(handle, dentry, inode, bh);
++                      ext3_unlock_htree(dir, lock);
++                      return retval;
++              }
+ #endif
+               brelse(bh);
+       }
+       bh = ext3_append(handle, dir, &block, &retval);
+-      if (!bh)
++      if (!bh) {
++              ext3_unlock_htree(dir, lock);
+               return retval;
++      }
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       de->inode = 0;
+       de->rec_len = cpu_to_le16(rlen = blocksize);
+       nlen = 0;
+-      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++      retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++      ext3_unlock_htree(dir, lock);
++      return retval;
+ }
+ #ifdef CONFIG_EXT3_INDEX
+-/*
+- * Returns 0 for success, or a negative error value
+- */
+ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
+                            struct inode *inode)
+ {
+@@ -1344,15 +1550,28 @@
+       struct super_block * sb = dir->i_sb;
+       struct ext3_dir_entry_2 *de;
+       int err;
+-
+-      frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++      int curidx;
++      void *idx_lock, *leaf_lock, *newleaf_lock;
++      
++repeat:
++      frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+       if (!frame)
+               return err;
+-      entries = frame->entries;
+-      at = frame->at;
+-      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++      /* we're going to change leaf, so lock it first */
++      leaf_lock = ext3_lock_htree(dir, frame->leaf, 1);
++
++      /* while we were taking the lock, the leaf we just found may
++       * have been split, so we need to check this */
++      if (!dx_check_full_path(frames, &hinfo)) {
++              ext3_unlock_htree(dir, leaf_lock);
++              dx_release(frames);
++              goto repeat;
++      }
++      if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) {
++              printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err);
+               goto cleanup;
++      }
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh);
+@@ -1365,6 +1584,35 @@
+               goto cleanup;
+       }
++      /* our leaf does not have enough space, hence we have to
++       * split it. so lock the index for this leaf first */
++      curidx = frame->curidx;
++      idx_lock = ext3_lock_htree(dir, curidx, 1);
++
++      /* now check whether the path has changed */
++      dx_release(frames);
++
++      frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode,
++                      &hinfo, frames, &err);
++      if (!frame) {
++              brelse(bh);
++              ext3_unlock_htree(dir, idx_lock);
++              ext3_unlock_htree(dir, leaf_lock); /* don't leak the leaf lock */
++              return err;
++      }
++      
++      if (frame->curidx != curidx) {
++              /* path has been changed. we have to drop old locks
++               * and repeat */
++              brelse(bh);
++              ext3_unlock_htree(dir, idx_lock);
++              ext3_unlock_htree(dir, leaf_lock);
++              dx_release(frames);
++              goto repeat;
++      }
++      entries = frame->entries;
++      at = frame->at;
++
+       /* Block full, should compress but for now just split */
+       dxtrace(printk("using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+@@ -1376,7 +1624,8 @@
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-
++              void *nb_lock;
++              
+               if (levels && (dx_get_count(frames->entries) ==
+                              dx_get_limit(frames->entries))) {
+                       ext3_warning(sb, __FUNCTION__,
+@@ -1387,6 +1636,7 @@
+               bh2 = ext3_append (handle, dir, &newblock, &err);
+               if (!(bh2))
+                       goto cleanup;
++              nb_lock = ext3_lock_htree(dir, newblock, 1);
+               node2 = (struct dx_node *)(bh2->b_data);
+               entries2 = node2->entries;
+               node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+@@ -1398,27 +1648,73 @@
+               if (levels) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
++                      void *ri_lock;
++
++                      /* we have to protect root htree index against
++                       * another dx_add_entry() which would want to
++                       * split it too -bzzz */
++                      ri_lock = ext3_lock_htree(dir, 0, 1);
++
++                      /* as the root index block is now locked, we must repeat the
++                       * search for the current position of our 2nd index -bzzz */
++                      dx_lock_bh(frame->bh);
++                      frames->at = dx_find_position(frames->entries, hinfo.hash);
++                      dx_unlock_bh(frame->bh);
++                      
+                       dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+-                              
+-                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++      
++                      BUFFER_TRACE(frame->bh, "get_write_access");
+                       err = ext3_journal_get_write_access(handle,
+                                                            frames[0].bh);
+                       if (err)
+                               goto journal_error;
+-                              
++                      
++                      /* copy index into new one */
+                       memcpy ((char *) entries2, (char *) (entries + icount1),
+                               icount2 * sizeof(struct dx_entry));
+-                      dx_set_count (entries, icount1);
+                       dx_set_count (entries2, icount2);
+                       dx_set_limit (entries2, dx_node_limit(dir));
+                       /* Which index block gets the new entry? */
+                       if (at - entries >= icount1) {
++                              /* unlock index we won't use */
++                              ext3_unlock_htree(dir, idx_lock);
++                              idx_lock = nb_lock;
+                               frame->at = at = at - entries - icount1 + entries2;
+-                              frame->entries = entries = entries2;
++                              frame->entries = entries2;
++                              frame->curidx = curidx = newblock;
+                               swap(frame->bh, bh2);
++                      } else {
++                              /* we'll use old index, so new one may be unlocked */
++                              ext3_unlock_htree(dir, nb_lock);
+                       }
+-                      dx_insert_block (frames + 0, hash2, newblock);
++              
++                      /* NOTE: very subtle piece of code
++                       * competing dx_probe() may find 2nd level index in root
++                       * index, then we insert new index here and set new count
++                       * in that 2nd level index. so, dx_probe() may see 2nd
++                       * level index w/o hash it looks for. the solution is
++                       * to check root index after we locked the just-found 2nd
++                       * level index -bzzz */
++                      dx_lock_bh(frames[0].bh);
++                      dx_insert_block (dir, frames + 0, hash2, newblock, 0);
++                      dx_unlock_bh(frames[0].bh);
++                      
++                      /* now old and new 2nd level index blocks contain
++                       * all pointers, so dx_probe() may find it in both.
++                       * it's OK -bzzz */
++                      
++                      dx_lock_bh(frame->bh);
++                      dx_set_count(entries, icount1);
++                      dx_unlock_bh(frame->bh);
++
++                      /* now old 2nd level index block points to first half
++                       * of leaves. it's important that dx_probe() must
++                       * check root index block for changes under
++                       * dx_lock_bh(frame->bh) -bzzz */
++
++                      ext3_unlock_htree(dir, ri_lock);
++              
+                       dxtrace(dx_show_index ("node", frames[1].entries));
+                       dxtrace(dx_show_index ("node",
+                              ((struct dx_node *) bh2->b_data)->entries));
+@@ -1427,38 +1723,61 @@
+                               goto journal_error;
+                       brelse (bh2);
+               } else {
++                      unsigned long leaf = frame->leaf;
++
+                       dxtrace(printk("Creating second level index...\n"));
+                       memcpy((char *) entries2, (char *) entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+                       /* Set up root */
++                      dx_lock_bh(frames[0].bh);
+                       dx_set_count(entries, 1);
+                       dx_set_block(entries + 0, newblock);
+                       ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++                      dx_unlock_bh(frames[0].bh);
+                       /* Add new access path frame */
+                       frame = frames + 1;
+                       frame->at = at = at - entries + entries2;
+                       frame->entries = entries = entries2;
+                       frame->bh = bh2;
++                      frame->curidx = newblock;
++                      frame->leaf = leaf;
+                       err = ext3_journal_get_write_access(handle,
+                                                            frame->bh);
+                       if (err)
+                               goto journal_error;
++
++                      /* first level index was root. it's already initialized */
++                      /* we may unlock it now */
++                      ext3_unlock_htree(dir, idx_lock);
++
++                      /* current index is just created 2nd level index */
++                      curidx = newblock;
++                      idx_lock = nb_lock;
+               }
+               ext3_journal_dirty_metadata(handle, frames[0].bh);
+       }
+-      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err);
+       if (!de)
+               goto cleanup;
++
++      /* index has been split */
++      ext3_unlock_htree(dir, idx_lock);
++      
+       err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++
++      if (newleaf_lock)
++              ext3_unlock_htree(dir, newleaf_lock);
++      
+       bh = 0;
+       goto cleanup;
+       
+ journal_error:
+       ext3_std_error(dir->i_sb, err);
+ cleanup:
++      ext3_unlock_htree(dir, leaf_lock);
+       if (bh)
+               brelse(bh);
+       dx_release(frames);
+@@ -1902,6 +2221,7 @@
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
++      void *lock;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle)) {
+@@ -1909,7 +2229,7 @@
+       }
+       retval = -ENOENT;
+-      bh = ext3_find_entry (dentry, &de);
++      bh = ext3_find_entry (dentry, &de, 1, &lock);
+       if (!bh)
+               goto end_rmdir;
+@@ -1920,14 +2240,19 @@
+       DQUOT_INIT(inode);
+       retval = -EIO;
+-      if (le32_to_cpu(de->inode) != inode->i_ino)
++      if (le32_to_cpu(de->inode) != inode->i_ino) {
++              ext3_unlock_htree(dir, lock);
+               goto end_rmdir;
++      }
+       retval = -ENOTEMPTY;
+-      if (!empty_dir (inode))
++      if (!empty_dir (inode)) {
++              ext3_unlock_htree(dir, lock);
+               goto end_rmdir;
++      }
+       retval = ext3_delete_entry(handle, dir, de, bh);
++      ext3_unlock_htree(dir, lock);
+       if (retval)
+               goto end_rmdir;
+       if (inode->i_nlink != 2)
+@@ -1956,6 +2281,7 @@
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
++      void *lock;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle)) {
+@@ -1966,7 +2292,7 @@
+               handle->h_sync = 1;
+       retval = -ENOENT;
+-      bh = ext3_find_entry (dentry, &de);
++      bh = ext3_find_entry (dentry, &de, 1, &lock);
+       if (!bh)
+               goto end_unlink;
+@@ -1974,8 +2300,10 @@
+       DQUOT_INIT(inode);
+       retval = -EIO;
+-      if (le32_to_cpu(de->inode) != inode->i_ino)
++      if (le32_to_cpu(de->inode) != inode->i_ino) {
++              ext3_unlock_htree(dir, lock);
+               goto end_unlink;
++      }
+       
+       if (!inode->i_nlink) {
+               ext3_warning (inode->i_sb, "ext3_unlink",
+@@ -1984,6 +2312,7 @@
+               inode->i_nlink = 1;
+       }
+       retval = ext3_delete_entry(handle, dir, de, bh);
++      ext3_unlock_htree(dir, lock);
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2121,6 +2450,7 @@
+       struct buffer_head * old_bh, * new_bh, * dir_bh;
+       struct ext3_dir_entry_2 * old_de, * new_de;
+       int retval;
++      void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL;
+       old_bh = new_bh = dir_bh = NULL;
+@@ -2133,7 +2463,10 @@
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+-      old_bh = ext3_find_entry (old_dentry, &old_de);
++      if (old_dentry->d_parent == new_dentry->d_parent)
++              down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
++
++      old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */);
+       /*
+        *  Check for inode number is _not_ due to possible IO errors.
+        *  We might rmdir the source, keep it as pwd of some process
+@@ -2146,7 +2479,7 @@
+               goto end_rename;
+       new_inode = new_dentry->d_inode;
+-      new_bh = ext3_find_entry (new_dentry, &new_de);
++      new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */);
+       if (new_bh) {
+               if (!new_inode) {
+                       brelse (new_bh);
+@@ -2213,7 +2546,7 @@
+               struct buffer_head *old_bh2;
+               struct ext3_dir_entry_2 *old_de2;
+-              old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++              old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */);
+               if (old_bh2) {
+                       retval = ext3_delete_entry(handle, old_dir,
+                                                  old_de2, old_bh2);
+@@ -2256,6 +2589,14 @@
+       retval = 0;
+ end_rename:
++      if (lock1)
++              ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1);
++      if (lock2)
++              ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2);
++      if (lock3)
++              ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3);
++      if (old_dentry->d_parent == new_dentry->d_parent)
++              up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem);
+       brelse (dir_bh);
+       brelse (old_bh);
+       brelse (new_bh);
+@@ -2264,6 +2605,29 @@
+ }
+ /*
++ * these locking primitives protect parts of dir's htree; the protection unit
++ * is a block (leaf or index). NOTE(review): rwlock is unused, 1 is hardcoded
++ */
++static inline void *ext3_lock_htree(struct inode *dir,
++                                      unsigned long value, int rwlock)
++{
++      void *lock;
++      
++      if (!test_opt(dir->i_sb, PDIROPS))
++              return NULL;    /* pdirops not enabled: no lock taken */
++      lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL);
++      return lock;
++}
++
++static inline void ext3_unlock_htree(struct inode *dir,
++                                      void *lock)
++{
++      if (!test_opt(dir->i_sb, PDIROPS) || !lock)
++              return;         /* pdirops off or NULL handle: nothing to do */
++      dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock);
++}
++
++/*
+  * directories can handle most operations...
+  */
+ struct inode_operations ext3_dir_inode_operations = {
+Index: linux-2.4.20/fs/ext3/super.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/super.c  2004-05-27 15:10:41.000000000 -0400
++++ linux-2.4.20/fs/ext3/super.c       2004-05-27 15:10:45.000000000 -0400
+@@ -796,6 +796,8 @@
+                               return 0;
+                       }
+               }
++              else if (!strcmp (this_char, "pdirops"))
++                      set_opt (sbi->s_mount_opt, PDIROPS);
+               else if (!strcmp (this_char, "grpid") ||
+                        !strcmp (this_char, "bsdgroups"))
+                       set_opt (*mount_options, GRPID);
+@@ -822,6 +824,9 @@
+                       if (want_numeric(value, "sb", sb_block))
+                               return 0;
+               }
++              else if (!strcmp (this_char, "pdirops")) {
++                      set_opt (sbi->s_mount_opt, PDIROPS);
++              }
+ #ifdef CONFIG_JBD_DEBUG
+               else if (!strcmp (this_char, "ro-after")) {
+                       unsigned long v;
+@@ -985,6 +990,10 @@
+               ext3_check_inodes_bitmap (sb);
+       }
+ #endif
++#ifdef S_PDIROPS
++      if (test_opt (sb, PDIROPS))
++              sb->s_flags |= S_PDIROPS;
++#endif
+       setup_ro_after(sb);
+       return res;
+ }
+@@ -1484,6 +1493,11 @@
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+               "writeback");
++      if (test_opt(sb, PDIROPS)) {
++              printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n");
++              sb->s_flags |= S_PDIROPS;
++      }
++              
+       return sb;
+ failed_mount3:
+Index: linux-2.4.20/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/inode.c  2004-05-27 15:10:41.000000000 -0400
++++ linux-2.4.20/fs/ext3/inode.c       2004-05-27 15:10:45.000000000 -0400
+@@ -2435,6 +2435,9 @@
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
++              dynlock_init(&EXT3_I(inode)->i_htree_lock);
++              sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++              sema_init(&EXT3_I(inode)->i_append_sem, 1);
+       } else if (S_ISLNK(inode->i_mode)) {
+               if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+Index: linux-2.4.20/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-05-27 15:10:39.000000000 -0400
++++ linux-2.4.20/fs/ext3/ialloc.c      2004-05-27 15:10:45.000000000 -0400
+@@ -601,6 +601,9 @@
+               return ERR_PTR(-EDQUOT);
+       }
+       ext3_debug ("allocating inode %lu\n", inode->i_ino);
++      dynlock_init(&EXT3_I(inode)->i_htree_lock);
++      sema_init(&EXT3_I(inode)->i_rename_sem, 1);
++      sema_init(&EXT3_I(inode)->i_append_sem, 1);
+       return inode;
+ fail:
+Index: linux-2.4.20/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs.h  2004-05-27 15:10:40.000000000 -0400
++++ linux-2.4.20/include/linux/ext3_fs.h       2004-05-27 15:10:45.000000000 -0400
+@@ -306,6 +306,7 @@
+ /*
+  * Mount flags
+  */
++#define EXT3_MOUNT_PDIROPS            0x800000/* Parallel dir operations */
+ #define EXT3_MOUNT_CHECK              0x0001  /* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID              0x0004  /* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG              0x0008  /* Some debugging messages */
+Index: linux-2.4.20/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.4.20.orig/include/linux/ext3_fs_i.h        2001-11-22 14:46:19.000000000 -0500
++++ linux-2.4.20/include/linux/ext3_fs_i.h     2004-05-27 15:10:45.000000000 -0400
+@@ -17,6 +17,7 @@
+ #define _LINUX_EXT3_FS_I
+ #include <linux/rwsem.h>
++#include <linux/dynlocks.h>
+ /*
+  * second extended file system inode data in memory
+@@ -73,6 +74,11 @@
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
++
++      /* following fields for parallel directory operations -bzzz */
++      struct dynlock i_htree_lock;
++      struct semaphore i_append_sem;
++      struct semaphore i_rename_sem;
+ };
+ #endif        /* _LINUX_EXT3_FS_I */
diff --git a/lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch
new file mode 100644 (file)
index 0000000..92753de
--- /dev/null
@@ -0,0 +1,170 @@
+ fs/ext3/xattr.c            |   12 +++++-
+ fs/ext3/xattr_trusted.c    |   86 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/ext3_xattr.h |    6 +++
+ 3 files changed, 102 insertions(+), 2 deletions(-)
+
+Index: linux-p4smp/fs/ext3/Makefile
+===================================================================
+--- linux-p4smp.orig/fs/ext3/Makefile  2004-06-14 13:46:11.000000000 -0700
++++ linux-p4smp/fs/ext3/Makefile       2004-06-14 13:50:46.000000000 -0700
+@@ -12,7 +12,8 @@ O_TARGET := ext3.o
+ export-objs := ext3-exports.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+-              ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
++              ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \
++              xattr_trusted.o
+ obj-m    := $(O_TARGET)
+ export-objs += xattr.o
+Index: linux-p4smp/fs/ext3/xattr.c
+===================================================================
+--- linux-p4smp.orig/fs/ext3/xattr.c   2004-06-14 13:46:44.000000000 -0700
++++ linux-p4smp/fs/ext3/xattr.c        2004-06-14 13:50:46.000000000 -0700
+@@ -1780,18 +1780,25 @@ static void ext3_xattr_rehash(struct ext
+ int __init
+ init_ext3_xattr(void)
+ {
++      int error;
++
+       ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
+               sizeof(struct mb_cache_entry) +
+               sizeof(struct mb_cache_entry_index), 1, 61);
+       if (!ext3_xattr_cache)
+               return -ENOMEM;
+-      return 0;
++      error = init_ext3_xattr_trusted();
++      if (error)
++              mb_cache_destroy(ext3_xattr_cache);
++
++      return error;
+ }
+ void
+ exit_ext3_xattr(void)
+ {
++      exit_ext3_xattr_trusted();
+       if (ext3_xattr_cache)
+               mb_cache_destroy(ext3_xattr_cache);
+       ext3_xattr_cache = NULL;
+@@ -1802,12 +1809,13 @@ exit_ext3_xattr(void)
+ int __init
+ init_ext3_xattr(void)
+ {
+-      return 0;
++      return init_ext3_xattr_trusted();
+ }
+ void
+ exit_ext3_xattr(void)
+ {
++      exit_ext3_xattr_trusted();
+ }
+ #endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
+Index: linux-p4smp/fs/ext3/xattr_trusted.c
+===================================================================
+--- linux-p4smp.orig/fs/ext3/xattr_trusted.c   2004-06-14 13:41:58.000000000 -0700
++++ linux-p4smp/fs/ext3/xattr_trusted.c        2004-06-14 13:50:46.000000000 -0700
+@@ -0,0 +1,86 @@
++/*
++ * linux/fs/ext3/xattr_trusted.c
++ * Handler for trusted extended attributes.
++ *
++ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#define XATTR_TRUSTED_PREFIX "trusted."
++
++static size_t
++ext3_xattr_trusted_list(char *list, struct inode *inode,
++                      const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
++
++      if (!capable(CAP_SYS_ADMIN))
++              return 0;       /* contributes no bytes without CAP_SYS_ADMIN */
++
++      if (list) {             /* NULL list: only the size is reported */
++              memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;       /* prefix + name + NUL */
++}
++
++static int
++ext3_xattr_trusted_get(struct inode *inode, const char *name,
++                     void *buffer, size_t size)
++{
++      if (strcmp(name, "") == 0)
++              return -EINVAL; /* empty attribute name is rejected */
++      if (!capable(CAP_SYS_ADMIN))
++              return -EPERM;  /* trusted index requires CAP_SYS_ADMIN */
++      return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
++                            buffer, size);
++}
++
++static int
++ext3_xattr_trusted_set(struct inode *inode, const char *name,
++                     const void *value, size_t size, int flags)
++{
++      handle_t *handle;
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL; /* empty attribute name is rejected */
++      if (!capable(CAP_SYS_ADMIN))
++              return -EPERM;  /* trusted index requires CAP_SYS_ADMIN */
++      handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++      error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_TRUSTED, name,
++                             value, size, flags);
++      ext3_journal_stop(handle, inode);       /* stopped even if the set failed */
++
++      return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_trusted_handler = {
++      .prefix = XATTR_TRUSTED_PREFIX,
++      .list   = ext3_xattr_trusted_list,
++      .get    = ext3_xattr_trusted_get,
++      .set    = ext3_xattr_trusted_set,
++};
++
++int __init
++init_ext3_xattr_trusted(void)
++{
++      return ext3_xattr_register(EXT3_XATTR_INDEX_TRUSTED,
++                                 &ext3_xattr_trusted_handler);
++}
++
++void
++exit_ext3_xattr_trusted(void)
++{
++      ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED,
++                            &ext3_xattr_trusted_handler);
++}
+Index: linux-p4smp/include/linux/ext3_xattr.h
+===================================================================
+--- linux-p4smp.orig/include/linux/ext3_xattr.h        2004-06-14 13:41:58.000000000 -0700
++++ linux-p4smp/include/linux/ext3_xattr.h     2004-06-14 13:50:46.000000000 -0700
+@@ -93,6 +93,9 @@ extern void ext3_xattr_put_super(struct 
+ extern int init_ext3_xattr(void) __init;
+ extern void exit_ext3_xattr(void);
++extern int init_ext3_xattr_trusted(void) __init;
++extern void exit_ext3_xattr_trusted(void);
++
+ # else  /* CONFIG_EXT3_FS_XATTR */
+ #  define ext3_setxattr               NULL
+ #  define ext3_getxattr               NULL
index ad213c9..c0940cf 100644 (file)
@@ -70,11 +70,11 @@ Index: linux-2.4.19.SuSE/fs/ext3/inode.c
        if(ext3_get_inode_loc(inode, &iloc))
                goto bad_inode;
        bh = iloc.bh;
-Index: linux-2.4.19.SuSE/fs/ext3/iopen.c
+Index: lum/fs/ext3/iopen.c
 ===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.c     Sun Nov 16 01:27:31 2003
-+++ linux-2.4.19.SuSE/fs/ext3/iopen.c  Sun Nov 16 01:27:31 2003
-@@ -0,0 +1,258 @@
+--- lum.orig/fs/ext3/iopen.c   2004-03-09 16:46:37.000000000 -0700
++++ lum/fs/ext3/iopen.c        2004-03-09 16:48:03.000000000 -0700
+@@ -0,0 +1,282 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
@@ -211,13 +211,24 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c
 +
 +/* This function is spliced into ext3_lookup and does the move of a
 + * disconnected dentry (if it exists) to a connected dentry.
-+ * Caller must hold dcache_lock.
 + */
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(list_empty(&dentry->d_hash));    /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
++
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
++
 +      /* preferrably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
@@ -231,27 +242,40 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c
 +      }
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
 +      /* Move the goal to the de hash queue - like d_move() */
 +      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
 +      list_del_init(&goal->d_hash);
 +
 +      list_del(&goal->d_child);
-+      list_del(&de->d_child);
++      list_del(&dentry->d_child);
 +
 +      /* Switch the parents and the names.. */
-+      switch_names(goal, de);
-+      do_switch(goal->d_parent, de->d_parent);
-+      do_switch(goal->d_name.len, de->d_name.len);
-+      do_switch(goal->d_name.hash, de->d_name.hash);
++      switch_names(goal, dentry);
++      do_switch(goal->d_parent, dentry->d_parent);
++      do_switch(goal->d_name.len, dentry->d_name.len);
++      do_switch(goal->d_name.hash, dentry->d_name.hash);
 +
 +      /* And add them back to the (new) parent lists */
 +      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
 +      __d_rehash(goal, 0);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
@@ -333,10 +357,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c
 +
 +      return 1;
 +}
-Index: linux-2.4.19.SuSE/fs/ext3/iopen.h
+Index: lum/fs/ext3/iopen.h
 ===================================================================
---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.h     Sun Nov 16 01:27:31 2003
-+++ linux-2.4.19.SuSE/fs/ext3/iopen.h  Sun Nov 16 01:27:31 2003
+--- lum.orig/fs/ext3/iopen.h   2004-03-09 16:46:37.000000000 -0700
++++ lum/fs/ext3/iopen.h        2004-03-09 16:48:03.000000000 -0700
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
@@ -351,8 +375,8 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.h
 +
 +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
 +extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *de,
-+                                         struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
 Index: linux-2.4.19.SuSE/fs/ext3/namei.c
 ===================================================================
 --- linux-2.4.19.SuSE.orig/fs/ext3/namei.c     Sun Nov 16 01:23:20 2003
@@ -366,12 +390,7 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c
  
  /*
   * define how far ahead to read directories while searching them.
-@@ -922,10 +922,14 @@
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -926,6 +927,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
@@ -381,36 +400,62 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -943,7 +948,28 @@
+@@ -943,8 +948,8 @@
                        return ERR_PTR(-EACCES);
                }
        }
 -      d_add(dentry, inode);
+-      return NULL;
 +
-+      /* verify this dentry is really new */
-+      assert(!dentry->d_inode);
-+      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
-+      assert(list_empty(&dentry->d_hash));            /* d_rehash */
-+      assert(list_empty(&dentry->d_subdirs));
-+
-+      spin_lock(&dcache_lock);
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              spin_unlock(&dcache_lock);
-+              iput(inode);
-+              return alternate;
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+ #define S_SHIFT 12
+@@ -1932,10 +1935,6 @@
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2086,6 +2085,23 @@
+       return err;
+ }
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      dput(iopen_connect_dentry(dentry, inode, 0));
++                      return 0;
++              }
 +      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-+      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+      if (inode)                                      /* d_instantiate */
-+              list_add(&dentry->d_alias, &inode->i_dentry);
-+      dentry->d_inode = inode;
-+
-+      __d_rehash(dentry, 0);                          /* d_rehash */
-+      spin_unlock(&dcache_lock);
-+
-       return NULL;
- }
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -2113,7 +2129,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
  
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
 Index: linux-2.4.19.SuSE/fs/ext3/super.c
 ===================================================================
 --- linux-2.4.19.SuSE.orig/fs/ext3/super.c     Sun Nov 16 01:19:22 2003
index 62bd8e1..3bed805 100644 (file)
@@ -74,10 +74,17 @@ Index: linux-ia64/fs/ext3/iopen.c
 ===================================================================
 --- linux-ia64.orig/fs/ext3/iopen.c    2004-03-17 18:02:08.000000000 -0800
 +++ linux-ia64/fs/ext3/iopen.c 2004-03-17 18:10:58.000000000 -0800
-@@ -8,3 +8,275 @@
-  * This file may be redistributed under the terms of the GNU General
-  * Public License.
-  *
+@@ -0,0 +1,282 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
 + *
 + * Invariants:
 + *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
@@ -427,7 +434,7 @@ Index: linux-ia64/fs/ext3/namei.c
 +      if (!err) {
 +              err = ext3_mark_inode_dirty(handle, inode);
 +              if (err == 0) {
-+                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      dput(iopen_connect_dentry(dentry, inode, 0));
 +                      return 0;
 +              }
 +      }
index 12436a7..ee976f6 100644 (file)
@@ -1,7 +1,7 @@
 Index: linux-2.6.5-12.1/fs/exec.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/exec.c    2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/exec.c 2004-05-25 17:32:14.038494200 +0300
+--- linux-2.6.5-12.1.orig/fs/exec.c    2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/exec.c 2004-06-03 18:31:28.000000000 -0400
 @@ -125,9 +125,10 @@
        struct nameidata nd;
        int error;
@@ -47,8 +47,8 @@ Index: linux-2.6.5-12.1/fs/exec.c
                                        if (err) {
 Index: linux-2.6.5-12.1/fs/namei.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/namei.c   2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/namei.c        2004-05-25 17:32:14.040493896 +0300
+--- linux-2.6.5-12.1.orig/fs/namei.c   2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/namei.c        2004-06-03 18:42:17.000000000 -0400
 @@ -270,8 +270,19 @@
        return 0;
  }
@@ -136,25 +136,20 @@ Index: linux-2.6.5-12.1/fs/namei.c
                        dput(next.dentry);
                        mntput(next.mnt);
                        if (err)
-@@ -703,14 +749,29 @@
+@@ -703,14 +749,24 @@
                                inode = nd->dentry->d_inode;
                                /* fallthrough */
                        case 1:
 +                              nd->flags |= LOOKUP_LAST;
 +                              err = revalidate_special(nd);
 +                              nd->flags &= ~LOOKUP_LAST;
++                              if (!nd->dentry->d_inode)
++                                      err = -ENOENT;
 +                              if (err)
-+                                      break;
++                                      goto return_err;
                                goto return_reval;
                }
-+              
-+              if (err) {
-+                      if (!nd->dentry->d_inode)
-+                              err = -ENOENT;
-+                      
-+                      goto return_err;                        
-+              }
-+              
++
                if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
                        err = nd->dentry->d_op->d_hash(nd->dentry, &this);
                        if (err < 0)
@@ -166,7 +161,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
                if (err)
                        break;
                follow_mount(&next.mnt, &next.dentry);
-@@ -936,7 +997,7 @@
+@@ -936,7 +992,7 @@
  }
  
  /* SMP-safe */
@@ -175,7 +170,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
  {
        unsigned long hash;
        struct qstr this;
-@@ -956,11 +1017,16 @@
+@@ -956,11 +1012,16 @@
        }
        this.hash = end_name_hash(hash);
  
@@ -193,7 +188,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
  /*
   *    namei()
   *
-@@ -972,7 +1038,8 @@
+@@ -972,7 +1033,8 @@
   * that namei follows links, while lnamei does not.
   * SMP-safe
   */
@@ -203,12 +198,12 @@ Index: linux-2.6.5-12.1/fs/namei.c
  {
        char *tmp = getname(name);
        int err = PTR_ERR(tmp);
-@@ -987,6 +1054,13 @@
+@@ -987,6 +1049,13 @@
        return err;
  }
  
-+int __user_walk(const char __user *name, unsigned flags,
-+              struct nameidata *nd, const char **pname)
++int fastcall __user_walk(const char __user *name, unsigned flags,
++                       struct nameidata *nd, const char **pname)
 +{
 +      intent_init(&nd->intent, IT_LOOKUP);
 +      return __user_walk_it(name, flags, nd, pname);
@@ -217,7 +212,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
-@@ -1259,8 +1333,8 @@
+@@ -1259,8 +1328,8 @@
                acc_mode |= MAY_APPEND;
  
        /* Fill in the open() intent data */
@@ -228,7 +223,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
  
        /*
         * The simplest case - just a plain lookup.
-@@ -1275,6 +1349,7 @@
+@@ -1275,6 +1344,7 @@
        /*
         * Create - we need to know the parent.
         */
@@ -236,7 +231,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
        error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
        if (error)
                return error;
-@@ -1291,7 +1366,9 @@
+@@ -1291,7 +1361,9 @@
        dir = nd->dentry;
        nd->flags &= ~LOOKUP_PARENT;
        down(&dir->d_inode->i_sem);
@@ -246,7 +241,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1396,7 +1473,9 @@
+@@ -1396,7 +1468,9 @@
        }
        dir = nd->dentry;
        down(&dir->d_inode->i_sem);
@@ -256,7 +251,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
        putname(nd->last.name);
        goto do_last;
  }
-@@ -2196,7 +2275,9 @@
+@@ -2196,7 +2270,9 @@
  __vfs_follow_link(struct nameidata *nd, const char *link)
  {
        int res = 0;
@@ -266,7 +261,7 @@ Index: linux-2.6.5-12.1/fs/namei.c
        if (IS_ERR(link))
                goto fail;
  
-@@ -2206,6 +2287,10 @@
+@@ -2206,6 +2282,10 @@
                        /* weird __emul_prefix() stuff did it */
                        goto out;
        }
@@ -279,8 +274,8 @@ Index: linux-2.6.5-12.1/fs/namei.c
        if (current->link_count || res || nd->last_type!=LAST_NORM)
 Index: linux-2.6.5-12.1/fs/namespace.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/namespace.c       2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/namespace.c    2004-05-25 17:33:44.385759328 +0300
+--- linux-2.6.5-12.1.orig/fs/namespace.c       2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/namespace.c    2004-06-03 18:31:28.000000000 -0400
 @@ -108,6 +108,7 @@
  
  static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
@@ -316,8 +311,8 @@ Index: linux-2.6.5-12.1/fs/namespace.c
                flags &= ~MS_MGC_MSK;
 Index: linux-2.6.5-12.1/fs/open.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/open.c    2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/open.c 2004-05-25 17:32:14.042493592 +0300
+--- linux-2.6.5-12.1.orig/fs/open.c    2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/open.c 2004-06-03 18:31:28.000000000 -0400
 @@ -227,12 +227,12 @@
        struct nameidata nd;
        struct inode * inode;
@@ -485,8 +480,8 @@ Index: linux-2.6.5-12.1/fs/open.c
   */
 Index: linux-2.6.5-12.1/fs/stat.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/stat.c    2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/stat.c 2004-05-25 17:32:14.042493592 +0300
+--- linux-2.6.5-12.1.orig/fs/stat.c    2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/stat.c 2004-06-03 18:31:28.000000000 -0400
 @@ -37,7 +37,7 @@
  
  EXPORT_SYMBOL(generic_fillattr);
@@ -563,8 +558,8 @@ Index: linux-2.6.5-12.1/fs/stat.c
  
 Index: linux-2.6.5-12.1/fs/nfs/dir.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 19:21:53.000000000 +0300
-+++ linux-2.6.5-12.1/fs/nfs/dir.c      2004-05-25 17:32:14.043493440 +0300
+--- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400
++++ linux-2.6.5-12.1/fs/nfs/dir.c      2004-06-03 18:31:28.000000000 -0400
 @@ -709,7 +709,7 @@
                return 0;
        if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
@@ -585,8 +580,8 @@ Index: linux-2.6.5-12.1/fs/nfs/dir.c
         * The 0 argument passed into the create function should one day
 Index: linux-2.6.5-12.1/fs/inode.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/inode.c   2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/inode.c        2004-05-25 17:32:14.044493288 +0300
+--- linux-2.6.5-12.1.orig/fs/inode.c   2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/inode.c        2004-06-03 18:31:28.000000000 -0400
 @@ -221,6 +221,7 @@
        inodes_stat.nr_unused--;
  }
@@ -597,8 +592,8 @@ Index: linux-2.6.5-12.1/fs/inode.c
   * @inode: inode to clear
 Index: linux-2.6.5-12.1/fs/super.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/super.c   2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/fs/super.c        2004-05-25 17:32:14.045493136 +0300
+--- linux-2.6.5-12.1.orig/fs/super.c   2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/fs/super.c        2004-06-03 18:31:28.000000000 -0400
 @@ -789,6 +789,8 @@
        return (struct vfsmount *)sb;
  }
@@ -608,10 +603,22 @@ Index: linux-2.6.5-12.1/fs/super.c
  struct vfsmount *kern_mount(struct file_system_type *type)
  {
        return do_kern_mount(type->name, 0, type->name, NULL);
+Index: linux-2.6.5-12.1/fs/block_dev.c
+===================================================================
+--- linux-2.6.5-12.1.orig/fs/block_dev.c       2004-05-10 12:21:55.000000000 -0400
++++ linux-2.6.5-12.1/fs/block_dev.c    2004-06-03 18:31:28.000000000 -0400
+@@ -834,6 +834,7 @@
+       if (!path || !*path)
+               return ERR_PTR(-EINVAL);
++      intent_init(&nd.intent, IT_LOOKUP);
+       error = path_lookup(path, LOOKUP_FOLLOW, &nd);
+       if (error)
+               return ERR_PTR(error);
 Index: linux-2.6.5-12.1/include/linux/dcache.h
 ===================================================================
---- linux-2.6.5-12.1.orig/include/linux/dcache.h       2004-04-04 06:38:24.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/dcache.h    2004-05-25 17:32:14.045493136 +0300
+--- linux-2.6.5-12.1.orig/include/linux/dcache.h       2004-04-03 22:38:24.000000000 -0500
++++ linux-2.6.5-12.1/include/linux/dcache.h    2004-06-03 18:31:28.000000000 -0400
 @@ -4,6 +4,7 @@
  #ifdef __KERNEL__
  
@@ -631,8 +638,8 @@ Index: linux-2.6.5-12.1/include/linux/dcache.h
        int nr_unused;
 Index: linux-2.6.5-12.1/include/linux/fs.h
 ===================================================================
---- linux-2.6.5-12.1.orig/include/linux/fs.h   2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/fs.h        2004-05-25 17:32:14.046492984 +0300
+--- linux-2.6.5-12.1.orig/include/linux/fs.h   2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/fs.h        2004-06-03 18:31:28.000000000 -0400
 @@ -250,6 +250,8 @@
  #define ATTR_ATTR_FLAG        1024
  #define ATTR_KILL_SUID        2048
@@ -686,8 +693,8 @@ Index: linux-2.6.5-12.1/include/linux/fs.h
  
 Index: linux-2.6.5-12.1/include/linux/namei.h
 ===================================================================
---- linux-2.6.5-12.1.orig/include/linux/namei.h        2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/namei.h     2004-05-25 17:32:14.047492832 +0300
+--- linux-2.6.5-12.1.orig/include/linux/namei.h        2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/namei.h     2004-06-03 18:31:28.000000000 -0400
 @@ -2,25 +2,55 @@
  #define _LINUX_NAMEI_H
  
@@ -783,32 +790,10 @@ Index: linux-2.6.5-12.1/include/linux/namei.h
  extern int follow_down(struct vfsmount **, struct dentry **);
  extern int follow_up(struct vfsmount **, struct dentry **);
  
-Index: linux-2.6.5-12.1/kernel/exit.c
-===================================================================
---- linux-2.6.5-12.1.orig/kernel/exit.c        2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/kernel/exit.c     2004-05-25 17:32:14.047492832 +0300
-@@ -260,6 +260,8 @@
-       write_unlock_irq(&tasklist_lock);
- }
-+EXPORT_SYMBOL(reparent_to_init);
-+
- void __set_special_pids(pid_t session, pid_t pgrp)
- {
-       struct task_struct *curr = current;
-@@ -429,6 +431,8 @@
-       __exit_files(tsk);
- }
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
 Index: linux-2.6.5-12.1/include/linux/fshooks.h
 ===================================================================
---- linux-2.6.5-12.1.orig/include/linux/fshooks.h      2004-05-10 19:21:56.000000000 +0300
-+++ linux-2.6.5-12.1/include/linux/fshooks.h   2004-05-25 17:32:14.048492680 +0300
+--- linux-2.6.5-12.1.orig/include/linux/fshooks.h      2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/include/linux/fshooks.h   2004-06-03 18:31:28.000000000 -0400
 @@ -90,12 +90,18 @@
  
  #define FSHOOK_BEGIN_USER_WALK(type, err, path, flags, nd, field, args...) \
@@ -847,15 +832,25 @@ Index: linux-2.6.5-12.1/include/linux/fshooks.h
  
  #define FSHOOK_END_USER_WALK(type, err, field) ((void)0);}
  
-Index: linux-2.6.5-12.1/fs/block_dev.c
+Index: linux-2.6.5-12.1/kernel/exit.c
 ===================================================================
---- linux-2.6.5-12.1.orig/fs/block_dev.c       2004-05-10 19:21:55.000000000 +0300
-+++ linux-2.6.5-12.1/fs/block_dev.c    2004-05-25 17:32:39.517620784 +0300
-@@ -834,6 +834,7 @@
-       if (!path || !*path)
-               return ERR_PTR(-EINVAL);
+--- linux-2.6.5-12.1.orig/kernel/exit.c        2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/kernel/exit.c     2004-06-03 18:31:28.000000000 -0400
+@@ -260,6 +260,8 @@
+       write_unlock_irq(&tasklist_lock);
+ }
  
-+      intent_init(&nd.intent, IT_LOOKUP);
-       error = path_lookup(path, LOOKUP_FOLLOW, &nd);
-       if (error)
-               return ERR_PTR(error);
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+       struct task_struct *curr = current;
+@@ -429,6 +431,8 @@
+       __exit_files(tsk);
+ }
++EXPORT_SYMBOL(exit_files);
++
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+       /* No need to hold fs->lock if we are killing it */
index 0003912..b3e932f 100644 (file)
@@ -1,3 +1,4 @@
+configurable-x86-stack-2.4.21-chaos.patch 
 dev_read_only_2.4.21-chaos.patch 
 exports_2.4.19-suse.patch
 lustre_version.patch
@@ -26,6 +27,7 @@ add_page_private.patch
 ext3-raw-lookup.patch
 nfs_export_kernel-2.4.21-chaos.patch 
 ext3-ea-in-inode-2.4.21-chaos.patch 
+ext3-trusted_ea-2.4.21-chaos.patch 
 listman-2.4.21-chaos.patch 
 gfp_memalloc-2.4.21-chaos.patch 
 ext3-xattr-ptr-arith-fix.patch
@@ -33,3 +35,4 @@ kernel_text_address-2.4.18-chaos.patch
 pagecache-lock-2.4.21-chaos.patch 
 ext3-truncate-buffer-head.patch
 inode-max-readahead-2.4.24.patch
+dcache_refcount_debug.patch
index 06b2642..22491a0 100644 (file)
@@ -28,7 +28,7 @@ ext3-o_direct-1.2.4.20-rh.patch
 ext3-no-write-super-chaos.patch
 dynamic-locks-2.4.20-rh.patch 
 vfs-pdirops-2.4.20-rh.patch 
-ext3-pdirops-2.4.20-chaos.patch 
+ext3-pdirops-2.4.20-rh.patch 
 tcp_zero_copy_2.4.20_chaos.patch
 gpl_header-chaos-2.4.20.patch
 add_page_private.patch
index 9905491..8748256 100644 (file)
@@ -10,7 +10,6 @@ ext-2.4-patch-1-chaos.patch
 ext-2.4-patch-2.patch
 ext-2.4-patch-3.patch
 ext-2.4-patch-4.patch
-linux-2.4.20-xattr-0.8.54-hp.patch 
 linux-2.4.19-xattr-0.8.54-suse.patch 
 ext3-2.4-ino_t.patch
 ext3-largefile.patch
index ae838ca..d11bec0 100644 (file)
@@ -50,7 +50,5 @@ kernel_text_address-2.4.20-vanilla.patch
 ext3-xattr-ptr-arith-fix.patch
 gfp_memalloc-2.4.22.patch
 procfs-ndynamic-2.4.patch
-linux-2.4.20-tmpfs-xattr.patch
-linux-2.4.20-tmpfs-iopen.patch
 linux-2.4.20-filemap.patch
 ext3-truncate-buffer-head.patch
index cca5324..70af4ab 100644 (file)
@@ -1,7 +1,7 @@
-KERNEL=linux-2.4.20-28.9.tar.gz
+KERNEL=linux-2.4.20-31.9.tar.gz
 SERIES=rh-2.4.20
 VERSION=2.4.20
-EXTRA_VERSION=28.9_lustre
+EXTRA_VERSION=31.9_lustre.1.2.2
 RHBUILD=1
 
 BASE_ARCHS="i586"
@@ -11,3 +11,11 @@ JENSEN_ARCHS=""
 SMP_ARCHS="i586"
 UP_ARCHS=""
 SRC_ARCHS="i586"
+
+# the modules in this kernel do not build with gcc 3
+for cc in i386-redhat-linux-gcc-2.96 gcc296 gcc ; do
+    if which $cc >/dev/null 2>/dev/null ; then
+        CC=$cc
+        break
+    fi
+done
index f81e6e7..eacc902 100644 (file)
@@ -33,10 +33,17 @@ patches := @top_srcdir@/kernel_patches/patches
 sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
        rm -rf linux-stage linux sources $(ldiskfs_SOURCES)
        mkdir -p linux-stage/fs/ext3 linux-stage/include/linux
-       cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
        cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3
        cp $(linux_headers) linux-stage/include/linux
+if USE_QUILT
+       cd linux-stage && quilt setup -l ../$(series) -d ../$(patches)
        cd linux-stage && quilt push -a -q
+else
+       @cd linux-stage && for i in $$(<../$(series)) ; do \
+               echo "patch -p1 < ../$(patches)/$$i" ; \
+               patch -p1 < ../$(patches)/$$i || exit 1 ; \
+       done
+endif
        mkdir linux
        @echo -n "Replacing 'ext3' with 'ldiskfs':"
        @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \
@@ -50,6 +57,7 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series)
                        linux-stage/include/linux/ext3$$i \
                        > linux/ldiskfs$$i ; \
        done
+       @echo
        touch sources
 
 foo-check:
index 906090b..cdd3b07 100644 (file)
@@ -481,11 +481,6 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         if (rc && rc != EALREADY)
                 GOTO(out, rc);
 
-        /* XXX track this all the time? */
-        if (target->obd_recovering) {
-                target->obd_connected_clients++;
-        }
-
         req->rq_repmsg->handle = conn;
 
         /* If the client and the server are the same node, we will already
@@ -528,6 +523,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                 GOTO(out, rc = 0);
         }
 
+        if (target->obd_recovering)
+                target->obd_connected_clients++;
+
         memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn),
                sizeof conn);
 
@@ -580,21 +578,37 @@ void target_destroy_export(struct obd_export *exp)
  * Recovery functions
  */
 
-static void abort_delayed_replies(struct obd_device *obd)
+static void target_finish_recovery(struct obd_device *obd)
 {
-        struct ptlrpc_request *req;
         struct list_head *tmp, *n;
+        int rc;
+
+        CWARN("%s: sending delayed replies to recovered clients\n",
+              obd->obd_name);
+
+        ldlm_reprocess_all_ns(obd->obd_namespace);
+
+        /* when recovery finished, cleanup orphans on mds and ost */
+        if (OBT(obd) && OBP(obd, postrecov)) {
+                rc = OBP(obd, postrecov)(obd);
+                if (rc >= 0)
+                        CWARN("%s: all clients recovered, %d MDS "
+                              "orphans deleted\n", obd->obd_name, rc);
+                else
+                        CERROR("postrecov failed %d\n", rc);
+        }
+
         list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+                struct ptlrpc_request *req;
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                DEBUG_REQ(D_ERROR, req, "aborted:");
-                req->rq_status = -ENOTCONN;
-                req->rq_type = PTL_RPC_MSG_ERR;
+                DEBUG_REQ(D_ERROR, req, "delayed:");
                 ptlrpc_reply(req);
                 class_export_put(req->rq_export);
                 list_del(&req->rq_list);
                 OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
                 OBD_FREE(req, sizeof *req);
         }
+        return;
 }
 
 static void abort_recovery_queue(struct obd_device *obd)
@@ -625,35 +639,24 @@ static void abort_recovery_queue(struct obd_device *obd)
 void target_abort_recovery(void *data)
 {
         struct obd_device *obd = data;
-        int rc;
 
-        CERROR("disconnecting clients and aborting recovery\n");
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (!obd->obd_recovering) {
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 EXIT;
                 return;
         }
-
         obd->obd_recovering = obd->obd_abort_recovery = 0;
-
-        wake_up(&obd->obd_next_transno_waitq);
         target_cancel_recovery_timer(obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
-        class_disconnect_exports(obd, 0);
+        CERROR("%s: recovery period over; disconnecting unfinished clients.\n",
+               obd->obd_name);
+        class_disconnect_stale_exports(obd, 0);
+        abort_recovery_queue(obd);
 
-        /* when recovery was aborted, cleanup orphans on mds and ost */
-        if (OBT(obd) && OBP(obd, postrecov)) {
-                rc = OBP(obd, postrecov)(obd);
-                if (rc >= 0)
-                        CWARN("Cleanup %d orphans after recovery was aborted\n", rc);
-                else
-                        CERROR("postrecov failed %d\n", rc);
-        }
+        target_finish_recovery(obd);
 
-        abort_delayed_replies(obd);
-        abort_recovery_queue(obd);
         ptlrpc_run_recovery_over_upcall(obd);
 }
 
@@ -662,7 +665,8 @@ static void target_recovery_expired(unsigned long castmeharder)
         struct obd_device *obd = (struct obd_device *)castmeharder;
         CERROR("recovery timed out, aborting\n");
         spin_lock_bh(&obd->obd_processing_task_lock);
-        obd->obd_abort_recovery = 1;
+        if (obd->obd_recovering)
+                obd->obd_abort_recovery = 1;
         wake_up(&obd->obd_next_transno_waitq);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 }
@@ -723,6 +727,9 @@ static int check_for_next_transno(struct obd_device *obd)
         queue_len = obd->obd_requests_queued_for_recovery;
         next_transno = obd->obd_next_recovery_transno;
 
+        CDEBUG(D_HA,"max: %d, connected: %d, completed: %d, queue_len: %d, "
+               "req_transno: "LPU64", next_transno: "LPU64"\n",
+               max, connected, completed, queue_len, req_transno, next_transno);
         if (obd->obd_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
@@ -836,6 +843,9 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
          * Also, if this request has a transno less than the one we're waiting
          * for, we should process it now.  It could (and currently always will)
          * be an open request for a descriptor that was opened some time ago.
+         *
+         * Also, a resent, replayed request that has already been
+         * handled will pass through here and be processed immediately.
          */
         if (obd->obd_processing_task == current->pid ||
             transno < obd->obd_next_recovery_transno) {
@@ -847,6 +857,17 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                 return 1;
         }
 
+        /* A resent, replayed request that is still on the queue; just drop it.
+           The queued request will handle this. */
+        if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) ==
+            (MSG_RESENT | MSG_REPLAY)) {
+                DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                OBD_FREE(reqmsg, req->rq_reqlen);
+                OBD_FREE(saved_req, sizeof *saved_req);
+                return 0;
+        }
+
         memcpy(saved_req, req, sizeof *req);
         memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
         req = saved_req;
@@ -902,7 +923,6 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         struct ptlrpc_request *saved_req;
         struct lustre_msg *reqmsg;
         int recovery_done = 0;
-        int rc2;
 
         LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
 
@@ -932,39 +952,22 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
 
         spin_lock_bh(&obd->obd_processing_task_lock);
-        --obd->obd_recoverable_clients;
+        /* only count the first "replay over" request from each
+           export */
+        if (req->rq_export->exp_replay_needed) {
+                --obd->obd_recoverable_clients;
+                req->rq_export->exp_replay_needed = 0;
+        }
         recovery_done = (obd->obd_recoverable_clients == 0);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
         if (recovery_done) {
-                struct list_head *tmp, *n;
-                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
-                CWARN("%s: all clients recovered, sending delayed replies\n",
-                       obd->obd_name);
                 spin_lock_bh(&obd->obd_processing_task_lock);
-                obd->obd_recovering = 0;
+                obd->obd_recovering = obd->obd_abort_recovery = 0;
                 target_cancel_recovery_timer(obd);
                 spin_unlock_bh(&obd->obd_processing_task_lock);
 
-                /* when recovery finished, cleanup orphans on mds and ost */
-                if (OBT(obd) && OBP(obd, postrecov)) {
-                        rc2 = OBP(obd, postrecov)(obd);
-                        if (rc2 >= 0)
-                                CWARN("%s: all clients recovered, %d MDS "
-                                      "orphans deleted\n", obd->obd_name, rc2);
-                        else
-                                CERROR("postrecov failed %d\n", rc2);
-                }
-
-                list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
-                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_ERROR, req, "delayed:");
-                        ptlrpc_reply(req);
-                        class_export_put(req->rq_export);
-                        list_del(&req->rq_list);
-                        OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
-                        OBD_FREE(req, sizeof *req);
-                }
+                target_finish_recovery(obd);
                 ptlrpc_run_recovery_over_upcall(obd);
         } else {
                 CWARN("%s: %d recoverable clients remain\n",
index b55e91f..bacf759 100644 (file)
@@ -193,7 +193,7 @@ int llu_glimpse_size(struct inode *inode)
         rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy,
                          LCK_PR, &flags, llu_extent_lock_callback,
                          ldlm_completion_ast, llu_glimpse_callback, inode,
-                         sizeof(*lvb), lustre_swab_ost_lvb, &lockh);
+                         sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh);
         if (rc > 0)
                 RETURN(-EIO);
 
index ae8034a..4918f98 100644 (file)
@@ -384,7 +384,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
                         break;
                 }
 
-                conditional_schedule();
+                cond_resched();
 
                 page = find_get_page(inode->i_mapping, i);
                 if (page == NULL)
@@ -658,8 +658,19 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
                 rc = -EIO;
 
         if (policy->l_extent.start == 0 &&
-            policy->l_extent.end == OBD_OBJECT_EOF)
+            policy->l_extent.end == OBD_OBJECT_EOF) {
+                /* vmtruncate()->ll_truncate() first sets the i_size and then
+                 * the kms under both a DLM lock and the i_sem.  If we don't
+                 * get the i_sem here we can match the DLM lock and reset
+                 * i_size from the kms before the truncating path has updated
+                 * the kms.  generic_file_write can then trust the stale i_size
+                 * when doing appending writes and effectively cancel the
+                 * result of the truncate.  Getting the i_sem after the enqueue
+                 * maintains the DLM -> i_sem acquisition order. */
+                down(&inode->i_sem);
                 inode->i_size = lov_merge_size(lsm, 1);
+                up(&inode->i_sem);
+        }
 
         //inode->i_mtime = lov_merge_mtime(lsm, inode->i_mtime);
 
index 526eeb3..5bec189 100644 (file)
@@ -274,7 +274,6 @@ static int lov_disconnect(struct obd_export *exp, int flags)
 static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
                               int activate)
 {
-        struct obd_device *obd;
         struct lov_tgt_desc *tgt;
         int i, rc = 0;
         ENTRY;
@@ -293,24 +292,14 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
         if (i == lov->desc.ld_tgt_count)
                 GOTO(out, rc = -EINVAL);
 
-        obd = class_exp2obd(tgt->ltd_exp);
-        if (obd == NULL) {
-                /* This can happen if OST failure races with node shutdown */
-                GOTO(out, rc = -ENOTCONN);
-        }
-
-        CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n",
-               obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
-               obd->obd_type->typ_name, i);
-        LASSERT(strcmp(obd->obd_type->typ_name, "osc") == 0);
-
         if (tgt->active == activate) {
-                CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+                CDEBUG(D_INFO, "OSC %s already %sactive!\n", uuid->uuid,
                        activate ? "" : "in");
                 GOTO(out, rc);
         }
 
-        CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
+        CDEBUG(D_INFO, "Marking OSC %s %sactive\n", uuid->uuid, 
+               activate ? "" : "in");
 
         tgt->active = activate;
         if (activate)
@@ -2071,13 +2060,13 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
                         if (tmp > lock->l_policy_data.l_extent.end)
                                 tmp = lock->l_policy_data.l_extent.end + 1;
                         if (tmp >= loi->loi_kms) {
-                                CDEBUG(D_INODE, "lock acquired, setting rss="
+                                CDEBUG(D_DLMTRACE, "lock acquired, setting rss="
                                        LPU64", kms="LPU64"\n", loi->loi_rss,
                                        tmp);
                                 loi->loi_kms = tmp;
                                 loi->loi_kms_valid = 1;
                         } else {
-                                CDEBUG(D_INODE, "lock acquired, setting rss="
+                                CDEBUG(D_DLMTRACE, "lock acquired, setting rss="
                                        LPU64"; leaving kms="LPU64", end="LPU64
                                        "\n", loi->loi_rss, loi->loi_kms,
                                        lock->l_policy_data.l_extent.end);
@@ -2089,8 +2078,9 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
                         memset(lov_lockhp, 0, sizeof(*lov_lockhp));
                         loi->loi_rss = submd->lsm_oinfo->loi_rss;
                         loi->loi_blocks = submd->lsm_oinfo->loi_blocks;
-                        CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
-                               " kms="LPU64"\n", loi->loi_rss, loi->loi_kms);
+                        CDEBUG(D_DLMTRACE, "glimpsed, setting rss="LPU64
+                               "; leaving kms="LPU64"\n", loi->loi_rss,
+                               loi->loi_kms);
                 } else {
                         memset(lov_lockhp, 0, sizeof(*lov_lockhp));
                         if (lov->tgts[loi->loi_ost_idx].active) {
index 0c74ec0..5505329 100644 (file)
@@ -182,7 +182,7 @@ static int mds_server_free_data(struct mds_obd *mds)
         return 0;
 }
 
-static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
+static int mds_init_server_data(struct obd_device *obd, struct file *file)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct mds_server_data *msd;
@@ -326,6 +326,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                 spin_lock_init(&med->med_open_lock);
 
                 mcd = NULL;
+                exp->exp_replay_needed = 1;
                 obd->obd_recoverable_clients++;
                 obd->obd_max_recoverable_clients++;
                 class_export_put(exp);
@@ -337,7 +338,11 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                        mds->mds_last_transno = last_transno;
         }
 
+        if (mcd)
+                OBD_FREE(mcd, sizeof(*mcd));
+
         obd->obd_last_committed = mds->mds_last_transno;
+
         if (obd->obd_recoverable_clients) {
                 CWARN("RECOVERY: service %s, %d recoverable clients, "
                       "last_transno "LPU64"\n", obd->obd_name,
@@ -346,16 +351,15 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                 obd->obd_recovering = 1;
         }
 
-        if (mcd)
-                OBD_FREE(mcd, sizeof(*mcd));
-        
         mds->mds_mount_count = mount_count + 1;
         msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
 
         /* save it, so mount count and last_transno is current */
         rc = mds_update_server_data(obd, 1);
+        if (rc)
+                GOTO(err_client, rc);
 
-        RETURN(rc);
+        RETURN(0);
 
 err_client:
         class_disconnect_exports(obd, 0);
@@ -455,7 +459,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
                 GOTO(err_last_rcvd, rc = -ENOENT);
         }
 
-        rc = mds_read_last_rcvd(obd, file);
+        rc = mds_init_server_data(obd, file);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc);
                 GOTO(err_last_rcvd, rc);
@@ -562,8 +566,8 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
         ENTRY;
 
         push_ctxt(&saved, &exp->exp_obd->obd_ctxt, NULL);
-        
-        sprintf(fidname, "OBJECTS/%u", tmpname);
+
+        sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid);
         filp = filp_open(fidname, O_CREAT | O_EXCL, 0644);
         if (IS_ERR(filp)) {
                 rc = PTR_ERR(filp);
index d93ce0e..f0bf35b 100644 (file)
@@ -435,7 +435,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
 
         case OBD_IOC_CATLOGLIST: {
                 int count = mds->mds_lov_desc.ld_tgt_count;
-                rc = llog_catlog_list(obd, count, data);
+                rc = llog_catalog_list(obd, count, data);
                 RETURN(rc);
 
         }
index ee096ac..2952fce 100644 (file)
@@ -418,6 +418,7 @@ EXPORT_SYMBOL(class_exp2cliimp);
 EXPORT_SYMBOL(class_conn2cliimp);
 EXPORT_SYMBOL(class_disconnect);
 EXPORT_SYMBOL(class_disconnect_exports);
+EXPORT_SYMBOL(class_disconnect_stale_exports);
 
 EXPORT_SYMBOL(oig_init);
 EXPORT_SYMBOL(oig_release);
index a8db9cb..0429ceb 100644 (file)
@@ -603,24 +603,17 @@ int class_disconnect(struct obd_export *export, int flags)
         RETURN(0);
 }
 
-void class_disconnect_exports(struct obd_device *obd, int flags)
+static void  class_disconnect_export_list(struct list_head *list, int flags)
 {
         int rc;
-        struct list_head *tmp, *n, work_list;
         struct lustre_handle fake_conn;
         struct obd_export *fake_exp, *exp;
         ENTRY;
 
-        /* Move all of the exports from obd_exports to a work list, en masse. */
-        spin_lock(&obd->obd_dev_lock);
-        list_add(&work_list, &obd->obd_exports);
-        list_del_init(&obd->obd_exports);
-        spin_unlock(&obd->obd_dev_lock);
-
-        CDEBUG(D_HA, "OBD device %d (%p) has exports, "
-               "disconnecting them\n", obd->obd_minor, obd);
-        list_for_each_safe(tmp, n, &work_list) {
-                exp = list_entry(tmp, struct obd_export, exp_obd_chain);
+        /* It's possible that an export may disconnect itself, but 
+         * nothing else will be added to this list. */
+        while(!list_empty(list)) {
+                exp = list_entry(list->next, struct obd_export, exp_obd_chain);
                 class_export_get(exp);
 
                 if (obd_uuid_equals(&exp->exp_client_uuid,
@@ -653,6 +646,51 @@ void class_disconnect_exports(struct obd_device *obd, int flags)
         EXIT;
 }
 
+void class_disconnect_exports(struct obd_device *obd, int flags)
+{
+        struct list_head work_list;
+        ENTRY;
+
+        /* Move all exports to work_list: link it in, unlink the old head. */
+        spin_lock(&obd->obd_dev_lock);
+        list_add(&work_list, &obd->obd_exports);
+        list_del_init(&obd->obd_exports);
+        spin_unlock(&obd->obd_dev_lock);
+
+        CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+               "disconnecting them\n", obd->obd_minor, obd); /* unconditional */
+        class_disconnect_export_list(&work_list, flags); /* lock not held */
+        EXIT;
+}
+
+/* Disconnect every export of @obd that still has exp_replay_needed set,
+ * i.e. has not completed recovery; @flags goes to the list helper as-is. */
+void class_disconnect_stale_exports(struct obd_device *obd, int flags)
+{
+        struct list_head work_list;
+        struct list_head *pos, *n;
+        struct obd_export *exp;
+        int cnt = 0;
+        ENTRY;
+  
+        INIT_LIST_HEAD(&work_list);
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_safe(pos, n, &obd->obd_exports) {
+                exp = list_entry(pos, struct obd_export, exp_obd_chain);
+                if (exp->exp_replay_needed) {
+                        list_del(&exp->exp_obd_chain); /* off obd_exports */
+                        list_add(&exp->exp_obd_chain, &work_list);
+                        cnt++;
+                }
+        }
+        spin_unlock(&obd->obd_dev_lock);
+
+        CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", 
+               obd->obd_name, cnt);
+        class_disconnect_export_list(&work_list, flags); /* lock not held */
+        EXIT;
+}
+
 int oig_init(struct obd_io_group **oig_out)
 {
         struct obd_io_group *oig;
index 6c060e7..6c53036 100644 (file)
@@ -377,7 +377,7 @@ out:
 }
 EXPORT_SYMBOL(llog_ioctl);
 
-int llog_catlog_list(struct obd_device *obd, int count,
+int llog_catalog_list(struct obd_device *obd, int count,
                      struct obd_ioctl_data *data)
 {
         int size, i;
@@ -418,4 +418,4 @@ int llog_catlog_list(struct obd_device *obd, int count,
         RETURN(0);
 
 }
-EXPORT_SYMBOL(llog_catlog_list);
+EXPORT_SYMBOL(llog_catalog_list);
index cf4797b..dd4e563 100644 (file)
@@ -477,6 +477,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 spin_lock_init(&fed->fed_lock);
 
                 fcd = NULL;
+                exp->exp_replay_needed = 1;
                 obd->obd_recoverable_clients++;
                 class_export_put(exp);
 
@@ -488,6 +489,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
 
         }
 
+        if (fcd)
+                OBD_FREE(fcd, sizeof(*fcd));
+
         obd->obd_last_committed = le64_to_cpu(fsd->fsd_last_transno);
 
         if (obd->obd_recoverable_clients) {
@@ -498,17 +502,16 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 obd->obd_recovering = 1;
         }
 
-        if (fcd)
-                OBD_FREE(fcd, sizeof(*fcd));
-
 out:
         filter->fo_mount_count = mount_count + 1;
         fsd->fsd_mount_count = cpu_to_le64(filter->fo_mount_count);
 
         /* save it, so mount count and last_transno is current */
         rc = filter_update_server_data(obd, filp, filter->fo_fsd, 1);
+        if (rc)
+                GOTO(err_client, rc);
 
-        RETURN(rc);
+        RETURN(0);
 
 err_client:
         class_disconnect_exports(obd, 0);
@@ -2336,7 +2339,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
         }
 
         case OBD_IOC_CATLOGLIST: {
-                rc = llog_catlog_list(obd, 1, data);
+                rc = llog_catalog_list(obd, 1, data);
                 RETURN(rc);
         }
 
index ded86b3..da09be4 100644 (file)
@@ -779,7 +779,6 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
                 lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)));
         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
         spin_lock_irqsave(&req->rq_lock, flags);
-        req->rq_no_resend = 1;
         spin_unlock_irqrestore(&req->rq_lock, flags);
 
         /* size[0] still sizeof (*body) */
@@ -901,8 +900,6 @@ restart_bulk:
         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
                                   page_count, pga, &requested_nob, &niocount,
                                   &request);
-        /* NB ^ sets rq_no_resend */
-
         if (rc != 0)
                 return (rc);
 
@@ -931,13 +928,6 @@ static int brw_interpret(struct ptlrpc_request *request,
         struct brw_page *pga = aa->aa_pga;
         ENTRY;
 
-        /* XXX bug 937 here */
-        if (rc == -ETIMEDOUT && request->rq_resend) {
-                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
-                LBUG(); /* re-send.  later. */
-                //goto restart_bulk;
-        }
-
         rc = osc_brw_fini_request(request, oa, requested_nob, niocount,
                                   page_count, pga, rc);
         RETURN (rc);
@@ -957,7 +947,6 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
                                   page_count, pga, &requested_nob, &nio_count,
                                   &request);
-        /* NB ^ sets rq_no_resend */
 
         if (rc == 0) {
                 LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
index 2a42368..c78fc34 100644 (file)
@@ -92,6 +92,7 @@ AC_CHECK_FILE([$LINUX/include/linux/namei.h],
        [
                linux25="yes"
                KMODEXT=".ko"
+               enable_ldiskfs="yes"
        ],[
                KMODEXT=".o"
                linux25="no"
@@ -101,6 +102,16 @@ AC_MSG_RESULT([$linux25])
 AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
 AC_SUBST(KMODEXT)
 
+AC_PATH_PROG(PATCH, patch, [no])
+AC_PATH_PROG(QUILT, quilt, [no])
+AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) dnl true only if quilt was found
+
+if test x$enable_ldiskfs$enable_modules = xyesyes ; then
+       if test x$PATCH$QUILT = xnono ; then
+               AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)])
+       fi
+fi
+
 # -------  Makeflags ------------------
 
 CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
@@ -135,7 +146,7 @@ _ACEOF
 AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE],
 [m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl
 rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko
-AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
+AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])],
        [$4],
        [_AC_MSG_LOG_CONFTEST
 m4_ifvaln([$5],[$5])dnl])dnl
@@ -446,7 +457,7 @@ LUSTRE_MODULE_TRY_COMPILE(
 # ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 # This needs to run after we've defined the KCPPFLAGS
 
-AC_MSG_CHECKING([for kernel version])
+AC_MSG_CHECKING([if task_struct has a sighand field])
 LUSTRE_MODULE_TRY_COMPILE(
        [
                #include <linux/sched.h>
@@ -455,9 +466,24 @@ LUSTRE_MODULE_TRY_COMPILE(
                p.sighand = NULL;
        ],[
                AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches])
-               AC_MSG_RESULT([redhat-2.4.20])
+               AC_MSG_RESULT([yes])
        ],[
-               AC_MSG_RESULT([$LINUXRELEASE])
+               AC_MSG_RESULT([no])
+       ])
+
+# ---------- 2.4.20 introduced cond_resched --------------
+
+AC_MSG_CHECKING([if kernel offers cond_resched])
+LUSTRE_MODULE_TRY_COMPILE(
+       [
+               #include <linux/sched.h>
+       ],[
+               cond_resched();
+       ],[
+               AC_MSG_RESULT([yes])
+               AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found])
+       ],[
+               AC_MSG_RESULT([no])
        ])
 
 # ---------- Red Hat 2.4.21 backports some more 2.5 bits --------
index bd57e6e..485ff04 100644 (file)
@@ -3,6 +3,6 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-EXTRA_DIST = archdep.m4 build.m4 include 
+EXTRA_DIST = archdep.m4 build.m4
 
-SUBDIRS = portals libcfs knals unals router tests doc utils
+SUBDIRS = portals libcfs knals unals router tests doc utils include
index d45f796..94d3790 100644 (file)
@@ -2,3 +2,5 @@ config.h
 stamp-h
 stamp-h1
 stamp-h.in
+Makefile
+Makefile.in
diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am
new file mode 100644 (file)
index 0000000..2b3eb8c
--- /dev/null
@@ -0,0 +1,3 @@
+SUBDIRS = linux portals
+
+EXTRA_DIST = cygwin-ioctl.h
diff --git a/lustre/portals/include/linux/.cvsignore b/lustre/portals/include/linux/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..3c28c6e
--- /dev/null
@@ -0,0 +1,4 @@
+linuxdir = $(includedir)/linux
+
+EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h    \
+       portals_lib.h
index efdc8fe..6772e82 100644 (file)
@@ -2,7 +2,7 @@
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
 #ifndef _LIBCFS_H
-
+#define _LIBCFS_H
 
 #define PORTAL_DEBUG
 
diff --git a/lustre/portals/include/portals/.cvsignore b/lustre/portals/include/portals/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am
new file mode 100644 (file)
index 0000000..5ed6090
--- /dev/null
@@ -0,0 +1,10 @@
+portalsdir=$(includedir)/portals
+
+if UTILS
+portals_HEADERS = list.h
+endif
+
+EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h                \
+       internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h       \
+       list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h  \
+       socknal.h stringtab.h types.h
index 74ef493..80995e9 100644 (file)
@@ -1,26 +1,15 @@
 #ifndef _P30_TYPES_H_
 #define _P30_TYPES_H_
 
-#ifdef __linux__
-# include <asm/types.h>
-# if defined(__powerpc__) && !defined(__KERNEL__)
-#  define __KERNEL__
-#  include <asm/timex.h>
-#  undef __KERNEL__
-# else
-#  include <asm/timex.h>
-# endif
-#else
-# include <sys/types.h>
-typedef u_int32_t __u32;
-typedef u_int64_t __u64;
-#endif
+#include <asm/types.h>
 
 #ifdef __KERNEL__
 # include <linux/time.h>
+# include <asm/timex.h>
 #else
 # include <sys/time.h>
 # define do_gettimeofday(tv) gettimeofday(tv, NULL);
+typedef unsigned long long cycles_t;
 #endif
 
 #include <portals/errno.h>
index 6bff730..08453a0 100644 (file)
@@ -585,7 +585,7 @@ kqswnal_launch (kqswnal_tx_t *ktx)
         /* Don't block for transmit descriptor if we're in interrupt context */
         int   attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
         int   dest = kqswnal_nid2elanid (ktx->ktx_nid);
-        long  flags;
+        unsigned long flags;
         int   rc;
 
         ktx->ktx_launchtime = jiffies;
@@ -1429,7 +1429,7 @@ kqswnal_rx (kqswnal_rx_t *krx)
 void 
 kqswnal_rxhandler(EP_RXD *rxd)
 {
-        long          flags;
+        unsigned long flags;
         int           nob    = ep_rxd_len (rxd);
         int           status = ep_rxd_status (rxd);
         kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
@@ -1732,7 +1732,7 @@ kqswnal_scheduler (void *arg)
         kqswnal_rx_t    *krx;
         kqswnal_tx_t    *ktx;
         kpr_fwd_desc_t  *fwd;
-        long             flags;
+        unsigned long    flags;
         int              rc;
         int              counter = 0;
         int              shuttingdown = 0;
index f02cbda..37695c9 100644 (file)
@@ -1187,7 +1187,7 @@ ksocknal_fmb_callback (void *arg, int error)
 {
         ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
         ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
-        ptl_hdr_t         *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page);
+        ptl_hdr_t         *hdr = &fmb->fmb_hdr;
         ksock_conn_t      *conn = NULL;
         ksock_sched_t     *sched;
         unsigned long      flags;
index 4c842a1..15080b0 100644 (file)
@@ -2,7 +2,12 @@ if LIBLUSTRE
 noinst_LIBRARIES = libtcpnal.a
 endif
 
-pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h     \
+       ipmap.h bridge.h procbridge.h
+
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h     \
+       dispatch.h table.h timer.h address.c procapi.c proclib.c        \
+       connection.c tcpnal.c connection.h
+
 libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
 libtcpnal_a_CFLAGS = $(LLCFLAGS)
index 15c1774..851a8e1 100644 (file)
@@ -14,8 +14,10 @@ libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS)
 libuptlctl_a_CFLAGS = $(LLCFLAGS)
 endif
 
+if UTILS
 sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
 lib_LIBRARIES = libptlctl.a
+endif
 
 acceptor_SOURCES = acceptor.c
 
index 489100e..1db0606 100644 (file)
@@ -1274,12 +1274,15 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
 
         LASSERT_SPIN_LOCKED(&imp->imp_lock);
 
+        /* clear this for new requests that were resent as well
+           as resent replayed requests. */
+        lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
         /* don't re-add requests that have been replayed */
         if (!list_empty(&req->rq_replay_list))
                 return;
 
-        lustre_msg_add_flags(req->rq_reqmsg,
-                             MSG_REPLAY);
+        lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
 
         LASSERT(imp->imp_replayable);
         /* Balanced in ptlrpc_free_committed, usually. */
@@ -1591,16 +1594,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         aa->praa_old_state = req->rq_send_state;
         req->rq_send_state = LUSTRE_IMP_REPLAY;
         req->rq_phase = RQ_PHASE_NEW;
-        /*
-         * Q: "How can a req get on the replay list if it wasn't replied?"
-         * A: "If we failed during the replay of this request, it will still
-         *     be on the list, but rq_replied will have been reset to 0."
-         */
-        if (req->rq_replied) {
-                aa->praa_old_status = req->rq_repmsg->status;
-                req->rq_status = 0;
-                req->rq_replied = 0;
-        }
+        aa->praa_old_status = req->rq_repmsg->status;
+        req->rq_status = 0;
 
         req->rq_interpret_reply = ptlrpc_replay_interpret;
         atomic_inc(&req->rq_import->imp_replay_inflight);
index f2d034f..0942192 100644 (file)
@@ -100,6 +100,10 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
         spin_lock_irqsave(&imp->imp_lock, flags);
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
+                CERROR("%s: connection lost to %s@%s\n",
+                       imp->imp_obd->obd_name, 
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
                 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
                 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
@@ -250,7 +254,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
 
         imp->imp_conn_cnt++;
-        imp->imp_last_replay_transno = 0;
+        imp->imp_resend_replay = 0;
 
         if (imp->imp_remote_handle.cookie == 0) {
                 initial_connect = 1;
@@ -386,19 +390,27 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                                request->rq_repmsg->handle.cookie);
                         imp->imp_remote_handle = request->rq_repmsg->handle;
                 } else {
-                        CERROR("reconnected to %s@%s after partition\n",
+                        CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
                                imp->imp_target_uuid.uuid,
                                imp->imp_connection->c_remote_uuid.uuid);
                 }
 
-                if (imp->imp_invalid)
+                if (imp->imp_invalid) {
                         IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
-                else
+                } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+                        CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+                               imp->imp_obd->obd_name, 
+                               imp->imp_target_uuid.uuid);
+                        imp->imp_resend_replay = 1;
+                        IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+                } else {
                         IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+                }
         } 
         else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
                 LASSERT(imp->imp_replayable);
                 imp->imp_remote_handle = request->rq_repmsg->handle;
+                imp->imp_last_replay_transno = 0;
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
         } 
         else {
@@ -440,7 +452,7 @@ finish:
                 if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
                         ptlrpc_deactivate_import(imp);
                 }
-                CDEBUG(D_ERROR, "recovery of %s on %s failed (%d)\n",
+                CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
                        imp->imp_target_uuid.uuid,
                        (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
         }
@@ -453,7 +465,15 @@ static int completed_replay_interpret(struct ptlrpc_request *req,
                                     void * data, int rc)
 {
         atomic_dec(&req->rq_import->imp_replay_inflight);
-        ptlrpc_import_recovery_state_machine(req->rq_import);
+        if (req->rq_status == 0) {
+                ptlrpc_import_recovery_state_machine(req->rq_import);
+        } else {
+                CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+                       "reconnecting\n", 
+                       req->rq_import->imp_obd->obd_name, req->rq_status);
+                ptlrpc_connect_import(req->rq_import, NULL);
+        }
+
         RETURN(0);
 }
 
@@ -534,6 +554,10 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                         GOTO(out, rc);
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                 ptlrpc_activate_import(imp);
+                CERROR("%s: connection restored to %s@%s\n",
+                       imp->imp_obd->obd_name, 
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
         } 
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
index 6c7c9a3..91a9e88 100644 (file)
 #include <linux/obd.h>
 #include "ptlrpc_internal.h"
 
-static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, 
+static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
                          ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
                          struct ptlrpc_connection *conn, int portal, __u64 xid)
 {
         ptl_process_id_t remote_id;
         int              rc;
-        int              rc2;
         ptl_md_t         md;
         char str[PTL_NALFMT_SIZE];
         ENTRY;
@@ -78,15 +77,16 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
                len, portal, xid);
 
-        rc2 = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
+        rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
         if (rc != PTL_OK) {
+                int rc2;
                 /* We're going to get an UNLINK event when I unlink below,
                  * which will complete just like any other failed send, so
                  * I fall through and return success here! */
                 CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
                        remote_id.nid, portal, xid, rc);
                 rc2 = PtlMDUnlink(*mdh);
-                LASSERT (rc2 == PTL_OK);
+                LASSERTF(rc2 == PTL_OK, "rc2 = %d\n", rc2);
         }
 
         RETURN (0);
index 687f588..71cfdfd 100644 (file)
@@ -42,7 +42,7 @@
 #include <linux/kp30.h>
 #include <linux/lustre_net.h>
 
-#ifndef  __CYGWIN__
+#ifdef __KERNEL__
 # include <linux/ctype.h>
 # include <linux/init.h>
 #else
@@ -135,6 +135,13 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc)
                 }
         }
 
+        if (rc == 0) {
+                /* If new requests have been added, make sure to wake up */
+                spin_lock_irqsave(&pc->pc_set->set_new_req_lock, flags);
+                rc = !list_empty(&pc->pc_set->set_new_requests);
+                spin_unlock_irqrestore(&pc->pc_set->set_new_req_lock, flags);
+        }
+
         RETURN(rc);
 }
 
index ece3a47..a86679d 100644 (file)
@@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp)
         LASSERT (obd_lustre_upcall != NULL);
         
         if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
-                CDEBUG(D_ERROR, "%s: starting recovery without upcall\n",
+                CDEBUG(D_HA, "%s: starting recovery without upcall\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_connect_import(imp, NULL);
         } 
         else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
-                CDEBUG(D_ERROR, "%s: recovery diabled\n",
+                CDEBUG(D_HA, "%s: recovery disabled\n",
                         imp->imp_target_uuid.uuid);
         } 
         else {
-                CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n",
+                CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_run_failed_import_upcall(imp);
         }
@@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
 {
         int rc = 0;
         struct list_head *tmp, *pos;
-        struct ptlrpc_request *req;
+        struct ptlrpc_request *req = NULL;
         unsigned long flags;
         __u64 last_transno;
         ENTRY;
@@ -187,16 +187,36 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          */
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+
+                /* If we need to resend the last sent transno (because a
+                   reconnect has occurred), then stop on the matching
+                   req and send it again. If, however, the last sent
+                   transno has been committed then we continue replay
+                   from the next request. */
+                if (imp->imp_resend_replay && 
+                    req->rq_transno == last_transno) {
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                        break;
+                }
+
                 if (req->rq_transno > last_transno) {
-                        rc = ptlrpc_replay_req(req);
-                        if (rc) {
-                                CERROR("recovery replay error %d for req "
-                                       LPD64"\n", rc, req->rq_xid);
-                                RETURN(rc);
-                        }
-                        *inflight = 1;
+                        imp->imp_last_replay_transno = req->rq_transno;
                         break;
                 }
+
+                req = NULL;
+        }
+
+        imp->imp_resend_replay = 0;
+
+        if (req != NULL) {
+                rc = ptlrpc_replay_req(req);
+                if (rc) {
+                        CERROR("recovery replay error %d for req "
+                               LPD64"\n", rc, req->rq_xid);
+                        RETURN(rc);
+                }
+                *inflight = 1;
         }
         RETURN(rc);
 }
@@ -357,13 +377,13 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
         if (rc)
                 RETURN(rc);
 
-        CDEBUG(D_ERROR, "%s: recovery started, waiting\n",
+        CDEBUG(D_HA, "%s: recovery started, waiting\n",
                imp->imp_target_uuid.uuid);
 
         lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
         rc = l_wait_event(imp->imp_recovery_waitq,
                           !ptlrpc_import_in_recovery(imp), &lwi);
-        CDEBUG(D_ERROR, "%s: recovery finished\n",
+        CDEBUG(D_HA, "%s: recovery finished\n",
                imp->imp_target_uuid.uuid);
 
         RETURN(rc);
index 5e57916..fe13cc7 100644 (file)
@@ -4,10 +4,12 @@
 # See the file COPYING in this distribution
 
 EXTRA_DIST = license-status maketags.sh lustre.spec version_tag.pl.in  \
-       $(initd_SCRIPTS) lustre.spec.in lustre-kernel-2.4.spec.in       \
+       lustre lustre.spec.in lustre-kernel-2.4.spec.in \
        lmake linux-merge-config.awk linux-merge-modules.awk            \
        linux-rhconfig.h
 
 initddir = $(sysconfdir)/init.d
+if UTILS
 initd_SCRIPTS = lustre
+endif
 
index 1cd283e..9b934f0 100755 (executable)
@@ -20,6 +20,10 @@ SERIES=
 CONFIG=
 VERSION=
 
+RHBUILD=0
+LINUX26=0
+SUSEBUILD=0
+
 BASE_ARCH=
 BIGMEM_ARCHS=
 BOOT_ARCHS=
@@ -182,12 +186,12 @@ load_target()
 
     CONFIG_FILE="$TOPDIR/lustre/kernel_patches/kernel_configs/$CONFIG"
     [ -r "$CONFIG_FILE" ] || \
-       fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/configs."
+       fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/."
 
     if [ "$EXTRA_VERSION_save" ] ; then
        EXTRA_VERSION="$EXTRA_VERSION_save"
     elif ! (( $RELEASE )) ; then
-       EXTRA_VERSION="${EXTRA_VERSION}-${TAG//_/}.${TIMESTAMP}"
+       EXTRA_VERSION="${EXTRA_VERSION}-${TAG}.${TIMESTAMP}"
     fi
     # EXTRA_VERSION=${EXTRA_VERSION//-/_}
 
@@ -195,7 +199,7 @@ load_target()
 
     BUILD_ARCHS=
     for arch in $(uniqify "$ALL_ARCHS") ; do
-       if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep -s "$arch" ; then
+       if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep "$arch" >/dev/null 2>/dev/null ; then
            BUILD_ARCHS="$BUILD_ARCHS $arch"
        fi
     done
@@ -270,9 +274,11 @@ patch_linux()
     popd >/dev/null
     echo "Full patch has been saved in ${FULL_PATCH##*/}."
     echo "Replacing .config files..."
-    [ -d linux/configs ] || mkdir linux/configs
+    [ -d linux/configs ] || mkdir linux/configs || \
+        fatal 1 "Error creating configs directory."
     rm -f linux/configs/*
-    cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/
+    cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/ || \
+       fatal 1 "Error copying in kernel configs."
 }
 
 pack_linux()
@@ -310,6 +316,8 @@ prep_build()
        -e "s/@SMP_ARCHS@/$SMP_ARCHS/g" \
        -e "s/@UP_ARCHS@/$UP_ARCHS/g" \
        -e "s/@RHBUILD@/$RHBUILD/g" \
+       -e "s/@LINUX26@/$LINUX26/g" \
+       -e "s/@SUSEBUILD@/$SUSEBUILD/g" \
        < $TOPDIR/lustre/scripts/lustre-kernel-2.4.spec.in \
        > lustre-kernel-2.4.spec
     [ -d SRPMS ] || mkdir SRPMS
index f177c17..3ec63bb 100644 (file)
@@ -355,7 +355,10 @@ BuildKernel()
        --kerneldir $RPM_SOURCE_DIR \
        -j $RPM_BUILD_NCPUS \
        --destdir $RPM_BUILD_ROOT \
-       -- @CONFIGURE_FLAGS@
+       -- --enable-modules \
+       --disable-doc --disable-tests \
+       --disable-utils --disable-liblustre \
+       @CONFIGURE_FLAGS@
 }
 
 BuildLustre()
@@ -371,7 +374,10 @@ BuildLustre()
        --kerneldir $RPM_SOURCE_DIR \
        -j $RPM_BUILD_NCPUS \
        --destdir $RPM_BUILD_ROOT \
-       -- @CONFIGURE_FLAGS@
+       -- --enable-utils \
+       --disable-doc --disable-tests \
+       --disable-modules --disable-liblustre \
+       @CONFIGURE_FLAGS@
 }
 
 SaveHeaders()
@@ -401,14 +407,12 @@ BuildKernel jensen
 BuildKernel smp
 %endif
 
-# we want this one last, so that it is the one populating /usr/bin
-%if %{buildup} && %{buildbase}
+%if %{buildup}
 BuildKernel
-%elseif %{buildbase}
-BuildLustre
 %endif
 
 %if %{buildbase}
+BuildLustre
 SaveHeaders
 %endif
 
@@ -520,14 +524,14 @@ if [ -f ../../savedheaders/%{_target_cpu}/up/version.h ] ; then
     HEADER_FILE=../../savedheaders/%{_target_cpu}/up/version.h
 else
     # test build not including uniprocessor, must get info from somewhere
-    HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -1)
+    HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -1)
 fi
 grep -v UTS_RELEASE $HEADER_FILE >> version.h
 rm -rf ../../savedheaders
 } ; popd
 touch $RPM_BUILD_ROOT/boot/kernel.h-%{kversion}
 
-rm -f $RPM_BUILD_ROOT/usr/include/linux
+rm -f $RPM_BUILD_ROOT/usr/include/linux
 
 rm -rf $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/savedheaders
 
@@ -822,7 +826,9 @@ exit 0
 /usr/bin/*
 /usr/lib/lustre/python
 /etc/init.d/lustre
-/usr/include/lustre
+/usr/include/lustre/*
+/usr/include/portals/*
+/usr/include/linux/*
 /lib/lib*.a
 
 #%files -n lustre-doc
index 329ef4c..39ccc41 100644 (file)
@@ -68,12 +68,23 @@ Configures openldap server for LDAP Lustre config database
 %endif
 
 %build
+# Default RPM_BUILD_NCPUS to the /proc/stat CPU count, clamped to 1..8 (1 if unreadable)
+if [ -z "$RPM_BUILD_NCPUS" ] ; then
+    RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat || :)
+    if [ "${RPM_BUILD_NCPUS:-0}" -eq 0 ] ; then
+        RPM_BUILD_NCPUS=1
+    fi
+    if [ "$RPM_BUILD_NCPUS" -gt 8 ] ; then
+        RPM_BUILD_NCPUS=8
+    fi
+fi
+
 rm -rf $RPM_BUILD_ROOT
 
 # Set an explicit path to our Linux tree, if we can.
 cd $RPM_BUILD_DIR/lustre-%{version}
 ./configure --with-linux='%{linuxdir}' %{disable_doc} --disable-liblustre
-make
+make -j $RPM_BUILD_NCPUS -s
 
 %install
 cd $RPM_BUILD_DIR/lustre-%{version}
index 778e8f1..a27f828 100644 (file)
@@ -63,3 +63,4 @@ logs
 ostactive
 ll_dirstripe_verify
 rename_many
+openfilleddirunlink
index 13abda9..1c19ee4 100644 (file)
@@ -3,16 +3,19 @@ AM_CPPFLAGS = $(LLCPPFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
 AM_CFLAGS = $(LLCFLAGS)
 # LDADD = -lldap
 # LDADD := -lreadline -ltermcap # -lefence
-EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \
-       sanity.sh rundbench
-if TESTS
-pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
-pkgexample_SCRIPTS += local.sh echo.sh uml.sh lov.sh
+
+pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
+pkgexample_scripts += local.sh echo.sh uml.sh lov.sh
 noinst_DATA =
 noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh
 noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net
 noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests
 noinst_SCRIPTS += sanity.sh rundbench
+
+EXTRA_DIST = $(pkgexample_scripts) $(noinst_SCRIPTS) $(noinst_DATA) \
+       sanity.sh rundbench
+if TESTS
+pkgexample_SCRIPTS = $(pkgexample_scripts)
 noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay
 noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
 noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime cmknod
index d3f0c6e..fa15cd2 100644 (file)
@@ -6,22 +6,25 @@ EXTRA_OSTS=${EXTRA_OSTS:-mdev7}
 client_HOST=client
 LIVE_CLIENT=${LIVE_CLIENT:-mdev6}
 # This should always be a list, not a regexp
-#FAIL_CLIENTS=${FAIL_CLIENTS:-mdev7}
-FAIL_CLIENTS=${FAIL_CLIENTS:-""}
+FAIL_CLIENTS=${FAIL_CLIENTS:-mdev8}
+#FAIL_CLIENTS=${FAIL_CLIENTS:-""}
 
 NETTYPE=${NETTYPE:-tcp}
 
 TIMEOUT=${TIMEOUT:-30}
-PTLDEBUG=${PTLDEBUG:-0}
-SUBSYSTEM=${SUBSYSTEM:-0}
+PTLDEBUG=${PTLDEBUG:-0x3f0400}
+SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
 MOUNT=${MOUNT:-"/mnt/lustre"}
 UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh}
 
 MDSDEV=${MDSDEV:-/dev/sda1}
 MDSSIZE=${MDSSIZE:-50000}
+MDSJOURNALSIZE=${MDSJOURNALSIZE:-0}
 
 OSTDEV=${OSTDEV:-$TMP/ost%d-`hostname`}
-OSTSIZE=${OSTSIZE:=50000}
+OSTSIZE=${OSTSIZE:=500000}
+OSTJOURNALSIZE=${OSTJOURNALSIZE:-0}
+
 FSTYPE=${FSTYPE:-ext3}
 STRIPE_BYTES=${STRIPE_BYTES:-1048576} 
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
index 14f2207..9af8621 100644 (file)
@@ -25,7 +25,7 @@ OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`}
 OSTSIZE=${OSTSIZE:-50000}
 FSTYPE=${FSTYPE:-ext3}
 TIMEOUT=${TIMEOUT:-20}
-UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
+UPCALL=${UPCALL:-DEFAULT}
 
 STRIPE_BYTES=${STRIPE_BYTES:-65536}
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
index 4212cab..2445e19 100644 (file)
@@ -218,7 +218,7 @@ test_5b() {
        stop_mds || return 2
        stop_ost || return 3
 
-       lsmod | grep -q portals && return 3
+       lsmod | grep -q portals && return 4
        return 0
 
 }
@@ -230,7 +230,7 @@ test_5c() {
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
-       llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT  && exit 1
+       llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT  && return 1
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
@@ -238,12 +238,33 @@ test_5c() {
        stop_mds || return 2
        stop_ost || return 3
 
-       lsmod | grep -q portals && return 3
+       lsmod | grep -q portals && return 4
        return 0
 
 }
 run_test 5c "cleanup after failed mount (bug 2712)"
 
+test_5d() {
+       start_ost
+       start_mds
+       stop_ost --force
+
+       [ -d $MOUNT ] || mkdir -p $MOUNT
+       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+       llmount $mds_HOST://mds_svc/client_facet $MOUNT  || return 1 
+
+       umount $MOUNT || return 2
+       # cleanup client modules
+       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+       
+       stop_mds || return 3
+
+       lsmod | grep -q portals && return 4
+       return 0
+
+}
+run_test 5d "ost down, don't crash during mount attempt"
+
 test_6() {
        setup
        manual_umount_client
index 68d0ff9..9c05b27 100755 (executable)
@@ -12,6 +12,9 @@ init_test_env $@
 
 ALWAYS_EXCEPT="10"
 
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
+
 build_test_filter
 
 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT 
@@ -128,6 +131,8 @@ gen_config() {
 }
 
 setup() {
+    gen_config
+
     rm -rf logs/*
     for i in `seq $NUMOST`; do
        wait_for ost$i
@@ -205,20 +210,17 @@ node_to_ost() {
 
 
 if [ "$ONLY" == "cleanup" ]; then
-    cleanup
+    $CLEANUP
     exit
 fi
 
-if [ -z "$NOSETUP" ]; then
-    gen_config
-    setup
-fi
-
 if [ ! -z "$EVAL" ]; then
     eval "$EVAL"
     exit $?
 fi
 
+$SETUP
+
 if [ "$ONLY" == "setup" ]; then
     exit 0
 fi
@@ -615,4 +617,4 @@ test_10() {
 run_test 10 "Running Availability for 6 hours..."
 
 equals_msg "Done, cleaning up"
-cleanup
+$CLEANUP
index 882c716..8e7ca55 100755 (executable)
@@ -7,7 +7,7 @@ ALWAYS_EXCEPT="20b"
 
 
 LUSTRE=${LUSTRE:-`dirname $0`/..}
-UPCALL=${UPCALL:-$PWD/recovery-small-upcall.sh}
+
 . $LUSTRE/tests/test-framework.sh
 
 init_test_env $@
@@ -342,7 +342,7 @@ test_20a() {        # bug 2983 - ldlm_handle_enqueue cleanup
        mkdir -p $DIR/$tdir
        multiop $DIR/$tdir/${tfile} O_wc &
        MULTI_PID=$!
-       usleep 500
+       sleep 1
        cancel_lru_locks OSC
 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
        do_facet ost sysctl -w lustre.fail_loc=0x80000308
index 9c1f1e1..77e66e7 100755 (executable)
@@ -9,6 +9,9 @@ init_test_env $@
 
 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
 
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
+
 gen_config() {
     rm -f $XMLCONFIG
     add_mds mds --dev $MDSDEV --size $MDSSIZE
@@ -35,8 +38,8 @@ cleanup() {
         fail mds
     fi
 
-    umount $MOUNT2
-    umount $MOUNT
+    umount $MOUNT2 || true
+    umount $MOUNT  || true
     rmmod llite
     stop mds ${FORCE}
     stop ost2 ${FORCE}
@@ -49,25 +52,18 @@ if [ "$ONLY" == "cleanup" ]; then
     exit
 fi
 
-gen_config
-start ost --reformat $OSTLCONFARGS 
-PINGER=`cat /proc/fs/lustre/pinger`
+setup() {
+    gen_config
+    start ost --reformat $OSTLCONFARGS 
+    start ost2 --reformat $OSTLCONFARGS 
+    start mds $MDSLCONFARGS --reformat
+    grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
+    grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
 
-if [ "$PINGER" != "on" ]; then
-    echo "ERROR: Lustre must be built with --enable-pinger for replay-dual"
-    stop mds
-    exit 1
-fi
-
-start ost2 --reformat $OSTLCONFARGS 
-[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-start mds $MDSLCONFARGS --reformat
-grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
-grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
-
-echo $TIMEOUT > /proc/sys/lustre/timeout
-echo $UPCALL > /proc/sys/lustre/upcall
+#    echo $TIMEOUT > /proc/sys/lustre/timeout
+}
 
+$SETUP
 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
 
 test_1() {
@@ -175,7 +171,156 @@ test_6() {
 }
 run_test 6 "open1, open2, unlink |X| close1 [fail mds] close2"
 
+test_8() {
+    replay_barrier mds
+    drop_reint_reply "mcreate $MOUNT1/$tfile"    || return 1
+    fail mds
+    checkstat $MOUNT2/$tfile || return 2
+    rm $MOUNT1/$tfile || return 3
+
+    return 0
+}
+run_test 8 "replay of resent request"
+
+test_9() {
+    replay_barrier mds
+    mcreate $MOUNT1/$tfile-1
+    mcreate $MOUNT2/$tfile-2
+    # drop the first reint reply so that a replayed create has to be resent
+    sysctl -w lustre.fail_loc=0x80000119
+    fail mds
+    sysctl -w lustre.fail_loc=0
+
+    rm $MOUNT1/$tfile-[12] || return 1
+
+    return 0
+}
+run_test 9 "resending a replayed create"
+
+test_10() {
+    mcreate $MOUNT1/$tfile-1
+    replay_barrier mds
+    munlink $MOUNT1/$tfile-1
+    mcreate $MOUNT2/$tfile-2
+    # drop first reint reply
+    sysctl -w lustre.fail_loc=0x80000119
+    fail mds
+    sysctl -w lustre.fail_loc=0
+
+    checkstat $MOUNT1/$tfile-1 && return 1
+    checkstat $MOUNT1/$tfile-2 || return 2
+    rm $MOUNT1/$tfile-2
+
+    return 0
+}
+run_test 10 "resending a replayed unlink"
+
+test_11() {
+    replay_barrier mds
+    mcreate $MOUNT1/$tfile-1
+    mcreate $MOUNT2/$tfile-2
+    mcreate $MOUNT1/$tfile-3
+    mcreate $MOUNT2/$tfile-4
+    mcreate $MOUNT1/$tfile-5
+    # drop all reint replies for a while
+    sysctl -w lustre.fail_loc=0x0119
+    facet_failover mds
+    # sleep for a while, let both clients reconnect and time out
+    sleep $((TIMEOUT * 2))
+    sysctl -w lustre.fail_loc=0
+
+    rm $MOUNT1/$tfile-[1-5] || return 1
+
+    return 0
+}
+run_test 11 "both clients timeout during replay"
+
+test_12() {
+    replay_barrier mds
+
+    multiop $DIR/$tfile mo_c &
+    MULTIPID=$!
+    sleep 5
+
+    # drop first enqueue
+    sysctl -w lustre.fail_loc=0x80000302
+    facet_failover mds
+    df $MOUNT || return 1
+    sysctl -w lustre.fail_loc=0
+
+    ls $DIR/$tfile
+    $CHECKSTAT -t file $DIR/$tfile || return 2
+    kill -USR1 $MULTIPID || return 3
+    wait $MULTIPID || return 4
+    rm $DIR/$tfile
+
+    return 0
+}
+run_test 12 "open resend timeout"
+
+test_13() {
+    multiop $DIR/$tfile mo_c &
+    MULTIPID=$!
+    sleep 5
+
+    replay_barrier mds
+
+    kill -USR1 $MULTIPID || return 3
+    wait $MULTIPID || return 4
+
+    # drop close 
+    sysctl -w lustre.fail_loc=0x80000115
+    facet_failover mds
+    df $MOUNT || return 1
+    sysctl -w lustre.fail_loc=0
+
+    ls $DIR/$tfile
+    $CHECKSTAT -t file $DIR/$tfile || return 2
+    rm $DIR/$tfile
+
+    return 0
+}
+run_test 13 "close resend timeout"
+
+test_14() {
+    replay_barrier mds
+    createmany -o $MOUNT1/$tfile- 25
+    createmany -o $MOUNT2/$tfile-2- 1
+    createmany -o $MOUNT1/$tfile-3- 25
+    umount $MOUNT2
+
+    facet_failover mds
+    # expect failover to fail
+    df $MOUNT && return 1
+
+    # first 25 files should have been
+    # replayed
+    unlinkmany $MOUNT1/$tfile- 25 || return 2
+
+    zconf_mount `hostname` $MOUNT2
+    return 0
+}
+run_test 14 "timeouts waiting for lost client during replay"
+
+test_15() {
+    replay_barrier mds
+    createmany -o $MOUNT1/$tfile- 25
+    createmany -o $MOUNT2/$tfile-2- 1
+    umount $MOUNT2
+
+    facet_failover mds
+    df $MOUNT || return 1
+
+    lctl dk dk 
+    unlinkmany $MOUNT1/$tfile- 25 || return 2
+
+    zconf_mount `hostname` $MOUNT2
+    return 0
+}
+run_test 15 "timeout waiting for lost client during replay, 1 client completes"
+
+
 if [ "$ONLY" != "setup" ]; then
        equals_msg test complete, cleaning up
-       cleanup
+       $CLEANUP
 fi
index 33f9786..327ea0b 100644 (file)
@@ -232,6 +232,11 @@ facet_nid() {
 facet_active() {
     local facet=$1
     local activevar=${facet}active
+
+    if [ -f ./${facet}active ] ; then
+        source ./${facet}active
+    fi
+
     active=${!activevar}
     if [ -z "$active" ] ; then 
        echo -n ${facet}
index e8e522f..c3d9a59 100644 (file)
@@ -1,2 +1,6 @@
+# The scripts are always distributed, but only installed when UTILS is enabled.
+pymod_scripts = __init__.py lustredb.py error.py cmdline.py
+if UTILS
-pymod_SCRIPTS = __init__.py lustredb.py error.py cmdline.py
-EXTRA_DIST = $(pymod_SCRIPTS)
+pymod_SCRIPTS = $(pymod_scripts)
+endif
+EXTRA_DIST = $(pymod_scripts)
index 1f7a8b5..5704e85 100644 (file)
@@ -6,17 +6,17 @@ AM_CFLAGS=$(LLCFLAGS)
 AM_CPPFLAGS=$(LLCPPFLAGS)
 AM_LDFLAGS := -L$(top_builddir)/portals/utils
 
+sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive      \
+       load_ldap.sh lrun lwizard
+bin_scripts = lfind lstripe
+
 if UTILS
 rootsbin_SCRIPTS = mount.lustre
 sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount
-sbin_SCRIPTS = lconf lmc llanalyze llstat.pl llobdstat.pl lactive load_ldap.sh lrun
-sbin_SCRIPTS += lwizard
-bin_SCRIPTS = lfind lstripe
 bin_PROGRAMS = lfs
 lib_LIBRARIES = liblustreapi.a
-if LIBLUSTRE
-sbin_SCRIPTS += lrun
-endif # LIBLUSTRE
+sbin_SCRIPTS = $(sbin_scripts)
+bin_SCRIPTS = $(bin_scripts)
 endif # UTILS
 
 lctl_LDADD := $(LIBREADLINE) -lptlctl
@@ -33,7 +33,7 @@ lfs_SOURCES = lfs.c
 llmount_SOURCES = llmount.c 
 llmount_LDADD = $(LIBREADLINE) -lptlctl
 
-EXTRA_DIST = $(bin_SCRIPTS) $(sbin_SCRIPTS)
+EXTRA_DIST = $(bin_scripts) $(sbin_scripts)
 
 # NOTE: this should only be run on i386.
 newwiretest: wirehdr.c wirecheck