From d183b7635bd978b9ea0151aebb61c4d20448c8db Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 16 Jun 2004 16:50:40 +0000 Subject: [PATCH] Land b1_2_smallfix onto b1_2 (20040616_1009) - allow clients to reconnect during replay (b=1742) - re-awaken ptlrpcd if new requests arrive during check_set (b=3554) - fix cond_resched (b=3554) - only evict unfinished clients after recovery (b=3515) - allow bulk resend, prevent data loss (b=3570) - remove extraneous obd dereference causing LASSERT failure (b=3334) - don't use get_cycles() when creating temp. files on the mds (b=3156) - hold i_sem when setting i_size in ll_extent_lock() (b=3564) - fix ksocknal_fmb_callback() error messages (b=2918) --- ldiskfs/ldiskfs/autoMakefile.am | 10 +- lnet/archdep.m4 | 34 +- lnet/autoMakefile.am | 4 +- lnet/include/.cvsignore | 2 + lnet/include/Makefile.am | 3 + lnet/include/linux/.cvsignore | 2 + lnet/include/linux/Makefile.am | 4 + lnet/include/linux/libcfs.h | 2 +- lnet/include/lnet/.cvsignore | 2 + lnet/include/lnet/Makefile.am | 10 + lnet/include/lnet/types.h | 17 +- lnet/klnds/qswlnd/qswlnd_cb.c | 6 +- lnet/klnds/socklnd/socklnd_cb.c | 2 +- lnet/ulnds/Makefile.am | 9 +- lnet/ulnds/socklnd/Makefile.am | 9 +- lnet/utils/Makefile.am | 2 + lustre/ChangeLog | 9 + lustre/conf/Makefile.am | 5 +- lustre/configure.in | 8 +- lustre/include/linux/Makefile.am | 6 + lustre/include/linux/lustre_compat25.h | 14 +- lustre/include/linux/lustre_export.h | 5 +- lustre/include/linux/lustre_fsfilt.h | 6 +- lustre/include/linux/lustre_import.h | 2 +- lustre/include/linux/lustre_lib.h | 4 - lustre/include/linux/lustre_log.h | 4 +- lustre/include/linux/obd_class.h | 1 + lustre/include/lustre/Makefile.am | 5 +- .../configurable-x86-stack-2.4.21-chaos.patch | 323 +++++ .../patches/ext-2.4-patch-1-chaos.patch | 10 +- .../patches/ext-2.4-patch-1-suse.patch | 10 +- .../kernel_patches/patches/ext-2.4-patch-1.patch | 10 +- .../patches/ext3-ea-in-inode-2.4.21-chaos.patch | 4 +- 
.../patches/ext3-htree-2.4.19-pre1.patch | 2 +- .../patches/ext3-htree-2.4.21-chaos.patch | 2 +- .../patches/ext3-htree-2.4.22-rh.patch | 2 +- .../kernel_patches/patches/ext3-htree-suse.patch | 2 +- lustre/kernel_patches/patches/ext3-htree.patch | 2 +- .../patches/ext3-pdirops-2.4.20-rh.patch | 1248 ++++++++++++++++++++ .../patches/ext3-trusted_ea-2.4.21-chaos.patch | 170 +++ .../kernel_patches/patches/iopen-2.4.19-suse.patch | 137 ++- .../patches/iopen-2.4.21-chaos.patch | 17 +- .../patches/vfs_intent-2.6-suse.patch | 151 ++- lustre/kernel_patches/series/chaos-2.4.21 | 3 + lustre/kernel_patches/series/rh-2.4.20 | 2 +- lustre/kernel_patches/series/suse-2.4.19 | 1 - lustre/kernel_patches/series/vanilla-2.4.20 | 2 - lustre/kernel_patches/targets/rh-2.4.target | 12 +- lustre/ldiskfs/autoMakefile.am | 10 +- lustre/ldlm/ldlm_lib.c | 109 +- lustre/liblustre/rw.c | 2 +- lustre/llite/file.c | 15 +- lustre/lov/lov_obd.c | 26 +- lustre/mds/mds_fs.c | 20 +- lustre/mds/mds_lov.c | 2 +- lustre/obdclass/class_obd.c | 1 + lustre/obdclass/genops.c | 62 +- lustre/obdclass/llog_ioctl.c | 4 +- lustre/obdfilter/filter.c | 13 +- lustre/osc/osc_request.c | 11 - lustre/portals/archdep.m4 | 34 +- lustre/portals/autoMakefile.am | 4 +- lustre/portals/include/.cvsignore | 2 + lustre/portals/include/Makefile.am | 3 + lustre/portals/include/linux/.cvsignore | 2 + lustre/portals/include/linux/Makefile.am | 4 + lustre/portals/include/linux/libcfs.h | 2 +- lustre/portals/include/portals/.cvsignore | 2 + lustre/portals/include/portals/Makefile.am | 10 + lustre/portals/include/portals/types.h | 17 +- lustre/portals/knals/qswnal/qswnal_cb.c | 6 +- lustre/portals/knals/socknal/socknal_cb.c | 2 +- lustre/portals/unals/Makefile.am | 9 +- lustre/portals/utils/Makefile.am | 2 + lustre/ptlrpc/client.c | 19 +- lustre/ptlrpc/import.c | 36 +- lustre/ptlrpc/niobuf.c | 8 +- lustre/ptlrpc/ptlrpcd.c | 9 +- lustre/ptlrpc/recover.c | 46 +- lustre/scripts/Makefile.am | 4 +- lustre/scripts/lbuild | 18 +- 
lustre/scripts/lustre-kernel-2.4.spec.in | 24 +- lustre/scripts/lustre.spec.in | 13 +- lustre/tests/.cvsignore | 1 + lustre/tests/Makefile.am | 13 +- lustre/tests/cfg/insanity-mdev.sh | 13 +- lustre/tests/cfg/local.sh | 2 +- lustre/tests/conf-sanity.sh | 27 +- lustre/tests/insanity.sh | 16 +- lustre/tests/recovery-small.sh | 4 +- lustre/tests/replay-dual.sh | 185 ++- lustre/tests/test-framework.sh | 5 + lustre/utils/Lustre/Makefile.am | 4 +- lustre/utils/Makefile.am | 14 +- 94 files changed, 2678 insertions(+), 454 deletions(-) create mode 100644 lnet/include/Makefile.am create mode 100644 lnet/include/linux/.cvsignore create mode 100644 lnet/include/linux/Makefile.am create mode 100644 lnet/include/lnet/.cvsignore create mode 100644 lnet/include/lnet/Makefile.am create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch create mode 100644 lustre/portals/include/Makefile.am create mode 100644 lustre/portals/include/linux/.cvsignore create mode 100644 lustre/portals/include/linux/Makefile.am create mode 100644 lustre/portals/include/portals/.cvsignore create mode 100644 lustre/portals/include/portals/Makefile.am diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index f81e6e7..eacc902 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -33,10 +33,17 @@ patches := @top_srcdir@/kernel_patches/patches sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) rm -rf linux-stage linux sources $(ldiskfs_SOURCES) mkdir -p linux-stage/fs/ext3 linux-stage/include/linux - cd linux-stage && quilt setup -l ../$(series) -d ../$(patches) cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 cp $(linux_headers) linux-stage/include/linux +if USE_QUILT + cd linux-stage && quilt setup -l ../$(series) -d 
../$(patches) cd linux-stage && quilt push -a -q +else + @cd linux-stage && for i in $$(<../$(series)) ; do \ + echo "patch -p1 < ../$(patches)/$$i" ; \ + patch -p1 < ../$(patches)/$$i || exit 1 ; \ + done +endif mkdir linux @echo -n "Replacing 'ext3' with 'ldiskfs':" @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \ @@ -50,6 +57,7 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) linux-stage/include/linux/ext3$$i \ > linux/ldiskfs$$i ; \ done + @echo touch sources foo-check: diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index 2a42368..c78fc34 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -92,6 +92,7 @@ AC_CHECK_FILE([$LINUX/include/linux/namei.h], [ linux25="yes" KMODEXT=".ko" + enable_ldiskfs="yes" ],[ KMODEXT=".o" linux25="no" @@ -101,6 +102,16 @@ AC_MSG_RESULT([$linux25]) AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) AC_SUBST(KMODEXT) +AC_PATH_PROG(PATCH, patch, [no]) +AC_PATH_PROG(QUILT, quilt, [no]) +AM_CONDITIONAL(USE_QUILT, test x$QUILT = xno) + +if test x$enable_ldiskfs$enable_modules = xyesyes ; then + if test x$PATCH$QUILT = xnono ; then + AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)]) + fi +fi + # ------- Makeflags ------------------ CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" @@ -135,7 +146,7 @@ _ACEOF AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE], [m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko -AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])], +AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile 
LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])], [$4], [_AC_MSG_LOG_CONFTEST m4_ifvaln([$5],[$5])dnl])dnl @@ -446,7 +457,7 @@ LUSTRE_MODULE_TRY_COMPILE( # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS -AC_MSG_CHECKING([for kernel version]) +AC_MSG_CHECKING([if task_struct has a sighand field]) LUSTRE_MODULE_TRY_COMPILE( [ #include @@ -455,9 +466,24 @@ LUSTRE_MODULE_TRY_COMPILE( p.sighand = NULL; ],[ AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches]) - AC_MSG_RESULT([redhat-2.4.20]) + AC_MSG_RESULT([yes]) ],[ - AC_MSG_RESULT([$LINUXRELEASE]) + AC_MSG_RESULT([no]) + ]) + +# ---------- 2.4.20 introduced cond_resched -------------- + +AC_MSG_CHECKING([if kernel offers cond_resched]) +LUSTRE_MODULE_TRY_COMPILE( + [ + #include + ],[ + cond_resched(); + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found]) + ],[ + AC_MSG_RESULT([no]) ]) # ---------- Red Hat 2.4.21 backports some more 2.5 bits -------- diff --git a/lnet/autoMakefile.am b/lnet/autoMakefile.am index bd57e6e..485ff04 100644 --- a/lnet/autoMakefile.am +++ b/lnet/autoMakefile.am @@ -3,6 +3,6 @@ # This code is issued under the GNU General Public License. 
# See the file COPYING in this distribution -EXTRA_DIST = archdep.m4 build.m4 include +EXTRA_DIST = archdep.m4 build.m4 -SUBDIRS = portals libcfs knals unals router tests doc utils +SUBDIRS = portals libcfs knals unals router tests doc utils include diff --git a/lnet/include/.cvsignore b/lnet/include/.cvsignore index d45f796..94d3790 100644 --- a/lnet/include/.cvsignore +++ b/lnet/include/.cvsignore @@ -2,3 +2,5 @@ config.h stamp-h stamp-h1 stamp-h.in +Makefile +Makefile.in diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am new file mode 100644 index 0000000..2b3eb8c --- /dev/null +++ b/lnet/include/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = linux portals + +EXTRA_DIST = cygwin-ioctl.h diff --git a/lnet/include/linux/.cvsignore b/lnet/include/linux/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lnet/include/linux/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am new file mode 100644 index 0000000..3c28c6e8 --- /dev/null +++ b/lnet/include/linux/Makefile.am @@ -0,0 +1,4 @@ +linuxdir = $(includedir)/linux + +EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h \ + portals_lib.h diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h index efdc8fe..6772e82 100644 --- a/lnet/include/linux/libcfs.h +++ b/lnet/include/linux/libcfs.h @@ -2,7 +2,7 @@ * vim:expandtab:shiftwidth=8:tabstop=8: */ #ifndef _LIBCFS_H - +#define _LIBCFS_H #define PORTAL_DEBUG diff --git a/lnet/include/lnet/.cvsignore b/lnet/include/lnet/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lnet/include/lnet/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am new file mode 100644 index 0000000..5ed6090 --- /dev/null +++ b/lnet/include/lnet/Makefile.am @@ -0,0 +1,10 @@ +portalsdir=$(includedir)/portals + +if UTILS +portals_HEADERS = list.h +endif + +EXTRA_DIST = 
api.h api-support.h arg-blocks.h defines.h errno.h \ + internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h \ + list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h \ + socknal.h stringtab.h types.h diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index 74ef493..80995e9 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -1,26 +1,15 @@ #ifndef _P30_TYPES_H_ #define _P30_TYPES_H_ -#ifdef __linux__ -# include -# if defined(__powerpc__) && !defined(__KERNEL__) -# define __KERNEL__ -# include -# undef __KERNEL__ -# else -# include -# endif -#else -# include -typedef u_int32_t __u32; -typedef u_int64_t __u64; -#endif +#include #ifdef __KERNEL__ # include +# include #else # include # define do_gettimeofday(tv) gettimeofday(tv, NULL); +typedef unsigned long long cycles_t; #endif #include diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 6bff730..08453a0 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -585,7 +585,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) /* Don't block for transmit descriptor if we're in interrupt context */ int attr = in_interrupt() ? 
(EP_NO_SLEEP | EP_NO_ALLOC) : 0; int dest = kqswnal_nid2elanid (ktx->ktx_nid); - long flags; + unsigned long flags; int rc; ktx->ktx_launchtime = jiffies; @@ -1429,7 +1429,7 @@ kqswnal_rx (kqswnal_rx_t *krx) void kqswnal_rxhandler(EP_RXD *rxd) { - long flags; + unsigned long flags; int nob = ep_rxd_len (rxd); int status = ep_rxd_status (rxd); kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); @@ -1732,7 +1732,7 @@ kqswnal_scheduler (void *arg) kqswnal_rx_t *krx; kqswnal_tx_t *ktx; kpr_fwd_desc_t *fwd; - long flags; + unsigned long flags; int rc; int counter = 0; int shuttingdown = 0; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index f02cbda..37695c9 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1187,7 +1187,7 @@ ksocknal_fmb_callback (void *arg, int error) { ksock_fmb_t *fmb = (ksock_fmb_t *)arg; ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page); + ptl_hdr_t *hdr = &fmb->fmb_hdr; ksock_conn_t *conn = NULL; ksock_sched_t *sched; unsigned long flags; diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am index 4c842a1..15080b0 100644 --- a/lnet/ulnds/Makefile.am +++ b/lnet/ulnds/Makefile.am @@ -2,7 +2,12 @@ if LIBLUSTRE noinst_LIBRARIES = libtcpnal.a endif -pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h +noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \ + ipmap.h bridge.h procbridge.h + +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \ + dispatch.h table.h timer.h address.c procapi.c proclib.c \ + connection.c tcpnal.c connection.h + libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) libtcpnal_a_CFLAGS = $(LLCFLAGS) diff --git a/lnet/ulnds/socklnd/Makefile.am 
b/lnet/ulnds/socklnd/Makefile.am index 4c842a1..15080b0 100644 --- a/lnet/ulnds/socklnd/Makefile.am +++ b/lnet/ulnds/socklnd/Makefile.am @@ -2,7 +2,12 @@ if LIBLUSTRE noinst_LIBRARIES = libtcpnal.a endif -pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h +noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \ + ipmap.h bridge.h procbridge.h + +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \ + dispatch.h table.h timer.h address.c procapi.c proclib.c \ + connection.c tcpnal.c connection.h + libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) libtcpnal_a_CFLAGS = $(LLCFLAGS) diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am index 15c1774..851a8e1 100644 --- a/lnet/utils/Makefile.am +++ b/lnet/utils/Makefile.am @@ -14,8 +14,10 @@ libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) libuptlctl_a_CFLAGS = $(LLCFLAGS) endif +if UTILS sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid lib_LIBRARIES = libptlctl.a +endif acceptor_SOURCES = acceptor.c diff --git a/lustre/ChangeLog b/lustre/ChangeLog index df7d863..30da8bf 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -5,13 +5,22 @@ tbd Cluster File Systems, Inc. 
- strip trailing '/'s before comparing paths with /proc/mounts (3486) - remove assertions to work around "in-flight rpcs" recovery bug (3063) - change init script to fail more clearly if not run as root (1528) + - allow clients to reconnect during replay (1742) - fix ns_lock/i_sem lock ordering deadlock for kms update (3477) - don't do DNS lookups on NIDs too small for IP addresses (3442) + - re-awaken ptlrpcd if new requests arrive during check_set (3554) + - fix cond_resched (3554) + - only evict unfinished clients after recovery (3515) + - allow bulk resend, prevent data loss (3570) - dynamic ptlrpc request buffer allocation (2102) - don't allow unlinking open directory if it isn't empty (2904) - set MDS/OST threads to umask 0 to not clobber client modes (3359) + - remove extraneous obd dereference causing LASSERT failure (3334) + - don't use get_cycles() when creating temp. files on the mds (3156) + - hold i_sem when setting i_size in ll_extent_lock() (3564) * miscellania - servers can dump a log evicting a client - lustre.dump_on_timeout=1 + - fix ksocknal_fmb_callback() error messages (2918) 2004-05-27 Cluster File Systems, Inc. 
* version 1.2.2 diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am index 6e3666b..627f2a2 100644 --- a/lustre/conf/Makefile.am +++ b/lustre/conf/Makefile.am @@ -6,7 +6,10 @@ EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif ldapconfdir = $(sysconfdir)/openldap ldapschemadir = $(sysconfdir)/openldap/schema +pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)' + +if UTILS ldapconf_SCRIPTS = slapd-lustre.conf ldapschema_SCRIPTS = lustre.schema -pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)' pkglib_DATA = top.ldif lustre2ldif.xsl +endif diff --git a/lustre/configure.in b/lustre/configure.in index 7b14e69..99a1347 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -5,7 +5,7 @@ AC_INIT AC_CANONICAL_SYSTEM -AM_INIT_AUTOMAKE(lustre, 1.2.2.3) +AM_INIT_AUTOMAKE(lustre, 1.2.2.4) # AM_MAINTAINER_MODE # Four main targets: lustre kernel modules, utilities, tests, and liblustre @@ -77,7 +77,6 @@ AC_ARG_ENABLE([ldiskfs], [use ldiskfs for the Lustre backing FS]), [BACKINGFS='ldiskfs'],[enable_ldiskfs='no']) AC_MSG_RESULT([$enable_ldiskfs]) -AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes) AC_MSG_CHECKING([which backing filesystem to use]) AC_MSG_RESULT([$BACKINGFS]) @@ -158,6 +157,8 @@ AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes) sinclude(portals/build.m4) sinclude(portals/archdep.m4) +AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes) + if test x$enable_inkernel = xyes ; then find . 
-name Makefile.mk | sed 's/.mk$//' | xargs -n 1 \ sh -e -x -c '(cp -f $0.mk $0.in)' @@ -217,6 +218,9 @@ portals/Kernelenv portals/Makefile portals/autoMakefile portals/doc/Makefile +portals/include/Makefile +portals/include/linux/Makefile +portals/include/portals/Makefile portals/knals/Makefile portals/knals/autoMakefile portals/knals/gmnal/Makefile diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am index cb75fe5..4c67b12 100644 --- a/lustre/include/linux/Makefile.am +++ b/lustre/include/linux/Makefile.am @@ -3,6 +3,12 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution +linuxdir = $(includedir)/linux + +if UTILS +linux_HEADERS = lustre_idl.h +endif + EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \ lustre_mgmt.h obd_cache.h obd_lov.h lustre_dlm.h lustre_handles.h \ lustre_net.h obd_class.h obd_ost.h obd_support.h lustre_commit_confd.h \ diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 13363bd..b9a295e 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -102,10 +102,6 @@ static inline int cleanup_group_info(void) #define smp_num_cpus NR_CPUS -#ifndef conditional_schedule -#define conditional_schedule() cond_resched() -#endif - #include #else /* 2.4.. */ @@ -183,8 +179,14 @@ static inline int cleanup_group_info(void) return 0; } -#ifndef conditional_schedule -#define conditional_schedule() if (unlikely(need_resched())) schedule() +#ifndef HAVE_COND_RESCHED +static inline void cond_resched(void) +{ + if (unlikely(need_resched())) { + set_current_state(TASK_RUNNING); + schedule(); + } +} #endif /* to find proc_dir_entry from inode. 
2.6 has native one -bzzz */ diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 9be781f..52b5c7a 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -73,8 +73,9 @@ struct obd_export { spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ int exp_flags; - int exp_failed:1; - int exp_libclient:1; /* liblustre client? */ + int exp_failed:1, + exp_replay_needed:1, + exp_libclient:1; /* liblustre client? */ union { struct mds_export_data eu_mds_data; struct filter_export_data eu_filter_data; diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 72f3817..b9beff5 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -132,7 +132,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, void *parent_handle = oti ? oti->oti_handle : NULL; void *handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb, parent_handle, logs); - CDEBUG(D_HA, "started handle %p (%p)\n", handle, parent_handle); + CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle); if (oti != NULL) { if (parent_handle == NULL) { @@ -177,7 +177,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd, unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle); - CDEBUG(D_HA, "committing handle %p (async)\n", *wait_handle); + CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle); if (time_after(jiffies, now + 15 * HZ)) CERROR("long journal start time %lus\n", (jiffies - now) / HZ); @@ -189,7 +189,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd, { unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit_wait(inode, handle); - CDEBUG(D_HA, "waiting for completion %p\n", handle); + CDEBUG(D_INFO, "waiting for completion %p\n", handle); if (time_after(jiffies, now + 15 * HZ)) CERROR("long journal 
start time %lus\n", (jiffies - now) / HZ); return rc; diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h index d2af141..74be113 100644 --- a/lustre/include/linux/lustre_import.h +++ b/lustre/include/linux/lustre_import.h @@ -83,7 +83,7 @@ struct obd_import { int imp_invalid:1, imp_replayable:1, imp_dlm_fake:1, imp_server_timeout:1, imp_initial_recov:1, imp_force_verify:1, - imp_pingable:1; + imp_pingable:1, imp_resend_replay:1; __u32 imp_connect_op; }; diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index a529860..c55e5ff 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -493,13 +493,9 @@ static inline void ost_checksum(obd_count *cksum, void *addr, int len) static inline int ll_insecure_random_int(void) { -#ifdef __arch_um__ struct timeval t; do_gettimeofday(&t); return (int)(t.tv_usec); -#else - return (int)(get_cycles() >> 2); -#endif } /* diff --git a/lustre/include/linux/lustre_log.h b/lustre/include/linux/lustre_log.h index 1d0ff9f..3eb75da 100644 --- a/lustre/include/linux/lustre_log.h +++ b/lustre/include/linux/lustre_log.h @@ -127,8 +127,8 @@ int obd_llog_finish(struct obd_device *obd, int count); /* llog_ioctl.c */ int llog_ioctl(struct llog_ctxt *ctxt, int cmd, struct obd_ioctl_data *data); -int llog_catlog_list(struct obd_device *obd, int count, - struct obd_ioctl_data *data); +int llog_catalog_list(struct obd_device *obd, int count, + struct obd_ioctl_data *data); /* llog_net.c */ int llog_initiator_connect(struct llog_ctxt *ctxt); diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 1a577f0..8f2f9e2 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -137,6 +137,7 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd, struct obd_uuid *cluuid); int class_disconnect(struct obd_export *exp, int failover); void class_disconnect_exports(struct obd_device 
*obddev, int failover); +void class_disconnect_stale_exports(struct obd_device *obddev, int failover); /* generic operations shared by various OBD types */ int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data); int class_multi_cleanup(struct obd_device *obddev); diff --git a/lustre/include/lustre/Makefile.am b/lustre/include/lustre/Makefile.am index a785ada..6faa7cd 100644 --- a/lustre/include/lustre/Makefile.am +++ b/lustre/include/lustre/Makefile.am @@ -3,7 +3,8 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution - +if UTILS pkginclude_HEADERS = lustre_user.h liblustreapi.h +endif -EXTRA_DIST = $(pkginclude_HEADERS) +EXTRA_DIST = lustre_user.h liblustreapi.h diff --git a/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch b/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch new file mode 100644 index 0000000..431bdc7 --- /dev/null +++ b/lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch @@ -0,0 +1,323 @@ +Index: linux-p4smp/arch/i386/kernel/entry.S +=================================================================== +--- linux-p4smp.orig/arch/i386/kernel/entry.S 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/kernel/entry.S 2004-06-14 13:14:19.000000000 -0700 +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + #include + + EBX = 0x00 +@@ -94,10 +95,6 @@ pt_sys_exit = 8 + + ENOSYS = 38 + +-#define GET_CURRENT(reg) \ +- movl $-8192, reg; \ +- andl %esp, reg +- + #if CONFIG_X86_HIGH_ENTRY + + #define call_SYMBOL_NAME_ABS(X) movl $X, %ebp; call *%ebp +@@ -193,7 +190,7 @@ ENOSYS = 38 + GET_CURRENT(%ebx); \ + movl real_stack(%ebx), %edx; \ + movl %esp, %ebx; \ +- andl $0x1fff, %ebx; \ ++ andl $(THREAD_SIZE-1), %ebx; \ + orl %ebx, %edx; \ + movl %edx, %esp; + +@@ -228,7 +225,7 @@ ENOSYS = 38 + return_path_start_marker: \ + nop; \ + movl %esp, %ebx; \ +- andl $0x1fff, %ebx; \ ++ andl $(THREAD_SIZE-1), %ebx; 
\ + orl %ebx, %edx; \ + movl %esp, %eax; \ + movl %edx, %esp; \ +Index: linux-p4smp/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-p4smp.orig/arch/i386/kernel/smpboot.c 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/kernel/smpboot.c 2004-06-14 13:14:19.000000000 -0700 +@@ -814,7 +814,7 @@ static void __init do_boot_cpu (int apic + + /* So we see what's up */ + printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); +- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle); ++ stack_start.esp = (void *)idle->thread.esp; + + /* + * This grunge runs the startup process for +@@ -887,7 +887,7 @@ static void __init do_boot_cpu (int apic + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; +- if (*((volatile unsigned char *)phys_to_virt(8192)) ++ if (*((volatile unsigned char *)phys_to_virt(THREAD_SIZE)) + == 0xA5) + /* trampoline started but...? */ + printk("Stuck ??\n"); +@@ -910,7 +910,7 @@ static void __init do_boot_cpu (int apic + } + + /* mark "stuck" area as not stuck */ +- *((volatile unsigned long *)phys_to_virt(8192)) = 0; ++ *((volatile unsigned long *)phys_to_virt(THREAD_SIZE)) = 0; + + if(clustered_apic_mode == CLUSTERED_APIC_NUMAQ) { + printk("Restoring NMI vector\n"); +Index: linux-p4smp/arch/i386/kernel/traps.c +=================================================================== +--- linux-p4smp.orig/arch/i386/kernel/traps.c 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/kernel/traps.c 2004-06-14 13:14:19.000000000 -0700 +@@ -273,7 +273,7 @@ void show_trace_task(struct task_struct + unsigned long esp = tsk->thread.esp; + + /* User space on another CPU? 
*/ +- if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1)) ++ if ((esp ^ (unsigned long)tsk) & ~(THREAD_SIZE - 1)) + return; + show_trace((unsigned long *)esp); + } +Index: linux-p4smp/arch/i386/kernel/head.S +=================================================================== +--- linux-p4smp.orig/arch/i386/kernel/head.S 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/kernel/head.S 2004-06-14 13:14:19.000000000 -0700 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #define OLD_CL_MAGIC_ADDR 0x90020 + #define OLD_CL_MAGIC 0xA33F +@@ -328,7 +329,7 @@ rp_sidt: + ret + + ENTRY(stack_start) +- .long SYMBOL_NAME(init_task_union)+8192 ++ .long SYMBOL_NAME(init_task_union)+THREAD_SIZE + .long __KERNEL_DS + + /* This is the default interrupt "handler" :-) */ +Index: linux-p4smp/arch/i386/kernel/irq.c +=================================================================== +--- linux-p4smp.orig/arch/i386/kernel/irq.c 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/kernel/irq.c 2004-06-14 13:14:19.000000000 -0700 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + + +@@ -585,7 +586,7 @@ asmlinkage unsigned int do_IRQ(struct pt + long esp; + + /* Debugging check for stack overflow: is there less than 1KB free? */ +- __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191)); ++ __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE-1)); + if (unlikely(esp < (sizeof(struct task_struct) + 1024))) { + extern void show_stack(unsigned long *); + +Index: linux-p4smp/arch/i386/lib/getuser.S +=================================================================== +--- linux-p4smp.orig/arch/i386/lib/getuser.S 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/lib/getuser.S 2004-06-14 13:14:19.000000000 -0700 +@@ -21,6 +21,10 @@ + * as they get called from within inline assembly. 
+ */ + ++/* Duplicated from asm/processor.h */ ++#include ++#include ++ + addr_limit = 12 + + .text +@@ -28,7 +32,7 @@ addr_limit = 12 + .globl __get_user_1 + __get_user_1: + movl %esp,%edx +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 1: movzbl (%eax),%edx +@@ -41,7 +45,7 @@ __get_user_2: + addl $1,%eax + movl %esp,%edx + jc bad_get_user +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 2: movzwl -1(%eax),%edx +@@ -54,7 +58,7 @@ __get_user_4: + addl $3,%eax + movl %esp,%edx + jc bad_get_user +- andl $0xffffe000,%edx ++ andl $~(THREAD_SIZE - 1),%edx + cmpl addr_limit(%edx),%eax + jae bad_get_user + 3: movl -3(%eax),%edx +Index: linux-p4smp/arch/i386/config.in +=================================================================== +--- linux-p4smp.orig/arch/i386/config.in 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/arch/i386/config.in 2004-06-14 13:14:05.000000000 -0700 +@@ -310,6 +310,28 @@ if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86 + define_bool CONFIG_HAVE_DEC_LOCK y + fi + ++choice 'Bigger Stack Size Support' \ ++ "off CONFIG_NOBIGSTACK \ ++ 16KB CONFIG_STACK_SIZE_16KB \ ++ 32KB CONFIG_STACK_SIZE_32KB \ ++ 64KB CONFIG_STACK_SIZE_64KB" off ++ ++if [ "$CONFIG_NOBIGSTACK" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 1 ++else ++ if [ "$CONFIG_STACK_SIZE_16KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 2 ++ else ++ if [ "$CONFIG_STACK_SIZE_32KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 3 ++ else ++ if [ "$CONFIG_STACK_SIZE_64KB" = "y" ]; then ++ define_int CONFIG_STACK_SIZE_SHIFT 4 ++ fi ++ fi ++ fi ++fi ++ + source drivers/perfctr/Config.in + + endmenu +Index: linux-p4smp/include/asm-i386/current.h +=================================================================== +--- linux-p4smp.orig/include/asm-i386/current.h 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/include/asm-i386/current.h 2004-06-14 
13:41:19.000000000 -0700 +@@ -1,15 +1,64 @@ + #ifndef _I386_CURRENT_H + #define _I386_CURRENT_H ++#include ++ ++/* ++ * Configurable page sizes on i386, mainly for debugging purposes. ++ * (c) Balbir Singh ++ */ ++ ++/* enumerate the values, include/asm-i386/hw_irq.h in particular needs this */ ++#if (PAGE_SIZE != 4096) ++#error PAGE_SIZE != 4096 unsupported ++#endif ++ ++#if (CONFIG_STACK_SIZE_SHIFT == 0) ++#define THREAD_SIZE 4096 ++#elif (CONFIG_STACK_SIZE_SHIFT == 1) ++#define THREAD_SIZE 8192 ++#elif (CONFIG_STACK_SIZE_SHIFT == 2) ++#define THREAD_SIZE 16384 ++#elif (CONFIG_STACK_SIZE_SHIFT == 3) ++#define THREAD_SIZE 32768 ++#elif (CONFIG_STACK_SIZE_SHIFT == 4) ++#define THREAD_SIZE 65536 ++#else ++#error CONFIG_STACK_SIZE_SHIFT > 4 unsupported ++#endif ++ ++#if (CONFIG_STACK_SIZE_SHIFT != 1) && defined(CONFIG_X86_4G) ++#error Large stacks with 4G/4G split unsupported ++#endif ++ ++#ifdef __ASSEMBLY__ ++ ++#define GET_CURRENT(reg) \ ++ movl $-THREAD_SIZE, reg; \ ++ andl %esp, reg ++ ++#else /* __ASSEMBLY__ */ ++ ++#define __alloc_task_struct() \ ++ ((struct task_struct *) __get_free_pages(GFP_KERNEL, CONFIG_STACK_SIZE_SHIFT)) ++ ++#define __free_task_struct(p) do { \ ++ BUG_ON((p)->state < TASK_ZOMBIE); \ ++ free_pages((unsigned long) (p), CONFIG_STACK_SIZE_SHIFT); \ ++} while(0) ++ ++#define INIT_TASK_SIZE THREAD_SIZE + + struct task_struct; + + static inline struct task_struct * get_current(void) + { + struct task_struct *current; +- __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL)); ++ __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~(THREAD_SIZE - 1))); + return current; + } + + #define current get_current() + ++#endif /* __ASSEMBLY__ */ ++ + #endif /* !(_I386_CURRENT_H) */ +Index: linux-p4smp/include/asm-i386/hw_irq.h +=================================================================== +--- linux-p4smp.orig/include/asm-i386/hw_irq.h 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/include/asm-i386/hw_irq.h 2004-06-14 13:14:19.000000000 -0700 
+@@ -136,21 +136,17 @@ extern char _stext, _etext; + " \ + /* load the real stack - keep the offset */ \ + \ +- movl $-8192, %ebx; \ ++ movl $- " STR(THREAD_SIZE) ", %ebx; \ + andl %esp, %ebx; \ + movl 36(%ebx), %edx; \ + movl %esp, %ebx; \ +- andl $0x1fff, %ebx; \ ++ andl $( " STR(THREAD_SIZE) "-1), %ebx; \ + orl %ebx, %edx; \ + movl %edx, %esp;" + + #define IRQ_NAME2(nr) nr##_interrupt(void) + #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +-#define GET_CURRENT \ +- "movl %esp, %ebx\n\t" \ +- "andl $-8192, %ebx\n\t" +- + /* + * SMP has a few special interrupts for IPI messages + */ +Index: linux-p4smp/include/asm-i386/processor.h +=================================================================== +--- linux-p4smp.orig/include/asm-i386/processor.h 2004-06-14 13:13:07.000000000 -0700 ++++ linux-p4smp/include/asm-i386/processor.h 2004-06-14 13:14:19.000000000 -0700 +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -498,10 +499,6 @@ unsigned long get_wchan(struct task_stru + #define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019]) + #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022]) + +-#define THREAD_SIZE (2*PAGE_SIZE) +-#define __alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) +-#define __free_task_struct(p) do { BUG_ON((p)->state < TASK_ZOMBIE); free_pages((unsigned long) (p), 1); } while (0) +- + #define init_task (init_task_union.task) + #define init_stack (init_task_union.stack) + diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch index 3de6a8f..f6b2f43 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ 
((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch index 748671f..28a1ad6 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-suse.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the 
root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch index 748671f..28a1ad6 100644 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch @@ -1395,7 +1395,7 @@ + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; @@ -1416,9 +1416,9 @@ + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); -+ to->rec_len = rec_len; ++ to->rec_len = cpu_to_le16(rec_len); + prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); + } + de = next; + } @@ -1642,8 +1642,8 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; -+ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; ++ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch index 031e46d..63684c5 100644 --- 
a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch @@ -21,8 +21,8 @@ Index: linux-2.4.21-chaos/fs/ext3/ialloc.c + EXT3_I(inode)->i_extra_isize = 0; + ei->i_state = EXT3_STATE_NEW; - err = ext3_get_inode_loc_new(inode, &iloc, 1); - if (err) goto fail; + err = ext3_get_inode_loc_new(inode, &iloc, 1); + if (err) goto fail; Index: linux-2.4.21-chaos/fs/ext3/inode.c =================================================================== --- linux-2.4.21-chaos.orig/fs/ext3/inode.c 2003-12-12 17:39:11.000000000 +0300 diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch index c168149..0806c38 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.19-pre1.patch @@ -1667,7 +1667,7 @@ Index: linux-2.4.19-pre1/fs/ext3/namei.c + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch index b045d53..4b445f5 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch @@ -1667,7 +1667,7 @@ Index: linux-2.4.21-chaos/fs/ext3/namei.c + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); 
diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch index 853fb0c..ca2cacf 100644 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch @@ -1657,7 +1657,7 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); diff --git a/lustre/kernel_patches/patches/ext3-htree-suse.patch b/lustre/kernel_patches/patches/ext3-htree-suse.patch index 1278f8f..3e5148e 100644 --- a/lustre/kernel_patches/patches/ext3-htree-suse.patch +++ b/lustre/kernel_patches/patches/ext3-htree-suse.patch @@ -1667,7 +1667,7 @@ Index: linux-2.4.21-suse/fs/ext3/namei.c + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); diff --git a/lustre/kernel_patches/patches/ext3-htree.patch b/lustre/kernel_patches/patches/ext3-htree.patch index 86b0061..31f2ae3 100644 --- a/lustre/kernel_patches/patches/ext3-htree.patch +++ b/lustre/kernel_patches/patches/ext3-htree.patch @@ -1657,7 +1657,7 @@ + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *) &root->dotdot; ++ de = (struct ext3_dir_entry_2 *)&root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); diff --git a/lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch 
b/lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch new file mode 100644 index 0000000..2733e7d --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-pdirops-2.4.20-rh.patch @@ -0,0 +1,1248 @@ + fs/ext3/ialloc.c | 3 + fs/ext3/inode.c | 3 + fs/ext3/namei.c | 582 +++++++++++++++++++++++++++++++++++++--------- + fs/ext3/super.c | 14 + + include/linux/ext3_fs.h | 1 + include/linux/ext3_fs_i.h | 6 + 6 files changed, 500 insertions(+), 109 deletions(-) + +Index: linux-2.4.20/fs/ext3/namei.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/namei.c 2004-05-27 15:10:40.000000000 -0400 ++++ linux-2.4.20/fs/ext3/namei.c 2004-05-27 15:29:52.000000000 -0400 +@@ -51,6 +51,9 @@ + { + struct buffer_head *bh; + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&EXT3_I(inode)->i_append_sem); + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { +@@ -58,6 +61,8 @@ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } ++ up(&EXT3_I(inode)->i_append_sem); ++ + return bh; + } + +@@ -134,6 +139,8 @@ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; ++ unsigned long leaf; ++ unsigned int curidx; + }; + + struct dx_map_entry +@@ -142,6 +149,30 @@ + u32 offs; + }; + ++/* FIXME: this should be reworked using bb_spin_lock ++ * introduced in -mm tree ++ */ ++#define BH_DXLock 25 ++ ++static inline void dx_lock_bh(struct buffer_head volatile *bh) ++{ ++#ifdef CONFIG_SMP ++ while (test_and_set_bit(BH_DXLock, &bh->b_state)) { ++ while (test_bit(BH_DXLock, &bh->b_state)) ++ cpu_relax(); ++ } ++#endif ++} ++ ++static inline void dx_unlock_bh(struct buffer_head *bh) ++{ ++#ifdef CONFIG_SMP ++ smp_mb__before_clear_bit(); ++ clear_bit(BH_DXLock, &bh->b_state); ++#endif ++} ++ ++ + #ifdef CONFIG_EXT3_INDEX + static inline unsigned dx_get_block (struct dx_entry *entry); + 
static void dx_set_block (struct dx_entry *entry, unsigned value); +@@ -153,7 +184,7 @@ + static void dx_set_limit (struct dx_entry *entries, unsigned value); + static unsigned dx_root_limit (struct inode *dir, unsigned infosize); + static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, ++static struct dx_frame *dx_probe(struct qstr *name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +@@ -165,15 +196,18 @@ + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct inode *, struct dx_frame *, u32, u32, u32); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, int *err, + __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err); ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock); + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); ++static inline void *ext3_lock_htree(struct inode *, unsigned long, int); ++static inline void ext3_unlock_htree(struct inode *, void *); + + /* + * Future: use high four bits of block for coalesce-on-delete flags +@@ -306,6 +340,94 @@ + #endif /* DX_DEBUG */ + + /* ++ * dx_find_position ++ * ++ * search position of specified hash in index ++ * ++ */ ++ ++struct dx_entry * dx_find_position(struct dx_entry * entries, u32 hash) ++{ ++ struct dx_entry *p, *q, *m; ++ int count; ++ ++ count = dx_get_count(entries); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ return p - 1; ++} ++ 
++/* ++ * returns 1 if path is unchanged ++ */ ++int dx_check_path(struct dx_frame *frame, u32 hash) ++{ ++ struct dx_entry *p; ++ int ret = 1; ++ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hash); ++ if (frame->leaf != dx_get_block(p)) ++ ret = 0; ++ dx_unlock_bh(frame->bh); ++ ++ return ret; ++} ++ ++/* ++ * 0 - changed ++ * 1 - hasn't changed ++ */ ++static int ++dx_check_full_path(struct dx_frame *frames, struct dx_hash_info *hinfo) ++{ ++ struct dx_entry *p; ++ struct dx_frame *frame = frames; ++ u32 leaf; ++ ++ /* check first level */ ++ dx_lock_bh(frame->bh); ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ /* is there 2nd level? */ ++ frame++; ++ if (frame->bh == NULL) ++ return 1; ++ ++ /* check second level */ ++ dx_lock_bh(frame->bh); ++ ++ /* probably 1st level got changed, check it */ ++ if (!dx_check_path(frames, hinfo->hash)) { ++ /* path changed */ ++ dx_unlock_bh(frame->bh); ++ return 0; ++ } ++ ++ p = dx_find_position(frame->entries, hinfo->hash); ++ leaf = dx_get_block(p); ++ dx_unlock_bh(frame->bh); ++ ++ if (leaf != frame->leaf) ++ return 0; ++ ++ return 1; ++} ++ ++/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +@@ -315,19 +437,20 @@ + * back to userspace. 
+ */ + static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, ++dx_probe(struct qstr *name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + { +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ unsigned indirect; ++ struct dx_entry *at, *entries; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; ++ unsigned int curidx; + + frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; ++ frame[1].bh = NULL; ++ + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; +@@ -343,8 +466,8 @@ + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ if (name) ++ ext3fs_dirhash(name->name, name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { +@@ -356,7 +479,19 @@ + goto fail; + } + ++repeat: ++ curidx = 0; ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ dx_lock_bh(bh); ++ /* indirect must be initialized under bh lock because ++ * 2nd level creation procedure may change it and dx_probe() ++ * will suggest htree is still single-level -bzzz */ + if ((indirect = root->info.indirect_levels) > 1) { ++ dx_unlock_bh(bh); + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); +@@ -364,56 +499,46 @@ + *err = ERR_BAD_DX_DIR; + goto fail; + } +- +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); ++ + while (1) + { +- count = dx_get_count(entries); +- assert (count && 
count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; +- dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; +- else +- p = m + 1; +- } +- +- if (0) // linear search cross check +- { +- unsigned n = count - 1; +- at = entries; +- while (n--) +- { +- dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; +- break; +- } +- } +- assert (at == p - 1); +- } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); ++ at = dx_find_position(entries, hinfo->hash); ++ dxtrace(printk(" %x->%u\n", ++ at == entries? 0: dx_get_hash(at), ++ dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ frame->curidx = curidx; ++ frame->leaf = dx_get_block(at); ++ if (!indirect--) { ++ dx_unlock_bh(bh); ++ return frame; ++ } ++ ++ /* step into next htree level */ ++ curidx = dx_get_block(at); ++ dx_unlock_bh(bh); ++ if (!(bh = ext3_bread (NULL,dir, frame->leaf, 0, err))) + goto fail2; ++ ++ dx_lock_bh(bh); ++ /* splitting may change root index block and move ++ * hash we're looking for into another index block ++ * so, we have to check this situation and repeat ++ * from begining if path got changed -bzzz */ ++ if (!dx_check_path(frame, hash)) { ++ dx_unlock_bh(bh); ++ bh = frame->bh; ++ indirect++; ++ goto repeat; ++ } ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } ++ dx_unlock_bh(bh); + fail2: + while (frame >= frame_in) { + brelse(frame->bh); +@@ -427,8 +552,7 @@ + { + if (frames[0].bh == NULL) + return; +- +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ if (frames[1].bh != NULL) + brelse(frames[1].bh); + brelse(frames[0].bh); + } +@@ -470,8 +594,10 @@ + * nodes need to be read. 
+ */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) { ++ p->leaf = dx_get_block(p->at); + break; ++ } + if (p == frames) + return 0; + num_frames++; +@@ -497,13 +623,17 @@ + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), +- 0, err))) ++ u32 idx; ++ ++ idx = p->leaf = dx_get_block(p->at); ++ if (!(bh = ext3_bread(NULL, dir, idx, 0, err))) + return -1; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->curidx = idx; ++ p->leaf = dx_get_block(p->at); + } + return 1; + } +@@ -543,7 +673,7 @@ + dir = dir_file->f_dentry->d_inode; + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + +@@ -625,7 +755,8 @@ + count++; + } + /* XXX: do we need to check rec_len == 0 case? 
-Chris */ +- de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ de = (struct ext3_dir_entry_2 *)((char*)de + ++ le16_to_cpu(de->rec_len)); + } + return count; + } +@@ -658,7 +789,8 @@ + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct inode *dir, struct dx_frame *frame, ++ u32 hash, u32 block, u32 idx) + { + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; +@@ -670,6 +802,7 @@ + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); ++ + } + #endif + +@@ -752,7 +885,8 @@ + + + static struct buffer_head * ext3_find_entry (struct dentry *dentry, +- struct ext3_dir_entry_2 ** res_dir) ++ struct ext3_dir_entry_2 ** res_dir, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; +@@ -768,6 +902,7 @@ + int namelen; + const u8 *name; + unsigned blocksize; ++ int do_not_use_dx = 0; + + *res_dir = NULL; + sb = dir->i_sb; +@@ -776,9 +911,10 @@ + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { +- bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ bh = ext3_dx_find_entry(dentry, res_dir, &err, rwlock, lock); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -787,8 +923,14 @@ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ do_not_use_dx = 1; + } + #endif ++ *lock = ext3_lock_htree(dir, 0, rwlock); ++ if (is_dx(dir) && !do_not_use_dx) { ++ ext3_unlock_htree(dir, *lock); ++ goto repeat; ++ } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) +@@ -860,12 +1002,17 @@ + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); ++ if (!ret) { ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; ++ } + return ret; + } + + #ifdef CONFIG_EXT3_INDEX + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +- struct ext3_dir_entry_2 **res_dir, int *err) ++ struct ext3_dir_entry_2 **res_dir, int *err, ++ int rwlock, void **lock) + { + struct super_block * sb; + struct dx_hash_info hinfo; +@@ -880,11 +1027,22 @@ + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++repeat: ++ if (!(frame = dx_probe (&dentry->d_name, dir, &hinfo, frames, err))) + return NULL; ++ ++ *lock = ext3_lock_htree(dir, frame->leaf, rwlock); ++ /* while locking leaf we just found may get splitted ++ * so, we need another leaf. 
check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, *lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ + hash = hinfo.hash; + do { +- block = dx_get_block(frame->at); ++ block = frame->leaf; + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -918,6 +1076,8 @@ + *err = -ENOENT; + errout: + dxtrace(printk("%s not found\n", name)); ++ ext3_unlock_htree(dir, *lock); ++ *lock = NULL; + dx_release (frames); + return NULL; + } +@@ -928,6 +1088,7 @@ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ void *lock = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); +@@ -935,10 +1096,11 @@ + if (ext3_check_for_iopen(dir, dentry)) + return NULL; + +- bh = ext3_find_entry(dentry, &de); ++ bh = ext3_find_entry(dentry, &de, 0, &lock); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); ++ ext3_unlock_htree(dir, lock); + brelse (bh); + inode = iget(dir->i_sb, ino); + +@@ -975,7 +1137,8 @@ + unsigned rec_len = 0; + + while (count--) { +- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ struct ext3_dir_entry_2 *de = ++ (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; +@@ -988,7 +1151,8 @@ + + static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) + { +- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ struct ext3_dir_entry_2 *next, *to, *prev; ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; +@@ -1010,7 +1174,8 @@ + + static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct dx_hash_info *hinfo, void 
**target, ++ int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1057,23 +1222,30 @@ + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", +- dx_get_block(frame->at), hash2, split, count-split)); +- ++ frame->leaf, hash2, split, count-split)); ++ + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); +- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf(hinfo,(struct ext3_dir_entry_2*) data2, blocksize, 1)); + + /* Which block gets the new entry? 
*/ ++ *target = NULL; + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; +- } +- dx_insert_block (frame, hash2 + continued, newblock); ++ ++ /* entry will be stored into new block ++ * we have to lock it before add_dirent_to_buf */ ++ *target = ext3_lock_htree(dir, newblock, 1); ++ } ++ dx_lock_bh(frame->bh); ++ dx_insert_block (dir, frame, hash2 + continued, newblock, frame->curidx); ++ dx_unlock_bh(frame->bh); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1147,7 +1319,8 @@ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { +- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ struct ext3_dir_entry_2 *de1 = ++ (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; +@@ -1205,7 +1378,8 @@ + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; +- ++ void *lock, *new_lock; ++ + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1216,7 +1390,6 @@ + } + root = (struct dx_root *) bh->b_data; + +- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); +@@ -1224,6 +1397,8 @@ + } + data1 = bh2->b_data; + ++ lock = ext3_lock_htree(dir, block, 1); ++ + /* The 0th block becomes the root, move the dirents out */ + de = (struct ext3_dir_entry_2 *) &root->dotdot; + de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len); +@@ -1253,13 +1428,25 @@ + frame->entries = entries; + frame->at = entries; + frame->bh = bh; ++ frame->curidx = 0; ++ frame->leaf = 0; ++ frame[1].bh = NULL; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frame, &hinfo, &new_lock, &retval); + dx_release (frames); + if (!(de)) +- return retval; ++ goto cleanup; ++ ++ retval = 
add_dirent_to_buf(handle, dentry, inode, de, bh); ++cleanup: ++ if (new_lock) ++ ext3_unlock_htree(dir, new_lock); ++ /* we mark directory indexed in order to ++ * avoid races while htree being created -bzzz */ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ ext3_unlock_htree(dir, lock); + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ return retval; + } + #endif + +@@ -1288,11 +1475,13 @@ + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; ++ void *lock; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; ++repeat: + #ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); +@@ -1303,36 +1492,53 @@ + ext3_mark_inode_dirty(handle, dir); + } + #endif ++ lock = ext3_lock_htree(dir, 0, 1); ++ if (is_dx(dir)) { ++ /* we got lock for block 0 ++ * probably previous holder of the lock ++ * created htree -bzzz */ ++ ext3_unlock_htree(dir, lock); ++ goto repeat; ++ } ++ + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); +- if(!bh) ++ if(!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); +- if (retval != -ENOSPC) ++ if (retval != -ENOSPC) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + + #ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && +- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) +- return make_indexed_dir(handle, dentry, inode, bh); ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) { ++ retval = make_indexed_dir(handle, dentry, inode, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; ++ } + #endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); +- if (!bh) ++ if (!bh) { ++ ext3_unlock_htree(dir, lock); + return retval; ++ } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = 
cpu_to_le16(rlen = blocksize); + nlen = 0; +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ext3_unlock_htree(dir, lock); ++ return retval; + } + + #ifdef CONFIG_EXT3_INDEX +-/* +- * Returns 0 for success, or a negative error value +- */ + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -1344,15 +1550,28 @@ + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; +- +- frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ int curidx; ++ void *idx_lock, *leaf_lock, *newleaf_lock; ++ ++repeat: ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; +- entries = frame->entries; +- at = frame->at; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ /* we're going to chage leaf, so lock it first */ ++ leaf_lock = ext3_lock_htree(dir, frame->leaf, 1); ++ ++ /* while locking leaf we just found may get splitted ++ * so we need to check this */ ++ if (!dx_check_full_path(frames, &hinfo)) { ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ if (!(bh = ext3_bread(handle,dir, frame->leaf, 0, &err))) { ++ printk("can't ext3_bread(%d) = %d\n", (int) frame->leaf, err); + goto cleanup; ++ } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); +@@ -1365,6 +1584,35 @@ + goto cleanup; + } + ++ /* our leaf has no enough space. hence, we have to ++ * split it. so lock index for this leaf first */ ++ curidx = frame->curidx; ++ idx_lock = ext3_lock_htree(dir, curidx, 1); ++ ++ /* now check did path get changed? 
*/ ++ dx_release(frames); ++ ++ frame = dx_probe(&dentry->d_name, dentry->d_parent->d_inode, ++ &hinfo, frames, &err); ++ if (!frame) { ++ /* FIXME: error handling here */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ return err; ++ } ++ ++ if (frame->curidx != curidx) { ++ /* path has been changed. we have to drop old lock ++ * and repeat */ ++ brelse(bh); ++ ext3_unlock_htree(dir, idx_lock); ++ ext3_unlock_htree(dir, leaf_lock); ++ dx_release(frames); ++ goto repeat; ++ } ++ entries = frame->entries; ++ at = frame->at; ++ + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); +@@ -1376,7 +1624,8 @@ + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; +- ++ void *nb_lock; ++ + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, +@@ -1387,6 +1636,7 @@ + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; ++ nb_lock = ext3_lock_htree(dir, newblock, 1); + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +@@ -1398,27 +1648,73 @@ + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); ++ void *ri_lock; ++ ++ /* we have to protect root htree index against ++ * another dx_add_entry() which would want to ++ * split it too -bzzz */ ++ ri_lock = ext3_lock_htree(dir, 0, 1); ++ ++ /* as root index block blocked we must repeat ++ * searching for current position of our 2nd index -bzzz */ ++ dx_lock_bh(frame->bh); ++ frames->at = dx_find_position(frames->entries, hinfo.hash); ++ dx_unlock_bh(frame->bh); ++ + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); +- +- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); + err = 
ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; +- ++ ++ /* copy index into new one */ + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); +- dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { ++ /* unlock index we won't use */ ++ ext3_unlock_htree(dir, idx_lock); ++ idx_lock = nb_lock; + frame->at = at = at - entries - icount1 + entries2; +- frame->entries = entries = entries2; ++ frame->entries = entries2; ++ frame->curidx = curidx = newblock; + swap(frame->bh, bh2); ++ } else { ++ /* we'll use old index,so new one may be freed */ ++ ext3_unlock_htree(dir, nb_lock); + } +- dx_insert_block (frames + 0, hash2, newblock); ++ ++ /* NOTE: very subtle piece of code ++ * competing dx_probe() may find 2nd level index in root ++ * index, then we insert new index here and set new count ++ * in that 2nd level index. so, dx_probe() may see 2nd ++ * level index w/o hash it looks for. the solution is ++ * to check root index after we locked just founded 2nd ++ * level index -bzzz */ ++ dx_lock_bh(frames[0].bh); ++ dx_insert_block (dir, frames + 0, hash2, newblock, 0); ++ dx_unlock_bh(frames[0].bh); ++ ++ /* now old and new 2nd level index blocks contain ++ * all pointers, so dx_probe() may find it in the both. ++ * it's OK -bzzz */ ++ ++ dx_lock_bh(frame->bh); ++ dx_set_count(entries, icount1); ++ dx_unlock_bh(frame->bh); ++ ++ /* now old 2nd level index block points to first half ++ * of leafs. 
it's important that dx_probe() must ++ * check root index block for changes under ++ * dx_lock_bh(frame->bh) -bzzz */ ++ ++ ext3_unlock_htree(dir, ri_lock); ++ + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1427,38 +1723,61 @@ + goto journal_error; + brelse (bh2); + } else { ++ unsigned long leaf = frame->leaf; ++ + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ ++ dx_lock_bh(frames[0].bh); + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ dx_unlock_bh(frames[0].bh); + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; ++ frame->curidx = newblock; ++ frame->leaf = leaf; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; ++ ++ /* first level index was root. 
it's already initialized */ ++ /* we may unlock it now */ ++ ext3_unlock_htree(dir, idx_lock); ++ ++ /* current index is just created 2nd level index */ ++ curidx = newblock; ++ idx_lock = nb_lock; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frame, &hinfo, &newleaf_lock, &err); + if (!de) + goto cleanup; ++ ++ /* index split */ ++ ext3_unlock_htree(dir, idx_lock); ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ ++ if (newleaf_lock) ++ ext3_unlock_htree(dir, newleaf_lock); ++ + bh = 0; + goto cleanup; + + journal_error: + ext3_std_error(dir->i_sb, err); + cleanup: ++ ext3_unlock_htree(dir, leaf_lock); + if (bh) + brelse(bh); + dx_release(frames); +@@ -1902,6 +2221,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1909,7 +2229,7 @@ + } + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, &de); ++ bh = ext3_find_entry (dentry, &de, 1, &lock); + if (!bh) + goto end_rmdir; + +@@ -1920,14 +2240,19 @@ + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = -ENOTEMPTY; +- if (!empty_dir (inode)) ++ if (!empty_dir (inode)) { ++ ext3_unlock_htree(dir, lock); + goto end_rmdir; ++ } + + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) +@@ -1956,6 +2281,7 @@ + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; ++ void *lock; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) { +@@ -1966,7 +2292,7 @@ + handle->h_sync = 1; + + retval = -ENOENT; +- bh = ext3_find_entry (dentry, 
&de, 1, &lock); + if (!bh) + goto end_unlink; + +@@ -1974,8 +2300,10 @@ + DQUOT_INIT(inode); + + retval = -EIO; +- if (le32_to_cpu(de->inode) != inode->i_ino) ++ if (le32_to_cpu(de->inode) != inode->i_ino) { ++ ext3_unlock_htree(dir, lock); + goto end_unlink; ++ } + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", +@@ -1984,6 +2312,7 @@ + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); ++ ext3_unlock_htree(dir, lock); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2121,6 +2450,7 @@ + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; ++ void *lock1 = NULL, *lock2 = NULL, *lock3 = NULL; + + old_bh = new_bh = dir_bh = NULL; + +@@ -2133,7 +2463,10 @@ + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; + +- old_bh = ext3_find_entry (old_dentry, &old_de); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ down(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); ++ ++ old_bh = ext3_find_entry (old_dentry, &old_de, 1, &lock1 /* FIXME */); + /* + * Check for inode number is _not_ due to possible IO errors. 
+ * We might rmdir the source, keep it as pwd of some process +@@ -2146,7 +2479,7 @@ + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext3_find_entry (new_dentry, &new_de); ++ new_bh = ext3_find_entry (new_dentry, &new_de, 1, &lock2 /* FIXME */); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); +@@ -2213,7 +2546,7 @@ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + +- old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2, 1, &lock3 /* FIXME */); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); +@@ -2256,6 +2589,14 @@ + retval = 0; + + end_rename: ++ if (lock1) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock1); ++ if (lock2) ++ ext3_unlock_htree(new_dentry->d_parent->d_inode, lock2); ++ if (lock3) ++ ext3_unlock_htree(old_dentry->d_parent->d_inode, lock3); ++ if (old_dentry->d_parent == new_dentry->d_parent) ++ up(&EXT3_I(old_dentry->d_parent->d_inode)->i_rename_sem); + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); +@@ -2264,6 +2605,29 @@ + } + + /* ++ * this locking primitives are used to protect parts ++ * of dir's htree. protection unit is block: leaf or index ++ */ ++static inline void *ext3_lock_htree(struct inode *dir, ++ unsigned long value, int rwlock) ++{ ++ void *lock; ++ ++ if (!test_opt(dir->i_sb, PDIROPS)) ++ return NULL; ++ lock = dynlock_lock(&EXT3_I(dir)->i_htree_lock, value, 1, GFP_KERNEL); ++ return lock; ++} ++ ++static inline void ext3_unlock_htree(struct inode *dir, ++ void *lock) ++{ ++ if (!test_opt(dir->i_sb, PDIROPS) || !lock) ++ return; ++ dynlock_unlock(&EXT3_I(dir)->i_htree_lock, lock); ++} ++ ++/* + * directories can handle most operations... 
+ */ + struct inode_operations ext3_dir_inode_operations = { +Index: linux-2.4.20/fs/ext3/super.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/super.c 2004-05-27 15:10:41.000000000 -0400 ++++ linux-2.4.20/fs/ext3/super.c 2004-05-27 15:10:45.000000000 -0400 +@@ -796,6 +796,8 @@ + return 0; + } + } ++ else if (!strcmp (this_char, "pdirops")) ++ set_opt (sbi->s_mount_opt, PDIROPS); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -822,6 +824,9 @@ + if (want_numeric(value, "sb", sb_block)) + return 0; + } ++ else if (!strcmp (this_char, "pdirops")) { ++ set_opt (sbi->s_mount_opt, PDIROPS); ++ } + #ifdef CONFIG_JBD_DEBUG + else if (!strcmp (this_char, "ro-after")) { + unsigned long v; +@@ -985,6 +990,10 @@ + ext3_check_inodes_bitmap (sb); + } + #endif ++#ifdef S_PDIROPS ++ if (test_opt (sb, PDIROPS)) ++ sb->s_flags |= S_PDIROPS; ++#endif + setup_ro_after(sb); + return res; + } +@@ -1484,6 +1493,11 @@ + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ if (test_opt(sb, PDIROPS)) { ++ printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); ++ sb->s_flags |= S_PDIROPS; ++ } ++ + return sb; + + failed_mount3: +Index: linux-2.4.20/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/inode.c 2004-05-27 15:10:41.000000000 -0400 ++++ linux-2.4.20/fs/ext3/inode.c 2004-05-27 15:10:45.000000000 -0400 +@@ -2435,6 +2435,9 @@ + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; +Index: linux-2.4.20/fs/ext3/ialloc.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-05-27 15:10:39.000000000 -0400 ++++ linux-2.4.20/fs/ext3/ialloc.c 2004-05-27 15:10:45.000000000 -0400 +@@ -601,6 +601,9 @@ + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); ++ dynlock_init(&EXT3_I(inode)->i_htree_lock); ++ sema_init(&EXT3_I(inode)->i_rename_sem, 1); ++ sema_init(&EXT3_I(inode)->i_append_sem, 1); + return inode; + + fail: +Index: linux-2.4.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-05-27 15:10:40.000000000 -0400 ++++ linux-2.4.20/include/linux/ext3_fs.h 2004-05-27 15:10:45.000000000 -0400 +@@ -306,6 +306,7 @@ + /* + * Mount flags + */ ++#define EXT3_MOUNT_PDIROPS 0x800000/* Parallel dir operations */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ 
+Index: linux-2.4.20/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs_i.h 2001-11-22 14:46:19.000000000 -0500 ++++ linux-2.4.20/include/linux/ext3_fs_i.h 2004-05-27 15:10:45.000000000 -0400 +@@ -17,6 +17,7 @@ + #define _LINUX_EXT3_FS_I + + #include ++#include + + /* + * second extended file system inode data in memory +@@ -73,6 +74,11 @@ + * by other means, so we have truncate_sem. + */ + struct rw_semaphore truncate_sem; ++ ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ struct semaphore i_rename_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch new file mode 100644 index 0000000..92753de --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-trusted_ea-2.4.21-chaos.patch @@ -0,0 +1,170 @@ + fs/ext3/xattr.c | 12 +++++- + fs/ext3/xattr_trusted.c | 86 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_xattr.h | 6 +++ + 3 files changed, 102 insertions(+), 2 deletions(-) + +Index: linux-p4smp/fs/ext3/Makefile +=================================================================== +--- linux-p4smp.orig/fs/ext3/Makefile 2004-06-14 13:46:11.000000000 -0700 ++++ linux-p4smp/fs/ext3/Makefile 2004-06-14 13:50:46.000000000 -0700 +@@ -12,7 +12,8 @@ O_TARGET := ext3.o + export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ ++ xattr_trusted.o + obj-m := $(O_TARGET) + + export-objs += xattr.o +Index: linux-p4smp/fs/ext3/xattr.c +=================================================================== +--- linux-p4smp.orig/fs/ext3/xattr.c 2004-06-14 13:46:44.000000000 -0700 ++++ 
linux-p4smp/fs/ext3/xattr.c 2004-06-14 13:50:46.000000000 -0700 +@@ -1780,18 +1780,25 @@ static void ext3_xattr_rehash(struct ext + int __init + init_ext3_xattr(void) + { ++ int error; ++ + ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, + sizeof(struct mb_cache_entry) + + sizeof(struct mb_cache_entry_index), 1, 61); + if (!ext3_xattr_cache) + return -ENOMEM; + +- return 0; ++ error = init_ext3_xattr_trusted(); ++ if (error) ++ mb_cache_destroy(ext3_xattr_cache); ++ ++ return error; + } + + void + exit_ext3_xattr(void) + { ++ exit_ext3_xattr_trusted(); + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +@@ -1802,12 +1809,13 @@ exit_ext3_xattr(void) + int __init + init_ext3_xattr(void) + { +- return 0; ++ return init_ext3_xattr_trusted(); + } + + void + exit_ext3_xattr(void) + { ++ exit_ext3_xattr_trusted(); + } + + #endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +Index: linux-p4smp/fs/ext3/xattr_trusted.c +=================================================================== +--- linux-p4smp.orig/fs/ext3/xattr_trusted.c 2004-06-14 13:41:58.000000000 -0700 ++++ linux-p4smp/fs/ext3/xattr_trusted.c 2004-06-14 13:50:46.000000000 -0700 +@@ -0,0 +1,86 @@ ++/* ++ * linux/fs/ext3/xattr_trusted.c ++ * Handler for trusted extended attributes. ++ * ++ * Copyright (C) 2003 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define XATTR_TRUSTED_PREFIX "trusted." 
++ ++static size_t ++ext3_xattr_trusted_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_trusted_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_trusted_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_TRUSTED, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = ext3_xattr_trusted_list, ++ .get = ext3_xattr_trusted_get, ++ .set = ext3_xattr_trusted_set, ++}; ++ ++int __init ++init_ext3_xattr_trusted(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_TRUSTED, ++ &ext3_xattr_trusted_handler); ++} ++ ++void ++exit_ext3_xattr_trusted(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED, ++ &ext3_xattr_trusted_handler); ++} +Index: linux-p4smp/include/linux/ext3_xattr.h +=================================================================== +--- linux-p4smp.orig/include/linux/ext3_xattr.h 2004-06-14 13:41:58.000000000 -0700 ++++ linux-p4smp/include/linux/ext3_xattr.h 2004-06-14 
13:50:46.000000000 -0700 +@@ -93,6 +93,9 @@ extern void ext3_xattr_put_super(struct + extern int init_ext3_xattr(void) __init; + extern void exit_ext3_xattr(void); + ++extern int init_ext3_xattr_trusted(void) __init; ++extern void exit_ext3_xattr_trusted(void); ++ + # else /* CONFIG_EXT3_FS_XATTR */ + # define ext3_setxattr NULL + # define ext3_getxattr NULL diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch b/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch index ad213c9..c0940cf 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch @@ -70,11 +70,11 @@ Index: linux-2.4.19.SuSE/fs/ext3/inode.c if(ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; -Index: linux-2.4.19.SuSE/fs/ext3/iopen.c +Index: lum/fs/ext3/iopen.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003 -+++ linux-2.4.19.SuSE/fs/ext3/iopen.c Sun Nov 16 01:27:31 2003 -@@ -0,0 +1,258 @@ +--- lum.orig/fs/ext3/iopen.c 2004-03-09 16:46:37.000000000 -0700 ++++ lum/fs/ext3/iopen.c 2004-03-09 16:48:03.000000000 -0700 +@@ -0,0 +1,282 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -211,13 +211,24 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c + +/* This function is spliced into ext3_lookup and does the move of a + * disconnected dentry (if it exists) to a connected dentry. -+ * Caller must hold dcache_lock. 
+ */ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); @@ -231,27 +242,40 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c + } + + if (!goal) -+ return NULL; ++ goto do_instantiate; + + /* Move the goal to the de hash queue - like d_move() */ + goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; + list_del_init(&goal->d_hash); + + list_del(&goal->d_child); -+ list_del(&de->d_child); ++ list_del(&dentry->d_child); + + /* Switch the parents and the names.. 
*/ -+ switch_names(goal, de); -+ do_switch(goal->d_parent, de->d_parent); -+ do_switch(goal->d_name.len, de->d_name.len); -+ do_switch(goal->d_name.hash, de->d_name.hash); ++ switch_names(goal, dentry); ++ do_switch(goal->d_parent, dentry->d_parent); ++ do_switch(goal->d_name.len, dentry->d_name.len); ++ do_switch(goal->d_name.hash, dentry->d_name.hash); + + /* And add them back to the (new) parent lists */ + list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(goal, 0); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -333,10 +357,10 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.4.19.SuSE/fs/ext3/iopen.h +Index: lum/fs/ext3/iopen.h =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003 -+++ linux-2.4.19.SuSE/fs/ext3/iopen.h Sun Nov 16 01:27:31 2003 +--- lum.orig/fs/ext3/iopen.h 2004-03-09 16:46:37.000000000 -0700 ++++ lum/fs/ext3/iopen.h 2004-03-09 16:48:03.000000000 -0700 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -351,8 +375,8 @@ Index: linux-2.4.19.SuSE/fs/ext3/iopen.h + +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); +extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *de, -+ struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); Index: linux-2.4.19.SuSE/fs/ext3/namei.c =================================================================== --- 
linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 @@ -366,12 +390,7 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c /* * define how far ahead to read directories while searching them. -@@ -922,10 +922,14 @@ - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -926,6 +927,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -381,36 +400,62 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -943,7 +948,28 @@ +@@ -943,8 +948,8 @@ return ERR_PTR(-EACCES); } } - d_add(dentry, inode); +- return NULL; + -+ /* verify this dentry is really new */ -+ assert(!dentry->d_inode); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ return alternate; ++ return iopen_connect_dentry(dentry, inode, 1); + } + + #define S_SHIFT 12 +@@ -1932,10 +1935,6 @@ + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2086,6 +2085,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } + } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ if (inode) /* d_instantiate */ -+ list_add(&dentry->d_alias, &inode->i_dentry); -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ - return NULL; - } + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2113,7 +2129,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle, dir); + return err; + } Index: linux-2.4.19.SuSE/fs/ext3/super.c =================================================================== --- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003 diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch b/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch index 62bd8e1..3bed805 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch @@ -74,10 +74,17 @@ Index: linux-ia64/fs/ext3/iopen.c =================================================================== --- linux-ia64.orig/fs/ext3/iopen.c 2004-03-17 18:02:08.000000000 -0800 +++ linux-ia64/fs/ext3/iopen.c 2004-03-17 18:10:58.000000000 -0800 -@@ 
-8,3 +8,275 @@ - * This file may be redistributed under the terms of the GNU General - * Public License. - * +@@ -0,0 +1,282 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * + * + * Invariants: + * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias @@ -427,7 +434,7 @@ Index: linux-ia64/fs/ext3/namei.c + if (!err) { + err = ext3_mark_inode_dirty(handle, inode); + if (err == 0) { -+ (void)iopen_connect_dentry(dentry, inode, 0); ++ dput(iopen_connect_dentry(dentry, inode, 0)); + return 0; + } + } diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index 12436a7..ee976f6 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -1,7 +1,7 @@ Index: linux-2.6.5-12.1/fs/exec.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/exec.c 2004-05-25 17:32:14.038494200 +0300 +--- linux-2.6.5-12.1.orig/fs/exec.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/exec.c 2004-06-03 18:31:28.000000000 -0400 @@ -125,9 +125,10 @@ struct nameidata nd; int error; @@ -47,8 +47,8 @@ Index: linux-2.6.5-12.1/fs/exec.c if (err) { Index: linux-2.6.5-12.1/fs/namei.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/namei.c 2004-05-25 17:32:14.040493896 +0300 +--- linux-2.6.5-12.1.orig/fs/namei.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/namei.c 2004-06-03 18:42:17.000000000 -0400 @@ -270,8 +270,19 @@ return 0; } @@ -136,25 +136,20 @@ Index: linux-2.6.5-12.1/fs/namei.c 
dput(next.dentry); mntput(next.mnt); if (err) -@@ -703,14 +749,29 @@ +@@ -703,14 +749,24 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: + nd->flags |= LOOKUP_LAST; + err = revalidate_special(nd); + nd->flags &= ~LOOKUP_LAST; ++ if (!nd->dentry->d_inode) ++ err = -ENOENT; + if (err) -+ break; ++ goto return_err; goto return_reval; } -+ -+ if (err) { -+ if (!nd->dentry->d_inode) -+ err = -ENOENT; -+ -+ goto return_err; -+ } -+ ++ if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { err = nd->dentry->d_op->d_hash(nd->dentry, &this); if (err < 0) @@ -166,7 +161,7 @@ Index: linux-2.6.5-12.1/fs/namei.c if (err) break; follow_mount(&next.mnt, &next.dentry); -@@ -936,7 +997,7 @@ +@@ -936,7 +992,7 @@ } /* SMP-safe */ @@ -175,7 +170,7 @@ Index: linux-2.6.5-12.1/fs/namei.c { unsigned long hash; struct qstr this; -@@ -956,11 +1017,16 @@ +@@ -956,11 +1012,16 @@ } this.hash = end_name_hash(hash); @@ -193,7 +188,7 @@ Index: linux-2.6.5-12.1/fs/namei.c /* * namei() * -@@ -972,7 +1038,8 @@ +@@ -972,7 +1033,8 @@ * that namei follows links, while lnamei does not. * SMP-safe */ @@ -203,12 +198,12 @@ Index: linux-2.6.5-12.1/fs/namei.c { char *tmp = getname(name); int err = PTR_ERR(tmp); -@@ -987,6 +1054,13 @@ +@@ -987,6 +1049,13 @@ return err; } -+int __user_walk(const char __user *name, unsigned flags, -+ struct nameidata *nd, const char **pname) ++int fastcall __user_walk(const char __user *name, unsigned flags, ++ struct nameidata *nd, const char **pname) +{ + intent_init(&nd->intent, IT_LOOKUP); + return __user_walk_it(name, flags, nd, pname); @@ -217,7 +212,7 @@ Index: linux-2.6.5-12.1/fs/namei.c /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1259,8 +1333,8 @@ +@@ -1259,8 +1328,8 @@ acc_mode |= MAY_APPEND; /* Fill in the open() intent data */ @@ -228,7 +223,7 @@ Index: linux-2.6.5-12.1/fs/namei.c /* * The simplest case - just a plain lookup. -@@ -1275,6 +1349,7 @@ +@@ -1275,6 +1344,7 @@ /* * Create - we need to know the parent. 
*/ @@ -236,7 +231,7 @@ Index: linux-2.6.5-12.1/fs/namei.c error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); if (error) return error; -@@ -1291,7 +1366,9 @@ +@@ -1291,7 +1361,9 @@ dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); @@ -246,7 +241,7 @@ Index: linux-2.6.5-12.1/fs/namei.c do_last: error = PTR_ERR(dentry); -@@ -1396,7 +1473,9 @@ +@@ -1396,7 +1468,9 @@ } dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -256,7 +251,7 @@ Index: linux-2.6.5-12.1/fs/namei.c putname(nd->last.name); goto do_last; } -@@ -2196,7 +2275,9 @@ +@@ -2196,7 +2270,9 @@ __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -266,7 +261,7 @@ Index: linux-2.6.5-12.1/fs/namei.c if (IS_ERR(link)) goto fail; -@@ -2206,6 +2287,10 @@ +@@ -2206,6 +2282,10 @@ /* weird __emul_prefix() stuff did it */ goto out; } @@ -279,8 +274,8 @@ Index: linux-2.6.5-12.1/fs/namei.c if (current->link_count || res || nd->last_type!=LAST_NORM) Index: linux-2.6.5-12.1/fs/namespace.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/namespace.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/namespace.c 2004-05-25 17:33:44.385759328 +0300 +--- linux-2.6.5-12.1.orig/fs/namespace.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/namespace.c 2004-06-03 18:31:28.000000000 -0400 @@ -108,6 +108,7 @@ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) @@ -316,8 +311,8 @@ Index: linux-2.6.5-12.1/fs/namespace.c flags &= ~MS_MGC_MSK; Index: linux-2.6.5-12.1/fs/open.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/open.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/open.c 2004-05-25 17:32:14.042493592 +0300 +--- linux-2.6.5-12.1.orig/fs/open.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/open.c 2004-06-03 18:31:28.000000000 -0400 @@ -227,12 +227,12 @@ struct nameidata nd; struct inode * 
inode; @@ -485,8 +480,8 @@ Index: linux-2.6.5-12.1/fs/open.c */ Index: linux-2.6.5-12.1/fs/stat.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/stat.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/stat.c 2004-05-25 17:32:14.042493592 +0300 +--- linux-2.6.5-12.1.orig/fs/stat.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/stat.c 2004-06-03 18:31:28.000000000 -0400 @@ -37,7 +37,7 @@ EXPORT_SYMBOL(generic_fillattr); @@ -563,8 +558,8 @@ Index: linux-2.6.5-12.1/fs/stat.c Index: linux-2.6.5-12.1/fs/nfs/dir.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 19:21:53.000000000 +0300 -+++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-05-25 17:32:14.043493440 +0300 +--- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400 ++++ linux-2.6.5-12.1/fs/nfs/dir.c 2004-06-03 18:31:28.000000000 -0400 @@ -709,7 +709,7 @@ return 0; if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) @@ -585,8 +580,8 @@ Index: linux-2.6.5-12.1/fs/nfs/dir.c * The 0 argument passed into the create function should one day Index: linux-2.6.5-12.1/fs/inode.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/inode.c 2004-05-25 17:32:14.044493288 +0300 +--- linux-2.6.5-12.1.orig/fs/inode.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/inode.c 2004-06-03 18:31:28.000000000 -0400 @@ -221,6 +221,7 @@ inodes_stat.nr_unused--; } @@ -597,8 +592,8 @@ Index: linux-2.6.5-12.1/fs/inode.c * @inode: inode to clear Index: linux-2.6.5-12.1/fs/super.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/super.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/fs/super.c 2004-05-25 17:32:14.045493136 +0300 +--- linux-2.6.5-12.1.orig/fs/super.c 2004-05-10 
12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/fs/super.c 2004-06-03 18:31:28.000000000 -0400 @@ -789,6 +789,8 @@ return (struct vfsmount *)sb; } @@ -608,10 +603,22 @@ Index: linux-2.6.5-12.1/fs/super.c struct vfsmount *kern_mount(struct file_system_type *type) { return do_kern_mount(type->name, 0, type->name, NULL); +Index: linux-2.6.5-12.1/fs/block_dev.c +=================================================================== +--- linux-2.6.5-12.1.orig/fs/block_dev.c 2004-05-10 12:21:55.000000000 -0400 ++++ linux-2.6.5-12.1/fs/block_dev.c 2004-06-03 18:31:28.000000000 -0400 +@@ -834,6 +834,7 @@ + if (!path || !*path) + return ERR_PTR(-EINVAL); + ++ intent_init(&nd.intent, IT_LOOKUP); + error = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (error) + return ERR_PTR(error); Index: linux-2.6.5-12.1/include/linux/dcache.h =================================================================== ---- linux-2.6.5-12.1.orig/include/linux/dcache.h 2004-04-04 06:38:24.000000000 +0300 -+++ linux-2.6.5-12.1/include/linux/dcache.h 2004-05-25 17:32:14.045493136 +0300 +--- linux-2.6.5-12.1.orig/include/linux/dcache.h 2004-04-03 22:38:24.000000000 -0500 ++++ linux-2.6.5-12.1/include/linux/dcache.h 2004-06-03 18:31:28.000000000 -0400 @@ -4,6 +4,7 @@ #ifdef __KERNEL__ @@ -631,8 +638,8 @@ Index: linux-2.6.5-12.1/include/linux/dcache.h int nr_unused; Index: linux-2.6.5-12.1/include/linux/fs.h =================================================================== ---- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/include/linux/fs.h 2004-05-25 17:32:14.046492984 +0300 +--- linux-2.6.5-12.1.orig/include/linux/fs.h 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/include/linux/fs.h 2004-06-03 18:31:28.000000000 -0400 @@ -250,6 +250,8 @@ #define ATTR_ATTR_FLAG 1024 #define ATTR_KILL_SUID 2048 @@ -686,8 +693,8 @@ Index: linux-2.6.5-12.1/include/linux/fs.h Index: linux-2.6.5-12.1/include/linux/namei.h 
=================================================================== ---- linux-2.6.5-12.1.orig/include/linux/namei.h 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/include/linux/namei.h 2004-05-25 17:32:14.047492832 +0300 +--- linux-2.6.5-12.1.orig/include/linux/namei.h 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/include/linux/namei.h 2004-06-03 18:31:28.000000000 -0400 @@ -2,25 +2,55 @@ #define _LINUX_NAMEI_H @@ -783,32 +790,10 @@ Index: linux-2.6.5-12.1/include/linux/namei.h extern int follow_down(struct vfsmount **, struct dentry **); extern int follow_up(struct vfsmount **, struct dentry **); -Index: linux-2.6.5-12.1/kernel/exit.c -=================================================================== ---- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/kernel/exit.c 2004-05-25 17:32:14.047492832 +0300 -@@ -260,6 +260,8 @@ - write_unlock_irq(&tasklist_lock); - } - -+EXPORT_SYMBOL(reparent_to_init); -+ - void __set_special_pids(pid_t session, pid_t pgrp) - { - struct task_struct *curr = current; -@@ -429,6 +431,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ Index: linux-2.6.5-12.1/include/linux/fshooks.h =================================================================== ---- linux-2.6.5-12.1.orig/include/linux/fshooks.h 2004-05-10 19:21:56.000000000 +0300 -+++ linux-2.6.5-12.1/include/linux/fshooks.h 2004-05-25 17:32:14.048492680 +0300 +--- linux-2.6.5-12.1.orig/include/linux/fshooks.h 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/include/linux/fshooks.h 2004-06-03 18:31:28.000000000 -0400 @@ -90,12 +90,18 @@ #define FSHOOK_BEGIN_USER_WALK(type, err, path, flags, nd, field, args...) 
\ @@ -847,15 +832,25 @@ Index: linux-2.6.5-12.1/include/linux/fshooks.h #define FSHOOK_END_USER_WALK(type, err, field) ((void)0);} -Index: linux-2.6.5-12.1/fs/block_dev.c +Index: linux-2.6.5-12.1/kernel/exit.c =================================================================== ---- linux-2.6.5-12.1.orig/fs/block_dev.c 2004-05-10 19:21:55.000000000 +0300 -+++ linux-2.6.5-12.1/fs/block_dev.c 2004-05-25 17:32:39.517620784 +0300 -@@ -834,6 +834,7 @@ - if (!path || !*path) - return ERR_PTR(-EINVAL); +--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400 +@@ -260,6 +260,8 @@ + write_unlock_irq(&tasklist_lock); + } -+ intent_init(&nd.intent, IT_LOOKUP); - error = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (error) - return ERR_PTR(error); ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -429,6 +431,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/series/chaos-2.4.21 b/lustre/kernel_patches/series/chaos-2.4.21 index 0003912..b3e932f 100644 --- a/lustre/kernel_patches/series/chaos-2.4.21 +++ b/lustre/kernel_patches/series/chaos-2.4.21 @@ -1,3 +1,4 @@ +configurable-x86-stack-2.4.21-chaos.patch dev_read_only_2.4.21-chaos.patch exports_2.4.19-suse.patch lustre_version.patch @@ -26,6 +27,7 @@ add_page_private.patch ext3-raw-lookup.patch nfs_export_kernel-2.4.21-chaos.patch ext3-ea-in-inode-2.4.21-chaos.patch +ext3-trusted_ea-2.4.21-chaos.patch listman-2.4.21-chaos.patch gfp_memalloc-2.4.21-chaos.patch ext3-xattr-ptr-arith-fix.patch @@ -33,3 +35,4 @@ kernel_text_address-2.4.18-chaos.patch pagecache-lock-2.4.21-chaos.patch ext3-truncate-buffer-head.patch inode-max-readahead-2.4.24.patch +dcache_refcount_debug.patch diff --git 
a/lustre/kernel_patches/series/rh-2.4.20 b/lustre/kernel_patches/series/rh-2.4.20 index 06b2642..22491a0 100644 --- a/lustre/kernel_patches/series/rh-2.4.20 +++ b/lustre/kernel_patches/series/rh-2.4.20 @@ -28,7 +28,7 @@ ext3-o_direct-1.2.4.20-rh.patch ext3-no-write-super-chaos.patch dynamic-locks-2.4.20-rh.patch vfs-pdirops-2.4.20-rh.patch -ext3-pdirops-2.4.20-chaos.patch +ext3-pdirops-2.4.20-rh.patch tcp_zero_copy_2.4.20_chaos.patch gpl_header-chaos-2.4.20.patch add_page_private.patch diff --git a/lustre/kernel_patches/series/suse-2.4.19 b/lustre/kernel_patches/series/suse-2.4.19 index 9905491..8748256 100644 --- a/lustre/kernel_patches/series/suse-2.4.19 +++ b/lustre/kernel_patches/series/suse-2.4.19 @@ -10,7 +10,6 @@ ext-2.4-patch-1-chaos.patch ext-2.4-patch-2.patch ext-2.4-patch-3.patch ext-2.4-patch-4.patch -linux-2.4.20-xattr-0.8.54-hp.patch linux-2.4.19-xattr-0.8.54-suse.patch ext3-2.4-ino_t.patch ext3-largefile.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20 index ae838ca..d11bec0 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.20 +++ b/lustre/kernel_patches/series/vanilla-2.4.20 @@ -50,7 +50,5 @@ kernel_text_address-2.4.20-vanilla.patch ext3-xattr-ptr-arith-fix.patch gfp_memalloc-2.4.22.patch procfs-ndynamic-2.4.patch -linux-2.4.20-tmpfs-xattr.patch -linux-2.4.20-tmpfs-iopen.patch linux-2.4.20-filemap.patch ext3-truncate-buffer-head.patch diff --git a/lustre/kernel_patches/targets/rh-2.4.target b/lustre/kernel_patches/targets/rh-2.4.target index cca5324..70af4ab 100644 --- a/lustre/kernel_patches/targets/rh-2.4.target +++ b/lustre/kernel_patches/targets/rh-2.4.target @@ -1,7 +1,7 @@ -KERNEL=linux-2.4.20-28.9.tar.gz +KERNEL=linux-2.4.20-31.9.tar.gz SERIES=rh-2.4.20 VERSION=2.4.20 -EXTRA_VERSION=28.9_lustre +EXTRA_VERSION=31.9_lustre.1.2.2 RHBUILD=1 BASE_ARCHS="i586" @@ -11,3 +11,11 @@ JENSEN_ARCHS="" SMP_ARCHS="i586" UP_ARCHS="" SRC_ARCHS="i586" + +# the modules in this kernel do not build 
with gcc 3 +for cc in i386-redhat-linux-gcc-2.96 gcc296 gcc ; do + if which $cc >/dev/null 2>/dev/null ; then + CC=$cc + break + fi +done diff --git a/lustre/ldiskfs/autoMakefile.am b/lustre/ldiskfs/autoMakefile.am index f81e6e7..eacc902 100644 --- a/lustre/ldiskfs/autoMakefile.am +++ b/lustre/ldiskfs/autoMakefile.am @@ -33,10 +33,17 @@ patches := @top_srcdir@/kernel_patches/patches sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) rm -rf linux-stage linux sources $(ldiskfs_SOURCES) mkdir -p linux-stage/fs/ext3 linux-stage/include/linux - cd linux-stage && quilt setup -l ../$(series) -d ../$(patches) cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 cp $(linux_headers) linux-stage/include/linux +if USE_QUILT + cd linux-stage && quilt setup -l ../$(series) -d ../$(patches) cd linux-stage && quilt push -a -q +else + @cd linux-stage && for i in $$(<../$(series)) ; do \ + echo "patch -p1 < ../$(patches)/$$i" ; \ + patch -p1 < ../$(patches)/$$i || exit 1 ; \ + done +endif mkdir linux @echo -n "Replacing 'ext3' with 'ldiskfs':" @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \ @@ -50,6 +57,7 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) linux-stage/include/linux/ext3$$i \ > linux/ldiskfs$$i ; \ done + @echo touch sources foo-check: diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 906090b..cdd3b07 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -481,11 +481,6 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (rc && rc != EALREADY) GOTO(out, rc); - /* XXX track this all the time? 
*/ - if (target->obd_recovering) { - target->obd_connected_clients++; - } - req->rq_repmsg->handle = conn; /* If the client and the server are the same node, we will already @@ -528,6 +523,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) GOTO(out, rc = 0); } + if (target->obd_recovering) + target->obd_connected_clients++; + memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn), sizeof conn); @@ -580,21 +578,37 @@ void target_destroy_export(struct obd_export *exp) * Recovery functions */ -static void abort_delayed_replies(struct obd_device *obd) +static void target_finish_recovery(struct obd_device *obd) { - struct ptlrpc_request *req; struct list_head *tmp, *n; + int rc; + + CWARN("%s: sending delayed replies to recovered clients\n", + obd->obd_name); + + ldlm_reprocess_all_ns(obd->obd_namespace); + + /* when recovery finished, cleanup orphans on mds and ost */ + if (OBT(obd) && OBP(obd, postrecov)) { + rc = OBP(obd, postrecov)(obd); + if (rc >= 0) + CWARN("%s: all clients recovered, %d MDS " + "orphans deleted\n", obd->obd_name, rc); + else + CERROR("postrecov failed %d\n", rc); + } + list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { + struct ptlrpc_request *req; req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "aborted:"); - req->rq_status = -ENOTCONN; - req->rq_type = PTL_RPC_MSG_ERR; + DEBUG_REQ(D_ERROR, req, "delayed:"); ptlrpc_reply(req); class_export_put(req->rq_export); list_del(&req->rq_list); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); OBD_FREE(req, sizeof *req); } + return; } static void abort_recovery_queue(struct obd_device *obd) @@ -625,35 +639,24 @@ static void abort_recovery_queue(struct obd_device *obd) void target_abort_recovery(void *data) { struct obd_device *obd = data; - int rc; - CERROR("disconnecting clients and aborting recovery\n"); spin_lock_bh(&obd->obd_processing_task_lock); if (!obd->obd_recovering) { spin_unlock_bh(&obd->obd_processing_task_lock); EXIT; 
return; } - obd->obd_recovering = obd->obd_abort_recovery = 0; - - wake_up(&obd->obd_next_transno_waitq); target_cancel_recovery_timer(obd); spin_unlock_bh(&obd->obd_processing_task_lock); - class_disconnect_exports(obd, 0); + CERROR("%s: recovery period over; disconnecting unfinished clients.\n", + obd->obd_name); + class_disconnect_stale_exports(obd, 0); + abort_recovery_queue(obd); - /* when recovery was aborted, cleanup orphans on mds and ost */ - if (OBT(obd) && OBP(obd, postrecov)) { - rc = OBP(obd, postrecov)(obd); - if (rc >= 0) - CWARN("Cleanup %d orphans after recovery was aborted\n", rc); - else - CERROR("postrecov failed %d\n", rc); - } + target_finish_recovery(obd); - abort_delayed_replies(obd); - abort_recovery_queue(obd); ptlrpc_run_recovery_over_upcall(obd); } @@ -662,7 +665,8 @@ static void target_recovery_expired(unsigned long castmeharder) struct obd_device *obd = (struct obd_device *)castmeharder; CERROR("recovery timed out, aborting\n"); spin_lock_bh(&obd->obd_processing_task_lock); - obd->obd_abort_recovery = 1; + if (obd->obd_recovering) + obd->obd_abort_recovery = 1; wake_up(&obd->obd_next_transno_waitq); spin_unlock_bh(&obd->obd_processing_task_lock); } @@ -723,6 +727,9 @@ static int check_for_next_transno(struct obd_device *obd) queue_len = obd->obd_requests_queued_for_recovery; next_transno = obd->obd_next_recovery_transno; + CDEBUG(D_HA,"max: %d, connected: %d, completed: %d, queue_len: %d, " + "req_transno: "LPU64", next_transno: "LPU64"\n", + max, connected, completed, queue_len, req_transno, next_transno); if (obd->obd_abort_recovery) { CDEBUG(D_HA, "waking for aborted recovery\n"); wake_up = 1; @@ -836,6 +843,9 @@ int target_queue_recovery_request(struct ptlrpc_request *req, * Also, if this request has a transno less than the one we're waiting * for, we should process it now. It could (and currently always will) * be an open request for a descriptor that was opened some time ago. 
+ * + * Also, a resent, replayed request that has already been + * handled will pass through here and be processed immediately. */ if (obd->obd_processing_task == current->pid || transno < obd->obd_next_recovery_transno) { @@ -847,6 +857,17 @@ int target_queue_recovery_request(struct ptlrpc_request *req, return 1; } + /* A resent, replayed request that is still on the queue; just drop it. + The queued request will handle this. */ + if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) == + (MSG_RESENT | MSG_REPLAY)) { + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + spin_unlock_bh(&obd->obd_processing_task_lock); + OBD_FREE(reqmsg, req->rq_reqlen); + OBD_FREE(saved_req, sizeof *saved_req); + return 0; + } + memcpy(saved_req, req, sizeof *req); memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); req = saved_req; @@ -902,7 +923,6 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) struct ptlrpc_request *saved_req; struct lustre_msg *reqmsg; int recovery_done = 0; - int rc2; LASSERT ((rc == 0) == (req->rq_reply_state != NULL)); @@ -932,39 +952,22 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) list_add(&req->rq_list, &obd->obd_delayed_reply_queue); spin_lock_bh(&obd->obd_processing_task_lock); - --obd->obd_recoverable_clients; + /* only count the first "replay over" request from each + export */ + if (req->rq_export->exp_replay_needed) { + --obd->obd_recoverable_clients; + req->rq_export->exp_replay_needed = 0; + } recovery_done = (obd->obd_recoverable_clients == 0); spin_unlock_bh(&obd->obd_processing_task_lock); if (recovery_done) { - struct list_head *tmp, *n; - ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace); - CWARN("%s: all clients recovered, sending delayed replies\n", - obd->obd_name); spin_lock_bh(&obd->obd_processing_task_lock); - obd->obd_recovering = 0; + obd->obd_recovering = obd->obd_abort_recovery = 0; target_cancel_recovery_timer(obd); 
spin_unlock_bh(&obd->obd_processing_task_lock); - /* when recovery finished, cleanup orphans on mds and ost */ - if (OBT(obd) && OBP(obd, postrecov)) { - rc2 = OBP(obd, postrecov)(obd); - if (rc2 >= 0) - CWARN("%s: all clients recovered, %d MDS " - "orphans deleted\n", obd->obd_name, rc2); - else - CERROR("postrecov failed %d\n", rc2); - } - - list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "delayed:"); - ptlrpc_reply(req); - class_export_put(req->rq_export); - list_del(&req->rq_list); - OBD_FREE(req->rq_reqmsg, req->rq_reqlen); - OBD_FREE(req, sizeof *req); - } + target_finish_recovery(obd); ptlrpc_run_recovery_over_upcall(obd); } else { CWARN("%s: %d recoverable clients remain\n", diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index b55e91f..bacf759 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -193,7 +193,7 @@ int llu_glimpse_size(struct inode *inode) rc = obd_enqueue(sbi->ll_osc_exp, lli->lli_smd, LDLM_EXTENT, &policy, LCK_PR, &flags, llu_extent_lock_callback, ldlm_completion_ast, llu_glimpse_callback, inode, - sizeof(*lvb), lustre_swab_ost_lvb, &lockh); + sizeof(struct ost_lvb), lustre_swab_ost_lvb, &lockh); if (rc > 0) RETURN(-EIO); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index ae8034a..4918f98 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -384,7 +384,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, break; } - conditional_schedule(); + cond_resched(); page = find_get_page(inode->i_mapping, i); if (page == NULL) @@ -658,8 +658,19 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, rc = -EIO; if (policy->l_extent.start == 0 && - policy->l_extent.end == OBD_OBJECT_EOF) + policy->l_extent.end == OBD_OBJECT_EOF) { + /* vmtruncate()->ll_truncate() first sets the i_size and then + * the kms under both a DLM lock and the i_sem. 
If we don't + * get the i_sem here we can match the DLM lock and reset + * i_size from the kms before the truncating path has updated + * the kms. generic_file_write can then trust the stale i_size + * when doing appending writes and effectively cancel the + * result of the truncate. Getting the i_sem after the enqueue + * maintains the DLM -> i_sem acquiry order. */ + down(&inode->i_sem); inode->i_size = lov_merge_size(lsm, 1); + up(&inode->i_sem); + } //inode->i_mtime = lov_merge_mtime(lsm, inode->i_mtime); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 526eeb3..5bec189 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -274,7 +274,6 @@ static int lov_disconnect(struct obd_export *exp, int flags) static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, int activate) { - struct obd_device *obd; struct lov_tgt_desc *tgt; int i, rc = 0; ENTRY; @@ -293,24 +292,14 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, if (i == lov->desc.ld_tgt_count) GOTO(out, rc = -EINVAL); - obd = class_exp2obd(tgt->ltd_exp); - if (obd == NULL) { - /* This can happen if OST failure races with node shutdown */ - GOTO(out, rc = -ENOTCONN); - } - - CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n", - obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, - obd->obd_type->typ_name, i); - LASSERT(strcmp(obd->obd_type->typ_name, "osc") == 0); - if (tgt->active == activate) { - CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + CDEBUG(D_INFO, "OSC %s already %sactive!\n", uuid->uuid, activate ? "" : "in"); GOTO(out, rc); } - CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in"); + CDEBUG(D_INFO, "Marking OSC %s %sactive\n", uuid->uuid, + activate ? 
"" : "in"); tgt->active = activate; if (activate) @@ -2071,13 +2060,13 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, if (tmp > lock->l_policy_data.l_extent.end) tmp = lock->l_policy_data.l_extent.end + 1; if (tmp >= loi->loi_kms) { - CDEBUG(D_INODE, "lock acquired, setting rss=" + CDEBUG(D_DLMTRACE, "lock acquired, setting rss=" LPU64", kms="LPU64"\n", loi->loi_rss, tmp); loi->loi_kms = tmp; loi->loi_kms_valid = 1; } else { - CDEBUG(D_INODE, "lock acquired, setting rss=" + CDEBUG(D_DLMTRACE, "lock acquired, setting rss=" LPU64"; leaving kms="LPU64", end="LPU64 "\n", loi->loi_rss, loi->loi_kms, lock->l_policy_data.l_extent.end); @@ -2089,8 +2078,9 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, memset(lov_lockhp, 0, sizeof(*lov_lockhp)); loi->loi_rss = submd->lsm_oinfo->loi_rss; loi->loi_blocks = submd->lsm_oinfo->loi_blocks; - CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" - " kms="LPU64"\n", loi->loi_rss, loi->loi_kms); + CDEBUG(D_DLMTRACE, "glimpsed, setting rss="LPU64 + "; leaving kms="LPU64"\n", loi->loi_rss, + loi->loi_kms); } else { memset(lov_lockhp, 0, sizeof(*lov_lockhp)); if (lov->tgts[loi->loi_ost_idx].active) { diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 0c74ec0..5505329 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -182,7 +182,7 @@ static int mds_server_free_data(struct mds_obd *mds) return 0; } -static int mds_read_last_rcvd(struct obd_device *obd, struct file *file) +static int mds_init_server_data(struct obd_device *obd, struct file *file) { struct mds_obd *mds = &obd->u.mds; struct mds_server_data *msd; @@ -326,6 +326,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file) spin_lock_init(&med->med_open_lock); mcd = NULL; + exp->exp_replay_needed = 1; obd->obd_recoverable_clients++; obd->obd_max_recoverable_clients++; class_export_put(exp); @@ -337,7 +338,11 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file 
*file) mds->mds_last_transno = last_transno; } + if (mcd) + OBD_FREE(mcd, sizeof(*mcd)); + obd->obd_last_committed = mds->mds_last_transno; + if (obd->obd_recoverable_clients) { CWARN("RECOVERY: service %s, %d recoverable clients, " "last_transno "LPU64"\n", obd->obd_name, @@ -346,16 +351,15 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file) obd->obd_recovering = 1; } - if (mcd) - OBD_FREE(mcd, sizeof(*mcd)); - mds->mds_mount_count = mount_count + 1; msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count); /* save it, so mount count and last_transno is current */ rc = mds_update_server_data(obd, 1); + if (rc) + GOTO(err_client, rc); - RETURN(rc); + RETURN(0); err_client: class_disconnect_exports(obd, 0); @@ -455,7 +459,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt) GOTO(err_last_rcvd, rc = -ENOENT); } - rc = mds_read_last_rcvd(obd, file); + rc = mds_init_server_data(obd, file); if (rc) { CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc); GOTO(err_last_rcvd, rc); @@ -562,8 +566,8 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, ENTRY; push_ctxt(&saved, &exp->exp_obd->obd_ctxt, NULL); - - sprintf(fidname, "OBJECTS/%u", tmpname); + + sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid); filp = filp_open(fidname, O_CREAT | O_EXCL, 0644); if (IS_ERR(filp)) { rc = PTR_ERR(filp); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index d93ce0e..f0bf35b 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -435,7 +435,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, case OBD_IOC_CATLOGLIST: { int count = mds->mds_lov_desc.ld_tgt_count; - rc = llog_catlog_list(obd, count, data); + rc = llog_catalog_list(obd, count, data); RETURN(rc); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index ee096ac..2952fce 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -418,6 +418,7 @@ EXPORT_SYMBOL(class_exp2cliimp); 
EXPORT_SYMBOL(class_conn2cliimp); EXPORT_SYMBOL(class_disconnect); EXPORT_SYMBOL(class_disconnect_exports); +EXPORT_SYMBOL(class_disconnect_stale_exports); EXPORT_SYMBOL(oig_init); EXPORT_SYMBOL(oig_release); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index a8db9cb..0429ceb 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -603,24 +603,17 @@ int class_disconnect(struct obd_export *export, int flags) RETURN(0); } -void class_disconnect_exports(struct obd_device *obd, int flags) +static void class_disconnect_export_list(struct list_head *list, int flags) { int rc; - struct list_head *tmp, *n, work_list; struct lustre_handle fake_conn; struct obd_export *fake_exp, *exp; ENTRY; - /* Move all of the exports from obd_exports to a work list, en masse. */ - spin_lock(&obd->obd_dev_lock); - list_add(&work_list, &obd->obd_exports); - list_del_init(&obd->obd_exports); - spin_unlock(&obd->obd_dev_lock); - - CDEBUG(D_HA, "OBD device %d (%p) has exports, " - "disconnecting them\n", obd->obd_minor, obd); - list_for_each_safe(tmp, n, &work_list) { - exp = list_entry(tmp, struct obd_export, exp_obd_chain); + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while(!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, exp_obd_chain); class_export_get(exp); if (obd_uuid_equals(&exp->exp_client_uuid, @@ -653,6 +646,51 @@ void class_disconnect_exports(struct obd_device *obd, int flags) EXIT; } +void class_disconnect_exports(struct obd_device *obd, int flags) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. 
*/ + spin_lock(&obd->obd_dev_lock); + list_add(&work_list, &obd->obd_exports); + list_del_init(&obd->obd_exports); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, flags); + EXIT; +} + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, int flags) +{ + struct list_head work_list; + struct list_head *pos, *n; + struct obd_export *exp; + int cnt = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_safe(pos, n, &obd->obd_exports) { + exp = list_entry(pos, struct obd_export, exp_obd_chain); + if (exp->exp_replay_needed) { + list_del(&exp->exp_obd_chain); + list_add(&exp->exp_obd_chain, &work_list); + cnt++; + } + } + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", + obd->obd_name, cnt); + class_disconnect_export_list(&work_list, flags); + EXIT; +} + int oig_init(struct obd_io_group **oig_out) { struct obd_io_group *oig; diff --git a/lustre/obdclass/llog_ioctl.c b/lustre/obdclass/llog_ioctl.c index 6c060e7..6c53036 100644 --- a/lustre/obdclass/llog_ioctl.c +++ b/lustre/obdclass/llog_ioctl.c @@ -377,7 +377,7 @@ out: } EXPORT_SYMBOL(llog_ioctl); -int llog_catlog_list(struct obd_device *obd, int count, +int llog_catalog_list(struct obd_device *obd, int count, struct obd_ioctl_data *data) { int size, i; @@ -418,4 +418,4 @@ int llog_catlog_list(struct obd_device *obd, int count, RETURN(0); } -EXPORT_SYMBOL(llog_catlog_list); +EXPORT_SYMBOL(llog_catalog_list); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index cf4797b..dd4e563 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -477,6 +477,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) spin_lock_init(&fed->fed_lock); fcd = NULL; + exp->exp_replay_needed = 1; 
obd->obd_recoverable_clients++; class_export_put(exp); @@ -488,6 +489,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) } + if (fcd) + OBD_FREE(fcd, sizeof(*fcd)); + obd->obd_last_committed = le64_to_cpu(fsd->fsd_last_transno); if (obd->obd_recoverable_clients) { @@ -498,17 +502,16 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) obd->obd_recovering = 1; } - if (fcd) - OBD_FREE(fcd, sizeof(*fcd)); - out: filter->fo_mount_count = mount_count + 1; fsd->fsd_mount_count = cpu_to_le64(filter->fo_mount_count); /* save it, so mount count and last_transno is current */ rc = filter_update_server_data(obd, filp, filter->fo_fsd, 1); + if (rc) + GOTO(err_client, rc); - RETURN(rc); + RETURN(0); err_client: class_disconnect_exports(obd, 0); @@ -2336,7 +2339,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp, } case OBD_IOC_CATLOGLIST: { - rc = llog_catlog_list(obd, 1, data); + rc = llog_catalog_list(obd, 1, data); RETURN(rc); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index ded86b3..da09be4 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -779,7 +779,6 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf))); osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? 
requested_nob:0); spin_lock_irqsave(&req->rq_lock, flags); - req->rq_no_resend = 1; spin_unlock_irqrestore(&req->rq_lock, flags); /* size[0] still sizeof (*body) */ @@ -901,8 +900,6 @@ restart_bulk: rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm, page_count, pga, &requested_nob, &niocount, &request); - /* NB ^ sets rq_no_resend */ - if (rc != 0) return (rc); @@ -931,13 +928,6 @@ static int brw_interpret(struct ptlrpc_request *request, struct brw_page *pga = aa->aa_pga; ENTRY; - /* XXX bug 937 here */ - if (rc == -ETIMEDOUT && request->rq_resend) { - DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); - LBUG(); /* re-send. later. */ - //goto restart_bulk; - } - rc = osc_brw_fini_request(request, oa, requested_nob, niocount, page_count, pga, rc); RETURN (rc); @@ -957,7 +947,6 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm, page_count, pga, &requested_nob, &nio_count, &request); - /* NB ^ sets rq_no_resend */ if (rc == 0) { LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args)); diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index 2a42368..c78fc34 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -92,6 +92,7 @@ AC_CHECK_FILE([$LINUX/include/linux/namei.h], [ linux25="yes" KMODEXT=".ko" + enable_ldiskfs="yes" ],[ KMODEXT=".o" linux25="no" @@ -101,6 +102,16 @@ AC_MSG_RESULT([$linux25]) AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) AC_SUBST(KMODEXT) +AC_PATH_PROG(PATCH, patch, [no]) +AC_PATH_PROG(QUILT, quilt, [no]) +AM_CONDITIONAL(USE_QUILT, test x$QUILT = xno) + +if test x$enable_ldiskfs$enable_modules = xyesyes ; then + if test x$PATCH$QUILT = xnono ; then + AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)]) + fi +fi + # ------- Makeflags ------------------ CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" @@ -135,7 +146,7 @@ 
_ACEOF AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE], [m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko -AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="$EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])], +AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])], [$4], [_AC_MSG_LOG_CONFTEST m4_ifvaln([$5],[$5])dnl])dnl @@ -446,7 +457,7 @@ LUSTRE_MODULE_TRY_COMPILE( # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS -AC_MSG_CHECKING([for kernel version]) +AC_MSG_CHECKING([if task_struct has a sighand field]) LUSTRE_MODULE_TRY_COMPILE( [ #include @@ -455,9 +466,24 @@ LUSTRE_MODULE_TRY_COMPILE( p.sighand = NULL; ],[ AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches]) - AC_MSG_RESULT([redhat-2.4.20]) + AC_MSG_RESULT([yes]) ],[ - AC_MSG_RESULT([$LINUXRELEASE]) + AC_MSG_RESULT([no]) + ]) + +# ---------- 2.4.20 introduced cond_resched -------------- + +AC_MSG_CHECKING([if kernel offers cond_resched]) +LUSTRE_MODULE_TRY_COMPILE( + [ + #include + ],[ + cond_resched(); + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found]) + ],[ + AC_MSG_RESULT([no]) ]) # ---------- Red Hat 2.4.21 backports some more 2.5 bits -------- diff --git a/lustre/portals/autoMakefile.am b/lustre/portals/autoMakefile.am index bd57e6e..485ff04 100644 --- a/lustre/portals/autoMakefile.am +++ b/lustre/portals/autoMakefile.am @@ -3,6 +3,6 @@ # 
This code is issued under the GNU General Public License. # See the file COPYING in this distribution -EXTRA_DIST = archdep.m4 build.m4 include +EXTRA_DIST = archdep.m4 build.m4 -SUBDIRS = portals libcfs knals unals router tests doc utils +SUBDIRS = portals libcfs knals unals router tests doc utils include diff --git a/lustre/portals/include/.cvsignore b/lustre/portals/include/.cvsignore index d45f796..94d3790 100644 --- a/lustre/portals/include/.cvsignore +++ b/lustre/portals/include/.cvsignore @@ -2,3 +2,5 @@ config.h stamp-h stamp-h1 stamp-h.in +Makefile +Makefile.in diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am new file mode 100644 index 0000000..2b3eb8c --- /dev/null +++ b/lustre/portals/include/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = linux portals + +EXTRA_DIST = cygwin-ioctl.h diff --git a/lustre/portals/include/linux/.cvsignore b/lustre/portals/include/linux/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/portals/include/linux/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am new file mode 100644 index 0000000..3c28c6e8 --- /dev/null +++ b/lustre/portals/include/linux/Makefile.am @@ -0,0 +1,4 @@ +linuxdir = $(includedir)/linux + +EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h \ + portals_lib.h diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h index efdc8fe..6772e82 100644 --- a/lustre/portals/include/linux/libcfs.h +++ b/lustre/portals/include/linux/libcfs.h @@ -2,7 +2,7 @@ * vim:expandtab:shiftwidth=8:tabstop=8: */ #ifndef _LIBCFS_H - +#define _LIBCFS_H #define PORTAL_DEBUG diff --git a/lustre/portals/include/portals/.cvsignore b/lustre/portals/include/portals/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/portals/include/portals/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git 
a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am new file mode 100644 index 0000000..5ed6090 --- /dev/null +++ b/lustre/portals/include/portals/Makefile.am @@ -0,0 +1,10 @@ +portalsdir=$(includedir)/portals + +if UTILS +portals_HEADERS = list.h +endif + +EXTRA_DIST = api.h api-support.h arg-blocks.h defines.h errno.h \ + internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h \ + list.h lltrace.h myrnal.h nal.h nalids.h p30.h ppid.h ptlctl.h \ + socknal.h stringtab.h types.h diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h index 74ef493..80995e9 100644 --- a/lustre/portals/include/portals/types.h +++ b/lustre/portals/include/portals/types.h @@ -1,26 +1,15 @@ #ifndef _P30_TYPES_H_ #define _P30_TYPES_H_ -#ifdef __linux__ -# include -# if defined(__powerpc__) && !defined(__KERNEL__) -# define __KERNEL__ -# include -# undef __KERNEL__ -# else -# include -# endif -#else -# include -typedef u_int32_t __u32; -typedef u_int64_t __u64; -#endif +#include #ifdef __KERNEL__ # include +# include #else # include # define do_gettimeofday(tv) gettimeofday(tv, NULL); +typedef unsigned long long cycles_t; #endif #include diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 6bff730..08453a0 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -585,7 +585,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) /* Don't block for transmit descriptor if we're in interrupt context */ int attr = in_interrupt() ? 
(EP_NO_SLEEP | EP_NO_ALLOC) : 0; int dest = kqswnal_nid2elanid (ktx->ktx_nid); - long flags; + unsigned long flags; int rc; ktx->ktx_launchtime = jiffies; @@ -1429,7 +1429,7 @@ kqswnal_rx (kqswnal_rx_t *krx) void kqswnal_rxhandler(EP_RXD *rxd) { - long flags; + unsigned long flags; int nob = ep_rxd_len (rxd); int status = ep_rxd_status (rxd); kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); @@ -1732,7 +1732,7 @@ kqswnal_scheduler (void *arg) kqswnal_rx_t *krx; kqswnal_tx_t *ktx; kpr_fwd_desc_t *fwd; - long flags; + unsigned long flags; int rc; int counter = 0; int shuttingdown = 0; diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index f02cbda..37695c9 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -1187,7 +1187,7 @@ ksocknal_fmb_callback (void *arg, int error) { ksock_fmb_t *fmb = (ksock_fmb_t *)arg; ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page); + ptl_hdr_t *hdr = &fmb->fmb_hdr; ksock_conn_t *conn = NULL; ksock_sched_t *sched; unsigned long flags; diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am index 4c842a1..15080b0 100644 --- a/lustre/portals/unals/Makefile.am +++ b/lustre/portals/unals/Makefile.am @@ -2,7 +2,12 @@ if LIBLUSTRE noinst_LIBRARIES = libtcpnal.a endif -pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h +noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h \ + ipmap.h bridge.h procbridge.h + +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h \ + dispatch.h table.h timer.h address.c procapi.c proclib.c \ + connection.c tcpnal.c connection.h + libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) libtcpnal_a_CFLAGS = 
$(LLCFLAGS) diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am index 15c1774..851a8e1 100644 --- a/lustre/portals/utils/Makefile.am +++ b/lustre/portals/utils/Makefile.am @@ -14,8 +14,10 @@ libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) libuptlctl_a_CFLAGS = $(LLCFLAGS) endif +if UTILS sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid lib_LIBRARIES = libptlctl.a +endif acceptor_SOURCES = acceptor.c diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 489100e..1db0606 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1274,12 +1274,15 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, LASSERT_SPIN_LOCKED(&imp->imp_lock); + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + /* don't re-add requests that have been replayed */ if (!list_empty(&req->rq_replay_list)) return; - lustre_msg_add_flags(req->rq_reqmsg, - MSG_REPLAY); + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); LASSERT(imp->imp_replayable); /* Balanced in ptlrpc_free_committed, usually. */ @@ -1591,16 +1594,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) aa->praa_old_state = req->rq_send_state; req->rq_send_state = LUSTRE_IMP_REPLAY; req->rq_phase = RQ_PHASE_NEW; - /* - * Q: "How can a req get on the replay list if it wasn't replied?" - * A: "If we failed during the replay of this request, it will still - * be on the list, but rq_replied will have been reset to 0." 
- */ - if (req->rq_replied) { - aa->praa_old_status = req->rq_repmsg->status; - req->rq_status = 0; - req->rq_replied = 0; - } + aa->praa_old_status = req->rq_repmsg->status; + req->rq_status = 0; req->rq_interpret_reply = ptlrpc_replay_interpret; atomic_inc(&req->rq_import->imp_replay_inflight); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index f2d034f..0942192 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -100,6 +100,10 @@ int ptlrpc_set_import_discon(struct obd_import *imp) spin_lock_irqsave(&imp->imp_lock, flags); if (imp->imp_state == LUSTRE_IMP_FULL) { + CERROR("%s: connection lost to %s@%s\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); spin_unlock_irqrestore(&imp->imp_lock, flags); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); @@ -250,7 +254,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); imp->imp_conn_cnt++; - imp->imp_last_replay_transno = 0; + imp->imp_resend_replay = 0; if (imp->imp_remote_handle.cookie == 0) { initial_connect = 1; @@ -386,19 +390,27 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, request->rq_repmsg->handle.cookie); imp->imp_remote_handle = request->rq_repmsg->handle; } else { - CERROR("reconnected to %s@%s after partition\n", + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", imp->imp_target_uuid.uuid, imp->imp_connection->c_remote_uuid.uuid); } - if (imp->imp_invalid) + if (imp->imp_invalid) { IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); - else + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid); + imp->imp_resend_replay = 1; + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } } else if ((MSG_CONNECT_RECOVERING & msg_flags) && 
!imp->imp_invalid) { LASSERT(imp->imp_replayable); imp->imp_remote_handle = request->rq_repmsg->handle; + imp->imp_last_replay_transno = 0; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { @@ -440,7 +452,7 @@ finish: if (aa->pcaa_initial_connect && !imp->imp_initial_recov) { ptlrpc_deactivate_import(imp); } - CDEBUG(D_ERROR, "recovery of %s on %s failed (%d)\n", + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", imp->imp_target_uuid.uuid, (char *)imp->imp_connection->c_remote_uuid.uuid, rc); } @@ -453,7 +465,15 @@ static int completed_replay_interpret(struct ptlrpc_request *req, void * data, int rc) { atomic_dec(&req->rq_import->imp_replay_inflight); - ptlrpc_import_recovery_state_machine(req->rq_import); + if (req->rq_status == 0) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, req->rq_status); + ptlrpc_connect_import(req->rq_import, NULL); + } + RETURN(0); } @@ -534,6 +554,10 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) GOTO(out, rc); IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); ptlrpc_activate_import(imp); + CERROR("%s: connection restored to %s@%s\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); } if (imp->imp_state == LUSTRE_IMP_FULL) { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 6c7c9a3..91a9e88 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -31,13 +31,12 @@ #include #include "ptlrpc_internal.h" -static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, +static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid, struct ptlrpc_connection *conn, int portal, __u64 xid) { ptl_process_id_t remote_id; int rc; - int rc2; ptl_md_t md; char str[PTL_NALFMT_SIZE]; ENTRY; @@ -78,15 +77,16 @@ static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, 
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", len, portal, xid); - rc2 = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0); + rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0); if (rc != PTL_OK) { + int rc2; /* We're going to get an UNLINK event when I unlink below, * which will complete just like any other failed send, so * I fall through and return success here! */ CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n", remote_id.nid, portal, xid, rc); rc2 = PtlMDUnlink(*mdh); - LASSERT (rc2 == PTL_OK); + LASSERTF(rc2 == PTL_OK, "rc2 = %d\n", rc2); } RETURN (0); diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 687f588..71cfdfd 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -42,7 +42,7 @@ #include #include -#ifndef __CYGWIN__ +#ifdef __KERNEL__ # include # include #else @@ -135,6 +135,13 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc) } } + if (rc == 0) { + /* If new requests have been added, make sure to wake up */ + spin_lock_irqsave(&pc->pc_set->set_new_req_lock, flags); + rc = !list_empty(&pc->pc_set->set_new_requests); + spin_unlock_irqrestore(&pc->pc_set->set_new_req_lock, flags); + } + RETURN(rc); } diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index ece3a47..a86679d 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp) LASSERT (obd_lustre_upcall != NULL); if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) { - CDEBUG(D_ERROR, "%s: starting recovery without upcall\n", + CDEBUG(D_HA, "%s: starting recovery without upcall\n", imp->imp_target_uuid.uuid); ptlrpc_connect_import(imp, NULL); } else if (strcmp(obd_lustre_upcall, "NONE") == 0) { - CDEBUG(D_ERROR, "%s: recovery diabled\n", + CDEBUG(D_HA, "%s: recovery disabled\n", imp->imp_target_uuid.uuid); } else { - CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n", + CDEBUG(D_HA, "%s: calling upcall to start recovery\n", 
imp->imp_target_uuid.uuid); ptlrpc_run_failed_import_upcall(imp); } @@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) { int rc = 0; struct list_head *tmp, *pos; - struct ptlrpc_request *req; + struct ptlrpc_request *req = NULL; unsigned long flags; __u64 last_transno; ENTRY; @@ -187,16 +187,36 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) */ list_for_each_safe(tmp, pos, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + + /* If need to resend the last sent transno (because a + reconnect has occurred), then stop on the matching + req and send it again. If, however, the last sent + transno has been committed then we continue replay + from the next request. */ + if (imp->imp_resend_replay && + req->rq_transno == last_transno) { + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + break; + } + if (req->rq_transno > last_transno) { - rc = ptlrpc_replay_req(req); - if (rc) { - CERROR("recovery replay error %d for req " - LPD64"\n", rc, req->rq_xid); - RETURN(rc); - } - *inflight = 1; + imp->imp_last_replay_transno = req->rq_transno; break; } + + req = NULL; + } + + imp->imp_resend_replay = 0; + + if (req != NULL) { + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req " + LPD64"\n", rc, req->rq_xid); + RETURN(rc); + } + *inflight = 1; } RETURN(rc); } @@ -357,13 +377,13 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp, if (rc) RETURN(rc); - CDEBUG(D_ERROR, "%s: recovery started, waiting\n", + CDEBUG(D_HA, "%s: recovery started, waiting\n", imp->imp_target_uuid.uuid); lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); - CDEBUG(D_ERROR, "%s: recovery finished\n", + CDEBUG(D_HA, "%s: recovery finished\n", imp->imp_target_uuid.uuid); RETURN(rc); diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am index 5e57916..fe13cc7 100644 --- 
a/lustre/scripts/Makefile.am +++ b/lustre/scripts/Makefile.am @@ -4,10 +4,12 @@ # See the file COPYING in this distribution EXTRA_DIST = license-status maketags.sh lustre.spec version_tag.pl.in \ - $(initd_SCRIPTS) lustre.spec.in lustre-kernel-2.4.spec.in \ + lustre lustre.spec.in lustre-kernel-2.4.spec.in \ lmake linux-merge-config.awk linux-merge-modules.awk \ linux-rhconfig.h initddir = $(sysconfdir)/init.d +if UTILS initd_SCRIPTS = lustre +endif diff --git a/lustre/scripts/lbuild b/lustre/scripts/lbuild index 1cd283e..9b934f0 100755 --- a/lustre/scripts/lbuild +++ b/lustre/scripts/lbuild @@ -20,6 +20,10 @@ SERIES= CONFIG= VERSION= +RHBUILD=0 +LINUX26=0 +SUSEBUILD=0 + BASE_ARCH= BIGMEM_ARCHS= BOOT_ARCHS= @@ -182,12 +186,12 @@ load_target() CONFIG_FILE="$TOPDIR/lustre/kernel_patches/kernel_configs/$CONFIG" [ -r "$CONFIG_FILE" ] || \ - fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/configs." + fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/lustre/kernel_patches/kernel_configs/." if [ "$EXTRA_VERSION_save" ] ; then EXTRA_VERSION="$EXTRA_VERSION_save" elif ! (( $RELEASE )) ; then - EXTRA_VERSION="${EXTRA_VERSION}-${TAG//_/}.${TIMESTAMP}" + EXTRA_VERSION="${EXTRA_VERSION}-${TAG}.${TIMESTAMP}" fi # EXTRA_VERSION=${EXTRA_VERSION//-/_} @@ -195,7 +199,7 @@ load_target() BUILD_ARCHS= for arch in $(uniqify "$ALL_ARCHS") ; do - if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep -s "$arch" ; then + if [ -z "$TARGET_ARCHS" ] || echo "$TARGET_ARCHS" | grep "$arch" >/dev/null 2>/dev/null ; then BUILD_ARCHS="$BUILD_ARCHS $arch" fi done @@ -270,9 +274,11 @@ patch_linux() popd >/dev/null echo "Full patch has been saved in ${FULL_PATCH##*/}." echo "Replacing .config files..." - [ -d linux/configs ] || mkdir linux/configs + [ -d linux/configs ] || mkdir linux/configs || \ + fatal 1 "Error creating configs directory." 
rm -f linux/configs/* - cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/ + cp -v lustre/kernel_patches/kernel_configs/kernel-${VERSION}-${TARGET}*.config linux/configs/ || \ + fatal 1 "Error copying in kernel configs." } pack_linux() @@ -310,6 +316,8 @@ prep_build() -e "s/@SMP_ARCHS@/$SMP_ARCHS/g" \ -e "s/@UP_ARCHS@/$UP_ARCHS/g" \ -e "s/@RHBUILD@/$RHBUILD/g" \ + -e "s/@LINUX26@/$LINUX26/g" \ + -e "s/@SUSEBUILD@/$SUSEBUILD/g" \ < $TOPDIR/lustre/scripts/lustre-kernel-2.4.spec.in \ > lustre-kernel-2.4.spec [ -d SRPMS ] || mkdir SRPMS diff --git a/lustre/scripts/lustre-kernel-2.4.spec.in b/lustre/scripts/lustre-kernel-2.4.spec.in index f177c17..3ec63bb 100644 --- a/lustre/scripts/lustre-kernel-2.4.spec.in +++ b/lustre/scripts/lustre-kernel-2.4.spec.in @@ -355,7 +355,10 @@ BuildKernel() --kerneldir $RPM_SOURCE_DIR \ -j $RPM_BUILD_NCPUS \ --destdir $RPM_BUILD_ROOT \ - -- @CONFIGURE_FLAGS@ + -- --enable-modules \ + --disable-doc --disable-tests \ + --disable-utils --disable-liblustre \ + @CONFIGURE_FLAGS@ } BuildLustre() @@ -371,7 +374,10 @@ BuildLustre() --kerneldir $RPM_SOURCE_DIR \ -j $RPM_BUILD_NCPUS \ --destdir $RPM_BUILD_ROOT \ - -- @CONFIGURE_FLAGS@ + -- --enable-utils \ + --disable-doc --disable-tests \ + --disable-modules --disable-liblustre \ + @CONFIGURE_FLAGS@ } SaveHeaders() @@ -401,14 +407,12 @@ BuildKernel jensen BuildKernel smp %endif -# we want this one last, so that it is the one populating /usr/bin -%if %{buildup} && %{buildbase} +%if %{buildup} BuildKernel -%elseif %{buildbase} -BuildLustre %endif %if %{buildbase} +BuildLustre SaveHeaders %endif @@ -520,14 +524,14 @@ if [ -f ../../savedheaders/%{_target_cpu}/up/version.h ] ; then HEADER_FILE=../../savedheaders/%{_target_cpu}/up/version.h else # test build not including uniprocessor, must get info from somewhere - HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -1) + HEADER_FILE=$(ls ../../savedheaders/*/*/version.h | head -n 1) fi grep -v 
UTS_RELEASE $HEADER_FILE >> version.h rm -rf ../../savedheaders } ; popd touch $RPM_BUILD_ROOT/boot/kernel.h-%{kversion} -rm -f $RPM_BUILD_ROOT/usr/include/linux +# rm -f $RPM_BUILD_ROOT/usr/include/linux rm -rf $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/savedheaders @@ -822,7 +826,9 @@ exit 0 /usr/bin/* /usr/lib/lustre/python /etc/init.d/lustre -/usr/include/lustre +/usr/include/lustre/* +/usr/include/portals/* +/usr/include/linux/* /lib/lib*.a #%files -n lustre-doc diff --git a/lustre/scripts/lustre.spec.in b/lustre/scripts/lustre.spec.in index 329ef4c..39ccc41 100644 --- a/lustre/scripts/lustre.spec.in +++ b/lustre/scripts/lustre.spec.in @@ -68,12 +68,23 @@ Configures openldap server for LDAP Lustre config database %endif %build +# if RPM_BUILD_NCPUS unset, set it +if [ -z "$RPM_BUILD_NCPUS" ] ; then + RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat || :) + if [ $RPM_BUILD_NCPUS -eq 0 ] ; then + RPM_BUILD_NCPUS=1 + fi + if [ $RPM_BUILD_NCPUS -gt 8 ] ; then + RPM_BUILD_NCPUS=8 + fi +fi + rm -rf $RPM_BUILD_ROOT # Set an explicit path to our Linux tree, if we can. 
cd $RPM_BUILD_DIR/lustre-%{version} ./configure --with-linux='%{linuxdir}' %{disable_doc} --disable-liblustre -make +make -j $RPM_BUILD_NCPUS -s %install cd $RPM_BUILD_DIR/lustre-%{version} diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 778e8f1..a27f828 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -63,3 +63,4 @@ logs ostactive ll_dirstripe_verify rename_many +openfilleddirunlink diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 13abda9..1c19ee4 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -3,16 +3,19 @@ AM_CPPFLAGS = $(LLCPPFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 AM_CFLAGS = $(LLCFLAGS) # LDADD = -lldap # LDADD := -lreadline -ltermcap # -lefence -EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \ - sanity.sh rundbench -if TESTS -pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh -pkgexample_SCRIPTS += local.sh echo.sh uml.sh lov.sh + +pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh +pkgexample_scripts += local.sh echo.sh uml.sh lov.sh noinst_DATA = noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests noinst_SCRIPTS += sanity.sh rundbench + +EXTRA_DIST = $(pkgexample_scripts) $(noinst_SCRIPTS) $(noinst_DATA) \ + sanity.sh rundbench +if TESTS +pkgexample_SCRIPTS = $(pkgexample_scripts) noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime cmknod diff --git a/lustre/tests/cfg/insanity-mdev.sh b/lustre/tests/cfg/insanity-mdev.sh index d3f0c6e..fa15cd2 100644 --- a/lustre/tests/cfg/insanity-mdev.sh +++ b/lustre/tests/cfg/insanity-mdev.sh @@ -6,22 +6,25 @@ 
EXTRA_OSTS=${EXTRA_OSTS:-mdev7} client_HOST=client LIVE_CLIENT=${LIVE_CLIENT:-mdev6} # This should always be a list, not a regexp -#FAIL_CLIENTS=${FAIL_CLIENTS:-mdev7} -FAIL_CLIENTS=${FAIL_CLIENTS:-""} +FAIL_CLIENTS=${FAIL_CLIENTS:-mdev8} +#FAIL_CLIENTS=${FAIL_CLIENTS:-""} NETTYPE=${NETTYPE:-tcp} TIMEOUT=${TIMEOUT:-30} -PTLDEBUG=${PTLDEBUG:-0} -SUBSYSTEM=${SUBSYSTEM:-0} +PTLDEBUG=${PTLDEBUG:-0x3f0400} +SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} MOUNT=${MOUNT:-"/mnt/lustre"} UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh} MDSDEV=${MDSDEV:-/dev/sda1} MDSSIZE=${MDSSIZE:-50000} +MDSJOURNALSIZE=${MDSJOURNALSIZE:-0} OSTDEV=${OSTDEV:-$TMP/ost%d-`hostname`} -OSTSIZE=${OSTSIZE:=50000} +OSTSIZE=${OSTSIZE:=500000} +OSTJOURNALSIZE=${OSTJOURNALSIZE:-0} + FSTYPE=${FSTYPE:-ext3} STRIPE_BYTES=${STRIPE_BYTES:-1048576} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 14f2207..9af8621 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -25,7 +25,7 @@ OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`} OSTSIZE=${OSTSIZE:-50000} FSTYPE=${FSTYPE:-ext3} TIMEOUT=${TIMEOUT:-20} -UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh} +UPCALL=${UPCALL:-DEFAULT} STRIPE_BYTES=${STRIPE_BYTES:-65536} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 4212cab..2445e19 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -218,7 +218,7 @@ test_5b() { stop_mds || return 2 stop_ost || return 3 - lsmod | grep -q portals && return 3 + lsmod | grep -q portals && return 4 return 0 } @@ -230,7 +230,7 @@ test_5c() { [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && exit 1 + llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null @@ -238,12 +238,33 
@@ test_5c() { stop_mds || return 2 stop_ost || return 3 - lsmod | grep -q portals && return 3 + lsmod | grep -q portals && return 4 return 0 } run_test 5c "cleanup after failed mount (bug 2712)" +test_5d() { + start_ost + start_mds + stop_ost --force + + [ -d $MOUNT ] || mkdir -p $MOUNT + $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + llmount $mds_HOST://mds_svc/client_facet $MOUNT || return 1 + + umount $MOUNT || return 2 + # cleanup client modules + $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + + stop_mds || return 3 + + lsmod | grep -q portals && return 4 + return 0 + +} +run_test 5d "ost down, don't crash during mount attempt" + test_6() { setup manual_umount_client diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 68d0ff9..9c05b27 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -12,6 +12,9 @@ init_test_env $@ ALWAYS_EXCEPT="10" +SETUP=${SETUP:-"setup"} +CLEANUP=${CLEANUP:-"cleanup"} + build_test_filter assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT @@ -128,6 +131,8 @@ gen_config() { } setup() { + gen_config + rm -rf logs/* for i in `seq $NUMOST`; do wait_for ost$i @@ -205,20 +210,17 @@ node_to_ost() { if [ "$ONLY" == "cleanup" ]; then - cleanup + $CLEANUP exit fi -if [ -z "$NOSETUP" ]; then - gen_config - setup -fi - if [ ! -z "$EVAL" ]; then eval "$EVAL" exit $? fi +$SETUP + if [ "$ONLY" == "setup" ]; then exit 0 fi @@ -615,4 +617,4 @@ test_10() { run_test 10 "Running Availability for 6 hours..." equals_msg "Done, cleaning up" -cleanup +$CLEANUP diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 882c716..8e7ca55 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -7,7 +7,7 @@ ALWAYS_EXCEPT="20b" LUSTRE=${LUSTRE:-`dirname $0`/..} -UPCALL=${UPCALL:-$PWD/recovery-small-upcall.sh} + . 
$LUSTRE/tests/test-framework.sh init_test_env $@ @@ -342,7 +342,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup mkdir -p $DIR/$tdir multiop $DIR/$tdir/${tfile} O_wc & MULTI_PID=$! - usleep 500 + sleep 1 cancel_lru_locks OSC #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 do_facet ost sysctl -w lustre.fail_loc=0x80000308 diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 9c1f1e1..77e66e7 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -9,6 +9,9 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} +SETUP=${SETUP:-"setup"} +CLEANUP=${CLEANUP:-"cleanup"} + gen_config() { rm -f $XMLCONFIG add_mds mds --dev $MDSDEV --size $MDSSIZE @@ -35,8 +38,8 @@ cleanup() { fail mds fi - umount $MOUNT2 - umount $MOUNT + umount $MOUNT2 || true + umount $MOUNT || true rmmod llite stop mds ${FORCE} stop ost2 ${FORCE} @@ -49,25 +52,18 @@ if [ "$ONLY" == "cleanup" ]; then exit fi -gen_config -start ost --reformat $OSTLCONFARGS -PINGER=`cat /proc/fs/lustre/pinger` +setup() { + gen_config + start ost --reformat $OSTLCONFARGS + start ost2 --reformat $OSTLCONFARGS + start mds $MDSLCONFARGS --reformat + grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT + grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2 -if [ "$PINGER" != "on" ]; then - echo "ERROR: Lustre must be built with --enable-pinger for replay-dual" - stop mds - exit 1 -fi - -start ost2 --reformat $OSTLCONFARGS -[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE -start mds $MDSLCONFARGS --reformat -grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT -grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2 - -echo $TIMEOUT > /proc/sys/lustre/timeout -echo $UPCALL > /proc/sys/lustre/upcall +# echo $TIMEOUT > /proc/sys/lustre/timeout +} +$SETUP [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE test_1() { @@ -175,7 +171,156 @@ test_6() { } run_test 6 "open1, open2, unlink |X| close1 
[fail mds] close2" +test_8() { + replay_barrier mds + drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1 + fail mds + checkstat $MOUNT2/$tfile || return 2 + rm $MOUNT1/$tfile || return 3 + + return 0 +} +run_test 8 "replay of resent request" + +test_9() { + replay_barrier mds + mcreate $MOUNT1/$tfile-1 + mcreate $MOUNT2/$tfile-2 + # drop first reint reply + sysctl -w lustre.fail_loc=0x80000119 + fail mds + sysctl -w lustre.fail_loc=0 + + rm $MOUNT1/$tfile-[1,2] || return 1 + + return 0 +} +run_test 9 "resending a replayed create" + +test_10() { + mcreate $MOUNT1/$tfile-1 + replay_barrier mds + munlink $MOUNT1/$tfile-1 + mcreate $MOUNT2/$tfile-2 + # drop first reint reply + sysctl -w lustre.fail_loc=0x80000119 + fail mds + sysctl -w lustre.fail_loc=0 + + checkstat $MOUNT1/$tfile-1 && return 1 + checkstat $MOUNT1/$tfile-2 || return 2 + rm $MOUNT1/$tfile-2 + + return 0 +} +run_test 10 "resending a replayed unlink" + +test_11() { + replay_barrier mds + mcreate $MOUNT1/$tfile-1 + mcreate $MOUNT2/$tfile-2 + mcreate $MOUNT1/$tfile-3 + mcreate $MOUNT2/$tfile-4 + mcreate $MOUNT1/$tfile-5 + # drop all reint replies for a while + sysctl -w lustre.fail_loc=0x0119 + facet_failover mds + #sleep for while, let both clients reconnect and timeout + sleep $((TIMEOUT * 2)) + sysctl -w lustre.fail_loc=0 + + rm $MOUNT1/$tfile-[1-5] || return 1 + + return 0 +} +run_test 11 "both clients timeout during replay" + +test_12() { + replay_barrier mds + + multiop $DIR/$tfile mo_c & + MULTIPID=$! + sleep 5 + + # drop first enqueue + sysctl -w lustre.fail_loc=0x80000302 + facet_failover mds + df $MOUNT || return 1 + sysctl -w lustre.fail_loc=0 + + ls $DIR/$tfile + $CHECKSTAT -t file $DIR/$tfile || return 2 + kill -USR1 $MULTIPID || return 3 + wait $MULTIPID || return 4 + rm $DIR/$tfile + + return 0 +} +run_test 12 "open resend timeout" + +test_13() { + multiop $DIR/$tfile mo_c & + MULTIPID=$! 
+ sleep 5 + + replay_barrier mds + + kill -USR1 $MULTIPID || return 3 + wait $MULTIPID || return 4 + + # drop close + sysctl -w lustre.fail_loc=0x80000115 + facet_failover mds + df $MOUNT || return 1 + sysctl -w lustre.fail_loc=0 + + ls $DIR/$tfile + $CHECKSTAT -t file $DIR/$tfile || return 2 + rm $DIR/$tfile + + return 0 +} +run_test 13 "close resend timeout" + +test_14() { + replay_barrier mds + createmany -o $MOUNT1/$tfile- 25 + createmany -o $MOUNT2/$tfile-2- 1 + createmany -o $MOUNT1/$tfile-3- 25 + umount $MOUNT2 + + facet_failover mds + # expect failover to fail + df $MOUNT && return 1 + + # first 25 files shouuld have been + # replayed + unlinkmany $MOUNT1/$tfile- 25 || return 2 + + zconf_mount `hostname` $MOUNT2 + return 0 +} +run_test 14 "timeouts waiting for lost client during replay" + +test_15() { + replay_barrier mds + createmany -o $MOUNT1/$tfile- 25 + createmany -o $MOUNT2/$tfile-2- 1 + umount $MOUNT2 + + facet_failover mds + df $MOUNT || return 1 + + lctl dk dk + unlinkmany $MOUNT1/$tfile- 25 || return 2 + + zconf_mount `hostname` $MOUNT2 + return 0 +} +run_test 15 "timeout waiting for lost client during replay, 1 client completes" + + if [ "$ONLY" != "setup" ]; then equals_msg test complete, cleaning up - cleanup + $CLEANUP fi diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 33f9786..327ea0b8 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -232,6 +232,11 @@ facet_nid() { facet_active() { local facet=$1 local activevar=${facet}active + + if [ -f ./${facet}active ] ; then + source ./${facet}active + fi + active=${!activevar} if [ -z "$active" ] ; then echo -n ${facet} diff --git a/lustre/utils/Lustre/Makefile.am b/lustre/utils/Lustre/Makefile.am index e8e522f..c3d9a59 100644 --- a/lustre/utils/Lustre/Makefile.am +++ b/lustre/utils/Lustre/Makefile.am @@ -1,2 +1,4 @@ +if UTILS pymod_SCRIPTS = __init__.py lustredb.py error.py cmdline.py -EXTRA_DIST = $(pymod_SCRIPTS) +endif 
+EXTRA_DIST = __init__.py lustredb.py error.py cmdline.py diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 1f7a8b5..5704e85 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -6,17 +6,17 @@ AM_CFLAGS=$(LLCFLAGS) AM_CPPFLAGS=$(LLCPPFLAGS) AM_LDFLAGS := -L$(top_builddir)/portals/utils +sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive \ + load_ldap.sh lrun lwizard +bin_scripts = lfind lstripe + if UTILS rootsbin_SCRIPTS = mount.lustre sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount -sbin_SCRIPTS = lconf lmc llanalyze llstat.pl llobdstat.pl lactive load_ldap.sh lrun -sbin_SCRIPTS += lwizard -bin_SCRIPTS = lfind lstripe bin_PROGRAMS = lfs lib_LIBRARIES = liblustreapi.a -if LIBLUSTRE -sbin_SCRIPTS += lrun -endif # LIBLUSTRE +sbin_SCRIPTS = $(sbin_scripts) +bin_SCRIPTS = $(bin_scripts) endif # UTILS lctl_LDADD := $(LIBREADLINE) -lptlctl @@ -33,7 +33,7 @@ lfs_SOURCES = lfs.c llmount_SOURCES = llmount.c llmount_LDADD = $(LIBREADLINE) -lptlctl -EXTRA_DIST = $(bin_SCRIPTS) $(sbin_SCRIPTS) +EXTRA_DIST = $(bin_scripts) $(sbin_scripts) # NOTE: this should only be run on i386. newwiretest: wirehdr.c wirecheck -- 1.8.3.1